Bump copyright year
[idzebra-moved-to-github.git] / dict / insert.c
1 /* This file is part of the Zebra server.
2    Copyright (C) 1994-2010 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17
18 */
19
20
21
22 #include <string.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26
27 #include "dict-p.h"
28
29 #define CHECK 0
30
31 static int dict_ins(Dict dict, const Dict_char *str,
32                     Dict_ptr back_ptr, int userlen, void *userinfo);
33 static void clean_page(Dict dict, Dict_ptr ptr, void *p, Dict_char *out,
34                        Dict_ptr subptr, char *userinfo);
35
36
37 static Dict_ptr new_page(Dict dict, Dict_ptr back_ptr, void **pp)
38 {
39     void *p;
40     Dict_ptr ptr = dict->head.last;
41     if (!dict->head.freelist)
42     {
43         dict_bf_newp(dict->dbf, dict->head.last, &p, dict->head.page_size);
44         (dict->head.last)++;
45     }
46     else
47     {
48         ptr = dict->head.freelist;
49         dict_bf_readp(dict->dbf, ptr, &p);
50         dict->head.freelist = DICT_backptr(p);
51     }
52     assert(p);
53     DICT_type(p) = 0;
54     DICT_backptr(p) = back_ptr;
55     DICT_nodir(p) = 0;
56     DICT_size(p) = DICT_infoffset;
57     DICT_bsize(p) = dict->head.page_size;
58     if (pp)
59         *pp = p;
60     return ptr;
61 }
62
63 static int split_page(Dict dict, Dict_ptr ptr, void *p)
64 {
65     void *subp;
66     char *info_here;
67     Dict_ptr subptr;
68     int i, j;
69     short *indxp, *best_indxp = NULL;
70     Dict_char best_char = 0;
71     Dict_char prev_char = 0;
72     int best_no = -1, no_current = 1;
73
74     dict->no_split++;
75     /* determine splitting char... */
76     indxp = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
77     for (i = DICT_nodir(p); --i >= 0; --indxp)
78     {
79         if (*indxp > 0) /* tail string here! */
80         {
81             Dict_char dc;
82
83             memcpy(&dc, (char*) p + *indxp, sizeof(dc));
84             if (best_no < 0)
85             {   /* first entry met */
86                 best_char = prev_char = dc;
87                 best_no = 1;
88                 best_indxp = indxp;
89             }
90             else if (prev_char == dc)
91             {   /* same char prefix. update */
92                 if (++no_current > best_no)
93                 {   /* best entry so far */
94                     best_no = no_current;
95                     best_char = dc;
96                     best_indxp = indxp;
97                 }
98             }
99             else 
100             {   /* new char prefix. restore */
101                 prev_char = dc;
102                 no_current = 1;
103             }
104         }
105     }
106     assert(best_no >= 0); /* we didn't find any tail string entry at all! */
107
108     j = best_indxp - (short*) p;
109     subptr = new_page(dict, ptr, &subp);
110     /* scan entries to see if there is a string with */
111     /* length 1. info_here indicates if such entry exist */
112     info_here = NULL;
113     for (i=0; i<best_no; i++, j++)
114     {
115         char *info, *info1;
116         int slen;
117         Dict_char dc;
118
119         info = (char*) p + ((short*) p)[j];
120         /* entry start */
121         memcpy(&dc, info, sizeof(dc));
122         assert(dc == best_char);
123         slen = 1+dict_strlen((Dict_char*) info);
124
125         assert(slen > 1);
126         if (slen == 2)
127         {
128             assert(!info_here);
129             info_here = info+slen*sizeof(Dict_char);
130         }
131         else
132         {
133             info1 = info+slen*sizeof(Dict_char);  /* info start */
134             dict_ins(dict, (Dict_char*) (info+sizeof(Dict_char)),
135                      subptr, *info1, info1+1);
136             dict_bf_readp(dict->dbf, ptr, &p);
137         }
138     }
139     /* now clean the page ... */
140     clean_page(dict, ptr, p, &best_char, subptr, info_here);
141     return 0;
142 }
143
144 static void clean_page(Dict dict, Dict_ptr ptr, void *p, Dict_char *out,
145                        Dict_ptr subptr, char *userinfo)             
146 {
147     char *np = (char *) xmalloc(dict->head.page_size);
148     int i, slen, no = 0;
149     short *indxp1, *indxp2;
150     char *info1, *info2;
151
152     DICT_bsize(np) = dict->head.page_size;
153     indxp1 = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
154     indxp2 = (short*) ((char*) np+DICT_bsize(np));
155     info2 = (char*) np + DICT_infoffset;
156     for (i = DICT_nodir(p); --i >= 0; --indxp1)
157     {
158         if (*indxp1 > 0) /* tail string here! */
159         {
160             /* string (Dict_char *) DICT_EOS terminated */
161             /* unsigned char        length of information */
162             /* char *               information */
163
164             info1 = (char*) p + *indxp1;
165             if (out && memcmp(out, info1, sizeof(Dict_char)) == 0)
166             {
167                 if (subptr == 0)
168                     continue;
169                 *--indxp2 = -(info2 - np);
170                 memcpy(info2, &subptr, sizeof(Dict_ptr));
171                 info2 += sizeof(Dict_ptr);
172                 memcpy(info2, out, sizeof(Dict_char));
173                 info2 += sizeof(Dict_char);
174                 if (userinfo)
175                 {
176                     memcpy(info2, userinfo, *userinfo+1);
177                     info2 += *userinfo + 1;
178                 }
179                 else
180                     *info2++ = 0;       
181                 subptr = 0; 
182                 ++no;
183                 continue;
184             }
185             *--indxp2 = info2 - np;
186             slen = (dict_strlen((Dict_char*) info1)+1)*sizeof(Dict_char);
187             memcpy(info2, info1, slen);
188             info1 += slen;
189             info2 += slen;
190         }
191         else
192         {
193             /* Dict_ptr             subptr */
194             /* Dict_char            sub char */
195             /* unsigned char        length of information */
196             /* char *               information */
197
198             assert(*indxp1 < 0);
199             *--indxp2 = -(info2 - np);
200             info1 = (char*) p - *indxp1;
201             memcpy(info2, info1, sizeof(Dict_ptr)+sizeof(Dict_char));
202             info1 += sizeof(Dict_ptr)+sizeof(Dict_char);
203             info2 += sizeof(Dict_ptr)+sizeof(Dict_char);
204         }
205         slen = *info1+1;
206         memcpy(info2, info1, slen);
207         info2 += slen;
208         ++no;
209     }
210 #if 1
211     memcpy((char*)p+DICT_infoffset, 
212            (char*)np+DICT_infoffset,
213            info2 - ((char*)np+DICT_infoffset));
214     memcpy((char*)p + ((char*)indxp2 - (char*)np),
215            indxp2,
216            ((char*) np+DICT_bsize(p)) - (char*)indxp2);
217 #else
218     memcpy((char*)p+DICT_infoffset, (char*)np+DICT_infoffset,
219            DICT_pagesize(dict)-DICT_infoffset);
220 #endif
221     DICT_size(p) = info2 - np;
222     DICT_type(p) = 0;
223     DICT_nodir(p) = no;
224     xfree(np);
225     dict_bf_touch(dict->dbf, ptr);
226 }
227
228
229
230 /* return 0 if new */
231 /* return 1 if before but change of info */
232 /* return 2 if same as before */
233
234 static int dict_ins(Dict dict, const Dict_char *str,
235                     Dict_ptr ptr, int userlen, void *userinfo)
236 {
237     int hi, lo, mid, slen, cmp = 1;
238     short *indxp;
239     char *info;
240     void *p;
241
242     dict_bf_readp(dict->dbf, ptr, &p);
243         
244     assert(p);
245     assert(ptr);
246
247     mid = lo = 0;
248     hi = DICT_nodir(p)-1;
249     indxp = (short*) ((char*) p+DICT_bsize(p)-sizeof(short));
250     while (lo <= hi)
251     {
252         mid = (lo+hi)/2;
253         if (indxp[-mid] > 0)
254         {
255             /* string (Dict_char *) DICT_EOS terminated */
256             /* unsigned char        length of information */
257             /* char *               information */
258             info = (char*)p + indxp[-mid];
259             cmp = dict_strcmp((Dict_char*) info, str);
260             if (!cmp)
261             {
262                 info += (dict_strlen((Dict_char*) info)+1)*sizeof(Dict_char);
263                 /* consider change of userinfo length... */
264                 if (*info == userlen)
265                 {
266                     /* change of userinfo ? */
267                     if (memcmp(info+1, userinfo, userlen))
268                     {
269                         dict_bf_touch(dict->dbf, ptr);
270                         memcpy(info+1, userinfo, userlen);
271                         return 1;
272                     }
273                     /* same userinfo */
274                     return 2;
275                 }
276                 else if (*info > userlen)
277                 {
278                     /* room for new userinfo */
279                     DICT_type(p) = 1;
280                     *info = userlen;
281                     dict_bf_touch(dict->dbf, ptr);
282                     memcpy(info+1, userinfo, userlen);
283                     return 1;
284                 }
285                 break;
286             }
287         }
288         else
289         {
290             Dict_char dc;
291             Dict_ptr subptr;
292
293             /* Dict_ptr             subptr */
294             /* Dict_char            sub char */
295             /* unsigned char        length of information */
296             /* char *               information */
297             info = (char*)p - indxp[-mid];
298             memcpy(&dc, info+sizeof(Dict_ptr), sizeof(Dict_char));
299             cmp = dc- *str;
300             if (!cmp)
301             {
302                 memcpy(&subptr, info, sizeof(Dict_ptr));
303                 if (*++str == DICT_EOS)
304                 {
305                     /* finish of string. Store userinfo here... */
306
307                     int xlen = info[sizeof(Dict_ptr)+sizeof(Dict_char)];
308                     if (xlen == userlen)
309                     {
310                         if (memcmp(info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
311                                    userinfo, userlen))
312                         {
313                             dict_bf_touch(dict->dbf, ptr);
314                             memcpy(info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
315                                    userinfo, userlen);
316                             return 1;
317                         }
318                         return 2;
319                     }
320                     else if (xlen > userlen)
321                     {
322                         DICT_type(p) = 1;
323                         info[sizeof(Dict_ptr)+sizeof(Dict_char)] = userlen;
324                         memcpy(info+sizeof(Dict_ptr)+sizeof(Dict_char)+1,
325                                userinfo, userlen);
326                         dict_bf_touch(dict->dbf, ptr);
327                         return 1;
328                     }
329                     /* xlen < userlen, expanding needed ... */
330                     if (DICT_size(p)+sizeof(Dict_char)+sizeof(Dict_ptr)+
331                         userlen >=
332                         DICT_bsize(p) - (1+DICT_nodir(p))*sizeof(short))
333                     {
334                         /* not enough room - split needed ... */
335                         if (DICT_type(p) == 1)
336                         {
337                             clean_page(dict, ptr, p, NULL, 0, NULL);
338                             return dict_ins(dict, str-1, ptr,
339                                             userlen, userinfo);
340                         }
341                         if (split_page(dict, ptr, p)) 
342                         {
343                             yaz_log(YLOG_FATAL, "Unable to split page %d\n", ptr);
344                             assert(0);
345                         }
346                         return dict_ins(dict, str-1, ptr, userlen, userinfo);
347                     }
348                     else
349                     {   /* enough room - no split needed ... */
350                         info = (char*)p + DICT_size(p);
351                         memcpy(info, &subptr, sizeof(subptr));
352                         memcpy(info+sizeof(Dict_ptr), &dc, sizeof(Dict_char));
353                         info[sizeof(Dict_char)+sizeof(Dict_ptr)] = userlen;
354                         memcpy(info+sizeof(Dict_char)+sizeof(Dict_ptr)+1,
355                                userinfo, userlen);
356                         indxp[-mid] = -DICT_size(p);
357                         DICT_size(p) += sizeof(Dict_char)+sizeof(Dict_ptr)
358                             +1+userlen;
359                         DICT_type(p) = 1;
360                         dict_bf_touch(dict->dbf, ptr);
361                     }
362                     if (xlen)
363                         return 1;
364                     return 0;
365                 }
366                 else
367                 {
368                     if (subptr == 0)
369                     {
370                         subptr = new_page(dict, ptr, NULL);
371                         memcpy(info, &subptr, sizeof(subptr));
372                         dict_bf_touch(dict->dbf, ptr);
373                     }
374                     return dict_ins(dict, str, subptr, userlen, userinfo);
375                 }
376             }
377         }
378         if (cmp < 0)
379             lo = mid+1;
380         else
381             hi = mid-1;
382     }
383     indxp = indxp-mid;
384     if (lo>hi && cmp < 0)
385         --indxp;
386     slen = (dict_strlen(str)+1)*sizeof(Dict_char);
387     if (DICT_size(p)+slen+userlen >=
388         (int)(DICT_bsize(p) - (1+DICT_nodir(p))*sizeof(short)))/* overflow? */
389     {
390         if (DICT_type(p))
391         {
392             clean_page(dict, ptr, p, NULL, 0, NULL);
393             return dict_ins(dict, str, ptr, userlen, userinfo);
394         }
395         split_page(dict, ptr, p);
396         return dict_ins(dict, str, ptr, userlen, userinfo);
397     }
398     if (cmp)
399     {
400         short *indxp1;
401         (DICT_nodir(p))++;
402         indxp1 = (short*)((char*) p + DICT_bsize(p)
403                           - DICT_nodir(p)*sizeof(short));
404         for (; indxp1 != indxp; indxp1++)
405             indxp1[0] = indxp1[1];
406 #if CHECK
407         indxp1 = (short*) ((char*) p+DICT_pagesize(dict)-sizeof(short));
408         for (i = DICT_nodir (p); --i >= 0; --indxp1)
409         {
410             if (*indxp1 < 0)
411             {
412                 info = (char*)p - *indxp1;
413                 assert (info[sizeof(Dict_ptr)] > ' ');
414             }
415         }
416 #endif
417     }
418     else
419         DICT_type(p) = 1;
420     info = (char*)p + DICT_size(p);
421     memcpy(info, str, slen);
422     info += slen;
423     *info++ = userlen;
424     memcpy(info, userinfo, userlen);
425     info += userlen;
426
427     *indxp = DICT_size(p);
428     DICT_size(p) = info- (char*) p;
429     dict_bf_touch(dict->dbf, ptr);
430     if (cmp)
431         return 0;
432     return 1;
433 }
434
435 int dict_insert(Dict dict, const char *str, int userlen, void *userinfo)
436 {
437     if (!dict->rw)
438         return -1;
439     dict->no_insert++;
440     if (!dict->head.root)
441     {
442         void *p;
443         dict->head.root = new_page(dict, 0, &p);
444         if (!dict->head.root)
445             return -1;
446     }
447     return dict_ins(dict, (const Dict_char *) str, dict->head.root,
448                     userlen, userinfo);
449 }
450
451 /*
452  * Local variables:
453  * c-basic-offset: 4
454  * c-file-style: "Stroustrup"
455  * indent-tabs-mode: nil
456  * End:
457  * vim: shiftwidth=4 tabstop=8 expandtab
458  */
459