From a91dfbe03b8d58265ea20e06e0e8849e3f9e24b3 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 6 Sep 1994 13:05:12 +0000 Subject: [PATCH] Further development of insertion. Some special cases are not properly handled yet! assert(0) are put here. The binary search in each page definitely reduce usr CPU. --- dict/Makefile | 4 +- dict/dicttest.c | 50 +++++++- dict/drdwr.c | 10 +- dict/insert.c | 370 ++++++++++++++++++++++++++++++++++++++++++++++++++----- include/dict.h | 9 +- 5 files changed, 401 insertions(+), 42 deletions(-) diff --git a/dict/Makefile b/dict/Makefile index 8fe8a4c..cd1aa9c 100644 --- a/dict/Makefile +++ b/dict/Makefile @@ -1,12 +1,12 @@ # Copyright (C) 1994, Index Data I/S # All rights reserved. # Sebastian Hammer, Adam Dickmeiss -# $Id: Makefile,v 1.4 1994-09-01 17:44:05 adam Exp $ +# $Id: Makefile,v 1.5 1994-09-06 13:05:12 adam Exp $ SHELL=/bin/sh INCLUDE=-I../include TPROG=dicttest -CFLAGS=-g -Wall -pedantic +CFLAGS=-Wall -pg -pedantic DEFS=$(INCLUDE) LIB=../lib/dict.a PO = dopen.o dclose.o drdwr.o open.o close.o insert.o lookup.o diff --git a/dict/dicttest.c b/dict/dicttest.c index a8647c1..52dadb1 100644 --- a/dict/dicttest.c +++ b/dict/dicttest.c @@ -4,7 +4,12 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: dicttest.c,v $ - * Revision 1.4 1994-09-01 17:49:37 adam + * Revision 1.5 1994-09-06 13:05:14 adam + * Further development of insertion. Some special cases are + * not properly handled yet! assert(0) are put here. The + * binary search in each page definitely reduce usr CPU. + * + * Revision 1.4 1994/09/01 17:49:37 adam * Removed stupid line. Work on insertion in dictionary. Not finished yet. * * Revision 1.3 1994/09/01 17:44:06 adam @@ -21,6 +26,7 @@ #include #include #include +#include #include @@ -33,9 +39,11 @@ int main (int argc, char **argv) const char *inputfile = NULL; const char *base = NULL; int rw = 0; - int infosize = 2; + int infosize = 4; int cache = 10; int ret; + int no_of_insertions = 0; + int no_of_new = 0, no_of_same = 0, no_of_change = 0; char *arg; prog = argv[0]; @@ -108,7 +116,6 @@ int main (int argc, char **argv) { FILE *ipf; char ipf_buf[256]; - char word[256]; int line = 1; char infobytes[120]; memset (infobytes, 0, 120); @@ -121,15 +128,46 @@ int main (int argc, char **argv) while (fgets (ipf_buf, 255, ipf)) { - if (sscanf (ipf_buf, "%s", word) == 1) + char *ipf_ptr = ipf_buf; + sprintf (infobytes, "%d", line); + for (;*ipf_ptr && *ipf_ptr != '\n';ipf_ptr++) { - sprintf (infobytes, "%d", line); - dict_insert (dict, word, infosize, infobytes); + if (isalpha(*ipf_ptr) || *ipf_ptr == '_') + { + int i = 1; + while (ipf_ptr[i] && (isalnum(ipf_ptr[i]) || + ipf_ptr[i] == '_')) + i++; + if (ipf_ptr[i]) + ipf_ptr[i++] = '\0'; +#if 1 + switch(dict_insert (dict, ipf_ptr, infosize, infobytes)) + { + case 0: + no_of_new++; + break; + case 1: + no_of_change++; + break; + case 2: + no_of_same++; + break; + } +#else + printf ("%s\n", ipf_ptr); +#endif + ++no_of_insertions; + ipf_ptr += (i-1); + } } ++line; } fclose (ipf); } + log (LOG_LOG, "Insertions.... %d", no_of_insertions); + log (LOG_LOG, "No of new..... %d", no_of_new); + log (LOG_LOG, "No of change.. %d", no_of_change); + log (LOG_LOG, "No of same.... %d", no_of_same); dict_close (dict); res_close (common_resource); return 0; diff --git a/dict/drdwr.c b/dict/drdwr.c index 7ec3046..8a02d15 100644 --- a/dict/drdwr.c +++ b/dict/drdwr.c @@ -4,7 +4,12 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: drdwr.c,v $ - * Revision 1.5 1994-09-01 17:49:38 adam + * Revision 1.6 1994-09-06 13:05:14 adam + * Further development of insertion. Some special cases are + * not properly handled yet! assert(0) are put here. The + * binary search in each page definitely reduce usr CPU. + * + * Revision 1.5 1994/09/01 17:49:38 adam * Removed stupid line. Work on insertion in dictionary. Not finished yet. * */ @@ -138,6 +143,7 @@ int dict_bf_readp (Dict_BFile bf, int no, void **bufp) { struct Dict_file_block *p; int i; + assert (no < 1000); if ((p = find_block (bf, no))) { *bufp = p->data; @@ -168,7 +174,7 @@ int dict_bf_newp (Dict_BFile dbf, int no, void **bufp) *bufp = p->data; memset (p->data, 0, dbf->block_size); p->dirty = 1; -#if 1 +#if 0 printf ("bf_newp of %d:", no); pr_lru (dbf); #endif diff --git a/dict/insert.c b/dict/insert.c index 6d6b32c..82cc694 100644 --- a/dict/insert.c +++ b/dict/insert.c @@ -4,7 +4,12 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: insert.c,v $ - * Revision 1.5 1994-09-01 17:49:39 adam + * Revision 1.6 1994-09-06 13:05:15 adam + * Further development of insertion. Some special cases are + * not properly handled yet! assert(0) are put here. The + * binary search in each page definitely reduce usr CPU. + * + * Revision 1.5 1994/09/01 17:49:39 adam * Removed stupid line. Work on insertion in dictionary. Not finished yet. * * Revision 1.4 1994/09/01 17:44:09 adam @@ -28,6 +33,9 @@ #include +#define USE_BINARY_SEARCH 1 +#define CHECK 0 + static int dict_ins (Dict dict, const Dict_char *str, Dict_ptr back_ptr, int userlen, void *userinfo); @@ -55,7 +63,8 @@ static Dict_ptr new_page (Dict dict, Dict_ptr back_ptr, void **pp) DICT_nextptr(p) = 0; DICT_nodir(p) = 0; DICT_size(p) = DICT_infoffset; - *pp = p; + if (pp) + *pp = p; return ptr; } @@ -114,6 +123,7 @@ static int split_page (Dict dict, Dict_ptr ptr, void *p) assert (*indxp > 0); info = (char*) p + *indxp; /* entry start */ + assert (*info == best_char); slen = dict_strlen(info); assert (slen > 0); @@ -141,6 +151,7 @@ static int split_page (Dict dict, Dict_ptr ptr, void *p) assert (*indxp > 0); info = (char*) p + *indxp; /* entry start */ + assert (*info == best_char); slen = dict_strlen(info); if (slen > 1) @@ -150,6 +161,7 @@ static int split_page (Dict dict, Dict_ptr ptr, void *p) if (need <= (1+slen)*sizeof(Dict_char) + 1 + *info1) best_indxp = indxp; /* space for entry */ dict_ins (dict, info+sizeof(Dict_char), subptr, *info1, info1+1); + dict_bf_readp (dict->dbf, ptr, &p); } } if (best_indxp) @@ -174,6 +186,37 @@ static int split_page (Dict dict, Dict_ptr ptr, void *p) memcpy (info, info_here, *info_here+1); /* with information */ else *info = 0; /* without info */ +#if CHECK + best_indxp = NULL; + prev_char = 0; + indxp = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + for (i = DICT_nodir (p); --i >= 0; --indxp) + { + if (*indxp > 0) /* tail string here! */ + { + Dict_char dc; + + memcpy (&dc, (char*) p + *indxp, sizeof(dc)); + assert (dc != best_char); + assert (dc >= prev_char); + prev_char = dc; + } + else + { + Dict_char dc; + memcpy (&dc, (char*)p - *indxp+sizeof(Dict_ptr), + sizeof(dc)); + assert (dc > prev_char); + if (dc == best_char) + { + assert (best_indxp == NULL); + best_indxp = indxp; + } + prev_char = dc; + } + } + assert (best_indxp); +#endif } else { @@ -192,7 +235,7 @@ static int split_page (Dict dict, Dict_ptr ptr, void *p) return 0; } -static void clean_page (Dict dict, void *p) +static void clean_page (Dict dict, Dict_ptr ptr, void *p) { char *np = xmalloc (dict->head.page_size); int i, slen; @@ -219,7 +262,6 @@ static void clean_page (Dict dict, void *p) slen = *info1+1; memcpy (info2, info1, slen); info2 += slen; - info1 += slen; } else { @@ -228,6 +270,7 @@ static void clean_page (Dict dict, void *p) /* unsigned char length of information */ /* char * information */ + assert (*indxp1 < 0); *--indxp2 = -(info2 - np); info1 = (char*) p - *indxp1; memcpy (info2, info1, sizeof(Dict_ptr)+sizeof(Dict_char)); @@ -236,16 +279,17 @@ static void clean_page (Dict dict, void *p) slen = *info1+1; memcpy (info2, info1, slen); info2 += slen; - info1 += slen; } } - memcpy ((char*) p + DICT_infoffset, (char*) np + DICT_infoffset, + memcpy ((char*)p+DICT_infoffset, (char*)np+DICT_infoffset, DICT_PAGESIZE-DICT_infoffset); DICT_size(p) = info2 - np; DICT_type(p) = 0; xfree (np); } + +#if !USE_BINARY_SEARCH static int dict_ins (Dict dict, const Dict_char *str, Dict_ptr back_ptr, int userlen, void *userinfo) { @@ -254,6 +298,9 @@ static int dict_ins (Dict dict, const Dict_char *str, short *indxp; char *info; void *p; +#if CHECK + Dict_char prev_char = 0; +#endif if (ptr == 0) ptr = new_page (dict, back_ptr, &p); @@ -273,6 +320,10 @@ static int dict_ins (Dict dict, const Dict_char *str, /* unsigned char length of information */ /* char * information */ cmp = dict_strcmp ((Dict_char*) info, str); +#if CHECK + assert (info[0] >= prev_char); + prev_char=info[0]; +#endif if (!cmp) { info += (dict_strlen(info)+1)*sizeof(Dict_char); @@ -283,7 +334,9 @@ static int dict_ins (Dict dict, const Dict_char *str, { dict_bf_touch (dict->dbf, ptr); memcpy (info+1, userinfo, userlen); + return 1; } + return 2; } else if (*info > userlen) { @@ -291,13 +344,13 @@ static int dict_ins (Dict dict, const Dict_char *str, *info = userlen; dict_bf_touch (dict->dbf, ptr); memcpy (info+1, userinfo, userlen); + return 1; } else { DICT_type(p) = 1; break; } - return 0; } else if(cmp > 0) break; @@ -305,39 +358,84 @@ static int dict_ins (Dict dict, const Dict_char *str, else /* tail of string in sub page */ { Dict_char dc; + Dict_ptr subptr; + assert (*indxp < 0); info = (char*) p - *indxp; /* Dict_ptr subptr */ /* Dict_char sub char */ /* unsigned char length of information */ /* char * information */ + memcpy (&subptr, info, sizeof(Dict_ptr)); memcpy (&dc, info+sizeof(Dict_ptr), sizeof(Dict_char)); cmp = dc- *str; +#if CHECK + assert (dc > prev_char); + prev_char=dc; +#endif if (!cmp) { - Dict_ptr subptr; - void *pp; if (*++str == DICT_EOS) - { /* missing: consider change of userinfo length ... */ - if (memcmp (info+sizeof(Dict_char)+sizeof(Dict_ptr)+1, - userinfo, userlen)) + { + int xlen; + + xlen = info[sizeof(Dict_ptr)+sizeof(Dict_char)]; + if (xlen == userlen) { - memcpy (dict+sizeof(Dict_char)+sizeof(Dict_ptr)+1, + if (memcmp (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, + userinfo, userlen)) + { + dict_bf_touch (dict->dbf, ptr); + memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, + userinfo, userlen); + return 1; + } + return 2; + } + else if (xlen > userlen) + { + DICT_type(p) = 1; + info[sizeof(Dict_ptr)+sizeof(Dict_char)] = userlen; + memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, userinfo, userlen); dict_bf_touch (dict->dbf, ptr); + return 1; + } + if (DICT_size(p)+sizeof(Dict_char)+sizeof(Dict_ptr)+ + userlen >= + DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)) + { + assert (0); + clean_page (dict, ptr, p); + dict_ins (dict, str-1, ptr, userlen, userinfo); } + else + { + info = (char*)p + DICT_size(p); + memcpy (info, &subptr, sizeof(subptr)); + memcpy (info+sizeof(Dict_ptr), &dc, sizeof(Dict_char)); + info[sizeof(Dict_char)+sizeof(Dict_ptr)] = userlen; + memcpy (info+sizeof(Dict_char)+sizeof(Dict_ptr)+1, + userinfo, userlen); + *indxp = -DICT_size(p); + DICT_size(p) += sizeof(Dict_char)+sizeof(Dict_ptr) + +1+userlen; + DICT_type(p) = 1; + dict_bf_touch (dict->dbf, ptr); + } + if (xlen) + return 1; return 0; } else { - memcpy (&subptr, info, sizeof(subptr)); if (subptr == 0) { - subptr = new_page (dict, ptr, &pp); + subptr = new_page (dict, ptr, NULL); memcpy (info, &subptr, sizeof(subptr)); dict_bf_touch (dict->dbf, ptr); } - return dict_ins (dict, str, ptr, userlen, userinfo); + return dict_ins (dict, str, subptr, userlen, userinfo); } } else if(cmp > 0) @@ -350,15 +448,14 @@ static int dict_ins (Dict dict, const Dict_char *str, { if (DICT_type(p) == 1) { - clean_page (dict, p); - dict_ins (dict, str, ptr, userlen, userinfo); - return 0; + clean_page (dict, ptr, p); + dict_bf_touch (dict->dbf, ptr); + return dict_ins (dict, str, ptr, userlen, userinfo); } i = 0; do { - if (i > 0) - assert (0); + assert (i <= 1); if (split_page (dict, ptr, p)) { log (LOG_FATAL, "Unable to split page %d\n", ptr); @@ -368,11 +465,10 @@ static int dict_ins (Dict dict, const Dict_char *str, DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)) break; i++; - clean_page (dict, p); + clean_page (dict, ptr, p); } while (DICT_size(p)+slen+userlen > DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)); - dict_ins (dict, str, ptr, userlen, userinfo); - return 0; + return dict_ins (dict, str, ptr, userlen, userinfo); } if (cmp) { @@ -382,6 +478,17 @@ static int dict_ins (Dict dict, const Dict_char *str, - DICT_nodir(p)*sizeof(short)); for (; indxp1 != indxp; indxp1++) indxp1[0] = indxp1[1]; +#if CHECK + indxp1 = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + for (i = DICT_nodir (p); --i >= 0; --indxp1) + { + if (*indxp1 < 0) + { + info = (char*)p - *indxp1; + assert (info[sizeof(Dict_ptr)] > ' '); + } + } +#endif } info = (char*)p + DICT_size(p); memcpy (info, str, slen); @@ -391,21 +498,224 @@ static int dict_ins (Dict dict, const Dict_char *str, info += userlen; *indxp = DICT_size(p); -#if 0 - printf ("indxp[%d]\n", (char*) indxp - (char*) p); + DICT_size(p) = info- (char*) p; + dict_bf_touch (dict->dbf, ptr); + if (cmp) + return 0; + return 1; +} +/* return 0 if new */ +/* return 1 if before but change of info */ +/* return 2 if same as before */ + +#else +static int dict_ins (Dict dict, const Dict_char *str, + Dict_ptr back_ptr, int userlen, void *userinfo) +{ + int hi, lo, mid, i, slen, cmp = 1; + Dict_ptr ptr = back_ptr; + short *indxp; + char *info; + void *p; + + if (ptr == 0) + ptr = new_page (dict, back_ptr, &p); + else + dict_bf_readp (dict->dbf, ptr, &p); + + assert (p); + assert (ptr); + + mid = lo = 0; + hi = DICT_nodir(p)-1; + indxp = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + while (lo <= hi) + { + mid = (lo+hi)/2; + if (indxp[-mid] > 0) + { + info = (char*)p + indxp[-mid]; + cmp = dict_strcmp((Dict_char*) info, str); + if (!cmp) + { + info += (dict_strlen(info)+1)*sizeof(Dict_char); + /* consider change of userinfo length... */ + if (*info == userlen) + { + if (memcmp (info+1, userinfo, userlen)) + { + dict_bf_touch (dict->dbf, ptr); + memcpy (info+1, userinfo, userlen); + return 1; + } + return 2; + } + else if (*info > userlen) + { + DICT_type(p) = 1; + *info = userlen; + dict_bf_touch (dict->dbf, ptr); + memcpy (info+1, userinfo, userlen); + return 1; + } + else + DICT_type(p) = 1; + break; + } + } + else + { + Dict_char dc; + Dict_ptr subptr; + + info = (char*)p - indxp[-mid]; + memcpy (&dc, info+sizeof(Dict_ptr), sizeof(Dict_char)); + cmp = dc- *str; + if (!cmp) + { + memcpy (&subptr, info, sizeof(Dict_ptr)); + if (*++str == DICT_EOS) + { + int xlen; + + xlen = info[sizeof(Dict_ptr)+sizeof(Dict_char)]; + if (xlen == userlen) + { + if (memcmp (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, + userinfo, userlen)) + { + dict_bf_touch (dict->dbf, ptr); + memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, + userinfo, userlen); + return 1; + } + return 2; + } + else if (xlen > userlen) + { + DICT_type(p) = 1; + info[sizeof(Dict_ptr)+sizeof(Dict_char)] = userlen; + memcpy (info+sizeof(Dict_ptr)+sizeof(Dict_char)+1, + userinfo, userlen); + dict_bf_touch (dict->dbf, ptr); + return 1; + } + if (DICT_size(p)+sizeof(Dict_char)+sizeof(Dict_ptr)+ + userlen >= + DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)) + { + assert (0); + clean_page (dict, ptr, p); + dict_ins (dict, str-1, ptr, userlen, userinfo); + } + else + { + info = (char*)p + DICT_size(p); + memcpy (info, &subptr, sizeof(subptr)); + memcpy (info+sizeof(Dict_ptr), &dc, sizeof(Dict_char)); + info[sizeof(Dict_char)+sizeof(Dict_ptr)] = userlen; + memcpy (info+sizeof(Dict_char)+sizeof(Dict_ptr)+1, + userinfo, userlen); + indxp[-mid] = -DICT_size(p); + DICT_size(p) += sizeof(Dict_char)+sizeof(Dict_ptr) + +1+userlen; + DICT_type(p) = 1; + dict_bf_touch (dict->dbf, ptr); + } + if (xlen) + return 1; + return 0; + } + else + { + if (subptr == 0) + { + subptr = new_page (dict, ptr, NULL); + memcpy (info, &subptr, sizeof(subptr)); + dict_bf_touch (dict->dbf, ptr); + } + return dict_ins (dict, str, subptr, userlen, userinfo); + } + } + } + if (cmp < 0) + lo = mid+1; + else + hi = mid-1; + } + indxp = indxp-mid; + if (lo>hi && cmp < 0) + --indxp; + slen = (dict_strlen(str)+1)*sizeof(Dict_char); + if (DICT_size(p)+slen+userlen >= + DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)) /* overflow? */ + { + if (DICT_type(p) == 1) + { + clean_page (dict, ptr, p); + dict_bf_touch (dict->dbf, ptr); + return dict_ins (dict, str, ptr, userlen, userinfo); + } + i = 0; + do + { + assert (i <= 1); + if (split_page (dict, ptr, p)) + { + log (LOG_FATAL, "Unable to split page %d\n", ptr); + abort (); + } + if (DICT_size(p)+slen+userlen < + DICT_PAGESIZE - (1+DICT_nodir(p))*sizeof(short)) + break; + i++; + clean_page (dict, ptr, p); + } while (DICT_size(p)+slen+userlen > DICT_PAGESIZE - + (1+DICT_nodir(p))*sizeof(short)); + return dict_ins (dict, str, ptr, userlen, userinfo); + } + if (cmp) + { + short *indxp1; + (DICT_nodir(p))++; + indxp1 = (short*)((char*) p + DICT_PAGESIZE + - DICT_nodir(p)*sizeof(short)); + for (; indxp1 != indxp; indxp1++) + indxp1[0] = indxp1[1]; +#if CHECK + indxp1 = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + for (i = DICT_nodir (p); --i >= 0; --indxp1) + { + if (*indxp1 < 0) + { + info = (char*)p - *indxp1; + assert (info[sizeof(Dict_ptr)] > ' '); + } + } #endif + } + info = (char*)p + DICT_size(p); + memcpy (info, str, slen); + info += slen; + *info++ = userlen; + memcpy (info, userinfo, userlen); + info += userlen; + *indxp = DICT_size(p); DICT_size(p) = info- (char*) p; dict_bf_touch (dict->dbf, ptr); - return 0; + if (cmp) + return 0; + return 1; } +#endif int dict_insert (Dict dict, const Dict_char *str, int userlen, void *userinfo) { assert (dict->head.last > 0); if (dict->head.last == 1) - dict_ins (dict, str, 0, userlen, userinfo); + return dict_ins (dict, str, 0, userlen, userinfo); else - dict_ins (dict, str, 1, userlen, userinfo); - return 0; + return dict_ins (dict, str, 1, userlen, userinfo); } + diff --git a/include/dict.h b/include/dict.h index 61c7e19..b5347bf 100644 --- a/include/dict.h +++ b/include/dict.h @@ -4,7 +4,12 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: dict.h,v $ - * Revision 1.4 1994-09-01 17:44:40 adam + * Revision 1.5 1994-09-06 13:05:29 adam + * Further development of insertion. Some special cases are + * not properly handled yet! assert(0) are put here. The + * binary search in each page definitely reduce usr CPU. + * + * Revision 1.4 1994/09/01 17:44:40 adam * Work on insertion in dictionary. Not finished yet. * CVS ---------------------------------------------------------------------- * @@ -74,7 +79,7 @@ Dict_BFile dict_bf_open (const char *name, int block_size, int cache, int rw); int dict_bf_close (Dict_BFile dbf); #define DICT_MAGIC "dict00" -#define DICT_PAGESIZE 64 +#define DICT_PAGESIZE 8192 Dict dict_open (const char *name, int cache, int rw); int dict_close (Dict dict); -- 1.7.10.4