From: Adam Dickmeiss Date: Tue, 10 Oct 1995 12:24:38 +0000 (+0000) Subject: Temporary sort files are compressed. X-Git-Tag: ZEBRA.1.0~710 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=fc7107844c9ec8ea23e680ca4f3231923db4e9c5 Temporary sort files are compressed. --- diff --git a/index/extract.c b/index/extract.c index ea31d07..1830850 100644 --- a/index/extract.c +++ b/index/extract.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.20 1995-10-06 13:52:05 adam + * Revision 1.21 1995-10-10 12:24:38 adam + * Temporary sort files are compressed. + * + * Revision 1.20 1995/10/06 13:52:05 adam * Bug fixes. Handler may abort further scanning. * * Revision 1.19 1995/10/04 12:55:16 adam @@ -119,12 +122,73 @@ void key_open (int mem) exit (1); } } + +struct encode_info { + int sysno; + int seqno; + char buf[512]; +}; + +void encode_key_init (struct encode_info *i) +{ + i->sysno = 0; + i->seqno = 0; +} + +char *encode_key_int (int d, char *bp) +{ + if (d <= 63) + *bp++ = d; + else if (d <= 16383) + { + *bp++ = 64 + (d>>8); + *bp++ = d & 255; + } + else if (d <= 4194303) + { + *bp++ = 128 + (d>>16); + *bp++ = (d>>8) & 255; + *bp++ = d & 255; + } + else + { + *bp++ = 192 + (d>>24); + *bp++ = (d>>16) & 255; + *bp++ = (d>>8) & 255; + *bp++ = d & 255; + } + return bp; +} + +void encode_key_write (char *k, struct encode_info *i, FILE *outf) +{ + struct it_key key; + char *bp = i->buf; + + while ((*bp++ = *k++)) + ; + memcpy (&key, k+1, sizeof(struct it_key)); + bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp); + if (i->sysno != key.sysno) + { + i->sysno = key.sysno; + i->seqno = 0; + } + bp = encode_key_int (key.seqno - i->seqno, bp); + i->seqno = key.seqno; + if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) + { + logf (LOG_FATAL|LOG_ERRNO, "fwrite"); + exit (1); + } +} void key_flush (void) { FILE *outf; char out_fname[200]; char *prevcp, *cp; + struct encode_info encode_info; if (ptr_i <= 0) return; @@ -142,33 +206,19 @@ void key_flush (void) logf (LOG_LOG, "writing section %d", key_file_no); prevcp = cp = key_buf[ptr_top-ptr_i]; - if (fwrite (cp, strlen (cp)+2+sizeof(struct it_key), 1, outf) != 1) - { - logf (LOG_FATAL|LOG_ERRNO, "fwrite %s", out_fname); - exit (1); - } + encode_key_init (&encode_info); + encode_key_write (cp, &encode_info, outf); while (--ptr_i > 0) { cp = key_buf[ptr_top-ptr_i]; if (strcmp (cp, prevcp)) { - if (fwrite (cp, strlen (cp)+2+sizeof(struct it_key), 1, - outf) != 1) - { - logf (LOG_FATAL|LOG_ERRNO, "fwrite %s", out_fname); - exit (1); - } + encode_key_init (&encode_info); + encode_key_write (cp, &encode_info, outf); prevcp = cp; } else - { - cp = strlen (cp) + cp; - if (fwrite (cp, 2+sizeof(struct it_key), 1, outf) != 1) - { - logf (LOG_FATAL|LOG_ERRNO, "fwrite %s", out_fname); - exit (1); - } - } + encode_key_write (cp + strlen(cp), &encode_info, outf); } if (fclose (outf)) { diff --git a/index/index.h b/index/index.h index 08ced19..ece96af 100644 --- a/index/index.h +++ b/index/index.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: index.h,v $ - * Revision 1.15 1995-10-04 16:57:19 adam + * Revision 1.16 1995-10-10 12:24:38 adam + * Temporary sort files are compressed. + * + * Revision 1.15 1995/10/04 16:57:19 adam * Key input and merge sort in one pass. * * Revision 1.14 1995/09/29 14:01:40 adam @@ -84,9 +87,7 @@ void key_write (int cmd, struct it_key *k, const char *str); int key_compare (const void *p1, const void *p2); int key_qsort_compare (const void *p1, const void *p2); void key_logdump (int mask, const void *p); -void key_input (const char *dict_fname, const char *isam_fname, - const char *key_fname, int cache); -void key_input2 (const char *dict_fname, const char *isam_fname, +void key_input (const char *dict_fname, const char *isam_fname, int nkeys, int cache); int merge_sort (char **buf, int from, int to); diff --git a/index/kdump.c b/index/kdump.c index 899f508..93344e6 100644 --- a/index/kdump.c +++ b/index/kdump.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: kdump.c,v $ - * Revision 1.6 1995-09-29 14:01:42 adam + * Revision 1.7 1995-10-10 12:24:38 adam + * Temporary sort files are compressed. + * + * Revision 1.6 1995/09/29 14:01:42 adam * Bug fixes. * * Revision 1.5 1995/09/11 13:09:35 adam @@ -28,24 +31,68 @@ #include #include #include +#include #include #include "index.h" char *prog; -static int read_one (FILE *inf, char *name, char *key) + +int key_file_decode (FILE *f) +{ + int c, d; + + c = getc (f); + switch (c & 192) + { + case 0: + d = c; + break; + case 64: + d = ((c&63) << 8) + (getc (f) & 0xff); + break; + case 128: + d = ((c&63) << 8) + (getc (f) & 0xff); + d = (d << 8) + (getc (f) & 0xff); + break; + case 192: + d = ((c&63) << 8) + (getc (f) & 0xff); + d = (d << 8) + (getc (f) & 0xff); + d = (d << 8) + (getc (f) & 0xff); + break; + } + return d; +} + + +static int read_one (FILE *inf, char *name, char *key, struct it_key *prevk) { int c; int i = 0; + struct it_key itkey; do { if ((c=getc(inf)) == EOF) return 0; name[i++] = c; } while (c); - for (i = 0; i 1) + prevk->sysno = 0; + c = key_file_decode (inf); + key[0] = c & 1; + c = c >> 1; + itkey.sysno = c + prevk->sysno; + if (c) + { + prevk->sysno = itkey.sysno; + prevk->seqno = 0; + } + c = key_file_decode (inf); + itkey.seqno = c + prevk->seqno; + prevk->seqno = itkey.seqno; + + memcpy (key+1, &itkey, sizeof(itkey)); return 1; } @@ -57,6 +104,10 @@ int main (int argc, char **argv) char key_string[IT_MAX_WORD]; char key_info[256]; FILE *inf; + struct it_key prevk; + + prevk.sysno = 0; + prevk.seqno = 0; prog = *argv; while ((ret = options ("v:", argv, argc, &arg)) != -2) @@ -85,21 +136,15 @@ int main (int argc, char **argv) logf (LOG_FATAL|LOG_ERRNO, "fopen %s", key_fname); exit (1); } - while (read_one (inf, key_string, key_info)) + while (read_one (inf, key_string, key_info, &prevk)) { struct it_key k; int op; op = key_info[0]; memcpy (&k, 1+key_info, sizeof(k)); -#if IT_KEY_HAVE_SEQNO printf ("%7d op=%d s=%-5d %s\n", k.sysno, op, k.seqno, key_string); -#else - printf ("%7d op=%d f=%-3d %s\n", k.sysno, op, k.freq, - key_string); - -#endif } if (fclose (inf)) { diff --git a/index/kinput.c b/index/kinput.c index 71f5372..e9d467a 100644 --- a/index/kinput.c +++ b/index/kinput.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: kinput.c,v $ - * Revision 1.8 1995-10-04 16:57:19 adam + * Revision 1.9 1995-10-10 12:24:39 adam + * Temporary sort files are compressed. + * + * Revision 1.8 1995/10/04 16:57:19 adam * Key input and merge sort in one pass. * * Revision 1.7 1995/10/02 15:18:52 adam @@ -52,120 +55,6 @@ static int no_updates = 0; static int no_insertions = 0; static int no_iterations = 0; -static int read_one (FILE *inf, char *name, char *key) -{ - int c; - int i = 0; - do - { - if ((c=getc(inf)) == EOF) - return 0; - name[i++] = c; - } while (c); - for (i = 0; i= key_buf_size) - { - char *new_key_buf; - new_key_buf = xmalloc (key_buf_size + INP_BUF_ADD); - memcpy (new_key_buf, key_buf, key_buf_size); - key_buf_size += INP_BUF_ADD; - xfree (key_buf); - key_buf = new_key_buf; - } - } - no_diffs++; - nmemb = key_buf_ptr / KEY_SIZE; - assert (nmemb*KEY_SIZE == key_buf_ptr); - if ((info = dict_lookup (dict, cur_name))) - { - ISAM_P isam_p, isam_p2; - logf (LOG_DEBUG, "updating %s", cur_name); - no_updates++; - memcpy (&isam_p, info+1, sizeof(ISAM_P)); - isam_p2 = is_merge (isam, isam_p, nmemb, key_buf); - if (isam_p2 != isam_p) - dict_insert (dict, cur_name, sizeof(ISAM_P), &isam_p2); - } - else - { - ISAM_P isam_p; - logf (LOG_DEBUG, "inserting %s", cur_name); - no_insertions++; - isam_p = is_merge (isam, 0, nmemb, key_buf); - dict_insert (dict, cur_name, sizeof(ISAM_P), &isam_p); - } - memcpy (key_buf, next_key, KEY_SIZE); - strcpy (cur_name, next_name); - } - fclose (inf); - return 0; -} - -void key_input (const char *dict_fname, const char *isam_fname, - const char *key_fname, int cache) -{ - Dict dict; - ISAM isam; - - dict = dict_open (dict_fname, cache, 1); - if (!dict) - { - logf (LOG_FATAL, "dict_open fail of `%s'", dict_fname); - exit (1); - } - isam = is_open (isam_fname, key_compare, 1, sizeof(struct it_key)); - if (!isam) - { - logf (LOG_FATAL, "is_open fail of `%s'", isam_fname); - exit (1); - } - inp (dict, isam, key_fname); - dict_close (dict); - is_close (isam); - logf (LOG_LOG, "Iterations . . .%7d", no_iterations); - logf (LOG_LOG, "Distinct words .%7d", no_diffs); - logf (LOG_LOG, "Updates. . . . .%7d", no_updates); - logf (LOG_LOG, "Insertions . . .%7d", no_insertions); -} - - struct key_file { int no; /* file no */ off_t offset; /* file offset */ @@ -174,6 +63,8 @@ struct key_file { size_t chunk; /* number of bytes allocated */ size_t buf_ptr; /* current position in buffer */ char *prev_name; /* last word read */ + int sysno; /* last sysno */ + int seqno; /* last seqno */ }; void key_file_chunk_read (struct key_file *f) @@ -215,6 +106,8 @@ struct key_file *key_file_init (int no, int chunk) struct key_file *f; f = xmalloc (sizeof(*f)); + f->sysno = 0; + f->seqno = 0; f->no = no; f->chunk = chunk; f->offset = 0; @@ -239,9 +132,36 @@ int key_file_getc (struct key_file *f) return EOF; } +int key_file_decode (struct key_file *f) +{ + int c, d; + + c = key_file_getc (f); + switch (c & 192) + { + case 0: + d = c; + break; + case 64: + d = ((c&63) << 8) + (key_file_getc (f) & 0xff); + break; + case 128: + d = ((c&63) << 8) + (key_file_getc (f) & 0xff); + d = (d << 8) + (key_file_getc (f) & 0xff); + break; + case 192: + d = ((c&63) << 8) + (key_file_getc (f) & 0xff); + d = (d << 8) + (key_file_getc (f) & 0xff); + d = (d << 8) + (key_file_getc (f) & 0xff); + break; + } + return d; +} + int key_file_read (struct key_file *f, char *key) { - int i, j, c; + int i, d, c; + struct it_key itkey; c = key_file_getc (f); if (c == 0) @@ -258,10 +178,22 @@ int key_file_read (struct key_file *f, char *key) while ((key[i++] = key_file_getc (f))) ; strcpy (f->prev_name, key); + f->sysno = 0; } - for (j = KEY_SIZE; --j >= 0; ) - key[i++] = key_file_getc (f); - return i; + d = key_file_decode (f); + key[i++] = d & 1; + d = d >> 1; + itkey.sysno = d + f->sysno; + if (d) + { + f->sysno = itkey.sysno; + f->seqno = 0; + } + d = key_file_decode (f); + itkey.seqno = d + f->seqno; + f->seqno = itkey.seqno; + memcpy (key + i, &itkey, sizeof(struct it_key)); + return i + sizeof (struct it_key); } struct heap_info { @@ -310,12 +242,8 @@ static void key_heap_delete (struct heap_info *hi) assert (hi->heapnum > 0); -#if 1 key_heap_swap (hi, 1, hi->heapnum); hi->heapnum--; -#else - hi->ptr[1] = hi->ptr[hi->heapnum--]; -#endif while (child <= hi->heapnum) { if (child < hi->heapnum && (*hi->cmp)(&hi->info.buf[hi->ptr[child]], @@ -435,7 +363,7 @@ int heap_inp (Dict dict, ISAM isam, struct heap_info *hi) return 0; } -void key_input2 (const char *dict_fname, const char *isam_fname, +void key_input (const char *dict_fname, const char *isam_fname, int nkeys, int cache) { Dict dict; diff --git a/index/main.c b/index/main.c index 7c56296..e0fa497 100644 --- a/index/main.c +++ b/index/main.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: main.c,v $ - * Revision 1.12 1995-10-04 16:57:20 adam + * Revision 1.13 1995-10-10 12:24:39 adam + * Temporary sort files are compressed. + * + * Revision 1.12 1995/10/04 16:57:20 adam * Key input and merge sort in one pass. * * Revision 1.11 1995/09/29 14:01:45 adam @@ -61,7 +64,6 @@ int main (int argc, char **argv) char *base_name = NULL; char *base_path = NULL; int nsections; - char **mbuf; prog = *argv; while ((ret = options ("r:v:m:", argv, argc, &arg)) != -2) @@ -130,17 +132,8 @@ int main (int argc, char **argv) nsections = key_close (); if (!nsections) exit (0); -#if 0 - logf (LOG_LOG, "Merge sorting"); - mbuf = xmalloc (100000); - merge_sort (mbuf, 1, nsections+1); - xfree (mbuf); - logf (LOG_LOG, "Input"); - key_input (FNAME_WORD_DICT, FNAME_WORD_ISAM, "keys1.tmp", 60); -#else logf (LOG_LOG, "Input"); - key_input2 (FNAME_WORD_DICT, FNAME_WORD_ISAM, nsections, 60); -#endif + key_input (FNAME_WORD_DICT, FNAME_WORD_ISAM, nsections, 60); exit (0); }