X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=index%2Fextract.c;h=829bb68df75758c4a69bd3398741ba2f183fb720;hp=aa695502246c84e03bf02c969aeae1dcaf557b18;hb=5c693d36af8be6f6642257160b3c6441d2e2d762;hpb=0ca8a9894b43c838980d650dd265f7a3da4ed219 diff --git a/index/extract.c b/index/extract.c index aa69550..829bb68 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,10 +1,29 @@ /* - * Copyright (C) 1994-1996, Index Data I/S + * Copyright (C) 1994-1998, Index Data I/S * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.73 1997-09-04 13:57:20 adam + * Revision 1.78 1998-02-10 12:03:05 adam + * Implemented Sort. + * + * Revision 1.77 1998/01/12 15:04:08 adam + * The test option (-s) only uses read-lock (and not write lock). + * + * Revision 1.76 1997/10/27 14:33:04 adam + * Moved towards generic character mapping depending on "structure" + * field in abstract syntax file. Fixed a few memory leaks. Fixed + * bug with negative integers when doing searches with relational + * operators. + * + * Revision 1.75 1997/09/17 12:19:12 adam + * Zebra version corresponds to YAZ version 1.4. + * Changed Zebra server so that it doesn't depend on global common_resource. + * + * Revision 1.74 1997/09/09 13:38:06 adam + * Partial port to WIN95/NT. + * + * Revision 1.73 1997/09/04 13:57:20 adam * New file extract/retrieve method tellf (added). * Added O_BINARY for open calls. * @@ -266,10 +285,16 @@ */ #include #include +#ifdef WINDOWS +#include +#else #include +#endif #include #include +#include +#include #include "index.h" #include "zinfo.h" @@ -277,6 +302,7 @@ static Dict matchDict; static Records records = NULL; +static SortIdx sortIdx = NULL; static char **key_buf; static size_t ptr_top; @@ -303,7 +329,7 @@ static void logRecord (int showFlag) } } -void key_open (int mem) +int key_open (BFiles bfs, int mem, int rw) { if (!mem) mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024; @@ -316,16 +342,27 @@ void key_open (int mem) key_buf_used = 0; key_file_no = 0; - if (!(matchDict = dict_open (GMATCH_DICT, 50, 1))) + if (!(matchDict = dict_open (bfs, GMATCH_DICT, 50, rw))) { logf (LOG_FATAL, "dict_open fail of %s", GMATCH_DICT); - exit (1); + return -1; } assert (!records); - records = rec_open (1); -#if 1 - zti = zebTargetInfo_open (records, 1); -#endif + records = rec_open (bfs, rw); + if (!records) + { + dict_close (matchDict); + return -1; + } + zti = zebTargetInfo_open (records, rw); + if (!zti) + { + rec_close (&records); + dict_close (matchDict); + return -1; + } + sortIdx = sortIdx_open (bfs, 1); + return 0; } struct encode_info { @@ -430,7 +467,7 @@ void key_flush (void) if (!(outf = fopen (out_fname, "w"))) { - logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname); + logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); } logf (LOG_LOG, "writing section %d", key_file_no); @@ -456,7 +493,7 @@ void key_flush (void) if (!(outf = fopen (out_fname, "w"))) { - logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname); + logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); } logf (LOG_LOG, "writing section %d", key_file_no); @@ -496,7 +533,7 @@ void key_flush (void) key_buf_used = 0; } -int key_close (void) +int key_close () { key_flush (); xfree (key_buf); @@ -505,19 +542,29 @@ int key_close (void) #endif rec_close (&records); dict_close (matchDict); + sortIdx_close (sortIdx); logRecord (1); return key_file_no; } -static void wordInit (RecWord *p) +static void wordInit (struct recExtractCtrl *p, RecWord *w) { - p->attrSet = 1; - p->attrUse = 1016; - p->which = Word_String; + w->zebra_maps = p->zebra_maps; + w->attrSet = 1; + w->attrUse = 1016; + w->reg_type = 'w'; } -struct recKeys { +static struct sortKey { + char *string; + int length; + int attrSet; + int attrUse; + struct sortKey *next; +} *sortKeys = NULL; + +static struct recKeys { int buf_used; int buf_max; char *buf; @@ -526,12 +573,11 @@ struct recKeys { int prevSeqNo; } reckeys; -static void addRecordKey (const RecWord *p) +static void addIndexString (RecWord *p, const char *string, int length) { char *dst; char attrSet; short attrUse; - size_t i; int lead = 0; int diff = 0; @@ -578,19 +624,9 @@ static void addRecordKey (const RecWord *p) memcpy (dst, &attrUse, sizeof(attrUse)); dst += sizeof(attrUse); } - switch (p->which) - { - case Word_String: - *dst++ = 'w'; - break; - case Word_Phrase: - *dst++ = 'p'; - break; - case Word_Numeric: - *dst++ = 'n'; - } - for (i = 0; p->u.string[i] && i < IT_MAX_WORD-3; i++) - *dst++ = p->u.string[i]; + *dst++ = p->reg_type; + memcpy (dst, string, length); + dst += length; *dst++ = '\0'; if (!diff) @@ -599,6 +635,149 @@ static void addRecordKey (const RecWord *p) dst += sizeof(p->seqno); } reckeys.buf_used = dst - reckeys.buf; + (p->seqno)++; +} + +static void addSortString (RecWord *p, const char *string, int length) +{ + struct sortKey *sk; + + for (sk = sortKeys; sk; sk = sk->next) + if (sk->attrSet == p->attrSet && sk->attrUse == p->attrUse) + return; + + sk = xmalloc (sizeof(*sk)); + sk->next = sortKeys; + sortKeys = sk; + + sk->string = xmalloc (p->length); + sk->length = p->length; + memcpy (sk->string, p->string, p->length); + sk->attrSet = p->attrSet; + sk->attrUse = p->attrUse; +} + +static void addString (RecWord *p, const char *string, int length) +{ + if (zebra_maps_is_sort (p->zebra_maps, p->reg_type)) + addSortString (p, string, length); + else + addIndexString (p, string, length); +} + +static void addIncompleteField (RecWord *p) +{ + const char *b = p->string; + int remain = p->length; + const char **map = 0; + + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + + while (map) + { + char buf[IT_MAX_WORD+1]; + int i, remain; + + /* Skip spaces */ + while (map && *map && **map == *CHR_SPACE) + { + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!map) + break; + i = 0; + while (map && *map && **map != *CHR_SPACE) + { + const char *cp = *map; + + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!i) + return; + addString (p, buf, i); + } +} + +static void addCompleteField (RecWord *p) +{ + const char *b = p->string; + char buf[IT_MAX_WORD+1]; + const char **map = 0; + int i = 0, remain = p->length; + + if (remain > 0) + map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain); + + while (remain > 0 && i < IT_MAX_WORD) + { + while (map && *map && **map == *CHR_SPACE) + { + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!map) + break; + + if (i && i < IT_MAX_WORD) + buf[i++] = *CHR_SPACE; + while (map && *map && **map != *CHR_SPACE) + { + const char *cp = *map; + + if (i >= IT_MAX_WORD) + break; + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, + remain); + else + map = 0; + } + } + if (!i) + return; + addString (p, buf, i); +} + +static void addRecordKey (RecWord *p) +{ + if (zebra_maps_is_complete (p->zebra_maps, p->reg_type)) + addCompleteField (p); + else + addIncompleteField(p); +} + +static void flushSortKeys (SYSNO sysno, int cmd) +{ + struct sortKey *sk = sortKeys; + + sortIdx_sysno (sortIdx, sysno); + while (sk) + { + struct sortKey *sk_next = sk->next; + sortIdx_type (sortIdx, sk->attrUse); + sortIdx_add (sortIdx, sk->string, sk->length); + xfree (sk->string); + xfree (sk); + sk = sk_next; + } + sortKeys = NULL; } static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, @@ -978,6 +1157,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, extractCtrl.subType = subType; extractCtrl.init = wordInit; extractCtrl.add = addRecordKey; + extractCtrl.dh = rGroup->dh; reckeys.buf_used = 0; reckeys.prevAttrUse = -1; @@ -990,9 +1170,9 @@ static int recordExtract (SYSNO *sysno, const char *fname, extractCtrl.seekf = file_seek; extractCtrl.tellf = file_tell; extractCtrl.endf = file_end; - extractCtrl.map_chrs_input = map_chrs_input; - extractCtrl.flagShowRecords = rGroup->flagShowRecords; - if (rGroup->flagShowRecords) + extractCtrl.zebra_maps = rGroup->zebra_maps; + extractCtrl.flagShowRecords = !rGroup->flagRw; + if (!rGroup->flagRw) printf ("File: %s %ld\n", fname, (long) recordOffset); logInfo.fname = fname; @@ -1007,8 +1187,8 @@ static int recordExtract (SYSNO *sysno, const char *fname, if (r) { /* error occured during extraction ... */ - if (!rGroup->flagShowRecords && - records_processed < rGroup->fileVerboseLimit) + if (rGroup->flagRw && + records_processed < rGroup->fileVerboseLimit) { logf (LOG_WARN, "fail %s %s %ld code = %d", rGroup->recordType, fname, (long) recordOffset, r); @@ -1019,7 +1199,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, { /* the extraction process returned no information - the record is probably empty - unless flagShowRecords is in use */ - if (rGroup->flagShowRecords) + if (!rGroup->flagRw) return 1; logf (LOG_WARN, "No keys generated for file %s", fname); logf (LOG_WARN, " The file is probably empty"); @@ -1059,7 +1239,9 @@ static int recordExtract (SYSNO *sysno, const char *fname, /* new record */ if (deleteFlag) { - logf (LOG_LOG, "Cannot delete new record"); + logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType, + fname, (long) recordOffset); + logf (LOG_WARN, "cannot delete record above (seems new)"); return 1; } if (records_processed < rGroup->fileVerboseLimit) @@ -1073,6 +1255,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, dict_insert (matchDict, matchStr, sizeof(*sysno), sysno); } flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName); + flushSortKeys (*sysno, 1); records_inserted++; } @@ -1085,6 +1268,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, assert (rec); delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; + flushSortKeys (*sysno, 0); flushRecordKeys (*sysno, 0, &delkeys, rec->info[recInfo_databaseName]); if (deleteFlag) {