X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=32aed3667db1a5ce7c01f792eaee843b3fba5767;hb=ce3907338568fce46c5751e7e1091a5ad1c8e291;hp=03208017d91f4900b6e624d38678f926dd035160;hpb=edf09fc5529eae3e8214a432058b4c07b2b8d2f9;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index 0320801..32aed36 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,10 +1,43 @@ /* - * Copyright (C) 1994-1996, Index Data I/S + * Copyright (C) 1994-1998, Index Data I/S * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.71 1997-07-15 16:28:41 adam + * Revision 1.80 1998-03-05 08:45:11 adam + * New result set model and modular ranking system. Moved towards + * descent server API. System information stored as "SGML" records. + * + * Revision 1.79 1998/02/17 10:32:52 adam + * Fixed bug: binary files weren't opened with flag b on NT. + * + * Revision 1.78 1998/02/10 12:03:05 adam + * Implemented Sort. + * + * Revision 1.77 1998/01/12 15:04:08 adam + * The test option (-s) only uses read-lock (and not write lock). + * + * Revision 1.76 1997/10/27 14:33:04 adam + * Moved towards generic character mapping depending on "structure" + * field in abstract syntax file. Fixed a few memory leaks. Fixed + * bug with negative integers when doing searches with relational + * operators. + * + * Revision 1.75 1997/09/17 12:19:12 adam + * Zebra version corresponds to YAZ version 1.4. + * Changed Zebra server so that it doesn't depend on global common_resource. + * + * Revision 1.74 1997/09/09 13:38:06 adam + * Partial port to WIN95/NT. + * + * Revision 1.73 1997/09/04 13:57:20 adam + * New file extract/retrieve method tellf (added). + * Added O_BINARY for open calls. + * + * Revision 1.72 1997/07/15 16:32:29 adam + * Bug fix: Match handler didn't terminate the resulting string! + * + * Revision 1.71 1997/07/15 16:28:41 adam * Bug fix: storeData didn't work with files with multiple records. * Bug fix: fixed memory management with records; not really well * thought through. @@ -259,10 +292,16 @@ */ #include #include +#ifdef WINDOWS +#include +#else #include +#endif #include #include +#include +#include #include "index.h" #include "zinfo.h" @@ -270,6 +309,7 @@ static Dict matchDict; static Records records = NULL; +static SortIdx sortIdx = NULL; static char **key_buf; static size_t ptr_top; @@ -282,7 +322,7 @@ static int records_updated = 0; static int records_deleted = 0; static int records_processed = 0; -static ZebTargetInfo *zti = NULL; +static ZebraExplainInfo zti = NULL; static void logRecord (int showFlag) { @@ -296,7 +336,7 @@ static void logRecord (int showFlag) } } -void key_open (int mem) +int key_open (BFiles bfs, int mem, int rw, data1_handle dh) { if (!mem) mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024; @@ -309,16 +349,27 @@ void key_open (int mem) key_buf_used = 0; key_file_no = 0; - if (!(matchDict = dict_open (GMATCH_DICT, 50, 1))) + if (!(matchDict = dict_open (bfs, GMATCH_DICT, 50, rw))) { logf (LOG_FATAL, "dict_open fail of %s", GMATCH_DICT); - exit (1); + return -1; } assert (!records); - records = rec_open (1); -#if 1 - zti = zebTargetInfo_open (records, 1); -#endif + records = rec_open (bfs, rw); + if (!records) + { + dict_close (matchDict); + return -1; + } + zti = zebraExplain_open (records, dh, rw); + if (!zti) + { + rec_close (&records); + dict_close (matchDict); + return -1; + } + sortIdx = sortIdx_open (bfs, 1); + return 0; } struct encode_info { @@ -421,9 +472,9 @@ void key_flush (void) qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_qsort_compare); getFnameTmp (out_fname, key_file_no); - if (!(outf = fopen (out_fname, "w"))) + if (!(outf = fopen (out_fname, "wb"))) { - logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname); + logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); } logf (LOG_LOG, "writing section %d", key_file_no); @@ -447,9 +498,9 @@ void key_flush (void) qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare); getFnameTmp (out_fname, key_file_no); - if (!(outf = fopen (out_fname, "w"))) + if (!(outf = fopen (out_fname, "wb"))) { - logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname); + logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); } logf (LOG_LOG, "writing section %d", key_file_no); @@ -489,28 +540,38 @@ void key_flush (void) key_buf_used = 0; } -int key_close (void) +int key_close (int rw) { key_flush (); xfree (key_buf); -#if 1 - zebTargetInfo_close (zti, 1); -#endif + if (rw) + zebraExplain_runNumberIncrement (zti, 1); + zebraExplain_close (zti, rw); rec_close (&records); dict_close (matchDict); + sortIdx_close (sortIdx); logRecord (1); return key_file_no; } -static void wordInit (RecWord *p) +static void wordInit (struct recExtractCtrl *p, RecWord *w) { - p->attrSet = 1; - p->attrUse = 1016; - p->which = Word_String; + w->zebra_maps = p->zebra_maps; + w->attrSet = 1; + w->attrUse = 1016; + w->reg_type = 'w'; } -struct recKeys { +static struct sortKey { + char *string; + int length; + int attrSet; + int attrUse; + struct sortKey *next; +} *sortKeys = NULL; + +static struct recKeys { int buf_used; int buf_max; char *buf; @@ -519,12 +580,11 @@ struct recKeys { int prevSeqNo; } reckeys; -static void addRecordKey (const RecWord *p) +static void addIndexString (RecWord *p, const char *string, int length) { char *dst; char attrSet; short attrUse; - size_t i; int lead = 0; int diff = 0; @@ -571,19 +631,9 @@ static void addRecordKey (const RecWord *p) memcpy (dst, &attrUse, sizeof(attrUse)); dst += sizeof(attrUse); } - switch (p->which) - { - case Word_String: - *dst++ = 'w'; - break; - case Word_Phrase: - *dst++ = 'p'; - break; - case Word_Numeric: - *dst++ = 'n'; - } - for (i = 0; p->u.string[i] && i < IT_MAX_WORD-3; i++) - *dst++ = p->u.string[i]; + *dst++ = p->reg_type; + memcpy (dst, string, length); + dst += length; *dst++ = '\0'; if (!diff) @@ -592,6 +642,150 @@ static void addRecordKey (const RecWord *p) dst += sizeof(p->seqno); } reckeys.buf_used = dst - reckeys.buf; + (p->seqno)++; +} + +static void addSortString (RecWord *p, const char *string, int length) +{ + struct sortKey *sk; + + for (sk = sortKeys; sk; sk = sk->next) + if (sk->attrSet == p->attrSet && sk->attrUse == p->attrUse) + return; + + sk = xmalloc (sizeof(*sk)); + sk->next = sortKeys; + sortKeys = sk; + + sk->string = xmalloc (p->length); + sk->length = p->length; + memcpy (sk->string, p->string, p->length); + sk->attrSet = p->attrSet; + sk->attrUse = p->attrUse; +} + +static void addString (RecWord *p, const char *string, int length) +{ + assert (length > 0); + if (zebra_maps_is_sort (p->zebra_maps, p->reg_type)) + addSortString (p, string, length); + else + addIndexString (p, string, length); +} + +static void addIncompleteField (RecWord *p) +{ + const char *b = p->string; + int remain = p->length; + const char **map = 0; + + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + + while (map) + { + char buf[IT_MAX_WORD+1]; + int i, remain; + + /* Skip spaces */ + while (map && *map && **map == *CHR_SPACE) + { + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!map) + break; + i = 0; + while (map && *map && **map != *CHR_SPACE) + { + const char *cp = *map; + + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!i) + return; + addString (p, buf, i); + } +} + +static void addCompleteField (RecWord *p) +{ + const char *b = p->string; + char buf[IT_MAX_WORD+1]; + const char **map = 0; + int i = 0, remain = p->length; + + if (remain > 0) + map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain); + + while (remain > 0 && i < IT_MAX_WORD) + { + while (map && *map && **map == *CHR_SPACE) + { + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain); + else + map = 0; + } + if (!map) + break; + + if (i && i < IT_MAX_WORD) + buf[i++] = *CHR_SPACE; + while (map && *map && **map != *CHR_SPACE) + { + const char *cp = *map; + + if (i >= IT_MAX_WORD) + break; + while (i < IT_MAX_WORD && *cp) + buf[i++] = *(cp++); + remain = p->length - (b - p->string); + if (remain > 0) + map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, + remain); + else + map = 0; + } + } + if (!i) + return; + addString (p, buf, i); +} + +static void addRecordKey (RecWord *p) +{ + if (zebra_maps_is_complete (p->zebra_maps, p->reg_type)) + addCompleteField (p); + else + addIncompleteField(p); +} + +static void flushSortKeys (SYSNO sysno, int cmd) +{ + struct sortKey *sk = sortKeys; + + sortIdx_sysno (sortIdx, sysno); + while (sk) + { + struct sortKey *sk_next = sk->next; + sortIdx_type (sortIdx, sk->attrUse); + sortIdx_add (sortIdx, sk->string, sk->length); + xfree (sk->string); + xfree (sk); + sk = sk_next; + } + sortKeys = NULL; } static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, @@ -602,11 +796,12 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, int seqno = 0; int off = 0; - if (zebTargetInfo_curDatabase (zti, databaseName)) + if (zebraExplain_curDatabase (zti, databaseName)) { - if (zebTargetInfo_newDatabase (zti, databaseName)) + if (zebraExplain_newDatabase (zti, databaseName)) abort (); } + zebraExplain_recordCountIncrement (zti, cmd ? 1 : -1); while (off < reckeys->buf_used) { const char *src = reckeys->buf + off; @@ -630,9 +825,9 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, ++ptr_i; key_buf[ptr_top-ptr_i] = (char*)key_buf + key_buf_used; - ch = zebTargetInfo_lookupSU (zti, attrSet, attrUse); + ch = zebraExplain_lookupSU (zti, attrSet, attrUse); if (ch < 0) - ch = zebTargetInfo_addSU (zti, attrSet, attrUse); + ch = zebraExplain_addSU (zti, attrSet, attrUse); assert (ch > 0); ((char*) key_buf) [key_buf_used++] = ch; while (*src) @@ -658,7 +853,7 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, } static const char **searchRecordKey (struct recKeys *reckeys, - int attrSetS, int attrUseS) + int attrSetS, int attrUseS) { static const char *ws[32]; int off = 0; @@ -749,6 +944,12 @@ static off_t file_seek (void *handle, off_t offset) return lseek (p->fd, offset, SEEK_SET); } +static off_t file_tell (void *handle) +{ + struct file_read_info *p = handle; + return p->file_offset; +} + static int file_read (void *handle, char *buf, size_t count) { struct file_read_info *p = handle; @@ -923,6 +1124,7 @@ static char *fileMatchStr (struct recKeys *reckeys, struct recordGroup *rGroup, fname, rGroup->groupName ? rGroup->groupName : "none"); return NULL; } + *dst = '\0'; return dstBuf; } @@ -950,6 +1152,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, char *subType) { struct recExtractCtrl extractCtrl; + RecordAttr *recordAttr; int r; char *matchStr; SYSNO sysnotmp; @@ -964,6 +1167,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, extractCtrl.subType = subType; extractCtrl.init = wordInit; extractCtrl.add = addRecordKey; + extractCtrl.dh = rGroup->dh; reckeys.buf_used = 0; reckeys.prevAttrUse = -1; @@ -974,10 +1178,11 @@ static int recordExtract (SYSNO *sysno, const char *fname, extractCtrl.offset = recordOffset; extractCtrl.readf = file_read; extractCtrl.seekf = file_seek; + extractCtrl.tellf = file_tell; extractCtrl.endf = file_end; - extractCtrl.map_chrs_input = map_chrs_input; - extractCtrl.flagShowRecords = rGroup->flagShowRecords; - if (rGroup->flagShowRecords) + extractCtrl.zebra_maps = rGroup->zebra_maps; + extractCtrl.flagShowRecords = !rGroup->flagRw; + if (!rGroup->flagRw) printf ("File: %s %ld\n", fname, (long) recordOffset); logInfo.fname = fname; @@ -992,8 +1197,8 @@ static int recordExtract (SYSNO *sysno, const char *fname, if (r) { /* error occured during extraction ... */ - if (!rGroup->flagShowRecords && - records_processed < rGroup->fileVerboseLimit) + if (rGroup->flagRw && + records_processed < rGroup->fileVerboseLimit) { logf (LOG_WARN, "fail %s %s %ld code = %d", rGroup->recordType, fname, (long) recordOffset, r); @@ -1004,7 +1209,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, { /* the extraction process returned no information - the record is probably empty - unless flagShowRecords is in use */ - if (rGroup->flagShowRecords) + if (!rGroup->flagRw) return 1; logf (LOG_WARN, "No keys generated for file %s", fname); logf (LOG_WARN, " The file is probably empty"); @@ -1044,20 +1249,26 @@ static int recordExtract (SYSNO *sysno, const char *fname, /* new record */ if (deleteFlag) { - logf (LOG_LOG, "Cannot delete new record"); + logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType, + fname, (long) recordOffset); + logf (LOG_WARN, "cannot delete record above (seems new)"); return 1; } if (records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "add %s %s %ld", rGroup->recordType, fname, (long) recordOffset); rec = rec_new (records); + *sysno = rec->sysno; + recordAttr = rec_init_attr (zti, rec); + if (matchStr) { dict_insert (matchDict, matchStr, sizeof(*sysno), sysno); } flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName); + flushSortKeys (*sysno, 1); records_inserted++; } @@ -1068,8 +1279,20 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec = rec_get (records, *sysno); assert (rec); + + recordAttr = rec_init_attr (zti, rec); + + if (recordAttr->runNumber == zebraExplain_runNumberIncrement (zti, 0)) + { + logf (LOG_LOG, "skipped %s %s %ld", rGroup->recordType, + fname, (long) recordOffset); + rec_rm (&rec); + logRecord (0); + return 1; + } delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; + flushSortKeys (*sysno, 0); flushRecordKeys (*sysno, 0, &delkeys, rec->info[recInfo_databaseName]); if (deleteFlag) { @@ -1090,6 +1313,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, dict_delete (matchDict, matchStr); rec_del (records, &rec); } + rec_rm (&rec); logRecord (0); return 1; } @@ -1144,25 +1368,33 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec->size[recInfo_delKeys] = 0; } + /* save file size of original record */ + zebraExplain_recordBytesIncrement (zti, - recordAttr->recordSize); + recordAttr->recordSize = fi->file_moffset - recordOffset; + if (!recordAttr->recordSize) + recordAttr->recordSize = fi->file_max - recordOffset; + zebraExplain_recordBytesIncrement (zti, recordAttr->recordSize); + + /* set run-number for this record */ + recordAttr->runNumber = zebraExplain_runNumberIncrement (zti, 0); + /* update store data */ xfree (rec->info[recInfo_storeData]); if (rGroup->flagStoreData == 1) { - int size = fi->file_moffset - recordOffset; - if (!size) - size = fi->file_max - recordOffset; - rec->size[recInfo_storeData] = size; - rec->info[recInfo_storeData] = xmalloc (size); + rec->size[recInfo_storeData] = recordAttr->recordSize; + rec->info[recInfo_storeData] = xmalloc (recordAttr->recordSize); if (lseek (fi->fd, recordOffset, SEEK_SET) < 0) { logf (LOG_ERRNO|LOG_FATAL, "seek to %ld in %s", fname, (long) recordOffset); exit (1); } - if (read (fi->fd, rec->info[recInfo_storeData], size) < size) + if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize) + < recordAttr->recordSize) { logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s", - fi->file_max, fname); + recordAttr->recordSize, fname); exit (1); } } @@ -1177,11 +1409,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec_strdup (rGroup->databaseName, &rec->size[recInfo_databaseName]); /* update offset */ - xfree (rec->info[recInfo_offset]); - - rec->size[recInfo_offset] = sizeof(recordOffset); - rec->info[recInfo_offset] = xmalloc (sizeof(recordOffset)); - memcpy (rec->info[recInfo_offset], &recordOffset, sizeof(recordOffset)); + recordAttr->recordOffset = recordOffset; /* commit this record */ rec_put (records, &rec); @@ -1305,7 +1533,7 @@ int fileExtract (SYSNO *sysno, const char *fname, fd = -1; else { - if ((fd = open (fname, O_RDONLY)) == -1) + if ((fd = open (fname, O_BINARY|O_RDONLY)) == -1) { logf (LOG_WARN|LOG_ERRNO, "open %s", fname); return 0;