X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=f3bceaee76cdd1a2970137d12ba96987bd725a06;hb=85a2a0b28cb516d28ac70b7824f2b7d4b07e56ae;hp=829bb68df75758c4a69bd3398741ba2f183fb720;hpb=5c693d36af8be6f6642257160b3c6441d2e2d762;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index 829bb68..f3bceae 100644 --- a/index/extract.c +++ b/index/extract.c @@ -4,7 +4,21 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.78 1998-02-10 12:03:05 adam + * Revision 1.82 1998-05-20 10:12:15 adam + * Implemented automatic EXPLAIN database maintenance. + * Modified Zebra to work with ASN.1 compiled version of YAZ. + * + * Revision 1.81 1998/03/11 11:19:04 adam + * Changed the way sequence numbers are generated. + * + * Revision 1.80 1998/03/05 08:45:11 adam + * New result set model and modular ranking system. Moved towards + * descent server API. System information stored as "SGML" records. + * + * Revision 1.79 1998/02/17 10:32:52 adam + * Fixed bug: binary files weren't opened with flag b on NT. + * + * Revision 1.78 1998/02/10 12:03:05 adam * Implemented Sort. * * Revision 1.77 1998/01/12 15:04:08 adam @@ -315,7 +329,7 @@ static int records_updated = 0; static int records_deleted = 0; static int records_processed = 0; -static ZebTargetInfo *zti = NULL; +static ZebraExplainInfo zti = NULL; static void logRecord (int showFlag) { @@ -329,8 +343,13 @@ static void logRecord (int showFlag) } } -int key_open (BFiles bfs, int mem, int rw) +static int explain_extract (void *handle, Record drec, data1_node *n); + +int key_open (struct recordGroup *rGroup, int mem) { + BFiles bfs = rGroup->bfs; + int rw = rGroup->flagRw; + data1_handle dh = rGroup->dh; if (!mem) mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024; if (mem < 50000) @@ -354,7 +373,8 @@ int key_open (BFiles bfs, int mem, int rw) dict_close (matchDict); return -1; } - zti = zebTargetInfo_open (records, rw); + zti = zebraExplain_open (records, dh, common_resource, + rw, rGroup, explain_extract); if (!zti) { rec_close (&records); @@ -465,7 +485,7 @@ void key_flush (void) qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_qsort_compare); getFnameTmp (out_fname, key_file_no); - if (!(outf = fopen (out_fname, "w"))) + if (!(outf = fopen (out_fname, "wb"))) { logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); @@ -491,7 +511,7 @@ void key_flush (void) qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare); getFnameTmp (out_fname, key_file_no); - if (!(outf = fopen (out_fname, "w"))) + if (!(outf = fopen (out_fname, "wb"))) { logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); @@ -533,13 +553,14 @@ void key_flush (void) key_buf_used = 0; } -int key_close () +int key_close (struct recordGroup *rGroup) { + int rw = rGroup->flagRw; + if (rw) + zebraExplain_runNumberIncrement (zti, 1); + zebraExplain_close (zti, rw, 0); key_flush (); xfree (key_buf); -#if 1 - zebTargetInfo_close (zti, 1); -#endif rec_close (&records); dict_close (matchDict); sortIdx_close (sortIdx); @@ -551,7 +572,8 @@ int key_close () static void wordInit (struct recExtractCtrl *p, RecWord *w) { w->zebra_maps = p->zebra_maps; - w->attrSet = 1; + w->seqnos = p->seqno; + w->attrSet = VAL_BIB1; w->attrUse = 1016; w->reg_type = 'w'; } @@ -580,6 +602,7 @@ static void addIndexString (RecWord *p, const char *string, int length) short attrUse; int lead = 0; int diff = 0; + int *pseqno = &p->seqnos[p->reg_type]; if (reckeys.buf_used+1024 > reckeys.buf_max) { @@ -604,14 +627,14 @@ static void addIndexString (RecWord *p, const char *string, int length) else reckeys.prevAttrUse = attrUse; #if 1 - diff = 1 + p->seqno - reckeys.prevSeqNo; + diff = 1 + *pseqno - reckeys.prevSeqNo; if (diff >= 1 && diff <= 15) lead |= (diff << 2); else diff = 0; #endif - reckeys.prevSeqNo = p->seqno; - + reckeys.prevSeqNo = *pseqno; + *dst++ = lead; if (!(lead & 1)) @@ -631,11 +654,11 @@ static void addIndexString (RecWord *p, const char *string, int length) if (!diff) { - memcpy (dst, &p->seqno, sizeof(p->seqno)); - dst += sizeof(p->seqno); + memcpy (dst, pseqno, sizeof(*pseqno)); + dst += sizeof(*pseqno); } reckeys.buf_used = dst - reckeys.buf; - (p->seqno)++; + (*pseqno)++; } static void addSortString (RecWord *p, const char *string, int length) @@ -659,6 +682,7 @@ static void addSortString (RecWord *p, const char *string, int length) static void addString (RecWord *p, const char *string, int length) { + assert (length > 0); if (zebra_maps_is_sort (p->zebra_maps, p->reg_type)) addSortString (p, string, length); else @@ -780,19 +804,14 @@ static void flushSortKeys (SYSNO sysno, int cmd) sortKeys = NULL; } -static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, - const char *databaseName) +static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys) { char attrSet = -1; short attrUse = -1; int seqno = 0; int off = 0; - if (zebTargetInfo_curDatabase (zti, databaseName)) - { - if (zebTargetInfo_newDatabase (zti, databaseName)) - abort (); - } + zebraExplain_recordCountIncrement (zti, cmd ? 1 : -1); while (off < reckeys->buf_used) { const char *src = reckeys->buf + off; @@ -816,9 +835,9 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, ++ptr_i; key_buf[ptr_top-ptr_i] = (char*)key_buf + key_buf_used; - ch = zebTargetInfo_lookupSU (zti, attrSet, attrUse); + ch = zebraExplain_lookupSU (zti, attrSet, attrUse); if (ch < 0) - ch = zebTargetInfo_addSU (zti, attrSet, attrUse); + ch = zebraExplain_addSU (zti, attrSet, attrUse); assert (ch > 0); ((char*) key_buf) [key_buf_used++] = ch; while (*src) @@ -844,7 +863,7 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, } static const char **searchRecordKey (struct recKeys *reckeys, - int attrSetS, int attrUseS) + int attrSetS, int attrUseS) { static const char *ws[32]; int off = 0; @@ -924,7 +943,6 @@ static struct file_read_info *file_read_start (int fd) static void file_read_stop (struct file_read_info *fi) { - assert (fi); xfree (fi); } @@ -1137,41 +1155,53 @@ static void recordLogPreamble (int level, const char *msg, void *info) log_event_start (NULL, NULL); } +void addSchema (struct recExtractCtrl *p, Odr_oid *oid) +{ + zebraExplain_addSchema (zti, oid); +} + static int recordExtract (SYSNO *sysno, const char *fname, struct recordGroup *rGroup, int deleteFlag, - struct file_read_info *fi, RecType recType, - char *subType) + struct file_read_info *fi, + RecType recType, char *subType) { - struct recExtractCtrl extractCtrl; + RecordAttr *recordAttr; int r; char *matchStr; SYSNO sysnotmp; - off_t recordOffset = 0; Record rec; struct recordLogInfo logInfo; + off_t recordOffset = 0; if (fi->fd != -1) { + struct recExtractCtrl extractCtrl; + /* we are going to read from a file, so prepare the extraction */ - extractCtrl.fh = fi; - extractCtrl.subType = subType; - extractCtrl.init = wordInit; - extractCtrl.add = addRecordKey; + int i; + + reckeys.buf_used = 0; + reckeys.prevAttrUse = -1; + reckeys.prevAttrSet = -1; + reckeys.prevSeqNo = 0; + + recordOffset = fi->file_moffset; + extractCtrl.offset = fi->file_moffset; + extractCtrl.readf = file_read; + extractCtrl.seekf = file_seek; + extractCtrl.tellf = file_tell; + extractCtrl.endf = file_end; + extractCtrl.fh = fi; + extractCtrl.subType = subType; + extractCtrl.init = wordInit; + extractCtrl.addWord = addRecordKey; + extractCtrl.addSchema = addSchema; extractCtrl.dh = rGroup->dh; - - reckeys.buf_used = 0; - reckeys.prevAttrUse = -1; - reckeys.prevAttrSet = -1; - reckeys.prevSeqNo = 0; - - recordOffset = fi->file_moffset; - extractCtrl.offset = recordOffset; - extractCtrl.readf = file_read; - extractCtrl.seekf = file_seek; - extractCtrl.tellf = file_tell; - extractCtrl.endf = file_end; + for (i = 0; i<256; i++) + extractCtrl.seqno[i] = 0; extractCtrl.zebra_maps = rGroup->zebra_maps; - extractCtrl.flagShowRecords = !rGroup->flagRw; + extractCtrl.flagShowRecords = !rGroup->flagRw; + if (!rGroup->flagRw) printf ("File: %s %ld\n", fname, (long) recordOffset); @@ -1248,13 +1278,16 @@ static int recordExtract (SYSNO *sysno, const char *fname, logf (LOG_LOG, "add %s %s %ld", rGroup->recordType, fname, (long) recordOffset); rec = rec_new (records); + *sysno = rec->sysno; + recordAttr = rec_init_attr (zti, rec); + if (matchStr) { dict_insert (matchDict, matchStr, sizeof(*sysno), sysno); } - flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName); + flushRecordKeys (*sysno, 1, &reckeys); flushSortKeys (*sysno, 1); records_inserted++; @@ -1266,10 +1299,21 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec = rec_get (records, *sysno); assert (rec); + + recordAttr = rec_init_attr (zti, rec); + + if (recordAttr->runNumber == zebraExplain_runNumberIncrement (zti, 0)) + { + logf (LOG_LOG, "skipped %s %s %ld", rGroup->recordType, + fname, (long) recordOffset); + rec_rm (&rec); + logRecord (0); + return 1; + } delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; flushSortKeys (*sysno, 0); - flushRecordKeys (*sysno, 0, &delkeys, rec->info[recInfo_databaseName]); + flushRecordKeys (*sysno, 0, &delkeys); if (deleteFlag) { /* record going to be deleted */ @@ -1289,6 +1333,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, dict_delete (matchDict, matchStr); rec_del (records, &rec); } + rec_rm (&rec); logRecord (0); return 1; } @@ -1306,7 +1351,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, if (records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "update %s %s %ld", rGroup->recordType, fname, (long) recordOffset); - flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName); + flushRecordKeys (*sysno, 1, &reckeys); records_updated++; } } @@ -1343,25 +1388,33 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec->size[recInfo_delKeys] = 0; } + /* save file size of original record */ + zebraExplain_recordBytesIncrement (zti, - recordAttr->recordSize); + recordAttr->recordSize = fi->file_moffset - recordOffset; + if (!recordAttr->recordSize) + recordAttr->recordSize = fi->file_max - recordOffset; + zebraExplain_recordBytesIncrement (zti, recordAttr->recordSize); + + /* set run-number for this record */ + recordAttr->runNumber = zebraExplain_runNumberIncrement (zti, 0); + /* update store data */ xfree (rec->info[recInfo_storeData]); if (rGroup->flagStoreData == 1) { - int size = fi->file_moffset - recordOffset; - if (!size) - size = fi->file_max - recordOffset; - rec->size[recInfo_storeData] = size; - rec->info[recInfo_storeData] = xmalloc (size); + rec->size[recInfo_storeData] = recordAttr->recordSize; + rec->info[recInfo_storeData] = xmalloc (recordAttr->recordSize); if (lseek (fi->fd, recordOffset, SEEK_SET) < 0) { logf (LOG_ERRNO|LOG_FATAL, "seek to %ld in %s", fname, (long) recordOffset); exit (1); } - if (read (fi->fd, rec->info[recInfo_storeData], size) < size) + if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize) + < recordAttr->recordSize) { logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s", - fi->file_max, fname); + recordAttr->recordSize, fname); exit (1); } } @@ -1376,11 +1429,7 @@ static int recordExtract (SYSNO *sysno, const char *fname, rec_strdup (rGroup->databaseName, &rec->size[recInfo_databaseName]); /* update offset */ - xfree (rec->info[recInfo_offset]); - - rec->size[recInfo_offset] = sizeof(recordOffset); - rec->info[recInfo_offset] = xmalloc (sizeof(recordOffset)); - memcpy (rec->info[recInfo_offset], &recordOffset, sizeof(recordOffset)); + recordAttr->recordOffset = recordOffset; /* commit this record */ rec_put (records, &rec); @@ -1469,6 +1518,12 @@ int fileExtract (SYSNO *sysno, const char *fname, if (!rGroup->databaseName) rGroup->databaseName = "Default"; + if (zebraExplain_curDatabase (zti, rGroup->databaseName)) + { + if (zebraExplain_newDatabase (zti, rGroup->databaseName)) + abort (); + } + if (rGroup->flagStoreData == -1) { const char *sval; @@ -1523,3 +1578,53 @@ int fileExtract (SYSNO *sysno, const char *fname, return r; } +static int explain_extract (void *handle, Record rec, data1_node *n) +{ + struct recordGroup *rGroup = (struct recordGroup*) handle; + struct recExtractCtrl extractCtrl; + int i; + + if (zebraExplain_curDatabase (zti, rec->info[recInfo_databaseName])) + { + if (zebraExplain_newDatabase (zti, rec->info[recInfo_databaseName])) + abort (); + } + + reckeys.buf_used = 0; + reckeys.prevAttrUse = -1; + reckeys.prevAttrSet = -1; + reckeys.prevSeqNo = 0; + + extractCtrl.init = wordInit; + extractCtrl.addWord = addRecordKey; + extractCtrl.addSchema = addSchema; + extractCtrl.dh = rGroup->dh; + for (i = 0; i<256; i++) + extractCtrl.seqno[i] = 0; + extractCtrl.zebra_maps = rGroup->zebra_maps; + extractCtrl.flagShowRecords = !rGroup->flagRw; + + grs_extract_tree(&extractCtrl, n); + + logf (LOG_DEBUG, "flush explain record, sysno=%d", rec->sysno); + + if (rec->size[recInfo_delKeys]) + { + struct recKeys delkeys; + + delkeys.buf_used = rec->size[recInfo_delKeys]; + delkeys.buf = rec->info[recInfo_delKeys]; + flushSortKeys (rec->sysno, 0); + flushRecordKeys (rec->sysno, 0, &delkeys); + } + flushRecordKeys (rec->sysno, 1, &reckeys); + flushSortKeys (rec->sysno, 1); + + xfree (rec->info[recInfo_delKeys]); + rec->size[recInfo_delKeys] = reckeys.buf_used; + rec->info[recInfo_delKeys] = reckeys.buf; + reckeys.buf = NULL; + reckeys.buf_max = 0; + + return 0; +}