X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=be70f55e7ed43cf86f521e63b25737a781097315;hb=3d4f0f1ddbc8b1ec6df244bb400184f086e211c8;hp=a726916ddfb773b07f7ea16359ebde1101b0545d;hpb=7e75317bed8eecabcb57e59b16093a32238738e2;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index a726916..be70f55 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,10 +1,26 @@ -/* - * Copyright (C) 1994-2002, Index Data - * All rights reserved. - * Sebastian Hammer, Adam Dickmeiss - * - * $Id: extract.c,v 1.110 2002-02-20 17:30:01 adam Exp $ - */ +/* $Id: extract.c,v 1.128 2002-10-24 13:07:02 heikki Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + + #include #include #ifdef WIN32 @@ -14,12 +30,9 @@ #endif #include -#include -#include -#include #include "index.h" -#include "zserver.h" -#include "zinfo.h" +#include +#include #if _FILE_OFFSET_BITS == 64 #define PRINTF_OFF_T "%Ld" @@ -27,35 +40,48 @@ #define PRINTF_OFF_T "%ld" #endif -#ifndef ZEBRASDR -#define ZEBRASDR 0 -#endif +#define USE_SHELLSORT 0 -#if ZEBRASDR -#include "zebrasdr.h" +#if USE_SHELLSORT +static void shellsort(void *ar, int r, size_t s, + int (*cmp)(const void *a, const void *b)) +{ + char *a = ar; + char v[100]; + int h, i, j, k; + static const int incs[16] = { 1391376, 463792, 198768, 86961, 33936, + 13776, 4592, 1968, 861, 336, + 112, 48, 21, 7, 3, 1 }; + for ( k = 0; k < 16; k++) + for (h = incs[k], i = h; i < r; i++) + { + memcpy (v, a+s*i, s); + j = i; + while (j > h && (*cmp)(a + s*(j-h), v) > 0) + { + memcpy (a + s*j, a + s*(j-h), s); + j -= h; + } + memcpy (a+s*j, v, s); + } +} #endif -static int records_inserted = 0; -static int records_updated = 0; -static int records_deleted = 0; -static int records_processed = 0; - -static void logRecord (int showFlag) +static void logRecord (ZebraHandle zh) { - if (!showFlag) - ++records_processed; - if (showFlag || !(records_processed % 1000)) + ++zh->records_processed; + if (!(zh->records_processed % 1000)) { logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d", - records_processed, records_inserted, records_updated, - records_deleted); + zh->records_processed, zh->records_inserted, zh->records_updated, + zh->records_deleted); } } static void extract_init (struct recExtractCtrl *p, RecWord *w) { w->zebra_maps = p->zebra_maps; - w->seqnos = p->seqno; + w->seqno = 1; w->attrSet = VAL_BIB1; w->attrUse = 1016; w->reg_type = 'w'; @@ -82,7 +108,7 @@ static const char **searchRecordKey (ZebraHandle zh, ws[i] = NULL; #if SU_SCHEME - chS = zebraExplain_lookupSU (zh->service->zei, attrSetS, attrUseS); + chS = zebraExplain_lookupSU (zh->reg->zei, attrSetS, attrUseS); if (chS < 0) return ws; #endif @@ -232,7 +258,7 @@ static void file_end (void *handle, off_t offset) p->file_moffset = offset; } -static char *fileMatchStr (ZebraHandle zh, +static char *fileMatchStr (ZebraHandle zh, struct recKeys *reckeys, struct recordGroup *rGroup, const char *fname, const char *spec) { @@ -271,11 +297,11 @@ static char *fileMatchStr (ZebraHandle zh, attname_str[i] = '\0'; } - if ((attset = data1_get_attset (zh->service->dh, attset_str))) + if ((attset = data1_get_attset (zh->reg->dh, attset_str))) { data1_att *att; attSet = attset->reference; - att = data1_getattbyname(zh->service->dh, attset, attname_str); + att = data1_getattbyname(zh->reg->dh, attset, attname_str); if (att) attUse = att->value; else @@ -388,19 +414,6 @@ struct recordLogInfo { struct recordGroup *rGroup; }; -static void recordLogPreamble (int level, const char *msg, void *info) -{ - struct recordLogInfo *p = (struct recordLogInfo *) info; - FILE *outf = yaz_log_file (); - - if (level & LOG_LOG) - return ; - fprintf (outf, "File %s, offset %d, type %s\n", - p->rGroup->recordType, p->recordOffset, p->fname); - log_event_start (NULL, NULL); -} - - static int recordExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, struct recordGroup *rGroup, int deleteFlag, @@ -412,7 +425,6 @@ static int recordExtract (ZebraHandle zh, char *matchStr; SYSNO sysnotmp; Record rec; - struct recordLogInfo logInfo; off_t recordOffset = 0; if (fi->fd != -1) @@ -422,10 +434,13 @@ static int recordExtract (ZebraHandle zh, /* we are going to read from a file, so prepare the extraction */ int i; - zh->keys.buf_used = 0; - zh->keys.prevAttrUse = -1; - zh->keys.prevAttrSet = -1; - zh->keys.prevSeqNo = 0; + zh->reg->keys.buf_used = 0; + zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; + zh->reg->sortKeys.buf_used = 0; + zh->reg->sortKeys.buf_max = 0; + zh->reg->sortKeys.buf = 0; recordOffset = fi->file_moffset; extractCtrl.offset = fi->file_moffset; @@ -438,44 +453,56 @@ static int recordExtract (ZebraHandle zh, extractCtrl.init = extract_init; extractCtrl.tokenAdd = extract_token_add; extractCtrl.schemaAdd = extract_schema_add; - extractCtrl.dh = zh->service->dh; + extractCtrl.dh = zh->reg->dh; extractCtrl.handle = zh; for (i = 0; i<256; i++) { - if (zebra_maps_is_positioned(zh->service->zebra_maps, i)) + if (zebra_maps_is_positioned(zh->reg->zebra_maps, i)) extractCtrl.seqno[i] = 1; else extractCtrl.seqno[i] = 0; } - extractCtrl.zebra_maps = zh->service->zebra_maps; + extractCtrl.zebra_maps = zh->reg->zebra_maps; extractCtrl.flagShowRecords = !rGroup->flagRw; if (!rGroup->flagRw) printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset); - - logInfo.fname = fname; - logInfo.recordOffset = recordOffset; - logInfo.rGroup = rGroup; - log_event_start (recordLogPreamble, &logInfo); + if (rGroup->flagRw) + { + char msg[512]; + sprintf (msg, "%s:" PRINTF_OFF_T , fname, recordOffset); + yaz_log_init_prefix2 (msg); + } r = (*recType->extract)(clientData, &extractCtrl); - log_event_start (NULL, NULL); - + yaz_log_init_prefix2 (0); if (r == RECCTRL_EXTRACT_EOF) return 0; - else if (r == RECCTRL_EXTRACT_ERROR) + else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) { /* error occured during extraction ... */ if (rGroup->flagRw && - records_processed < rGroup->fileVerboseLimit) + zh->records_processed < rGroup->fileVerboseLimit) { logf (LOG_WARN, "fail %s %s " PRINTF_OFF_T, rGroup->recordType, fname, recordOffset); } return 0; } - if (zh->keys.buf_used == 0) + else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) + { + /* error occured during extraction ... */ + if (rGroup->flagRw && + zh->records_processed < rGroup->fileVerboseLimit) + { + logf (LOG_WARN, "no filter for %s %s " + PRINTF_OFF_T, rGroup->recordType, + fname, recordOffset); + } + return 0; + } + if (zh->reg->keys.buf_used == 0) { /* the extraction process returned no information - the record is probably empty - unless flagShowRecords is in use */ @@ -499,11 +526,11 @@ static int recordExtract (ZebraHandle zh, { char *rinfo; - matchStr = fileMatchStr (zh, &zh->keys, rGroup, fname, + matchStr = fileMatchStr (zh, &zh->reg->keys, rGroup, fname, rGroup->recordId); if (matchStr) { - rinfo = dict_lookup (zh->service->matchDict, matchStr); + rinfo = dict_lookup (zh->reg->matchDict, matchStr); if (rinfo) memcpy (sysno, rinfo+1, sizeof(*sysno)); } @@ -525,47 +552,53 @@ static int recordExtract (ZebraHandle zh, logf (LOG_WARN, "cannot delete record above (seems new)"); return 1; } - if (records_processed < rGroup->fileVerboseLimit) + if (zh->records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "add %s %s " PRINTF_OFF_T, rGroup->recordType, fname, recordOffset); - rec = rec_new (zh->service->records); + rec = rec_new (zh->reg->records); *sysno = rec->sysno; - recordAttr = rec_init_attr (zh->service->zei, rec); + recordAttr = rec_init_attr (zh->reg->zei, rec); if (matchStr) { - dict_insert (zh->service->matchDict, matchStr, sizeof(*sysno), sysno); + dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno); } - extract_flushRecordKeys (zh, *sysno, 1, &zh->keys); - extract_flushSortKeys (zh, *sysno, 1, &zh->sortKeys); + extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys); + extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys); - records_inserted++; + zh->records_inserted++; } else { /* record already exists */ struct recKeys delkeys; + struct sortKeys sortKeys; - rec = rec_get (zh->service->records, *sysno); + rec = rec_get (zh->reg->records, *sysno); assert (rec); - recordAttr = rec_init_attr (zh->service->zei, rec); + recordAttr = rec_init_attr (zh->reg->zei, rec); if (recordAttr->runNumber == - zebraExplain_runNumberIncrement (zh->service->zei, 0)) + zebraExplain_runNumberIncrement (zh->reg->zei, 0)) { - logf (LOG_LOG, "skipped %s %s " PRINTF_OFF_T, rGroup->recordType, - fname, recordOffset); - extract_flushSortKeys (zh, *sysno, -1, &zh->sortKeys); + yaz_log (LOG_LOG, "run number = %d", recordAttr->runNumber); + yaz_log (LOG_LOG, "skipped %s %s " PRINTF_OFF_T, + rGroup->recordType, fname, recordOffset); + extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys); rec_rm (&rec); - logRecord (0); + logRecord (zh); return 1; } delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; - extract_flushSortKeys (zh, *sysno, 0, &zh->sortKeys); + + sortKeys.buf_used = rec->size[recInfo_sortKeys]; + sortKeys.buf = rec->info[recInfo_sortKeys]; + + extract_flushSortKeys (zh, *sysno, 0, &sortKeys); extract_flushRecordKeys (zh, *sysno, 0, &delkeys); if (deleteFlag) { @@ -578,16 +611,16 @@ static int recordExtract (ZebraHandle zh, } else { - if (records_processed < rGroup->fileVerboseLimit) + if (zh->records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T, rGroup->recordType, fname, recordOffset); - records_deleted++; + zh->records_deleted++; if (matchStr) - dict_delete (zh->service->matchDict, matchStr); - rec_del (zh->service->records, &rec); + dict_delete (zh->reg->matchDict, matchStr); + rec_del (zh->reg->records, &rec); } rec_rm (&rec); - logRecord (0); + logRecord (zh); return 1; } else @@ -601,11 +634,12 @@ static int recordExtract (ZebraHandle zh, } else { - if (records_processed < rGroup->fileVerboseLimit) + if (zh->records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "update %s %s " PRINTF_OFF_T, rGroup->recordType, fname, recordOffset); - extract_flushRecordKeys (zh, *sysno, 1, &zh->keys); - records_updated++; + extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys); + extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys); + zh->records_updated++; } } } @@ -621,19 +655,12 @@ static int recordExtract (ZebraHandle zh, /* update delete keys */ xfree (rec->info[recInfo_delKeys]); - if (zh->keys.buf_used > 0 && rGroup->flagStoreKeys == 1) + if (zh->reg->keys.buf_used > 0 && rGroup->flagStoreKeys == 1) { -#if 1 - rec->size[recInfo_delKeys] = zh->keys.buf_used; - rec->info[recInfo_delKeys] = zh->keys.buf; - zh->keys.buf = NULL; - zh->keys.buf_max = 0; -#else - rec->info[recInfo_delKeys] = xmalloc (reckeys.buf_used); - rec->size[recInfo_delKeys] = reckeys.buf_used; - memcpy (rec->info[recInfo_delKeys], reckeys.buf, - rec->size[recInfo_delKeys]); -#endif + rec->size[recInfo_delKeys] = zh->reg->keys.buf_used; + rec->info[recInfo_delKeys] = zh->reg->keys.buf; + zh->reg->keys.buf = NULL; + zh->reg->keys.buf_max = 0; } else { @@ -641,17 +668,25 @@ static int recordExtract (ZebraHandle zh, rec->size[recInfo_delKeys] = 0; } + /* update sort keys */ + xfree (rec->info[recInfo_sortKeys]); + + rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used; + rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf; + zh->reg->sortKeys.buf = NULL; + zh->reg->sortKeys.buf_max = 0; + /* save file size of original record */ - zebraExplain_recordBytesIncrement (zh->service->zei, + zebraExplain_recordBytesIncrement (zh->reg->zei, - recordAttr->recordSize); recordAttr->recordSize = fi->file_moffset - recordOffset; if (!recordAttr->recordSize) recordAttr->recordSize = fi->file_max - recordOffset; - zebraExplain_recordBytesIncrement (zh->service->zei, + zebraExplain_recordBytesIncrement (zh->reg->zei, recordAttr->recordSize); /* set run-number for this record */ - recordAttr->runNumber = zebraExplain_runNumberIncrement (zh->service->zei, + recordAttr->runNumber = zebraExplain_runNumberIncrement (zh->reg->zei, 0); /* update store data */ @@ -689,8 +724,8 @@ static int recordExtract (ZebraHandle zh, recordAttr->recordOffset = recordOffset; /* commit this record */ - rec_put (zh->service->records, &rec); - logRecord (0); + rec_put (zh->reg->records, &rec); + logRecord (zh); return 1; } @@ -731,22 +766,22 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, if (!rGroup->recordType) { sprintf (ext_res, "%srecordType.%s", gprefix, ext); - if (!(rGroup->recordType = res_get (zh->service->res, ext_res))) + if (!(rGroup->recordType = res_get (zh->res, ext_res))) { sprintf (ext_res, "%srecordType", gprefix); - rGroup->recordType = res_get (zh->service->res, ext_res); + rGroup->recordType = res_get (zh->res, ext_res); } } if (!rGroup->recordType) { - if (records_processed < rGroup->fileVerboseLimit) + if (zh->records_processed < rGroup->fileVerboseLimit) logf (LOG_LOG, "? %s", fname); return 0; } if (!*rGroup->recordType) return 0; if (!(recType = - recType_byName (zh->service->recTypes, rGroup->recordType, subType, + recType_byName (zh->reg->recTypes, rGroup->recordType, subType, &clientData))) { logf (LOG_WARN, "No such record type: %s", rGroup->recordType); @@ -757,17 +792,17 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, if (!rGroup->recordId) { sprintf (ext_res, "%srecordId.%s", gprefix, ext); - rGroup->recordId = res_get (zh->service->res, ext_res); + rGroup->recordId = res_get (zh->res, ext_res); } /* determine database name */ if (!rGroup->databaseName) { sprintf (ext_res, "%sdatabase.%s", gprefix, ext); - if (!(rGroup->databaseName = res_get (zh->service->res, ext_res))) + if (!(rGroup->databaseName = res_get (zh->res, ext_res))) { sprintf (ext_res, "%sdatabase", gprefix); - rGroup->databaseName = res_get (zh->service->res, ext_res); + rGroup->databaseName = res_get (zh->res, ext_res); } } if (!rGroup->databaseName) @@ -777,12 +812,12 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, sprintf (ext_res, "%sexplainDatabase", gprefix); rGroup->explainDatabase = - atoi (res_get_def (zh->service->res, ext_res, "0")); + atoi (res_get_def (zh->res, ext_res, "0")); /* announce database */ - if (zebraExplain_curDatabase (zh->service->zei, rGroup->databaseName)) + if (zebraExplain_curDatabase (zh->reg->zei, rGroup->databaseName)) { - if (zebraExplain_newDatabase (zh->service->zei, rGroup->databaseName, + if (zebraExplain_newDatabase (zh->reg->zei, rGroup->databaseName, rGroup->explainDatabase)) return 0; } @@ -791,10 +826,10 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, { const char *sval; sprintf (ext_res, "%sstoreData.%s", gprefix, ext); - if (!(sval = res_get (zh->service->res, ext_res))) + if (!(sval = res_get (zh->res, ext_res))) { sprintf (ext_res, "%sstoreData", gprefix); - sval = res_get (zh->service->res, ext_res); + sval = res_get (zh->res, ext_res); } if (sval) rGroup->flagStoreData = atoi (sval); @@ -807,14 +842,14 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, const char *sval; sprintf (ext_res, "%sstoreKeys.%s", gprefix, ext); - sval = res_get (zh->service->res, ext_res); + sval = res_get (zh->res, ext_res); if (!sval) { sprintf (ext_res, "%sstoreKeys", gprefix); - sval = res_get (zh->service->res, ext_res); + sval = res_get (zh->res, ext_res); } if (!sval) - sval = res_get (zh->service->res, "storeKeys"); + sval = res_get (zh->res, "storeKeys"); if (sval) rGroup->flagStoreKeys = atoi (sval); } @@ -825,9 +860,21 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, fd = -1; else { - if ((fd = open (fname, O_BINARY|O_RDONLY)) == -1) + char full_rep[1024]; + + if (zh->path_reg && !yaz_is_abspath (fname)) { - logf (LOG_WARN|LOG_ERRNO, "open %s", fname); + strcpy (full_rep, zh->path_reg); + strcat (full_rep, "/"); + strcat (full_rep, fname); + } + else + strcpy (full_rep, fname); + + + if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1) + { + logf (LOG_WARN|LOG_ERRNO, "open %s", full_rep); return 0; } } @@ -855,7 +902,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, RecordAttr *recordAttr; struct recExtractCtrl extractCtrl; int i, r; - char *matchStr; + char *matchStr = 0; RecType recType; char subType[1024]; void *clientData; @@ -879,36 +926,36 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, extractCtrl.fh = &fc; /* announce database */ - if (zebraExplain_curDatabase (zh->service->zei, databaseName)) + if (zebraExplain_curDatabase (zh->reg->zei, databaseName)) { - if (zebraExplain_newDatabase (zh->service->zei, databaseName, 0)) + if (zebraExplain_newDatabase (zh->reg->zei, databaseName, 0)) return 0; } if (!(recType = - recType_byName (zh->service->recTypes, recordType, subType, + recType_byName (zh->reg->recTypes, recordType, subType, &clientData))) { logf (LOG_WARN, "No such record type: %s", recordType); return 0; } - zh->keys.buf_used = 0; - zh->keys.prevAttrUse = -1; - zh->keys.prevAttrSet = -1; - zh->keys.prevSeqNo = 0; - zh->sortKeys = 0; + zh->reg->keys.buf_used = 0; + zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; + zh->reg->sortKeys.buf_used = 0; extractCtrl.subType = subType; extractCtrl.init = extract_init; extractCtrl.tokenAdd = extract_token_add; extractCtrl.schemaAdd = extract_schema_add; - extractCtrl.dh = zh->service->dh; + extractCtrl.dh = zh->reg->dh; extractCtrl.handle = zh; - extractCtrl.zebra_maps = zh->service->zebra_maps; + extractCtrl.zebra_maps = zh->reg->zebra_maps; extractCtrl.flagShowRecords = 0; for (i = 0; i<256; i++) { - if (zebra_maps_is_positioned(zh->service->zebra_maps, i)) + if (zebra_maps_is_positioned(zh->reg->zebra_maps, i)) extractCtrl.seqno[i] = 1; else extractCtrl.seqno[i] = 0; @@ -918,22 +965,19 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, if (r == RECCTRL_EXTRACT_EOF) return 0; - else if (r == RECCTRL_EXTRACT_ERROR) + else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) { /* error occured during extraction ... */ -#if 1 - yaz_log (LOG_WARN, "extract error"); -#else - if (rGroup->flagRw && - records_processed < rGroup->fileVerboseLimit) - { - logf (LOG_WARN, "fail %s %s %ld", rGroup->recordType, - fname, (long) recordOffset); - } -#endif + yaz_log (LOG_WARN, "extract error: generic"); + return 0; + } + else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) + { + /* error occured during extraction ... */ + yaz_log (LOG_WARN, "extract error: no such filter"); return 0; } - if (zh->keys.buf_used == 0) + if (zh->reg->keys.buf_used == 0) { /* the extraction process returned no information - the record is probably empty - unless flagShowRecords is in use */ @@ -945,7 +989,6 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, } /* match criteria */ - if (! *sysno) { /* new record */ @@ -958,32 +1001,32 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, } logf (LOG_LOG, "add %s %s %ld", recordType, fname, (long) recordOffset); - rec = rec_new (zh->service->records); + rec = rec_new (zh->reg->records); *sysno = rec->sysno; - recordAttr = rec_init_attr (zh->service->zei, rec); + recordAttr = rec_init_attr (zh->reg->zei, rec); if (matchStr) { - dict_insert (zh->service->matchDict, matchStr, + dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno); } - extract_flushRecordKeys (zh, *sysno, 1, &zh->keys); - extract_flushSortKeys (zh, *sysno, 1, &zh->sortKeys); + extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys); + extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys); } else { /* record already exists */ struct recKeys delkeys; - rec = rec_get (zh->service->records, *sysno); + rec = rec_get (zh->reg->records, *sysno); assert (rec); - recordAttr = rec_init_attr (zh->service->zei, rec); + recordAttr = rec_init_attr (zh->reg->zei, rec); if (recordAttr->runNumber == - zebraExplain_runNumberIncrement (zh->service->zei, 0)) + zebraExplain_runNumberIncrement (zh->reg->zei, 0)) { logf (LOG_LOG, "skipped %s %s %ld", recordType, fname, (long) recordOffset); @@ -992,7 +1035,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, } delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; - extract_flushSortKeys (zh, *sysno, 0, &zh->sortKeys); + extract_flushSortKeys (zh, *sysno, 0, &zh->reg->sortKeys); extract_flushRecordKeys (zh, *sysno, 0, &delkeys); if (delete_flag) { @@ -1011,7 +1054,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, if (matchStr) dict_delete (matchDict, matchStr); #endif - rec_del (zh->service->records, &rec); + rec_del (zh->reg->records, &rec); } rec_rm (&rec); return 1; @@ -1029,7 +1072,8 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, { logf (LOG_LOG, "update %s %s %ld", recordType, fname, (long) recordOffset); - extract_flushRecordKeys (zh, *sysno, 1, &zh->keys); + extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys); + extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys); } } } @@ -1045,12 +1089,12 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, /* update delete keys */ xfree (rec->info[recInfo_delKeys]); - if (zh->keys.buf_used > 0 && store_keys == 1) + if (zh->reg->keys.buf_used > 0 && store_keys == 1) { - rec->size[recInfo_delKeys] = zh->keys.buf_used; - rec->info[recInfo_delKeys] = zh->keys.buf; - zh->keys.buf = NULL; - zh->keys.buf_max = 0; + rec->size[recInfo_delKeys] = zh->reg->keys.buf_used; + rec->info[recInfo_delKeys] = zh->reg->keys.buf; + zh->reg->keys.buf = NULL; + zh->reg->keys.buf_max = 0; } else { @@ -1059,7 +1103,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, } /* save file size of original record */ - zebraExplain_recordBytesIncrement (zh->service->zei, + zebraExplain_recordBytesIncrement (zh->reg->zei, - recordAttr->recordSize); #if 0 recordAttr->recordSize = fi->file_moffset - recordOffset; @@ -1068,12 +1112,12 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, #else recordAttr->recordSize = buf_size; #endif - zebraExplain_recordBytesIncrement (zh->service->zei, + zebraExplain_recordBytesIncrement (zh->reg->zei, recordAttr->recordSize); /* set run-number for this record */ recordAttr->runNumber = - zebraExplain_runNumberIncrement (zh->service->zei, 0); + zebraExplain_runNumberIncrement (zh->reg->zei, 0); /* update store data */ xfree (rec->info[recInfo_storeData]); @@ -1114,7 +1158,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, recordAttr->recordOffset = recordOffset; /* commit this record */ - rec_put (zh->service->records, &rec); + rec_put (zh->reg->records, &rec); return 0; } @@ -1125,53 +1169,62 @@ int explain_extract (void *handle, Record rec, data1_node *n) struct recExtractCtrl extractCtrl; int i; - if (zebraExplain_curDatabase (zh->service->zei, + if (zebraExplain_curDatabase (zh->reg->zei, rec->info[recInfo_databaseName])) { abort(); - if (zebraExplain_newDatabase (zh->service->zei, + if (zebraExplain_newDatabase (zh->reg->zei, rec->info[recInfo_databaseName], 0)) abort (); } - zh->keys.buf_used = 0; - zh->keys.prevAttrUse = -1; - zh->keys.prevAttrSet = -1; - zh->keys.prevSeqNo = 0; - zh->sortKeys = 0; + zh->reg->keys.buf_used = 0; + zh->reg->keys.prevAttrUse = -1; + zh->reg->keys.prevAttrSet = -1; + zh->reg->keys.prevSeqNo = 0; + zh->reg->sortKeys.buf_used = 0; extractCtrl.init = extract_init; extractCtrl.tokenAdd = extract_token_add; extractCtrl.schemaAdd = extract_schema_add; - extractCtrl.dh = zh->service->dh; + extractCtrl.dh = zh->reg->dh; for (i = 0; i<256; i++) extractCtrl.seqno[i] = 0; - extractCtrl.zebra_maps = zh->service->zebra_maps; + extractCtrl.zebra_maps = zh->reg->zebra_maps; extractCtrl.flagShowRecords = 0; extractCtrl.handle = handle; grs_extract_tree(&extractCtrl, n); - logf (LOG_LOG, "flush explain record, sysno=%d", rec->sysno); - if (rec->size[recInfo_delKeys]) { struct recKeys delkeys; - struct sortKey *sortKeys = 0; + struct sortKeys sortkeys; delkeys.buf_used = rec->size[recInfo_delKeys]; delkeys.buf = rec->info[recInfo_delKeys]; - extract_flushSortKeys (zh, rec->sysno, 0, &sortKeys); + + sortkeys.buf_used = rec->size[recInfo_sortKeys]; + sortkeys.buf = rec->info[recInfo_sortKeys]; + + extract_flushSortKeys (zh, rec->sysno, 0, &sortkeys); extract_flushRecordKeys (zh, rec->sysno, 0, &delkeys); } - extract_flushRecordKeys (zh, rec->sysno, 1, &zh->keys); - extract_flushSortKeys (zh, rec->sysno, 1, &zh->sortKeys); + extract_flushRecordKeys (zh, rec->sysno, 1, &zh->reg->keys); + extract_flushSortKeys (zh, rec->sysno, 1, &zh->reg->sortKeys); xfree (rec->info[recInfo_delKeys]); - rec->size[recInfo_delKeys] = zh->keys.buf_used; - rec->info[recInfo_delKeys] = zh->keys.buf; - zh->keys.buf = NULL; - zh->keys.buf_max = 0; + rec->size[recInfo_delKeys] = zh->reg->keys.buf_used; + rec->info[recInfo_delKeys] = zh->reg->keys.buf; + zh->reg->keys.buf = NULL; + zh->reg->keys.buf_max = 0; + + xfree (rec->info[recInfo_sortKeys]); + rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used; + rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf; + zh->reg->sortKeys.buf = NULL; + zh->reg->sortKeys.buf_max = 0; + return 0; } @@ -1186,16 +1239,23 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, int seqno = 0; int off = 0; int ch = 0; - ZebraExplainInfo zei = zh->service->zei; + ZebraExplainInfo zei = zh->reg->zei; - if (!zh->key_buf) + if (!zh->reg->key_buf) { - int mem = 8*1024*1024; - zh->key_buf = (char**) xmalloc (mem); - zh->ptr_top = mem/sizeof(char*); - zh->ptr_i = 0; - zh->key_buf_used = 0; - zh->key_file_no = 0; + int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8")); + if (mem <= 0) + { + logf(LOG_WARN, "Invalid memory setting, using default 8 MB"); + mem= 1024*1024*8; + } + /* FIXME: That "8" should be in a default settings include */ + /* not hard-coded here! -H */ + zh->reg->key_buf = (char**) xmalloc (mem); + zh->reg->ptr_top = mem/sizeof(char*); + zh->reg->ptr_i = 0; + zh->reg->key_buf_used = 0; + zh->reg->key_file_no = 0; } zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1); while (off < reckeys->buf_used) @@ -1224,11 +1284,12 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, src += sizeof(attrUse); } #endif - if (zh->key_buf_used + 1024 > (zh->ptr_top-zh->ptr_i)*sizeof(char*)) + if (zh->reg->key_buf_used + 1024 > + (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*)) extract_flushWriteKeys (zh); - ++(zh->ptr_i); - (zh->key_buf)[zh->ptr_top - zh->ptr_i] = - (char*)zh->key_buf + zh->key_buf_used; + ++(zh->reg->ptr_i); + (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = + (char*)zh->reg->key_buf + zh->reg->key_buf_used; #if SU_SCHEME #else ch = zebraExplain_lookupSU (zei, attrSet, attrUse); @@ -1236,14 +1297,15 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, ch = zebraExplain_addSU (zei, attrSet, attrUse); #endif assert (ch > 0); - zh->key_buf_used += - key_SU_encode (ch,((char*)zh->key_buf) + zh->key_buf_used); + zh->reg->key_buf_used += + key_SU_encode (ch,((char*)zh->reg->key_buf) + + zh->reg->key_buf_used); while (*src) - ((char*)zh->key_buf) [(zh->key_buf_used)++] = *src++; + ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++; src++; - ((char*)(zh->key_buf))[(zh->key_buf_used)++] = '\0'; - ((char*)(zh->key_buf))[(zh->key_buf_used)++] = cmd; + ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0'; + ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd; if (lead & 60) seqno += ((lead>>2) & 15)-1; @@ -1254,8 +1316,8 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, } key.seqno = seqno; key.sysno = sysno; - memcpy ((char*)zh->key_buf + zh->key_buf_used, &key, sizeof(key)); - (zh->key_buf_used) += sizeof(key); + memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key, sizeof(key)); + (zh->reg->key_buf_used) += sizeof(key); off = src - reckeys->buf; } assert (off == reckeys->buf_used); @@ -1267,36 +1329,37 @@ void extract_flushWriteKeys (ZebraHandle zh) char out_fname[200]; char *prevcp, *cp; struct encode_info encode_info; - int ptr_i = zh->ptr_i; + int ptr_i = zh->reg->ptr_i; #if SORT_EXTRA int i; #endif - if (!zh->key_buf || ptr_i <= 0) + if (!zh->reg->key_buf || ptr_i <= 0) return; - (zh->key_file_no)++; - logf (LOG_LOG, "sorting section %d", (zh->key_file_no)); + (zh->reg->key_file_no)++; + logf (LOG_LOG, "sorting section %d", (zh->reg->key_file_no)); #if !SORT_EXTRA - qsort (zh->key_buf + zh->ptr_top - ptr_i, ptr_i, sizeof(char*), - key_qsort_compare); - extract_get_fname_tmp (zh, out_fname, zh->key_file_no); + qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i, + sizeof(char*), key_qsort_compare); + extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no); if (!(outf = fopen (out_fname, "wb"))) { logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname); exit (1); } - logf (LOG_LOG, "writing section %d", zh->key_file_no); - prevcp = cp = (zh->key_buf)[zh->ptr_top - ptr_i]; + logf (LOG_LOG, "writing section %d", zh->reg->key_file_no); + prevcp = cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i]; encode_key_init (&encode_info); encode_key_write (cp, &encode_info, outf); while (--ptr_i > 0) { - cp = (zh->key_buf)[zh->ptr_top - ptr_i]; + cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i]; if (strcmp (cp, prevcp)) { + encode_key_flush ( &encode_info, outf); encode_key_init (&encode_info); encode_key_write (cp, &encode_info, outf); prevcp = cp; @@ -1304,6 +1367,7 @@ void extract_flushWriteKeys (ZebraHandle zh) else encode_key_write (cp + strlen(cp), &encode_info, outf); } + encode_key_flush ( &encode_info, outf); #else qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare); extract_get_fname_tmp (out_fname, key_file_no); @@ -1335,6 +1399,7 @@ void extract_flushWriteKeys (ZebraHandle zh) cp = key_buf[ptr_top-ptr_i]; encode_key_write (cp+key_y_len, &encode_info, outf); } + encode_key_flush ( &encode_info, outf); if (!i) break; prevcp = key_buf[ptr_top-ptr_i]; @@ -1345,9 +1410,9 @@ void extract_flushWriteKeys (ZebraHandle zh) logf (LOG_FATAL|LOG_ERRNO, "fclose %s", out_fname); exit (1); } - logf (LOG_LOG, "finished section %d", zh->key_file_no); - zh->ptr_i = 0; - zh->key_buf_used = 0; + logf (LOG_LOG, "finished section %d", zh->reg->key_file_no); + zh->reg->ptr_i = 0; + zh->reg->key_buf_used = 0; } void extract_add_index_string (RecWord *p, const char *string, @@ -1358,11 +1423,11 @@ void extract_add_index_string (RecWord *p, const char *string, unsigned short attrUse; int lead = 0; int diff = 0; - int *pseqno = &p->seqnos[p->reg_type]; + int *pseqno = &p->seqno; ZebraHandle zh = p->extractCtrl->handle; - ZebraExplainInfo zei = zh->service->zei; - struct recKeys *keys = &zh->keys; - + ZebraExplainInfo zei = zh->reg->zei; + struct recKeys *keys = &zh->reg->keys; + if (keys->buf_used+1024 > keys->buf_max) { char *b; @@ -1403,7 +1468,7 @@ void extract_add_index_string (RecWord *p, const char *string, if (ch < 0) { ch = zebraExplain_addSU (zei, attrSet, attrUse); - yaz_log (LOG_LOG, "addSU set=%d use=%d SU=%d", + yaz_log (LOG_DEBUG, "addSU set=%d use=%d SU=%d", attrSet, attrUse, ch); } assert (ch > 0); @@ -1433,24 +1498,60 @@ void extract_add_index_string (RecWord *p, const char *string, dst += sizeof(*pseqno); } keys->buf_used = dst - keys->buf; - if (*pseqno) - (*pseqno)++; } static void extract_add_sort_string (RecWord *p, const char *string, int length) { +#if 1 + ZebraHandle zh = p->extractCtrl->handle; + struct sortKeys *sk = &zh->reg->sortKeys; + size_t off = 0; + int slen; + + while (off < sk->buf_used) + { + int set, use, l; + + l = key_SU_decode(&set, sk->buf + off); + off += l; + l = key_SU_decode(&use, sk->buf + off); + off += l; + l = key_SU_decode(&slen, sk->buf + off); + off += l + slen; + if (p->attrSet == set && p->attrUse == use) + return; + } + assert (off == sk->buf_used); + + if (sk->buf_used + IT_MAX_WORD > sk->buf_max) + { + char *b; + + b = (char *) xmalloc (sk->buf_max += 128000); + if (sk->buf_used > 0) + memcpy (b, sk->buf, sk->buf_used); + xfree (sk->buf); + sk->buf = b; + } + off += key_SU_encode(p->attrSet, sk->buf + off); + off += key_SU_encode(p->attrUse, sk->buf + off); + slen = strlen(string); + off += key_SU_encode(slen, sk->buf + off); + memcpy (sk->buf + off, string, slen); + sk->buf_used = off + slen; +#else struct sortKey *sk; ZebraHandle zh = p->extractCtrl->handle; - struct sortKey *sortKeys = zh->sortKeys; - for (sk = sortKeys; sk; sk = sk->next) + + for (sk = zh->reg->sortKeys; sk; sk = sk->next) if (sk->attrSet == p->attrSet && sk->attrUse == p->attrUse) return; sk = (struct sortKey *) xmalloc (sizeof(*sk)); - sk->next = sortKeys; - sortKeys = sk; + sk->next = zh->reg->sortKeys; + zh->reg->sortKeys = sk; sk->string = (char *) xmalloc (length); sk->length = length; @@ -1458,6 +1559,7 @@ static void extract_add_sort_string (RecWord *p, const char *string, sk->attrSet = p->attrSet; sk->attrUse = p->attrUse; +#endif } void extract_add_string (RecWord *p, const char *string, int length) @@ -1510,8 +1612,8 @@ static void extract_add_incomplete_field (RecWord *p) if (!i) return; extract_add_string (p, buf, i); + p->seqno++; } - (p->seqnos[p->reg_type])++; /* to separate this from next one */ } static void extract_add_complete_field (RecWord *p) @@ -1578,26 +1680,32 @@ void extract_token_add (RecWord *p) void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid) { ZebraHandle zh = (ZebraHandle) (p->handle); - zebraExplain_addSchema (zh->service->zei, oid); + zebraExplain_addSchema (zh->reg->zei, oid); } void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno, - int cmd, struct sortKey **skp) + int cmd, struct sortKeys *sk) { - struct sortKey *sk = *skp; - SortIdx sortIdx = zh->service->sortIdx; + SortIdx sortIdx = zh->reg->sortIdx; + size_t off = 0; sortIdx_sysno (sortIdx, sysno); - while (sk) + + while (off < sk->buf_used) { - struct sortKey *sk_next = sk->next; - sortIdx_type (sortIdx, sk->attrUse); - sortIdx_add (sortIdx, sk->string, sk->length); - xfree (sk->string); - xfree (sk); - sk = sk_next; + int set, use, slen, l; + + off += key_SU_decode(&set, sk->buf + off); + off += key_SU_decode(&use, sk->buf + off); + off += key_SU_decode(&slen, sk->buf + off); + + sortIdx_type(sortIdx, use); + if (cmd == 1) + sortIdx_add(sortIdx, sk->buf + off, slen); + else + sortIdx_add(sortIdx, "", 1); + off += slen; } - *skp = 0; } void encode_key_init (struct encode_info *i) @@ -1605,6 +1713,10 @@ void encode_key_init (struct encode_info *i) i->sysno = 0; i->seqno = 0; i->cmd = -1; + i->prevsys=0; + i->prevseq=0; + i->prevcmd=-1; + i->keylen=0; } char *encode_key_int (int d, char *bp) @@ -1632,6 +1744,11 @@ char *encode_key_int (int d, char *bp) return bp; } +#ifdef OLDENCODE +/* this is the old encode_key_write + * may be deleted once we are confident that the new works + * HL 15-oct-2002 + */ void encode_key_write (char *k, struct encode_info *i, FILE *outf) { struct it_key key; @@ -1658,3 +1775,108 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf) } } +void encode_key_flush (struct encode_info *i, FILE *outf) +{ /* dummy routine */ +} + +#else + +/* new encode_key_write + * The idea is to buffer one more key, and compare them + * If we are going to delete and insert the same key, + * we may as well not bother. Should make a difference in + * updates with small modifications (appending to a mbox) + */ +void encode_key_write (char *k, struct encode_info *i, FILE *outf) +{ + struct it_key key; + char *bp; + + if (*k) /* first time for new key */ + { + bp = i->buf; + while ((*bp++ = *k++)) + ; + i->keylen= bp - i->buf -1; + assert(i->keylen+1+sizeof(struct it_key) < ENCODE_BUFLEN); + } + else + { + bp=i->buf + i->keylen; + *bp++=0; + k++; + } + + memcpy (&key, k+1, sizeof(struct it_key)); + if (0==i->prevsys) /* no previous filter, fill up */ + { + i->prevsys=key.sysno; + i->prevseq=key.seqno; + i->prevcmd=*k; + } + else if ( (i->prevsys==key.sysno) && + (i->prevseq==key.seqno) && + (i->prevcmd!=*k) ) + { /* same numbers, diff cmd, they cancel out */ + i->prevsys=0; + } + else + { /* different stuff, write previous, move buf */ + bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp); + if (i->sysno != i->prevsys) + { + i->sysno = i->prevsys; + i->seqno = 0; + } + else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd) + { + return; /* ??? Filters some sort of duplicates away */ + /* ??? Can this ever happen -H 15oct02 */ + } + bp = encode_key_int (i->prevseq - i->seqno, bp); + i->seqno = i->prevseq; + i->cmd = i->prevcmd; + if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) + { + logf (LOG_FATAL|LOG_ERRNO, "fwrite"); + exit (1); + } + i->keylen=0; /* ok, it's written, forget it */ + i->prevsys=key.sysno; + i->prevseq=key.seqno; + i->prevcmd=*k; + } +} + +void encode_key_flush (struct encode_info *i, FILE *outf) +{ /* flush the last key from i */ + char *bp =i->buf + i->keylen; + if (0==i->prevsys) + { + return; /* nothing to flush */ + } + *bp++=0; + bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp); + if (i->sysno != i->prevsys) + { + i->sysno = i->prevsys; + i->seqno = 0; + } + else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd) + { + return; /* ??? Filters some sort of duplicates away */ + /* ??? Can this ever happen -H 15oct02 */ + } + bp = encode_key_int (i->prevseq - i->seqno, bp); + i->seqno = i->prevseq; + i->cmd = i->prevcmd; + if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) + { + logf (LOG_FATAL|LOG_ERRNO, "fwrite"); + exit (1); + } + i->keylen=0; /* ok, it's written, forget it */ + i->prevsys=0; /* forget the values too */ + i->prevseq=0; +} +#endif