X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;ds=sidebyside;f=index%2Fextract.c;h=85f91bd959d72217cc3db874ada6f34bd41def84;hb=53f50a1b1dd002ef484a41f50f3598386335cae1;hp=c490210384b22b5f9c3701303f28c94c15c81734;hpb=d4451321ac90adc2c6c52d7d07fadc5af632a59a;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index c490210..85f91bd 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,5 +1,5 @@ -/* $Id: extract.c,v 1.235 2006-11-09 14:39:24 adam Exp $ - Copyright (C) 1995-2006 +/* $Id: extract.c,v 1.257 2007-05-08 12:50:04 adam Exp $ + Copyright (C) 1995-2007 Index Data ApS This file is part of the Zebra server. @@ -36,17 +36,20 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include -#define ENCODE_BUFLEN 768 -struct encode_info { - void *encode_handle; - void *decode_handle; - char buf[ENCODE_BUFLEN]; -}; - static int log_level_extract = 0; static int log_level_details = 0; static int log_level_initialized = 0; +/* 1 if we use eliminitate identical delete/insert keys */ +/* eventually this the 0-case code will be removed */ +#define FLUSH2 1 + +void extract_flush_record_keys2(ZebraHandle zh, zint sysno, + zebra_rec_keys_t ins_keys, + zint ins_rank, + zebra_rec_keys_t del_keys, + zint del_rank); + static void zebra_init_log_level(void) { if (!log_level_initialized) @@ -58,47 +61,26 @@ static void zebra_init_log_level(void) } } -static void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, - int cmd, zebra_rec_keys_t reckeys, - zint staticrank); -static void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno, - int cmd, zebra_rec_keys_t skp); +static void extract_flush_record_keys(ZebraHandle zh, zint sysno, + int cmd, zebra_rec_keys_t reckeys, + zint staticrank); +static void extract_flush_sort_keys(ZebraHandle zh, zint sysno, + int cmd, zebra_rec_keys_t skp); static void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid); static void extract_token_add (RecWord *p); -static void encode_key_init (struct encode_info *i); -static void encode_key_write (char *k, struct encode_info *i, FILE *outf); -static void encode_key_flush (struct encode_info *i, FILE *outf); - -#define USE_SHELLSORT 0 - -#if USE_SHELLSORT -static void shellsort(void *ar, int r, size_t s, - int (*cmp)(const void *a, const void *b)) +static void check_log_limit(ZebraHandle zh) { - char *a = ar; - char v[100]; - int h, i, j, k; - static const int incs[16] = { 1391376, 463792, 198768, 86961, 33936, - 13776, 4592, 1968, 861, 336, - 112, 48, 21, 7, 3, 1 }; - for ( k = 0; k < 16; k++) - for (h = incs[k], i = h; i < r; i++) - { - memcpy (v, a+s*i, s); - j = i; - while (j > h && (*cmp)(a + s*(j-h), v) > 0) - { - memcpy (a + s*j, a + s*(j-h), s); - j -= h; - } - memcpy (a+s*j, v, s); - } + if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit) + { + yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest", + zh->m_file_verbose_limit); + } } -#endif static void logRecord (ZebraHandle zh) { + check_log_limit(zh); ++zh->records_processed; if (!(zh->records_processed % 1000)) { @@ -177,9 +159,9 @@ static void searchRecordKey(ZebraHandle zh, #define FILE_MATCH_BLANK "\t " -static char *fileMatchStr (ZebraHandle zh, - zebra_rec_keys_t reckeys, - const char *fname, const char *spec) +static char *get_match_from_spec(ZebraHandle zh, + zebra_rec_keys_t reckeys, + const char *fname, const char *spec) { static char dstBuf[2048]; /* static here ??? */ char *dst = dstBuf; @@ -347,7 +329,19 @@ static void all_matches_add(struct recExtractCtrl *ctrl) "", 0); } -ZEBRA_RES zebra_extract_file(ZebraHandle zh, SYSNO *sysno, const char *fname, +ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, + struct ZebraRecStream *stream, + enum zebra_recctrl_action_t action, + int test_mode, + const char *recordType, + zint *sysno, + const char *match_criteria, + const char *fname, + RecType recType, + void *recTypeClientData); + + +ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, int deleteFlag) { ZEBRA_RES r = ZEBRA_OK; @@ -389,8 +383,11 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, SYSNO *sysno, const char *fname, } if (!zh->m_record_type) { - if (zh->records_processed < zh->m_file_verbose_limit) + check_log_limit(zh); + if (zh->records_processed + zh->records_skipped + < zh->m_file_verbose_limit) yaz_log (YLOG_LOG, "? %s", fname); + zh->records_skipped++; return 0; } /* determine match criteria */ @@ -442,27 +439,15 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, SYSNO *sysno, const char *fname, streamp = &stream; zebra_create_stream_fd(streamp, fd, 0); } - while(1) - { - r = zebra_extract_record_stream(zh, streamp, - deleteFlag, - 0, /* tst_mode */ - zh->m_record_type, - sysno, - 0, /*match_criteria */ - fname, - 1, /* force_update */ - 1, /* allow_update */ - recType, recTypeClientData); - if (r != ZEBRA_OK) - { - break; - } - if (sysno) - { - break; - } - } + r = zebra_extract_records_stream(zh, streamp, + deleteFlag ? + action_delete : action_update, + 0, /* tst_mode */ + zh->m_record_type, + sysno, + 0, /*match_criteria */ + fname, + recType, recTypeClientData); if (streamp) stream.destroy(streamp); zh->m_record_type = original_record_type; @@ -478,14 +463,12 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, SYSNO *sysno, const char *fname, ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, const char *buf, size_t buf_size, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, - SYSNO *sysno, + zint *sysno, const char *match_criteria, - const char *fname, - int force_update, - int allow_update) + const char *fname) { struct ZebraRecStream stream; ZEBRA_RES res; @@ -519,48 +502,94 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, return ZEBRA_FAIL; } - - zebra_create_stream_mem(&stream, buf, buf_size); - res = zebra_extract_record_stream(zh, &stream, - delete_flag, - test_mode, - recordType, - sysno, - match_criteria, - fname, - force_update, - allow_update, - recType, clientData); + res = zebra_extract_records_stream(zh, &stream, + action, + test_mode, + recordType, + sysno, + match_criteria, + fname, + recType, clientData); stream.destroy(&stream); return res; } +ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, + struct ZebraRecStream *stream, + enum zebra_recctrl_action_t action, + int test_mode, + const char *recordType, + zint *sysno, + const char *match_criteria, + const char *fname, + RecType recType, + void *recTypeClientData) +{ + ZEBRA_RES res = ZEBRA_OK; + while (1) + { + int more = 0; + res = zebra_extract_record_stream(zh, stream, + action, + test_mode, + recordType, + sysno, + match_criteria, + fname, + recType, recTypeClientData, &more); + if (!more) + { + res = ZEBRA_OK; + break; + } + if (res != ZEBRA_OK) + break; + if (sysno) + break; + } + return res; +} + + +static WRBUF wrbuf_hex_str(const char *cstr) +{ + size_t i; + WRBUF w = wrbuf_alloc(); + for (i = 0; cstr[i]; i++) + { + if (cstr[i] < ' ' || cstr[i] > 126) + wrbuf_printf(w, "\\%02X", cstr[i] & 0xff); + else + wrbuf_putc(w, cstr[i]); + } + return w; +} ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, struct ZebraRecStream *stream, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, - SYSNO *sysno, + zint *sysno, const char *match_criteria, const char *fname, - int force_update, - int allow_update, RecType recType, - void *recTypeClientData) + void *recTypeClientData, + int *more) { - SYSNO sysno0 = 0; + zint sysno0 = 0; RecordAttr *recordAttr; struct recExtractCtrl extractCtrl; int r; const char *matchStr = 0; Record rec; - off_t start_offset = 0; + off_t start_offset = 0, end_offset = 0; const char *pr_fname = fname; /* filename to print .. */ - int show_progress = zh->records_processed < zh->m_file_verbose_limit ? 1:0; + int show_progress = zh->records_processed + zh->records_skipped + < zh->m_file_verbose_limit ? 1:0; zebra_init_log_level(); @@ -595,47 +624,75 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, extractCtrl.handle = zh; extractCtrl.match_criteria[0] = '\0'; extractCtrl.staticrank = 0; + extractCtrl.action = action; - init_extractCtrl(zh, &extractCtrl); - + extract_set_store_data_prepare(&extractCtrl); r = (*recType->extract)(recTypeClientData, &extractCtrl); + + if (action == action_update) + { + action = extractCtrl.action; + } - if (r == RECCTRL_EXTRACT_EOF) - return ZEBRA_FAIL; - else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) + switch (r) { + case RECCTRL_EXTRACT_EOF: + return ZEBRA_FAIL; + case RECCTRL_EXTRACT_ERROR_GENERIC: /* error occured during extraction ... */ yaz_log (YLOG_WARN, "extract error: generic"); return ZEBRA_FAIL; - } - else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) - { + case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER: /* error occured during extraction ... */ yaz_log (YLOG_WARN, "extract error: no such filter"); return ZEBRA_FAIL; + case RECCTRL_EXTRACT_SKIP: + if (show_progress) + yaz_log (YLOG_LOG, "skip %s %s " ZINT_FORMAT, + recordType, pr_fname, (zint) start_offset); + *more = 1; + + end_offset = stream->endf(stream, 0); + if (end_offset) + stream->seekf(stream, end_offset); + + return ZEBRA_OK; + case RECCTRL_EXTRACT_OK: + break; + default: + yaz_log (YLOG_WARN, "extract error: unknown error: %d", r); + return ZEBRA_FAIL; } - + end_offset = stream->endf(stream, 0); + if (end_offset) + stream->seekf(stream, end_offset); + else + end_offset = stream->tellf(stream); + all_matches_add(&extractCtrl); if (extractCtrl.match_criteria[0]) match_criteria = extractCtrl.match_criteria; } - if (!sysno) { + *more = 1; + if (!sysno) + { sysno = &sysno0; if (match_criteria && *match_criteria) { matchStr = match_criteria; } else { if (zh->m_record_id && *zh->m_record_id) { - matchStr = fileMatchStr (zh, zh->reg->keys, pr_fname, - zh->m_record_id); + matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, + zh->m_record_id); if (!matchStr) { - yaz_log (YLOG_WARN, "Bad match criteria (recordID)"); + yaz_log (YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); return ZEBRA_FAIL; } } @@ -645,12 +702,20 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, int db_ord = zebraExplain_get_database_ord(zh->reg->zei); char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord, matchStr); + + + if (1) + { + WRBUF w = wrbuf_hex_str(matchStr); + yaz_log(YLOG_LOG, "matchStr: %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } if (rinfo) { assert(*rinfo == sizeof(*sysno)); memcpy (sysno, rinfo+1, sizeof(*sysno)); } - } + } } if (zebra_rec_keys_empty(zh->reg->keys)) { @@ -663,13 +728,20 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, if (! *sysno) { /* new record */ - if (delete_flag) + if (action == action_delete) { yaz_log (YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); yaz_log (YLOG_WARN, "cannot delete record above (seems new)"); return ZEBRA_FAIL; } + else if (action == action_replace) + { + yaz_log (YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); + yaz_log (YLOG_WARN, "cannot update record above (seems new)"); + return ZEBRA_FAIL; + } if (show_progress) yaz_log (YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); @@ -678,7 +750,11 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, *sysno = rec->sysno; recordAttr = rec_init_attr (zh->reg->zei, rec); - recordAttr->staticrank = extractCtrl.staticrank; + if (extractCtrl.staticrank < 0) + { + yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0"); + extractCtrl.staticrank = 0; + } if (matchStr) { @@ -687,10 +763,16 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, sizeof(*sysno), sysno); } - - extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); - extract_flushRecordKeys (zh, *sysno, 1, zh->reg->keys, - recordAttr->staticrank); + extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys); +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, + zh->reg->keys, extractCtrl.staticrank, + 0, recordAttr->staticrank); +#else + extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys, + extractCtrl.staticrank); +#endif + recordAttr->staticrank = extractCtrl.staticrank; zh->records_inserted++; } else @@ -698,7 +780,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, /* record already exists */ zebra_rec_keys_t delkeys = zebra_rec_keys_open(); zebra_rec_keys_t sortKeys = zebra_rec_keys_open(); - if (!allow_update) + if (action == action_insert) { yaz_log (YLOG_LOG, "skipped %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); @@ -711,6 +793,10 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, recordAttr = rec_init_attr (zh->reg->zei, rec); + /* decrease total size */ + zebraExplain_recordBytesIncrement (zh->reg->zei, + - recordAttr->recordSize); + zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], @@ -720,12 +806,18 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, rec->size[recInfo_sortKeys], 0); - extract_flushSortKeys (zh, *sysno, 0, sortKeys); - extract_flushRecordKeys (zh, *sysno, 0, delkeys, - recordAttr->staticrank); - if (delete_flag) + extract_flush_sort_keys(zh, *sysno, 0, sortKeys); +#if !FLUSH2 + extract_flush_record_keys(zh, *sysno, 0, delkeys, + recordAttr->staticrank); +#endif + if (action == action_delete) { /* record going to be deleted */ +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank, + delkeys, recordAttr->staticrank); +#endif if (zebra_rec_keys_empty(delkeys)) { yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, @@ -746,19 +838,28 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } rec_del (zh->reg->records, &rec); } + zebra_rec_keys_close(delkeys); + zebra_rec_keys_close(sortKeys); rec_free(&rec); logRecord(zh); return ZEBRA_OK; } else - { + { /* update or special_update */ if (show_progress) - yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, - pr_fname, (zint) ZINT_FORMAT); + yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); + extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys); + +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, + zh->reg->keys, extractCtrl.staticrank, + delkeys, recordAttr->staticrank); +#else + extract_flush_record_keys(zh, *sysno, 1, + zh->reg->keys, extractCtrl.staticrank); +#endif recordAttr->staticrank = extractCtrl.staticrank; - extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); - extract_flushRecordKeys (zh, *sysno, 1, zh->reg->keys, - recordAttr->staticrank); zh->records_updated++; } zebra_rec_keys_close(delkeys); @@ -794,18 +895,8 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, &rec->info[recInfo_sortKeys], &rec->size[recInfo_sortKeys]); - /* save file size of original record */ - zebraExplain_recordBytesIncrement (zh->reg->zei, - - recordAttr->recordSize); if (stream) { - off_t end_offset = stream->endf(stream, 0); - - if (!end_offset) - end_offset = stream->tellf(stream); - else - stream->seekf(stream, end_offset); - recordAttr->recordSize = end_offset - start_offset; zebraExplain_recordBytesIncrement(zh->reg->zei, recordAttr->recordSize); @@ -824,6 +915,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, rec->size[recInfo_storeData] = zh->store_data_size; rec->info[recInfo_storeData] = zh->store_data_buf; zh->store_data_buf = 0; + recordAttr->recordSize = zh->store_data_size; } else if (zh->m_store_data) { @@ -883,6 +975,8 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) extractCtrl.flagShowRecords = 0; extractCtrl.match_criteria[0] = '\0'; extractCtrl.staticrank = 0; + extractCtrl.action = action_update; + extractCtrl.handle = handle; extractCtrl.first_record = 1; @@ -900,19 +994,32 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], 0); - extract_flushRecordKeys (zh, rec->sysno, 0, delkeys, 0); +#if FLUSH2 + extract_flush_record_keys2(zh, rec->sysno, + zh->reg->keys, 0, delkeys, 0); +#else + extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0); + extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); +#endif zebra_rec_keys_close(delkeys); zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys], rec->size[recInfo_sortKeys], 0); - extract_flushSortKeys (zh, rec->sysno, 0, sortkeys); + extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys); zebra_rec_keys_close(sortkeys); } - extract_flushRecordKeys (zh, rec->sysno, 1, zh->reg->keys, 0); - extract_flushSortKeys (zh, rec->sysno, 1, zh->reg->sortKeys); - + else + { +#if FLUSH2 + extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0); +#else + extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); +#endif + } + extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys); + xfree (rec->info[recInfo_delKeys]); zebra_rec_keys_get_buf(zh->reg->keys, &rec->info[recInfo_delKeys], @@ -1041,234 +1148,119 @@ void extract_rec_keys_adjust(ZebraHandle zh, int is_insert, } } -void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, - int cmd, - zebra_rec_keys_t reckeys, - zint staticrank) +void extract_flush_record_keys2(ZebraHandle zh, zint sysno, + zebra_rec_keys_t ins_keys, zint ins_rank, + zebra_rec_keys_t del_keys, zint del_rank) { ZebraExplainInfo zei = zh->reg->zei; + int normal = 0; + int optimized = 0; - extract_rec_keys_adjust(zh, cmd, reckeys); - - if (log_level_details) + if (!zh->reg->key_block) { - yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s", - sysno, cmd ? "insert" : "delete"); - extract_rec_keys_log(zh, cmd, reckeys, log_level_details); + int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8")); + const char *key_tmp_dir = res_get_def (zh->res, "keyTmpDir", "."); + int use_threads = atoi(res_get_def (zh->res, "threads", "1")); + zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads); } - if (!zh->reg->key_buf) + if (ins_keys) { - int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8")); - if (mem <= 0) - { - yaz_log(YLOG_WARN, "Invalid memory setting, using default 8 MB"); - mem= 1024*1024*8; - } - /* FIXME: That "8" should be in a default settings include */ - /* not hard-coded here! -H */ - zh->reg->key_buf = (char**) xmalloc (mem); - zh->reg->ptr_top = mem/sizeof(char*); - zh->reg->ptr_i = 0; - zh->reg->key_buf_used = 0; - zh->reg->key_file_no = 0; + extract_rec_keys_adjust(zh, 1, ins_keys); + if (!del_keys) + zebraExplain_recordCountIncrement (zei, 1); + zebra_rec_keys_rewind(ins_keys); } - zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1); - - if (zebra_rec_keys_rewind(reckeys)) + if (del_keys) { - size_t slen; - const char *str; - struct it_key key_in; - while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) - { - int ch = 0; - int i, j = 0; - struct it_key key_out; - - assert(key_in.len >= 2); - assert(key_in.len <= IT_KEY_LEVEL_MAX); - - /* check for buffer overflow */ - if (zh->reg->key_buf_used + 1024 > - (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*)) - extract_flushWriteKeys (zh, 0); - - ++(zh->reg->ptr_i); - assert(zh->reg->ptr_i > 0); - (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = - (char*)zh->reg->key_buf + zh->reg->key_buf_used; - - /* key_in.mem[0] ord/ch */ - /* key_in.mem[1] filter specified record ID */ - - /* encode the ordinal value (field/use/attribute) .. */ - ch = CAST_ZINT_TO_INT(key_in.mem[0]); - zh->reg->key_buf_used += - key_SU_encode(ch, (char*)zh->reg->key_buf + - zh->reg->key_buf_used); - - /* copy the 0-terminated stuff from str to output */ - memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, str, slen); - zh->reg->key_buf_used += slen; - ((char*)zh->reg->key_buf)[(zh->reg->key_buf_used)++] = '\0'; - - /* the delete/insert indicator */ - ((char*)zh->reg->key_buf)[(zh->reg->key_buf_used)++] = cmd; - - if (zh->m_staticrank) /* rank config enabled ? */ - { - if (staticrank < 0) - { - yaz_log(YLOG_WARN, "staticrank = %ld. Setting to 0", - (long) staticrank); - staticrank = 0; - } - key_out.mem[j++] = staticrank; - } - - if (key_in.mem[1]) /* filter specified record ID */ - key_out.mem[j++] = key_in.mem[1]; - else - key_out.mem[j++] = sysno; - for (i = 2; i < key_in.len; i++) - key_out.mem[j++] = key_in.mem[i]; - key_out.len = j; - - memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, - &key_out, sizeof(key_out)); - (zh->reg->key_buf_used) += sizeof(key_out); - } + extract_rec_keys_adjust(zh, 0, del_keys); + if (!ins_keys) + zebraExplain_recordCountIncrement (zei, -1); + zebra_rec_keys_rewind(del_keys); } -} -void extract_flushWriteKeys (ZebraHandle zh, int final) - /* optimizing: if final=1, and no files written yet */ - /* push the keys directly to merge, sidestepping the */ - /* temp file altogether. Speeds small updates */ -{ - FILE *outf; - char out_fname[200]; - char *prevcp, *cp; - struct encode_info encode_info; - int ptr_i = zh->reg->ptr_i; - int temp_policy; -#if SORT_EXTRA - int i; -#endif - if (!zh->reg->key_buf || ptr_i <= 0) + while (1) { - yaz_log(log_level_extract, " nothing to flush section=%d buf=%p i=%d", - zh->reg->key_file_no, zh->reg->key_buf, ptr_i); - return; - } - - (zh->reg->key_file_no)++; - yaz_log (YLOG_LOG, "sorting section %d", (zh->reg->key_file_no)); - yaz_log(log_level_extract, " sort_buff at %p n=%d", - zh->reg->key_buf + zh->reg->ptr_top - ptr_i,ptr_i); -#if !SORT_EXTRA - qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i, - sizeof(char*), key_qsort_compare); - - /* zebra.cfg: tempfiles: - Y: always use temp files (old way) - A: use temp files, if more than one (auto) - = if this is both the last and the first - N: never bother with temp files (new) */ - - temp_policy=toupper(res_get_def(zh->res,"tempfiles","auto")[0]); - if (temp_policy != 'Y' && temp_policy != 'N' && temp_policy != 'A') { - yaz_log (YLOG_WARN, "Illegal tempfiles setting '%c'. using 'Auto' ", - temp_policy); - temp_policy='A'; + size_t del_slen; + const char *del_str; + struct it_key del_key_in; + int del = 0; + + size_t ins_slen; + const char *ins_str; + struct it_key ins_key_in; + int ins = 0; + + if (del_keys) + del = zebra_rec_keys_read(del_keys, &del_str, &del_slen, + &del_key_in); + if (ins_keys) + ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen, + &ins_key_in); + + if (del && ins && ins_rank == del_rank + && !key_compare(&del_key_in, &ins_key_in) + && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen)) + { + optimized++; + continue; + } + if (!del && !ins) + break; + + normal++; + if (del) + key_block_write(zh->reg->key_block, sysno, + &del_key_in, 0, del_str, del_slen, + del_rank, zh->m_staticrank); + if (ins) + key_block_write(zh->reg->key_block, sysno, + &ins_key_in, 1, ins_str, ins_slen, + ins_rank, zh->m_staticrank); } + yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized); +} - if ( ( temp_policy =='N' ) || /* always from memory */ - ( ( temp_policy =='A' ) && /* automatic */ - (zh->reg->key_file_no == 1) && /* this is first time */ - (final) ) ) /* and last (=only) time */ - { /* go directly from memory */ - zh->reg->key_file_no =0; /* signal not to read files */ - zebra_index_merge(zh); - zh->reg->ptr_i = 0; - zh->reg->key_buf_used = 0; - return; - } +void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd, + zebra_rec_keys_t reckeys, + zint staticrank) +{ + ZebraExplainInfo zei = zh->reg->zei; - /* Not doing directly from memory, write into a temp file */ - extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no); + extract_rec_keys_adjust(zh, cmd, reckeys); - if (!(outf = fopen (out_fname, "wb"))) - { - yaz_log (YLOG_FATAL|YLOG_ERRNO, "fopen %s", out_fname); - exit (1); - } - yaz_log (YLOG_LOG, "writing section %d", zh->reg->key_file_no); - prevcp = cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i]; - - encode_key_init (&encode_info); - encode_key_write (cp, &encode_info, outf); - - while (--ptr_i > 0) + if (log_level_details) { - cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i]; - if (strcmp (cp, prevcp)) - { - encode_key_flush ( &encode_info, outf); - encode_key_init (&encode_info); - encode_key_write (cp, &encode_info, outf); - prevcp = cp; - } - else - encode_key_write (cp + strlen(cp), &encode_info, outf); + yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s", + sysno, cmd ? "insert" : "delete"); + extract_rec_keys_log(zh, cmd, reckeys, log_level_details); } - encode_key_flush ( &encode_info, outf); -#else - qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare); - extract_get_fname_tmp (out_fname, key_file_no); - if (!(outf = fopen (out_fname, "wb"))) + if (!zh->reg->key_block) { - yaz_log (YLOG_FATAL|YLOG_ERRNO, "fopen %s", out_fname); - exit (1); + int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8")); + const char *key_tmp_dir = res_get_def (zh->res, "keyTmpDir", "."); + int use_threads = atoi(res_get_def (zh->res, "threads", "1")); + zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads); } - yaz_log (YLOG_LOG, "writing section %d", key_file_no); - i = ptr_i; - prevcp = key_buf[ptr_top-i]; - while (1) - if (!--i || strcmp (prevcp, key_buf[ptr_top-i])) - { - key_y_len = strlen(prevcp)+1; + zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1); + #if 0 - yaz_log (YLOG_LOG, "key_y_len: %2d %02x %02x %s", - key_y_len, prevcp[0], prevcp[1], 2+prevcp); -#endif - qsort (key_buf + ptr_top-ptr_i, ptr_i - i, - sizeof(char*), key_y_compare); - cp = key_buf[ptr_top-ptr_i]; - --key_y_len; - encode_key_init (&encode_info); - encode_key_write (cp, &encode_info, outf); - while (--ptr_i > i) - { - cp = key_buf[ptr_top-ptr_i]; - encode_key_write (cp+key_y_len, &encode_info, outf); - } - encode_key_flush ( &encode_info, outf); - if (!i) - break; - prevcp = key_buf[ptr_top-ptr_i]; - } + yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd); + print_rec_keys(zh, reckeys); #endif - if (fclose (outf)) + if (zebra_rec_keys_rewind(reckeys)) { - yaz_log (YLOG_FATAL|YLOG_ERRNO, "fclose %s", out_fname); - exit (1); + size_t slen; + const char *str; + struct it_key key_in; + while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) + { + key_block_write(zh->reg->key_block, sysno, + &key_in, cmd, str, slen, + staticrank, zh->m_staticrank); + } } - yaz_log (YLOG_LOG, "finished section %d", zh->reg->key_file_no); - zh->reg->ptr_i = 0; - zh->reg->key_buf_used = 0; } ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, @@ -1283,8 +1275,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, struct it_key key; while (zebra_rec_keys_read(reckeys, &str, &slen, &key)) { - char dst_buf[IT_MAX_WORD]; - char *dst_term = dst_buf; + char *dst_term = 0; int ord; zint seqno; int index_type; @@ -1378,6 +1369,20 @@ static void extract_add_sort_string(RecWord *p, const char *str, int length) zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key); } +static void extract_add_staticrank_string(RecWord *p, + const char *str, int length) +{ + char valz[40]; + struct recExtractCtrl *ctrl = p->extractCtrl; + + if (length > sizeof(valz)-1) + length = sizeof(valz)-1; + + memcpy(valz, str, length); + valz[length] = '\0'; + ctrl->staticrank = atozint(valz); +} + static void extract_add_string(RecWord *p, const char *string, int length) { ZebraHandle zh = p->extractCtrl->handle; @@ -1386,9 +1391,7 @@ static void extract_add_string(RecWord *p, const char *string, int length) if (!p->index_name) return; - if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type)) - extract_add_sort_string(p, string, length); - else + if (zebra_maps_is_index(zh->reg->zebra_maps, p->index_type)) { extract_add_index_string(p, zinfo_index_category_index, string, length); @@ -1402,6 +1405,14 @@ static void extract_add_string(RecWord *p, const char *string, int length) &word, zinfo_index_category_alwaysmatches, "", 0); } } + else if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type)) + { + extract_add_sort_string(p, string, length); + } + else if (zebra_maps_is_staticrank(zh->reg->zebra_maps, p->index_type)) + { + extract_add_staticrank_string(p, string, length); + } } static void extract_add_incomplete_field(RecWord *p) @@ -1409,20 +1420,12 @@ static void extract_add_incomplete_field(RecWord *p) ZebraHandle zh = p->extractCtrl->handle; const char *b = p->term_buf; int remain = p->term_len; + int first = 1; const char **map = 0; if (remain > 0) map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); - if (map) - { - if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type)) - { - /* first in field marker */ - extract_add_string(p, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN); - p->seqno++; - } - } while (map) { char buf[IT_MAX_WORD+1]; @@ -1455,6 +1458,17 @@ static void extract_add_incomplete_field(RecWord *p) } if (!i) return; + + if (first) + { + first = 0; + if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type)) + { + /* first in field marker */ + extract_add_string(p, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN); + p->seqno++; + } + } extract_add_string (p, buf, i); p->seqno++; } @@ -1525,9 +1539,9 @@ static void extract_token_add(RecWord *p) ZebraHandle zh = p->extractCtrl->handle; WRBUF wrbuf; - if (log_level_extract) + if (log_level_details) { - yaz_log(log_level_extract, "extract_token_add " + yaz_log(log_level_details, "extract_token_add " "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s", p->index_type, p->index_name, p->seqno, p->term_len, p->term_buf); @@ -1569,93 +1583,43 @@ static void extract_set_store_data_prepare(struct recExtractCtrl *p) p->setStoreData = extract_set_store_data_cb; } -static void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid) +static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid) { ZebraHandle zh = (ZebraHandle) p->handle; zebraExplain_addSchema (zh->reg->zei, oid); } -void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno, - int cmd, zebra_rec_keys_t reckeys) +void extract_flush_sort_keys(ZebraHandle zh, zint sysno, + int cmd, zebra_rec_keys_t reckeys) { +#if 0 + yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT, + cmd, sysno); + extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG); +#endif + if (zebra_rec_keys_rewind(reckeys)) { - SortIdx sortIdx = zh->reg->sortIdx; + zebra_sort_index_t si = zh->reg->sort_index; size_t slen; const char *str; struct it_key key_in; - sortIdx_sysno (sortIdx, sysno); + zebra_sort_sysno(si, sysno); while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) { int ord = CAST_ZINT_TO_INT(key_in.mem[0]); - sortIdx_type(sortIdx, ord); + zebra_sort_type(si, ord); if (cmd == 1) - sortIdx_add(sortIdx, str, slen); + zebra_sort_add(si, str, slen); else - sortIdx_add(sortIdx, "", 1); + zebra_sort_delete(si); } } } -static void encode_key_init(struct encode_info *i) -{ - i->encode_handle = iscz1_start(); - i->decode_handle = iscz1_start(); -} - -static void encode_key_write (char *k, struct encode_info *i, FILE *outf) -{ - struct it_key key; - char *bp = i->buf, *bp0; - const char *src = (char *) &key; - - /* copy term to output buf */ - while ((*bp++ = *k++)) - ; - /* and copy & align key so we can mangle */ - memcpy (&key, k+1, sizeof(struct it_key)); /* *k is insert/delete */ - -#if 0 - /* debugging */ - key_logdump_txt(YLOG_LOG, &key, *k ? "i" : "d"); -#endif - assert(key.mem[0] >= 0); - - bp0 = bp++; - iscz1_encode(i->encode_handle, &bp, &src); - - *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */ - if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) - { - yaz_log (YLOG_FATAL|YLOG_ERRNO, "fwrite"); - exit (1); - } - -#if 0 - /* debugging */ - if (1) - { - struct it_key key2; - const char *src = bp0+1; - char *dst = (char*) &key2; - iscz1_decode(i->decode_handle, &dst, &src); - - key_logdump_txt(YLOG_LOG, &key2, *k ? "i" : "d"); - - assert(key2.mem[1]); - } -#endif -} - -static void encode_key_flush (struct encode_info *i, FILE *outf) -{ - iscz1_stop(i->encode_handle); - iscz1_stop(i->decode_handle); -} - /* * Local variables: * c-basic-offset: 4