X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=1312bfe06408a5d1a1c7842f4401c749fdd94d03;hb=951a16f58a1372353a1c85ef9e162ddbe84bbff6;hp=65a6be98c29e5d165788a58eb63b4622eb0bb3f0;hpb=46c0e649af38cec11668a4a15ab10915b06ccbc1;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index 65a6be9..1312bfe 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.251 2007-03-13 13:46:11 adam Exp $ +/* $Id: extract.c,v 1.258 2007-05-08 14:27:23 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -40,6 +40,16 @@ static int log_level_extract = 0; static int log_level_details = 0; static int log_level_initialized = 0; +/* 1 if we eliminate identical delete/insert keys */ +/* eventually the 0-case code will be removed */ +#define FLUSH2 1 + +void extract_flush_record_keys2(ZebraHandle zh, zint sysno, + zebra_rec_keys_t ins_keys, + zint ins_rank, + zebra_rec_keys_t del_keys, + zint del_rank); + static void zebra_init_log_level(void) { if (!log_level_initialized) @@ -321,14 +331,12 @@ static void all_matches_add(struct recExtractCtrl *ctrl) ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, struct ZebraRecStream *stream, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, zint *sysno, const char *match_criteria, const char *fname, - int force_update, - int allow_update, RecType recType, void *recTypeClientData); @@ -432,14 +440,13 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, zebra_create_stream_fd(streamp, fd, 0); } r = zebra_extract_records_stream(zh, streamp, - deleteFlag, + deleteFlag ? 
+ action_delete : action_update, 0, /* tst_mode */ zh->m_record_type, sysno, 0, /*match_criteria */ fname, - 1, /* force_update */ - 1, /* allow_update */ recType, recTypeClientData); if (streamp) stream.destroy(streamp); @@ -456,14 +463,12 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, const char *buf, size_t buf_size, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, zint *sysno, const char *match_criteria, - const char *fname, - int force_update, - int allow_update) + const char *fname) { struct ZebraRecStream stream; ZEBRA_RES res; @@ -500,14 +505,12 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, zebra_create_stream_mem(&stream, buf, buf_size); res = zebra_extract_records_stream(zh, &stream, - delete_flag, + action, test_mode, recordType, sysno, match_criteria, fname, - force_update, - allow_update, recType, clientData); stream.destroy(&stream); return res; @@ -515,14 +518,12 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, struct ZebraRecStream *stream, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, zint *sysno, const char *match_criteria, const char *fname, - int force_update, - int allow_update, RecType recType, void *recTypeClientData) { @@ -531,14 +532,12 @@ ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, { int more = 0; res = zebra_extract_record_stream(zh, stream, - delete_flag, + action, test_mode, recordType, sysno, match_criteria, fname, - force_update, - allow_update, recType, recTypeClientData, &more); if (!more) { @@ -554,16 +553,28 @@ ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, } +static WRBUF wrbuf_hex_str(const char *cstr) +{ + size_t i; + WRBUF w = wrbuf_alloc(); + for (i = 0; cstr[i]; i++) + { + if (cstr[i] < ' ' || cstr[i] > 126) + wrbuf_printf(w, "\\%02X", cstr[i] & 0xff); 
+ else + wrbuf_putc(w, cstr[i]); + } + return w; +} + ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, struct ZebraRecStream *stream, - int delete_flag, + enum zebra_recctrl_action_t action, int test_mode, const char *recordType, zint *sysno, const char *match_criteria, const char *fname, - int force_update, - int allow_update, RecType recType, void *recTypeClientData, int *more) @@ -613,13 +624,19 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, extractCtrl.handle = zh; extractCtrl.match_criteria[0] = '\0'; extractCtrl.staticrank = 0; + extractCtrl.action = action; init_extractCtrl(zh, &extractCtrl); - + extract_set_store_data_prepare(&extractCtrl); r = (*recType->extract)(recTypeClientData, &extractCtrl); + if (action == action_update) + { + action = extractCtrl.action; + } + switch (r) { case RECCTRL_EXTRACT_EOF: @@ -685,12 +702,20 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, int db_ord = zebraExplain_get_database_ord(zh->reg->zei); char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord, matchStr); + + + if (log_level_extract) + { + WRBUF w = wrbuf_hex_str(matchStr); + yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } if (rinfo) { assert(*rinfo == sizeof(*sysno)); memcpy (sysno, rinfo+1, sizeof(*sysno)); } - } + } } if (zebra_rec_keys_empty(zh->reg->keys)) { @@ -703,14 +728,14 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, if (! *sysno) { /* new record */ - if (delete_flag) + if (action == action_delete) { yaz_log (YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); yaz_log (YLOG_WARN, "cannot delete record above (seems new)"); return ZEBRA_FAIL; } - else if (!force_update) + else if (action == action_replace) { yaz_log (YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); @@ -730,7 +755,6 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, yaz_log(YLOG_WARN, "Negative staticrank for record. 
Set to 0"); extractCtrl.staticrank = 0; } - recordAttr->staticrank = extractCtrl.staticrank; if (matchStr) { @@ -740,8 +764,15 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys); +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, + zh->reg->keys, extractCtrl.staticrank, + 0, recordAttr->staticrank); +#else extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys, - recordAttr->staticrank); + extractCtrl.staticrank); +#endif + recordAttr->staticrank = extractCtrl.staticrank; zh->records_inserted++; } else @@ -749,7 +780,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, /* record already exists */ zebra_rec_keys_t delkeys = zebra_rec_keys_open(); zebra_rec_keys_t sortKeys = zebra_rec_keys_open(); - if (!allow_update) + if (action == action_insert) { yaz_log (YLOG_LOG, "skipped %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); @@ -776,11 +807,17 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, 0); extract_flush_sort_keys(zh, *sysno, 0, sortKeys); +#if !FLUSH2 extract_flush_record_keys(zh, *sysno, 0, delkeys, recordAttr->staticrank); - if (delete_flag) +#endif + if (action == action_delete) { /* record going to be deleted */ +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank, + delkeys, recordAttr->staticrank); +#endif if (zebra_rec_keys_empty(delkeys)) { yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, @@ -801,19 +838,28 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } rec_del (zh->reg->records, &rec); } + zebra_rec_keys_close(delkeys); + zebra_rec_keys_close(sortKeys); rec_free(&rec); logRecord(zh); return ZEBRA_OK; } else - { + { /* update or special_update */ if (show_progress) yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); - recordAttr->staticrank = extractCtrl.staticrank; extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys); - extract_flush_record_keys(zh, *sysno, 1, 
zh->reg->keys, - recordAttr->staticrank); + +#if FLUSH2 + extract_flush_record_keys2(zh, *sysno, + zh->reg->keys, extractCtrl.staticrank, + delkeys, recordAttr->staticrank); +#else + extract_flush_record_keys(zh, *sysno, 1, + zh->reg->keys, extractCtrl.staticrank); +#endif + recordAttr->staticrank = extractCtrl.staticrank; zh->records_updated++; } zebra_rec_keys_close(delkeys); @@ -929,6 +975,8 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) extractCtrl.flagShowRecords = 0; extractCtrl.match_criteria[0] = '\0'; extractCtrl.staticrank = 0; + extractCtrl.action = action_update; + extractCtrl.handle = handle; extractCtrl.first_record = 1; @@ -946,7 +994,13 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], 0); +#if FLUSH2 + extract_flush_record_keys2(zh, rec->sysno, + zh->reg->keys, 0, delkeys, 0); +#else extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0); + extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); +#endif zebra_rec_keys_close(delkeys); zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys], @@ -956,7 +1010,14 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys); zebra_rec_keys_close(sortkeys); } - extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); + else + { +#if FLUSH2 + extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0); +#else + extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); +#endif + } extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys); xfree (rec->info[recInfo_delKeys]); @@ -1087,6 +1148,79 @@ void extract_rec_keys_adjust(ZebraHandle zh, int is_insert, } } +void extract_flush_record_keys2(ZebraHandle zh, zint sysno, + zebra_rec_keys_t ins_keys, zint ins_rank, + zebra_rec_keys_t del_keys, zint del_rank) +{ + ZebraExplainInfo zei = zh->reg->zei; + int normal = 
0; + int optimized = 0; + + if (!zh->reg->key_block) + { + int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8")); + const char *key_tmp_dir = res_get_def (zh->res, "keyTmpDir", "."); + int use_threads = atoi(res_get_def (zh->res, "threads", "1")); + zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads); + } + + if (ins_keys) + { + extract_rec_keys_adjust(zh, 1, ins_keys); + if (!del_keys) + zebraExplain_recordCountIncrement (zei, 1); + zebra_rec_keys_rewind(ins_keys); + } + if (del_keys) + { + extract_rec_keys_adjust(zh, 0, del_keys); + if (!ins_keys) + zebraExplain_recordCountIncrement (zei, -1); + zebra_rec_keys_rewind(del_keys); + } + + while (1) + { + size_t del_slen; + const char *del_str; + struct it_key del_key_in; + int del = 0; + + size_t ins_slen; + const char *ins_str; + struct it_key ins_key_in; + int ins = 0; + + if (del_keys) + del = zebra_rec_keys_read(del_keys, &del_str, &del_slen, + &del_key_in); + if (ins_keys) + ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen, + &ins_key_in); + + if (del && ins && ins_rank == del_rank + && !key_compare(&del_key_in, &ins_key_in) + && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen)) + { + optimized++; + continue; + } + if (!del && !ins) + break; + + normal++; + if (del) + key_block_write(zh->reg->key_block, sysno, + &del_key_in, 0, del_str, del_slen, + del_rank, zh->m_staticrank); + if (ins) + key_block_write(zh->reg->key_block, sysno, + &ins_key_in, 1, ins_str, ins_slen, + ins_rank, zh->m_staticrank); + } + yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized); +} + void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd, zebra_rec_keys_t reckeys, zint staticrank) @@ -1111,6 +1245,10 @@ void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd, } zebraExplain_recordCountIncrement (zei, cmd ? 
1 : -1); +#if 0 + yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd); + print_rec_keys(zh, reckeys); +#endif if (zebra_rec_keys_rewind(reckeys)) { size_t slen; @@ -1125,7 +1263,6 @@ void extract_flush_record_keys(ZebraHandle zh, zint sysno, int cmd, } } - ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, zebra_rec_keys_t reckeys, zebra_snippets *snippets) @@ -1402,9 +1539,9 @@ static void extract_token_add(RecWord *p) ZebraHandle zh = p->extractCtrl->handle; WRBUF wrbuf; - if (log_level_extract) + if (log_level_details) { - yaz_log(log_level_extract, "extract_token_add " + yaz_log(log_level_details, "extract_token_add " "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s", p->index_type, p->index_name, p->seqno, p->term_len, p->term_buf);