X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=index%2Fextract.c;h=5071fd74fbfae9a8c5f7a25d83fe43babda50017;hp=48dd978f7b8e3077caa25f10bb1075dc09b7fa6c;hb=e2e073b5c947e996304ed7d577497af5e9a879ee;hpb=896b30853daabb6294afe8b0a2f74fa6d6e397d8 diff --git a/index/extract.c b/index/extract.c index 48dd978..5071fd7 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,8 +1,5 @@ -/* $Id: extract.c,v 1.271 2007-12-07 14:09:09 adam Exp $ - Copyright (C) 1995-2007 - Index Data ApS - -This file is part of the Zebra server. +/* This file is part of the Zebra server. + Copyright (C) Index Data Zebra is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -24,6 +21,9 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA \brief indexes records and extract tokens for indexing and sorting */ +#if HAVE_CONFIG_H +#include +#endif #include #include #include @@ -50,11 +50,18 @@ static int log_level_initialized = 0; /* eventually this the 0-case code will be removed */ #define FLUSH2 1 -void extract_flush_record_keys2(ZebraHandle zh, zint sysno, - zebra_rec_keys_t ins_keys, - zint ins_rank, - zebra_rec_keys_t del_keys, - zint del_rank); +#if FLUSH2 +static void extract_flush_record_keys2(ZebraHandle zh, zint sysno, + zebra_rec_keys_t ins_keys, + zint ins_rank, + zebra_rec_keys_t del_keys, + zint del_rank); +#else +static void extract_flush_record_keys(ZebraHandle zh, zint sysno, + int cmd, + zebra_rec_keys_t reckeys, + zint staticrank); +#endif static void zebra_init_log_level(void) { @@ -67,6 +74,21 @@ static void zebra_init_log_level(void) } } +static WRBUF wrbuf_hex_str(const char *cstr) +{ + size_t i; + WRBUF w = wrbuf_alloc(); + for (i = 0; cstr[i]; i++) + { + if (cstr[i] < ' ' || cstr[i] > 126) + wrbuf_printf(w, "\\%02X", cstr[i] & 0xff); + else + wrbuf_putc(w, cstr[i]); + } + return w; +} + + static void extract_flush_sort_keys(ZebraHandle zh, zint sysno, int cmd, zebra_rec_keys_t skp); static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid); @@ -88,8 +110,8 @@ static void logRecord(ZebraHandle zh) if (!(zh->records_processed % 1000)) { yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d " - ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, - zh->records_processed, zh->records_inserted, + ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT, + zh->records_processed, zh->records_inserted, zh->records_updated, zh->records_deleted); } } @@ -100,7 +122,7 @@ static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl) } -static void extract_add_index_string(RecWord *p, +static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat, const char *str, int length); @@ -122,34 +144,24 @@ struct snip_rec_info { zebra_snippets *snippets; }; - -static void snippet_add_complete_field(RecWord *p, int ord, - zebra_map_t zm) +static int parse_complete_field(RecWord *p, zebra_map_t zm, + char *buf) { - struct snip_rec_info *h = p->extractCtrl->handle; - const char *b = p->term_buf; - char buf[IT_MAX_WORD+1]; const char **map = 0; int i = 0, remain = p->term_len; - const char *start = b; - const char *last = 0; if (remain > 0) map = zebra_maps_input(zm, &b, remain, 1); - while (remain > 0 && i < IT_MAX_WORD) { while (map && *map && **map == *CHR_SPACE) { remain = p->term_len - (b - p->term_buf); - if (i == 0) - start = b; /* set to first non-ws area */ if (remain > 0) { int first = i ? 0 : 1; /* first position */ - map = zebra_maps_input(zm, &b, remain, first); } else @@ -175,7 +187,6 @@ static void snippet_add_complete_field(RecWord *p, int ord, while (i < IT_MAX_WORD && *cp) buf[i++] = *(cp++); } - last = b; remain = p->term_len - (b - p->term_buf); if (remain > 0) { @@ -185,11 +196,23 @@ static void snippet_add_complete_field(RecWord *p, int ord, map = 0; } } + return i; +} + +static void snippet_add_complete_field(RecWord *p, int ord, + zebra_map_t zm) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + char buf[IT_MAX_WORD+1]; + int i = parse_complete_field(p, zm, buf); + if (!i) - return; - if (last && start != last) + return; + + if (p->term_len && p->term_buf && zebra_maps_is_index(zm)) zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, - start, last - start); + p->term_buf, p->term_len); + p->seqno++; } static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) @@ -207,8 +230,7 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) while (map) { - char buf[IT_MAX_WORD+1]; - int i, remain; + int remain; /* Skip spaces */ while (map && *map && **map == *CHR_SPACE) @@ -222,21 +244,14 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) } if (!map) break; - if (start != last) + if (start != last && zebra_maps_is_index(zm)) { zebra_snippets_appendn(h->snippets, p->seqno, 1, ord, start, last - start); - } start = last; - - i = 0; while (map && *map && **map != *CHR_SPACE) { - const char *cp = *map; - - while (i < IT_MAX_WORD && *cp) - buf[i++] = *(cp++); remain = p->term_len - (b - p->term_buf); last = b; if (remain > 0) @@ -244,11 +259,11 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) else map = 0; } - if (!i) - return; + if (start == last) + return ; if (first) - { + { first = 0; if (zebra_maps_is_first_in_field(zm)) { @@ -256,7 +271,7 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) p->seqno++; } } - if (start != last) + if (start != last && zebra_maps_is_index(zm)) zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, start, last - start); start = last; @@ -265,22 +280,48 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) } +static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + + const char *res_buf = 0; + size_t res_len = 0; + + const char *display_buf = 0; + size_t display_len = 0; + + zebra_map_tokenize_start(zm, p->term_buf, p->term_len); + while (zebra_map_tokenize_next(zm, &res_buf, &res_len, + &display_buf, &display_len)) + { + if (zebra_maps_is_index(zm)) + zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, + display_buf, display_len); + p->seqno++; + } +} + static void snippet_token_add(RecWord *p) { struct snip_rec_info *h = p->extractCtrl->handle; ZebraHandle zh = h->zh; - zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type); + zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type); - if (zm && zebra_maps_is_index(zm)) + if (zm) { ZebraExplainInfo zei = zh->reg->zei; int ch = zebraExplain_lookup_attr_str( zei, zinfo_index_category_index, p->index_type, p->index_name); - if (zebra_maps_is_complete(zm)) - snippet_add_complete_field(p, ch, zm); + if (zebra_maps_is_icu(zm)) + snippet_add_icu(p, ch, zm); else - snippet_add_incomplete_field(p, ch, zm); + { + if (zebra_maps_is_complete(zm)) + snippet_add_complete_field(p, ch, zm); + else + snippet_add_incomplete_field(p, ch, zm); + } } } @@ -296,7 +337,6 @@ void extract_snippet(ZebraHandle zh, zebra_snippets *sn, { struct recExtractCtrl extractCtrl; struct snip_rec_info info; - int r; extractCtrl.stream = stream; extractCtrl.first_record = 1; @@ -307,20 +347,19 @@ void extract_snippet(ZebraHandle zh, zebra_snippets *sn, assert(zh->reg->dh); extractCtrl.dh = zh->reg->dh; - + info.zh = zh; info.snippets = sn; extractCtrl.handle = &info; extractCtrl.match_criteria[0] = '\0'; extractCtrl.staticrank = 0; extractCtrl.action = action_insert; - + init_extractCtrl(zh, &extractCtrl); extractCtrl.setStoreData = 0; - r = (*rt->extract)(recTypeClientData, &extractCtrl); - + (*rt->extract)(recTypeClientData, &extractCtrl); } static void searchRecordKey(ZebraHandle zh, @@ -357,11 +396,11 @@ static void searchRecordKey(ZebraHandle zh, assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; - + if (key.mem[0] == ch) { zint woff; - + if (startSeq == -1) startSeq = seqno; woff = seqno - startSeq; @@ -394,15 +433,15 @@ static char *get_match_from_spec(ZebraHandle zh, char attset_str[64], attname_str[64]; int i; int first = 1; - + for (s++; strchr(FILE_MATCH_BLANK, *s); s++) ; - for (i = 0; *s && *s != ',' && *s != ')' && + for (i = 0; *s && *s != ',' && *s != ')' && !strchr(FILE_MATCH_BLANK, *s); s++) if (i+1 < sizeof(attset_str)) attset_str[i++] = *s; attset_str[i] = '\0'; - + for (; strchr(FILE_MATCH_BLANK, *s); s++) ; if (*s != ',') @@ -411,15 +450,12 @@ static char *get_match_from_spec(ZebraHandle zh, { for (s++; strchr(FILE_MATCH_BLANK, *s); s++) ; - for (i = 0; *s && *s != ')' && + for (i = 0; *s && *s != ')' && !strchr(FILE_MATCH_BLANK, *s); s++) if (i+1 < sizeof(attname_str)) attname_str[i++] = *s; attname_str[i] = '\0'; } - - searchRecordKey(zh, reckeys, attname_str, ws, 32); - if (*s != ')') { yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s", @@ -428,6 +464,20 @@ static char *get_match_from_spec(ZebraHandle zh, } s++; + searchRecordKey(zh, reckeys, attname_str, ws, 32); + if (0) /* for debugging */ + { + for (i = 0; i<32; i++) + { + if (ws[i]) + { + WRBUF w = wrbuf_hex_str(ws[i]); + yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w)); + wrbuf_destroy(w); + } + } + } + for (i = 0; i<32; i++) if (ws[i]) { @@ -471,7 +521,7 @@ static char *get_match_from_spec(ZebraHandle zh, } else if (!strcmp(special, "type")) spec_src = zh->m_record_type; - else + else spec_src = NULL; if (spec_src) { @@ -511,6 +561,14 @@ static char *get_match_from_spec(ZebraHandle zh, return NULL; } *dst = '\0'; + + if (0) /* for debugging */ + { + WRBUF w = wrbuf_hex_str(dstBuf); + yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } + return dstBuf; } @@ -524,7 +582,7 @@ struct recordLogInfo { \param ctrl record control \param record_id custom record ID \param sysno system record ID - + This function serves two purposes.. It adds the always matches entry and makes a pointer from the custom record ID (if defined) back to the system record ID (sysno) @@ -547,10 +605,10 @@ static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id, "", 0); } -ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, +/* forward declaration */ +ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, struct ZebraRecStream *stream, enum zebra_recctrl_action_t action, - int test_mode, const char *recordType, zint *sysno, const char *match_criteria, @@ -559,15 +617,14 @@ ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, void *recTypeClientData); -ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, - int deleteFlag) +ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, + enum zebra_recctrl_action_t action) { ZEBRA_RES r = ZEBRA_OK; int i, fd; char gprefix[128]; char ext[128]; char ext_res[128]; - struct file_read_info *fi = 0; const char *original_record_type = 0; RecType recType; void *recTypeClientData; @@ -579,7 +636,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, *gprefix = '\0'; else sprintf(gprefix, "%s.", zh->m_group); - + yaz_log(log_level_extract, "zebra_extract_file %s", fname); /* determine file extension */ @@ -630,10 +687,9 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, default: yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type); } - if (sysno && deleteFlag) + if (sysno && (action == action_delete || action == action_a_delete)) { streamp = 0; - fi = 0; } else { @@ -647,7 +703,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, } else strcpy(full_rep, fname); - + if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1) { yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep); @@ -658,9 +714,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, zebra_create_stream_fd(streamp, fd, 0); } r = zebra_extract_records_stream(zh, streamp, - deleteFlag ? - action_delete : action_update, - 0, /* tst_mode */ + action, zh->m_record_type, sysno, 0, /*match_criteria */ @@ -676,13 +730,12 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, If sysno is provided, then it's used to identify the reocord. If not, and match_criteria is provided, then sysno is guessed If not, and a record is provided, then sysno is got from there - + */ -ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, +ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, const char *buf, size_t buf_size, enum zebra_recctrl_action_t action, - int test_mode, const char *recordType, zint *sysno, const char *match_criteria, @@ -699,7 +752,7 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, "Record type explicitly specified: %s", recordType); recType = recType_byName(zh->reg->recTypes, zh->res, recordType, &clientData); - } + } else { if (!(zh->m_record_type)) @@ -713,7 +766,7 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, zh->m_record_type, &clientData); recordType = zh->m_record_type; } - + if (!recType) { yaz_log(YLOG_WARN, "No such record type: %s", recordType); @@ -724,7 +777,6 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, res = zebra_extract_records_stream(zh, &stream, action, - test_mode, recordType, sysno, match_criteria, @@ -734,68 +786,16 @@ ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh, return res; } -ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, - struct ZebraRecStream *stream, - enum zebra_recctrl_action_t action, - int test_mode, - const char *recordType, - zint *sysno, - const char *match_criteria, - const char *fname, - RecType recType, - void *recTypeClientData) -{ - ZEBRA_RES res = ZEBRA_OK; - while (1) - { - int more = 0; - res = zebra_extract_record_stream(zh, stream, - action, - test_mode, - recordType, - sysno, - match_criteria, - fname, - recType, recTypeClientData, &more); - if (!more) - { - res = ZEBRA_OK; - break; - } - if (res != ZEBRA_OK) - break; - if (sysno) - break; - } - return res; -} - - -static WRBUF wrbuf_hex_str(const char *cstr) -{ - size_t i; - WRBUF w = wrbuf_alloc(); - for (i = 0; cstr[i]; i++) - { - if (cstr[i] < ' ' || cstr[i] > 126) - wrbuf_printf(w, "\\%02X", cstr[i] & 0xff); - else - wrbuf_putc(w, cstr[i]); - } - return w; -} - -ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, - struct ZebraRecStream *stream, - enum zebra_recctrl_action_t action, - int test_mode, - const char *recordType, - zint *sysno, - const char *match_criteria, - const char *fname, - RecType recType, - void *recTypeClientData, - int *more) +static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, + struct ZebraRecStream *stream, + enum zebra_recctrl_action_t action, + const char *recordType, + zint *sysno, + const char *match_criteria, + const char *fname, + RecType recType, + void *recTypeClientData, + int *more) { zint sysno0 = 0; @@ -806,7 +806,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, Record rec; off_t start_offset = 0, end_offset = 0; const char *pr_fname = fname; /* filename to print .. */ - int show_progress = zh->records_processed + zh->records_skipped + int show_progress = zh->records_processed + zh->records_skipped < zh->m_file_verbose_limit ? 1:0; zebra_init_log_level(); @@ -819,7 +819,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0])) { - if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], + if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0], zh->m_explain_database)) return ZEBRA_FAIL; } @@ -832,7 +832,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, start_offset = stream->tellf(stream); extractCtrl.first_record = start_offset ? 0 : 1; - + stream->endf(stream, &null_offset);; extractCtrl.init = extract_init; @@ -847,14 +847,14 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, init_extractCtrl(zh, &extractCtrl); extract_set_store_data_prepare(&extractCtrl); - + r = (*recType->extract)(recTypeClientData, &extractCtrl); if (action == action_update) { action = extractCtrl.action; } - + switch (r) { case RECCTRL_EXTRACT_EOF: @@ -872,7 +872,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); *more = 1; - + end_offset = stream->endf(stream, 0); if (end_offset) stream->seekf(stream, end_offset); @@ -895,15 +895,26 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } *more = 1; + + if (zh->m_flag_rw == 0) + { + yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); + /* test mode .. Do not perform match */ + return ZEBRA_OK; + } + if (!sysno) { sysno = &sysno0; - if (match_criteria && *match_criteria) { + if (match_criteria && *match_criteria) matchStr = match_criteria; - } else { - if (zh->m_record_id && *zh->m_record_id) { - matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, + else + { + if (zh->m_record_id && *zh->m_record_id) + { + matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname, zh->m_record_id); if (!matchStr) { @@ -911,15 +922,26 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, pr_fname, (zint) start_offset); return ZEBRA_FAIL; } + if (0 && matchStr) + { + WRBUF w = wrbuf_alloc(); + size_t i; + for (i = 0; i < strlen(matchStr); i++) + { + wrbuf_printf(w, "%02X", matchStr[i] & 0xff); + } + yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } } } - if (matchStr) + if (matchStr) { int db_ord = zebraExplain_get_database_ord(zh->reg->zei); char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord, matchStr); - + if (log_level_extract) { WRBUF w = wrbuf_hex_str(matchStr); @@ -934,24 +956,23 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } } - if (zebra_rec_keys_empty(zh->reg->keys)) - { - /* the extraction process returned no information - the record - is probably empty - unless flagShowRecords is in use */ - if (test_mode) - return ZEBRA_OK; - } - if (! *sysno) { - /* new record */ + /* new record AKA does not exist already */ if (action == action_delete) { - yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, - pr_fname, (zint) start_offset); + yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); yaz_log(YLOG_WARN, "cannot delete record above (seems new)"); return ZEBRA_FAIL; } + else if (action == action_a_delete) + { + if (show_progress) + yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); + return ZEBRA_OK; + } else if (action == action_replace) { yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, @@ -1000,7 +1021,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, #endif recordAttr->staticrank = extractCtrl.staticrank; zh->records_inserted++; - } + } else { /* record already exists */ @@ -1008,7 +1029,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, zebra_rec_keys_t sortKeys = zebra_rec_keys_open(); if (action == action_insert) { - yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, + yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT, recordType, pr_fname, (zint) start_offset); logRecord(zh); return ZEBRA_FAIL; @@ -1023,7 +1044,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, zebra_rec_keys_get_custom_record_id(zh->reg->keys), *sysno); } - + recordAttr = rec_init_attr(zh->reg->zei, rec); /* decrease total size */ @@ -1044,13 +1065,13 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, extract_flush_record_keys(zh, *sysno, 0, delkeys, recordAttr->staticrank); #endif - if (action == action_delete) + if (action == action_delete || action == action_a_delete) { /* record going to be deleted */ #if FLUSH2 extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank, delkeys, recordAttr->staticrank); -#endif +#endif if (zebra_rec_keys_empty(delkeys)) { yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, @@ -1089,7 +1110,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, zh->reg->keys, extractCtrl.staticrank, delkeys, recordAttr->staticrank); #else - extract_flush_record_keys(zh, *sysno, 1, + extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys, extractCtrl.staticrank); #endif recordAttr->staticrank = extractCtrl.staticrank; @@ -1170,17 +1191,63 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, /* update database name */ xfree(rec->info[recInfo_databaseName]); rec->info[recInfo_databaseName] = - rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); + rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]); /* update offset */ recordAttr->recordOffset = start_offset; - + /* commit this record */ rec_put(zh->reg->records, &rec); logRecord(zh); return ZEBRA_OK; } +/** \brief extracts records from stream + \param zh Zebra Handle + \param stream stream that we read from + \param action (action_insert, action_replace, action_delete, ..) + \param recordType Record filter type "grs.xml", etc. + \param sysno pointer to sysno if already known; NULL otherwise + \param match_criteria (NULL if not already given) + \param fname filename that we read from (for logging purposes only) + \param recType record type + \param recTypeClientData client data for record type + \returns ZEBRA_OK for success; ZEBRA_FAIL for failure +*/ +ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, + struct ZebraRecStream *stream, + enum zebra_recctrl_action_t action, + const char *recordType, + zint *sysno, + const char *match_criteria, + const char *fname, + RecType recType, + void *recTypeClientData) +{ + ZEBRA_RES res = ZEBRA_OK; + while (1) + { + int more = 0; + res = zebra_extract_record_stream(zh, stream, + action, + recordType, + sysno, + match_criteria, + fname, + recType, recTypeClientData, &more); + if (!more) + { + res = ZEBRA_OK; + break; + } + if (res != ZEBRA_OK) + break; + if (sysno) + break; + } + return res; +} + ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) { ZebraHandle zh = (ZebraHandle) handle; @@ -1212,7 +1279,7 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) extractCtrl.handle = handle; extractCtrl.first_record = 1; - + extract_set_store_data_prepare(&extractCtrl); if (n) @@ -1221,14 +1288,14 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) if (rec->size[recInfo_delKeys]) { zebra_rec_keys_t delkeys = zebra_rec_keys_open(); - + zebra_rec_keys_t sortkeys = zebra_rec_keys_open(); zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], 0); #if FLUSH2 - extract_flush_record_keys2(zh, rec->sysno, + extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, delkeys, 0); #else extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0); @@ -1248,14 +1315,14 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) #if FLUSH2 extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0); #else - extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); + extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0); #endif } extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys); - + xfree(rec->info[recInfo_delKeys]); zebra_rec_keys_get_buf(zh->reg->keys, - &rec->info[recInfo_delKeys], + &rec->info[recInfo_delKeys], &rec->size[recInfo_delKeys]); xfree(rec->info[recInfo_sortKeys]); @@ -1265,6 +1332,65 @@ ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n) return ZEBRA_OK; } +void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key, + const char *str, size_t slen, NMEM nmem, int level) +{ + char keystr[200]; /* room for zints to print */ + int ord = CAST_ZINT_TO_INT(key->mem[0]); + const char *index_type; + int i; + const char *string_index; + + zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, + 0/* db */, &string_index); + assert(index_type); + *keystr = '\0'; + for (i = 0; i < key->len; i++) + { + sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]); + } + + if (*str < CHR_BASE_CHAR) + { + int i; + char dst_buf[200]; /* room for special chars */ + + strcpy(dst_buf , "?"); + + if (!strcmp(str, "")) + strcpy(dst_buf, "alwaysmatches"); + if (!strcmp(str, FIRST_IN_FIELD_STR)) + strcpy(dst_buf, "firstinfield"); + else if (!strcmp(str, CHR_UNKNOWN)) + strcpy(dst_buf, "unknown"); + else if (!strcmp(str, CHR_SPACE)) + strcpy(dst_buf, "space"); + + for (i = 0; ireg->zei, ord, &index_type, - 0/* db */, &string_index); - assert(index_type); - zebra_term_untrans_iconv(zh, nmem, index_type, - &dst_term, str); - *keystr = '\0'; - for (i = 0; ireg->zei; int normal = 0; @@ -1433,7 +1516,7 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno, &ins_key_in); if (del && ins && ins_rank == del_rank - && !key_compare(&del_key_in, &ins_key_in) + && !key_compare(&del_key_in, &ins_key_in) && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen)) { optimized++; @@ -1441,27 +1524,70 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno, } if (!del && !ins) break; - + normal++; if (del) - key_block_write(zh->reg->key_block, sysno, + key_block_write(zh->reg->key_block, sysno, &del_key_in, 0, del_str, del_slen, del_rank, zh->m_staticrank); if (ins) - key_block_write(zh->reg->key_block, sysno, + key_block_write(zh->reg->key_block, sysno, &ins_key_in, 1, ins_str, ins_slen, ins_rank, zh->m_staticrank); } yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized); } +#else +static void extract_flush_record_keys( + ZebraHandle zh, zint sysno, int cmd, + zebra_rec_keys_t reckeys, + zint staticrank) +{ + ZebraExplainInfo zei = zh->reg->zei; + + extract_rec_keys_adjust(zh, cmd, reckeys); + + if (log_level_details) + { + yaz_log(log_level_details, "Keys for record " ZINT_FORMAT " %s", + sysno, cmd ? "insert" : "delete"); + extract_rec_keys_log(zh, cmd, reckeys, log_level_details); + } + if (!zh->reg->key_block) + { + int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8")); + const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", "."); + int use_threads = atoi(res_get_def(zh->res, "threads", "1")); + zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads); + } + zebraExplain_recordCountIncrement(zei, cmd ? 1 : -1); + +#if 0 + yaz_log(YLOG_LOG, "sysno=" ZINT_FORMAT " cmd=%d", sysno, cmd); + print_rec_keys(zh, reckeys); +#endif + if (zebra_rec_keys_rewind(reckeys)) + { + size_t slen; + const char *str; + struct it_key key_in; + while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) + { + key_block_write(zh->reg->key_block, sysno, + &key_in, cmd, str, slen, + staticrank, zh->m_staticrank); + } + } +} +#endif -ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, +ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh, zebra_rec_keys_t reckeys, zebra_snippets *snippets) { NMEM nmem = nmem_create(); - if (zebra_rec_keys_rewind(reckeys)) + if (zebra_rec_keys_rewind(reckeys)) { const char *str; size_t slen; @@ -1476,7 +1602,7 @@ ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; ord = CAST_ZINT_TO_INT(key.mem[0]); - + zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, 0/* db */, 0 /* string_index */); assert(index_type); @@ -1508,13 +1634,13 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys) assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0); - + seqno = key.mem[key.len-1]; - + zebra_term_untrans(zh, index_type, dst_buf, str); - - yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT - " term=%s", ord, seqno, dst_buf); + + yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT + " term=%s", ord, seqno, dst_buf); } } } @@ -1555,9 +1681,10 @@ static void extract_add_sort_string(RecWord *p, const char *str, int length) ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name); if (ch < 0) ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name); - key.len = 2; + key.len = 3; key.mem[0] = ch; key.mem[1] = p->record_id; + key.mem[2] = p->section_id; zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key); } @@ -1583,7 +1710,15 @@ static void extract_add_string(RecWord *p, zebra_map_t zm, if (!p->index_name) return; + if (log_level_details) + { + + WRBUF w = wrbuf_alloc(); + wrbuf_write_escaped(w, string, length); + yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } if (zebra_maps_is_index(zm)) { extract_add_index_string(p, zinfo_index_category_index, @@ -1614,7 +1749,7 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm) int remain = p->term_len; int first = 1; const char **map = 0; - + if (remain > 0) map = zebra_maps_input(zm, &b, remain, 0); @@ -1651,7 +1786,7 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm) return; if (first) - { + { first = 0; if (zebra_maps_is_first_in_field(zm)) { @@ -1667,90 +1802,28 @@ static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm) static void extract_add_complete_field(RecWord *p, zebra_map_t zm) { - const char *b = p->term_buf; char buf[IT_MAX_WORD+1]; - const char **map = 0; - int i = 0, remain = p->term_len; - - if (remain > 0) - map = zebra_maps_input(zm, &b, remain, 1); - - while (remain > 0 && i < IT_MAX_WORD) - { - while (map && *map && **map == *CHR_SPACE) - { - remain = p->term_len - (b - p->term_buf); - - if (remain > 0) - { - int first = i ? 0 : 1; /* first position */ - map = zebra_maps_input(zm, &b, remain, first); - } - else - map = 0; - } - if (!map) - break; - - if (i && i < IT_MAX_WORD) - buf[i++] = *CHR_SPACE; - while (map && *map && **map != *CHR_SPACE) - { - const char *cp = *map; - - if (**map == *CHR_CUT) - { - i = 0; - } - else - { - if (i >= IT_MAX_WORD) - break; - while (i < IT_MAX_WORD && *cp) - buf[i++] = *(cp++); - } - remain = p->term_len - (b - p->term_buf); - if (remain > 0) - { - map = zebra_maps_input(zm, &b, remain, 0); - } - else - map = 0; - } - } + int i = parse_complete_field(p, zm, buf); if (!i) return; extract_add_string(p, zm, buf, i); + p->seqno++; } static void extract_add_icu(RecWord *p, zebra_map_t zm) { - struct it_key key; const char *res_buf = 0; size_t res_len = 0; - ZebraHandle zh = p->extractCtrl->handle; - - int cat = zinfo_index_category_index; - int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, - p->index_type, p->index_name); - if (ch < 0) - ch = zebraExplain_add_attr_str(zh->reg->zei, cat, - p->index_type, p->index_name); + zebra_map_tokenize_start(zm, p->term_buf, p->term_len); - while (zebra_map_tokenize_next(zm, &res_buf, &res_len)) + while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0)) { - int i = 0; - key.mem[i++] = ch; - key.mem[i++] = p->record_id; - key.mem[i++] = p->section_id; - - if (zh->m_segment_indexing) - key.mem[i++] = p->segment; - key.mem[i++] = p->seqno; - key.len = i; - - zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key); - + if (res_len > IT_MAX_WORD) + { + yaz_log(YLOG_LOG, "Truncating long term %ld", (long) res_len); + res_len = IT_MAX_WORD; + } + extract_add_string(p, zm, res_buf, res_len); p->seqno++; } } @@ -1760,35 +1833,29 @@ static void extract_add_icu(RecWord *p, zebra_map_t zm) \param p token data to be indexed Call sequence: - extract_token - zebra_add_{in}_complete + extract_token_add + extract_add_{in}_complete / extract_add_icu extract_add_string - + extract_add_index_string or extract_add_sort_string or extract_add_staticrank_string - + */ static void extract_token_add(RecWord *p) { ZebraHandle zh = p->extractCtrl->handle; zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type); - WRBUF wrbuf; if (log_level_details) { yaz_log(log_level_details, "extract_token_add " "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s", - p->index_type, p->index_name, + p->index_type, p->index_name, p->seqno, p->term_len, p->term_buf); } - if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len))) - { - p->term_buf = wrbuf_buf(wrbuf); - p->term_len = wrbuf_len(wrbuf); - } if (zebra_maps_is_icu(zm)) { extract_add_icu(p, zm); @@ -1849,24 +1916,68 @@ void extract_flush_sort_keys(ZebraHandle zh, zint sysno, const char *str; struct it_key key_in; - zebra_sort_sysno(si, sysno); + NMEM nmem = nmem_create(); + struct sort_add_ent { + int ord; + int cmd; + struct sort_add_ent *next; + WRBUF wrbuf; + zint sysno; + zint section_id; + }; + struct sort_add_ent *sort_ent_list = 0; while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) { int ord = CAST_ZINT_TO_INT(key_in.mem[0]); - - zebra_sort_type(si, ord); - if (cmd == 1) - zebra_sort_add(si, str, slen); - else - zebra_sort_delete(si); + zint filter_sysno = key_in.mem[1]; + zint section_id = key_in.mem[2]; + + struct sort_add_ent **e = &sort_ent_list; + for (; *e; e = &(*e)->next) + if ((*e)->ord == ord && section_id == (*e)->section_id) + break; + if (!*e) + { + *e = nmem_malloc(nmem, sizeof(**e)); + (*e)->next = 0; + (*e)->wrbuf = wrbuf_alloc(); + (*e)->ord = ord; + (*e)->cmd = cmd; + (*e)->sysno = filter_sysno ? filter_sysno : sysno; + (*e)->section_id = section_id; + } + + wrbuf_write((*e)->wrbuf, str, slen); + wrbuf_putc((*e)->wrbuf, '\0'); } + if (sort_ent_list) + { + zint last_sysno = 0; + struct sort_add_ent *e = sort_ent_list; + for (; e; e = e->next) + { + if (last_sysno != e->sysno) + { + zebra_sort_sysno(si, e->sysno); + last_sysno = e->sysno; + } + zebra_sort_type(si, e->ord); + if (e->cmd == 1) + zebra_sort_add(si, e->section_id, e->wrbuf); + else + zebra_sort_delete(si, e->section_id); + wrbuf_destroy(e->wrbuf); + } + } + nmem_destroy(nmem); } } /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab