X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=index%2Fextract.c;h=e092a9b3f7f0e0bb6ad268b5b336eb622fda9551;hp=c70e6a1ee26b25b37476e21f23e0f53442ca11ba;hb=af102b1fb451ba27bfa7343528c4240b3ab3a80b;hpb=e199777080c6fa0963d51b7df1763fd5286ca9a4 diff --git a/index/extract.c b/index/extract.c index c70e6a1..e092a9b 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,8 +1,5 @@ -/* $Id: extract.c,v 1.267 2007-10-31 16:56:14 adam Exp $ - Copyright (C) 1995-2007 - Index Data ApS - -This file is part of the Zebra server. +/* This file is part of the Zebra server. + Copyright (C) 1995-2008 Index Data Zebra is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -71,7 +68,6 @@ static void extract_flush_sort_keys(ZebraHandle zh, zint sysno, int cmd, zebra_rec_keys_t skp); static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid); static void extract_token_add(RecWord *p); -static void extract_token_add2(RecWord *p); static void check_log_limit(ZebraHandle zh) { @@ -188,7 +184,7 @@ static void snippet_add_complete_field(RecWord *p, int ord, } if (!i) return; - if (last && start != last) + if (last && start != last && zebra_maps_is_index(zm)) zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, start, last - start); } @@ -223,7 +219,7 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) } if (!map) break; - if (start != last) + if (start != last && zebra_maps_is_index(zm)) { zebra_snippets_appendn(h->snippets, p->seqno, 1, ord, start, last - start); @@ -257,7 +253,7 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) p->seqno++; } } - if (start != last) + if (start != last && zebra_maps_is_index(zm)) zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, start, last - start); start = last; @@ -266,22 +262,48 @@ static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm) } +static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm) +{ + struct snip_rec_info *h = p->extractCtrl->handle; + + const char *res_buf = 0; + size_t res_len = 0; + + const char *display_buf = 0; + size_t display_len = 0; + + zebra_map_tokenize_start(zm, p->term_buf, p->term_len); + while (zebra_map_tokenize_next(zm, &res_buf, &res_len, + &display_buf, &display_len)) + { + if (zebra_maps_is_index(zm)) + zebra_snippets_appendn(h->snippets, p->seqno, 0, ord, + display_buf, display_len); + p->seqno++; + } +} + static void snippet_token_add(RecWord *p) { struct snip_rec_info *h = p->extractCtrl->handle; ZebraHandle zh = h->zh; zebra_map_t zm = zebra_map_get(zh->reg->zebra_maps, p->index_type); - if (zm && zebra_maps_is_index(zm)) + if (zm) { ZebraExplainInfo zei = zh->reg->zei; int ch = zebraExplain_lookup_attr_str( zei, zinfo_index_category_index, p->index_type, p->index_name); - if (zebra_maps_is_complete(zm)) - snippet_add_complete_field(p, ch, zm); + if (zebra_maps_is_icu(zm)) + snippet_add_icu(p, ch, zm); else - snippet_add_incomplete_field(p, ch, zm); + { + if (zebra_maps_is_complete(zm)) + snippet_add_complete_field(p, ch, zm); + else + snippet_add_incomplete_field(p, ch, zm); + } } } @@ -521,13 +543,29 @@ struct recordLogInfo { struct recordGroup *rGroup; }; -static void all_matches_add(struct recExtractCtrl *ctrl) +/** \brief add the always-matches index entry and map to real record ID + \param ctrl record control + \param record_id custom record ID + \param sysno system record ID + + This function serves two purposes.. It adds the always matches + entry and makes a pointer from the custom record ID (if defined) + back to the system record ID (sysno) + See zebra_recid_to_sysno . + */ +static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id, + zint sysno) { RecWord word; extract_init(ctrl, &word); + word.record_id = record_id; + /* we use the seqno as placeholder for a way to get back to + record database from _ALLRECORDS.. This is used if a custom + RECORD was defined */ + word.seqno = sysno; word.index_name = "_ALLRECORDS"; word.index_type = "w"; - word.seqno = 1; + extract_add_index_string(&word, zinfo_index_category_alwaysmatches, "", 0); } @@ -545,7 +583,7 @@ ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh, ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, - int deleteFlag) + enum zebra_recctrl_action_t action) { ZEBRA_RES r = ZEBRA_OK; int i, fd; @@ -615,7 +653,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, default: yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type); } - if (sysno && deleteFlag) + if (sysno && (action == action_delete || action == action_a_delete)) { streamp = 0; fi = 0; @@ -643,8 +681,7 @@ ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname, zebra_create_stream_fd(streamp, fd, 0); } r = zebra_extract_records_stream(zh, streamp, - deleteFlag ? - action_delete : action_update, + action, 0, /* tst_mode */ zh->m_record_type, sysno, @@ -821,14 +858,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, stream->endf(stream, &null_offset);; extractCtrl.init = extract_init; - if (zh->reg->index_types) - { - extractCtrl.tokenAdd = extract_token_add2; - } - else - { - extractCtrl.tokenAdd = extract_token_add; - } + extractCtrl.tokenAdd = extract_token_add; extractCtrl.schemaAdd = extract_schema_add; extractCtrl.dh = zh->reg->dh; extractCtrl.handle = zh; @@ -882,8 +912,6 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, else end_offset = stream->tellf(stream); - all_matches_add(&extractCtrl); - if (extractCtrl.match_criteria[0]) match_criteria = extractCtrl.match_criteria; } @@ -927,6 +955,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, } } } + if (zebra_rec_keys_empty(zh->reg->keys)) { /* the extraction process returned no information - the record @@ -937,14 +966,21 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, if (! *sysno) { - /* new record */ + /* new record AKA does not exist already */ if (action == action_delete) { - yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, - pr_fname, (zint) start_offset); + yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); yaz_log(YLOG_WARN, "cannot delete record above (seems new)"); return ZEBRA_FAIL; } + else if (action == action_a_delete) + { + if (show_progress) + yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType, + pr_fname, (zint) start_offset); + return ZEBRA_OK; + } else if (action == action_replace) { yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType, @@ -959,6 +995,15 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, *sysno = rec->sysno; + + if (stream) + { + all_matches_add(&extractCtrl, + zebra_rec_keys_get_custom_record_id(zh->reg->keys), + *sysno); + } + + recordAttr = rec_init_attr(zh->reg->zei, rec); if (extractCtrl.staticrank < 0) { @@ -1000,6 +1045,13 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, rec = rec_get(zh->reg->records, *sysno); assert(rec); + + if (stream) + { + all_matches_add(&extractCtrl, + zebra_rec_keys_get_custom_record_id(zh->reg->keys), + *sysno); + } recordAttr = rec_init_attr(zh->reg->zei, rec); @@ -1021,7 +1073,7 @@ ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh, extract_flush_record_keys(zh, *sysno, 0, delkeys, recordAttr->staticrank); #endif - if (action == action_delete) + if (action == action_delete || action == action_a_delete) { /* record going to be deleted */ #if FLUSH2 @@ -1433,7 +1485,7 @@ void extract_flush_record_keys2(ZebraHandle zh, zint sysno, } -ZEBRA_RES zebra_rec_keys_to_snippets(ZebraHandle zh, +ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh, zebra_rec_keys_t reckeys, zebra_snippets *snippets) { @@ -1560,7 +1612,15 @@ static void extract_add_string(RecWord *p, zebra_map_t zm, if (!p->index_name) return; + if (log_level_details) + { + WRBUF w = wrbuf_alloc(); + + wrbuf_write_escaped(w, string, length); + yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w)); + wrbuf_destroy(w); + } if (zebra_maps_is_index(zm)) { extract_add_index_string(p, zinfo_index_category_index, @@ -1700,62 +1760,26 @@ static void extract_add_complete_field(RecWord *p, zebra_map_t zm) extract_add_string(p, zm, buf, i); } -static void extract_token_add2_index(ZebraHandle zh, zebra_index_type_t type, - RecWord *p) +static void extract_add_icu(RecWord *p, zebra_map_t zm) { - struct it_key key; const char *res_buf = 0; size_t res_len = 0; - int r = zebra_index_type_tokenize(type, p->term_buf, p->term_len, - &res_buf, &res_len); - int cat = zinfo_index_category_index; - int ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, p->index_type, p->index_name); - if (ch < 0) - ch = zebraExplain_add_attr_str(zh->reg->zei, cat, p->index_type, p->index_name); - while (r) - { - int i = 0; - key.mem[i++] = ch; - key.mem[i++] = p->record_id; - key.mem[i++] = p->section_id; - - if (zh->m_segment_indexing) - key.mem[i++] = p->segment; - key.mem[i++] = p->seqno; - key.len = i; - yaz_log(YLOG_LOG, "keys_write %.*s", (int) res_len, res_buf); - zebra_rec_keys_write(zh->reg->keys, res_buf, res_len, &key); - + zebra_map_tokenize_start(zm, p->term_buf, p->term_len); + while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0)) + { + extract_add_string(p, zm, res_buf, res_len); p->seqno++; - r = zebra_index_type_tokenize(type, 0, 0, &res_buf, &res_len); } } -static void extract_token_add2(RecWord *p) -{ - ZebraHandle zh = p->extractCtrl->handle; - zebra_index_type_t type = zebra_index_type_get(zh->reg->index_types, p->index_type); - if (type) - { - if (zebra_index_type_is_index(type)) - { - extract_token_add2_index(zh, type, p); - } - else if (zebra_index_type_is_sort(type)) - { - ; - - } - } -} /** \brief top-level indexing handler for recctrl system \param p token data to be indexed Call sequence: - extract_token - zebra_add_{in}_complete + extract_token_add + extract_add_{in}_complete / extract_add_icu extract_add_string extract_add_index_string @@ -1780,13 +1804,20 @@ static void extract_token_add(RecWord *p) } if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len))) { - p->term_buf = wrbuf_buf(wrbuf); - p->term_len = wrbuf_len(wrbuf); + p->term_buf = wrbuf_buf(wrbuf); + p->term_len = wrbuf_len(wrbuf); + } + if (zebra_maps_is_icu(zm)) + { + extract_add_icu(p, zm); } - if (zebra_maps_is_complete(zm)) - extract_add_complete_field(p, zm); else - extract_add_incomplete_field(p, zm); + { + if (zebra_maps_is_complete(zm)) + extract_add_complete_field(p, zm); + else + extract_add_incomplete_field(p, zm); + } } static void extract_set_store_data_cb(struct recExtractCtrl *p, @@ -1836,18 +1867,49 @@ void extract_flush_sort_keys(ZebraHandle zh, zint sysno, const char *str; struct it_key key_in; + NMEM nmem = nmem_create(); + struct sort_add_ent { + int ord; + int cmd; + struct sort_add_ent *next; + WRBUF wrbuf; + }; + struct sort_add_ent *sort_ent_list = 0; zebra_sort_sysno(si, sysno); while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) { int ord = CAST_ZINT_TO_INT(key_in.mem[0]); + + struct sort_add_ent **e = &sort_ent_list; + while (*e && (*e)->ord != ord) + e = &(*e)->next; + if (!*e) + { + *e = nmem_malloc(nmem, sizeof(**e)); + (*e)->next = 0; + (*e)->wrbuf = wrbuf_alloc(); + (*e)->ord = ord; + (*e)->cmd = cmd; + } - zebra_sort_type(si, ord); - if (cmd == 1) - zebra_sort_add(si, str, slen); - else - zebra_sort_delete(si); + wrbuf_write((*e)->wrbuf, str, slen); + wrbuf_putc((*e)->wrbuf, '\0'); } + if (sort_ent_list) + { + struct sort_add_ent *e = sort_ent_list; + for (; e; e = e->next) + { + zebra_sort_type(si, e->ord); + if (e->cmd == 1) + zebra_sort_add(si, e->wrbuf); + else + zebra_sort_delete(si); + wrbuf_destroy(e->wrbuf); + } + } + nmem_destroy(nmem); } }