X-Git-Url: http://git.indexdata.com/cgi-bin?a=blobdiff_plain;f=index%2Fextract.c;h=e4973ab7fcd1fced79401de6cc9abb4c9cf4a29f;hb=deb0cef3d4d19dc6508b2fed71711b3fb1be26a2;hp=8358ade76267aafb7d93100dae38f678c74f3174;hpb=b88909df16157ed1e7859bc3fad6b01520d4865e;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index 8358ade..e4973ab 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.220 2006-06-13 12:02:06 adam Exp $ +/* $Id: extract.c,v 1.227 2006-08-16 13:16:36 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -15,9 +15,9 @@ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Zebra; see the file LICENSE.zebra. If not, write to the -Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ #include @@ -36,18 +36,6 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include -#ifdef WIN32 -#define PRINTF_OFF_T "%I64d" -#else -/* !WIN32 */ -#if SIZEOF_OFF_T == SIZEOF_LONG_LONG -#define PRINTF_OFF_T "%lld" -#else -#define PRINTF_OFF_T "%ld" -#endif - -#endif - #define ENCODE_BUFLEN 768 struct encode_info { void *encode_handle; @@ -118,19 +106,21 @@ static void logRecord (ZebraHandle zh) } } -static void extract_add_index_string (RecWord *p, const char *str, int length); +static void extract_add_index_string (RecWord *p, + zinfo_index_category_t cat, + const char *str, int length); static void extract_set_store_data_prepare(struct recExtractCtrl *p); -static void extract_init (struct recExtractCtrl *p, RecWord *w) +static void extract_init(struct recExtractCtrl *p, RecWord *w) { - w->zebra_maps = p->zebra_maps; w->seqno = 1; w->index_name = "any"; w->index_type = 'w'; w->extractCtrl = p; w->record_id = 0; w->section_id = 0; + w->segment = 0; } static void searchRecordKey(ZebraHandle zh, @@ -140,16 +130,17 @@ static void searchRecordKey(ZebraHandle zh, { int i; int ch = -1; + zinfo_index_category_t cat = zinfo_index_category_index; for (i = 0; ireg->zei, '0', index_name); + ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, '0', index_name); if (ch < 0) - ch = zebraExplain_lookup_attr_str(zh->reg->zei, 'p', index_name); + ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'p', index_name); if (ch < 0) - ch = zebraExplain_lookup_attr_str(zh->reg->zei, 'w', index_name); + ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, 'w', index_name); if (ch < 0) return ; @@ -163,7 +154,7 @@ static void searchRecordKey(ZebraHandle zh, zint seqno; while (zebra_rec_keys_read(reckeys, &str, &slen, &key)) { - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; @@ -404,7 +395,6 @@ static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl) else ctrl->seqno[i] = 0; } - ctrl->zebra_maps = zh->reg->zebra_maps; ctrl->flagShowRecords = !zh->m_flag_rw; } @@ -412,10 +402,11 @@ static void all_matches_add(struct recExtractCtrl *ctrl) { RecWord word; extract_init(ctrl, &word); - word.index_name = "allrecords"; + word.index_name = "_ALLRECORDS"; word.index_type = 'w'; word.seqno = 1; - extract_add_index_string (&word, "", 0); + extract_add_index_string (&word, zinfo_index_category_alwaysmatches, + "", 0); } static ZEBRA_RES file_extract_record(ZebraHandle zh, @@ -471,11 +462,11 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, init_extractCtrl(zh, &extractCtrl); if (!zh->m_flag_rw) - printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset); + printf ("File: %s " ZINT_FORMAT "\n", fname, (zint)recordOffset); if (zh->m_flag_rw) { char msg[512]; - sprintf (msg, "%s:" PRINTF_OFF_T , fname, recordOffset); + sprintf (msg, "%s:" ZINT_FORMAT , fname, (zint)recordOffset); yaz_log_init_prefix2 (msg); } @@ -490,8 +481,9 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, if (zh->m_flag_rw && zh->records_processed < zh->m_file_verbose_limit) { - yaz_log (YLOG_WARN, "fail %s %s " PRINTF_OFF_T, zh->m_record_type, - fname, recordOffset); + yaz_log (YLOG_WARN, "fail %s %s " ZINT_FORMAT, + zh->m_record_type, + fname, (zint) recordOffset); } return ZEBRA_FAIL; } @@ -502,8 +494,8 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, zh->records_processed < zh->m_file_verbose_limit) { yaz_log (YLOG_WARN, "no filter for %s %s " - PRINTF_OFF_T, zh->m_record_type, - fname, recordOffset); + ZINT_FORMAT, zh->m_record_type, + fname, (zint) recordOffset); } return ZEBRA_FAIL; } @@ -559,8 +551,8 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, return ZEBRA_OK; if (zh->records_processed < zh->m_file_verbose_limit) - yaz_log (YLOG_WARN, "empty %s %s " PRINTF_OFF_T, zh->m_record_type, - fname, recordOffset); + yaz_log(YLOG_WARN, "empty %s %s " ZINT_FORMAT, zh->m_record_type, + fname, (zint)recordOffset); return ZEBRA_OK; } @@ -569,9 +561,9 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, /* new record */ if (deleteFlag) { - yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T, zh->m_record_type, - fname, recordOffset); - yaz_log (YLOG_WARN, "cannot delete record above (seems new)"); + yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, zh->m_record_type, + fname, (zint)recordOffset); + yaz_log(YLOG_WARN, "cannot delete record above (seems new)"); return ZEBRA_OK; } @@ -581,10 +573,10 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, if (zh->records_processed < zh->m_file_verbose_limit) { - yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T + yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT " " ZINT_FORMAT " %s" , zh->m_record_type, - fname, recordOffset, *sysno, match_str_to_print); + fname, (zint) recordOffset, *sysno, match_str_to_print); } recordAttr = rec_init_attr (zh->reg->zei, rec); recordAttr->staticrank = extractCtrl.staticrank; @@ -631,19 +623,19 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, /* record going to be deleted */ if (zebra_rec_keys_empty(delkeys)) { - yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T - " " ZINT_FORMAT, - zh->m_record_type, fname, recordOffset, *sysno); - yaz_log (YLOG_WARN, "cannot delete file above, storeKeys false (1)"); + yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT + " " ZINT_FORMAT, + zh->m_record_type, fname, (zint)recordOffset, *sysno); + yaz_log(YLOG_WARN, "cannot delete file above, storeKeys false (1)"); } else { if (zh->records_processed < zh->m_file_verbose_limit) { - yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T + yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT " " ZINT_FORMAT " %s" , - zh->m_record_type, - fname, recordOffset, *sysno, match_str_to_print); + zh->m_record_type, fname, (zint) recordOffset, + *sysno, match_str_to_print); } zh->records_deleted++; if (matchStr) @@ -662,10 +654,10 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, /* flush new keys for sort&search etc */ if (zh->records_processed < zh->m_file_verbose_limit) { - yaz_log(YLOG_LOG, "update %s %s " PRINTF_OFF_T + yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT " " ZINT_FORMAT " %s" , - zh->m_record_type, - fname, recordOffset, *sysno, match_str_to_print); + zh->m_record_type, fname, (zint) recordOffset, + *sysno, match_str_to_print); } recordAttr->staticrank = extractCtrl.staticrank; extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); @@ -735,8 +727,8 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, xmalloc (recordAttr->recordSize); if (lseek (fi->fd, recordOffset, SEEK_SET) < 0) { - yaz_log (YLOG_ERRNO|YLOG_FATAL, "seek to " PRINTF_OFF_T " in %s", - recordOffset, fname); + yaz_log(YLOG_ERRNO|YLOG_FATAL, "seek to " ZINT_FORMAT " in %s", + (zint)recordOffset, fname); exit (1); } if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize) @@ -1383,10 +1375,11 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) { int ch = 0; + int i, j = 0; struct it_key key_out; - zint *keyp = key_out.mem; - assert(key_in.len == 4); + assert(key_in.len >= 2); + assert(key_in.len <= IT_KEY_LEVEL_MAX); /* check for buffer overflow */ if (zh->reg->key_buf_used + 1024 > @@ -1398,6 +1391,9 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = (char*)zh->reg->key_buf + zh->reg->key_buf_used; + /* key_in.mem[0] ord/ch */ + /* key_in.mem[1] filter specified record ID */ + /* encode the ordinal value (field/use/attribute) .. */ ch = CAST_ZINT_TO_INT(key_in.mem[0]); zh->reg->key_buf_used += @@ -1420,19 +1416,17 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, (long) staticrank); staticrank = 0; } - *keyp++ = staticrank; - key_out.len = 4; + key_out.mem[j++] = staticrank; } - else - key_out.len = 3; if (key_in.mem[1]) /* filter specified record ID */ - *keyp++ = key_in.mem[1]; + key_out.mem[j++] = key_in.mem[1]; else - *keyp++ = sysno; - *keyp++ = key_in.mem[2]; /* section_id */ - *keyp++ = key_in.mem[3]; /* sequence .. */ - + key_out.mem[j++] = sysno; + for (i = 2; i < key_in.len; i++) + key_out.mem[j++] = key_in.mem[i]; + key_out.len = j; + memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key_out, sizeof(key_out)); (zh->reg->key_buf_used) += sizeof(key_out); @@ -1458,11 +1452,6 @@ void extract_flushWriteKeys (ZebraHandle zh, int final) { yaz_log(log_level, " nothing to flush section=%d buf=%p i=%d", zh->reg->key_file_no, zh->reg->key_buf, ptr_i); - yaz_log(log_level, " buf=%p ", - zh->reg->key_buf); - yaz_log(log_level, " ptr=%d ",zh->reg->ptr_i); - yaz_log(log_level, " reg=%p ",zh->reg); - return; } @@ -1592,7 +1581,7 @@ ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, zint seqno; int index_type; - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); seqno = key.mem[key.len-1]; ord = CAST_ZINT_TO_INT(key.mem[0]); @@ -1624,7 +1613,7 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys) int index_type; int ord = CAST_ZINT_TO_INT(key.mem[0]); const char *db = 0; - assert(key.len <= 4 && key.len > 2); + assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2); zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0); @@ -1638,43 +1627,27 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys) } } -static void extract_add_index_string(RecWord *p, const char *str, int length) +static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat, + const char *str, int length) { struct it_key key; - ZebraHandle zh = p->extractCtrl->handle; ZebraExplainInfo zei = zh->reg->zei; - int ch; - - if (!p->index_name) - return; + int ch, i; - ch = zebraExplain_lookup_attr_str(zei, p->index_type, p->index_name); + ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name); if (ch < 0) - ch = zebraExplain_add_attr_str(zei, p->index_type, p->index_name); + ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name); - key.len = 4; - key.mem[0] = ch; - key.mem[1] = p->record_id; - key.mem[2] = p->section_id; - key.mem[3] = p->seqno; + i = 0; + key.mem[i++] = ch; + key.mem[i++] = p->record_id; + key.mem[i++] = p->section_id; -#if 0 - if (1) - { - char strz[80]; - int i; - - strz[0] = 0; - for (i = 0; iattrSet, p->attrUse, p->record_id, p->section_id, p->seqno, - strz); - } -#endif + if (zh->m_segment_indexing) + key.mem[i++] = p->segment; + key.mem[i++] = p->seqno; + key.len = i; zebra_rec_keys_write(zh->reg->keys, str, length, &key); } @@ -1682,59 +1655,56 @@ static void extract_add_index_string(RecWord *p, const char *str, int length) static void extract_add_sort_string(RecWord *p, const char *str, int length) { struct it_key key; - ZebraHandle zh = p->extractCtrl->handle; ZebraExplainInfo zei = zh->reg->zei; int ch; + zinfo_index_category_t cat = zinfo_index_category_sort; - if (!p->index_name) - return; - - ch = zebraExplain_lookup_attr_str(zei, p->index_type, p->index_name); + ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name); if (ch < 0) - ch = zebraExplain_add_attr_str(zei, p->index_type, p->index_name); - key.len = 4; + ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name); + key.len = 2; key.mem[0] = ch; key.mem[1] = p->record_id; - key.mem[2] = p->section_id; - key.mem[3] = p->seqno; -#if 0 - if (1) - { - char strz[80]; - int i; - - strz[0] = 0; - for (i = 0; iattrSet, p->attrUse, p->record_id, p->section_id, p->seqno, - strz); - } -#endif zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key); } -static void extract_add_string (RecWord *p, const char *string, int length) +static void extract_add_string(RecWord *p, const char *string, int length) { + ZebraHandle zh = p->extractCtrl->handle; assert (length > 0); - if (zebra_maps_is_sort (p->zebra_maps, p->index_type)) - extract_add_sort_string (p, string, length); + + if (!p->index_name) + return; + + if (zebra_maps_is_sort(zh->reg->zebra_maps, p->index_type)) + extract_add_sort_string(p, string, length); else - extract_add_index_string (p, string, length); + { + extract_add_index_string(p, zinfo_index_category_index, + string, length); + if (zebra_maps_is_alwaysmatches(zh->reg->zebra_maps, p->index_type)) + { + RecWord word; + memcpy(&word, p, sizeof(word)); + + word.seqno = 1; + extract_add_index_string( + &word, zinfo_index_category_alwaysmatches, "", 0); + } + } } static void extract_add_incomplete_field (RecWord *p) { + ZebraHandle zh = p->extractCtrl->handle; const char *b = p->term_buf; int remain = p->term_len; const char **map = 0; if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->index_type, &b, remain, 0); + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); while (map) { @@ -1746,7 +1716,7 @@ static void extract_add_incomplete_field (RecWord *p) { remain = p->term_len - (b - p->term_buf); if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->index_type, &b, + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); else map = 0; @@ -1762,7 +1732,7 @@ static void extract_add_incomplete_field (RecWord *p) buf[i++] = *(cp++); remain = p->term_len - (b - p->term_buf); if (remain > 0) - map = zebra_maps_input(p->zebra_maps, p->index_type, &b, remain, 0); + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); else map = 0; } @@ -1775,13 +1745,14 @@ static void extract_add_incomplete_field (RecWord *p) static void extract_add_complete_field (RecWord *p) { + ZebraHandle zh = p->extractCtrl->handle; const char *b = p->term_buf; char buf[IT_MAX_WORD+1]; const char **map = 0; int i = 0, remain = p->term_len; if (remain > 0) - map = zebra_maps_input (p->zebra_maps, p->index_type, &b, remain, 1); + map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b, remain, 1); while (remain > 0 && i < IT_MAX_WORD) { @@ -1792,7 +1763,7 @@ static void extract_add_complete_field (RecWord *p) if (remain > 0) { int first = i ? 0 : 1; /* first position */ - map = zebra_maps_input(p->zebra_maps, p->index_type, &b, remain, first); + map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, first); } else map = 0; @@ -1820,7 +1791,7 @@ static void extract_add_complete_field (RecWord *p) remain = p->term_len - (b - p->term_buf); if (remain > 0) { - map = zebra_maps_input (p->zebra_maps, p->index_type, &b, + map = zebra_maps_input (zh->reg->zebra_maps, p->index_type, &b, remain, 0); } else @@ -1834,19 +1805,20 @@ static void extract_add_complete_field (RecWord *p) static void extract_token_add(RecWord *p) { + ZebraHandle zh = p->extractCtrl->handle; WRBUF wrbuf; if (log_level) yaz_log(log_level, "extract_token_add " "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s", p->index_type, p->index_name, p->seqno, p->term_len, p->term_buf); - if ((wrbuf = zebra_replace(p->zebra_maps, p->index_type, 0, + if ((wrbuf = zebra_replace(zh->reg->zebra_maps, p->index_type, 0, p->term_buf, p->term_len))) { p->term_buf = wrbuf_buf(wrbuf); p->term_len = wrbuf_len(wrbuf); } - if (zebra_maps_is_complete (p->zebra_maps, p->index_type)) + if (zebra_maps_is_complete (zh->reg->zebra_maps, p->index_type)) extract_add_complete_field (p); else extract_add_incomplete_field(p);