From: Adam Dickmeiss Date: Wed, 10 May 2006 12:31:08 +0000 (+0000) Subject: Added code which maintains number of term occurrences and document X-Git-Tag: before.bug.529~123 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=54ddb0c5a3a8157235271665cd874576e656eed4 Added code which maintains number of term occurrences and document occurrences for an index. --- diff --git a/index/extract.c b/index/extract.c index 3f21862..66212ce 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,5 +1,5 @@ -/* $Id: extract.c,v 1.209 2006-05-10 08:13:21 adam Exp $ - Copyright (C) 1995-2005 +/* $Id: extract.c,v 1.210 2006-05-10 12:31:08 adam Exp $ + Copyright (C) 1995-2006 Index Data ApS This file is part of the Zebra server. @@ -532,17 +532,18 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, *sysno = rec->sysno; if (zh->records_processed < zh->m_file_verbose_limit) - if (matchStr) - yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T - " " ZINT_FORMAT " %s" , - zh->m_record_type, - fname, recordOffset, *sysno, matchStr); - else - yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T - " " ZINT_FORMAT , - zh->m_record_type, - fname, recordOffset, *sysno); - + { + if (matchStr) + yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T + " " ZINT_FORMAT " %s" , + zh->m_record_type, + fname, recordOffset, *sysno, matchStr); + else + yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T + " " ZINT_FORMAT , + zh->m_record_type, + fname, recordOffset, *sysno); + } recordAttr = rec_init_attr (zh->reg->zei, rec); recordAttr->staticrank = extractCtrl.staticrank; @@ -611,19 +612,18 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, else { if (zh->records_processed < zh->m_file_verbose_limit) - if (matchStr) - yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T - " " ZINT_FORMAT " %s" , - zh->m_record_type, - fname, recordOffset, *sysno, matchStr); - else - yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T - " " ZINT_FORMAT , - zh->m_record_type, - fname, recordOffset, *sysno); - - - + { + if (matchStr) + yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T + " " ZINT_FORMAT " %s" , + zh->m_record_type, + fname, recordOffset, *sysno, matchStr); + else + yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T + " " ZINT_FORMAT , + zh->m_record_type, + fname, recordOffset, *sysno); + } zh->records_deleted++; if (matchStr) { @@ -640,17 +640,18 @@ static ZEBRA_RES file_extract_record(ZebraHandle zh, { /* flush new keys for sort&search etc */ if (zh->records_processed < zh->m_file_verbose_limit) - if (matchStr) + { + if (matchStr) yaz_log(YLOG_LOG, "update %s %s " PRINTF_OFF_T " " ZINT_FORMAT " %s" , zh->m_record_type, fname, recordOffset, *sysno, matchStr); - else + else yaz_log(YLOG_LOG, "update %s %s " PRINTF_OFF_T " " ZINT_FORMAT , zh->m_record_type, fname, recordOffset, *sysno); - + } recordAttr->staticrank = extractCtrl.staticrank; #if NATTR extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); @@ -1350,6 +1351,58 @@ int explain_extract (void *handle, Record rec, data1_node *n) return 0; } +void extract_rec_keys_adjust(ZebraHandle zh, int is_insert, + zebra_rec_keys_t reckeys) +{ + ZebraExplainInfo zei = zh->reg->zei; + struct ord_stat { + int no; + int ord; + struct ord_stat *next; + }; + + if (zebra_rec_keys_rewind(reckeys)) + { + struct ord_stat *ord_list = 0; + struct ord_stat *p; + size_t slen; + const char *str; + struct it_key key_in; + while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) + { + int ord = key_in.mem[0]; + + for (p = ord_list; p ; p = p->next) + if (p->ord == ord) + { + p->no++; + break; + } + if (!p) + { + p = xmalloc(sizeof(*p)); + p->no = 1; + p->ord = ord; + p->next = ord_list; + ord_list = p; + } + } + + p = ord_list; + while (p) + { + struct ord_stat *p1 = p; + + if (is_insert) + zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1); + else + zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1); + p = p->next; + xfree(p1); + } + } +} + void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, int cmd, zebra_rec_keys_t reckeys, @@ -1357,6 +1410,8 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, { ZebraExplainInfo zei = zh->reg->zei; + extract_rec_keys_adjust(zh, cmd, reckeys); + if (!zh->reg->key_buf) { int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8")); @@ -1403,7 +1458,7 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, zh->reg->key_buf_used += key_SU_encode(ch, (char*)zh->reg->key_buf + zh->reg->key_buf_used); - + /* copy the 0-terminated stuff from str to output */ memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, str, slen); zh->reg->key_buf_used += slen; diff --git a/index/zinfo.c b/index/zinfo.c index 2235089..562ae67 100644 --- a/index/zinfo.c +++ b/index/zinfo.c @@ -1,5 +1,5 @@ -/* $Id: zinfo.c,v 1.59 2006-05-10 09:08:55 adam Exp $ - Copyright (C) 1995-2005 +/* $Id: zinfo.c,v 1.60 2006-05-10 12:31:08 adam Exp $ + Copyright (C) 1995-2006 Index Data ApS This file is part of the Zebra server. @@ -44,6 +44,8 @@ struct zebSUInfo { } su; } u; int ordinal; + zint doc_occurrences; + zint term_occurrences; }; struct zebSUInfoB { @@ -594,6 +596,8 @@ static void zebraExplain_readAttributeDetails(ZebraExplainInfo zei, data1_node *node_str = NULL; data1_node *node_ordinal = NULL; data1_node *node_type = NULL; + data1_node *node_doc_occurrences = NULL; + data1_node *node_term_occurrences = NULL; data1_node *np2; char oid_str[128]; int oid_str_len; @@ -615,6 +619,15 @@ static void zebraExplain_readAttributeDetails(ZebraExplainInfo zei, node_ordinal = np2->child; else if (!strcmp(np2->u.tag.tag, "type")) node_type = np2->child; + else if (!strcmp(np2->u.tag.tag, "dococcurrences")) + node_doc_occurrences = np2->child; + else if (!strcmp(np2->u.tag.tag, "termoccurrences")) + node_term_occurrences = np2->child; + else + { + yaz_log(YLOG_LOG, "Unknown tag '%s' in attributeDetails", + np2->u.tag.tag); + } } assert(node_ordinal); @@ -629,6 +642,18 @@ static void zebraExplain_readAttributeDetails(ZebraExplainInfo zei, (*zsuip)->info.index_type = 'w'; } + if (node_doc_occurrences) + { + data1_node *np = node_doc_occurrences; + (*zsuip)->info.doc_occurrences = atoi_zn(np->u.data.data, + np->u.data.len); + } + if (node_term_occurrences) + { + data1_node *np = node_term_occurrences; + (*zsuip)->info.term_occurrences = atoi_zn(np->u.data.data, + np->u.data.len); + } if (node_set && node_use) { (*zsuip)->info.which = ZEB_SU_SET_USE; @@ -1165,6 +1190,11 @@ static void zebraExplain_writeAttributeDetails (ZebraExplainInfo zei, } data1_mk_tag_data_int (zei->dh, node_attr, "ordinal", zsui->info.ordinal, zei->nmem); + + data1_mk_tag_data_zint (zei->dh, node_attr, "dococcurrences", + zsui->info.doc_occurrences, zei->nmem); + data1_mk_tag_data_zint (zei->dh, node_attr, "termoccurrences", + zsui->info.term_occurrences, zei->nmem); } /* convert to "SGML" and write it */ #if ZINFO_DEBUG @@ -1482,15 +1512,79 @@ int zebraExplain_trav_ord(ZebraExplainInfo zei, void *handle, } return 0; } - -int zebraExplain_lookup_ord (ZebraExplainInfo zei, int ord, - int *index_type, - const char **db, - int *set, int *use, - const char **string_index) + + +struct zebSUInfoB *zebraExplain_get_sui_info (ZebraExplainInfo zei, int ord, + int dirty_mark, + const char **db) { struct zebDatabaseInfoB *zdb; + for (zdb = zei->databaseInfo; zdb; zdb = zdb->next) + { + struct zebSUInfoB **zsui; + + if (zdb->attributeDetails->readFlag) + zebraExplain_readAttributeDetails (zei, zdb->attributeDetails); + + for (zsui = &zdb->attributeDetails->SUInfo; *zsui; + zsui = &(*zsui)->next) + if ((*zsui)->info.ordinal == ord) + { + struct zebSUInfoB *zsui_this = *zsui; + + /* take it out of the list and move to front */ + *zsui = (*zsui)->next; + zsui_this->next = zdb->attributeDetails->SUInfo; + zdb->attributeDetails->SUInfo = zsui_this; + + if (dirty_mark) + zdb->attributeDetails->dirty = 1; + if (db) + *db = zdb->databaseName; + return zsui_this; + } + } + return 0; +} + + + +int zebraExplain_ord_adjust_occurrences(ZebraExplainInfo zei, int ord, + int term_delta, int doc_delta) +{ + struct zebSUInfoB *zsui = zebraExplain_get_sui_info(zei, ord, 1, 0); + if (zsui) + { + zsui->info.term_occurrences += term_delta; + zsui->info.doc_occurrences += doc_delta; + return 0; + } + return -1; +} + +int zebraExplain_ord_get_occurrences(ZebraExplainInfo zei, int ord, + zint *term_occurrences, + zint *doc_occurrences) +{ + struct zebSUInfoB *zsui = zebraExplain_get_sui_info(zei, ord, 0, 0); + if (zsui) + { + *term_occurrences = zsui->info.term_occurrences; + *doc_occurrences = zsui->info.doc_occurrences; + return 0; + } + return -1; +} + +int zebraExplain_lookup_ord(ZebraExplainInfo zei, int ord, + int *index_type, + const char **db, + int *set, int *use, + const char **string_index) +{ + struct zebSUInfoB *zsui; + if (set) *set = -1; if (use) @@ -1500,38 +1594,30 @@ int zebraExplain_lookup_ord (ZebraExplainInfo zei, int ord, if (string_index) *string_index = 0; - for (zdb = zei->databaseInfo; zdb; zdb = zdb->next) + zsui = zebraExplain_get_sui_info(zei, ord, 0, db); + if (zsui) { - struct zebSUInfoB *zsui; - - if (zdb->attributeDetails->readFlag) - zebraExplain_readAttributeDetails (zei, zdb->attributeDetails); - - for (zsui = zdb->attributeDetails->SUInfo; zsui; zsui = zsui->next) - if (zsui->info.ordinal == ord) - { - if (db) - *db = zdb->databaseName; - if (zsui->info.which == ZEB_SU_SET_USE) - { - if (set) - *set = zsui->info.u.su.set; - if (use) - *use = zsui->info.u.su.use; - } - - if (zsui->info.which == ZEB_SU_STR) - if (string_index) - *string_index = zsui->info.u.str; - - if (index_type) - *index_type = zsui->info.index_type; - return 0; - } + if (zsui->info.which == ZEB_SU_SET_USE) + { + if (set) + *set = zsui->info.u.su.set; + if (use) + *use = zsui->info.u.su.use; + } + + if (zsui->info.which == ZEB_SU_STR) + if (string_index) + *string_index = zsui->info.u.str; + + if (index_type) + *index_type = zsui->info.index_type; + return 0; } return -1; } + + zebAccessObject zebraExplain_announceOid (ZebraExplainInfo zei, zebAccessObject *op, Odr_oid *oid) @@ -1587,6 +1673,8 @@ int zebraExplain_add_attr_su(ZebraExplainInfo zei, int index_type, zsui->info.u.su.set = set; zsui->info.u.su.use = use; zsui->info.ordinal = (zei->ordinalSU)++; + zsui->info.doc_occurrences = 0; + zsui->info.term_occurrences = 0; return zsui->info.ordinal; } diff --git a/index/zinfo.h b/index/zinfo.h index a0cfacc..73347ed 100644 --- a/index/zinfo.h +++ b/index/zinfo.h @@ -1,5 +1,5 @@ -/* $Id: zinfo.h,v 1.30 2006-05-10 08:13:23 adam Exp $ - Copyright (C) 1995-2005 +/* $Id: zinfo.h,v 1.31 2006-05-10 12:31:09 adam Exp $ + Copyright (C) 1995-2006 Index Data ApS This file is part of the Zebra server. @@ -71,6 +71,13 @@ int zebraExplain_lookup_ord (ZebraExplainInfo zei, int ord, int *index_type, const char **db, int *set, int *use, const char **string_index); +int zebraExplain_ord_adjust_occurrences(ZebraExplainInfo zei, int ord, + int term_delta, int doc_delta); + +int zebraExplain_ord_get_occurrences(ZebraExplainInfo zei, int ord, + zint *term_occurrences, + zint *doc_occurrences); + int zebraExplain_trav_ord(ZebraExplainInfo zei, void *handle, int (*f)(void *handle, int ord));