From 4b25ee9a2fbd9a641095362fc424c1f7b7c0ca2c Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 11 Sep 2008 15:27:17 +0200 Subject: [PATCH] Facets using sort register functional --- index/retrieve.c | 364 +++++++++++++++++++++++++++++++----------------- test/api/test_safari.c | 12 +- test/api/test_search.c | 29 ++-- 3 files changed, 260 insertions(+), 145 deletions(-) diff --git a/index/retrieve.c b/index/retrieve.c index 107639e..39bbca1 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -717,6 +717,212 @@ static struct term_collect *term_collect_create(zebra_strmap_t sm, return col; } +static int perform_facet_sort(ZebraHandle zh, int no_ord, int *ord_array, + zebra_strmap_t *map_array, + int num_recs, ZebraMetaRecord *poset) +{ + int rec_i; + WRBUF w = wrbuf_alloc(); + int ord_i; + + for (ord_i = 0; ord_i < no_ord; ord_i++) + { + for (rec_i = 0; rec_i < num_recs; rec_i++) + { + if (!poset[rec_i].sysno) + continue; + + zebra_sort_sysno(zh->reg->sort_index, poset[rec_i].sysno); + zebra_sort_type(zh->reg->sort_index, ord_array[ord_i]); + + wrbuf_rewind(w); + if (zebra_sort_read(zh->reg->sort_index, w)) + { + zebra_strmap_t sm = map_array[ord_i]; + int off = 0; + while (off != wrbuf_len(w)) + { + const char *str = wrbuf_buf(w) + off; + int *freq = zebra_strmap_lookup(sm, str, 0, 0); + if (freq) + (*freq)++; + else + { + int v = 1; + zebra_strmap_add(sm, str, &v, sizeof v); + } + off += strlen(str)+1; + } + } + } + } + wrbuf_destroy(w); + return 0; +} + + +static int perform_facet_index(ZebraHandle zh, + struct special_fetch_s *fi, + int no_ord, int *ord_array, + zebra_strmap_t *map_array, + int num_recs, ZebraMetaRecord *poset, + struct index_spec *spec_list) +{ + int max_chunks = 2; + int rec_i; + res_get_int(zh->res, "facetMaxChunks", &max_chunks); + + for (rec_i = 0; rec_i < num_recs; rec_i++) + { + int ret; + int j; + zint sysnos[MAX_SYSNOS_PER_RECORD]; + int no_sysnos = MAX_SYSNOS_PER_RECORD; + if (!poset[rec_i].sysno) + continue; + ret = zebra_result_recid_to_sysno(zh, fi->setname, + poset[rec_i].sysno, + sysnos, &no_sysnos); + assert(no_sysnos > 0); + yaz_log(YLOG_DEBUG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d", + rec_i, poset[rec_i].sysno, no_sysnos); + for (j = 0; j < no_sysnos && j < max_chunks; j++) + { + size_t slen; + const char *str; + struct it_key key_in; + Record rec = rec_get(zh->reg->records, sysnos[j]); + zebra_rec_keys_t keys = zebra_rec_keys_open(); + zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys], + rec->size[recInfo_delKeys], 0); + + yaz_log(YLOG_DEBUG, "rec %d " ZINT_FORMAT " %s", + j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty"); + if (zebra_rec_keys_rewind(keys)) + { + while (zebra_rec_keys_read(keys, &str, &slen, &key_in)) + { + int ord_i; + struct index_spec *spec; + for (spec = spec_list, ord_i = 0; ord_i < no_ord; + ord_i++, spec = spec->next) + { + int ord = CAST_ZINT_TO_INT(key_in.mem[0]); + if (ord == ord_array[ord_i] && + str[0] != FIRST_IN_FIELD_CHAR) + { + int *freq; + zebra_strmap_t sm = map_array[ord_i]; + + freq = zebra_strmap_lookup(sm, str, 0, 0); + if (freq) + (*freq)++; + else + { + int v = 1; + zebra_strmap_add(sm, str, &v, sizeof v); + } + } + } + } + } + zebra_rec_keys_close(keys); + rec_free(&rec); + } + } + return 0; +} + +static int perform_facet(ZebraHandle zh, + struct special_fetch_s *fi, + WRBUF result, + int num_recs, ZebraMetaRecord *poset, + struct index_spec *spec_list, + int no_ord, int *ord_array, + int use_xml, + zinfo_index_category_t cat) +{ + int i; + int ret = 0; + WRBUF wr = result; + struct index_spec *spec; + yaz_timing_t timing = yaz_timing_create(); + zebra_strmap_t *map_array + = nmem_malloc(fi->nmem, sizeof *map_array * no_ord); + for (i = 0; i < no_ord; i++) + map_array[i] = zebra_strmap_create(); + + if (cat == zinfo_index_category_sort) + perform_facet_sort(zh, no_ord, ord_array, map_array, + num_recs, poset); + else + perform_facet_index(zh, fi, no_ord, ord_array, map_array, + num_recs, poset, spec_list); + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet first phase real=%4.2f cat=%s", + yaz_timing_get_real(timing), + (cat == zinfo_index_category_sort) ? "sort" : "index"); + yaz_timing_start(timing); + for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next) + { + int j; + NMEM nmem = nmem_create(); + struct term_collect *col; + int no_collect_terms = 20; + + if (spec->extra) + no_collect_terms = atoi(spec->extra); + if (no_collect_terms < 1) + no_collect_terms = 1; + col = term_collect_create(map_array[i], no_collect_terms, nmem); + term_collect_freq(zh, col, no_collect_terms, ord_array[i], + resultSetRef(zh, fi->setname)); + + if (use_xml) + wrbuf_printf(wr, " \n", + spec->index_type, spec->index_name); + else + wrbuf_printf(wr, "facet %s %s\n", + spec->index_type, spec->index_name); + for (j = 0; j < no_collect_terms; j++) + { + if (col[j].term) + { + char dst_buf[IT_MAX_WORD]; + zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term); + if (use_xml) + { + wrbuf_printf(wr, " "); + wrbuf_xmlputs(wr, dst_buf); + wrbuf_printf(wr, "\n"); + } + else + { + wrbuf_printf(wr, "term %d", col[j].oc); + if (col[j].set_occur) + wrbuf_printf(wr, " " ZINT_FORMAT, + col[j].set_occur); + wrbuf_printf(wr, ": %s\n", dst_buf); + } + } + } + if (use_xml) + wrbuf_puts(wr, " \n"); + nmem_destroy(nmem); + } + for (i = 0; i < no_ord; i++) + zebra_strmap_destroy(map_array[i]); + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet second phase real=%4.2f", + yaz_timing_get_real(timing)); + yaz_timing_destroy(&timing); + return ret; +} + static int facet_fetch( struct special_fetch_s *fi, const char *elemsetname, const Odr_oid *input_format, @@ -726,19 +932,18 @@ static int facet_fetch( zint *pos_array; int i; int num_recs = 10; /* number of records to analyze */ - int max_chunks = 2; ZebraMetaRecord *poset; ZEBRA_RES ret = ZEBRA_OK; int *ord_array; - WRBUF wr = result; int use_xml = 0; int no_ord = 0; struct index_spec *spec, *spec_list; int error; ZebraHandle zh = fi->zh; + /* whether sort or index based */ + zinfo_index_category_t cat = zinfo_index_category_sort; res_get_int(zh->res, "facetNumRecs", &num_recs); - res_get_int(zh->res, "facetMaxChunks", &max_chunks); /* see if XML is required for response */ if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0) @@ -758,20 +963,38 @@ static int facet_fetch( no_ord++; } + /* try to see if all specs are sort based.. If not, try the + index based ones */ ord_array = nmem_malloc(fi->nmem, sizeof(*ord_array) * no_ord); for (spec = spec_list, i = 0; spec; spec = spec->next, i++) { int ord = zebraExplain_lookup_attr_str(zh->reg->zei, - zinfo_index_category_index, + zinfo_index_category_sort, spec->index_type, spec->index_name); if (ord == -1) + break; + ord_array[i] = ord; + } + if (spec) + { + cat = zinfo_index_category_index; + for (spec = spec_list, i = 0; spec; spec = spec->next, i++) { - return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + int ord = zebraExplain_lookup_attr_str(zh->reg->zei, + zinfo_index_category_index, + spec->index_type, + spec->index_name); + if (ord == -1) + break; + ord_array[i] = ord; + } - ord_array[i] = ord; } + if (spec) + return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + pos_array = (zint *) nmem_malloc(fi->nmem, num_recs * sizeof(*pos_array)); for (i = 0; i < num_recs; i++) pos_array[i] = i+1; @@ -783,134 +1006,15 @@ static int facet_fetch( } else { - yaz_timing_t timing = yaz_timing_create(); - zebra_strmap_t *map_array - = nmem_malloc(fi->nmem, sizeof *map_array * no_ord); - for (i = 0; i < no_ord; i++) - map_array[i] = zebra_strmap_create(); - - for (i = 0; i < num_recs; i++) - { - int j; - zint sysnos[MAX_SYSNOS_PER_RECORD]; - int no_sysnos = MAX_SYSNOS_PER_RECORD; - if (!poset[i].sysno) - continue; - ret = zebra_result_recid_to_sysno(zh, fi->setname, - poset[i].sysno, - sysnos, &no_sysnos); - assert(no_sysnos > 0); - yaz_log(YLOG_DEBUG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d", - i, poset[i].sysno, no_sysnos); - for (j = 0; j < no_sysnos && j < max_chunks; j++) - { - size_t slen; - const char *str; - struct it_key key_in; - Record rec = rec_get(zh->reg->records, sysnos[j]); - zebra_rec_keys_t keys = zebra_rec_keys_open(); - zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys], - rec->size[recInfo_delKeys], 0); - - yaz_log(YLOG_DEBUG, "rec %d " ZINT_FORMAT " %s", - j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty"); - if (zebra_rec_keys_rewind(keys)) - { - while (zebra_rec_keys_read(keys, &str, &slen, &key_in)) - { - int i; - struct index_spec *spec; - for (spec = spec_list, i = 0; i < no_ord; - i++, spec = spec->next) - { - int ord = CAST_ZINT_TO_INT(key_in.mem[0]); - if (ord == ord_array[i] && - str[0] != FIRST_IN_FIELD_CHAR) - { - int *freq; - zebra_strmap_t sm = map_array[i]; - - freq = zebra_strmap_lookup(sm, str, 0, 0); - if (freq) - (*freq)++; - else - { - int v = 1; - zebra_strmap_add(sm, str, &v, sizeof v); - } - } - } - } - } - zebra_rec_keys_close(keys); - rec_free(&rec); - } - } - yaz_timing_stop(timing); - yaz_log(YLOG_LOG, "facet first phase real=%4.2f", - yaz_timing_get_real(timing)); - yaz_timing_start(timing); if (use_xml) - wrbuf_puts(wr, "\n"); - for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next) { - int j; - NMEM nmem = nmem_create(); - struct term_collect *col; - int no_collect_terms = 20; - - if (spec->extra) - no_collect_terms = atoi(spec->extra); - if (no_collect_terms < 1) - no_collect_terms = 1; - col = term_collect_create(map_array[i], no_collect_terms, nmem); - term_collect_freq(zh, col, no_collect_terms, ord_array[i], - resultSetRef(zh, fi->setname)); - - if (use_xml) - wrbuf_printf(wr, " \n", - spec->index_type, spec->index_name); - else - wrbuf_printf(wr, "facet %s %s\n", - spec->index_type, spec->index_name); - for (j = 0; j < no_collect_terms; j++) - { - if (col[j].term) - { - char dst_buf[IT_MAX_WORD]; - zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term); - if (use_xml) - { - wrbuf_printf(wr, " "); - wrbuf_xmlputs(wr, dst_buf); - wrbuf_printf(wr, "\n"); - } - else - { - wrbuf_printf(wr, "term %d", col[j].oc); - if (col[j].set_occur) - wrbuf_printf(wr, " " ZINT_FORMAT, - col[j].set_occur); - wrbuf_printf(wr, ": %s\n", dst_buf); - } - } - } - if (use_xml) - wrbuf_puts(wr, " \n"); - nmem_destroy(nmem); + wrbuf_printf(result, ZEBRA_XML_HEADER_STR ">\n"); } + ret = perform_facet(zh, fi, result, num_recs, poset, + spec_list, no_ord, ord_array, use_xml, + cat); if (use_xml) - wrbuf_puts(wr, "\n"); - for (i = 0; i < no_ord; i++) - zebra_strmap_destroy(map_array[i]); - yaz_timing_stop(timing); - yaz_log(YLOG_LOG, "facet second phase real=%4.2f", - yaz_timing_get_real(timing)); - yaz_timing_destroy(&timing); + wrbuf_puts(result, "\n"); } *output_format = yaz_oid_recsyn_xml; zebra_meta_records_destroy(zh, poset, num_recs); diff --git a/test/api/test_safari.c b/test/api/test_safari.c index e28ac6d..1f38486 100644 --- a/test/api/test_safari.c +++ b/test/api/test_safari.c @@ -45,7 +45,7 @@ const char *myrec[] = "00024339 125061 0 1 any the\n" "00024339 125061 0 2 any gamle\n" "00024339 125061 0 3 any mand\n" - "w 00024339 125661 0 4 any Hello\n" + "w 00024339 125661 0 4 any Hello\n" /* index type given */ , "1001\n" /* separate record */ "00024340 125062 0 1 any the\n" @@ -103,7 +103,7 @@ static void tst(int argc, char **argv) myrec[4]), ZEBRA_OK); YAZ_CHECK_EQ(tl_fetch_compare(zh, 1, "zebra::facet::any:0", yaz_oid_recsyn_xml, - "\n" + "\n" " \n" " mand\n" " the\n" @@ -111,15 +111,15 @@ static void tst(int argc, char **argv) " gamle\n" " old\n" " \n" - "\n"), ZEBRA_OK); + "\n"), ZEBRA_OK); YAZ_CHECK_EQ(tl_fetch_compare(zh, 1, "zebra::facet::any:0:2", yaz_oid_recsyn_xml, - "\n" - " \n" + "\n" + " \n" " mand\n" " the\n" " \n" - "\n"), ZEBRA_OK); + "\n"), ZEBRA_OK); /* limit to 125061 */ limits[0] = 125061; diff --git a/test/api/test_search.c b/test/api/test_search.c index 55219f9..de6730f 100644 --- a/test/api/test_search.c +++ b/test/api/test_search.c @@ -67,20 +67,31 @@ static void tst(int argc, char **argv) /* simple term */ YAZ_CHECK(tl_query(zh, "@attr 1=4 notfound", 0)); YAZ_CHECK(tl_query(zh, "@attr 1=4 title", 3)); - tl_fetch_compare(zh, 1, "zebra::facet::title:w", yaz_oid_recsyn_sutrs, - "term 3 3: my\n" - "term 3 3: title\n" - "term 2 2: x\n"); + YAZ_CHECK_EQ(tl_fetch_compare(zh, 1, "zebra::facet::title:w", + yaz_oid_recsyn_sutrs, + "facet w title\n" + "term 3 3: my\n" + "term 3 3: title\n" + "term 2 2: x\n"), ZEBRA_OK); + + YAZ_CHECK_EQ(tl_fetch_compare(zh, 1, "zebra::facet::title:s", + yaz_oid_recsyn_sutrs, + "facet s title\n" + "term 1: my title\n" + "term 1: my title x\n" + "term 1: my x title\n"), ZEBRA_OK); /* trunc right */ YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 5=1 titl", 3)); YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 5=1 x", 2)); - tl_fetch_compare(zh, 1, "zebra::facet::title:w", yaz_oid_recsyn_sutrs, - "term 2 2: my\n" - "term 2 2: title\n" - "term 2 2: x\n"); - + YAZ_CHECK_EQ(tl_fetch_compare(zh, 1, "zebra::facet::title:w", + yaz_oid_recsyn_sutrs, + "facet w title\n" + "term 2 2: my\n" + "term 2 2: title\n" + "term 2 2: x\n"), ZEBRA_OK); + /* trunc left */ YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 5=2 titl", 0)); YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 5=2 x", 2)); -- 1.7.10.4