X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fretrieve.c;h=7dc82b858e528c366b4a8e8a1e07a77f4083fe94;hb=131e8143a9b8da294d582f0793833679101a2672;hp=021a7ea70ba9a7365d74e09f7c10c3e6b15aef89;hpb=52faec54d6e3cc18105f36546df7b23faeb9c945;p=idzebra-moved-to-github.git diff --git a/index/retrieve.c b/index/retrieve.c index 021a7ea..7dc82b8 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -1,4 +1,4 @@ -/* $Id: retrieve.c,v 1.80 2007-12-04 12:52:33 adam Exp $ +/* $Id: retrieve.c,v 1.85 2008-03-05 09:21:48 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -49,7 +49,8 @@ static int zebra_create_record_stream(ZebraHandle zh, { RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, *rec); - if ((*rec)->size[recInfo_storeData] > 0) + if ((*rec)->size[recInfo_storeData] > 0 + || (*rec)->info[recInfo_filename] == 0) zebra_create_stream_mem(stream, (*rec)->info[recInfo_storeData], (*rec)->size[recInfo_storeData]); else @@ -75,11 +76,12 @@ static int zebra_create_record_stream(ZebraHandle zh, } return 0; } - + struct index_spec { const char *index_name; const char *index_type; + const char *extra; struct index_spec *next; }; @@ -103,6 +105,7 @@ struct index_spec *parse_index_spec(const char *elem, NMEM nmem, struct index_spec *spec = nmem_malloc(nmem, sizeof(*spec)); spec->index_type = 0; spec->next = 0; + spec->extra = 0; if (!first) first = spec; @@ -119,10 +122,19 @@ struct index_spec *parse_index_spec(const char *elem, NMEM nmem, cp++; cp0 = cp; - while (*cp != '\0' && *cp != ',') + while (*cp != '\0' && *cp != ',' && *cp != ':') cp++; spec->index_type = nmem_strdupn(nmem, cp0, cp - cp0); } + if (*cp == ':') /* extra arguments */ + { + cp++; + cp0 = cp; + + while (*cp != '\0' && *cp != ',' && *cp != ':') + cp++; + spec->extra = nmem_strdupn(nmem, cp0, cp - cp0); + } if (*cp != ',') break; } @@ -383,36 +395,35 @@ int zebra_special_index_fetch(ZebraHandle zh, zint sysno, ODR odr, if (retrieval_type == 0 || !strcmp(retrieval_type_cstr, index_type)) { - zebra_term_untrans(zh, index_type, dst_buf, str); - if (strlen(dst_buf)) + if (zebra_term_untrans(zh, index_type, dst_buf, str)) + *dst_buf = '\0'; /* untrans failed */ + + if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml)) { - if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml)) - { - wrbuf_printf(wrbuf, " ", - key_in.mem[key_in.len -1]); + wrbuf_printf(wrbuf, " \n"); - } - else - { - wrbuf_printf(wrbuf, "%s ", string_index); - - wrbuf_printf(wrbuf, "%s", index_type); - - for (i = 1; i < key_in.len; i++) - wrbuf_printf(wrbuf, " " ZINT_FORMAT, + wrbuf_printf(wrbuf, " type=\"%s\"", index_type); + + wrbuf_printf(wrbuf, " seq=\"" ZINT_FORMAT "\">", + key_in.mem[key_in.len -1]); + wrbuf_xmlputs(wrbuf, dst_buf); + wrbuf_printf(wrbuf, "\n"); + } + else + { + wrbuf_printf(wrbuf, "%s ", string_index); + + wrbuf_printf(wrbuf, "%s", index_type); + + for (i = 1; i < key_in.len; i++) + wrbuf_printf(wrbuf, " " ZINT_FORMAT, key_in.mem[i]); - - wrbuf_printf(wrbuf, " %s", dst_buf); - wrbuf_printf(wrbuf, "\n"); - } + wrbuf_printf(wrbuf, " %s", dst_buf); + + wrbuf_printf(wrbuf, "\n"); + } } @@ -629,6 +640,22 @@ zint freq_term(ZebraHandle zh, int ord, const char *term, RSET rset_set) return hits; } +int term_qsort_handle(const void *a, const void *b) +{ + const struct term_collect *l = a; + const struct term_collect *r = b; + if (l->set_occur < r->set_occur) + return 1; + else if (l->set_occur > r->set_occur) + return -1; + else + { + const char *lterm = l->term ? l->term : ""; + const char *rterm = r->term ? r->term : ""; + return strcmp(lterm, rterm); + } +} + void term_collect_freq(ZebraHandle zh, struct term_collect *col, int no_terms_collect, int ord, RSET rset) @@ -639,6 +666,7 @@ void term_collect_freq(ZebraHandle zh, if (col[i].term) col[i].set_occur = freq_term(zh, ord, col[i].term, rset); } + qsort(col, no_terms_collect, sizeof(*col), term_qsort_handle); } struct term_collect *term_collect_create(zebra_strmap_t sm, @@ -662,14 +690,17 @@ struct term_collect *term_collect_create(zebra_strmap_t sm, it = zebra_strmap_it_create(sm); while ((term = zebra_strmap_it_next(it, &data_buf, &data_len))) { + /* invariant: + col[0] has lowest oc . col[no_terms_collect-1] has highest oc */ int oc = *(int*) data_buf; int j = 0; /* insertion may be slow but terms terms will be "infrequent" and - thus number of iterations should be small below */ + thus number of iterations should be small below + */ while (j < no_terms_collect && oc > col[j].oc) j++; - if (j) - { + if (j) + { /* oc <= col[j] and oc > col[j-1] */ --j; memmove(col, col+1, sizeof(*col) * j); col[j].term = term; @@ -690,17 +721,19 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, zint *pos_array; int i; int num_recs = 10; /* number of records to analyze */ - int no_collect_terms = 20; /* number of term candidates */ + int max_chunks = 2; ZebraMetaRecord *poset; ZEBRA_RES ret = ZEBRA_OK; int *ord_array; WRBUF wr = wrbuf_alloc(); int use_xml = 0; - int no_ord = 0; struct index_spec *spec, *spec_list; int error; + res_get_int(zh->res, "facetNumRecs", &num_recs); + res_get_int(zh->res, "facetMaxChunks", &max_chunks); + /* see if XML is required for response */ if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0) use_xml = 1; @@ -759,6 +792,7 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, } else { + yaz_timing_t timing = yaz_timing_create(); zebra_strmap_t *map_array = odr_malloc(odr, sizeof *map_array * no_ord); for (i = 0; i < no_ord; i++) @@ -775,7 +809,9 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, poset[i].sysno, sysnos, &no_sysnos); assert(no_sysnos > 0); - for (j = 0; j < no_sysnos; j++) + yaz_log(YLOG_LOG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d", + i, poset[i].sysno, no_sysnos); + for (j = 0; j < no_sysnos && j < max_chunks; j++) { size_t slen; const char *str; @@ -784,7 +820,9 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, zebra_rec_keys_t keys = zebra_rec_keys_open(); zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], 0); - + + yaz_log(YLOG_LOG, "rec %d " ZINT_FORMAT " %s", + j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty"); if (zebra_rec_keys_rewind(keys)) { while (zebra_rec_keys_read(keys, &str, &slen, &key_in)) @@ -817,15 +855,24 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, rec_free(&rec); } } + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet first phase real=%4.2f", + yaz_timing_get_real(timing)); + yaz_timing_start(timing); if (use_xml) wrbuf_puts(wr, "\n"); for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next) { int j; NMEM nmem = nmem_create(); - struct term_collect *col = term_collect_create(map_array[i], - no_collect_terms, - nmem); + struct term_collect *col; + int no_collect_terms = 20; + + if (spec->extra) + no_collect_terms = atoi(spec->extra); + if (no_collect_terms < 1) + no_collect_terms = 1; + col = term_collect_create(map_array[i], no_collect_terms, nmem); term_collect_freq(zh, col, no_collect_terms, ord_array[i], resultSetRef(zh, setname)); @@ -869,9 +916,11 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, wrbuf_puts(wr, "\n"); for (i = 0; i < no_ord; i++) zebra_strmap_destroy(map_array[i]); + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet second phase real=%4.2f", + yaz_timing_get_real(timing)); + yaz_timing_destroy(&timing); } - - *rec_bufp = odr_strdup(odr, wrbuf_cstr(wr)); wrbuf_destroy(wr); *rec_lenp = strlen(*rec_bufp);