X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fretrieve.c;h=7dc82b858e528c366b4a8e8a1e07a77f4083fe94;hb=131e8143a9b8da294d582f0793833679101a2672;hp=d3928d9055f33ace326a0c9b570a3a582ce61da9;hpb=f82c996895d8dcd69e987660ebf9fdaafdba35b0;p=idzebra-moved-to-github.git diff --git a/index/retrieve.c b/index/retrieve.c index d3928d9..7dc82b8 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -1,4 +1,4 @@ -/* $Id: retrieve.c,v 1.78 2007-12-03 13:04:04 adam Exp $ +/* $Id: retrieve.c,v 1.85 2008-03-05 09:21:48 adam Exp $ Copyright (C) 1995-2007 Index Data ApS @@ -49,7 +49,8 @@ static int zebra_create_record_stream(ZebraHandle zh, { RecordAttr *recordAttr = rec_init_attr(zh->reg->zei, *rec); - if ((*rec)->size[recInfo_storeData] > 0) + if ((*rec)->size[recInfo_storeData] > 0 + || (*rec)->info[recInfo_filename] == 0) zebra_create_stream_mem(stream, (*rec)->info[recInfo_storeData], (*rec)->size[recInfo_storeData]); else @@ -75,11 +76,12 @@ static int zebra_create_record_stream(ZebraHandle zh, } return 0; } - + struct index_spec { const char *index_name; const char *index_type; + const char *extra; struct index_spec *next; }; @@ -103,6 +105,7 @@ struct index_spec *parse_index_spec(const char *elem, NMEM nmem, struct index_spec *spec = nmem_malloc(nmem, sizeof(*spec)); spec->index_type = 0; spec->next = 0; + spec->extra = 0; if (!first) first = spec; @@ -119,10 +122,19 @@ struct index_spec *parse_index_spec(const char *elem, NMEM nmem, cp++; cp0 = cp; - while (*cp != '\0' && *cp != ',') + while (*cp != '\0' && *cp != ',' && *cp != ':') cp++; spec->index_type = nmem_strdupn(nmem, cp0, cp - cp0); } + if (*cp == ':') /* extra arguments */ + { + cp++; + cp0 = cp; + + while (*cp != '\0' && *cp != ',' && *cp != ':') + cp++; + spec->extra = nmem_strdupn(nmem, cp0, cp - cp0); + } if (*cp != ',') break; } @@ -383,36 +395,35 @@ int zebra_special_index_fetch(ZebraHandle zh, zint sysno, ODR odr, if (retrieval_type == 0 || !strcmp(retrieval_type_cstr, index_type)) { - zebra_term_untrans(zh, index_type, dst_buf, str); - if (strlen(dst_buf)) + if (zebra_term_untrans(zh, index_type, dst_buf, str)) + *dst_buf = '\0'; /* untrans failed */ + + if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml)) { - if (!oid_oidcmp(input_format, yaz_oid_recsyn_xml)) - { - wrbuf_printf(wrbuf, " ", - key_in.mem[key_in.len -1]); + wrbuf_printf(wrbuf, " \n"); - } - else - { - wrbuf_printf(wrbuf, "%s ", string_index); - - wrbuf_printf(wrbuf, "%s", index_type); - - for (i = 1; i < key_in.len; i++) - wrbuf_printf(wrbuf, " " ZINT_FORMAT, + wrbuf_printf(wrbuf, " type=\"%s\"", index_type); + + wrbuf_printf(wrbuf, " seq=\"" ZINT_FORMAT "\">", + key_in.mem[key_in.len -1]); + wrbuf_xmlputs(wrbuf, dst_buf); + wrbuf_printf(wrbuf, "\n"); + } + else + { + wrbuf_printf(wrbuf, "%s ", string_index); + + wrbuf_printf(wrbuf, "%s", index_type); + + for (i = 1; i < key_in.len; i++) + wrbuf_printf(wrbuf, " " ZINT_FORMAT, key_in.mem[i]); - - wrbuf_printf(wrbuf, " %s", dst_buf); - wrbuf_printf(wrbuf, "\n"); - } + wrbuf_printf(wrbuf, " %s", dst_buf); + + wrbuf_printf(wrbuf, "\n"); + } } @@ -629,6 +640,22 @@ zint freq_term(ZebraHandle zh, int ord, const char *term, RSET rset_set) return hits; } +int term_qsort_handle(const void *a, const void *b) +{ + const struct term_collect *l = a; + const struct term_collect *r = b; + if (l->set_occur < r->set_occur) + return 1; + else if (l->set_occur > r->set_occur) + return -1; + else + { + const char *lterm = l->term ? l->term : ""; + const char *rterm = r->term ? r->term : ""; + return strcmp(lterm, rterm); + } +} + void term_collect_freq(ZebraHandle zh, struct term_collect *col, int no_terms_collect, int ord, RSET rset) @@ -639,6 +666,7 @@ void term_collect_freq(ZebraHandle zh, if (col[i].term) col[i].set_occur = freq_term(zh, ord, col[i].term, rset); } + qsort(col, no_terms_collect, sizeof(*col), term_qsort_handle); } struct term_collect *term_collect_create(zebra_strmap_t sm, @@ -662,14 +690,17 @@ struct term_collect *term_collect_create(zebra_strmap_t sm, it = zebra_strmap_it_create(sm); while ((term = zebra_strmap_it_next(it, &data_buf, &data_len))) { + /* invariant: + col[0] has lowest oc . col[no_terms_collect-1] has highest oc */ int oc = *(int*) data_buf; int j = 0; /* insertion may be slow but terms terms will be "infrequent" and - thus number of iterations should be small below */ + thus number of iterations should be small below + */ while (j < no_terms_collect && oc > col[j].oc) j++; - if (j) - { + if (j) + { /* oc <= col[j] and oc > col[j-1] */ --j; memmove(col, col+1, sizeof(*col) * j); col[j].term = term; @@ -690,26 +721,44 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, zint *pos_array; int i; int num_recs = 10; /* number of records to analyze */ - int no_collect_terms = 20; /* number of term candidates */ + int max_chunks = 2; ZebraMetaRecord *poset; ZEBRA_RES ret = ZEBRA_OK; int *ord_array; WRBUF wr = wrbuf_alloc(); - + int use_xml = 0; int no_ord = 0; struct index_spec *spec, *spec_list; int error; + res_get_int(zh->res, "facetNumRecs", &num_recs); + res_get_int(zh->res, "facetMaxChunks", &max_chunks); + + /* see if XML is required for response */ + if (oid_oidcmp(input_format, yaz_oid_recsyn_xml) == 0) + use_xml = 1; spec_list = parse_index_spec(elemsetname, odr_getmem(odr), &error); if (!spec_list || error) - return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; - + { + zebra_setError( + zh, + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_, + 0); + return ZEBRA_FAIL; + } + for (spec = spec_list; spec; spec = spec->next) { if (!spec->index_type) - return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + { + zebra_setError( + zh, + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_, + 0); + return ZEBRA_FAIL; + } no_ord++; } @@ -723,12 +772,15 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, spec->index_name); if (ord == -1) { - return YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_; + zebra_setError( + zh, + YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_, + 0); + return ZEBRA_FAIL; } ord_array[i] = ord; } - - pos_array = (zint *) xmalloc(num_recs * sizeof(*pos_array)); + pos_array = (zint *) odr_malloc(odr, num_recs * sizeof(*pos_array)); for (i = 0; i < num_recs; i++) pos_array[i] = i+1; poset = zebra_meta_records_create(zh, setname, num_recs, pos_array); @@ -736,11 +788,11 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, { zebra_setError(zh, YAZ_BIB1_SPECIFIED_RESULT_SET_DOES_NOT_EXIST, setname); - xfree(pos_array); ret = ZEBRA_FAIL; } else { + yaz_timing_t timing = yaz_timing_create(); zebra_strmap_t *map_array = odr_malloc(odr, sizeof *map_array * no_ord); for (i = 0; i < no_ord; i++) @@ -757,7 +809,9 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, poset[i].sysno, sysnos, &no_sysnos); assert(no_sysnos > 0); - for (j = 0; j < no_sysnos; j++) + yaz_log(YLOG_LOG, "Analyzing rec=%d ISAM sysno=" ZINT_FORMAT " chunks=%d", + i, poset[i].sysno, no_sysnos); + for (j = 0; j < no_sysnos && j < max_chunks; j++) { size_t slen; const char *str; @@ -766,7 +820,9 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, zebra_rec_keys_t keys = zebra_rec_keys_open(); zebra_rec_keys_set_buf(keys, rec->info[recInfo_delKeys], rec->size[recInfo_delKeys], 0); - + + yaz_log(YLOG_LOG, "rec %d " ZINT_FORMAT " %s", + j, sysnos[j], zebra_rec_keys_empty(keys) ? "empty" : "non-empty"); if (zebra_rec_keys_rewind(keys)) { while (zebra_rec_keys_read(keys, &str, &slen, &key_in)) @@ -777,7 +833,8 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, i++, spec = spec->next) { int ord = CAST_ZINT_TO_INT(key_in.mem[0]); - if (ord == ord_array[i]) + if (ord == ord_array[i] && + str[0] != FIRST_IN_FIELD_CHAR) { int *freq; zebra_strmap_t sm = map_array[i]; @@ -798,49 +855,77 @@ static ZEBRA_RES facet_fetch(ZebraHandle zh, const char *setname, rec_free(&rec); } } - wrbuf_puts(wr, "\n"); + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet first phase real=%4.2f", + yaz_timing_get_real(timing)); + yaz_timing_start(timing); + if (use_xml) + wrbuf_puts(wr, "\n"); for (spec = spec_list, i = 0; i < no_ord; i++, spec = spec->next) { int j; NMEM nmem = nmem_create(); - struct term_collect *col = term_collect_create(map_array[i], - no_collect_terms, - nmem); + struct term_collect *col; + int no_collect_terms = 20; + + if (spec->extra) + no_collect_terms = atoi(spec->extra); + if (no_collect_terms < 1) + no_collect_terms = 1; + col = term_collect_create(map_array[i], no_collect_terms, nmem); term_collect_freq(zh, col, no_collect_terms, ord_array[i], resultSetRef(zh, setname)); - wrbuf_printf(wr, " \n", - spec->index_type, spec->index_name); + if (use_xml) + wrbuf_printf(wr, " \n", + spec->index_type, spec->index_name); + else + wrbuf_printf(wr, "facet %s %s\n", + spec->index_type, spec->index_name); for (j = 0; j < no_collect_terms; j++) { if (col[j].term) { char dst_buf[IT_MAX_WORD]; zebra_term_untrans(zh, spec->index_type, dst_buf, col[j].term); - wrbuf_printf(wr, " "); - wrbuf_xmlputs(wr, dst_buf); - wrbuf_printf(wr, "\n"); + if (use_xml) + { + wrbuf_printf(wr, " "); + wrbuf_xmlputs(wr, dst_buf); + wrbuf_printf(wr, "\n"); + } + else + { + wrbuf_printf(wr, "term %d", col[j].oc); + if (col[j].set_occur) + wrbuf_printf(wr, " " ZINT_FORMAT, + col[j].set_occur); + wrbuf_printf(wr, ": %s\n", dst_buf); + } } } - wrbuf_puts(wr, " \n"); + if (use_xml) + wrbuf_puts(wr, " \n"); nmem_destroy(nmem); } - wrbuf_puts(wr, "\n"); + if (use_xml) + wrbuf_puts(wr, "\n"); for (i = 0; i < no_ord; i++) zebra_strmap_destroy(map_array[i]); + yaz_timing_stop(timing); + yaz_log(YLOG_LOG, "facet second phase real=%4.2f", + yaz_timing_get_real(timing)); + yaz_timing_destroy(&timing); } - - *rec_bufp = odr_strdup(odr, wrbuf_cstr(wr)); wrbuf_destroy(wr); *rec_lenp = strlen(*rec_bufp); *output_format = yaz_oid_recsyn_xml; - xfree(pos_array); zebra_meta_records_destroy(zh, poset, num_recs); return ret; }