X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=isamb%2Fbenchindex1.c;h=ead71271d0383d9f9028cac3256b07b83843d59e;hp=1a9cb88b075ad9b55498b7afae1271b3f8e01036;hb=693a0db94b4b3ac9aee7722572a6b81a86a12e13;hpb=2f2827ef6acf22faa96e0ff55bbb24845e423176 diff --git a/isamb/benchindex1.c b/isamb/benchindex1.c index 1a9cb88..ead7127 100644 --- a/isamb/benchindex1.c +++ b/isamb/benchindex1.c @@ -1,8 +1,5 @@ -/* $Id: benchindex1.c,v 1.1 2006-12-10 21:00:56 adam Exp $ - Copyright (C) 1995-2006 - Index Data ApS - -This file is part of the Zebra server. +/* This file is part of the Zebra server. + Copyright (C) 1994-2011 Index Data Zebra is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -20,14 +17,10 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include -#if HAVE_SYS_TIMES_H -#include -#endif -#if HAVE_SYS_TIME_H -#include +#if HAVE_CONFIG_H +#include #endif - +#include #include #include #include @@ -35,6 +28,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #include #include #include @@ -42,7 +36,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA struct index_block { NMEM nmem; - size_t no_entries; + int no_entries; size_t current_entry; size_t current_max; struct index_term *terms; @@ -137,16 +131,11 @@ void index_block_flush(struct index_block *b, ISAMB isb, Dict dict, int no_words = 0, no_new_words = 0; const char *dict_info = 0; ISAM_P isamc_p = 0; - -#if HAVE_SYS_TIMES_H -#if HAVE_SYS_TIME_H - struct tms tms1, tms2; - struct timeval start_time, end_time; - double usec; - times(&tms1); - gettimeofday(&start_time, 0); -#endif -#endif + yaz_timing_t tim_dict = 0; + yaz_timing_t tim_isamb = 0; + zint number_of_int_splits = isamb_get_int_splits(isb); + zint number_of_leaf_splits = isamb_get_leaf_splits(isb); + zint number_of_dict_splits = dict_get_no_split(dict); b->ar = xmalloc(sizeof(*b->ar) * b->no_entries); for (i = 0; i < b->no_entries; i++, t = t->next) @@ -157,6 +146,7 @@ void index_block_flush(struct index_block *b, ISAMB isb, Dict dict, assert(!t); qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar); + tim_dict = yaz_timing_create(); #if 0 for (i = 0; i < b->no_entries; i++) { @@ -200,6 +190,9 @@ void index_block_flush(struct index_block *b, ISAMB isb, Dict dict, } } dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq); + + yaz_timing_stop(tim_dict); + tim_isamb = yaz_timing_create(); b->current_entry = 0; @@ -216,31 +209,47 @@ void index_block_flush(struct index_block *b, ISAMB isb, Dict dict, dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p); } - yaz_log(YLOG_LOG, "Flushed %d postings, %d/%d words, %d records", - b->no_entries, no_words, no_new_words, no_docs); + yaz_timing_stop(tim_isamb); + + number_of_int_splits = isamb_get_int_splits(isb) - number_of_int_splits; + number_of_leaf_splits = isamb_get_leaf_splits(isb) - number_of_leaf_splits; + number_of_dict_splits = dict_get_no_split(dict) - number_of_dict_splits; + + if (b->round == 0) + { + printf("# run total dict-real user sys isam-real user sys " + " intsp leafsp docs postings words new d-spl\n"); + } + b->round++; + printf("%5d %9.6f %9.6f %5.2f %5.2f %9.6f %5.2f %5.2f " + "%6" ZINT_FORMAT0 " %6" ZINT_FORMAT0 + " %8d %8d %6d %6d" " %5" ZINT_FORMAT0 "\n", + b->round, + yaz_timing_get_real(tim_dict) + yaz_timing_get_real(tim_isamb), + yaz_timing_get_real(tim_dict), + yaz_timing_get_user(tim_dict), + yaz_timing_get_sys(tim_dict), + yaz_timing_get_real(tim_isamb), + yaz_timing_get_user(tim_isamb), + yaz_timing_get_sys(tim_isamb), + number_of_int_splits, + number_of_leaf_splits, + no_docs, + b->no_entries, + no_words, + no_new_words, + number_of_dict_splits + ); + fflush(stdout); + xfree(b->ar); b->ar = 0; nmem_reset(b->nmem); b->no_entries = 0; b->terms = 0; -#if HAVE_SYS_TIMES_H -#if HAVE_SYS_TIME_H - b->round++; - gettimeofday(&end_time, 0); - times(&tms2); - - usec = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + - end_time.tv_usec - start_time.tv_usec; - - printf("%3d %8.6f %5.2f %5.2f\n", - b->round, - usec / 1000000, - (double) (tms2.tms_utime - tms1.tms_utime)/100, - (double) (tms2.tms_stime - tms1.tms_stime)/100); -#endif -#endif - + yaz_timing_destroy(&tim_isamb); + yaz_timing_destroy(&tim_dict); } void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict, @@ -250,7 +259,6 @@ void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict, int max = b->current_max; if (total > max) { - yaz_log(YLOG_LOG, "flush to disk total=%d max=%d", total, max); index_block_flush(b, isb, dict, no_docs); } } @@ -267,12 +275,6 @@ void index_block_add(struct index_block *b, b->no_entries++; } -void exit_usage(void) -{ - fprintf(stderr, "benchindex1 [-z sz]\n"); - exit(1); -} - void index_term(struct index_block *b, const char *term, zint docid, zint *seqno) { @@ -284,7 +286,8 @@ void index_term(struct index_block *b, const char *term, (*seqno)++; } -void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid) +void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid, + int subfield_char) { int nl = 1; const char *cp = wrbuf_buf(wrbuf); @@ -297,8 +300,16 @@ void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid) if (nl) { int i; - for (i = 0; i<6 && *cp; i++, cp++) - ; + if (cp[0] != ' ') + { /* skip field+indicator (e.g. 245 00) */ + for (i = 0; i<6 && *cp; i++, cp++) + ; + } + else + { /* continuation line */ + for (i = 0; i<4 && *cp; i++, cp++) + ; + } } nl = 0; if (*cp == '\n') @@ -311,7 +322,7 @@ void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid) nl = 1; cp++; } - else if (*cp == '$' && cp[1]) + else if (*cp == subfield_char && cp[1]) { if (sz) { @@ -320,7 +331,7 @@ void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid) } cp += 2; } - else if (strchr("$/-;,.:[]\"&(){} ", *cp)) + else if (strchr("$*/-;,.:[]\"&(){} ", *cp)) { if (sz) { @@ -345,8 +356,71 @@ void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid) index_term(b, term, docid, &seqno); } +void index_marc_line_records(ISAMB isb, + Dict dict, + zint *docid_seq, + FILE *inf, + int memory) +{ + WRBUF wrbuf = wrbuf_alloc(); + int no_docs = 0; + int new_rec = 1; + char line[4096]; + struct index_block *b = index_block_new(memory); + while(fgets(line, sizeof(line)-1, inf)) + { + if (line[0] == '$') + { + if (!new_rec) + new_rec = 1; + else + new_rec = 0; + continue; + } + if (new_rec) + { + (*docid_seq)++; + no_docs++; + index_block_check_flush(b, isb, dict, no_docs); + new_rec = 0; + } + + if (line[0] == ' ') + { + /* continuation */ + wrbuf_puts(wrbuf, line); + continue; + } + else + { + /* index existing buffer (if any) */ + if (wrbuf_len(wrbuf)) + { + index_wrbuf(b, wrbuf, *docid_seq, '*'); + wrbuf_rewind(wrbuf); + } + if (line[0] != ' ' && line[1] != ' ' && line[2] != ' ' && + line[3] == ' ') + { + /* normal field+indicator line */ + wrbuf_puts(wrbuf, line); + } + } + } + if (wrbuf_len(wrbuf)) + { + index_wrbuf(b, wrbuf, *docid_seq, '*'); + wrbuf_rewind(wrbuf); + } + (*docid_seq)++; + no_docs++; + index_block_flush(b, isb, dict, no_docs); + index_block_destroy(&b); +} + void index_marc_from_file(ISAMB isb, Dict dict, + zint *docid_seq, FILE *inf, int memory, int verbose, int print_offset) @@ -354,17 +428,8 @@ void index_marc_from_file(ISAMB isb, yaz_marc_t mt = yaz_marc_create(); WRBUF wrbuf = wrbuf_alloc(); struct index_block *b = index_block_new(memory); - const char *dict_info = 0; - zint docid_seq = 1; int no_docs = 0; - dict_info = dict_lookup(dict, "_s"); - if (dict_info) - { - assert(*dict_info == sizeof(docid_seq)); - memcpy(&docid_seq, dict_info+1, sizeof(docid_seq)); - } - while (1) { size_t r; @@ -417,35 +482,46 @@ void index_marc_from_file(ISAMB isb, if (yaz_marc_write_line(mt, wrbuf)) break; - index_wrbuf(b, wrbuf, docid_seq); + index_wrbuf(b, wrbuf, *docid_seq, '$'); wrbuf_rewind(wrbuf); - docid_seq++; + (*docid_seq)++; no_docs++; index_block_check_flush(b, isb, dict, no_docs); } index_block_flush(b, isb, dict, no_docs); - wrbuf_free(wrbuf, 1); + wrbuf_destroy(wrbuf); yaz_marc_destroy(mt); index_block_destroy(&b); - yaz_log(YLOG_LOG, "Total " ZINT_FORMAT " documents", docid_seq); - dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq); +} + +void exit_usage(void) +{ + fprintf(stderr, "benchindex1 [-t type] [-c d:i] [-m mem] [-i] [inputfile]\n"); + exit(1); } int main(int argc, char **argv) { BFiles bfs; - ISAMB isb; - ISAMC_M method; + ISAMB isb_postings; + ISAMC_M method_postings; Dict dict; int ret; int reset = 0; char *arg; int memory = 5; + int isam_cache_size = 40; + int dict_cache_size = 50; const char *fname = 0; FILE *inf = stdin; + yaz_timing_t tim = 0; + zint docid_seq = 1; + const char *dict_info; + const char *type = "iso2709"; + int int_count_enable = 1; - while ((ret = options("im:", argv, argc, &arg)) != -2) + while ((ret = options("im:t:c:N", argv, argc, &arg)) != -2) { switch(ret) { @@ -455,9 +531,31 @@ int main(int argc, char **argv) case 'i': reset = 1; break; + case 't': + if (!strcmp(arg, "iso2709")) + type = "iso2709"; + else if (!strcmp(arg, "line")) + type = "line"; + else + { + fprintf(stderr, "bad type: %s.\n", arg); + exit_usage(); + } + break; + case 'c': + if (sscanf(arg, "%d:%d", &dict_cache_size, &isam_cache_size) + != 2) + { + fprintf(stderr, "bad cache sizes for -c\n"); + exit_usage(); + } + break; case 0: fname = arg; break; + case 'N': + int_count_enable = 0; + break; default: fprintf(stderr, "bad option.\n"); exit_usage(); @@ -473,17 +571,23 @@ int main(int argc, char **argv) exit(1); } } - /* setup method (attributes) */ - method.compare_item = key_compare; - method.log_item = key_logdump_txt; + printf("# benchindex1 %s %s\n", __DATE__, __TIME__); + printf("# isam_cache_size = %d\n", isam_cache_size); + printf("# dict_cache_size = %d\n", dict_cache_size); + printf("# int_count_enable = %d\n", int_count_enable); + printf("# memory = %d\n", memory); + + /* setup postings isamb attributes */ + method_postings.compare_item = key_compare; + method_postings.log_item = key_logdump_txt; - method.codec.start = iscz1_start; - method.codec.decode = iscz1_decode; - method.codec.encode = iscz1_encode; - method.codec.stop = iscz1_stop; - method.codec.reset = iscz1_reset; + method_postings.codec.start = iscz1_start; + method_postings.codec.decode = iscz1_decode; + method_postings.codec.encode = iscz1_encode; + method_postings.codec.stop = iscz1_stop; + method_postings.codec.reset = iscz1_reset; - method.debug = 0; + method_postings.debug = 0; /* create block system */ bfs = bfs_create(0, 0); @@ -496,31 +600,58 @@ int main(int argc, char **argv) if (reset) bf_reset(bfs); + tim = yaz_timing_create(); /* create isam handle */ - isb = isamb_open (bfs, "isamb", 1, &method, 0); - if (!isb) + isb_postings = isamb_open (bfs, "isamb", isam_cache_size ? 1 : 0, + &method_postings, 0); + if (!isb_postings) { yaz_log(YLOG_WARN, "isamb_open failed"); exit(2); } - dict = dict_open(bfs, "dict", 50, 1, 0, 4096); + isamb_set_cache_size(isb_postings, isam_cache_size); + isamb_set_int_count(isb_postings, int_count_enable); + dict = dict_open(bfs, "dict", dict_cache_size, 1, 0, 4096); - index_marc_from_file(isb, dict, inf, memory, - 0 /* verbose */ , 0 /* print_offset */); + dict_info = dict_lookup(dict, "_s"); + if (dict_info) + { + assert(*dict_info == sizeof(docid_seq)); + memcpy(&docid_seq, dict_info+1, sizeof(docid_seq)); + } + + if (!strcmp(type, "iso2709")) + index_marc_from_file(isb_postings, dict, &docid_seq, inf, memory, + 0 /* verbose */ , 0 /* print_offset */); + else if (!strcmp(type, "line")) + index_marc_line_records(isb_postings, dict, &docid_seq, inf, memory); + + printf("# Total " ZINT_FORMAT " documents\n", docid_seq); + dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq); dict_close(dict); - isamb_close(isb); + isamb_close(isb_postings); if (fname) fclose(inf); /* exit block system */ bfs_destroy(bfs); + yaz_timing_stop(tim); + + printf("# Total timings real=%8.6f user=%3.2f system=%3.2f\n", + yaz_timing_get_real(tim), + yaz_timing_get_user(tim), + yaz_timing_get_sys(tim)); + + yaz_timing_destroy(&tim); + exit(0); return 0; } /* * Local variables: * c-basic-offset: 4 + * c-file-style: "Stroustrup" * indent-tabs-mode: nil * End: * vim: shiftwidth=4 tabstop=8 expandtab