-/* $Id: benchindex1.c,v 1.2 2006-12-10 21:02:28 adam Exp $
- Copyright (C) 1995-2006
+/* $Id: benchindex1.c,v 1.9 2007-01-15 15:10:17 adam Exp $
+ Copyright (C) 1995-2007
Index Data ApS
This file is part of the Zebra server.
*/
#include <yaz/options.h>
-#if HAVE_SYS_TIMES_H
-#include <sys/times.h>
-#endif
-#if HAVE_SYS_TIME_H
-#include <sys/time.h>
-#endif
-
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <yaz/nmem.h>
#include <yaz/xmalloc.h>
#include <yaz/marcdisp.h>
+#include <yaz/timing.h>
#include <it_key.h>
#include <idzebra/isamb.h>
#include <idzebra/dict.h>
struct index_block {
NMEM nmem;
- size_t no_entries;
+ int no_entries;
size_t current_entry;
size_t current_max;
struct index_term *terms;
int no_words = 0, no_new_words = 0;
const char *dict_info = 0;
ISAM_P isamc_p = 0;
-
-#if HAVE_SYS_TIMES_H
-#if HAVE_SYS_TIME_H
- struct tms tms1, tms2;
- struct timeval start_time, end_time;
- double usec;
- times(&tms1);
- gettimeofday(&start_time, 0);
-#endif
-#endif
+ yaz_timing_t tim_dict = 0;
+ yaz_timing_t tim_isamb = 0;
+ zint number_of_int_splits = isamb_get_int_splits(isb);
+ zint number_of_leaf_splits = isamb_get_leaf_splits(isb);
+ zint number_of_dict_splits = dict_get_no_split(dict);
b->ar = xmalloc(sizeof(*b->ar) * b->no_entries);
for (i = 0; i < b->no_entries; i++, t = t->next)
assert(!t);
qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar);
+ tim_dict = yaz_timing_create();
#if 0
for (i = 0; i < b->no_entries; i++)
{
}
}
dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq);
+
+ yaz_timing_stop(tim_dict);
+ tim_isamb = yaz_timing_create();
b->current_entry = 0;
dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p);
}
- yaz_log(YLOG_LOG, "Flushed %d postings, %d/%d words, %d records",
- b->no_entries, no_words, no_new_words, no_docs);
+ yaz_timing_stop(tim_isamb);
+
+ number_of_int_splits = isamb_get_int_splits(isb) - number_of_int_splits;
+ number_of_leaf_splits = isamb_get_leaf_splits(isb) - number_of_leaf_splits;
+ number_of_dict_splits = dict_get_no_split(dict) - number_of_dict_splits;
+
+ if (b->round == 0)
+ {
+ printf("# run total dict-real user sys isam-real user sys "
+ " intsp leafsp docs postings words new d-spl\n");
+ }
+ b->round++;
+ printf("%5d %9.6f %9.6f %5.2f %5.2f %9.6f %5.2f %5.2f "
+ "%6" ZINT_FORMAT0 " %6" ZINT_FORMAT0
+ " %8d %8d %6d %6d" " %5" ZINT_FORMAT0 "\n",
+ b->round,
+ yaz_timing_get_real(tim_dict) + yaz_timing_get_real(tim_isamb),
+ yaz_timing_get_real(tim_dict),
+ yaz_timing_get_user(tim_dict),
+ yaz_timing_get_sys(tim_dict),
+ yaz_timing_get_real(tim_isamb),
+ yaz_timing_get_user(tim_isamb),
+ yaz_timing_get_sys(tim_isamb),
+ number_of_int_splits,
+ number_of_leaf_splits,
+ no_docs,
+ b->no_entries,
+ no_words,
+ no_new_words,
+ number_of_dict_splits
+ );
+ fflush(stdout);
+
xfree(b->ar);
b->ar = 0;
nmem_reset(b->nmem);
b->no_entries = 0;
b->terms = 0;
-#if HAVE_SYS_TIMES_H
-#if HAVE_SYS_TIME_H
- b->round++;
- gettimeofday(&end_time, 0);
- times(&tms2);
-
- usec = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
- end_time.tv_usec - start_time.tv_usec;
-
- printf("%3d %8.6f %5.2f %5.2f\n",
- b->round,
- usec / 1000000,
- (double) (tms2.tms_utime - tms1.tms_utime)/100,
- (double) (tms2.tms_stime - tms1.tms_stime)/100);
-#endif
-#endif
-
+ yaz_timing_destroy(&tim_isamb);
+ yaz_timing_destroy(&tim_dict);
}
void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict,
int max = b->current_max;
if (total > max)
{
- yaz_log(YLOG_LOG, "flush to disk total=%d max=%d", total, max);
index_block_flush(b, isb, dict, no_docs);
}
}
b->no_entries++;
}
-void exit_usage(void)
-{
- fprintf(stderr, "benchindex1 [-m m] [iso2709file]\n");
- exit(1);
-}
-
void index_term(struct index_block *b, const char *term,
zint docid, zint *seqno)
{
(*seqno)++;
}
-void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid)
+void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid,
+ int subfield_char)
{
int nl = 1;
const char *cp = wrbuf_buf(wrbuf);
if (nl)
{
int i;
- for (i = 0; i<6 && *cp; i++, cp++)
- ;
+ if (cp[0] != ' ')
+ { /* skip field+indicator (e.g. 245 00) */
+ for (i = 0; i<6 && *cp; i++, cp++)
+ ;
+ }
+ else
+ { /* continuation line */
+ for (i = 0; i<4 && *cp; i++, cp++)
+ ;
+ }
}
nl = 0;
if (*cp == '\n')
nl = 1;
cp++;
}
- else if (*cp == '$' && cp[1])
+ else if (*cp == subfield_char && cp[1])
{
if (sz)
{
}
cp += 2;
}
- else if (strchr("$/-;,.:[]\"&(){} ", *cp))
+ else if (strchr("$*/-;,.:[]\"&(){} ", *cp))
{
if (sz)
{
index_term(b, term, docid, &seqno);
}
+/* Index MARC records given in a line-oriented text format.
+ *
+ * Lines are read from inf with fgets.  A line starting with '$' is not
+ * indexed; it only toggles the new-record flag.  The first other line seen
+ * while that flag is set advances *docid_seq and the document count and
+ * gives index_block_check_flush() a chance to flush.  A line starting with
+ * a space is appended to the pending field buffer as a continuation; any
+ * other line first sends the pending buffer to index_wrbuf() (with '*' as
+ * subfield character) and is then buffered itself if it looks like a
+ * field+indicator line (three non-space characters followed by a space).
+ * At end of input the pending buffer is indexed and the block is flushed.
+ *
+ * isb, dict  - handles passed through to the flush routines
+ * docid_seq  - in/out document id counter, advanced as records are seen
+ * inf        - input stream to read lines from
+ * memory     - passed to index_block_new()
+ */
+void index_marc_line_records(ISAMB isb,
+                             Dict dict,
+                             zint *docid_seq,
+                             FILE *inf,
+                             int memory)
+{
+    WRBUF wrbuf = wrbuf_alloc();
+    int no_docs = 0;
+    int new_rec = 1;
+    char line[4096];
+    struct index_block *b = index_block_new(memory);
+    while (fgets(line, sizeof(line)-1, inf))
+    {
+        if (line[0] == '$')
+        {
+            /* separator line: flip the new-record flag, index nothing */
+            new_rec = !new_rec;
+            continue;
+        }
+        if (new_rec)
+        {
+            (*docid_seq)++;
+            no_docs++;
+            index_block_check_flush(b, isb, dict, no_docs);
+            new_rec = 0;
+        }
+
+        if (line[0] == ' ')
+        {
+            /* continuation */
+            wrbuf_puts(wrbuf, line);
+            continue;
+        }
+        else
+        {
+            /* index existing buffer (if any) */
+            if (wrbuf_len(wrbuf))
+            {
+                index_wrbuf(b, wrbuf, *docid_seq, '*');
+                wrbuf_rewind(wrbuf);
+            }
+            /* The NUL checks stop the test at the end of a short line so
+               bytes left over from a previous, longer line are never
+               inspected. */
+            if (line[0] != '\0' &&
+                line[1] != '\0' && line[1] != ' ' &&
+                line[2] != '\0' && line[2] != ' ' &&
+                line[3] == ' ')
+            {
+                /* normal field+indicator line */
+                wrbuf_puts(wrbuf, line);
+            }
+        }
+    }
+    if (wrbuf_len(wrbuf))
+    {
+        index_wrbuf(b, wrbuf, *docid_seq, '*');
+        wrbuf_rewind(wrbuf);
+    }
+    /* NOTE(review): the counters are advanced once more here although each
+       record was already counted when its first line was read; kept as in
+       the original - confirm whether this extra increment is intended. */
+    (*docid_seq)++;
+    no_docs++;
+    index_block_flush(b, isb, dict, no_docs);
+    /* release the field buffer, as index_marc_from_file does */
+    wrbuf_free(wrbuf, 1);
+    index_block_destroy(&b);
+}
+
void index_marc_from_file(ISAMB isb,
Dict dict,
+ zint *docid_seq,
FILE *inf,
int memory,
int verbose, int print_offset)
yaz_marc_t mt = yaz_marc_create();
WRBUF wrbuf = wrbuf_alloc();
struct index_block *b = index_block_new(memory);
- const char *dict_info = 0;
- zint docid_seq = 1;
int no_docs = 0;
- dict_info = dict_lookup(dict, "_s");
- if (dict_info)
- {
- assert(*dict_info == sizeof(docid_seq));
- memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
- }
-
while (1)
{
size_t r;
if (yaz_marc_write_line(mt, wrbuf))
break;
- index_wrbuf(b, wrbuf, docid_seq);
+ index_wrbuf(b, wrbuf, *docid_seq, '$');
wrbuf_rewind(wrbuf);
- docid_seq++;
+ (*docid_seq)++;
no_docs++;
index_block_check_flush(b, isb, dict, no_docs);
wrbuf_free(wrbuf, 1);
yaz_marc_destroy(mt);
index_block_destroy(&b);
- yaz_log(YLOG_LOG, "Total " ZINT_FORMAT " documents", docid_seq);
- dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
+}
+
+/* Print the command-line synopsis to stderr and terminate the process
+   with exit status 1.  The synopsis lists -N, which the option parser in
+   main accepts. */
+void exit_usage(void)
+{
+    fprintf(stderr, "benchindex1 [-t type] [-c d:i] [-m mem] [-i] [-N] "
+            "[inputfile]\n");
+    exit(1);
}
int main(int argc, char **argv)
{
BFiles bfs;
- ISAMB isb;
- ISAMC_M method;
+ ISAMB isb_postings;
+ ISAMC_M method_postings;
Dict dict;
int ret;
int reset = 0;
char *arg;
int memory = 5;
+ int isam_cache_size = 40;
+ int dict_cache_size = 50;
const char *fname = 0;
FILE *inf = stdin;
+ yaz_timing_t tim = 0;
+ zint docid_seq = 1;
+ const char *dict_info;
+ const char *type = "iso2709";
+ int int_count_enable = 1;
- while ((ret = options("im:", argv, argc, &arg)) != -2)
+ while ((ret = options("im:t:c:N", argv, argc, &arg)) != -2)
{
switch(ret)
{
case 'i':
reset = 1;
break;
+ case 't':
+ if (!strcmp(arg, "iso2709"))
+ type = "iso2709";
+ else if (!strcmp(arg, "line"))
+ type = "line";
+ else
+ {
+ fprintf(stderr, "bad type: %s.\n", arg);
+ exit_usage();
+ }
+ break;
+ case 'c':
+ if (sscanf(arg, "%d:%d", &dict_cache_size, &isam_cache_size)
+ != 2)
+ {
+ fprintf(stderr, "bad cache sizes for -c\n");
+ exit_usage();
+ }
+ break;
case 0:
fname = arg;
break;
+ case 'N':
+ int_count_enable = 0;
+ break;
default:
fprintf(stderr, "bad option.\n");
exit_usage();
exit(1);
}
}
- /* setup method (attributes) */
- method.compare_item = key_compare;
- method.log_item = key_logdump_txt;
+ printf("# benchindex1 %s %s\n", __DATE__, __TIME__);
+ printf("# isam_cache_size = %d\n", isam_cache_size);
+ printf("# dict_cache_size = %d\n", dict_cache_size);
+ printf("# int_count_enable = %d\n", int_count_enable);
+ printf("# memory = %d\n", memory);
+
+ /* setup postings isamb attributes */
+ method_postings.compare_item = key_compare;
+ method_postings.log_item = key_logdump_txt;
- method.codec.start = iscz1_start;
- method.codec.decode = iscz1_decode;
- method.codec.encode = iscz1_encode;
- method.codec.stop = iscz1_stop;
- method.codec.reset = iscz1_reset;
+ method_postings.codec.start = iscz1_start;
+ method_postings.codec.decode = iscz1_decode;
+ method_postings.codec.encode = iscz1_encode;
+ method_postings.codec.stop = iscz1_stop;
+ method_postings.codec.reset = iscz1_reset;
- method.debug = 0;
+ method_postings.debug = 0;
/* create block system */
bfs = bfs_create(0, 0);
if (reset)
bf_reset(bfs);
+ tim = yaz_timing_create();
/* create isam handle */
- isb = isamb_open (bfs, "isamb", 1, &method, 0);
- if (!isb)
+ isb_postings = isamb_open (bfs, "isamb", isam_cache_size ? 1 : 0,
+ &method_postings, 0);
+ if (!isb_postings)
{
yaz_log(YLOG_WARN, "isamb_open failed");
exit(2);
}
- dict = dict_open(bfs, "dict", 50, 1, 0, 4096);
+ isamb_set_cache_size(isb_postings, isam_cache_size);
+ isamb_set_int_count(isb_postings, int_count_enable);
+ dict = dict_open(bfs, "dict", dict_cache_size, 1, 0, 4096);
- index_marc_from_file(isb, dict, inf, memory,
- 0 /* verbose */ , 0 /* print_offset */);
+ dict_info = dict_lookup(dict, "_s");
+ if (dict_info)
+ {
+ assert(*dict_info == sizeof(docid_seq));
+ memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
+ }
+
+ if (!strcmp(type, "iso2709"))
+ index_marc_from_file(isb_postings, dict, &docid_seq, inf, memory,
+ 0 /* verbose */ , 0 /* print_offset */);
+ else if (!strcmp(type, "line"))
+ index_marc_line_records(isb_postings, dict, &docid_seq, inf, memory);
+
+ printf("# Total " ZINT_FORMAT " documents\n", docid_seq);
+ dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
dict_close(dict);
- isamb_close(isb);
+ isamb_close(isb_postings);
if (fname)
fclose(inf);
/* exit block system */
bfs_destroy(bfs);
+ yaz_timing_stop(tim);
+
+ printf("# Total timings real=%8.6f user=%3.2f system=%3.2f\n",
+ yaz_timing_get_real(tim),
+ yaz_timing_get_user(tim),
+ yaz_timing_get_sys(tim));
+
+ yaz_timing_destroy(&tim);
+
exit(0);
return 0;
}