From 2f2827ef6acf22faa96e0ff55bbb24845e423176 Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Sun, 10 Dec 2006 21:00:56 +0000
Subject: [PATCH] Make MARC indexer with different ISAM strategy.

---
 isamb/.cvsignore    |    1 +
 isamb/Makefile.am   |   10 +-
 isamb/benchindex1.c |  590 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 599 insertions(+), 2 deletions(-)
 create mode 100644 isamb/benchindex1.c

diff --git a/isamb/.cvsignore b/isamb/.cvsignore
index 2f5499e..83cbe0b 100644
--- a/isamb/.cvsignore
+++ b/isamb/.cvsignore
@@ -5,6 +5,7 @@ Makefile.in
 *.mf
 tstisamb
 benchisamb
+benchindex1
 *.lo
 *.la
 *.dat
diff --git a/isamb/Makefile.am b/isamb/Makefile.am
index 05b3e2d..c65ddb6 100644
--- a/isamb/Makefile.am
+++ b/isamb/Makefile.am
@@ -1,8 +1,8 @@
-## $Id: Makefile.am,v 1.15 2006-12-10 11:54:29 adam Exp $
+## $Id: Makefile.am,v 1.16 2006-12-10 21:00:56 adam Exp $
 
 noinst_LTLIBRARIES = libidzebra-isamb.la
 
-noinst_PROGRAMS = benchisamb
+noinst_PROGRAMS = benchisamb benchindex1
 check_PROGRAMS = tstisamb
 
 EXTRA_DIST = bench1.sh bench1.plt bench2.sh bench2.plt
@@ -19,6 +19,12 @@ benchisamb_LDADD = libidzebra-isamb.la \
 	../bfile/libidzebra-bfile.la \
 	../util/libidzebra-util.la $(YAZLALIB)
 
+benchindex1_SOURCES = benchindex1.c
+benchindex1_LDADD = libidzebra-isamb.la \
+	../bfile/libidzebra-bfile.la \
+	../dict/libidzebra-dict.la \
+	../util/libidzebra-util.la $(YAZLALIB)
+
 libidzebra_isamb_la_SOURCES = isamb.c
 
 AM_CPPFLAGS=-I$(srcdir)/../include $(YAZINC)
diff --git a/isamb/benchindex1.c b/isamb/benchindex1.c
new file mode 100644
index 0000000..1a9cb88
--- /dev/null
+++ b/isamb/benchindex1.c
@@ -0,0 +1,590 @@
+/* $Id: benchindex1.c,v 1.1 2006-12-10 21:00:56 adam Exp $
+   Copyright (C) 1995-2006
+   Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+
+/* NOTE(review): the header names were missing from the copy of this patch
+   that was reviewed; the list below is inferred from the identifiers the
+   file uses and must be confirmed against the Zebra and YAZ trees. */
+#include <yaz/options.h>
+#if HAVE_SYS_TIMES_H
+#include <sys/times.h>
+#endif
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <assert.h>
+#include <yaz/log.h>
+#include <yaz/nmem.h>
+#include <yaz/xmalloc.h>
+#include <yaz/marcdisp.h>
+#include <it_key.h>
+#include <idzebra/isamb.h>
+#include <idzebra/dict.h>
+
+/* Batch of postings collected in memory until the next flush. */
+struct index_block {
+    NMEM nmem;                  /* arena for the postings and term text */
+    size_t no_entries;          /* number of postings on the list */
+    size_t current_entry;       /* read cursor used by code_read() */
+    size_t current_max;         /* arena size in bytes that triggers a flush */
+    struct index_term *terms;   /* linked list of postings, newest first */
+    struct index_term **ar;     /* sorted array view, valid during a flush */
+    int round;                  /* flush counter shown in the timing line */
+};
+
+/* One posting: an occurrence of term at position seqno in record docid. */
+struct index_term {
+    const char *term;
+    zint docid;
+    zint seqno;
+    int word_id;                /* dictionary id, assigned during a flush */
+    struct index_term *next;
+};
+
+/* Create an empty block whose arena may grow to `memory' megabytes before
+   index_block_check_flush() flushes it. */
+struct index_block *index_block_new(int memory)
+{
+    struct index_block *b = xmalloc(sizeof(*b));
+    b->no_entries = 0;
+    b->current_entry = 0;
+    /* widen before multiplying so that large limits do not overflow int */
+    b->current_max = (size_t) memory * 1024 * 1024;
+    b->terms = 0;
+    b->ar = 0;
+    b->nmem = nmem_create();
+    b->round = 0;
+    return b;
+}
+
+/* Free the block and its arena and clear the caller's pointer. */
+void index_block_destroy(struct index_block **bp)
+{
+    if (*bp)
+    {
+        nmem_destroy((*bp)->nmem);
+        xfree(*bp);
+        *bp = 0;
+    }
+}
+
+/* qsort() comparator: order postings by term, then docid, then seqno. */
+static int cmp_ar(const void *p1, const void *p2)
+{
+    const struct index_term *t1 = *(struct index_term * const *) p1;
+    const struct index_term *t2 = *(struct index_term * const *) p2;
+    int d = strcmp(t1->term, t2->term);
+    if (d)
+        return d;
+
+    if (t1->docid > t2->docid)
+        return 1;
+    else if (t1->docid < t2->docid)
+        return -1;
+    if (t1->seqno > t2->seqno)
+        return 1;
+    else if (t1->seqno < t2->seqno)
+        return -1;
+    return 0;
+}
+
+
+/* Read callback handed to isamb_merge(): copies the next sorted posting to
+   *dst as an it_key (word_id, docid, seqno), advances *dst and returns 1
+   with *insertMode set to 1.  Returns 0 when the block is exhausted. */
+int code_read(void *vp, char **dst, int *insertMode)
+{
+    struct index_block *b = (struct index_block *)vp;
+    struct index_term *t;
+    struct it_key key;
+
+    if (b->current_entry >= b->no_entries)
+        return 0;
+
+    t = b->ar[b->current_entry];
+    b->current_entry++;
+
+    key.len = 3;
+    key.mem[0] = t->word_id;
+    key.mem[1] = t->docid;
+    key.mem[2] = t->seqno;
+    key.mem[3] = 0;
+
+    memcpy(*dst, &key, sizeof(key));
+
+    (*dst) += sizeof(key);
+    *insertMode = 1;
+#if 0
+    yaz_log(YLOG_LOG, "returning " ZINT_FORMAT " " ZINT_FORMAT "\n",
+            key.mem[0], key.mem[1]);
+#endif
+    return 1;
+}
+
+/* Sort the buffered postings, give every distinct term a word id from the
+   dictionary (allocating ids for new terms), merge the postings into the
+   ISAMB and empty the block.  The word id counter and the ISAM position
+   are kept in the dictionary under the keys "_w" and "_i".  no_docs is
+   only used in the log message. */
+void index_block_flush(struct index_block *b, ISAMB isb, Dict dict,
+                       int no_docs)
+{
+    struct index_term *t = b->terms;
+    size_t i;
+    int word_id_seq = 0;
+    int no_words = 0, no_new_words = 0;
+    const char *dict_info = 0;
+    ISAM_P isamc_p = 0;
+
+#if HAVE_SYS_TIMES_H
+#if HAVE_SYS_TIME_H
+    struct tms tms1, tms2;
+    struct timeval start_time, end_time;
+    double usec;
+    times(&tms1);
+    gettimeofday(&start_time, 0);
+#endif
+#endif
+
+    /* avoid a zero-sized request when the block is empty */
+    b->ar = xmalloc(sizeof(*b->ar) * (b->no_entries ? b->no_entries : 1));
+    for (i = 0; i < b->no_entries; i++, t = t->next)
+    {
+        assert(t);
+        b->ar[i] = t;
+    }
+    assert(!t);
+
+    qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar);
+#if 0
+    for (i = 0; i < b->no_entries; i++)
+    {
+        printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n",
+               b->ar[i]->term, b->ar[i]->docid, b->ar[i]->seqno);
+    }
+#endif
+    dict_info = dict_lookup(dict, "_w");
+    if (dict_info)
+    {
+        assert(*dict_info == sizeof(word_id_seq));
+        memcpy(&word_id_seq, dict_info+1, sizeof(word_id_seq));
+    }
+
+    dict_info = dict_lookup(dict, "_i");
+    if (dict_info)
+    {
+        assert(*dict_info == sizeof(isamc_p));
+        memcpy(&isamc_p, dict_info+1, sizeof(isamc_p));
+    }
+
+    for (i = 0; i < b->no_entries; i++)
+    {
+        /* the array is sorted, so equal terms are adjacent */
+        if (i > 0 && strcmp(b->ar[i-1]->term, b->ar[i]->term) == 0)
+            b->ar[i]->word_id = b->ar[i-1]->word_id;
+        else
+        {
+            const char *term_info = dict_lookup(dict, b->ar[i]->term);
+            if (term_info)
+            {
+                memcpy(&b->ar[i]->word_id, term_info+1, sizeof(int));
+            }
+            else
+            {
+                word_id_seq++;
+                no_new_words++;
+                dict_insert(dict, b->ar[i]->term, sizeof(int), &word_id_seq);
+                b->ar[i]->word_id = word_id_seq;
+            }
+            no_words++;
+        }
+    }
+    dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq);
+
+    b->current_entry = 0;
+
+    if (b->no_entries)
+    {
+        ISAMC_I isamc_i;
+
+        isamc_i.clientData = b;
+        isamc_i.read_item = code_read;
+
+        isamb_merge (isb, &isamc_p, &isamc_i);
+
+        assert(isamc_p);
+        dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p);
+    }
+
+    /* no_entries is a size_t, so it must not be passed for %d */
+    yaz_log(YLOG_LOG, "Flushed %lu postings, %d/%d words, %d records",
+            (unsigned long) b->no_entries, no_words, no_new_words, no_docs);
+    xfree(b->ar);
+    b->ar = 0;
+    nmem_reset(b->nmem);
+    b->no_entries = 0;
+    b->terms = 0;
+
+#if HAVE_SYS_TIMES_H
+#if HAVE_SYS_TIME_H
+    b->round++;
+    gettimeofday(&end_time, 0);
+    times(&tms2);
+
+    usec = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
+        end_time.tv_usec - start_time.tv_usec;
+
+    printf("%3d %8.6f %5.2f %5.2f\n",
+           b->round,
+           usec / 1000000,
+           (double) (tms2.tms_utime - tms1.tms_utime)/100,
+           (double) (tms2.tms_stime - tms1.tms_stime)/100);
+#endif
+#endif
+
+}
+
+/* Flush the block when its arena has grown past the configured limit. */
+void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict,
+                             int no_docs)
+{
+    /* compare as size_t so that limits of 2 GB and more are not truncated */
+    size_t total = nmem_total(b->nmem);
+    size_t max = b->current_max;
+    if (total > max)
+    {
+        yaz_log(YLOG_LOG, "flush to disk total=%lu max=%lu",
+                (unsigned long) total, (unsigned long) max);
+        index_block_flush(b, isb, dict, no_docs);
+    }
+}
+
+/* Prepend a posting for term at (docid, seqno); the term text is copied
+   into the block's arena. */
+void index_block_add(struct index_block *b,
+                     const char *term, zint docid, zint seqno)
+{
+    struct index_term *t = nmem_malloc(b->nmem, sizeof(*t));
+    t->term = nmem_strdup(b->nmem, term);
+    t->docid = docid;
+    t->seqno = seqno;
+    t->next = b->terms;
+    b->terms = t;
+    b->no_entries++;
+}
+
+/* Print the command line synopsis and terminate with exit status 1. */
+void exit_usage(void)
+{
+    fprintf(stderr, "benchindex1 [-i] [-m memory] [file]\n");
+    exit(1);
+}
+
+/* Record one term occurrence and advance the position counter. */
+void index_term(struct index_block *b, const char *term,
+                zint docid, zint *seqno)
+{
+#if 0
+    printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n", term,
+           docid, *seqno);
+#endif
+    index_block_add(b, term, docid, *seqno);
+    (*seqno)++;
+}
+
+/* Split the text in wrbuf into lower-cased terms and index them for docid.
+   The first six characters of every line and the character following a
+   '$' are skipped; newlines and the punctuation set below end a term.
+   Terms that do not fit the buffer are truncated. */
+void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid)
+{
+    int nl = 1;
+    const char *cp = wrbuf_buf(wrbuf);
+    char term[4096];
+    size_t sz = 0;
+    zint seqno = 0;
+
+    while (*cp)
+    {
+        if (nl)
+        {
+            int i;
+            for (i = 0; i<6 && *cp; i++, cp++)
+                ;
+            /* a short last line ends here; strchr() below would match the
+               terminating NUL and cp would step past the end of the text */
+            if (!*cp)
+                break;
+        }
+        nl = 0;
+        if (*cp == '\n')
+        {
+            if (sz)
+            {
+                index_term(b, term, docid, &seqno);
+                sz = 0;
+            }
+            nl = 1;
+            cp++;
+        }
+        else if (*cp == '$' && cp[1])
+        {
+            if (sz)
+            {
+                index_term(b, term, docid, &seqno);
+                sz = 0;
+            }
+            cp += 2;
+        }
+        else if (strchr("$/-;,.:[]\"&(){} ", *cp))
+        {
+            if (sz)
+            {
+                index_term(b, term, docid, &seqno);
+                sz = 0;
+            }
+            cp++;
+        }
+        else
+        {
+            unsigned ch = *(const unsigned char *)cp;
+            /* keep one byte free for the terminating NUL written below */
+            if (sz < sizeof(term) - 1)
+            {
+                term[sz] = tolower(ch);
+                term[sz+1] = '\0';
+                sz++;
+            }
+            cp++;
+        }
+    }
+    if (sz)
+        index_term(b, term, docid, &seqno);
+}
+
+/* Read ISO2709 records from inf, convert each one to line format and index
+   it under the next document id.  The next free document id is loaded from
+   and written back to the dictionary key "_s".  Reading stops at end of
+   file, at a record with an implausible length, on a short read or when
+   the record cannot be written to the buffer. */
+void index_marc_from_file(ISAMB isb,
+                          Dict dict,
+                          FILE *inf,
+                          int memory,
+                          int verbose, int print_offset)
+{
+    yaz_marc_t mt = yaz_marc_create();
+    WRBUF wrbuf = wrbuf_alloc();
+    struct index_block *b = index_block_new(memory);
+    const char *dict_info = 0;
+    zint docid_seq = 1;
+    int no_docs = 0;
+
+    dict_info = dict_lookup(dict, "_s");
+    if (dict_info)
+    {
+        assert(*dict_info == sizeof(docid_seq));
+        memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
+    }
+
+    while (1)
+    {
+        size_t r;
+        char buf[100001];
+        int len, rlen;
+
+        /* NOTE(review): the wording of the three "<!-- ... -->" diagnostics
+           below was missing from the reviewed copy of this patch and has
+           been reconstructed to fit their argument lists - confirm. */
+        r = fread (buf, 1, 5, inf);
+        if (r < 5)
+        {
+            if (r && print_offset && verbose)
+                printf ("<!-- Extra %ld bytes at end of file -->\n",
+                        (long) r);
+            break;
+        }
+        while (*buf < '0' || *buf > '9')
+        {
+            int i;
+            long off = ftell(inf) - 5;
+            if (verbose || print_offset)
+                printf("<!-- Skipping bad byte %d (0x%02X) at offset "
+                       "%ld (0x%lx) -->\n",
+                       *buf & 0xff, *buf & 0xff,
+                       off, off);
+            for (i = 0; i<4; i++)
+                buf[i] = buf[i+1];
+            r = fread(buf+4, 1, 1, inf);
+            if (r < 1)
+                break;
+        }
+        if (r < 1)
+        {
+            if (verbose || print_offset)
+                printf ("<!-- End of file with data -->\n");
+            break;
+        }
+        len = atoi_n(buf, 5);
+        if (len < 25 || len > 100000)
+        {
+            long off = ftell(inf) - 5;
+            printf("Bad Length %ld read at offset %ld (%lx)\n",
+                   (long)len, (long) off, (long) off);
+            break;
+        }
+        rlen = len - 5;
+        r = fread (buf + 5, 1, rlen, inf);
+        if (r < (size_t) rlen)
+            break;
+        yaz_marc_read_iso2709(mt, buf, len);
+
+        if (yaz_marc_write_line(mt, wrbuf))
+            break;
+
+        index_wrbuf(b, wrbuf, docid_seq);
+        wrbuf_rewind(wrbuf);
+        docid_seq++;
+
+        no_docs++;
+        index_block_check_flush(b, isb, dict, no_docs);
+    }
+    index_block_flush(b, isb, dict, no_docs);
+    wrbuf_free(wrbuf, 1);
+    yaz_marc_destroy(mt);
+    index_block_destroy(&b);
+    /* docid_seq is the next free id, i.e. one more than the ids handed out */
+    yaz_log(YLOG_LOG, "Total " ZINT_FORMAT " documents", docid_seq - 1);
+    dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
+}
+
+/* Index the MARC file named on the command line (or stdin) into the ISAMB
+   "isamb" and the dictionary "dict".  -i calls bf_reset() first; -m sets
+   the flush limit in megabytes (default 5). */
+int main(int argc, char **argv)
+{
+    BFiles bfs;
+    ISAMB isb;
+    ISAMC_M method;
+    Dict dict;
+    int ret;
+    int reset = 0;
+    char *arg;
+    int memory = 5;
+    const char *fname = 0;
+    FILE *inf = stdin;
+
+    while ((ret = options("im:", argv, argc, &arg)) != -2)
+    {
+        switch(ret)
+        {
+        case 'm':
+            memory = atoi(arg);
+            if (memory < 0)
+            {
+                fprintf(stderr, "bad memory size: %s\n", arg);
+                exit_usage();
+            }
+            break;
+        case 'i':
+            reset = 1;
+            break;
+        case 0:
+            fname = arg;
+            break;
+        default:
+            fprintf(stderr, "bad option.\n");
+            exit_usage();
+        }
+    }
+
+    if (fname)
+    {
+        inf = fopen(fname, "rb");
+        if (!inf)
+        {
+            fprintf(stderr, "Cannot open %s\n", fname);
+            exit(1);
+        }
+    }
+    /* setup method (attributes) */
+    method.compare_item = key_compare;
+    method.log_item = key_logdump_txt;
+
+    method.codec.start = iscz1_start;
+    method.codec.decode = iscz1_decode;
+    method.codec.encode = iscz1_encode;
+    method.codec.stop = iscz1_stop;
+    method.codec.reset = iscz1_reset;
+
+    method.debug = 0;
+
+    /* create block system */
+    bfs = bfs_create(0, 0);
+    if (!bfs)
+    {
+        yaz_log(YLOG_WARN, "bfs_create failed");
+        exit(1);
+    }
+
+    if (reset)
+        bf_reset(bfs);
+
+    /* create isam handle */
+    isb = isamb_open (bfs, "isamb", 1, &method, 0);
+    if (!isb)
+    {
+        yaz_log(YLOG_WARN, "isamb_open failed");
+        exit(2);
+    }
+    dict = dict_open(bfs, "dict", 50, 1, 0, 4096);
+    if (!dict)
+    {
+        yaz_log(YLOG_WARN, "dict_open failed");
+        exit(3);
+    }
+
+    index_marc_from_file(isb, dict, inf, memory,
+                         0 /* verbose */ , 0 /* print_offset */);
+
+    dict_close(dict);
+    isamb_close(isb);
+
+    if (fname)
+        fclose(inf);
+    /* exit block system */
+    bfs_destroy(bfs);
+    exit(0);
+    return 0;
+}
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+
-- 
1.7.10.4