From: Heikki Levanto Date: Thu, 22 Jan 2004 15:40:25 +0000 (+0000) Subject: Not creating a temporary file when indexing, if there would only X-Git-Tag: ZEBRA.1.3.16~137 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=23b1af194b14847f3ba187e3c5c3015797ad3c4a Not creating a temporary file when indexing, if there would only be one temp file. Using the memory buffer directly instead. This is configurable, but the config is hard-coded as it is now. Will need to be in zebra.cfg. --- diff --git a/index/extract.c b/index/extract.c index ff89905..412bc66 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.148 2004-01-22 11:50:16 adam Exp $ +/* $Id: extract.c,v 1.149 2004-01-22 15:40:25 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -1282,7 +1282,7 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, #endif if (zh->reg->key_buf_used + 1024 > (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*)) - extract_flushWriteKeys (zh); + extract_flushWriteKeys (zh,0); ++(zh->reg->ptr_i); (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] = (char*)zh->reg->key_buf + zh->reg->key_buf_used; @@ -1319,24 +1319,60 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, assert (off == reckeys->buf_used); } -void extract_flushWriteKeys (ZebraHandle zh) +void extract_flushWriteKeys (ZebraHandle zh, int final) + /* optimizing: if final=1, and no files written yet */ + /* push the keys directly to merge, sidestepping the */ + /* temp file altogether. Speeds small updates */ { FILE *outf; char out_fname[200]; char *prevcp, *cp; struct encode_info encode_info; int ptr_i = zh->reg->ptr_i; + int temp_policy; #if SORT_EXTRA int i; #endif if (!zh->reg->key_buf || ptr_i <= 0) + { + logf (LOG_DEBUG, " nothing to flush section=%d buf=%p i=%d", + zh->reg->key_file_no, zh->reg->key_buf, ptr_i); + logf (LOG_DEBUG, " buf=%p ", + zh->reg->key_buf); + logf (LOG_DEBUG, " ptr=%d ",zh->reg->ptr_i); + logf (LOG_DEBUG, " reg=%p ",zh->reg); + return; + } (zh->reg->key_file_no)++; logf (LOG_LOG, "sorting section %d", (zh->reg->key_file_no)); + logf (LOG_DEBUG, " sort_buff at %p n=%d", + zh->reg->key_buf + zh->reg->ptr_top - ptr_i,ptr_i); #if !SORT_EXTRA qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i, sizeof(char*), key_qsort_compare); + + /* Case 1: always use temp files (old way) */ + /* Case 2: use temp files, if more than one (auto) */ + /* = if this is both the last and the first */ + /* Case 3: never bother with temp files (new) */ + temp_policy=2; + /* FIXME - will come from config file into zh */ + + if ( ( temp_policy ==3 ) || /* always from memory */ + ( ( temp_policy ==2 ) && /* automatic */ + (zh->reg->key_file_no == 1) && /* this is first time */ + (final) ) ) /* and last (=only) time */ + { /* go directly from memory */ + zh->reg->key_file_no =0; /* signal not to read files */ + zebra_index_merge(zh); + zh->reg->ptr_i = 0; + zh->reg->key_buf_used = 0; + return; /*!*/ + } + + /* Not doing directly from memory, write into a temp file */ extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no); if (!(outf = fopen (out_fname, "wb"))) diff --git a/index/index.h b/index/index.h index 208ef3b..54ffa52 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.103 2004-01-22 11:27:21 adam Exp $ +/* $Id: index.h,v 1.104 2004-01-22 15:40:25 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003 Index Data Aps @@ -420,6 +420,7 @@ int zebra_record_fetch (ZebraHandle zh, int sysno, int score, ODR stream, int *rec_lenp, char **basenamep); void extract_get_fname_tmp (ZebraHandle zh, char *fname, int no); + void zebra_index_merge (ZebraHandle zh); int buffer_extract_record (ZebraHandle zh, @@ -441,8 +442,7 @@ int extract_rec_in_mem (ZebraHandle zh, const char *recordType, int store_keys, int store_data, const char *match_criteria); #endif - -void extract_flushWriteKeys (ZebraHandle zh); +void extract_flushWriteKeys (ZebraHandle zh, int final); struct zebra_fetch_control { off_t offset_end; diff --git a/index/kinput.c b/index/kinput.c index a590587..b9d7305 100644 --- a/index/kinput.c +++ b/index/kinput.c @@ -1,4 +1,4 @@ -/* $Id: kinput.c,v 1.56 2003-06-23 15:35:25 adam Exp $ +/* $Id: kinput.c,v 1.57 2004-01-22 15:40:25 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -236,7 +236,7 @@ struct heap_info { int *ptr; int (*cmp)(const void *p1, const void *p2); struct zebra_register *reg; - + ZebraHandle zh; /* only used for raw reading that bypasses the heaps */ int no_diffs; int no_updates; int no_deletions; @@ -244,17 +244,34 @@ struct heap_info { int no_iterations; }; +static struct heap_info *key_heap_malloc() +{ /* malloc and clear it */ + struct heap_info *hi; + hi = (struct heap_info *) xmalloc (sizeof(*hi)); + hi->info.file = 0; + hi->info.buf = 0; + hi->heapnum = 0; + hi->ptr = 0; + hi->zh=0; + hi->no_diffs = 0; + hi->no_diffs = 0; + hi->no_updates = 0; + hi->no_deletions = 0; + hi->no_insertions = 0; + hi->no_iterations = 0; + return hi; +} + struct heap_info *key_heap_init (int nkeys, int (*cmp)(const void *p1, const void *p2)) { struct heap_info *hi; int i; - hi = (struct heap_info *) xmalloc (sizeof(*hi)); + hi = key_heap_malloc(); hi->info.file = (struct key_file **) - xmalloc (sizeof(*hi->info.file) * (1+nkeys)); + xmalloc (sizeof(*hi->info.file) * (1+nkeys)); hi->info.buf = (char **) xmalloc (sizeof(*hi->info.buf) * (1+nkeys)); - hi->heapnum = 0; hi->ptr = (int *) xmalloc (sizeof(*hi->ptr) * (1+nkeys)); hi->cmp = cmp; for (i = 0; i<= nkeys; i++) @@ -262,12 +279,15 @@ struct heap_info *key_heap_init (int nkeys, hi->ptr[i] = i; hi->info.buf[i] = (char *) xmalloc (INP_NAME_MAX); } - hi->no_diffs = 0; - hi->no_diffs = 0; - hi->no_updates = 0; - hi->no_deletions = 0; - hi->no_insertions = 0; - hi->no_iterations = 0; + return hi; +} + +struct heap_info *key_heap_init_buff ( ZebraHandle zh, + int (*cmp)(const void *p1, const void *p2)) +{ + struct heap_info *hi=key_heap_malloc(); + hi->cmp=cmp; + hi->zh=zh; return hi; } @@ -275,8 +295,10 @@ void key_heap_destroy (struct heap_info *hi, int nkeys) { int i; yaz_log (LOG_DEBUG, "key_heap_destroy"); - for (i = 0; i<=nkeys; i++) - xfree (hi->info.buf[i]); + yaz_log (LOG_DEBUG, "key_heap_destroy nk=%d",nkeys); + if (!hi->zh) + for (i = 0; i<=nkeys; i++) + xfree (hi->info.buf[i]); xfree (hi->info.buf); xfree (hi->ptr); @@ -338,12 +360,30 @@ static void key_heap_insert (struct heap_info *hi, const char *buf, int nbytes, } } +static int heap_read_one_raw (struct heap_info *hi, char *name, char *key) +{ + ZebraHandle zh=hi->zh; + int ptr_i = zh->reg->ptr_i--; + char *cp; + if (!ptr_i) + return 0; + cp=(zh->reg->key_buf)[zh->reg->ptr_top - ptr_i]; + logf (LOG_DEBUG, " raw: i=%d top=%d cp=%p", ptr_i, zh->reg->ptr_top,cp); + strcpy(name, cp); + memcpy(key, cp+strlen(name)+1, KEY_SIZE); + hi->no_iterations++; + return 1; +} + static int heap_read_one (struct heap_info *hi, char *name, char *key) { int n, r; char rbuf[INP_NAME_MAX]; struct key_file *kf; + if (hi->zh) /* bypass the heap stuff, we have a readymade buffer */ + return heap_read_one_raw(hi, name, key); + if (!hi->heapnum) return 0; n = hi->ptr[1]; @@ -868,40 +908,56 @@ void zebra_index_merge (ZebraHandle zh) struct heap_info *hi; struct progressInfo progressInfo; int nkeys = zh->reg->key_file_no; + int usefile; - if (nkeys < 0) + logf (LOG_DEBUG, " index_merge called with nk=%d b=%p", + nkeys, zh->reg->key_buf); + if ( (nkeys==0) && (zh->reg->key_buf==0) ) + return; /* nothing to merge - probably flush after end-trans */ + + usefile = (nkeys!=0); + + if (usefile) { - char fname[1024]; - nkeys = 0; - while (1) + if (nkeys < 0) { - extract_get_fname_tmp (zh, fname, nkeys+1); - if (access (fname, R_OK) == -1) - break; - nkeys++; + char fname[1024]; + nkeys = 0; + while (1) + { + extract_get_fname_tmp (zh, fname, nkeys+1); + if (access (fname, R_OK) == -1) + break; + nkeys++; + } + if (!nkeys) + return ; } - if (!nkeys) - return ; - } - kf = (struct key_file **) xmalloc ((1+nkeys) * sizeof(*kf)); - progressInfo.totalBytes = 0; - progressInfo.totalOffset = 0; - time (&progressInfo.startTime); - time (&progressInfo.lastTime); - for (i = 1; i<=nkeys; i++) - { - kf[i] = key_file_init (i, 8192, zh->res); - kf[i]->readHandler = progressFunc; - kf[i]->readInfo = &progressInfo; - progressInfo.totalBytes += kf[i]->length; - progressInfo.totalOffset += kf[i]->buf_size; + kf = (struct key_file **) xmalloc ((1+nkeys) * sizeof(*kf)); + progressInfo.totalBytes = 0; + progressInfo.totalOffset = 0; + time (&progressInfo.startTime); + time (&progressInfo.lastTime); + for (i = 1; i<=nkeys; i++) + { + kf[i] = key_file_init (i, 8192, zh->res); + kf[i]->readHandler = progressFunc; + kf[i]->readInfo = &progressInfo; + progressInfo.totalBytes += kf[i]->length; + progressInfo.totalOffset += kf[i]->buf_size; + } + hi = key_heap_init (nkeys, key_qsort_compare); + hi->reg = zh->reg; + + for (i = 1; i<=nkeys; i++) + if ((r = key_file_read (kf[i], rbuf))) + key_heap_insert (hi, rbuf, r, kf[i]); + } /* use file */ + else + { /* do not use file, read straight from buffer */ + hi = key_heap_init_buff (zh,key_qsort_compare); + hi->reg = zh->reg; } - hi = key_heap_init (nkeys, key_qsort_compare); - hi->reg = zh->reg; - - for (i = 1; i<=nkeys; i++) - if ((r = key_file_read (kf[i], rbuf))) - key_heap_insert (hi, rbuf, r, kf[i]); if (zh->reg->isams) heap_inps (hi); if (zh->reg->isamc) @@ -913,22 +969,28 @@ void zebra_index_merge (ZebraHandle zh) if (zh->reg->isamb) heap_inpb (hi); - for (i = 1; i<=nkeys; i++) + if (usefile) { - extract_get_fname_tmp (zh, rbuf, i); - unlink (rbuf); + for (i = 1; i<=nkeys; i++) + { + extract_get_fname_tmp (zh, rbuf, i); + unlink (rbuf); + } + for (i = 1; i<=nkeys; i++) + key_file_destroy (kf[i]); + xfree (kf); + } + if (hi->no_iterations) + { /* do not log if nothing happened */ + logf (LOG_LOG, "Iterations . . .%7d", hi->no_iterations); + logf (LOG_LOG, "Distinct words .%7d", hi->no_diffs); + logf (LOG_LOG, "Updates. . . . .%7d", hi->no_updates); + logf (LOG_LOG, "Deletions. . . .%7d", hi->no_deletions); + logf (LOG_LOG, "Insertions . . .%7d", hi->no_insertions); } - logf (LOG_LOG, "Iterations . . .%7d", hi->no_iterations); - logf (LOG_LOG, "Distinct words .%7d", hi->no_diffs); - logf (LOG_LOG, "Updates. . . . .%7d", hi->no_updates); - logf (LOG_LOG, "Deletions. . . .%7d", hi->no_deletions); - logf (LOG_LOG, "Insertions . . .%7d", hi->no_insertions); zh->reg->key_file_no = 0; key_heap_destroy (hi, nkeys); - for (i = 1; i<=nkeys; i++) - key_file_destroy (kf[i]); - xfree (kf); } diff --git a/index/zebraapi.c b/index/zebraapi.c index e4ea555..df8b8af 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -1,4 +1,4 @@ -/* $Id: zebraapi.c,v 1.116 2004-01-22 11:27:21 adam Exp $ +/* $Id: zebraapi.c,v 1.117 2004-01-22 15:40:25 heikki Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 Index Data Aps @@ -77,8 +77,8 @@ static void zebra_flush_reg (ZebraHandle zh) zh->errCode=0; zebraExplain_flush (zh->reg->zei, zh); - extract_flushWriteKeys (zh); - zebra_index_merge (zh); + extract_flushWriteKeys (zh,1 /* final */); + zebra_index_merge (zh ); } static struct zebra_register *zebra_register_open (ZebraService zs, @@ -258,6 +258,7 @@ struct zebra_register *zebra_register_open (ZebraService zs, const char *name, reg->zei = 0; reg->matchDict = 0; reg->key_file_no = 0; + reg->ptr_i=0; zebraRankInstall (reg, rank1_class); zebraRankInstall (reg, rankzv_class); diff --git a/test/gils/zebra1.cfg b/test/gils/zebra1.cfg index 4be5cb5..7902082 100644 --- a/test/gils/zebra1.cfg +++ b/test/gils/zebra1.cfg @@ -1,5 +1,5 @@ # Simple Zebra configuration file -# $Id: zebra1.cfg,v 1.4 2002-10-28 21:39:11 adam Exp $ +# $Id: zebra1.cfg,v 1.5 2004-01-22 15:40:25 heikki Exp $ # # Where the schema files, attribute files, etc are located. profilePath: .:../../tab @@ -16,3 +16,5 @@ recordtype: grs.sgml #recordId: (bib1,identifier-standard) isam: b register: reg:20M + +memmax: 1 diff --git a/test/gils/zebra2.cfg b/test/gils/zebra2.cfg index f11528d..313e793 100644 --- a/test/gils/zebra2.cfg +++ b/test/gils/zebra2.cfg @@ -1,5 +1,5 @@ # Simple Zebra configuration file -# $Id: zebra2.cfg,v 1.4 2002-10-28 21:39:11 adam Exp $ +# $Id: zebra2.cfg,v 1.5 2004-01-22 15:40:25 heikki Exp $ # # Where the schema files, attribute files, etc are located. profilePath: .:../../tab @@ -16,4 +16,9 @@ storeKeys: 1 recordId: file isam: b -register: reg:20M +register: reg:200M + +memmax: 1 +# do not create temp files when indexing +#indexchunk: 1M +