X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=isamc%2Fmerge-d.c;h=2912ecfaa7d7339dc9023dd5c7a02dbdc1888ebf;hp=29fc63025d949a5c03bbce2063659f15885855d7;hb=138b954cffe470bf0e62d812cae6e859aa57cdb5;hpb=e150e51a7e20a902e9fd2f11f00811f94f67d529 diff --git a/isamc/merge-d.c b/isamc/merge-d.c index 29fc630..2912ecf 100644 --- a/isamc/merge-d.c +++ b/isamc/merge-d.c @@ -1,97 +1,26 @@ -/* - * Copyright (c) 1996-1998, Index Data. - * See the file LICENSE for details. - * Heikki Levanto - * - * $Id: merge-d.c,v 1.25 1999-11-30 13:48:04 adam Exp $ - * - * bugs - * sinleton-bit has to be in the high end, not low, so as not to confuse - * ordinary small numbers, like in the next pointer.. - * - * missing - * - * optimize - * - study and optimize block sizes (later) - * - find a way to decide the size of an empty diffblock (after merge) - * - On allocating more blocks (in append and merge), check the order of - * blocks, and if needed, swap them. - * - Write a routine to save/load indexes into a block, save only as many - * bytes as needed (size, diff, diffindexes) - * - * - * caveat - * There is a confusion about the block addresses. cat or type is the category, - * pos or block is the block number. pp structures keep these two separate, - * and combine when saving the pp. The next pointer in the pp structure is - * also a combined address, but needs to be combined every time it is needed, - * and separated when the partss are needed... This is done with the isamd_ - * _block, _type, and _addr macros. The _addr takes block and type as args, - * in that order. This conflicts with the order these are often mentioned in - * the debug log calls, and other places, leading to small mistakes here - * and there. - * - * Needs cleaning! The way diff blocks are handled in append and reading is - * quite different, and likely to give maintenance problems. - * - * log levels (set isamddebug=x in zebra.cfg (or what ever cfg file you use) ) - * 0 = no logging. Default - * 1 = no logging here. isamd logs overall statistics - * 2 = Each call to isamd_append with start address and no more - * 3 = Start and type of append, start of merge, and result of append - * 4 = Block allocations - * 5 = Block-level operations (read/write) - * 6 = Details about diff blocks etc. - * 7 = Log each record as it passes the system (once) - * 8 = Log raw and (de)coded data - * 9 = Anything else that may be useful - * .. = Anything needed to hunt a specific bug - * (note that all tests in the code are like debug>3, which means 4 or above!) - * - * Design for the new and improved isamd - * Key points: - * - The first block is only diffs, no straight data - * - Additional blocks are straight data - * - When a diff block gets filled up, a data block is created by - * merging the diffs with the data - * - * Structure - * - Isamd_pp: buffer for diffs and for data - * keep both pos, type, and combined address - * routine to set the address - * - diffbuf: lengths as short ints, or bytes for small blocks - * - keys are of key_struct, not just a number of bytes. - * - * Routines - * - isamd_append - * - create_new_block if needed - * - append_diffs - * - load_diffs - * - get diffend, start encoding - * - while input data - * - encode it - * - if no room, then realloc block in larger size - * - if still no room, merge and exit - * - append in the block - * - * - merge - * - just as before, except that merges also input data directly - * - writes into new data blocks - * - * - * - isamd.c: load firstpp, load datablock - * save firstpp, save datablock - * - Readlength, writelength - handling right size of len fields - * - isamd_read_main_item: take also a merge input structure, and merge it too - * - prefilter: cache two inputs, and check if they cancel. - * - single-item optimization - * - * questions: Should we realloc firstblocks in a different size as the main - * blocks. Makes a sideways seek, which is bound to be slowe. But saves some - * update time. Compromise: alloc the first one in the size of the datablock, - * but increase if necessary. Large blocks get a large diff, ok. Small ones - * may get an extra seek in read, but save merges. - */ +/* $Id: merge-d.c,v 1.30 2003-03-05 16:41:10 adam Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + + #define NEW_ISAM_D 1 /* not yet ready to delete the old one! */ @@ -114,11 +43,11 @@ struct ISAMD_DIFF_s { int difftype; }; -#define DT_NONE 0 // no diff, marks end of sequence -#define DT_DIFF 1 // ordinarry diff -#define DT_MAIN 2 // main data -#define DT_INPU 3 // input data to be merged -#define DT_DONE 4 // done with all input here +#define DT_NONE 0 /* no diff, marks end of sequence */ +#define DT_DIFF 1 /* ordinarry diff */ +#define DT_MAIN 2 /* main data */ +#define DT_INPU 3 /* input data to be merged */ +#define DT_DONE 4 /* done with all input here */ @@ -227,13 +156,27 @@ static int filter_isempty(FILTER F) return ( (0 == F->r1) && (0 == F->r2)) ; } +#if 0 static int filter_only_one(FILTER F) { return ( (0 != F->r1) && (0 == F->r2)); } +#endif - - +/* We may need backfilling, if we read a lonely key to make */ +/* a singleton, but its bitw will not fit in. Then we need to */ +/* process it normally, which means reading it again. So we */ +/* need to unread it first. Luckily the filter is empty at that */ +/* point */ +#if 0 +static void filter_backfill(FILTER F, struct it_key *k, int mode) +{ + assert(F->r1 == FILTER_NOTYET ); /* not overwriting data! */ + F->k1=*k; + F->m1=mode; + F->r1=1; /* ok read */ +} +#endif /*************************************************************** * Singleton encoding @@ -251,6 +194,7 @@ static int filter_only_one(FILTER F) int is_singleton(ISAMD_P ipos) { + return 0; /* no singletons any more */ return ( ipos != 0 ) && ( ipos & SINGLETON_BIT ); } @@ -258,6 +202,7 @@ int is_singleton(ISAMD_P ipos) int singleton_encode(struct it_key *k) /* encodes the key into one int. If it does not fit, returns 0 */ { + return 0; /* no more singletons */ if ( (k->sysno & DEC_MASK(DEC_SYSBITS) ) != k->sysno ) return 0; /* no room dor sysno */ if ( (k->seqno & DEC_MASK(DEC_SYSBITS) ) != k->seqno ) @@ -432,7 +377,7 @@ static void getDiffInfo(ISAMD_PP pp ) if (0==pp->diffinfo[i].maxidx) { - if (pp->is->method->debug > 5) //!!! 4 + if (pp->is->method->debug > 5) /* !!! 4 */ logf(LOG_LOG,"isamd_getDiffInfo:End mark at ix=%d n=%d", diffidx, i); return; /* end marker */ @@ -572,7 +517,7 @@ int isamd_read_item_merge ( ; /* find last diff */ if (p_key) { /* we have an extra item to inject into the merge */ - if (pp->is->method->debug >9) //!!!!! + if (pp->is->method->debug >9) /* !!!!! */ logf(LOG_LOG,"isamd_read_item: going to merge with %d.%d", p_key->sysno, p_key->seqno); pp->diffinfo[i].key = *p_key; /* the key merge could not handle */ @@ -773,8 +718,9 @@ int isamd_read_item (ISAMD_PP pp, char **dst) static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ struct it_key *p_key, /* the data item that didn't fit*/ - /* ISAMD_I data) */ /* more input data comes here */ - FILTER filt) /* more input data arriving here */ + FILTER filt, /* more input data arriving here */ + char *dictentry, /* the thin in the dictionary */ + int dictlen) /* and its size */ { int diffidx; int killblk=0; @@ -783,7 +729,7 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ int r_more = 1; ISAMD_PP pp; ISAMD_PP readpp=firstpp; - int retval=0; + int retpos=0; int diffcat = firstpp->cat; /* keep the category of the diffblock even */ /* if it is going to be empty now. */ /* Alternative: Make it the minimal, and */ @@ -794,8 +740,6 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ /* set up diffs as they should be for reading */ diffidx = ISAMD_BLOCK_OFFSET_1; - //readpp->diffbuf=readpp->buf; // diffinfo has to duplicate it! - //getDiffInfo(readpp); // first read will make the diffinfo, at init if (readpp->is->method->debug >4) logf(LOG_LOG,"isamd_merge: f=%d=%d:%d n=%d=%d:%d", @@ -814,7 +758,6 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ r_ptr= (char *) &r_key; -/* r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, data); */ r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, filt); if (!r_more) { /* oops, all data has been deleted! what to do??? */ @@ -829,13 +772,14 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ /* set up the new blocks for simple writing */ - firstpp=isamd_pp_open(readpp->is,isamd_addr(0, diffcat)); + /* firstpp=isamd_pp_open(readpp->is,isamd_addr(0, diffcat)); */ + firstpp=isamd_pp_create(readpp->is, diffcat); firstpp->pos=isamd_alloc_block(firstpp->is,diffcat); if (readpp->is->method->debug >3) logf(LOG_LOG,"isamd_merge: allocated new firstpp %d=%d:%d", isamd_addr(firstpp->pos,firstpp->cat), firstpp->cat, firstpp->pos ); - pp=isamd_pp_open(readpp->is,isamd_addr(0,readpp->is->max_cat) ); + pp=isamd_pp_create(readpp->is,readpp->is->max_cat ); pp->offset=pp->size=ISAMD_BLOCK_OFFSET_N; while (r_more) @@ -862,9 +806,6 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ } /* while read */ -// firstpp->diffs=0; - - isamd_reduceblock(pp); /* reduce size if possible */ if (0==firstpp->next) firstpp->next = isamd_addr(pp->pos,pp->cat); @@ -888,10 +829,17 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ firstpp->size = firstpp->offset = ISAMD_BLOCK_OFFSET_1; /* nothing there */ memset(firstpp->buf,'\0',firstpp->is->method->filecat[firstpp->cat].bsize); save_first_pp(firstpp); - retval = isamd_addr(firstpp->pos, firstpp->cat); + retpos = isamd_addr(firstpp->pos, firstpp->cat); isamd_pp_close(firstpp); - return retval; + /* Create the dict entry */ + /*!*/ /* it could be this could go in the dict as well, if there's */ + /* been really many deletes. Somehow I suspect that is not the */ + /* case. FIXME: Collect statistics and see if needed */ + dictentry[0]=0; /* mark as a real isam */ + memcpy(dictentry+1, &retpos, sizeof(ISAMD_P)); + dictlen=sizeof(ISAMD_P)+1; + return dictlen; } /* merge */ @@ -906,10 +854,10 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ static int append_diffs( ISAMD is, - ISAMD_P ipos, - /*ISAMD_I data)*/ + char *dictentry, int dictlen, FILTER filt) { + ISAMD_P ipos; struct it_key i_key; /* one input item */ char *i_item = (char *) &i_key; /* same as chars */ char *i_ptr=i_item; @@ -925,26 +873,31 @@ static int append_diffs( char *c_ptr = codebuff; int codelen; int merge_rc; - int retval=0; + ISAMD_P retpos; + int dsize; - if (0==ipos) + if (0==dictlen) { - firstpp=isamd_pp_open(is, isamd_addr(0,0) ); + firstpp=isamd_pp_create(is, 0 ); firstpp->size=firstpp->offset=ISAMD_BLOCK_OFFSET_1; /* create in smallest category, will expand later */ ++(is->no_fbuilds); } else { - firstpp=isamd_pp_open(is, ipos); + firstpp=isamd_pp_open(is, dictentry, dictlen); + if (dictentry[0] ) + ipos=0; + else + memcpy(&ipos,dictentry+1,sizeof(ISAMD_P)); ++(is->no_appds); } if (is->method->debug >2) - logf(LOG_LOG,"isamd_appd: Start ipos=%d=%d:%d n=%d=%d:%d nk=%d", + logf(LOG_LOG,"isamd_appd: Start ipos=%d=%d:%d n=%d=%d:%d nk=%d sz=%d", ipos, isamd_type(ipos), isamd_block(ipos), firstpp->next, isamd_type(firstpp->next), isamd_block(firstpp->next), - firstpp->numKeys); + firstpp->numKeys, firstpp->size); maxsize = is->method->filecat[firstpp->cat].bsize; difflenidx = diffidx = firstpp->size; @@ -952,7 +905,6 @@ static int append_diffs( diffidx+=sizeof(int); /* difflen will be stored here */ /* read first input */ - //i_ptr = i_item; //!!! i_more = filter_read(filt, &i_key, &i_mode); /* i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); */ @@ -999,10 +951,10 @@ static int append_diffs( { /* max size - can't help, need to merge it */ if (is->method->debug >7) logf(LOG_LOG,"isamd_appd: need to merge"); - if (is->method->debug >9) //!!!!! + if (is->method->debug >9) /* !!!!! */ logf(LOG_LOG,"isamd_appd: going to merge with m=%d %d.%d", i_mode, i_key.sysno, i_key.seqno); - merge_rc = merge (firstpp, &i_key, filt); + merge_rc = merge (firstpp, &i_key, filt, dictentry, dictlen); if (0!=merge_rc) return merge_rc; /* merge handled them all ! */ assert(!"merge returned zero ??"); @@ -1050,13 +1002,40 @@ static int append_diffs( while ( (difflenidx-diffidx<=sizeof(int)+1) && (difflenidxbuf[difflenidx++]='\0'; - if (0==firstpp->pos) /* need to (re)alloc the block */ - firstpp->pos = isamd_alloc_block(is, firstpp->cat); + if (firstpp->numKeys==0) + { + /* FIXME: Release blocks that may be allocated !!! */ + return 0; /* don't bother storing this! */ + } - retval = save_first_pp( firstpp ); - isamd_pp_close(firstpp); + dsize=diffidx-ISAMD_BLOCK_OFFSET_1; + /* logf(LOG_LOG,"!! nxt=%d diffidx=%d ds=%d", + firstpp->next, diffidx, dsize); */ + + if ( (0==firstpp->next) && (dsize numKeys < 128); + assert(firstpp->numKeys >0); + /* actually, 255 is good enough, but sign mismatches... */ + /* in real life, 4-5 is as much as we can hope for, as long */ + /* as ISAMD_MAX_DICT_LEN is reasonably small (8) */ + dictentry[0]=firstpp->numKeys; + memcpy(dictentry+1, firstpp->buf+ISAMD_BLOCK_OFFSET_1, dsize); + dictlen=dsize+1; + } + else + { + if (0==firstpp->pos) /* need to (re)alloc the block */ + firstpp->pos = isamd_alloc_block(is, firstpp->cat); + retpos = save_first_pp( firstpp ); + isamd_pp_close(firstpp); + dictentry[0]=0; /* mark as a real isam */ + memcpy(dictentry+1, &retpos, sizeof(ISAMD_P)); + dictlen=sizeof(ISAMD_P)+1; + } - return retval; + return dictlen; } /* append_diffs */ @@ -1066,24 +1045,23 @@ static int append_diffs( * isamd_append itself *************************************************************/ -ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) +int isamd_append (ISAMD is, char *dictentry, int dictlen, ISAMD_I data) +/*ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) */ { FILTER F = filter_open(is,data); - ISAMD_P rc=0; + int newlen=0; - int olddebug= is->method->debug; - if (ipos == 7320) - is->method->debug = 99; /*!*/ - if ( filter_isempty(F) ) /* can be, if del-ins of the same */ { if (is->method->debug >3) - logf(LOG_LOG,"isamd_appd: nothing to do for %d=",ipos); + logf(LOG_LOG,"isamd_appd: nothing to do "); filter_close(F); ++(is->no_non); - return ipos; /* without doing anything at all */ + return dictlen; /* without doing anything at all */ } +#ifdef SKIPTHIS + /* The old way to handle singletons */ if ( ( 0==ipos) && filter_only_one(F) ) { struct it_key k; @@ -1091,6 +1069,12 @@ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) filter_read(F,&k,&mode); assert(mode); rc = singleton_encode(&k); + if (!rc) + { + if (is->method->debug >9) + logf(LOG_LOG,"isamd_appd: singleton didn't fit, backfilling"); + filter_backfill(F,&k, mode); + } if (is->method->debug >9) logf(LOG_LOG,"isamd_appd: singleton %d (%x)", rc,rc); @@ -1098,20 +1082,14 @@ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) is->no_singles++; assert ( (rc==0) || is_singleton(rc) ); } - if ( 0==rc) /* either not single, or it did not fit */ - { - rc = append_diffs(is,ipos,F); - assert ( ! is_singleton(rc) ); - /* can happen if we run out of bits, so that block numbers overflow */ - /* to SINGLETON_BIT */ - } + newlen = append_diffs(is,ipos,F); +#endif + newlen = append_diffs(is,dictentry,dictlen,F); filter_close(F); if (is->method->debug >2) - logf(LOG_LOG,"isamd_appd: ret %d=%x (%d=%x)", - rc,rc,ipos,ipos); - is->method->debug=olddebug; /*!*/ - return rc; + logf(LOG_LOG,"isamd_appd: ret len=%d ", newlen); + return newlen; } /* isamd_append */ @@ -1122,7 +1100,24 @@ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) /* * $Log: merge-d.c,v $ - * Revision 1.25 1999-11-30 13:48:04 adam + * Revision 1.30 2003-03-05 16:41:10 adam + * Fix GCC warnings + * + * Revision 1.29 2002/11/26 22:18:34 adam + * Remove // comments + * + * Revision 1.28 2002/08/02 19:26:56 adam + * Towards GPL + * + * Revision 1.27 2002/07/12 18:12:21 heikki + * Isam-D now stores small entries directly in the dictionary. + * Needs more tuning and cleaning... + * + * Revision 1.26 2002/07/11 16:16:00 heikki + * Fixed a bug in isamd, failed to store a single key when its bits + * did not fit into a singleton. + * + * Revision 1.25 1999/11/30 13:48:04 adam * Improved installation. Updated for inclusion of YAZ header files. * * Revision 1.24 1999/10/05 09:57:40 heikki