X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=isamc%2Fmerge-d.c;h=29fc63025d949a5c03bbce2063659f15885855d7;hb=e150e51a7e20a902e9fd2f11f00811f94f67d529;hp=e31adaa5436a8eeb271981e66cb373acbe5c2409;hpb=b918342ded2c57bcd57bf7af6748c17bce4d1cf6;p=idzebra-moved-to-github.git diff --git a/isamc/merge-d.c b/isamc/merge-d.c index e31adaa..29fc630 100644 --- a/isamc/merge-d.c +++ b/isamc/merge-d.c @@ -3,33 +3,22 @@ * See the file LICENSE for details. * Heikki Levanto * - * $Id: merge-d.c,v 1.19 1999-09-13 13:28:28 heikki Exp $ + * $Id: merge-d.c,v 1.25 1999-11-30 13:48:04 adam Exp $ + * + * bugs + * sinleton-bit has to be in the high end, not low, so as not to confuse + * ordinary small numbers, like in the next pointer.. * * missing * * optimize - * - Input filter: Eliminate del-ins pairs, tell if only one entry (or none) - * - single-entry optimizing (keep the one entry in the dict, no block) * - study and optimize block sizes (later) - * - Clean up the different ways diffs are handled in writing and reading - * - Keep a merge-count in the firstpp, and if the block has already been - * merged, reduce it to a larger size even if it could fit in a small one! - * - Keep minimum freespace in the category table, and use that in reduce! - * - pass a space-needed for separateDiffBlock and reduce to be able to - * reserve more room for diffs, or to force a separate (larger?) block - * - Idea: Simplify the structure, so that the first block is always diffs. - * On small blocks, that is all we have. Once a block has been merged, we - * allocate the first main block and a (new) firstblock ffor diffs. From - * that point on the word has two blocks for it. - * - On allocating more blocks (in append), check the order of blocks, and - * if needed, swap them. - * - In merge, merge also with the input data. + * - find a way to decide the size of an empty diffblock (after merge) + * - On allocating more blocks (in append and merge), check the order of + * blocks, and if needed, swap them. * - Write a routine to save/load indexes into a block, save only as many * bytes as needed (size, diff, diffindexes) * - * bugs - * - Some confusion about opening pp's, how to set offset etc. Maybe it'd be - * best to load both diffs and first main block? * * caveat * There is a confusion about the block addresses. cat or type is the category, @@ -111,7 +100,7 @@ #include #include #include -#include +#include #include "../index/index.h" #include "isamd-p.h" @@ -131,6 +120,166 @@ struct ISAMD_DIFF_s { #define DT_INPU 3 // input data to be merged #define DT_DONE 4 // done with all input here + + +/*************************************************************** + * Input preprocess filter + ***************************************************************/ + + +#define FILTER_NOTYET -1 /* no data read in yet, to be done */ + +struct ISAMD_FILTER_s { + ISAMD_I data; /* where the data comes from */ + ISAMD is; /* for debug flags */ + struct it_key k1; /* the next item to be returned */ + int m1; /* mode for k1 */ + int r1; /* result for read of k1, or NOTYET */ + struct it_key k2; /* the one after that */ + int m2; + int r2; +}; + +typedef struct ISAMD_FILTER_s *FILTER; + + +void filter_fill(FILTER F) +{ + while ( (F->r1 == FILTER_NOTYET) || (F->r2 == FILTER_NOTYET) ) + { + if (F->r1==FILTER_NOTYET) + { /* move data forward in the filter */ + F->k1 = F->k2; + F->m1 = F->m2; + F->r1 = F->r2; + if ( 0 != F->r1 ) /* not eof */ + F->r2 = FILTER_NOTYET; /* say we want more */ + if (F->is->method->debug > 9) + logf(LOG_LOG,"filt_fill: shift %d.%d m=%d r=%d", + F->k1.sysno, + F->k1.seqno, + F->m1, F->r1); + } + if (F->r2==FILTER_NOTYET) + { /* read new bottom value */ + char *k_ptr = (char*) &F->k2; + F->r2 = (F->data->read_item)(F->data->clientData, &k_ptr, &F->m2); + if (F->is->method->debug > 9) + logf(LOG_LOG,"filt_fill: read %d.%d m=%d r=%d", + F->k2.sysno, F->k2.seqno, F->m2, F->r2); + } + if ( (F->k1.sysno == F->k2.sysno) && + (F->k1.seqno == F->k2.seqno) && + (F->m1 != F->m2) && + (F->r1 >0 ) && (F->r2 >0) ) + { /* del-ins pair of same key (not eof) , ignore both */ + if (F->is->method->debug > 9) + logf(LOG_LOG,"filt_fill: skipped %d.%d m=%d/%d r=%d/%d", + F->k1.sysno, F->k1.seqno, + F->m1,F->m2, F->r1,F->r2); + F->r1 = FILTER_NOTYET; + F->r2 = FILTER_NOTYET; + } + } /* while */ +} /* filter_fill */ + + +FILTER filter_open( ISAMD is, ISAMD_I data ) +{ + FILTER F = (FILTER) xmalloc(sizeof(struct ISAMD_FILTER_s)); + F->is = is; + F->data = data; + F->k1.sysno=0; + F->k1.seqno=0; + F->k2=F->k1; + F->m1 = F->m2 = 0; + F->r1 = F->r2 = FILTER_NOTYET; + filter_fill(F); + return F; +} + +static void filter_close (FILTER F) +{ + xfree(F); +} + +static int filter_read( FILTER F, + struct it_key *k, + int *mode) +{ + int res; + filter_fill(F); + if (F->is->method->debug > 9) + logf(LOG_LOG,"filt_read: reading %d.%d m=%d r=%d", + F->k1.sysno, F->k1.seqno, F->m1, F->r1); + res = F->r1; + if(res) + { + *k = F->k1; + *mode= F->m1; + } + F->r1 = FILTER_NOTYET; + return res; +} + +static int filter_isempty(FILTER F) +{ + return ( (0 == F->r1) && (0 == F->r2)) ; +} + +static int filter_only_one(FILTER F) +{ + return ( (0 != F->r1) && (0 == F->r2)); +} + + + + +/*************************************************************** + * Singleton encoding + ***************************************************************/ +/* When there is only a single item, we don't allocate a block + * for it, but code it in the directory entry directly, if it + * fits. + */ + +#define DEC_SYSBITS 15 +#define DEC_SEQBITS 15 +#define DEC_MASK(n) ((1<<(n))-1) + +#define SINGLETON_BIT (1<<(DEC_SYSBITS+DEC_SEQBITS+1)) + +int is_singleton(ISAMD_P ipos) +{ + return ( ipos != 0 ) && ( ipos & SINGLETON_BIT ); +} + + +int singleton_encode(struct it_key *k) +/* encodes the key into one int. If it does not fit, returns 0 */ +{ + if ( (k->sysno & DEC_MASK(DEC_SYSBITS) ) != k->sysno ) + return 0; /* no room dor sysno */ + if ( (k->seqno & DEC_MASK(DEC_SYSBITS) ) != k->seqno ) + return 0; /* no room dor sysno */ + return (k->sysno | (k->seqno << DEC_SYSBITS) ) | SINGLETON_BIT; +} + +void singleton_decode (int code, struct it_key *k) +{ + assert (code & SINGLETON_BIT); + k->sysno = code & DEC_MASK(DEC_SYSBITS); + code = code >> DEC_SYSBITS; + k->seqno = code & DEC_MASK(DEC_SEQBITS); +} + + +/*************************************************************** + * General support routines + ***************************************************************/ + + + static char *hexdump(unsigned char *p, int len, char *buff) { static char localbuff[128]; char bytebuff[8]; @@ -145,29 +294,7 @@ static char *hexdump(unsigned char *p, int len, char *buff) { return buff; } -/*************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - *************************************************************** - ***************************************************************/ - -/*************************************************************** - * General support routines - ***************************************************************/ static void isamd_reduceblock(ISAMD_PP pp) /* takes a large block, and reduces its category if possible */ @@ -292,7 +419,7 @@ static void getDiffInfo(ISAMD_PP pp ) if ( (pp->is->method->debug > 0) && (pp->diffinfo[i].maxidx > pp->is->method->filecat[pp->cat].bsize) ) - { /* bug-hunting, this fails on some long runs that log too much */ + { logf(LOG_LOG,"Bad MaxIx!!! %s:%d: diffidx=%d", __FILE__,__LINE__, diffidx); logf(LOG_LOG,"i=%d maxix=%d bsz=%d", i, pp->diffinfo[i].maxidx, @@ -424,7 +551,8 @@ int isamd_read_item_merge ( ISAMD_PP pp, char **dst, struct it_key *p_key, /* the data item that didn't fit*/ - ISAMD_I data) /* more input data comes here */ + /* ISAMD_I data) */ /* more input data comes here */ + FILTER filt) /* more input data comes here */ { /* The last two args can be null for ordinary reads */ char *keyptr; char *codeptr; @@ -460,7 +588,7 @@ int isamd_read_item_merge ( p_key->sysno=p_key->seqno=0; /* used it up */ } - if (data) + if (filt) { /* we have a whole input stream to inject */ pp->diffinfo[i].difftype=DT_INPU; } @@ -469,41 +597,7 @@ int isamd_read_item_merge ( while (retry) { - retry=0; - -#ifdef SKIPTHIS - - if (0==pp->diffinfo[0].key.sysno) - { /* 0 is special case, main data. */ - oldoffs=pp->offset; - keyptr=(char*) &(pp->diffinfo[0].key); - pp->diffinfo[0].mode = ! isamd_read_main_item(pp,&keyptr); - if (pp->is->method->debug > 7) - logf(LOG_LOG,"isamd_read_item: read main at %d-%d %d.%d (%x.%x)", - oldoffs,pp->offset, - pp->diffinfo[0].key.sysno, pp->diffinfo[0].key.seqno, - pp->diffinfo[0].key.sysno, pp->diffinfo[0].key.seqno); - } /* get main data */ - - if ( (0==pp->diffinfo[1].key.sysno) && (-1==pp->diffinfo[1].maxidx) ) - { /* 1 is another special case, the input data at merge */ - keyptr = (char *) &pp->diffinfo[1].key; - i = (*data->read_item)(data->clientData, &keyptr, &pp->diffinfo[1].mode); - if (!i) - { /* did not get it */ - pp->diffinfo[1].key.sysno=0; - pp->diffinfo[1].maxidx=0; /* signal the end */ - } - if (pp->is->method->debug >7) - logf(LOG_LOG,"merge: read diff m=%d %d.%d (%x.%x)", - pp->diffinfo[1].mode, - pp->diffinfo[1].key.sysno, pp->diffinfo[1].key.seqno, - pp->diffinfo[1].key.sysno, pp->diffinfo[1].key.seqno ); - } /* get input data */ - -#endif // SKIPTHIS - - + retry=0; winner = 0; for (i=0; (!retry) && (pp->diffinfo[i].difftype); i++) { @@ -560,7 +654,9 @@ int isamd_read_item_merge ( else if (pp->diffinfo[i].difftype==DT_INPU) { keyptr = (char *) &pp->diffinfo[i].key; - rc = (*data->read_item)(data->clientData, &keyptr, &pp->diffinfo[i].mode); + /* rc = (*data->read_item)(data->clientData, &keyptr, &pp->diffinfo[i].mode); */ + rc = filter_read(filt, &pp->diffinfo[i].key, + &pp->diffinfo[i].mode); if (!rc) { /* did not get it */ pp->diffinfo[i].key.sysno=0; @@ -655,8 +751,11 @@ int isamd_read_item_merge ( assert(winner==0); /* if nothing found, nothing comes from a diff */ cmp= 0; /* eof */ } - if (pp->is->method->debug >9) - logf(LOG_LOG,"mergeDB4: sysno[1]=%d", pp->diffinfo[1].key.sysno); /*!*/ + if (cmp) + ++(pp->is->no_read_keys); + else + ++(pp->is->no_read_eof); + return cmp; } /* isamd_read_item */ @@ -674,7 +773,8 @@ int isamd_read_item (ISAMD_PP pp, char **dst) static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ struct it_key *p_key, /* the data item that didn't fit*/ - ISAMD_I data) /* more input data comes here */ + /* ISAMD_I data) */ /* more input data comes here */ + FILTER filt) /* more input data arriving here */ { int diffidx; int killblk=0; @@ -690,7 +790,7 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ /* resize later. Saves disk, but will lead */ /* into bad seeks. */ - ++(readpp->is->files[0].no_merges); + ++(readpp->is->no_merges); /* set up diffs as they should be for reading */ diffidx = ISAMD_BLOCK_OFFSET_1; @@ -714,7 +814,8 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ r_ptr= (char *) &r_key; - r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, data); +/* r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, data); */ + r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, filt); if (!r_more) { /* oops, all data has been deleted! what to do??? */ /* never mind, we have at least one more delta to add to the block */ @@ -724,7 +825,6 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ if (readpp->is->method->debug >5) logf(LOG_LOG,"isamd_merge:all data has been deleted (nk=%d) ", readpp->numKeys); - //assert (readpp->numKeys == 0); /* no longer true! */ } @@ -757,12 +857,12 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ /* (try to) read next item */ r_ptr= (char *) &r_key; - r_more = isamd_read_item_merge( readpp, &r_ptr,0,data); + r_more = isamd_read_item_merge( readpp, &r_ptr,0,filt); } /* while read */ - firstpp->diffs=0; +// firstpp->diffs=0; isamd_reduceblock(pp); /* reduce size if possible */ @@ -804,7 +904,11 @@ static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */ -static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) +static int append_diffs( + ISAMD is, + ISAMD_P ipos, + /*ISAMD_I data)*/ + FILTER filt) { struct it_key i_key; /* one input item */ char *i_item = (char *) &i_key; /* same as chars */ @@ -828,12 +932,12 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) firstpp=isamd_pp_open(is, isamd_addr(0,0) ); firstpp->size=firstpp->offset=ISAMD_BLOCK_OFFSET_1; /* create in smallest category, will expand later */ - ++(is->files[0].no_fbuilds); + ++(is->no_fbuilds); } else { firstpp=isamd_pp_open(is, ipos); - ++(is->files[0].no_appds); + ++(is->no_appds); } if (is->method->debug >2) @@ -848,8 +952,9 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) diffidx+=sizeof(int); /* difflen will be stored here */ /* read first input */ - i_ptr = i_item; - i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); + //i_ptr = i_item; //!!! + i_more = filter_read(filt, &i_key, &i_mode); + /* i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); */ if (is->method->debug >6) logf(LOG_LOG,"isamd_appd: start m=%d %d.%d=%x.%x: %d", @@ -865,7 +970,7 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) i_key.seqno = i_key.seqno * 2 + i_mode; c_ptr=codebuff; - i_ptr=i_item; + i_ptr=i_item; (*is->method->code_item)(ISAMD_ENCODE, firstpp->decodeClientData, &c_ptr, &i_ptr); codelen = c_ptr - codebuff; @@ -877,31 +982,42 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) if (diffidx + codelen > maxsize ) { /* block full */ - if (firstpp->cat < firstpp->is->max_cat) - { /* just increase the block size */ + while ( (firstpp->cat < firstpp->is->max_cat) && + (diffidx + codelen > maxsize) ) + { /* try to increase the block size */ if (firstpp->pos > 0) /* free the old block if allocated */ isamd_release_block(is, firstpp->cat, firstpp->pos); ++firstpp->cat; maxsize = is->method->filecat[firstpp->cat].bsize; firstpp->pos=0; /* need to allocate it when saving */ if (is->method->debug >3) - logf(LOG_LOG,"isamd_appd: increased diff block to %d (%d)", + logf(LOG_LOG,"isamd_appd: increased diff block sz to %d (%d)", firstpp->cat, maxsize); } - else - { /* max size already - can't help, need to merge it */ + if ((firstpp->cat >= firstpp->is->max_cat) && + (diffidx + codelen > maxsize) ) + { /* max size - can't help, need to merge it */ if (is->method->debug >7) - logf(LOG_LOG,"isamd_appd: block full"); + logf(LOG_LOG,"isamd_appd: need to merge"); if (is->method->debug >9) //!!!!! logf(LOG_LOG,"isamd_appd: going to merge with m=%d %d.%d", i_mode, i_key.sysno, i_key.seqno); - merge_rc = merge (firstpp, &i_key, data); + merge_rc = merge (firstpp, &i_key, filt); if (0!=merge_rc) return merge_rc; /* merge handled them all ! */ assert(!"merge returned zero ??"); } /* need to merge */ } /* block full */ - + + if (!( diffidx+codelen <= maxsize )) + { /* bug hunting */ + logf(LOG_LOG,"OOPS, diffidx problem: d=%d c=%d s=%d > m=%d", + diffidx, codelen, diffidx+codelen, maxsize); + logf(LOG_LOG,"ipos=%d f=%d=%d:%d", + ipos, + isamd_addr(firstpp->pos, firstpp->cat), + firstpp->cat, firstpp->pos ); + } assert ( diffidx+codelen <= maxsize ); /* save the diff */ @@ -919,7 +1035,8 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) /* (try to) read the next input */ i_ptr = i_item; - i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); + i_more = filter_read(filt, &i_key, &i_mode); + /* i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); */ if ( (i_more) && (is->method->debug >6) ) logf(LOG_LOG,"isamd_appd: got m=%d %d.%d=%x.%x: %d", i_mode, @@ -946,12 +1063,55 @@ static int append_diffs(ISAMD is, ISAMD_P ipos, ISAMD_I data) /************************************************************* - * isamd_append itself, Sweet, isn't it + * isamd_append itself *************************************************************/ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) { - return append_diffs(is,ipos,data); + FILTER F = filter_open(is,data); + ISAMD_P rc=0; + + int olddebug= is->method->debug; + if (ipos == 7320) + is->method->debug = 99; /*!*/ + + if ( filter_isempty(F) ) /* can be, if del-ins of the same */ + { + if (is->method->debug >3) + logf(LOG_LOG,"isamd_appd: nothing to do for %d=",ipos); + filter_close(F); + ++(is->no_non); + return ipos; /* without doing anything at all */ + } + + if ( ( 0==ipos) && filter_only_one(F) ) + { + struct it_key k; + int mode; + filter_read(F,&k,&mode); + assert(mode); + rc = singleton_encode(&k); + if (is->method->debug >9) + logf(LOG_LOG,"isamd_appd: singleton %d (%x)", + rc,rc); + if (rc) + is->no_singles++; + assert ( (rc==0) || is_singleton(rc) ); + } + if ( 0==rc) /* either not single, or it did not fit */ + { + rc = append_diffs(is,ipos,F); + assert ( ! is_singleton(rc) ); + /* can happen if we run out of bits, so that block numbers overflow */ + /* to SINGLETON_BIT */ + } + filter_close(F); + + if (is->method->debug >2) + logf(LOG_LOG,"isamd_appd: ret %d=%x (%d=%x)", + rc,rc,ipos,ipos); + is->method->debug=olddebug; /*!*/ + return rc; } /* isamd_append */ @@ -962,7 +1122,25 @@ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) /* * $Log: merge-d.c,v $ - * Revision 1.19 1999-09-13 13:28:28 heikki + * Revision 1.25 1999-11-30 13:48:04 adam + * Improved installation. Updated for inclusion of YAZ header files. + * + * Revision 1.24 1999/10/05 09:57:40 heikki + * Tuning the isam-d (and fixed a small "detail") + * + * Revision 1.23 1999/09/27 14:36:36 heikki + * singletons + * + * Revision 1.22 1999/09/23 18:01:18 heikki + * singleton optimising + * + * Revision 1.21 1999/09/21 17:36:43 heikki + * Added filter function. Not much of effect on the small test set... + * + * Revision 1.20 1999/09/20 15:48:06 heikki + * Small changes + * + * Revision 1.19 1999/09/13 13:28:28 heikki * isam-d optimizing: merging input data in the same go * * Revision 1.18 1999/08/25 18:09:24 heikki @@ -1018,5 +1196,3 @@ ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) */ - -