X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=isamc%2Fmerge.c;h=aa7be55f46c2a8cac7d768a28611b27f68569a17;hb=7afdc32625057ad7146fd043f5bcac1eeb3d53b6;hp=08e51497300aef069e0361111707b55dd79fb154;hpb=b017c9ebfe65bb69d6d6dc17f29a4c331952b8cd;p=idzebra-moved-to-github.git diff --git a/isamc/merge.c b/isamc/merge.c index 08e5149..aa7be55 100644 --- a/isamc/merge.c +++ b/isamc/merge.c @@ -478,17 +478,42 @@ static char *hexdump(unsigned char *p, int len, char *buff) { /* isamh - heikki's append-only isam - * missing: - * read the code from an existing block, to synch encoding - * append to single-block entries - * append to multi-block entries + * Idea: When allocating a new block, allocate memory for a very large block + * (maximal blocksize). When done, see if you can shrink it to some + * smaller size. First-time indexing will go in optimal blocks, and + * following small additions will go to the end of the last of the + * maximal ones. Only later, when new blocks need to be allocated, it + * may make sense to reserve some extra space... */ -ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) + +static void isamh_reduceblock(ISAMH is, ISAMH_PP pp, int numKeys) { + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_reduce: block p=%d c=%d o=%d nk=%d ", + pp->pos, pp->cat, pp->offset, numKeys); + if (pp->pos != 0) + return; /* already allocated in some size */ + while ( ( pp->cat > 0 ) && + ( pp->offset < is->method->filecat[pp->cat-1].bsize) && + ( numKeys < is->method->filecat[pp->cat-1].mblocks) ) + pp->cat--; + pp->pos = isamh_alloc_block(is,pp->cat) ; + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_reduced block p=%d to c=%d o=%d nk=%d bs=%d", + pp->pos, pp->cat, pp->offset, numKeys, + is->method->filecat[pp->cat].bsize); +} /* reduceblock */ + - ISAMH_PP pp; +ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) +{ + ISAMH_PP pp; /* always the last pp in the chain */ + ISAMH_PP firstpp; /* always the first one in the chain, may ==pp */ + ISAMH_PP prevpp; /* the one that points to pp, may be null or ==firstpp */ + /* needed to postpone the writing of its next field until */ + /* pp itself has been categorized */ char i_item[128]; char *i_item_ptr; int i_more=1, i_mode; @@ -498,54 +523,68 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) char *bufptr; int codelen; - ISAMH_PP firstpp; void *r_clientData; /* encode client data */ int newblock; int newcat; int maxkeys; int maxsize; int retval; + int maxcat; pp = firstpp = isamh_pp_open (is, ipos); + prevpp =0; /* only used when new blocks allocated */ + assert (*is->method->code_reset); + maxcat=0; /* find the largest block size for default allocation */ + for ( newcat=0; is->method->filecat[newcat].mblocks > 0; newcat++) + if ( is->method->filecat[newcat].bsize > + is->method->filecat[maxcat].bsize) + maxcat = newcat; + if ( 0==ipos) { /* new block */ - pp->cat=0; /* start small... */ - pp->pos = isamh_alloc_block(is,pp->cat); + pp->cat=maxcat; /* start large... */ + pp->pos = 0; /* not allocated yet */ pp->size= pp->offset = ISAMH_BLOCK_OFFSET_1 ; + pp->buf=xrealloc(pp->buf,is->method->filecat[maxcat].bsize); r_clientData = (*is->method->code_start)(ISAMH_ENCODE); - logf(LOG_LOG,"isamh_append: starting with new block %d",pp->pos); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append: starting with new block"); } else { /* existing block */ if (isamh_block(firstpp->lastblock) == firstpp->pos) + /*!!! TODO: BUG: Compare whole addresses !!! (later) */ { /* only one block, we have it already */ pp->offset=ISAMH_BLOCK_OFFSET_1; - logf(LOG_LOG,"isamh_append: starting with one block %d",pp->pos); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append: starting with one block %d",pp->pos); } else { - logf(LOG_LOG,"isamh_append: starting with multiple blocks %d>%d>%d", - firstpp->pos,isamh_block(firstpp->next),isamh_block(firstpp->lastblock)); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append: starting with multiple blocks %d>%d>%d", + firstpp->pos,isamh_block(firstpp->next),isamh_block(firstpp->lastblock)); pp=isamh_pp_open(is,firstpp->lastblock); /* dirty, but this can also read a N-block. Just clear extra values*/ pp->lastblock=0; pp->offset=ISAMH_BLOCK_OFFSET_N; } /* get last */ - /* read pointers in it to synchronize the encoder ??!! */ r_clientData = (*is->method->code_start)(ISAMH_ENCODE); - logf(LOG_LOG,"isamh_append: scanning to end of block %d %d->%d", - pp->pos, pp->offset, pp->size); + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append: scanning to end of block %d %d->%d", + pp->pos, pp->offset, pp->size); codeptr=codebuffer; while (pp->offsetsize) { codeptr=codebuffer; bufptr=pp->buf + pp->offset; (*is->method->code_item)(ISAMH_DECODE, r_clientData, &codeptr, &bufptr); codelen = bufptr - (pp->buf+pp->offset) ; - logf(LOG_LOG,"isamh_append: dec at %d %d/%d:%s", - pp->offset, codelen, codeptr-codebuffer, - hexdump(codebuffer,codeptr-codebuffer,0) ); + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append: dec at %d %d/%d:%s", + pp->offset, codelen, codeptr-codebuffer, + hexdump(codebuffer,codeptr-codebuffer,0) ); pp->offset += codelen; } } /* existing block */ @@ -553,8 +592,9 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) i_item_ptr = i_item; i_more = (*data->read_item)(data->clientData,&i_item_ptr,&i_mode); - logf(LOG_LOG,"isamh_append 1: m=%d l=%d %s", - i_mode, i_item_ptr-i_item, hexdump(i_item,i_item_ptr-i_item,0)); + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append 1: m=%d l=%d %s", + i_mode, i_item_ptr-i_item, hexdump(i_item,i_item_ptr-i_item,0)); maxsize = is->method->filecat[pp->cat].bsize; @@ -568,43 +608,92 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) assert( (codelen < 128) && (codelen>0)); - logf(LOG_LOG,"isamh_append: coded into %d:%s", - codelen,hexdump(codebuffer,codelen,0)); + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append: coded into %d:%s (nk=%d)", + codelen,hexdump(codebuffer,codelen,0),firstpp->numKeys); if ( pp->offset + codelen > maxsize ) - { - newcat = pp->cat; + { /* oops, block full, do something */ + newcat = maxcat; /* start with a large block again */ maxkeys = is->method->filecat[pp->cat].mblocks; /* max keys */ - logf(LOG_LOG,"isamh_append: need new block: %d > %d (k:%d/%d)", - pp->offset + codelen, maxsize, firstpp->numKeys,maxkeys ); - if ( (maxkeys>0) && (firstpp->numKeys > maxkeys) ) - { /* time to increase block size */ - newcat++; - maxsize = is->method->filecat[newcat].bsize; - pp->buf=xrealloc(pp->buf,maxsize); - logf(LOG_LOG,"isamh_append: increased to cat %d ",newcat); - } + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append: need new block: %d > %d (k:%d/%d)", + pp->offset + codelen, maxsize, firstpp->numKeys,maxkeys ); - newblock = isamh_alloc_block(is,newcat); - pp->next = isamh_addr(newblock,newcat); - if (firstpp!=pp) - { /* not first block, write to disk already now */ - isamh_buildlaterblock(pp); - isamh_write_block(is,pp->cat,pp->pos,pp->buf); - //if (cat != newcat) - // realloc buf !!!! + newblock = 0; + pp->next = 0; + + /* Four possibilities: */ + if (prevpp!=0) + { /* 1: we have a prevpp that can go on the disk */ + /* reduce pp, set next ptr, and store block */ + /* set pp up as new last block, and remember the just */ + /* filled as (pp) the new prev */ + assert(pp!=firstpp); + isamh_reduceblock(is,pp,firstpp->numKeys); + prevpp->next=isamh_addr(pp->pos,pp->cat); + isamh_buildlaterblock(prevpp); + isamh_write_block(is,prevpp->cat,prevpp->pos,prevpp->buf); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append N1: Wrote prevpp (%d:%d) -> %d:%d", + prevpp->pos, prevpp->cat, pp->pos, pp->cat); + isamh_pp_close(prevpp); /* it's done its job */ + prevpp = pp; + pp=isamh_pp_open(is,isamh_addr(0,maxcat)); /* start a large one */ + pp->size=pp->offset=ISAMH_BLOCK_OFFSET_N ; + pp->next=0; + pp->lastblock=0; + } + else if ( (firstpp!=pp) && (firstpp->next != 0)) + { /* 2: we are working at end of list, but have no prevpp */ + /* some block (already on disk) points to pp */ + /* set the newly filled block as prev, don't save yet */ + /* allocate new pp */ + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append N2: set up a new prevpp (%d:%d)", + pp->pos, pp->cat); + prevpp = pp; + pp=isamh_pp_open(is,isamh_addr(0,maxcat)); /* start a large one */ + pp->size=pp->offset=ISAMH_BLOCK_OFFSET_N ; + pp->next=0; + pp->lastblock=0; + } + else if ( (firstpp!=pp) && (firstpp->next==0)) + { /* 3: we have earlier allocated pp as the second block */ + /* reduce it to get its address. Set first->next to it */ + /* move it to prevpp, and create a new pp */ + isamh_reduceblock(is,pp,firstpp->numKeys); + firstpp->next=isamh_addr(pp->pos,pp->cat); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append N3: set up a new firstpp->next (%d:%d)", + pp->pos, pp->cat); + prevpp=pp; + pp=isamh_pp_open(is,isamh_addr(0,maxcat)); /* start a large one */ + pp->size=pp->offset=ISAMH_BLOCK_OFFSET_N ; + pp->next=0; + pp->lastblock=0; } else - { /* we had only one block, allocate a second buffer */ - pp = isamh_pp_open(is,0); + { /* 4: firstpp itself has got full. */ + /* allocate a new pp for it, store nothing */ + assert(firstpp==pp); + assert(firstpp->next==0); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append N4: allocated new pp for full first (%d:%d)", + firstpp->pos, firstpp->cat); + pp=isamh_pp_open(is,isamh_addr(0,maxcat)); /* start a large one */ + pp->size=pp->offset=ISAMH_BLOCK_OFFSET_N ; + pp->next=0; + pp->lastblock=0; } - pp->cat = newcat; - pp->pos = newblock; - + + maxsize = is->method->filecat[pp->cat].bsize; pp->size=pp->offset=ISAMH_BLOCK_OFFSET_N ; pp->next=0; pp->lastblock=0; - logf(LOG_LOG,"isamh_append: got a new block %d:%d",pp->cat,pp->pos); + if (pp->is->method->debug > 2) + logf(LOG_LOG,"isamh_append: got a new block c=%d p=%d",pp->cat,pp->pos); + /* reset the encoding, and code again */ (*is->method->code_reset)(r_clientData); @@ -612,10 +701,11 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) i_item_ptr=i_item; (*is->method->code_item)(ISAMH_ENCODE, r_clientData, &codeptr, &i_item_ptr); codelen = codeptr-codebuffer; - logf(LOG_LOG,"isamh_append: coded again %d:%s", - codelen,hexdump(codebuffer,codelen,0)); + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append: coded again %d:%s (nk=%d)", + codelen,hexdump(codebuffer,codelen,0),firstpp->numKeys); - } /* new block needed */ + } /* block full */ /* ok, now we can write it */ memcpy(&(pp->buf[pp->offset]), codebuffer, codelen); @@ -627,20 +717,38 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) /* try to read the next element */ i_item_ptr = i_item; i_more = (*data->read_item)(data->clientData,&i_item_ptr,&i_mode); - logf(LOG_LOG,"isamh_append 2: m=%d l=%d %s", + if (pp->is->method->debug > 3) + logf(LOG_LOG,"isamh_append 2: m=%d l=%d %s", i_mode, i_item_ptr-i_item, hexdump(i_item,i_item_ptr-i_item,0)); } /* while */ + isamh_reduceblock(is,pp,firstpp->numKeys); + /* may be the same as firstpp! */ + + if (prevpp) + { /* Write the prev block we have been holding off */ + assert(prevpp->pos); + prevpp->next = isamh_addr(pp->pos, pp->cat); + isamh_buildlaterblock(prevpp); + isamh_write_block(is,prevpp->cat,prevpp->pos,prevpp->buf); + if (firstpp->next==0) /* can happen if extending many blocks */ + firstpp->next=isamh_addr(prevpp->pos,prevpp->cat); + isamh_pp_close(prevpp); + } /* Write the last (partial) block, if needed. */ if (pp!=firstpp) { + pp->next=0; /* just to be sure */ + if (firstpp->next==0) + firstpp->next=isamh_addr(pp->pos,pp->cat); isamh_buildlaterblock(pp); isamh_write_block(is,pp->cat,pp->pos,pp->buf); } /* update first block and write it */ firstpp->lastblock = isamh_addr(pp->pos,pp->cat); + isamh_reduceblock(is,firstpp,firstpp->numKeys); isamh_buildfirstblock(firstpp); isamh_write_block(is,firstpp->cat,firstpp->pos,firstpp->buf); @@ -651,7 +759,7 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) /* get return value (before it disappears at close! */ retval = isamh_addr(firstpp->pos,firstpp->cat); - isamh_pp_close(firstpp); + isamh_pp_close(firstpp); return retval; @@ -660,8 +768,17 @@ ISAMC_P isamh_append (ISAMH is, ISAMH_P ipos, ISAMH_I data) /* * $Log: merge.c,v $ - * Revision 1.14 1999-07-06 16:30:20 heikki - * IsamH startss to work - at least it builds indexes. Can not search yet... + * Revision 1.19 1999-07-14 12:12:07 heikki + * Large-block isam-h (may not work too well... Abandoning for isam-d) + * + * Revision 1.17 1999/07/13 14:22:17 heikki + * Better allocation strategy in isamh_merge + * + * Revision 1.16 1999/07/08 14:23:27 heikki + * Fixed a bug in isamh_pp_read and cleaned up a bit + * + * Revision 1.15 1999/07/07 09:36:04 heikki + * Fixed an assertion in isamh * * Revision 1.13 1999/07/06 09:37:05 heikki * Working on isamh - not ready yet.