From: Adam Dickmeiss Date: Tue, 6 Jul 1999 12:28:04 +0000 (+0000) Subject: Updated record index structure. Format includes version ID. Compression X-Git-Tag: ZEBRA.1.0~92 X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=commitdiff_plain;h=f3425fb457792aae865096cf9acf5cb41798b1d1 Updated record index structure. Format includes version ID. Compression algorithm ID is stored for each record block. --- diff --git a/CHANGELOG b/CHANGELOG index 058d61f..6d52778 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,11 @@ +Changed record index structure. New layout is incompatible with +previous releases. Added setting "recordcompression" to control +compression of records. Possible values are "none" (no +compression) and bzip2 (compression using libbz2). + +Added XML transfer syntax support for retrieval of structured records. +Schema in CompSpec is recognised in retrieval of structured records. + Changed Tcl record filter so that it attemps to read .tflt. If that fails, the filter reads the file .flt (regx style filter). diff --git a/index/extract.c b/index/extract.c index 4309b60..1e6a395 100644 --- a/index/extract.c +++ b/index/extract.c @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.96 1999-05-26 07:49:13 adam + * Revision 1.97 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.96 1999/05/26 07:49:13 adam * C++ compilation. * * Revision 1.95 1999/05/21 12:00:17 adam @@ -406,6 +410,8 @@ int key_open (struct recordGroup *rGroup, int mem) BFiles bfs = rGroup->bfs; int rw = rGroup->flagRw; data1_handle dh = rGroup->dh; + char *recordCompression; + int record_compression = REC_COMPRESS_NONE; if (!mem) mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024; if (mem < 50000) @@ -423,7 +429,13 @@ int key_open (struct recordGroup *rGroup, int mem) return -1; } assert (!records); - records = rec_open (bfs, rw); + recordCompression = res_get_def (common_resource, + "recordCompression", "none"); + if (!strcmp (recordCompression, "none")) + record_compression = REC_COMPRESS_NONE; + if (!strcmp (recordCompression, "bzip2")) + record_compression = REC_COMPRESS_BZIP2; + records = rec_open (bfs, rw, record_compression); if (!records) { dict_close (matchDict); diff --git a/index/invstat.c b/index/invstat.c index 4781c50..30a5e90 100644 --- a/index/invstat.c +++ b/index/invstat.c @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: invstat.c,v $ - * Revision 1.11 1999-05-15 14:36:38 adam + * Revision 1.12 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.11 1999/05/15 14:36:38 adam * Updated dictionary. Implemented "compression" of dictionary. * * Revision 1.10 1999/05/12 13:08:06 adam @@ -177,7 +181,7 @@ void inv_prstat (BFiles bfs) exit (1); } } - records = rec_open (bfs, 0); + records = rec_open (bfs, 0, 0); for (i = 0; i<8; i++) stat_info.no_isam_entries[i] = 0; diff --git a/index/main.c b/index/main.c index 0c9cad8..04f332b 100644 --- a/index/main.c +++ b/index/main.c @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: main.c,v $ - * Revision 1.66 1999-06-25 13:48:02 adam + * Revision 1.67 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.66 1999/06/25 13:48:02 adam * Updated MSVC project files. * Added BZIP2 record compression (not very well tested). * @@ -425,7 +429,7 @@ int main (int argc, char **argv) bf_cache (rGroupDef.bfs, rval); zebraIndexLockMsg ("r"); } - records = rec_open (rGroupDef.bfs, 0); + records = rec_open (rGroupDef.bfs, 0, 0); rec_prstat (records); rec_close (&records); inv_prstat (rGroupDef.bfs); diff --git a/index/recindex.c b/index/recindex.c index 2fe16b8..0244da9 100644 --- a/index/recindex.c +++ b/index/recindex.c @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recindex.c,v $ - * Revision 1.24 1999-06-25 13:48:02 adam + * Revision 1.25 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.24 1999/06/25 13:48:02 adam * Updated MSVC project files. * Added BZIP2 record compression (not very well tested). * @@ -180,11 +184,10 @@ static void rec_release_blocks (Records p, int sysno) { struct record_index_entry entry; int freeblock; - int block_and_ref[2]; + char block_and_ref[sizeof(short) + sizeof(int)]; int dst_type; int first = 1; - logf (LOG_LOG, "release_blocks for sysno=%d", sysno); if (read_indx (p, sysno, &entry, sizeof(entry), 1) != 1) return ; @@ -203,8 +206,11 @@ static void rec_release_blocks (Records p, int sysno) } if (first) { - block_and_ref[1]--; - if (block_and_ref[1]) + short ref; + memcpy (&ref, block_and_ref + sizeof(int), sizeof(ref)); + --ref; + memcpy (block_and_ref + sizeof(int), &ref, sizeof(ref)); + if (ref) { if (bf_write (p->data_BFile[dst_type], freeblock, 0, sizeof(block_and_ref), block_and_ref)) @@ -224,7 +230,7 @@ static void rec_release_blocks (Records p, int sysno) exit (1); } p->head.block_free[dst_type] = freeblock; - freeblock = block_and_ref[0]; + memcpy (&freeblock, block_and_ref, sizeof(int)); p->head.block_used[dst_type]--; } @@ -243,40 +249,6 @@ static void rec_delete_single (Records p, Record rec) write_indx (p, rec->sysno, &entry, sizeof(entry)); } -static void rec_write_tmp_buf (Records p, int size, int *sysnos); - -static void rec_write_single (Records p, Record rec) -{ - - int sysnos[2]; - int i, size = 0; - char *cptr; - - logf (LOG_LOG, " rec_write_single !!!!!!!!!!!!!!!!!!!!!!!!!!!!"); - for (i = 0; i < REC_NO_INFO; i++) - if (!rec->info[i]) - size += sizeof(*rec->size); - else - size += sizeof(*rec->size) + rec->size[i]; - - rec_tmp_expand (p, size); - - cptr = p->tmp_buf + sizeof(int); /* a hack! */ - for (i = 0; i < REC_NO_INFO; i++) - { - memcpy (cptr, &rec->size[i], sizeof(*rec->size)); - cptr += sizeof(*rec->size); - if (rec->info[i]) - { - memcpy (cptr, rec->info[i], rec->size[i]); - cptr += rec->size[i]; - } - } - sysnos[0] = rec->sysno; - sysnos[1] = -1; - rec_write_tmp_buf (p, size, sysnos); -} - static void rec_write_tmp_buf (Records p, int size, int *sysnos) { struct record_index_entry entry; @@ -333,18 +305,14 @@ static void rec_write_tmp_buf (Records p, int size, int *sysnos) sizeof(int) + (p->tmp_buf+size) - cptr, cptr); } -static void rec_update_single (Records p, Record rec) -{ - rec_release_blocks (p, rec->sysno); - rec_write_single (p, rec); -} - -Records rec_open (BFiles bfs, int rw) +Records rec_open (BFiles bfs, int rw, int compression_method) { Records p; int i, r; + int version; p = (Records) xmalloc (sizeof(*p)); + p->compression_method = compression_method; p->rw = rw; p->tmp_size = 1024; p->tmp_buf = (char *) xmalloc (p->tmp_size); @@ -360,6 +328,7 @@ Records rec_open (BFiles bfs, int rw) { case 0: memcpy (p->head.magic, REC_HEAD_MAGIC, sizeof(p->head.magic)); + sprintf (p->head.version, "%3d", REC_VERSION); p->head.index_free = 0; p->head.index_last = 1; p->head.no_records = 0; @@ -375,7 +344,7 @@ Records rec_open (BFiles bfs, int rw) for (i = 1; ihead.block_size[i] = p->head.block_size[i-1] * 4; - p->head.block_move[i] = p->head.block_size[i] * 3; + p->head.block_move[i] = p->head.block_size[i] * 24; } if (rw) rec_write_head (p); @@ -384,9 +353,16 @@ Records rec_open (BFiles bfs, int rw) memcpy (&p->head, p->tmp_buf, sizeof(p->head)); if (memcmp (p->head.magic, REC_HEAD_MAGIC, sizeof(p->head.magic))) { - logf (LOG_FATAL, "read %s. bad header", p->index_fname); + logf (LOG_FATAL, "file %s has bad format", p->index_fname); exit (1); } + version = atoi (p->head.version); + if (version != REC_VERSION) + { + logf (LOG_FATAL, "file %s is version %d, but version" + " %d is required", p->index_fname, version, REC_VERSION); + exit (1); + } break; } for (i = 0; iflag = recordFlagNop; last_rec = e->rec; break; + case recordFlagDelete: + rec_delete_single (p, e->rec); + e->flag = recordFlagNop; + break; default: break; } + rec_rm (&e->rec); } *sysnop = -1; if (ref_count) { - int csize = out_offset + (out_offset >> 6) + 620; - - rec_tmp_expand (p, csize); + int csize = 0; /* indicate compression "not performed yet" */ + compression_method = p->compression_method; + switch (compression_method) + { + case REC_COMPRESS_BZIP2: #if HAVE_BZLIB_H - i = bzBuffToBuffCompress (p->tmp_buf+2*sizeof(int), &csize, - out_buf, out_offset, 9, 0, 30); - if (i != BZ_OK) + csize = out_offset + (out_offset >> 6) + 620; + rec_tmp_expand (p, csize); + i = bzBuffToBuffCompress (p->tmp_buf+sizeof(int)+sizeof(short)+ + sizeof(char), + &csize, out_buf, out_offset, 1, 0, 30); + if (i != BZ_OK) + { + logf (LOG_WARN, "bzBuffToBuffCompress error code=%d", i); + csize = 0; + } + logf (LOG_LOG, "compress %4d %5d %5d", ref_count, out_offset, + csize); +#endif + break; + case REC_COMPRESS_NONE: + break; + } + if (!csize) { - logf (LOG_FATAL, "bzBuffToCompress error code=%d", i); - exit (1); + /* either no compression or compression not supported ... */ + csize = out_offset; + rec_tmp_expand (p, csize); + memcpy (p->tmp_buf + sizeof(int) + sizeof(short) + sizeof(char), + out_buf, out_offset); + csize = out_offset; + compression_method = REC_COMPRESS_NONE; } -#else - memcpy (p->tmp_buf + 2*sizeof(int), out_buf, out_offset); - csize = out_offset; -#endif memcpy (p->tmp_buf + sizeof(int), &ref_count, sizeof(ref_count)); + memcpy (p->tmp_buf + sizeof(int)+sizeof(short), + &compression_method, sizeof(compression_method)); + /* -------- compression */ - rec_write_tmp_buf (p, csize + sizeof(int), sysnos); + rec_write_tmp_buf (p, csize + sizeof(short) + sizeof(char), sysnos); } xfree (out_buf); xfree (sysnos); @@ -559,25 +562,6 @@ static void rec_cache_flush (Records p, int saveCount) saveCount = 0; rec_write_multiple (p, saveCount); - for (i = 0; icache_cur - saveCount; i++) - { - struct record_cache_entry *e = p->record_cache + i; - switch (e->flag) - { - case recordFlagNop: - break; - case recordFlagNew: - rec_write_single (p, e->rec); - break; - case recordFlagWrite: - rec_update_single (p, e->rec); - break; - case recordFlagDelete: - rec_delete_single (p, e->rec); - break; - } - rec_rm (&e->rec); - } for (j = 0; jrecord_cache+j, p->record_cache+i, sizeof(*p->record_cache)); @@ -617,7 +601,7 @@ static void rec_cache_insert (Records p, Record rec, enum recordCacheFlag flag) for (j = 0; jsize[j]; } - if (used > 256000) + if (used > 90000) rec_cache_flush (p, 1); } assert (p->cache_cur < p->cache_max); @@ -663,6 +647,9 @@ Record rec_get (Records p, int sysno) int freeblock, dst_type; char *nptr, *cptr; char *in_buf = 0; + char *bz_buf = 0; + int bz_size; + char compression_method; assert (sysno > 0); assert (p); @@ -702,20 +689,36 @@ Record rec_get (Records p, int sysno) } rec->sysno = sysno; -#if HAVE_BZLIB_H - in_size = entry.size * 30+100; - in_buf = (char *) xmalloc (in_size); - i = bzBuffToBuffDecompress (in_buf, &in_size, p->tmp_buf+2*sizeof(int), - entry.size-sizeof(int), 0, 4); - if (i != BZ_OK) + memcpy (&compression_method, p->tmp_buf + sizeof(int) + sizeof(short), + sizeof(compression_method)); + in_buf = p->tmp_buf + sizeof(int) + sizeof(short) + sizeof(char); + in_size = entry.size - sizeof(short) - sizeof(char); + switch (compression_method) { - logf (LOG_FATAL, "bzBuffToDecompress error code=%d", i); - exit (1); - } + case REC_COMPRESS_BZIP2: +#if HAVE_BZLIB_H + bz_size = entry.size * 30+100; + bz_buf = (char *) xmalloc (bz_size); + i = bzBuffToBuffDecompress (bz_buf, &bz_size, in_buf, in_size, 0, 0); + logf (LOG_LOG, "decompress %5d %5d", in_size, bz_size); + if (i != BZ_OK) + { + logf (LOG_FATAL, "bzBuffToBuffDecompress error code=%d", i); + exit (1); + } + in_buf = bz_buf; + in_size = bz_size; #else - in_buf = p->tmp_buf + 2*sizeof(int); - in_size = entry.size - sizeof(int); + logf (LOG_FATAL, "cannot decompress record(s) in BZIP2 format"); + exit (1); #endif + break; + case REC_COMPRESS_NONE: + break; + } + for (i = 0; iinfo[i] = 0; + nptr = in_buf; /* skip ref count */ while (nptr < in_buf + in_size) { @@ -736,8 +739,7 @@ Record rec_get (Records p, int sysno) if (rec->size[i]) { - rec->info[i] = (char *) xmalloc (rec->size[i]); - memcpy (rec->info[i], nptr, rec->size[i]); + rec->info[i] = nptr; nptr += rec->size[i]; } else @@ -746,7 +748,21 @@ Record rec_get (Records p, int sysno) if (this_sysno == sysno) break; } - xfree (in_buf); + for (i = 0; iinfo[i] && rec->size[i]) + { + char *np = xmalloc (rec->size[i]); + memcpy (np, rec->info[i], rec->size[i]); + rec->info[i] = np; + } + else + { + assert (rec->info[i] == 0); + assert (rec->size[i] == 0); + } + } + xfree (bz_buf); rec_cache_insert (p, rec, recordFlagNop); return rec; } diff --git a/index/recindex.h b/index/recindex.h index 4c061fa..31ae460 100644 --- a/index/recindex.h +++ b/index/recindex.h @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recindex.h,v $ - * Revision 1.16 1999-06-25 13:48:02 adam + * Revision 1.17 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.16 1999/06/25 13:48:02 adam * Updated MSVC project files. * Added BZIP2 record compression (not very well tested). * @@ -93,10 +97,13 @@ void rec_put (Records p, Record *recpp); Record rec_new (Records p); Record rec_get (Records p, int sysno); void rec_close (Records *p); -Records rec_open (BFiles bfs, int rw); +Records rec_open (BFiles bfs, int rw, int compression_method); char *rec_strdup (const char *s, size_t *len); void rec_prstat (Records p); +#define REC_COMPRESS_NONE 0 +#define REC_COMPRESS_BZIP2 1 + enum { recInfo_fileType, recInfo_filename, diff --git a/index/recindxp.h b/index/recindxp.h index 43e7399..bad75ee 100644 --- a/index/recindxp.h +++ b/index/recindxp.h @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recindxp.h,v $ - * Revision 1.6 1999-05-26 07:49:13 adam + * Revision 1.7 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.6 1999/05/26 07:49:13 adam * C++ compilation. * * Revision 1.5 1999/02/02 14:51:05 adam @@ -41,10 +45,12 @@ extern "C" { #endif #define REC_BLOCK_TYPES 2 -#define REC_HEAD_MAGIC "recindx" +#define REC_HEAD_MAGIC "recindex" +#define REC_VERSION 1 struct records_info { int rw; + int compression_method; char *index_fname; BFile index_BFile; @@ -62,6 +68,7 @@ struct records_info { struct records_head { char magic[8]; + char version[4]; int block_size[REC_BLOCK_TYPES]; int block_free[REC_BLOCK_TYPES]; int block_last[REC_BLOCK_TYPES]; diff --git a/index/zebraapi.c b/index/zebraapi.c index cf81adc..74400fb 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -4,7 +4,11 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: zebraapi.c,v $ - * Revision 1.19 1999-05-26 07:49:13 adam + * Revision 1.20 1999-07-06 12:28:04 adam + * Updated record index structure. Format includes version ID. Compression + * algorithm ID is stored for each record block. + * + * Revision 1.19 1999/05/26 07:49:13 adam * C++ compilation. * * Revision 1.18 1999/05/15 14:36:38 adam @@ -141,7 +145,7 @@ static int zebra_register_lock (ZebraHandle zh) } bf_cache (zh->bfs, state ? res_get (zh->res, "shadow") : NULL); zh->registerState = state; - zh->records = rec_open (zh->bfs, 0); + zh->records = rec_open (zh->bfs, 0, 0); if (!(zh->dict = dict_open (zh->bfs, FNAME_DICT, 40, 0, 0))) { logf (LOG_WARN, "dict_open");