-/*
- * Copyright (C) 1994-1996, Index Data I/S
- * All rights reserved.
- * Sebastian Hammer, Adam Dickmeiss
- *
- * $Log: extract.c,v $
- * Revision 1.69 1997-04-29 09:26:03 adam
- * Bug fix: generic recordId handling didn't work for compressed internal
- * keys.
- *
- * Revision 1.68 1997/02/12 20:39:45 adam
- * Implemented option -f <n> that limits the log to the first <n>
- * records.
- * Changed some log messages also.
- *
- * Revision 1.67 1996/11/15 15:02:14 adam
- * Minor changes regarding logging.
- *
- * Revision 1.66 1996/11/14 09:52:21 adam
- * Strings in record keys bound by IT_MAX_WORD.
- *
- * Revision 1.65 1996/11/14 08:57:56 adam
- * Reduction of storeKeys area.
- *
- * Revision 1.64 1996/11/08 11:10:16 adam
- * Buffers used during file match got bigger.
- * Compressed ISAM support everywhere.
- * Bug fixes regarding masking characters in queries.
- * Redesigned Regexp-2 queries.
- *
- * Revision 1.63 1996/10/29 14:09:39 adam
- * Use of cisam system - enabled if setting isamc is 1.
- *
- * Revision 1.62 1996/10/11 10:57:01 adam
- * New module recctrl. Used to manage records (extract/retrieval).
- * Several files have been moved to the recctrl sub directory.
- *
- * Revision 1.61 1996/06/06 12:08:37 quinn
- * Added showRecord function
- *
- * Revision 1.60 1996/06/04 10:18:12 adam
- * Search/scan uses character mapping module.
- *
- * Revision 1.59 1996/05/14 15:47:07 adam
- * Cleanup of various buffer size entities.
- *
- * Revision 1.58 1996/05/14 06:16:38 adam
- * Compact use/set bytes used in search service.
- *
- * Revision 1.57 1996/05/13 14:23:04 adam
- * Work on compaction of set/use bytes in dictionary.
- *
- * Revision 1.56 1996/05/09 09:54:42 adam
- * Server supports maps from one logical attribute to a list of physical
- * attributes.
- * The extraction process doesn't make space consuming 'any' keys.
- *
- * Revision 1.55 1996/05/09 07:28:55 quinn
- * Work towards phrases and multiple registers
- *
- * Revision 1.54 1996/05/01 13:46:35 adam
- * First work on multiple records in one file.
- * New option, -offset, to the "unread" command in the filter module.
- *
- * Revision 1.53 1996/04/26 12:09:43 adam
- * Added a few comments.
- *
- * Revision 1.52 1996/04/25 13:27:57 adam
- * Function recordExtract modified so that files with no keys (possibly empty)
- * are ignored.
- *
- * Revision 1.51 1996/03/19 11:08:42 adam
- * Bug fix: Log preamble wasn't always turned off after recordExtract.
- *
- * Revision 1.50 1996/02/12 18:45:36 adam
- * New fileVerboseFlag in record group control.
- *
- * Revision 1.49 1996/02/05 12:29:57 adam
- * Logging reduced a bit.
- * The remaining running time is estimated during register merge.
- *
- * Revision 1.48 1996/02/01 20:53:26 adam
- * The temporary per-record keys are compacted a little, and duplication
- * of the per-record keys is avoided when they are saved in the record
- * information buffer.
- *
- * Revision 1.47 1996/01/17 14:57:48 adam
- * Prototype changed for reader functions in extract/retrieve. File
- * is identified by 'void *' instead of 'int'.
- *
- * Revision 1.46 1995/12/15 14:57:16 adam
- * Bug fix.
- *
- * Revision 1.45 1995/12/15 12:37:41 adam
- * In addRecordKeyAny: Writes key only when attrSet != -1.
- *
- * Revision 1.44 1995/12/12 16:00:54 adam
- * System call sync(2) used after update/commit.
- * Locking (based on fcntl) uses F_EXLCK and F_SHLCK instead of F_WRLCK
- * and F_RDLCK.
- *
- * Revision 1.43 1995/12/11 09:12:46 adam
- * The rec_get function returns NULL if record doesn't exist - will
- * happen in the server if the result set records have been deleted since
- * the creation of the set (i.e. the search).
- * The server saves a result temporarily if it is 'volatile', i.e. the
- * set is register dependent.
- *
- * Revision 1.42 1995/12/07 17:38:46 adam
- * Work on locking mechanisms for concurrent updates/commit.
- *
- * Revision 1.41 1995/12/06 16:06:42 adam
- * Better diagnostics. Work on 'real' dictionary deletion.
- *
- * Revision 1.40 1995/12/05 16:57:40 adam
- * More work on regular patterns.
- *
- * Revision 1.39 1995/12/05 13:20:18 adam
- * Bug fix: file_read sometimes returned early EOF.
- *
- * Revision 1.38 1995/12/04 17:59:21 adam
- * More work on regular expression conversion.
- *
- * Revision 1.37 1995/12/04 14:22:27 adam
- * Extra arg to recType_byName.
- * Started work on new regular expression parsed input to
- * structured records.
- *
- * Revision 1.36 1995/11/30 08:34:29 adam
- * Started work on commit facility.
- * Changed a few malloc/free to xmalloc/xfree.
- *
- * Revision 1.35 1995/11/28 14:26:21 adam
- * Bug fix: recordId with constant wasn't right.
- * Bug fix: recordId dictionary entry wasn't deleted when needed.
- *
- * Revision 1.34 1995/11/28 09:09:38 adam
- * Zebra config renamed.
- * Use setting 'recordId' to identify record now.
- * Bug fix in recindex.c: rec_release_blocks was invoked even
- * though the blocks were already released.
- * File traversal properly deletes records when needed.
- *
- * Revision 1.33 1995/11/27 09:56:20 adam
- * Record info elements better enumerated. Internal store of records.
- *
- * Revision 1.32 1995/11/25 10:24:05 adam
- * More record fields - they are enumerated now.
- * New options: flagStoreData flagStoreKey.
- *
- * Revision 1.31 1995/11/24 11:31:35 adam
- * Commands add & del read filenames from stdin if source directory is
- * empty.
- * Match criteria supports 'constant' strings.
- *
- * Revision 1.30 1995/11/22 17:19:16 adam
- * Record management uses the bfile system.
- *
- * Revision 1.29 1995/11/21 15:01:14 adam
- * New general match criteria implemented.
- * New feature: document groups.
- *
- * Revision 1.28 1995/11/21 09:20:30 adam
- * Yet more work on record match.
- *
- * Revision 1.27 1995/11/20 16:59:45 adam
- * New update method: the 'old' keys are saved for each record.
- *
- * Revision 1.26 1995/11/20 11:56:24 adam
- * Work on new traversal.
- *
- * Revision 1.25 1995/11/16 15:34:54 adam
- * Uses new record management system in both indexer and server.
- *
- * Revision 1.24 1995/11/15 19:13:08 adam
- * Work on record management.
- *
- * Revision 1.23 1995/10/27 14:00:10 adam
- * Implemented detection of database availability.
- *
- * Revision 1.22 1995/10/17 18:02:07 adam
- * New feature: databases. Implemented as prefix to words in dictionary.
- *
- * Revision 1.21 1995/10/10 12:24:38 adam
- * Temporary sort files are compressed.
- *
- * Revision 1.20 1995/10/06 13:52:05 adam
- * Bug fixes. Handler may abort further scanning.
- *
- * Revision 1.19 1995/10/04 12:55:16 adam
- * Bug fix in ranked search. Use=Any keys inserted.
- *
- * Revision 1.18 1995/10/04 09:37:08 quinn
- * Fixed bug.
- *
- * Revision 1.17 1995/10/03 14:28:57 adam
- * Buffered read in extract works.
- *
- * Revision 1.16 1995/10/03 14:28:45 adam
- * Work on more efficient read handler in extract.
- *
- * Revision 1.15 1995/10/02 15:42:53 adam
- * Extract uses file descriptors instead of FILE pointers.
- *
- * Revision 1.14 1995/10/02 15:29:13 adam
- * More logging in file_extract.
- *
- * Revision 1.13 1995/09/29 14:01:39 adam
- * Bug fixes.
- *
- * Revision 1.12 1995/09/28 14:22:56 adam
- * Sort uses smaller temporary files.
- *
- * Revision 1.11 1995/09/28 12:10:31 adam
- * Bug fixes. Field prefix used in queries.
- *
- * Revision 1.10 1995/09/28 09:19:41 adam
- * xfree/xmalloc used everywhere.
- * Extract/retrieve method seems to work for text records.
- *
- * Revision 1.9 1995/09/27 12:22:28 adam
- * More work on extract in record control.
- * Field name is not in isam keys but in prefix in dictionary words.
- *
- * Revision 1.8 1995/09/14 07:48:22 adam
- * Record control management.
- *
- * Revision 1.7 1995/09/11 13:09:32 adam
- * More work on relevance feedback.
- *
- * Revision 1.6 1995/09/08 14:52:27 adam
- * Minor changes. Dictionary is lower case now.
- *
- * Revision 1.5 1995/09/06 16:11:16 adam
- * Option: only one word key per file.
- *
- * Revision 1.4 1995/09/05 15:28:39 adam
- * More work on search engine.
- *
- * Revision 1.3 1995/09/04 12:33:41 adam
- * Various cleanup. YAZ util used instead.
- *
- * Revision 1.2 1995/09/04 09:10:34 adam
- * More work on index add/del/update.
- * Merge sort implemented.
- * Initial work on z39 server.
- *
- * Revision 1.1 1995/09/01 14:06:35 adam
- * Split of work into more files.
- *
- */
-#include <stdio.h>
-#include <assert.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include <recctrl.h>
-#include "index.h"
-
-#include "zinfo.h"
-
-static Dict matchDict;
-
-static Records records = NULL;
-
-static char **key_buf;
-static size_t ptr_top;
-static size_t ptr_i;
-static size_t key_buf_used;
-static int key_file_no;
-
-static int records_inserted = 0;
-static int records_updated = 0;
-static int records_deleted = 0;
-static int records_processed = 0;
-
-static ZebTargetInfo *zti = NULL;
-
-static void logRecord (int showFlag)
-{
- if (!showFlag)
- ++records_processed;
- if (showFlag || !(records_processed % 1000))
- {
- logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d",
- records_processed, records_inserted, records_updated,
- records_deleted);
- }
-}
-
-void key_open (int mem)
-{
- if (!mem)
- mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024;
- if (mem < 50000)
- mem = 50000;
- key_buf = xmalloc (mem);
- ptr_top = mem/sizeof(char*);
- ptr_i = 0;
-
- key_buf_used = 0;
- key_file_no = 0;
-
- if (!(matchDict = dict_open (GMATCH_DICT, 50, 1)))
- {
- logf (LOG_FATAL, "dict_open fail of %s", GMATCH_DICT);
- exit (1);
- }
- assert (!records);
- records = rec_open (1);
-#if 1
- zti = zebTargetInfo_open (records, 1);
-#endif
-}
-
/*
 * State carried between successive encode_key_write calls.
 * encode_key_init zeroes sysno and seqno; encode_key_write stores the
 * sysno/seqno of the key it has just written so that the next key can
 * be written as a difference.
 */
struct encode_info {
    int sysno;          /* sysno of the most recently written key */
    int seqno;          /* seqno of the most recently written key */
    char buf[768];      /* scratch area where one entry is assembled
                           before it is passed to fwrite */
};
-
-void encode_key_init (struct encode_info *i)
-{
- i->sysno = 0;
- i->seqno = 0;
-}
-
/*
 * encode_key_int: store an integer in 1 to 4 bytes, most significant
 * byte first.
 *
 * The top two bits of the first byte hold the number of bytes that
 * follow it (0..3); the remaining 6, 14, 22 or 30 bits hold the value:
 *
 *      0 .. 63            1 byte    00vvvvvv
 *      64 .. 16383        2 bytes   01vvvvvv vvvvvvvv
 *      16384 .. 4194303   3 bytes   10vvvvvv vvvvvvvv vvvvvvvv
 *      4194304 .. 2^30-1  4 bytes   11vvvvvv vvvvvvvv vvvvvvvv vvvvvvvv
 *
 * d   value to store; must be in the range 0 .. 0x3FFFFFFF.
 * bp  destination; up to 4 bytes are written.
 *
 * Returns the position just after the last byte written.
 */
char *encode_key_int (int d, char *bp)
{
    /* A negative value, or one needing more than 30 bits, would spill
       into the two length bits and produce an undecodable stream. */
    assert (d >= 0 && d <= 0x3FFFFFFF);

    if (d <= 63)
        *bp++ = (char) d;
    else if (d <= 16383)
    {
        *bp++ = (char) (64 + (d >> 8));
        *bp++ = (char) (d & 255);
    }
    else if (d <= 4194303)
    {
        *bp++ = (char) (128 + (d >> 16));
        *bp++ = (char) ((d >> 8) & 255);
        *bp++ = (char) (d & 255);
    }
    else
    {
        *bp++ = (char) (192 + (d >> 24));
        *bp++ = (char) ((d >> 16) & 255);
        *bp++ = (char) ((d >> 8) & 255);
        *bp++ = (char) (d & 255);
    }
    return bp;
}
-
-void encode_key_write (char *k, struct encode_info *i, FILE *outf)
-{
- struct it_key key;
- char *bp = i->buf;
-
- while ((*bp++ = *k++))
- ;
- memcpy (&key, k+1, sizeof(struct it_key));
- bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
- if (i->sysno != key.sysno)
- {
- i->sysno = key.sysno;
- i->seqno = 0;
- }
- bp = encode_key_int (key.seqno - i->seqno, bp);
- i->seqno = key.seqno;
- if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
- {
- logf (LOG_FATAL|LOG_ERRNO, "fwrite");
- exit (1);
- }
-}