X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=013784c6ba8c0d4ee57e1ce96148e3a0ec45bbb2;hb=801ef4b322574c793abdc1ba81dea2da62ca60bc;hp=df17b2d601ba7c6c27ddc656a50eced6da25f8ef;hpb=80eac939036e614a3c09c84b85569b9318da5288;p=idzebra-moved-to-github.git

diff --git a/index/extract.c b/index/extract.c
index df17b2d..013784c 100644
--- a/index/extract.c
+++ b/index/extract.c
@@ -1,10 +1,51 @@
 /*
- * Copyright (C) 1994-1996, Index Data I/S 
+ * Copyright (C) 1994-1998, Index Data I/S 
  * All rights reserved.
  * Sebastian Hammer, Adam Dickmeiss
  *
  * $Log: extract.c,v $
- * Revision 1.70  1997-07-01 13:00:42  adam
+ * Revision 1.81  1998-03-11 11:19:04  adam
+ * Changed the way sequence numbers are generated.
+ *
+ * Revision 1.80  1998/03/05 08:45:11  adam
+ * New result set model and modular ranking system. Moved towards
+ * decent server API. System information stored as "SGML" records.
+ *
+ * Revision 1.79  1998/02/17 10:32:52  adam
+ * Fixed bug: binary files weren't opened with flag b on NT.
+ *
+ * Revision 1.78  1998/02/10 12:03:05  adam
+ * Implemented Sort.
+ *
+ * Revision 1.77  1998/01/12 15:04:08  adam
+ * The test option (-s) only uses read-lock (and not write lock).
+ *
+ * Revision 1.76  1997/10/27 14:33:04  adam
+ * Moved towards generic character mapping depending on "structure"
+ * field in abstract syntax file. Fixed a few memory leaks. Fixed
+ * bug with negative integers when doing searches with relational
+ * operators.
+ *
+ * Revision 1.75  1997/09/17 12:19:12  adam
+ * Zebra version corresponds to YAZ version 1.4.
+ * Changed Zebra server so that it doesn't depend on global common_resource.
+ *
+ * Revision 1.74  1997/09/09 13:38:06  adam
+ * Partial port to WIN95/NT.
+ *
+ * Revision 1.73  1997/09/04 13:57:20  adam
+ * New file extract/retrieve method tellf (added).
+ * Added O_BINARY for open calls.
+ *
+ * Revision 1.72  1997/07/15 16:32:29  adam
+ * Bug fix: Match handler didn't terminate the resulting string!
+ *
+ * Revision 1.71  1997/07/15 16:28:41  adam
+ * Bug fix: storeData didn't work with files with multiple records.
+ * Bug fix: fixed memory management with records; not really well
+ *  thought through.
+ *
+ * Revision 1.70  1997/07/01 13:00:42  adam
  * Bug fix in routine searchRecordKey: uninitialized variables.
  *
  * Revision 1.69  1997/04/29 09:26:03  adam
@@ -254,10 +295,16 @@
  */
 #include <stdio.h>
 #include <assert.h>
+#ifdef WINDOWS
+#include <io.h>
+#else
 #include <unistd.h>
+#endif
 #include <fcntl.h>
 
 #include <recctrl.h>
+#include <charmap.h>
+#include <sortidx.h>
 #include "index.h"
 
 #include "zinfo.h"
@@ -265,6 +312,7 @@
 static Dict matchDict;
 
 static Records records = NULL;
+static SortIdx sortIdx = NULL;
 
 static char **key_buf;
 static size_t ptr_top;
@@ -277,7 +325,7 @@ static int records_updated = 0;
 static int records_deleted = 0;
 static int records_processed = 0;
 
-static ZebTargetInfo *zti = NULL;
+static ZebraExplainInfo zti = NULL;
 
 static void logRecord (int showFlag)
 {
@@ -291,7 +339,7 @@ static void logRecord (int showFlag)
     }
 }
 
-void key_open (int mem)
+int key_open (BFiles bfs, int mem, int rw, data1_handle dh)
 {
     if (!mem)
         mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024;
@@ -304,16 +352,27 @@ void key_open (int mem)
     key_buf_used = 0;
     key_file_no = 0;
 
-    if (!(matchDict = dict_open (GMATCH_DICT, 50, 1)))
+    if (!(matchDict = dict_open (bfs, GMATCH_DICT, 50, rw)))
     {
         logf (LOG_FATAL, "dict_open fail of %s", GMATCH_DICT);
-        exit (1);
+	return -1;
     }
     assert (!records);
-    records = rec_open (1);
-#if 1
-    zti = zebTargetInfo_open (records, 1);
-#endif
+    records = rec_open (bfs, rw);
+    if (!records)
+    {
+	dict_close (matchDict);
+	return -1;
+    }
+    zti = zebraExplain_open (records, dh, rw);
+    if (!zti)
+    {
+	rec_close (&records);
+	dict_close (matchDict);
+	return -1;	
+    }
+    sortIdx = sortIdx_open (bfs, 1);
+    return 0;
 }
 
 struct encode_info {
@@ -416,9 +475,9 @@ void key_flush (void)
     qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_qsort_compare);
     getFnameTmp (out_fname, key_file_no);
 
-    if (!(outf = fopen (out_fname, "w")))
+    if (!(outf = fopen (out_fname, "wb")))
     {
-        logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname);
+        logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
         exit (1);
     }
     logf (LOG_LOG, "writing section %d", key_file_no);
@@ -442,9 +501,9 @@ void key_flush (void)
     qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare);
     getFnameTmp (out_fname, key_file_no);
 
-    if (!(outf = fopen (out_fname, "w")))
+    if (!(outf = fopen (out_fname, "wb")))
     {
-        logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname);
+        logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
         exit (1);
     }
     logf (LOG_LOG, "writing section %d", key_file_no);
@@ -484,28 +543,39 @@ void key_flush (void)
     key_buf_used = 0;
 }
 
-int key_close (void)
+int key_close (int rw)	/* flush pending keys, close all stores; returns key_file_no */
 {
     key_flush ();
     xfree (key_buf);
-#if 1
-    zebTargetInfo_close (zti, 1);
-#endif
+    if (rw)	/* recordExtract stamps records with the run number and skips matches */
+	zebraExplain_runNumberIncrement (zti, 1);
+    zebraExplain_close (zti, rw);
     rec_close (&records);
     dict_close (matchDict);
+    sortIdx_close (sortIdx);	/* NOTE(review): key_open never checks sortIdx_open's result */
 
     logRecord (1);
     return key_file_no;
 }
 
-static void wordInit (RecWord *p)
+static void wordInit (struct recExtractCtrl *p, RecWord *w)	/* set w's defaults from p */
 {
-    p->attrSet = 1;
-    p->attrUse = 1016;
-    p->which = Word_String;
+    w->zebra_maps = p->zebra_maps;
+    w->seqnos = p->seqno;	/* shared counters: addIndexString bumps seqnos[reg_type] */
+    w->attrSet = 1;
+    w->attrUse = 1016;	/* presumably the Bib-1 "Any" use attribute -- confirm */
+    w->reg_type = 'w';
 }
 
-struct recKeys {
+static struct sortKey {
+    char *string;		/* xmalloc'ed copy of the key bytes; no terminator is added */
+    int length;			/* number of bytes held in string */
+    int attrSet;		/* attribute set of the word the key came from */
+    int attrUse;		/* use attribute; handed to sortIdx_type on flush */
+    struct sortKey *next;	/* next pending key, or NULL at the end */
+} *sortKeys = NULL;		/* head of the pending list; emptied by flushSortKeys */
+
+static struct recKeys {
     int buf_used;
     int buf_max;
     char *buf;
@@ -514,14 +584,14 @@ struct recKeys {
     int prevSeqNo;
 } reckeys;
 
-static void addRecordKey (const RecWord *p)
+static void addIndexString (RecWord *p, const char *string, int length)
 {
     char *dst;
     char attrSet;
     short attrUse;
-    size_t i;
     int lead = 0;
     int diff = 0;
+    int *pseqno = &p->seqnos[p->reg_type];
 
     if (reckeys.buf_used+1024 > reckeys.buf_max)
     {
@@ -546,14 +616,14 @@ static void addRecordKey (const RecWord *p)
     else
         reckeys.prevAttrUse = attrUse;
 #if 1
-    diff = 1 + p->seqno - reckeys.prevSeqNo;
+    diff = 1 + *pseqno - reckeys.prevSeqNo;
     if (diff >= 1 && diff <= 15)
         lead |= (diff << 2);
     else
         diff = 0;
 #endif
-    reckeys.prevSeqNo = p->seqno;
-
+    reckeys.prevSeqNo = *pseqno;
+    
     *dst++ = lead;
 
     if (!(lead & 1))
@@ -566,27 +636,161 @@ static void addRecordKey (const RecWord *p)
         memcpy (dst, &attrUse, sizeof(attrUse));
         dst += sizeof(attrUse);
     }
-    switch (p->which)
-    {
-        case Word_String:
-            *dst++ = 'w';
-            break;
-        case Word_Phrase:
-            *dst++ = 'p';
-            break;
-        case Word_Numeric:
-            *dst++ = 'n';
-    }
-    for (i = 0; p->u.string[i] && i < IT_MAX_WORD-3; i++)
-        *dst++ = p->u.string[i];
+    *dst++ = p->reg_type;
+    memcpy (dst, string, length);
+    dst += length;
     *dst++ = '\0';
 
     if (!diff)
     {
-        memcpy (dst, &p->seqno, sizeof(p->seqno));
-        dst += sizeof(p->seqno);
+        memcpy (dst, pseqno, sizeof(*pseqno));
+        dst += sizeof(*pseqno);
     }
     reckeys.buf_used = dst - reckeys.buf;
+    (*pseqno)++;
+}
+
+static void addSortString (RecWord *p, const char *string, int length)
+{
+    struct sortKey *sk;
+    /* Queue a sort key; only the first value per (attrSet, attrUse) is kept. */
+    for (sk = sortKeys; sk; sk = sk->next)
+	if (sk->attrSet == p->attrSet && sk->attrUse == p->attrUse)
+	    return;
+    /* Push a new entry at the head of the pending sort-key list. */
+    sk = xmalloc (sizeof(*sk));
+    sk->next = sortKeys;
+    sortKeys = sk;
+    /* Store the mapped term handed in by the caller, not the raw field text. */
+    sk->string = xmalloc (length);
+    sk->length = length;
+    memcpy (sk->string, string, length);
+    sk->attrSet = p->attrSet;
+    sk->attrUse = p->attrUse;
+}
+
+static void addString (RecWord *p, const char *string, int length)
+{
+    /* Sort registers queue one key per attribute; all others are indexed. */
+    int is_sort;
+    assert (length > 0);
+    is_sort = zebra_maps_is_sort (p->zebra_maps, p->reg_type);
+    (is_sort ? addSortString : addIndexString) (p, string, length);
+}
+
+static void addIncompleteField (RecWord *p)
+{
+    const char *b = p->string;
+    int remain = p->length;
+    const char **map = 0;
+    /* Split the field into space-delimited words and add one key per word. */
+    if (remain > 0)
+	map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+    /* map is set to 0 once input is used up; presumably zebra_maps_input advances b */
+    while (map)
+    {
+	char buf[IT_MAX_WORD+1];
+	int i, remain;	/* NOTE: this remain shadows the outer one */
+
+	/* Skip spaces */
+	while (map && *map && **map == *CHR_SPACE)
+	{
+	    remain = p->length - (b - p->string);
+	    if (remain > 0)
+		map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+	    else
+		map = 0;
+	}
+	if (!map)
+	    break;
+	i = 0;
+	while (map && *map && **map != *CHR_SPACE)
+	{
+	    const char *cp = *map;
+	    /* append the mapped string; bytes beyond IT_MAX_WORD are dropped */
+	    while (i < IT_MAX_WORD && *cp)
+		buf[i++] = *(cp++);
+	    remain = p->length - (b - p->string);
+	    if (remain > 0)
+		map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+	    else
+		map = 0;
+	}
+	if (!i)
+	    return;	/* nothing collected for this word: give up on the field */
+	addString (p, buf, i);
+    }
+}
+
+static void addCompleteField (RecWord *p)
+{
+    const char *b = p->string;
+    char buf[IT_MAX_WORD+1];
+    const char **map = 0;
+    int i = 0, remain = p->length;
+    /* Build one key for the whole field: mapped words joined by single spaces. */
+    if (remain > 0)
+	map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
+    /* NOTE(review): non-null map with *map == 0 while i == 0 makes no progress -- confirm */
+    while (remain > 0 && i < IT_MAX_WORD)
+    {
+	while (map && *map && **map == *CHR_SPACE)
+	{
+	    remain = p->length - (b - p->string);
+	    if (remain > 0)
+		map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+	    else
+		map = 0;
+	}
+	if (!map)
+	    break;
+	/* one separating space between words, never before the first word */
+	if (i && i < IT_MAX_WORD)
+	    buf[i++] = *CHR_SPACE;
+	while (map && *map && **map != *CHR_SPACE)
+	{
+	    const char *cp = *map;
+	    /* the key is cut off once IT_MAX_WORD bytes have been collected */
+	    if (i >= IT_MAX_WORD)
+		break;
+	    while (i < IT_MAX_WORD && *cp)
+		buf[i++] = *(cp++);
+	    remain = p->length  - (b - p->string);
+	    if (remain > 0)
+		map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
+					remain);
+	    else
+		map = 0;
+	}
+    }
+    if (!i)
+	return;	/* field mapped to nothing: no key is added */
+    addString (p, buf, i);
+}
+
+static void addRecordKey (RecWord *p)
+{
+    /* Complete registers take the field as one key; others are split into words. */
+    int whole = zebra_maps_is_complete (p->zebra_maps, p->reg_type);
+
+    (whole ? addCompleteField : addIncompleteField) (p);
+}
+
+static void flushSortKeys (SYSNO sysno, int cmd)
+{
+    /* Hand every pending sort key for record sysno to the sort index,
+       releasing the list entry by entry.  cmd is not consulted here. */
+    sortIdx_sysno (sortIdx, sysno);
+    while (sortKeys)
+    {
+	struct sortKey *entry = sortKeys;
+
+	sortKeys = entry->next;
+	sortIdx_type (sortIdx, entry->attrUse);
+	sortIdx_add (sortIdx, entry->string, entry->length);
+	xfree (entry->string);
+	xfree (entry);
+    }
+}
 
 static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys, 
@@ -597,11 +801,12 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys,
     int seqno = 0;
     int off = 0;
 
-    if (zebTargetInfo_curDatabase (zti, databaseName))
+    if (zebraExplain_curDatabase (zti, databaseName))
     {
-        if (zebTargetInfo_newDatabase (zti, databaseName))
+        if (zebraExplain_newDatabase (zti, databaseName))
             abort ();
     }
+    zebraExplain_recordCountIncrement (zti, cmd ? 1 : -1);
     while (off < reckeys->buf_used)
     {
         const char *src = reckeys->buf + off;
@@ -625,9 +830,9 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys,
         ++ptr_i;
         key_buf[ptr_top-ptr_i] = (char*)key_buf + key_buf_used;
 
-        ch = zebTargetInfo_lookupSU (zti, attrSet, attrUse);
+        ch = zebraExplain_lookupSU (zti, attrSet, attrUse);
         if (ch < 0)
-            ch = zebTargetInfo_addSU (zti, attrSet, attrUse);
+            ch = zebraExplain_addSU (zti, attrSet, attrUse);
         assert (ch > 0);
         ((char*) key_buf) [key_buf_used++] = ch;
         while (*src)
@@ -653,7 +858,7 @@ static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys,
 }
 
 static const char **searchRecordKey (struct recKeys *reckeys,
-                               int attrSetS, int attrUseS)
+				     int attrSetS, int attrUseS)
 {
     static const char *ws[32];
     int off = 0;
@@ -744,6 +949,12 @@ static off_t file_seek (void *handle, off_t offset)
     return lseek (p->fd, offset, SEEK_SET);
 }
 
+static off_t file_tell (void *handle)
+{
+    /* handle is a struct file_read_info *; report its file_offset field */
+    return ((struct file_read_info *) handle)->file_offset;
+}
+
 static int file_read (void *handle, char *buf, size_t count)
 {
     struct file_read_info *p = handle;
@@ -918,6 +1129,7 @@ static char *fileMatchStr (struct recKeys *reckeys, struct recordGroup *rGroup,
               fname, rGroup->groupName ? rGroup->groupName : "none");
         return NULL;
     }
+    *dst = '\0';
     return dstBuf;
 }
 
@@ -945,6 +1157,7 @@ static int recordExtract (SYSNO *sysno, const char *fname,
                           char *subType)
 {
     struct recExtractCtrl extractCtrl;
+    RecordAttr *recordAttr;
     int r;
     char *matchStr;
     SYSNO sysnotmp;
@@ -954,25 +1167,30 @@ static int recordExtract (SYSNO *sysno, const char *fname,
 
     if (fi->fd != -1)
     {
+	int i;
         /* we are going to read from a file, so prepare the extraction */
         extractCtrl.fh = fi;
         extractCtrl.subType = subType;
         extractCtrl.init = wordInit;
         extractCtrl.add = addRecordKey;
+	extractCtrl.dh = rGroup->dh;
 
         reckeys.buf_used = 0;
         reckeys.prevAttrUse = -1;
         reckeys.prevAttrSet = -1;
         reckeys.prevSeqNo = 0;
 
+	for (i = 0; i<256; i++)
+	    extractCtrl.seqno[i] = 0;
         recordOffset = fi->file_moffset;
         extractCtrl.offset = recordOffset;
         extractCtrl.readf = file_read;
         extractCtrl.seekf = file_seek;
+        extractCtrl.tellf = file_tell;
         extractCtrl.endf = file_end;
-        extractCtrl.map_chrs_input = map_chrs_input;
-        extractCtrl.flagShowRecords = rGroup->flagShowRecords;
-        if (rGroup->flagShowRecords)
+	extractCtrl.zebra_maps = rGroup->zebra_maps;
+        extractCtrl.flagShowRecords = !rGroup->flagRw;
+        if (!rGroup->flagRw)
             printf ("File: %s %ld\n", fname, (long) recordOffset);
 
         logInfo.fname = fname;
@@ -987,8 +1205,8 @@ static int recordExtract (SYSNO *sysno, const char *fname,
         if (r)      
         {
             /* error occured during extraction ... */
-            if (!rGroup->flagShowRecords &&
-                    records_processed < rGroup->fileVerboseLimit)
+            if (rGroup->flagRw &&
+		records_processed < rGroup->fileVerboseLimit)
             {
                 logf (LOG_WARN, "fail %s %s %ld code = %d", rGroup->recordType,
                       fname, (long) recordOffset, r);
@@ -999,7 +1217,7 @@ static int recordExtract (SYSNO *sysno, const char *fname,
         {
             /* the extraction process returned no information - the record
                is probably empty - unless flagShowRecords is in use */
-            if (rGroup->flagShowRecords)
+            if (!rGroup->flagRw)
                 return 1;
             logf (LOG_WARN, "No keys generated for file %s", fname);
             logf (LOG_WARN, " The file is probably empty");
@@ -1039,20 +1257,26 @@ static int recordExtract (SYSNO *sysno, const char *fname,
         /* new record */
         if (deleteFlag)
         {
-            logf (LOG_LOG, "Cannot delete new record");
+	    logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType,
+		  fname, (long) recordOffset);
+            logf (LOG_WARN, "cannot delete record above (seems new)");
             return 1;
         }
         if (records_processed < rGroup->fileVerboseLimit)
             logf (LOG_LOG, "add %s %s %ld", rGroup->recordType,
                   fname, (long) recordOffset);
         rec = rec_new (records);
+
         *sysno = rec->sysno;
 
+	recordAttr = rec_init_attr (zti, rec);
+
         if (matchStr)
         {
             dict_insert (matchDict, matchStr, sizeof(*sysno), sysno);
         }
         flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName);
+	flushSortKeys (*sysno, 1);
 
         records_inserted++;
     }
@@ -1063,8 +1287,20 @@ static int recordExtract (SYSNO *sysno, const char *fname,
 
         rec = rec_get (records, *sysno);
         assert (rec);
+	
+	recordAttr = rec_init_attr (zti, rec);
+
+	if (recordAttr->runNumber == zebraExplain_runNumberIncrement (zti, 0))
+	{
+	    logf (LOG_LOG, "skipped %s %s %ld", rGroup->recordType,
+		  fname, (long) recordOffset);
+	    rec_rm (&rec);
+	    logRecord (0);
+	    return 1;
+	}
         delkeys.buf_used = rec->size[recInfo_delKeys];
 	delkeys.buf = rec->info[recInfo_delKeys];
+	flushSortKeys (*sysno, 0);
         flushRecordKeys (*sysno, 0, &delkeys, rec->info[recInfo_databaseName]);
         if (deleteFlag)
         {
@@ -1085,6 +1321,7 @@ static int recordExtract (SYSNO *sysno, const char *fname,
                     dict_delete (matchDict, matchStr);
                 rec_del (records, &rec);
             }
+	    rec_rm (&rec);
             logRecord (0);
             return 1;
         }
@@ -1139,23 +1376,33 @@ static int recordExtract (SYSNO *sysno, const char *fname,
         rec->size[recInfo_delKeys] = 0;
     }
 
+    /* save file size of original record */
+    zebraExplain_recordBytesIncrement (zti, - recordAttr->recordSize);
+    recordAttr->recordSize = fi->file_moffset - recordOffset;
+    if (!recordAttr->recordSize)
+	recordAttr->recordSize = fi->file_max - recordOffset;
+    zebraExplain_recordBytesIncrement (zti, recordAttr->recordSize);
+
+    /* set run-number for this record */
+    recordAttr->runNumber = zebraExplain_runNumberIncrement (zti, 0);
+
     /* update store data */
     xfree (rec->info[recInfo_storeData]);
     if (rGroup->flagStoreData == 1)
     {
-        rec->size[recInfo_storeData] = fi->file_max;
-        rec->info[recInfo_storeData] = xmalloc (fi->file_max);
+        rec->size[recInfo_storeData] = recordAttr->recordSize;
+        rec->info[recInfo_storeData] = xmalloc (recordAttr->recordSize);
         if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)
         {
             logf (LOG_ERRNO|LOG_FATAL, "seek to %ld in %s", fname,
                   (long) recordOffset);
             exit (1);
         }
-        if (read (fi->fd, rec->info[recInfo_storeData], fi->file_max)
-            < fi->file_max)
+        if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)
+	    < recordAttr->recordSize)
         {
             logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s",
-                  fi->file_max, fname);
+                  recordAttr->recordSize, fname);
             exit (1);
         }
     }
@@ -1170,11 +1417,7 @@ static int recordExtract (SYSNO *sysno, const char *fname,
         rec_strdup (rGroup->databaseName, &rec->size[recInfo_databaseName]); 
 
     /* update offset */
-    xfree (rec->info[recInfo_offset]);
-
-    rec->size[recInfo_offset] = sizeof(recordOffset);
-    rec->info[recInfo_offset] = xmalloc (sizeof(recordOffset));
-    memcpy (rec->info[recInfo_offset], &recordOffset, sizeof(recordOffset));
+    recordAttr->recordOffset = recordOffset;
     
     /* commit this record */
     rec_put (records, &rec);
@@ -1298,7 +1541,7 @@ int fileExtract (SYSNO *sysno, const char *fname,
         fd = -1;
     else
     {
-        if ((fd = open (fname, O_RDONLY)) == -1)
+        if ((fd = open (fname, O_BINARY|O_RDONLY)) == -1)
         {
             logf (LOG_WARN|LOG_ERRNO, "open %s", fname);
             return 0;