X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=index%2Fextract.c;h=e06b9ae9287bbfb3d7abe85457195095ce22569e;hb=3042550d9340f8f9e3058702240e2cd376aa33fa;hp=f92ca7f932421d38a6640e8b9a72aad576a1d06f;hpb=c6959870998f868e6a0e9201739fb54aef696bc6;p=idzebra-moved-to-github.git diff --git a/index/extract.c b/index/extract.c index f92ca7f..e06b9ae 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,5 +1,5 @@ -/* $Id: extract.c,v 1.200 2005-12-09 10:45:04 adam Exp $ - Copyright (C) 1995-2005 +/* $Id: extract.c,v 1.212 2006-05-10 14:13:45 adam Exp $ + Copyright (C) 1995-2006 Index Data ApS This file is part of the Zebra server. @@ -32,15 +32,23 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include "index.h" +#include "orddict.h" #include #include -#if _FILE_OFFSET_BITS == 64 -#define PRINTF_OFF_T "%Ld" +#ifdef WIN32 +#define PRINTF_OFF_T "%I64d" +#else +/* !WIN32 */ +#if SIZEOF_OFF_T == SIZEOF_LONG_LONG +#define PRINTF_OFF_T "%lld" #else #define PRINTF_OFF_T "%ld" #endif +#endif + + #define USE_SHELLSORT 0 #if USE_SHELLSORT @@ -206,6 +214,8 @@ static void file_end (void *handle, off_t offset) } } +#define FILE_MATCH_BLANK "\t " + static char *fileMatchStr (ZebraHandle zh, zebra_rec_keys_t reckeys, const char *fname, const char *spec) @@ -216,8 +226,8 @@ static char *fileMatchStr (ZebraHandle zh, while (1) { - while (*s == ' ' || *s == '\t') - s++; + for (; *s && strchr(FILE_MATCH_BLANK, *s); s++) + ; if (!*s) break; if (*s == '(') @@ -226,21 +236,26 @@ static char *fileMatchStr (ZebraHandle zh, char attset_str[64], attname_str[64]; data1_attset *attset; int i; - char matchFlag[32]; int attSet = 1, attUse = 1; int first = 1; - - s++; - for (i = 0; *s && *s != ',' && *s != ')'; s++) - if (i < 63) + + for (s++; strchr(FILE_MATCH_BLANK, *s); s++) + ; + for (i = 0; *s && *s != ',' && *s != ')' && + !strchr(FILE_MATCH_BLANK, *s); s++) + if (i+1 < sizeof(attset_str)) attset_str[i++] = *s; attset_str[i] = '\0'; - + + for (; strchr(FILE_MATCH_BLANK, *s); s++) + ; if (*s == ',') { - s++; - for (i = 0; *s && *s != ')'; s++) - if (i < 63) + for (s++; strchr(FILE_MATCH_BLANK, *s); s++) + ; + for (i = 0; *s && *s != ')' && + !strchr(FILE_MATCH_BLANK, *s); s++) + if (i+1 < sizeof(attname_str)) attname_str[i++] = *s; attname_str[i] = '\0'; } @@ -257,12 +272,7 @@ static char *fileMatchStr (ZebraHandle zh, } searchRecordKey (zh, reckeys, attSet, attUse, ws, 32); - if (*s == ')') - { - for (i = 0; i<32; i++) - matchFlag[i] = 1; - } - else + if (*s != ')') { yaz_log (YLOG_WARN, "Missing ) in match criteria %s in group %s", spec, zh->m_group ? zh->m_group : "none"); @@ -271,7 +281,7 @@ static char *fileMatchStr (ZebraHandle zh, s++; for (i = 0; i<32; i++) - if (matchFlag[i] && ws[i]) + if (ws[i]) { if (first) { @@ -294,12 +304,12 @@ static char *fileMatchStr (ZebraHandle zh, char special[64]; const char *spec_src = NULL; const char *s1 = ++s; - while (*s1 && *s1 != ' ' && *s1 != '\t') + while (*s1 && !strchr(FILE_MATCH_BLANK, *s1)) s1++; spec_len = s1 - s; - if (spec_len > 63) - spec_len = 63; + if (spec_len > sizeof(special)-1) + spec_len = sizeof(special)-1; memcpy (special, s, spec_len); special[spec_len] = '\0'; s = s1; @@ -329,7 +339,7 @@ static char *fileMatchStr (ZebraHandle zh, while (*s && *s != stopMarker) { - if (i < 63) + if (i+1 < sizeof(tmpString)) tmpString[i++] = *s++; } if (*s) @@ -376,14 +386,15 @@ static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl) ctrl->flagShowRecords = !zh->m_flag_rw; } -static int file_extract_record(ZebraHandle zh, - SYSNO *sysno, const char *fname, - int deleteFlag, - struct file_read_info *fi, - int force_update, - RecType recType, - void *recTypeClientData) +static ZEBRA_RES file_extract_record(ZebraHandle zh, + SYSNO *sysno, const char *fname, + int deleteFlag, + struct file_read_info *fi, + int force_update, + RecType recType, + void *recTypeClientData) { + const char *match_str_to_print = ""; RecordAttr *recordAttr; int r; const char *matchStr = 0; @@ -397,7 +408,7 @@ static int file_extract_record(ZebraHandle zh, { if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0], zh->m_explain_database)) - return 0; + return ZEBRA_FAIL; } if (fi->fd != -1) @@ -444,7 +455,7 @@ static int file_extract_record(ZebraHandle zh, yaz_log_init_prefix2 (0); if (r == RECCTRL_EXTRACT_EOF) - return 0; + return ZEBRA_FAIL; else if (r == RECCTRL_EXTRACT_ERROR_GENERIC) { /* error occured during extraction ... */ @@ -454,7 +465,7 @@ static int file_extract_record(ZebraHandle zh, yaz_log (YLOG_WARN, "fail %s %s " PRINTF_OFF_T, zh->m_record_type, fname, recordOffset); } - return 0; + return ZEBRA_FAIL; } else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER) { @@ -466,13 +477,19 @@ static int file_extract_record(ZebraHandle zh, PRINTF_OFF_T, zh->m_record_type, fname, recordOffset); } - return 0; + return ZEBRA_FAIL; } if (extractCtrl.match_criteria[0]) matchStr = extractCtrl.match_criteria; } - /* perform match if sysno not known and if match criteria is specified */ + /* if matchStr is set now - we assume it's printable . + For internal matchStr (see below) we don't print */ + if (matchStr) + match_str_to_print = matchStr; + + /* perform internal match if sysno not known and if match criteria is + specified already */ if (!sysno) { sysnotmp = 0; @@ -485,12 +502,14 @@ static int file_extract_record(ZebraHandle zh, if (!matchStr) { yaz_log(YLOG_WARN, "Bad match criteria"); - return 0; + return ZEBRA_FAIL; } } if (matchStr) { - char *rinfo = dict_lookup (zh->reg->matchDict, matchStr); + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord, + matchStr); if (rinfo) { assert(*rinfo == sizeof(*sysno)); @@ -503,12 +522,12 @@ static int file_extract_record(ZebraHandle zh, /* the extraction process returned no information - the record is probably empty - unless flagShowRecords is in use */ if (!zh->m_flag_rw) - return 1; + return ZEBRA_OK; if (zh->records_processed < zh->m_file_verbose_limit) yaz_log (YLOG_WARN, "empty %s %s " PRINTF_OFF_T, zh->m_record_type, fname, recordOffset); - return 1; + return ZEBRA_OK; } if (! *sysno) @@ -519,22 +538,31 @@ static int file_extract_record(ZebraHandle zh, yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T, zh->m_record_type, fname, recordOffset); yaz_log (YLOG_WARN, "cannot delete record above (seems new)"); - return 1; + return ZEBRA_OK; } - if (zh->records_processed < zh->m_file_verbose_limit) - yaz_log (YLOG_LOG, "add %s %s " PRINTF_OFF_T, zh->m_record_type, - fname, recordOffset); - rec = rec_new (zh->reg->records); + rec = rec_new (zh->reg->records); + *sysno = rec->sysno; - + + if (zh->records_processed < zh->m_file_verbose_limit) + { + yaz_log(YLOG_LOG, "add %s %s " PRINTF_OFF_T + " " ZINT_FORMAT " %s" , + zh->m_record_type, + fname, recordOffset, *sysno, match_str_to_print); + } recordAttr = rec_init_attr (zh->reg->zei, rec); recordAttr->staticrank = extractCtrl.staticrank; if (matchStr) { - dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno); + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + dict_insert_ord(zh->reg->matchDict, db_ord, matchStr, + sizeof(*sysno), sysno); } + + #if NATTR extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); #else @@ -584,30 +612,42 @@ static int file_extract_record(ZebraHandle zh, /* record going to be deleted */ if (zebra_rec_keys_empty(delkeys)) { - yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T, - zh->m_record_type, fname, recordOffset); + yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T + " " ZINT_FORMAT, + zh->m_record_type, fname, recordOffset, *sysno); yaz_log (YLOG_WARN, "cannot delete file above, storeKeys false (1)"); } else { if (zh->records_processed < zh->m_file_verbose_limit) - yaz_log (YLOG_LOG, "delete %s %s " PRINTF_OFF_T, - zh->m_record_type, fname, recordOffset); + { + yaz_log(YLOG_LOG, "delete %s %s " PRINTF_OFF_T + " " ZINT_FORMAT " %s" , + zh->m_record_type, + fname, recordOffset, *sysno, match_str_to_print); + } zh->records_deleted++; if (matchStr) - dict_delete (zh->reg->matchDict, matchStr); + { + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + dict_delete_ord(zh->reg->matchDict, db_ord, matchStr); + } rec_del (zh->reg->records, &rec); } rec_rm (&rec); logRecord (zh); - return 1; + return ZEBRA_OK; } else { /* flush new keys for sort&search etc */ if (zh->records_processed < zh->m_file_verbose_limit) - yaz_log (YLOG_LOG, "update %s %s " PRINTF_OFF_T, - zh->m_record_type, fname, recordOffset); + { + yaz_log(YLOG_LOG, "update %s %s " PRINTF_OFF_T + " " ZINT_FORMAT " %s" , + zh->m_record_type, + fname, recordOffset, *sysno, match_str_to_print); + } recordAttr->staticrank = extractCtrl.staticrank; #if NATTR extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); @@ -717,13 +757,14 @@ static int file_extract_record(ZebraHandle zh, /* commit this record */ rec_put (zh->reg->records, &rec); logRecord (zh); - return 1; + return ZEBRA_OK; } -int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, - int deleteFlag) +ZEBRA_RES zebra_extract_file(ZebraHandle zh, SYSNO *sysno, const char *fname, + int deleteFlag) { - int r, i, fd; + ZEBRA_RES r = ZEBRA_OK; + int i, fd; char gprefix[128]; char ext[128]; char ext_res[128]; @@ -774,7 +815,7 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, &recTypeClientData))) { yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type); - return 0; + return ZEBRA_FAIL; } switch(recType->version) @@ -799,16 +840,15 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, else strcpy (full_rep, fname); - if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1) { yaz_log (YLOG_WARN|YLOG_ERRNO, "open %s", full_rep); zh->m_record_type = original_record_type; - return 0; + return ZEBRA_FAIL; } } fi = file_read_start (fd); - do + while(1) { fi->file_moffset = fi->file_offset; fi->file_more = 0; /* file_end not called (yet) */ @@ -819,8 +859,15 @@ int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname, fi->file_offset = fi->file_moffset; lseek(fi->fd, fi->file_moffset, SEEK_SET); } + if (r != ZEBRA_OK) + { + break; + } + if (sysno) + { + break; + } } - while (r && !sysno); file_read_stop (fi); if (fd != -1) close (fd); @@ -911,7 +958,7 @@ ZEBRA_RES buffer_extract_record(ZebraHandle zh, if (!recType) { - yaz_log (YLOG_WARN, "No such record type: %s", zh->m_record_type); + yaz_log (YLOG_WARN, "No such record type: %s", recordType); return ZEBRA_FAIL; } @@ -964,8 +1011,11 @@ ZEBRA_RES buffer_extract_record(ZebraHandle zh, } } } - if (matchStr) { - char *rinfo = dict_lookup (zh->reg->matchDict, matchStr); + if (matchStr) + { + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord, + matchStr); if (rinfo) { assert(*rinfo == sizeof(*sysno)); @@ -1003,9 +1053,12 @@ ZEBRA_RES buffer_extract_record(ZebraHandle zh, if (matchStr) { - dict_insert (zh->reg->matchDict, matchStr, - sizeof(*sysno), sysno); + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + dict_insert_ord(zh->reg->matchDict, db_ord, matchStr, + sizeof(*sysno), sysno); } + + #if NATTR extract_flushSortKeys (zh, *sysno, 1, zh->reg->sortKeys); #else @@ -1080,7 +1133,10 @@ ZEBRA_RES buffer_extract_record(ZebraHandle zh, pr_fname, (long) recordOffset); zh->records_deleted++; if (matchStr) - dict_delete (zh->reg->matchDict, matchStr); + { + int db_ord = zebraExplain_get_database_ord(zh->reg->zei); + dict_delete_ord(zh->reg->matchDict, db_ord, matchStr); + } rec_del (zh->reg->records, &rec); } rec_rm (&rec); @@ -1291,6 +1347,58 @@ int explain_extract (void *handle, Record rec, data1_node *n) return 0; } +void extract_rec_keys_adjust(ZebraHandle zh, int is_insert, + zebra_rec_keys_t reckeys) +{ + ZebraExplainInfo zei = zh->reg->zei; + struct ord_stat { + int no; + int ord; + struct ord_stat *next; + }; + + if (zebra_rec_keys_rewind(reckeys)) + { + struct ord_stat *ord_list = 0; + struct ord_stat *p; + size_t slen; + const char *str; + struct it_key key_in; + while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in)) + { + int ord = key_in.mem[0]; + + for (p = ord_list; p ; p = p->next) + if (p->ord == ord) + { + p->no++; + break; + } + if (!p) + { + p = xmalloc(sizeof(*p)); + p->no = 1; + p->ord = ord; + p->next = ord_list; + ord_list = p; + } + } + + p = ord_list; + while (p) + { + struct ord_stat *p1 = p; + + if (is_insert) + zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1); + else + zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1); + p = p->next; + xfree(p1); + } + } +} + void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, int cmd, zebra_rec_keys_t reckeys, @@ -1298,6 +1406,8 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, { ZebraExplainInfo zei = zh->reg->zei; + extract_rec_keys_adjust(zh, cmd, reckeys); + if (!zh->reg->key_buf) { int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8")); @@ -1344,7 +1454,7 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, zh->reg->key_buf_used += key_SU_encode(ch, (char*)zh->reg->key_buf + zh->reg->key_buf_used); - + /* copy the 0-terminated stuff from str to output */ memcpy((char*)zh->reg->key_buf + zh->reg->key_buf_used, str, slen); zh->reg->key_buf_used += slen; @@ -1355,6 +1465,12 @@ void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno, if (zh->m_staticrank) /* rank config enabled ? */ { + if (staticrank < 0) + { + yaz_log(YLOG_WARN, "staticrank = %ld. Setting to 0", + (long) staticrank); + staticrank = 0; + } *keyp++ = staticrank; key_out.len = 4; } @@ -1530,7 +1646,8 @@ ZEBRA_RES zebra_snippets_rec_keys(ZebraHandle zh, ord = key.mem[0]; zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, - 0/* db */, 0/* set */, 0/* use */); + 0/* db */, 0/* set */, 0/* use */, + 0 /* string_index */); assert(index_type); zebra_term_untrans_iconv(zh, nmem, index_type, &dst_term, str); @@ -1559,7 +1676,7 @@ void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys) assert(key.len <= 4 && key.len > 2); zebraExplain_lookup_ord(zh->reg->zei, - key.mem[0], &index_type, &db, 0, 0); + key.mem[0], &index_type, &db, 0, 0, 0); seqno = (int) key.mem[key.len-1]; @@ -1932,6 +2049,7 @@ void encode_key_init (struct encode_info *i) i->prevcmd=-1; i->keylen=0; i->encode_handle = iscz1_start(); + i->decode_handle = iscz1_start(); } #define OLDENCODE 1 @@ -1953,19 +2071,42 @@ void encode_key_write (char *k, struct encode_info *i, FILE *outf) /* and copy & align key so we can mangle */ memcpy (&key, k+1, sizeof(struct it_key)); /* *k is insert/delete */ +#if 0 + /* debugging */ + key_logdump_txt(YLOG_LOG, &key, *k ? "i" : "d"); +#endif + assert(key.mem[0] >= 0); + bp0 = bp++; iscz1_encode(i->encode_handle, &bp, &src); + *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */ if (fwrite (i->buf, bp - i->buf, 1, outf) != 1) { yaz_log (YLOG_FATAL|YLOG_ERRNO, "fwrite"); exit (1); } + +#if 0 + /* debugging */ + if (1) + { + struct it_key key2; + const char *src = bp0+1; + char *dst = (char*) &key2; + iscz1_decode(i->decode_handle, &dst, &src); + + key_logdump_txt(YLOG_LOG, &key2, *k ? "i" : "d"); + + assert(key2.mem[1]); + } +#endif } void encode_key_flush (struct encode_info *i, FILE *outf) -{ /* dummy routine */ +{ iscz1_stop(i->encode_handle); + iscz1_stop(i->decode_handle); } #else @@ -2069,3 +2210,11 @@ void encode_key_flush (struct encode_info *i, FILE *outf) i->prevseq=0; } #endif +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ +