X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=recctrl%2Fregxread.c;h=f6f74c8aa316990298a108f09545f16bdb2eb076;hp=a9cd6a2d6f75089233159162d0c6e6630bd4964c;hb=5437b50633032595afe6f87dc0f989bc92a5aea8;hpb=9b6159f0ac334f4b5984e05bed5439863f984861 diff --git a/recctrl/regxread.c b/recctrl/regxread.c index a9cd6a2..f6f74c8 100644 --- a/recctrl/regxread.c +++ b/recctrl/regxread.c @@ -1,160 +1,43 @@ -/* - * Copyright (C) 1994-1998, Index Data - * All rights reserved. - * Sebastian Hammer, Adam Dickmeiss - * - * $Log: regxread.c,v $ - * Revision 1.19 1998-11-03 10:22:39 adam - * Fixed memory leak that could occur for when large data1 node were - * concatenated. Data-type data1_nodes may have multiple nodes. - * - * Revision 1.18 1998/10/15 13:11:47 adam - * Added support for option -record for "end element". When specified - * end element will mark end-of-record when at outer-level. - * - * Revision 1.17 1998/07/01 10:13:51 adam - * Minor fix. - * - * Revision 1.16 1998/06/30 15:15:09 adam - * Tags are trimmed: white space removed before- and after the tag. - * - * Revision 1.15 1998/06/30 12:55:45 adam - * Bug fix. - * - * Revision 1.14 1998/03/05 08:41:00 adam - * Implemented rule contexts. - * - * Revision 1.13 1997/12/12 06:33:58 adam - * Fixed bug that showed up when multiple filter where used. - * Made one routine thread-safe. - * - * Revision 1.12 1997/11/18 10:03:24 adam - * Member num_children removed from data1_node. - * - * Revision 1.11 1997/11/06 11:41:01 adam - * Implemented "begin variant" for the sgml.regx filter. - * - * Revision 1.10 1997/10/31 12:36:12 adam - * Minor change that avoids compiler warning. - * - * Revision 1.9 1997/09/29 09:02:49 adam - * Fixed small bug (introduced by previous commit). - * - * Revision 1.8 1997/09/17 12:19:22 adam - * Zebra version corresponds to YAZ version 1.4. - * Changed Zebra server so that it doesn't depend on global common_resource. 
- * - * Revision 1.7 1997/07/15 16:33:07 adam - * Check for zero length in execData. - * - * Revision 1.6 1997/02/24 10:41:51 adam - * Cleanup of code and commented out the "end element-end-record" code. - * - * Revision 1.5 1997/02/19 16:22:33 adam - * Fixed "end element" to terminate record in outer-most level. - * - * Revision 1.4 1997/02/12 20:42:58 adam - * Changed some log messages. - * - * Revision 1.3 1996/11/08 14:05:33 adam - * Bug fix: data1 node member u.tag.get_bytes weren't initialized. - * - * Revision 1.2 1996/10/29 14:02:09 adam - * Doesn't use the global data1_tabpath (from YAZ). Instead the function - * data1_get_tabpath is used. - * - * Revision 1.1 1996/10/11 10:57:30 adam - * New module recctrl. Used to manage records (extract/retrieval). - * - * Revision 1.24 1996/06/17 14:25:31 adam - * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG. - * - * Revision 1.23 1996/06/04 10:19:00 adam - * Minor changes - removed include of ctype.h. - * - * Revision 1.22 1996/06/03 15:23:13 adam - * Bug fix: /../ BODY /../ - pattern didn't match EOF. - * - * Revision 1.21 1996/05/14 16:58:38 adam - * Minor change. - * - * Revision 1.20 1996/05/01 13:46:36 adam - * First work on multiple records in one file. - * New option, -offset, to the "unread" command in the filter module. - * - * Revision 1.19 1996/02/12 16:18:20 adam - * Yet another bug fix in implementation of unread command. - * - * Revision 1.18 1996/02/12 16:07:54 adam - * Bug fix in new unread command. - * - * Revision 1.17 1996/02/12 15:56:11 adam - * New code command: unread. - * - * Revision 1.16 1996/01/17 14:57:51 adam - * Prototype changed for reader functions in extract/retrieve. File - * is identified by 'void *' instead of 'int. - * - * Revision 1.15 1996/01/08 19:15:47 adam - * New input filter that works! - * - * Revision 1.14 1996/01/08 09:10:38 adam - * Yet another complete rework on this module. 
- * - * Revision 1.13 1995/12/15 17:21:50 adam - * This version is able to set data.formatted_text in data1-nodes. - * - * Revision 1.12 1995/12/15 16:20:10 adam - * The filter files (*.flt) are read from the path given by data1_tabpath. - * - * Revision 1.11 1995/12/15 12:35:16 adam - * Better logging. - * - * Revision 1.10 1995/12/15 10:35:36 adam - * Misc. bug fixes. - * - * Revision 1.9 1995/12/14 16:38:48 adam - * Completely new attempt to make regular expression parsing. - * - * Revision 1.8 1995/12/13 17:16:59 adam - * Small changes. - * - * Revision 1.7 1995/12/13 16:51:58 adam - * Modified to set last_child in data1_nodes. - * Uses destroy handler to free up data text nodes. - * - * Revision 1.6 1995/12/13 13:45:37 quinn - * Changed data1 to use nmem. - * - * Revision 1.5 1995/12/11 09:12:52 adam - * The rec_get function returns NULL if record doesn't exist - will - * happen in the server if the result set records have been deleted since - * the creation of the set (i.e. the search). - * The server saves a result temporarily if it is 'volatile', i.e. the - * set is register dependent. - * - * Revision 1.4 1995/12/05 16:57:40 adam - * More work on regular patterns. - * - * Revision 1.3 1995/12/05 09:37:09 adam - * One malloc was renamed to xmalloc. - * - * Revision 1.2 1995/12/04 17:59:24 adam - * More work on regular expression conversion. - * - * Revision 1.1 1995/12/04 14:25:30 adam - * Started work on regular expression parsed input to structured records. - * - */ +/* $Id: regxread.c,v 1.55 2004-11-19 10:27:13 heikki Exp $ + Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004 + Index Data Aps + +This file is part of the Zebra server. + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. 
+ +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with Zebra; see the file LICENSE.zebra. If not, write to the +Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. +*/ + + #include #include #include #include -#include +#include #include #include -#include "grsread.h" +#include + +#if HAVE_TCL_H +#include + +#if MAJOR_VERSION >= 8 +#define HAVE_TCL_OBJECTS +#endif +#endif #define REGX_DEBUG 0 @@ -168,9 +51,13 @@ #define REGX_END 4 #define REGX_CODE 5 #define REGX_CONTEXT 6 +#define REGX_INIT 7 struct regxCode { char *str; +#if HAVE_TCL_OBJECTS + Tcl_Obj *tcl_obj; +#endif }; struct lexRuleAction { @@ -201,12 +88,19 @@ struct lexContext { struct lexRule *rules; struct lexRuleInfo **fastRule; int ruleNo; + int initFlag; struct lexRuleAction *beginActionList; struct lexRuleAction *endActionList; + struct lexRuleAction *initActionList; struct lexContext *next; }; +struct lexConcatBuf { + int max; + char *buf; +}; + struct lexSpec { char *name; struct lexContext *context; @@ -218,6 +112,9 @@ struct lexSpec { int lineNo; NMEM m; data1_handle dh; +#if HAVE_TCL_H + Tcl_Interp *tcl_interp; +#endif void *f_win_fh; void (*f_win_ef)(void *, off_t); @@ -228,8 +125,22 @@ struct lexSpec { int (*f_win_rf)(void *, char *, size_t); off_t (*f_win_sf)(void *, off_t); + struct lexConcatBuf *concatBuf; + int maxLevel; + data1_node **d1_stack; + int d1_level; + int stop_flag; + + int *arg_start; + int *arg_end; + int arg_no; + int ptr; }; +struct lexSpecs { + struct lexSpec *spec; + char type[256]; +}; static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos, int *size) @@ -247,7 +158,7 @@ static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos, spec->f_win_start = 
start_pos; if (!spec->f_win_buf) - spec->f_win_buf = xmalloc (spec->f_win_size); + spec->f_win_buf = (char *) xmalloc (spec->f_win_size); *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf, spec->f_win_size); spec->f_win_end = spec->f_win_start + *size; @@ -293,6 +204,10 @@ static void regxCodeDel (struct regxCode **pp) struct regxCode *p = *pp; if (p) { +#if HAVE_TCL_OBJECTS + if (p->tcl_obj) + Tcl_DecrRefCount (p->tcl_obj); +#endif xfree (p->str); xfree (p); *pp = NULL; @@ -303,10 +218,15 @@ static void regxCodeMk (struct regxCode **pp, const char *buf, int len) { struct regxCode *p; - p = xmalloc (sizeof(*p)); - p->str = xmalloc (len+1); + p = (struct regxCode *) xmalloc (sizeof(*p)); + p->str = (char *) xmalloc (len+1); memcpy (p->str, buf, len); p->str[len] = '\0'; +#if HAVE_TCL_OBJECTS + p->tcl_obj = Tcl_NewStringObj ((char *) buf, len); + if (p->tcl_obj) + Tcl_IncrRefCount (p->tcl_obj); +#endif *pp = p; } @@ -344,15 +264,17 @@ static void actionListDel (struct lexRuleAction **rap) static struct lexContext *lexContextCreate (const char *name) { - struct lexContext *p = xmalloc (sizeof(*p)); + struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p)); p->name = xstrdup (name); p->ruleNo = 1; + p->initFlag = 0; p->dfa = lexSpecDFA (); p->rules = NULL; p->fastRule = NULL; p->beginActionList = NULL; p->endActionList = NULL; + p->initActionList = NULL; p->next = NULL; return p; } @@ -361,6 +283,7 @@ static void lexContextDestroy (struct lexContext *p) { struct lexRule *rp, *rp1; + dfa_delete (&p->dfa); xfree (p->fastRule); for (rp = p->rules; rp; rp = rp1) { @@ -370,23 +293,40 @@ static void lexContextDestroy (struct lexContext *p) } actionListDel (&p->beginActionList); actionListDel (&p->endActionList); + actionListDel (&p->initActionList); xfree (p->name); xfree (p); } -static struct lexSpec *lexSpecCreate (const char *name) +static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh) { struct lexSpec *p; - - p = xmalloc (sizeof(*p)); 
- p->name = xmalloc (strlen(name)+1); + int i; + + p = (struct lexSpec *) xmalloc (sizeof(*p)); + p->name = (char *) xmalloc (strlen(name)+1); strcpy (p->name, name); +#if HAVE_TCL_H + p->tcl_interp = 0; +#endif + p->dh = dh; p->context = NULL; p->context_stack_size = 100; - p->context_stack = xmalloc (sizeof(*p->context_stack) * - p->context_stack_size); + p->context_stack = (struct lexContext **) + xmalloc (sizeof(*p->context_stack) * p->context_stack_size); p->f_win_buf = NULL; + + p->maxLevel = 128; + p->concatBuf = (struct lexConcatBuf *) + xmalloc (sizeof(*p->concatBuf) * p->maxLevel); + for (i = 0; i < p->maxLevel; i++) + { + p->concatBuf[i].max = 0; + p->concatBuf[i].buf = 0; + } + p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel); + p->d1_level = 0; return p; } @@ -394,11 +334,17 @@ static void lexSpecDestroy (struct lexSpec **pp) { struct lexSpec *p; struct lexContext *lt; + int i; assert (pp); p = *pp; if (!p) return ; + + for (i = 0; i < p->maxLevel; i++) + xfree (p->concatBuf[i].buf); + xfree (p->concatBuf); + lt = p->context; while (lt) { @@ -406,9 +352,14 @@ static void lexSpecDestroy (struct lexSpec **pp) lexContextDestroy (lt); lt = lt_next; } +#if HAVE_TCL_OBJECTS + if (p->tcl_interp) + Tcl_DeleteInterp (p->tcl_interp); +#endif xfree (p->name); xfree (p->f_win_buf); xfree (p->context_stack); + xfree (p->d1_stack); xfree (p); *pp = NULL; } @@ -419,7 +370,7 @@ static int readParseToken (const char **cpp, int *len) char cmd[32]; int i, level; - while (*cp == ' ' || *cp == '\t' || *cp == '\n') + while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r') cp++; switch (*cp) { @@ -454,16 +405,17 @@ static int readParseToken (const char **cpp, int *len) cmd[i] = *cp + 'a' - 'A'; else break; - if (i < sizeof(cmd)-2) + if (i < (int) sizeof(cmd)-2) i++; cp++; } cmd[i] = '\0'; if (i == 0) { - logf (LOG_WARN, "bad character %d %c", *cp, *cp); + yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp); cp++; - while (*cp && *cp != ' ' 
&& *cp != '\t' && *cp != '\n') + while (*cp && *cp != ' ' && *cp != '\t' && + *cp != '\n' && *cp != '\r') cp++; *cpp = cp; return 0; @@ -477,9 +429,11 @@ static int readParseToken (const char **cpp, int *len) return REGX_BODY; else if (!strcmp (cmd, "context")) return REGX_CONTEXT; + else if (!strcmp (cmd, "init")) + return REGX_INIT; else { - logf (LOG_WARN, "bad command %s", cmd); + yaz_log (YLOG_WARN, "bad command %s", cmd); return 0; } } @@ -500,13 +454,13 @@ static int actionListMk (struct lexSpec *spec, const char *s, bodyMark = 1; continue; case REGX_CODE: - *ap = xmalloc (sizeof(**ap)); + *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap)); (*ap)->which = tok; regxCodeMk (&(*ap)->u.code, s, len); s += len+1; break; case REGX_PATTERN: - *ap = xmalloc (sizeof(**ap)); + *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap)); (*ap)->which = tok; (*ap)->u.pattern.body = bodyMark; bodyMark = 0; @@ -517,17 +471,22 @@ static int actionListMk (struct lexSpec *spec, const char *s, { xfree (*ap); *ap = NULL; - logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0); + yaz_log (YLOG_WARN, "regular expression error '%.*s'", s-s0, s0); return -1; } + if (debug_dfa_tran) + printf ("pattern: %.*s\n", s-s0, s0); dfa_mkstate ((*ap)->u.pattern.dfa); s++; break; case REGX_BEGIN: - logf (LOG_WARN, "cannot use begin here"); + yaz_log (YLOG_WARN, "cannot use BEGIN here"); + continue; + case REGX_INIT: + yaz_log (YLOG_WARN, "cannot use INIT here"); continue; case REGX_END: - *ap = xmalloc (sizeof(**ap)); + *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap)); (*ap)->which = tok; break; } @@ -550,7 +509,7 @@ int readOneSpec (struct lexSpec *spec, const char *s) tok = readParseToken (&s, &len); if (tok != REGX_CODE) { - logf (LOG_WARN, "missing name after CONTEXT keyword"); + yaz_log (YLOG_WARN, "missing name after CONTEXT keyword"); return 0; } if (len > 31) @@ -575,23 +534,27 @@ int readOneSpec (struct lexSpec *spec, const char *s) actionListDel 
(&spec->context->endActionList); actionListMk (spec, s, &spec->context->endActionList); break; + case REGX_INIT: + actionListDel (&spec->context->initActionList); + actionListMk (spec, s, &spec->context->initActionList); + break; case REGX_PATTERN: #if REGX_DEBUG - logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s); + yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s); #endif r = dfa_parse (spec->context->dfa, &s); if (r) { - logf (LOG_WARN, "regular expression error. r=%d", r); + yaz_log (YLOG_WARN, "regular expression error. r=%d", r); return -1; } if (*s != '/') { - logf (LOG_WARN, "expects / at end of pattern. got %c", *s); + yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s); return -1; } s++; - rp = xmalloc (sizeof(*rp)); + rp = (struct lexRule *) xmalloc (sizeof(*rp)); rp->info.no = spec->context->ruleNo++; rp->next = spec->context->rules; spec->context->rules = rp; @@ -603,27 +566,48 @@ int readOneSpec (struct lexSpec *spec, const char *s) int readFileSpec (struct lexSpec *spec) { struct lexContext *lc; - char *lineBuf; - int lineSize = 512; int c, i, errors = 0; - FILE *spec_inf; + FILE *spec_inf = 0; + WRBUF lineBuf; + char fname[256]; - lineBuf = xmalloc (1+lineSize); - logf (LOG_LOG, "reading regx filter %s.flt", spec->name); - sprintf (lineBuf, "%s.flt", spec->name); - if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), - lineBuf, "r"))) +#if HAVE_TCL_H + if (spec->tcl_interp) + { + sprintf (fname, "%s.tflt", spec->name); + spec_inf = data1_path_fopen (spec->dh, fname, "r"); + } +#endif + if (!spec_inf) + { + sprintf (fname, "%s.flt", spec->name); + spec_inf = data1_path_fopen (spec->dh, fname, "r"); + } + if (!spec_inf) { - logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name); - xfree (lineBuf); + yaz_log (YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name); return -1; } + yaz_log (YLOG_LOG, "reading regx filter %s", fname); +#if HAVE_TCL_H + if (spec->tcl_interp) + yaz_log (YLOG_LOG, "Tcl 
enabled"); +#endif + +#if 0 + debug_dfa_trav = 0; + debug_dfa_tran = 1; + debug_dfa_followpos = 0; + dfa_verbose = 1; +#endif + + lineBuf = wrbuf_alloc(); spec->lineNo = 0; c = getc (spec_inf); while (c != EOF) { - int off = 0; - if (c == '#' || c == '\n' || c == ' ' || c == '\t') + wrbuf_rewind (lineBuf); + if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r') { while (c != '\n' && c != EOF) c = getc (spec_inf); @@ -634,12 +618,14 @@ int readFileSpec (struct lexSpec *spec) else { int addLine = 0; - - lineBuf[off++] = c; + while (1) { int c1 = c; + wrbuf_putc(lineBuf, c); c = getc (spec_inf); + while (c == '\r') + c = getc (spec_inf); if (c == EOF) break; if (c1 == '\n') @@ -648,28 +634,20 @@ int readFileSpec (struct lexSpec *spec) break; addLine++; } - lineBuf[off] = c; - if (off < lineSize) - off++; } - lineBuf[off] = '\0'; - readOneSpec (spec, lineBuf); + wrbuf_putc(lineBuf, '\0'); + readOneSpec (spec, wrbuf_buf(lineBuf)); spec->lineNo += addLine; } } fclose (spec_inf); - xfree (lineBuf); + wrbuf_free(lineBuf, 1); -#if 0 - debug_dfa_trav = 1; - debug_dfa_tran = 1; - debug_dfa_followpos = 1; - dfa_verbose = 1; -#endif for (lc = spec->context; lc; lc = lc->next) { struct lexRule *rp; - lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo); + lc->fastRule = (struct lexRuleInfo **) + xmalloc (sizeof(*lc->fastRule) * lc->ruleNo); for (i = 0; i < lc->ruleNo; i++) lc->fastRule[i] = NULL; for (rp = lc->rules; rp; rp = rp->next) @@ -678,84 +656,154 @@ int readFileSpec (struct lexSpec *spec) } if (errors) return -1; + return 0; } +#if 0 static struct lexSpec *curLexSpec = NULL; - -static void destroy_data (struct data1_node *n) -{ - assert (n->which == DATA1N_data); - xfree (n->u.data.data); -} +#endif static void execData (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, - const char *ebuf, int elen, int formatted_text) + const char *ebuf, int elen, int formatted_text, + const char *attribute_str, int attribute_len) { struct data1_node *res, 
*parent; + int org_len; if (elen == 0) /* shouldn't happen, but it does! */ return ; #if REGX_DEBUG - if (elen > 40) - logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen, - ebuf, 15, ebuf + elen-15); + if (elen > 80) + yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen, + ebuf, 40, ebuf + elen-40); + else if (elen == 1 && ebuf[0] == '\n') + { + yaz_log (YLOG_LOG, "data(new line)"); + } else if (elen > 0) - logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf); + yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf); else - logf (LOG_DEBUG, "data (%d bytes)", elen); + yaz_log (YLOG_LOG, "data(%d bytes)", elen); #endif - if (*d1_level <= 1) + if (spec->d1_level <= 1) return; - parent = d1_stack[*d1_level -1]; + parent = spec->d1_stack[spec->d1_level -1]; assert (parent); - res = data1_mk_node (spec->dh, spec->m); - res->parent = parent; - res->which = DATA1N_data; - res->u.data.what = DATA1I_text; - res->u.data.len = elen; - res->u.data.formatted_text = formatted_text; - if (elen > DATA1_LOCALDATA) - res->u.data.data = nmem_malloc (spec->m, elen); - else - res->u.data.data = res->lbuf; - memcpy (res->u.data.data, ebuf, elen); - res->root = parent->root; - - parent->last_child = res; - if (d1_stack[*d1_level]) - d1_stack[*d1_level]->next = res; - else - parent->child = res; - d1_stack[*d1_level] = res; + if (attribute_str) + { + data1_xattr **ap; + res = parent; + if (res->which != DATA1N_tag) + return; + /* sweep through exising attributes.. */ + for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next) + if (strlen((*ap)->name) == attribute_len && + !memcmp((*ap)->name, attribute_str, attribute_len)) + break; + if (!*ap) + { + /* new attribute. 
Create it with name + value */ + *ap = nmem_malloc(spec->m, sizeof(**ap)); + + (*ap)->name = nmem_malloc(spec->m, attribute_len+1); + memcpy((*ap)->name, attribute_str, attribute_len); + (*ap)->name[attribute_len] = '\0'; + + (*ap)->value = nmem_malloc(spec->m, elen+1); + memcpy((*ap)->value, ebuf, elen); + (*ap)->value[elen] = '\0'; + (*ap)->next = 0; + } + else + { + /* append to value if attribute already exists */ + char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value)); + strcpy(nv, (*ap)->value); + memcpy (nv + strlen(nv), ebuf, elen); + nv[strlen(nv)+elen] = '\0'; + (*ap)->value = nv; + } + } + else + { + if ((res = spec->d1_stack[spec->d1_level]) && + res->which == DATA1N_data) + org_len = res->u.data.len; + else + { + org_len = 0; + + res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent); + res->u.data.what = DATA1I_text; + res->u.data.len = 0; + res->u.data.formatted_text = formatted_text; + res->u.data.data = 0; + + if (spec->d1_stack[spec->d1_level]) + spec->d1_stack[spec->d1_level]->next = res; + spec->d1_stack[spec->d1_level] = res; + } + if (org_len + elen >= spec->concatBuf[spec->d1_level].max) + { + char *old_buf, *new_buf; + + spec->concatBuf[spec->d1_level].max = org_len + elen + 256; + new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max); + if ((old_buf = spec->concatBuf[spec->d1_level].buf)) + { + memcpy (new_buf, old_buf, org_len); + xfree (old_buf); + } + spec->concatBuf[spec->d1_level].buf = new_buf; + } + memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen); + res->u.data.len += elen; + } } static void execDataP (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, const char *ebuf, int elen, int formatted_text) { - execData (spec, d1_stack, d1_level, ebuf, elen, formatted_text); + execData (spec, ebuf, elen, formatted_text, 0, 0); +} + +static void tagDataRelease (struct lexSpec *spec) +{ + data1_node *res; + + if ((res = spec->d1_stack[spec->d1_level]) && + res->which == DATA1N_data && 
+ res->u.data.what == DATA1I_text) + { + assert (!res->u.data.data); + assert (res->u.data.len > 0); + if (res->u.data.len > DATA1_LOCALDATA) + res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len); + else + res->u.data.data = res->lbuf; + memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf, + res->u.data.len); + } } static void variantBegin (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, const char *class_str, int class_len, const char *type_str, int type_len, const char *value_str, int value_len) { - struct data1_node *parent = d1_stack[*d1_level -1]; + struct data1_node *parent = spec->d1_stack[spec->d1_level -1]; char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL]; data1_vartype *tp; int i; data1_node *res; - if (*d1_level == 0) + if (spec->d1_level == 0) { - logf (LOG_WARN, "in variant begin. No record type defined"); + yaz_log (YLOG_WARN, "in variant begin. No record type defined"); return ; } if (class_len >= DATA1_MAX_SYMBOL) @@ -769,46 +817,35 @@ static void variantBegin (struct lexSpec *spec, ttype[type_len] = '\0'; #if REGX_DEBUG - logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, *d1_level); + yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype, + spec->d1_level); #endif if (!(tp = - data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset, - tclass, ttype))) + data1_getvartypeby_absyn(spec->dh, parent->root->u.root.absyn, + tclass, ttype))) return; if (parent->which != DATA1N_variant) { - res = data1_mk_node (spec->dh, spec->m); - res->parent = parent; - res->which = DATA1N_variant; - res->u.variant.type = 0; - res->u.variant.value = 0; - res->root = parent->root; - - parent->last_child = res; - if (d1_stack[*d1_level]) - d1_stack[*d1_level]->next = res; - else - parent->child = res; - d1_stack[*d1_level] = res; - d1_stack[++(*d1_level)] = NULL; + res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent); + if (spec->d1_stack[spec->d1_level]) + tagDataRelease (spec); + 
spec->d1_stack[spec->d1_level] = res; + spec->d1_stack[++(spec->d1_level)] = NULL; } - for (i = *d1_level-1; d1_stack[i]->which == DATA1N_variant; i--) - if (d1_stack[i]->u.variant.type == tp) + for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--) + if (spec->d1_stack[i]->u.variant.type == tp) { - *d1_level = i; + spec->d1_level = i; break; } #if REGX_DEBUG - logf (LOG_DEBUG, "variant node (%d)", *d1_level); + yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level); #endif - parent = d1_stack[*d1_level-1]; - res = data1_mk_node (spec->dh, spec->m); - res->parent = parent; - res->which = DATA1N_variant; - res->root = parent->root; + parent = spec->d1_stack[spec->d1_level-1]; + res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent); res->u.variant.type = tp; if (value_len >= DATA1_LOCALDATA) @@ -818,13 +855,10 @@ static void variantBegin (struct lexSpec *spec, res->u.variant.value = res->lbuf; - parent->last_child = res; - if (d1_stack[*d1_level]) - d1_stack[*d1_level]->next = res; - else - parent->child = res; - d1_stack[*d1_level] = res; - d1_stack[++(*d1_level)] = NULL; + if (spec->d1_stack[spec->d1_level]) + tagDataRelease (spec); + spec->d1_stack[spec->d1_level] = res; + spec->d1_stack[++(spec->d1_level)] = NULL; } static void tagStrip (const char **tag, int *len) @@ -841,100 +875,77 @@ static void tagStrip (const char **tag, int *len) } static void tagBegin (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, const char *tag, int len) { - struct data1_node *parent = d1_stack[*d1_level -1]; - data1_element *elem = NULL; - data1_node *partag = get_parent_tag(spec->dh, parent); - data1_node *res; - data1_element *e = NULL; - int localtag = 0; - - if (*d1_level == 0) + if (spec->d1_level == 0) { - logf (LOG_WARN, "in element begin. No record type defined"); + yaz_log (YLOG_WARN, "in element begin. 
No record type defined"); return ; } tagStrip (&tag, &len); - - res = data1_mk_node (spec->dh, spec->m); - res->parent = parent; - res->which = DATA1N_tag; - res->u.tag.get_bytes = -1; - - if (len >= DATA1_LOCALDATA) - res->u.tag.tag = nmem_malloc (spec->m, len+1); - else - res->u.tag.tag = res->lbuf; + if (spec->d1_stack[spec->d1_level]) + tagDataRelease (spec); - memcpy (res->u.tag.tag, tag, len); - res->u.tag.tag[len] = '\0'; - #if REGX_DEBUG - logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, *d1_level); + yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level); #endif - if (parent->which == DATA1N_variant) - return ; - if (partag) - if (!(e = partag->u.tag.element)) - localtag = 1; - - elem = data1_getelementbytagname (spec->dh, d1_stack[0]->u.root.absyn, - e, res->u.tag.tag); - res->u.tag.element = elem; - res->u.tag.node_selected = 0; - res->u.tag.make_variantlist = 0; - res->u.tag.no_data_requested = 0; - res->root = parent->root; - - parent->last_child = res; - if (d1_stack[*d1_level]) - d1_stack[*d1_level]->next = res; - else - parent->child = res; - d1_stack[*d1_level] = res; - d1_stack[++(*d1_level)] = NULL; + + spec->d1_stack[spec->d1_level] = data1_mk_tag_n ( + spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]); + spec->d1_stack[++(spec->d1_level)] = NULL; } -static void tagEnd (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, int min_level, +static void tagEnd (struct lexSpec *spec, int min_level, const char *tag, int len) { tagStrip (&tag, &len); - while (*d1_level > min_level) + while (spec->d1_level > min_level) { - (*d1_level)--; - if (*d1_level == 0) + tagDataRelease (spec); + (spec->d1_level)--; + if (spec->d1_level == 0) break; - if ((d1_stack[*d1_level]->which == DATA1N_tag) && + if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) && (!tag || - (strlen(d1_stack[*d1_level]->u.tag.tag) == (size_t) len && - !memcmp (d1_stack[*d1_level]->u.tag.tag, tag, len)))) + 
(strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) == + (size_t) len && + !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len)))) break; } #if REGX_DEBUG - logf (LOG_DEBUG, "end tag (%d)", *d1_level); + yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level); #endif } static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr, - struct DFA *dfa) + struct DFA *dfa, int greedy) { struct DFA_state *state = dfa->states[0]; struct DFA_tran *t; - unsigned char c; + unsigned char c = 0; unsigned char c_prev = 0; int ptr = *pptr; /* current pointer */ int start_ptr = *pptr; /* first char of match */ int last_ptr = 0; /* last char of match */ int last_rule = 0; /* rule number of current match */ + int restore_ptr = 0; int i; + if (ptr) + { + --ptr; + c = f_win_advance (spec, &ptr); + } while (1) { + if (dfa->states[0] == state) + { + c_prev = c; + restore_ptr = ptr; + } c = f_win_advance (spec, &ptr); + if (ptr == F_WIN_EOF) { if (last_rule) @@ -945,10 +956,11 @@ static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr, } break; } + t = state->trans; i = state->tran_no; while (1) - if (--i < 0) + if (--i < 0) /* no transition for character c */ { if (last_rule) { @@ -957,27 +969,28 @@ static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr, return 1; } state = dfa->states[0]; + + ptr = restore_ptr; + c = f_win_advance (spec, &ptr); + start_ptr = ptr; - c_prev = c; + break; } else if (c >= t->ch[0] && c <= t->ch[1]) { state = dfa->states[t->to]; - if (state->rule_no) - { - if (c_prev == '\n') - { - last_rule = state->rule_no; - last_ptr = ptr; - } - else - { - last_rule = state->rule_nno; - last_ptr = ptr; - } - } - break; + if (state->rule_no && c_prev == '\n') + { + last_rule = state->rule_no; + last_ptr = ptr; + } + else if (state->rule_nno) + { + last_rule = state->rule_nno; + last_ptr = ptr; + } + break; } else t++; @@ -986,7 +999,6 @@ static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr, } static int execTok (struct lexSpec *spec, 
const char **src, - int arg_no, int *arg_start, int *arg_end, const char **tokBuf, int *tokLen) { const char *s = *src; @@ -1001,16 +1013,17 @@ static int execTok (struct lexSpec *spec, const char **src, s++; while (*s >= '0' && *s <= '9') n = n*10 + (*s++ -'0'); - if (arg_no == 0) + if (spec->arg_no == 0) { *tokBuf = ""; *tokLen = 0; } else { - if (n >= arg_no) - n = arg_no-1; - *tokBuf = f_win_get (spec, arg_start[n], arg_end[n], tokLen); + if (n >= spec->arg_no) + n = spec->arg_no-1; + *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n], + tokLen); } } else if (*s == '\"') @@ -1031,7 +1044,8 @@ static int execTok (struct lexSpec *spec, const char **src, else if (*s == '-') { *tokBuf = s++; - while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';') + while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' && + *s != ';') s++; *tokLen = s - *tokBuf; *src = s; @@ -1040,7 +1054,8 @@ static int execTok (struct lexSpec *spec, const char **src, else { *tokBuf = s++; - while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';') + while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' && + *s != ';') s++; *tokLen = s - *tokBuf; } @@ -1057,85 +1072,302 @@ static char *regxStrz (const char *src, int len, char *str) return str; } -static int execCode (struct lexSpec *spec, - int arg_no, int *arg_start, int *arg_end, int *pptr, - struct regxCode *code, - data1_node **d1_stack, int *d1_level) +#if HAVE_TCL_H +static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp, + int argc, const char **argv) +{ + struct lexSpec *spec = (struct lexSpec *) clientData; + if (argc < 2) + return TCL_ERROR; + if (!strcmp(argv[1], "record") && argc == 3) + { + const char *absynName = argv[2]; + data1_node *res; + +#if REGX_DEBUG + yaz_log (YLOG_LOG, "begin record %s", absynName); +#endif + res = data1_mk_root (spec->dh, spec->m, absynName); + + spec->d1_level = 0; + + spec->d1_stack[spec->d1_level++] = res; + + res = data1_mk_tag (spec->dh, 
spec->m, absynName, 0, res); + + spec->d1_stack[spec->d1_level++] = res; + + spec->d1_stack[spec->d1_level] = NULL; + } + else if (!strcmp(argv[1], "element") && argc == 3) + { + tagBegin (spec, argv[2], strlen(argv[2])); + } + else if (!strcmp (argv[1], "variant") && argc == 5) + { + variantBegin (spec, argv[2], strlen(argv[2]), + argv[3], strlen(argv[3]), + argv[4], strlen(argv[4])); + } + else if (!strcmp (argv[1], "context") && argc == 3) + { + struct lexContext *lc = spec->context; +#if REGX_DEBUG + yaz_log (YLOG_LOG, "begin context %s",argv[2]); +#endif + while (lc && strcmp (argv[2], lc->name)) + lc = lc->next; + if (lc) + { + spec->context_stack[++(spec->context_stack_top)] = lc; + } + else + yaz_log (YLOG_WARN, "unknown context %s", argv[2]); + } + else + return TCL_ERROR; + return TCL_OK; +} + +static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp, + int argc, const char **argv) +{ + struct lexSpec *spec = (struct lexSpec *) clientData; + if (argc < 2) + return TCL_ERROR; + + if (!strcmp (argv[1], "record")) + { + while (spec->d1_level) + { + tagDataRelease (spec); + (spec->d1_level)--; + } +#if REGX_DEBUG + yaz_log (YLOG_LOG, "end record"); +#endif + spec->stop_flag = 1; + } + else if (!strcmp (argv[1], "element")) + { + int min_level = 2; + const char *element = 0; + if (argc >= 3 && !strcmp(argv[2], "-record")) + { + min_level = 0; + if (argc == 4) + element = argv[3]; + } + else + if (argc == 3) + element = argv[2]; + tagEnd (spec, min_level, element, (element ? 
strlen(element) : 0)); + if (spec->d1_level <= 1) + { +#if REGX_DEBUG + yaz_log (YLOG_LOG, "end element end records"); +#endif + spec->stop_flag = 1; + } + } + else if (!strcmp (argv[1], "context")) + { +#if REGX_DEBUG + yaz_log (YLOG_LOG, "end context"); +#endif + if (spec->context_stack_top) + (spec->context_stack_top)--; + } + else + return TCL_ERROR; + return TCL_OK; +} + +static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp, + int argc, const char **argv) +{ + int argi = 1; + int textFlag = 0; + const char *element = 0; + const char *attribute = 0; + struct lexSpec *spec = (struct lexSpec *) clientData; + + while (argi < argc) + { + if (!strcmp("-text", argv[argi])) + { + textFlag = 1; + argi++; + } + else if (!strcmp("-element", argv[argi])) + { + argi++; + if (argi < argc) + element = argv[argi++]; + } + else if (!strcmp("-attribute", argv[argi])) + { + argi++; + if (argi < argc) + attribute = argv[argi++]; + } + else + break; + } + if (element) + tagBegin (spec, element, strlen(element)); + + while (argi < argc) + { +#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0) + Tcl_DString ds; + char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds); + execData (spec, native, strlen(native), textFlag, attribute, + attribute ? strlen(attribute) : 0); + Tcl_DStringFree (&ds); +#else + execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute, + attribute ? 
strlen(attribute) : 0); +#endif + argi++; + } + if (element) + tagEnd (spec, 2, NULL, 0); + return TCL_OK; +} + +static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp, + int argc, const char **argv) +{ + struct lexSpec *spec = (struct lexSpec *) clientData; + int argi = 1; + int offset = 0; + int no; + + while (argi < argc) + { + if (!strcmp("-offset", argv[argi])) + { + argi++; + if (argi < argc) + { + offset = atoi(argv[argi]); + argi++; + } + } + else + break; + } + if (argi != argc-1) + return TCL_ERROR; + no = atoi(argv[argi]); + if (no >= spec->arg_no) + no = spec->arg_no - 1; + spec->ptr = spec->arg_start[no] + offset; + return TCL_OK; +} + +static void execTcl (struct lexSpec *spec, struct regxCode *code) +{ + int i; + int ret; + for (i = 0; i < spec->arg_no; i++) + { + char var_name[10], *var_buf; + int var_len, ch; + + sprintf (var_name, "%d", i); + var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i], + &var_len); + if (var_buf) + { + ch = var_buf[var_len]; + var_buf[var_len] = '\0'; + Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0); + var_buf[var_len] = ch; + } + } +#if HAVE_TCL_OBJECTS + ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj); +#else + ret = Tcl_GlobalEval (spec->tcl_interp, code->str); +#endif + if (ret != TCL_OK) + { + const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0); + yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s", + spec->tcl_interp->errorLine, + spec->tcl_interp->result, + err ? 
err : "[NO ERRORINFO]"); + } +} +/* HAVE_TCL_H */ +#endif + +static void execCode (struct lexSpec *spec, struct regxCode *code) { const char *s = code->str; int cmd_len, r; - int returnCode = 1; const char *cmd_str; - r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); while (r) { char *p, ptmp[64]; if (r == 1) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); continue; } p = regxStrz (cmd_str, cmd_len, ptmp); if (!strcmp (p, "begin")) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) { - logf (LOG_WARN, "missing keyword after 'begin'"); + yaz_log (YLOG_WARN, "missing keyword after 'begin'"); continue; } p = regxStrz (cmd_str, cmd_len, ptmp); if (!strcmp (p, "record")) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) continue; - if (*d1_level == 0) + if (spec->d1_level <= 1) { static char absynName[64]; - data1_absyn *absyn; + data1_node *res; if (cmd_len > 63) cmd_len = 63; memcpy (absynName, cmd_str, cmd_len); absynName[cmd_len] = '\0'; - #if REGX_DEBUG - logf (LOG_DEBUG, "begin record %s", absynName); + yaz_log (YLOG_LOG, "begin record %s", absynName); #endif - if (!(absyn = data1_get_absyn (spec->dh, absynName))) - logf (LOG_WARN, "Unknown tagset: %s", absynName); - else - { - data1_node *res; - - res = data1_mk_node (spec->dh, spec->m); - res->which = DATA1N_root; - res->u.root.type = absynName; - res->u.root.absyn = absyn; - res->root = res; - - d1_stack[*d1_level] = res; - d1_stack[++(*d1_level)] = NULL; - } + res = data1_mk_root (spec->dh, spec->m, absynName); + + spec->d1_level = 0; + + spec->d1_stack[spec->d1_level++] = res; + + res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res); + + spec->d1_stack[spec->d1_level++] = res; + + 
spec->d1_stack[spec->d1_level] = NULL; } - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else if (!strcmp (p, "element")) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) continue; - tagBegin (spec, d1_stack, d1_level, cmd_str, cmd_len); - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + tagBegin (spec, cmd_str, cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else if (!strcmp (p, "variant")) { @@ -1145,207 +1377,200 @@ static int execCode (struct lexSpec *spec, const char *type_str = NULL; int value_len; const char *value_str = NULL; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) continue; class_str = cmd_str; class_len = cmd_len; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) continue; type_str = cmd_str; type_len = cmd_len; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) continue; value_str = cmd_str; value_len = cmd_len; - variantBegin (spec, d1_stack, d1_level, class_str, class_len, + variantBegin (spec, class_str, class_len, type_str, type_len, value_str, value_len); - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else if (!strcmp (p, "context")) { if (r > 1) { struct lexContext *lc = spec->context; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); p = regxStrz (cmd_str, cmd_len, ptmp); #if REGX_DEBUG - logf (LOG_DEBUG, "begin context %s", p); + yaz_log (YLOG_LOG, "begin context %s", p); #endif while (lc && strcmp (p, lc->name)) lc = lc->next; if (lc) 
spec->context_stack[++(spec->context_stack_top)] = lc; else - logf (LOG_WARN, "unknown context %s", p); + yaz_log (YLOG_WARN, "unknown context %s", p); } - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else { - logf (LOG_WARN, "bad keyword '%s' after begin", p); + yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p); } } else if (!strcmp (p, "end")) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) { - logf (LOG_WARN, "missing keyword after 'end'"); + yaz_log (YLOG_WARN, "missing keyword after 'end'"); continue; } p = regxStrz (cmd_str, cmd_len, ptmp); if (!strcmp (p, "record")) { - *d1_level = 0; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + while (spec->d1_level) + { + tagDataRelease (spec); + (spec->d1_level)--; + } + r = execTok (spec, &s, &cmd_str, &cmd_len); #if REGX_DEBUG - logf (LOG_DEBUG, "end record"); + yaz_log (YLOG_LOG, "end record"); #endif - returnCode = 0; + spec->stop_flag = 1; } else if (!strcmp (p, "element")) { - int min_level = 1; - while ((r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len)) == 3) + int min_level = 2; + while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3) { if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len)) min_level = 0; } if (r > 2) { - tagEnd (spec, d1_stack, d1_level, min_level, - cmd_str, cmd_len); - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + tagEnd (spec, min_level, cmd_str, cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else - tagEnd (spec, d1_stack, d1_level, min_level, NULL, 0); - if (*d1_level == 0) + tagEnd (spec, min_level, NULL, 0); + if (spec->d1_level <= 1) { #if REGX_DEBUG - logf (LOG_DEBUG, "end element end records"); + yaz_log (YLOG_LOG, "end element end records"); #endif - returnCode = 0; + spec->stop_flag = 1; } } else if (!strcmp (p, 
"context")) { #if REGX_DEBUG - logf (LOG_DEBUG, "end context"); + yaz_log (YLOG_LOG, "end context"); #endif if (spec->context_stack_top) (spec->context_stack_top)--; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else - logf (LOG_WARN, "bad keyword '%s' after end", p); + yaz_log (YLOG_WARN, "bad keyword '%s' after end", p); } else if (!strcmp (p, "data")) { int textFlag = 0; int element_len; const char *element_str = NULL; + int attribute_len; + const char *attribute_str = NULL; - while ((r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len)) == 3) + while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3) { if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len)) textFlag = 1; else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len)) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &element_str, &element_len); + r = execTok (spec, &s, &element_str, &element_len); + if (r < 2) + break; + } + else if (cmd_len==10 && !memcmp ("-attribute", cmd_str, + cmd_len)) + { + r = execTok (spec, &s, &attribute_str, &attribute_len); if (r < 2) break; } else - logf (LOG_WARN, "bad data option: %.*s", + yaz_log (YLOG_WARN, "bad data option: %.*s", cmd_len, cmd_str); } if (r != 2) { - logf (LOG_WARN, "missing data item after data"); + yaz_log (YLOG_WARN, "missing data item after data"); continue; } if (element_str) - tagBegin (spec, d1_stack, d1_level, element_str, element_len); + tagBegin (spec, element_str, element_len); do { - execData (spec, d1_stack, d1_level, cmd_str, cmd_len, - textFlag); - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + execData (spec, cmd_str, cmd_len, textFlag, + attribute_str, attribute_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } while (r > 1); if (element_str) - tagEnd (spec, d1_stack, d1_level, 1, NULL, 0); + tagEnd (spec, 2, NULL, 0); } else if (!strcmp (p, "unread")) { int no, offset; - r = execTok (spec, 
&s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len)) { - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); if (r < 2) { - logf (LOG_WARN, "missing number after -offset"); + yaz_log (YLOG_WARN, "missing number after -offset"); continue; } p = regxStrz (cmd_str, cmd_len, ptmp); offset = atoi (p); - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else offset = 0; if (r < 2) { - logf (LOG_WARN, "missing index after unread command"); + yaz_log (YLOG_WARN, "missing index after unread command"); continue; } if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9') { - logf (LOG_WARN, "bad index after unread command"); + yaz_log (YLOG_WARN, "bad index after unread command"); continue; } else { no = *cmd_str - '0'; - if (no >= arg_no) - no = arg_no - 1; - *pptr = arg_start[no] + offset; + if (no >= spec->arg_no) + no = spec->arg_no - 1; + spec->ptr = spec->arg_start[no] + offset; } - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else if (!strcmp (p, "context")) { if (r > 1) { struct lexContext *lc = spec->context; - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); p = regxStrz (cmd_str, cmd_len, ptmp); while (lc && strcmp (p, lc->name)) @@ -1353,39 +1578,29 @@ static int execCode (struct lexSpec *spec, if (lc) spec->context_stack[spec->context_stack_top] = lc; else - logf (LOG_WARN, "unknown context %s", p); + yaz_log (YLOG_WARN, "unknown context %s", p); } - r = execTok (spec, &s, arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } else { - logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str); - r = execTok (spec, &s, 
arg_no, arg_start, arg_end, - &cmd_str, &cmd_len); + yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str); + r = execTok (spec, &s, &cmd_str, &cmd_len); continue; } if (r > 1) { - logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str); + yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str); do { - r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, - &cmd_len); + r = execTok (spec, &s, &cmd_str, &cmd_len); } while (r > 1); } } - return returnCode; } -/* - * execAction: Execute action specified by 'ap'. Returns 0 if - * the pattern(s) associated by rule and code could be executed - * ok; returns 1 if code couldn't be executed. - */ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, - data1_node **d1_stack, int *d1_level, int start_ptr, int *pptr) { int sptr; @@ -1393,8 +1608,12 @@ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, int arg_end[20]; int arg_no = 1; + if (!ap) + return 1; arg_start[0] = start_ptr; arg_end[0] = *pptr; + spec->arg_start = arg_start; + spec->arg_end = arg_end; while (ap) { @@ -1404,13 +1623,14 @@ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, if (ap->u.pattern.body) { arg_start[arg_no] = *pptr; - if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa)) + if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0)) { arg_end[arg_no] = F_WIN_EOF; arg_no++; arg_start[arg_no] = F_WIN_EOF; arg_end[arg_no] = F_WIN_EOF; -/* return 1*/ + yaz_log(YLOG_DEBUG, "Pattern match rest of record"); + *pptr = F_WIN_EOF; } else { @@ -1423,7 +1643,7 @@ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, else { arg_start[arg_no] = *pptr; - if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa)) + if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1)) return 1; if (sptr != arg_start[arg_no]) return 1; @@ -1432,9 +1652,19 @@ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, arg_no++; break; case REGX_CODE: - if (!execCode (spec, 
arg_no, arg_start, arg_end, pptr, - ap->u.code, d1_stack, d1_level)) - return 0; + spec->arg_no = arg_no; + spec->ptr = *pptr; +#if HAVE_TCL_H + if (spec->tcl_interp) + execTcl(spec, ap->u.code); + else + execCode (spec, ap->u.code); +#else + execCode (spec, ap->u.code); +#endif + *pptr = spec->ptr; + if (spec->stop_flag) + return 0; break; case REGX_END: arg_start[arg_no] = *pptr; @@ -1448,18 +1678,16 @@ static int execAction (struct lexSpec *spec, struct lexRuleAction *ap, } static int execRule (struct lexSpec *spec, struct lexContext *context, - data1_node **d1_stack, int *d1_level, int ruleNo, int start_ptr, int *pptr) { #if REGX_DEBUG - logf (LOG_DEBUG, "exec rule %d", ruleNo); + yaz_log (YLOG_LOG, "exec rule %d", ruleNo); #endif return execAction (spec, context->fastRule[ruleNo]->actionList, - d1_stack, d1_level, start_ptr, pptr); + start_ptr, pptr); } -data1_node *lexNode (struct lexSpec *spec, - data1_node **d1_stack, int *d1_level, int *ptr) +data1_node *lexNode (struct lexSpec *spec, int *ptr) { struct lexContext *context = spec->context_stack[spec->context_stack_top]; struct DFA_state *state = context->dfa->states[0]; @@ -1487,14 +1715,13 @@ data1_node *lexNode (struct lexSpec *spec, int size; char *buf; buf = f_win_get (spec, skip_ptr, start_ptr, &size); - execDataP (spec, d1_stack, d1_level, buf, size, 0); + execDataP (spec, buf, size, 0); } /* restore pointer */ *ptr = last_ptr; /* execute rule */ - if (!execRule (spec, context, d1_stack, d1_level, - last_rule, start_ptr, ptr)) - break; + if (!execRule (spec, context, last_rule, start_ptr, ptr)) + break; /* restore skip pointer */ skip_ptr = *ptr; last_rule = 0; @@ -1505,7 +1732,7 @@ data1_node *lexNode (struct lexSpec *spec, int size; char *buf; buf = f_win_get (spec, skip_ptr, *ptr, &size); - execDataP (spec, d1_stack, d1_level, buf, size, 0); + execDataP (spec, buf, size, 0); } if (*ptr == F_WIN_EOF) break; @@ -1523,17 +1750,16 @@ data1_node *lexNode (struct lexSpec *spec, int size; char *buf; buf 
= f_win_get (spec, skip_ptr, start_ptr, &size); - execDataP (spec, d1_stack, d1_level, buf, size, 0); + execDataP (spec, buf, size, 0); } /* restore pointer */ *ptr = last_ptr; - if (!execRule (spec, context, d1_stack, d1_level, - last_rule, start_ptr, ptr)) + if (!execRule (spec, context, last_rule, start_ptr, ptr)) { if (spec->f_win_ef && *ptr != F_WIN_EOF) { #if REGX_DEBUG - logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr); + yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr); #endif (*spec->f_win_ef)(spec->f_win_fh, *ptr); } @@ -1585,10 +1811,10 @@ static data1_node *lexRoot (struct lexSpec *spec, off_t offset, const char *context_name) { struct lexContext *lt = spec->context; - data1_node *d1_stack[512]; - int d1_level = 0; int ptr = offset; + spec->stop_flag = 0; + spec->d1_level = 0; spec->context_stack_top = 0; while (lt) { @@ -1598,50 +1824,189 @@ static data1_node *lexRoot (struct lexSpec *spec, off_t offset, } if (!lt) { - logf (LOG_WARN, "cannot find context %s", context_name); + yaz_log (YLOG_WARN, "cannot find context %s", context_name); return NULL; } spec->context_stack[spec->context_stack_top] = lt; - d1_stack[d1_level] = NULL; - if (lt->beginActionList) - execAction (spec, lt->beginActionList, d1_stack, &d1_level, 0, &ptr); - lexNode (spec, d1_stack, &d1_level, &ptr); - if (lt->endActionList) - execAction (spec, lt->endActionList, d1_stack, &d1_level, ptr, &ptr); - return *d1_stack; + spec->d1_stack[spec->d1_level] = NULL; +#if 1 + if (!lt->initFlag) + { + lt->initFlag = 1; + execAction (spec, lt->initActionList, ptr, &ptr); + } +#endif + execAction (spec, lt->beginActionList, ptr, &ptr); + lexNode (spec, &ptr); + while (spec->d1_level) + { + tagDataRelease (spec); + (spec->d1_level)--; + } + execAction (spec, lt->endActionList, ptr, &ptr); + return spec->d1_stack[0]; +} + +void grs_destroy(void *clientData) +{ + struct lexSpecs *specs = (struct lexSpecs *) clientData; + if (specs->spec) + { + lexSpecDestroy(&specs->spec); + } + xfree (specs); +} + +void 
*grs_init(Res res, RecType recType) +{ + struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs)); + specs->spec = 0; + strcpy(specs->type, ""); + return specs; +} + + +void grs_config(void *clientData, Res res, const char *args) +{ + struct lexSpecs *specs = (struct lexSpecs *) clientData; + if (strlen(args) < sizeof(specs->type)) + strcpy(specs->type, args); } data1_node *grs_read_regx (struct grs_read_info *p) { int res; + struct lexSpecs *specs = (struct lexSpecs *) p->clientData; + struct lexSpec **curLexSpec = &specs->spec; #if REGX_DEBUG - logf (LOG_DEBUG, "grs_read_regx"); + yaz_log (YLOG_LOG, "grs_read_regx"); #endif - if (!curLexSpec || strcmp (curLexSpec->name, p->type)) + if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type)) { - if (curLexSpec) - lexSpecDestroy (&curLexSpec); - curLexSpec = lexSpecCreate (p->type); - curLexSpec->dh = p->dh; - res = readFileSpec (curLexSpec); + if (*curLexSpec) + lexSpecDestroy (curLexSpec); + *curLexSpec = lexSpecCreate (specs->type, p->dh); + res = readFileSpec (*curLexSpec); if (res) { - lexSpecDestroy (&curLexSpec); + lexSpecDestroy (curLexSpec); return NULL; } } - curLexSpec->dh = p->dh; + (*curLexSpec)->dh = p->dh; if (!p->offset) { - curLexSpec->f_win_start = 0; - curLexSpec->f_win_end = 0; - curLexSpec->f_win_rf = p->readf; - curLexSpec->f_win_sf = p->seekf; - curLexSpec->f_win_fh = p->fh; - curLexSpec->f_win_ef = p->endf; - curLexSpec->f_win_size = 500000; + (*curLexSpec)->f_win_start = 0; + (*curLexSpec)->f_win_end = 0; + (*curLexSpec)->f_win_rf = p->readf; + (*curLexSpec)->f_win_sf = p->seekf; + (*curLexSpec)->f_win_fh = p->fh; + (*curLexSpec)->f_win_ef = p->endf; + (*curLexSpec)->f_win_size = 500000; } - curLexSpec->m = p->mem; - return lexRoot (curLexSpec, p->offset, "main"); + (*curLexSpec)->m = p->mem; + return lexRoot (*curLexSpec, p->offset, "main"); } + +static int extract_regx(void *clientData, struct recExtractCtrl *ctrl) +{ + return zebra_grs_extract(clientData, ctrl, 
grs_read_regx); +} + +static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl) +{ + return zebra_grs_retrieve(clientData, ctrl, grs_read_regx); +} + +static struct recType regx_type = { + "grs.regx", + grs_init, + grs_config, + grs_destroy, + extract_regx, + retrieve_regx, +}; + + +#if HAVE_TCL_H +data1_node *grs_read_tcl (struct grs_read_info *p) +{ + int res; + struct lexSpecs *specs = (struct lexSpecs *) p->clientData; + struct lexSpec **curLexSpec = &specs->spec; + +#if REGX_DEBUG + yaz_log (YLOG_LOG, "grs_read_tcl"); +#endif + if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type)) + { + Tcl_Interp *tcl_interp; + if (*curLexSpec) + lexSpecDestroy (curLexSpec); + *curLexSpec = lexSpecCreate (specs->type, p->dh); + Tcl_FindExecutable(""); + tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp(); + Tcl_Init(tcl_interp); + Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0); + Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0); + Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0); + Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread, + *curLexSpec, 0); + res = readFileSpec (*curLexSpec); + if (res) + { + lexSpecDestroy (curLexSpec); + return NULL; + } + } + (*curLexSpec)->dh = p->dh; + if (!p->offset) + { + (*curLexSpec)->f_win_start = 0; + (*curLexSpec)->f_win_end = 0; + (*curLexSpec)->f_win_rf = p->readf; + (*curLexSpec)->f_win_sf = p->seekf; + (*curLexSpec)->f_win_fh = p->fh; + (*curLexSpec)->f_win_ef = p->endf; + (*curLexSpec)->f_win_size = 500000; + } + (*curLexSpec)->m = p->mem; + return lexRoot (*curLexSpec, p->offset, "main"); +} + +static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl) +{ + return zebra_grs_extract(clientData, ctrl, grs_read_tcl); +} + +static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl) +{ + return zebra_grs_retrieve(clientData, ctrl, grs_read_tcl); +} + +static struct recType tcl_type = { + "grs.tcl", + grs_init, + 
grs_config, + grs_destroy, + extract_tcl, + retrieve_tcl, +}; + +#endif + +RecType +#ifdef IDZEBRA_STATIC_GRS_REGX +idzebra_filter_grs_regx +#else +idzebra_filter +#endif + +[] = { + &regx_type, +#if HAVE_TCL_H + &tcl_type, +#endif + 0, +};