-/*
- * Copyright (C) 1994-2001, Index Data
- * All rights reserved.
- *
- * $Log: regxread.c,v $
- * Revision 1.38 2002-04-04 20:50:37 adam
- * Multi register works with record paths and data1 profile path
- *
- * Revision 1.37 2001/05/29 08:51:59 adam
- * More fixes for character encodings.
- *
- * Revision 1.36 2001/05/22 21:02:26 adam
- * Fixes for Tcl UTF8 character handling.
- *
- * Revision 1.35 2001/03/29 21:31:31 adam
- * Fixed "record begin" for Tcl filter.
- *
- * Revision 1.34 2000/11/29 14:24:01 adam
- * Script configure uses yaz pthreads options. Added locking for
- * zebra_register_{lock,unlock}.
- *
- * Revision 1.33 1999/11/30 13:48:04 adam
- * Improved installation. Updated for inclusion of YAZ header files.
- *
- * Revision 1.32 1999/09/07 07:19:21 adam
- * Work on character mapping. Implemented replace rules.
- *
- * Revision 1.31 1999/07/14 13:05:29 adam
- * Tcl filter works with objects when TCL is version 8 or later; filter
- * works with strings otherwise (slow).
- *
- * Revision 1.30 1999/07/14 10:55:28 adam
- * Fixed memory leak.
- *
- * Revision 1.29 1999/07/12 07:27:54 adam
- * Improved speed of Tcl processing. Fixed one memory leak.
- *
- * Revision 1.28 1999/07/06 12:26:04 adam
- * Fixed filters so that MS-DOS CR is ignored.
- *
- * Revision 1.27 1999/06/28 13:25:40 quinn
- * Improved diagnostics for Tcl
- *
- * Revision 1.26 1999/05/26 07:49:14 adam
- * C++ compilation.
- *
- * Revision 1.25 1999/05/25 12:33:32 adam
- * Fixed bug in Tcl filter.
- *
- * Revision 1.24 1999/05/21 11:08:46 adam
- * Tcl filter attempts to read <filt>.tflt. Improvements to configure
- * script so that it reads uninstalled Tcl source.
- *
- * Revision 1.23 1999/05/20 12:57:18 adam
- * Implemented TCL filter. Updated recctrl system.
- *
- * Revision 1.22 1998/11/03 16:07:13 adam
- * Yet another fix.
- *
- * Revision 1.21 1998/11/03 15:43:39 adam
- * Fixed bug introduced by previous commit.
- *
- * Revision 1.20 1998/11/03 14:51:28 adam
- * Changed code so that it creates as few data1 nodes as possible.
- *
- * Revision 1.19 1998/11/03 10:22:39 adam
- * Fixed memory leak that could occur for when large data1 node were
- * concatenated. Data-type data1_nodes may have multiple nodes.
- *
- * Revision 1.18 1998/10/15 13:11:47 adam
- * Added support for option -record for "end element". When specified
- * end element will mark end-of-record when at outer-level.
- *
- * Revision 1.17 1998/07/01 10:13:51 adam
- * Minor fix.
- *
- * Revision 1.16 1998/06/30 15:15:09 adam
- * Tags are trimmed: white space removed before- and after the tag.
- *
- * Revision 1.15 1998/06/30 12:55:45 adam
- * Bug fix.
- *
- * Revision 1.14 1998/03/05 08:41:00 adam
- * Implemented rule contexts.
- *
- * Revision 1.13 1997/12/12 06:33:58 adam
- * Fixed bug that showed up when multiple filter where used.
- * Made one routine thread-safe.
- *
- * Revision 1.12 1997/11/18 10:03:24 adam
- * Member num_children removed from data1_node.
- *
- * Revision 1.11 1997/11/06 11:41:01 adam
- * Implemented "begin variant" for the sgml.regx filter.
- *
- * Revision 1.10 1997/10/31 12:36:12 adam
- * Minor change that avoids compiler warning.
- *
- * Revision 1.9 1997/09/29 09:02:49 adam
- * Fixed small bug (introduced by previous commit).
- *
- * Revision 1.8 1997/09/17 12:19:22 adam
- * Zebra version corresponds to YAZ version 1.4.
- * Changed Zebra server so that it doesn't depend on global common_resource.
- *
- * Revision 1.7 1997/07/15 16:33:07 adam
- * Check for zero length in execData.
- *
- * Revision 1.6 1997/02/24 10:41:51 adam
- * Cleanup of code and commented out the "end element-end-record" code.
- *
- * Revision 1.5 1997/02/19 16:22:33 adam
- * Fixed "end element" to terminate record in outer-most level.
- *
- * Revision 1.4 1997/02/12 20:42:58 adam
- * Changed some log messages.
- *
- * Revision 1.3 1996/11/08 14:05:33 adam
- * Bug fix: data1 node member u.tag.get_bytes weren't initialized.
- *
- * Revision 1.2 1996/10/29 14:02:09 adam
- * Doesn't use the global data1_tabpath (from YAZ). Instead the function
- * data1_get_tabpath is used.
- *
- * Revision 1.1 1996/10/11 10:57:30 adam
- * New module recctrl. Used to manage records (extract/retrieval).
- *
- * Revision 1.24 1996/06/17 14:25:31 adam
- * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
- *
- * Revision 1.23 1996/06/04 10:19:00 adam
- * Minor changes - removed include of ctype.h.
- *
- * Revision 1.22 1996/06/03 15:23:13 adam
- * Bug fix: /../ BODY /../ - pattern didn't match EOF.
- *
- * Revision 1.21 1996/05/14 16:58:38 adam
- * Minor change.
- *
- * Revision 1.20 1996/05/01 13:46:36 adam
- * First work on multiple records in one file.
- * New option, -offset, to the "unread" command in the filter module.
- *
- * Revision 1.19 1996/02/12 16:18:20 adam
- * Yet another bug fix in implementation of unread command.
- *
- * Revision 1.18 1996/02/12 16:07:54 adam
- * Bug fix in new unread command.
- *
- * Revision 1.17 1996/02/12 15:56:11 adam
- * New code command: unread.
- *
- * Revision 1.16 1996/01/17 14:57:51 adam
- * Prototype changed for reader functions in extract/retrieve. File
- * is identified by 'void *' instead of 'int.
- *
- * Revision 1.15 1996/01/08 19:15:47 adam
- * New input filter that works!
- *
- * Revision 1.14 1996/01/08 09:10:38 adam
- * Yet another complete rework on this module.
- *
- * Revision 1.13 1995/12/15 17:21:50 adam
- * This version is able to set data.formatted_text in data1-nodes.
- *
- * Revision 1.12 1995/12/15 16:20:10 adam
- * The filter files (*.flt) are read from the path given by data1_tabpath.
- *
- * Revision 1.11 1995/12/15 12:35:16 adam
- * Better logging.
- *
- * Revision 1.10 1995/12/15 10:35:36 adam
- * Misc. bug fixes.
- *
- * Revision 1.9 1995/12/14 16:38:48 adam
- * Completely new attempt to make regular expression parsing.
- *
- * Revision 1.8 1995/12/13 17:16:59 adam
- * Small changes.
- *
- * Revision 1.7 1995/12/13 16:51:58 adam
- * Modified to set last_child in data1_nodes.
- * Uses destroy handler to free up data text nodes.
- *
- * Revision 1.6 1995/12/13 13:45:37 quinn
- * Changed data1 to use nmem.
- *
- * Revision 1.5 1995/12/11 09:12:52 adam
- * The rec_get function returns NULL if record doesn't exist - will
- * happen in the server if the result set records have been deleted since
- * the creation of the set (i.e. the search).
- * The server saves a result temporarily if it is 'volatile', i.e. the
- * set is register dependent.
- *
- * Revision 1.4 1995/12/05 16:57:40 adam
- * More work on regular patterns.
- *
- * Revision 1.3 1995/12/05 09:37:09 adam
- * One malloc was renamed to xmalloc.
- *
- * Revision 1.2 1995/12/04 17:59:24 adam
- * More work on regular expression conversion.
- *
- * Revision 1.1 1995/12/04 14:25:30 adam
- * Started work on regular expression parsed input to structured records.
- *
- */
+/* $Id: regxread.c,v 1.57 2005-01-16 23:14:57 adam Exp $
+ Copyright (C) 1995-2005
+ Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
#include <stdio.h>
+#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include <yaz/tpath.h>
#include <zebrautl.h>
#include <dfa.h>
-#include "grsread.h"
+#include <idzebra/recgrs.h>
#if HAVE_TCL_H
#include <tcl.h>
struct lexSpecs {
struct lexSpec *spec;
+ char type[256]; /* filter spec name: written by grs_init()/grs_config() and
+                    compared against the loaded lexSpec's name by the readers */
};
static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
cmd[i] = '\0';
if (i == 0)
{
- logf (LOG_WARN, "bad character %d %c", *cp, *cp);
+ yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp);
cp++;
while (*cp && *cp != ' ' && *cp != '\t' &&
*cp != '\n' && *cp != '\r')
return REGX_INIT;
else
{
- logf (LOG_WARN, "bad command %s", cmd);
+ yaz_log (YLOG_WARN, "bad command %s", cmd);
return 0;
}
}
{
xfree (*ap);
*ap = NULL;
- logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
+ yaz_log (YLOG_WARN, "regular expression error '%.*s'", s-s0, s0);
return -1;
}
+ if (debug_dfa_tran)
+ printf ("pattern: %.*s\n", s-s0, s0);
dfa_mkstate ((*ap)->u.pattern.dfa);
s++;
break;
case REGX_BEGIN:
- logf (LOG_WARN, "cannot use BEGIN here");
+ yaz_log (YLOG_WARN, "cannot use BEGIN here");
continue;
case REGX_INIT:
- logf (LOG_WARN, "cannot use INIT here");
+ yaz_log (YLOG_WARN, "cannot use INIT here");
continue;
case REGX_END:
*ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
tok = readParseToken (&s, &len);
if (tok != REGX_CODE)
{
- logf (LOG_WARN, "missing name after CONTEXT keyword");
+ yaz_log (YLOG_WARN, "missing name after CONTEXT keyword");
return 0;
}
if (len > 31)
break;
case REGX_PATTERN:
#if REGX_DEBUG
- logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
+ yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s);
#endif
r = dfa_parse (spec->context->dfa, &s);
if (r)
{
- logf (LOG_WARN, "regular expression error. r=%d", r);
+ yaz_log (YLOG_WARN, "regular expression error. r=%d", r);
return -1;
}
if (*s != '/')
{
- logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
+ yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s);
return -1;
}
s++;
}
if (!spec_inf)
{
- logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
+ yaz_log (YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name);
return -1;
}
- logf (LOG_LOG, "reading regx filter %s", fname);
+ yaz_log (YLOG_LOG, "reading regx filter %s", fname);
#if HAVE_TCL_H
if (spec->tcl_interp)
- logf (LOG_LOG, "Tcl enabled");
+ yaz_log (YLOG_LOG, "Tcl enabled");
+#endif
+
+#if 0
+ debug_dfa_trav = 0;
+ debug_dfa_tran = 1;
+ debug_dfa_followpos = 0;
+ dfa_verbose = 1;
#endif
+
lineBuf = wrbuf_alloc();
spec->lineNo = 0;
c = getc (spec_inf);
fclose (spec_inf);
wrbuf_free(lineBuf, 1);
-#if 0
- debug_dfa_trav = 1;
- debug_dfa_tran = 1;
- debug_dfa_followpos = 1;
- dfa_verbose = 1;
-#endif
for (lc = spec->context; lc; lc = lc->next)
{
struct lexRule *rp;
#endif
static void execData (struct lexSpec *spec,
- const char *ebuf, int elen, int formatted_text)
+ const char *ebuf, int elen, int formatted_text,
+ const char *attribute_str, int attribute_len)
{
struct data1_node *res, *parent;
int org_len;
if (elen == 0) /* shouldn't happen, but it does! */
return ;
#if REGX_DEBUG
- if (elen > 40)
- logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
- ebuf, 15, ebuf + elen-15);
+ if (elen > 80)
+ yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
+ ebuf, 40, ebuf + elen-40);
+ else if (elen == 1 && ebuf[0] == '\n')
+ {
+ yaz_log (YLOG_LOG, "data(new line)");
+ }
else if (elen > 0)
- logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
+ yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
else
- logf (LOG_LOG, "data (%d bytes)", elen);
+ yaz_log (YLOG_LOG, "data(%d bytes)", elen);
#endif
if (spec->d1_level <= 1)
parent = spec->d1_stack[spec->d1_level -1];
assert (parent);
- if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
- org_len = res->u.data.len;
- else
+ if (attribute_str)
{
- org_len = 0;
-
- res = data1_mk_node (spec->dh, spec->m);
- res->parent = parent;
- res->which = DATA1N_data;
- res->u.data.what = DATA1I_text;
- res->u.data.len = 0;
- res->u.data.formatted_text = formatted_text;
-#if 0
- if (elen > DATA1_LOCALDATA)
- res->u.data.data = nmem_malloc (spec->m, elen);
- else
- res->u.data.data = res->lbuf;
- memcpy (res->u.data.data, ebuf, elen);
-#else
- res->u.data.data = 0;
-#endif
- res->root = parent->root;
-
- parent->last_child = res;
- if (spec->d1_stack[spec->d1_level])
- spec->d1_stack[spec->d1_level]->next = res;
+ data1_xattr **ap;
+ res = parent;
+ if (res->which != DATA1N_tag)
+ return;
+ /* sweep through existing attributes.. */
+ for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
+ if (strlen((*ap)->name) == attribute_len &&
+ !memcmp((*ap)->name, attribute_str, attribute_len))
+ break;
+ if (!*ap)
+ {
+ /* new attribute. Create it with name + value */
+ *ap = nmem_malloc(spec->m, sizeof(**ap));
+
+ (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
+ memcpy((*ap)->name, attribute_str, attribute_len);
+ (*ap)->name[attribute_len] = '\0';
+
+ (*ap)->value = nmem_malloc(spec->m, elen+1);
+ memcpy((*ap)->value, ebuf, elen);
+ (*ap)->value[elen] = '\0';
+ (*ap)->next = 0;
+ }
else
- parent->child = res;
- spec->d1_stack[spec->d1_level] = res;
- }
- if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
+ {
+ /* append to value if attribute already exists */
+ char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
+ strcpy(nv, (*ap)->value);
+ memcpy (nv + strlen(nv), ebuf, elen);
+ nv[strlen(nv)+elen] = '\0';
+ (*ap)->value = nv;
+ }
+ }
+ else
{
- char *old_buf, *new_buf;
-
- spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
- new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
- if ((old_buf = spec->concatBuf[spec->d1_level].buf))
+ if ((res = spec->d1_stack[spec->d1_level]) &&
+ res->which == DATA1N_data)
+ org_len = res->u.data.len;
+ else
{
- memcpy (new_buf, old_buf, org_len);
- xfree (old_buf);
+ org_len = 0;
+
+ res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
+ res->u.data.what = DATA1I_text;
+ res->u.data.len = 0;
+ res->u.data.formatted_text = formatted_text;
+ res->u.data.data = 0;
+
+ if (spec->d1_stack[spec->d1_level])
+ spec->d1_stack[spec->d1_level]->next = res;
+ spec->d1_stack[spec->d1_level] = res;
}
- spec->concatBuf[spec->d1_level].buf = new_buf;
+ if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
+ {
+ char *old_buf, *new_buf;
+
+ spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
+ new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
+ if ((old_buf = spec->concatBuf[spec->d1_level].buf))
+ {
+ memcpy (new_buf, old_buf, org_len);
+ xfree (old_buf);
+ }
+ spec->concatBuf[spec->d1_level].buf = new_buf;
+ }
+ memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
+ res->u.data.len += elen;
}
- memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
- res->u.data.len += elen;
}
static void execDataP (struct lexSpec *spec,
const char *ebuf, int elen, int formatted_text)
{
- execData (spec, ebuf, elen, formatted_text);
+ execData (spec, ebuf, elen, formatted_text, 0, 0); /* no attribute name, so
+                       execData takes its data-node (non-attribute) branch */
}
static void tagDataRelease (struct lexSpec *spec)
if (spec->d1_level == 0)
{
- logf (LOG_WARN, "in variant begin. No record type defined");
+ yaz_log (YLOG_WARN, "in variant begin. No record type defined");
return ;
}
if (class_len >= DATA1_MAX_SYMBOL)
ttype[type_len] = '\0';
#if REGX_DEBUG
- logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
+ yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
spec->d1_level);
#endif
if (!(tp =
- data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
- tclass, ttype)))
+ data1_getvartypeby_absyn(spec->dh, parent->root->u.root.absyn,
+ tclass, ttype)))
return;
if (parent->which != DATA1N_variant)
{
- res = data1_mk_node (spec->dh, spec->m);
- res->parent = parent;
- res->which = DATA1N_variant;
- res->u.variant.type = 0;
- res->u.variant.value = 0;
- res->root = parent->root;
-
- parent->last_child = res;
+ res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
if (spec->d1_stack[spec->d1_level])
- {
tagDataRelease (spec);
- spec->d1_stack[spec->d1_level]->next = res;
- }
- else
- parent->child = res;
spec->d1_stack[spec->d1_level] = res;
spec->d1_stack[++(spec->d1_level)] = NULL;
}
}
#if REGX_DEBUG
- logf (LOG_LOG, "variant node (%d)", spec->d1_level);
+ yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level);
#endif
parent = spec->d1_stack[spec->d1_level-1];
- res = data1_mk_node (spec->dh, spec->m);
- res->parent = parent;
- res->which = DATA1N_variant;
- res->root = parent->root;
+ res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
res->u.variant.type = tp;
if (value_len >= DATA1_LOCALDATA)
res->u.variant.value = res->lbuf;
- parent->last_child = res;
if (spec->d1_stack[spec->d1_level])
- {
tagDataRelease (spec);
- spec->d1_stack[spec->d1_level]->next = res;
- }
- else
- parent->child = res;
spec->d1_stack[spec->d1_level] = res;
spec->d1_stack[++(spec->d1_level)] = NULL;
}
static void tagBegin (struct lexSpec *spec,
const char *tag, int len)
{
- struct data1_node *parent;
- data1_element *elem = NULL;
- data1_node *partag;
- data1_node *res;
- data1_element *e = NULL;
- int localtag = 0;
-
if (spec->d1_level == 0)
{
- logf (LOG_WARN, "in element begin. No record type defined");
+ yaz_log (YLOG_WARN, "in element begin. No record type defined");
return ;
}
tagStrip (&tag, &len);
+ if (spec->d1_stack[spec->d1_level])
+ tagDataRelease (spec);
- parent = spec->d1_stack[spec->d1_level -1];
- partag = get_parent_tag(spec->dh, parent);
-
- res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
- res->parent = parent;
-
- if (len >= DATA1_LOCALDATA)
- res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
- else
- res->u.tag.tag = res->lbuf;
-
- memcpy (res->u.tag.tag, tag, len);
- res->u.tag.tag[len] = '\0';
-
#if REGX_DEBUG
- logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
+ yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
#endif
- if (parent->which == DATA1N_variant)
- return ;
- if (partag)
- if (!(e = partag->u.tag.element))
- localtag = 1;
-
- elem = data1_getelementbytagname (spec->dh,
- spec->d1_stack[0]->u.root.absyn,
- e, res->u.tag.tag);
- res->u.tag.element = elem;
- res->root = parent->root;
- parent->last_child = res;
- if (spec->d1_stack[spec->d1_level])
- {
- tagDataRelease (spec);
- spec->d1_stack[spec->d1_level]->next = res;
- }
- else
- parent->child = res;
- spec->d1_stack[spec->d1_level] = res;
+ spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
+ spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
spec->d1_stack[++(spec->d1_level)] = NULL;
}
break;
}
#if REGX_DEBUG
- logf (LOG_LOG, "end tag (%d)", spec->d1_level);
+ yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level);
#endif
}
static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
- struct DFA *dfa)
+ struct DFA *dfa, int greedy)
{
struct DFA_state *state = dfa->states[0];
struct DFA_tran *t;
- unsigned char c;
+ unsigned char c = 0;
unsigned char c_prev = 0;
int ptr = *pptr; /* current pointer */
int start_ptr = *pptr; /* first char of match */
int last_ptr = 0; /* last char of match */
int last_rule = 0; /* rule number of current match */
+ int restore_ptr = 0;
int i;
+ if (ptr)
+ {
+ --ptr;
+ c = f_win_advance (spec, &ptr);
+ }
while (1)
{
+ if (dfa->states[0] == state)
+ {
+ c_prev = c;
+ restore_ptr = ptr;
+ }
c = f_win_advance (spec, &ptr);
+
if (ptr == F_WIN_EOF)
{
if (last_rule)
}
break;
}
+
t = state->trans;
i = state->tran_no;
while (1)
- if (--i < 0)
+ if (--i < 0) /* no transition for character c */
{
if (last_rule)
{
return 1;
}
state = dfa->states[0];
+
+ ptr = restore_ptr;
+ c = f_win_advance (spec, &ptr);
+
start_ptr = ptr;
- c_prev = c;
+
break;
}
else if (c >= t->ch[0] && c <= t->ch[1])
{
state = dfa->states[t->to];
- if (state->rule_no)
- {
- if (c_prev == '\n')
- {
- last_rule = state->rule_no;
- last_ptr = ptr;
- }
- else
- {
- last_rule = state->rule_nno;
- last_ptr = ptr;
- }
- }
- break;
+ if (state->rule_no && c_prev == '\n')
+ {
+ last_rule = state->rule_no;
+ last_ptr = ptr;
+ }
+ else if (state->rule_nno)
+ {
+ last_rule = state->rule_nno;
+ last_ptr = ptr;
+ }
+ break;
}
else
t++;
#if HAVE_TCL_H
static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
- int argc, char **argv)
+ int argc, const char **argv)
{
struct lexSpec *spec = (struct lexSpec *) clientData;
if (argc < 2)
return TCL_ERROR;
if (!strcmp(argv[1], "record") && argc == 3)
{
- char *absynName = argv[2];
- data1_absyn *absyn;
+ const char *absynName = argv[2];
+ data1_node *res;
#if REGX_DEBUG
- logf (LOG_LOG, "begin record %s", absynName);
+ yaz_log (YLOG_LOG, "begin record %s", absynName);
#endif
- if (!(absyn = data1_get_absyn (spec->dh, absynName)))
- logf (LOG_WARN, "Unknown tagset: %s", absynName);
- else
- {
- data1_node *res;
-
- res = data1_mk_node (spec->dh, spec->m);
- res->which = DATA1N_root;
- res->u.root.type =
- data1_insert_string(spec->dh, res, spec->m, absynName);
- res->u.root.absyn = absyn;
- res->root = res;
-
- spec->d1_stack[spec->d1_level] = res;
- spec->d1_stack[++(spec->d1_level)] = NULL;
- }
+ res = data1_mk_root (spec->dh, spec->m, absynName);
+
+ spec->d1_level = 0;
+
+ spec->d1_stack[spec->d1_level++] = res;
+
+ res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
+
+ spec->d1_stack[spec->d1_level++] = res;
+
+ spec->d1_stack[spec->d1_level] = NULL;
}
else if (!strcmp(argv[1], "element") && argc == 3)
{
{
struct lexContext *lc = spec->context;
#if REGX_DEBUG
- logf (LOG_LOG, "begin context %s",argv[2]);
+ yaz_log (YLOG_LOG, "begin context %s",argv[2]);
#endif
while (lc && strcmp (argv[2], lc->name))
lc = lc->next;
spec->context_stack[++(spec->context_stack_top)] = lc;
}
else
- logf (LOG_WARN, "unknown context %s", argv[2]);
+ yaz_log (YLOG_WARN, "unknown context %s", argv[2]);
}
else
return TCL_ERROR;
}
static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
- int argc, char **argv)
+ int argc, const char **argv)
{
struct lexSpec *spec = (struct lexSpec *) clientData;
if (argc < 2)
(spec->d1_level)--;
}
#if REGX_DEBUG
- logf (LOG_LOG, "end record");
+ yaz_log (YLOG_LOG, "end record");
#endif
spec->stop_flag = 1;
}
else if (!strcmp (argv[1], "element"))
{
- int min_level = 1;
- char *element = 0;
+ int min_level = 2;
+ const char *element = 0;
if (argc >= 3 && !strcmp(argv[2], "-record"))
{
min_level = 0;
if (argc == 3)
element = argv[2];
tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
- if (spec->d1_level == 0)
+ if (spec->d1_level <= 1)
{
#if REGX_DEBUG
- logf (LOG_LOG, "end element end records");
+ yaz_log (YLOG_LOG, "end element end records");
#endif
spec->stop_flag = 1;
}
else if (!strcmp (argv[1], "context"))
{
#if REGX_DEBUG
- logf (LOG_LOG, "end context");
+ yaz_log (YLOG_LOG, "end context");
#endif
if (spec->context_stack_top)
(spec->context_stack_top)--;
}
static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
- int argc, char **argv)
+ int argc, const char **argv)
{
int argi = 1;
int textFlag = 0;
const char *element = 0;
+ const char *attribute = 0;
struct lexSpec *spec = (struct lexSpec *) clientData;
while (argi < argc)
if (argi < argc)
element = argv[argi++];
}
+ else if (!strcmp("-attribute", argv[argi]))
+ {
+ argi++;
+ if (argi < argc)
+ attribute = argv[argi++];
+ }
else
break;
}
#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
Tcl_DString ds;
char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
- execData (spec, native, strlen(native), textFlag);
+ execData (spec, native, strlen(native), textFlag, attribute,
+ attribute ? strlen(attribute) : 0);
Tcl_DStringFree (&ds);
#else
- execData (spec, argv[argi], strlen(argv[argi]), textFlag);
+ execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
+ attribute ? strlen(attribute) : 0);
#endif
argi++;
}
if (element)
- tagEnd (spec, 1, NULL, 0);
+ tagEnd (spec, 2, NULL, 0);
return TCL_OK;
}
static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
- int argc, char **argv)
+ int argc, const char **argv)
{
struct lexSpec *spec = (struct lexSpec *) clientData;
int argi = 1;
if (ret != TCL_OK)
{
const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
- logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
+ yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
spec->tcl_interp->errorLine,
spec->tcl_interp->result,
err ? err : "[NO ERRORINFO]");
r = execTok (spec, &s, &cmd_str, &cmd_len);
if (r < 2)
{
- logf (LOG_WARN, "missing keyword after 'begin'");
+ yaz_log (YLOG_WARN, "missing keyword after 'begin'");
continue;
}
p = regxStrz (cmd_str, cmd_len, ptmp);
r = execTok (spec, &s, &cmd_str, &cmd_len);
if (r < 2)
continue;
- if (spec->d1_level == 0)
+ if (spec->d1_level <= 1)
{
static char absynName[64];
- data1_absyn *absyn;
+ data1_node *res;
if (cmd_len > 63)
cmd_len = 63;
memcpy (absynName, cmd_str, cmd_len);
absynName[cmd_len] = '\0';
-
#if REGX_DEBUG
- logf (LOG_LOG, "begin record %s", absynName);
+ yaz_log (YLOG_LOG, "begin record %s", absynName);
#endif
- if (!(absyn = data1_get_absyn (spec->dh, absynName)))
- logf (LOG_WARN, "Unknown tagset: %s", absynName);
- else
- {
- data1_node *res;
-
- res = data1_mk_node (spec->dh, spec->m);
- res->which = DATA1N_root;
- res->u.root.type = absynName;
- res->u.root.absyn = absyn;
- res->root = res;
-
- spec->d1_stack[spec->d1_level] = res;
- spec->d1_stack[++(spec->d1_level)] = NULL;
- }
+ res = data1_mk_root (spec->dh, spec->m, absynName);
+
+ spec->d1_level = 0;
+
+ spec->d1_stack[spec->d1_level++] = res;
+
+ res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
+
+ spec->d1_stack[spec->d1_level++] = res;
+
+ spec->d1_stack[spec->d1_level] = NULL;
}
r = execTok (spec, &s, &cmd_str, &cmd_len);
}
r = execTok (spec, &s, &cmd_str, &cmd_len);
p = regxStrz (cmd_str, cmd_len, ptmp);
#if REGX_DEBUG
- logf (LOG_LOG, "begin context %s", p);
+ yaz_log (YLOG_LOG, "begin context %s", p);
#endif
while (lc && strcmp (p, lc->name))
lc = lc->next;
if (lc)
spec->context_stack[++(spec->context_stack_top)] = lc;
else
- logf (LOG_WARN, "unknown context %s", p);
+ yaz_log (YLOG_WARN, "unknown context %s", p);
}
r = execTok (spec, &s, &cmd_str, &cmd_len);
}
else
{
- logf (LOG_WARN, "bad keyword '%s' after begin", p);
+ yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p);
}
}
else if (!strcmp (p, "end"))
r = execTok (spec, &s, &cmd_str, &cmd_len);
if (r < 2)
{
- logf (LOG_WARN, "missing keyword after 'end'");
+ yaz_log (YLOG_WARN, "missing keyword after 'end'");
continue;
}
p = regxStrz (cmd_str, cmd_len, ptmp);
}
r = execTok (spec, &s, &cmd_str, &cmd_len);
#if REGX_DEBUG
- logf (LOG_LOG, "end record");
+ yaz_log (YLOG_LOG, "end record");
#endif
spec->stop_flag = 1;
}
else if (!strcmp (p, "element"))
{
- int min_level = 1;
+ int min_level = 2;
while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
{
if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
}
else
tagEnd (spec, min_level, NULL, 0);
- if (spec->d1_level == 0)
+ if (spec->d1_level <= 1)
{
#if REGX_DEBUG
- logf (LOG_LOG, "end element end records");
+ yaz_log (YLOG_LOG, "end element end records");
#endif
spec->stop_flag = 1;
}
else if (!strcmp (p, "context"))
{
#if REGX_DEBUG
- logf (LOG_LOG, "end context");
+ yaz_log (YLOG_LOG, "end context");
#endif
if (spec->context_stack_top)
(spec->context_stack_top)--;
r = execTok (spec, &s, &cmd_str, &cmd_len);
}
else
- logf (LOG_WARN, "bad keyword '%s' after end", p);
+ yaz_log (YLOG_WARN, "bad keyword '%s' after end", p);
}
else if (!strcmp (p, "data"))
{
int textFlag = 0;
int element_len;
const char *element_str = NULL;
+ int attribute_len;
+ const char *attribute_str = NULL;
while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
{
if (r < 2)
break;
}
+ else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
+ cmd_len))
+ {
+ r = execTok (spec, &s, &attribute_str, &attribute_len);
+ if (r < 2)
+ break;
+ }
else
- logf (LOG_WARN, "bad data option: %.*s",
+ yaz_log (YLOG_WARN, "bad data option: %.*s",
cmd_len, cmd_str);
}
if (r != 2)
{
- logf (LOG_WARN, "missing data item after data");
+ yaz_log (YLOG_WARN, "missing data item after data");
continue;
}
if (element_str)
tagBegin (spec, element_str, element_len);
do
{
- execData (spec, cmd_str, cmd_len,textFlag);
+ execData (spec, cmd_str, cmd_len, textFlag,
+ attribute_str, attribute_len);
r = execTok (spec, &s, &cmd_str, &cmd_len);
} while (r > 1);
if (element_str)
- tagEnd (spec, 1, NULL, 0);
+ tagEnd (spec, 2, NULL, 0);
}
else if (!strcmp (p, "unread"))
{
r = execTok (spec, &s, &cmd_str, &cmd_len);
if (r < 2)
{
- logf (LOG_WARN, "missing number after -offset");
+ yaz_log (YLOG_WARN, "missing number after -offset");
continue;
}
p = regxStrz (cmd_str, cmd_len, ptmp);
offset = 0;
if (r < 2)
{
- logf (LOG_WARN, "missing index after unread command");
+ yaz_log (YLOG_WARN, "missing index after unread command");
continue;
}
if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
{
- logf (LOG_WARN, "bad index after unread command");
+ yaz_log (YLOG_WARN, "bad index after unread command");
continue;
}
else
if (lc)
spec->context_stack[spec->context_stack_top] = lc;
else
- logf (LOG_WARN, "unknown context %s", p);
+ yaz_log (YLOG_WARN, "unknown context %s", p);
}
r = execTok (spec, &s, &cmd_str, &cmd_len);
}
else
{
- logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
+ yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
r = execTok (spec, &s, &cmd_str, &cmd_len);
continue;
}
if (r > 1)
{
- logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
+ yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
do {
r = execTok (spec, &s, &cmd_str, &cmd_len);
} while (r > 1);
if (ap->u.pattern.body)
{
arg_start[arg_no] = *pptr;
- if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
+ if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
{
arg_end[arg_no] = F_WIN_EOF;
arg_no++;
arg_start[arg_no] = F_WIN_EOF;
arg_end[arg_no] = F_WIN_EOF;
-/* return 1*/
+ yaz_log(YLOG_DEBUG, "Pattern match rest of record");
+ *pptr = F_WIN_EOF;
}
else
{
else
{
arg_start[arg_no] = *pptr;
- if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
+ if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
return 1;
if (sptr != arg_start[arg_no])
return 1;
int ruleNo, int start_ptr, int *pptr)
{
#if REGX_DEBUG
- logf (LOG_LOG, "exec rule %d", ruleNo);
+ yaz_log (YLOG_LOG, "exec rule %d", ruleNo);
#endif
return execAction (spec, context->fastRule[ruleNo]->actionList,
start_ptr, pptr);
if (spec->f_win_ef && *ptr != F_WIN_EOF)
{
#if REGX_DEBUG
- logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
+ yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr);
#endif
(*spec->f_win_ef)(spec->f_win_fh, *ptr);
}
}
if (!lt)
{
- logf (LOG_WARN, "cannot find context %s", context_name);
+ yaz_log (YLOG_WARN, "cannot find context %s", context_name);
return NULL;
}
spec->context_stack[spec->context_stack_top] = lt;
xfree (specs);
}
-void *grs_init(void)
+void *grs_init(Res res, RecType recType) /* allocates the per-handler state;
+                          res and recType are not referenced in this body */
{
struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
specs->spec = 0;
+ strcpy(specs->type, ""); /* empty filter name until grs_config() stores one */
return specs;
}
+
+/*
+ * grs_config: record the filter specification name for this handler.
+ *
+ *   clientData  the struct lexSpecs allocated by grs_init()
+ *   res         not used by this function
+ *   args        specification name; copied into specs->type when it fits
+ *
+ * specs->type is later compared with the name of the loaded lexSpec and
+ * handed to lexSpecCreate() by the reader functions.  A NULL name is
+ * ignored.  A name that does not fit in specs->type leaves the previously
+ * stored name untouched, and a warning is logged so the misconfiguration
+ * is visible instead of being dropped silently.
+ */
+void grs_config(void *clientData, Res res, const char *args)
+{
+    struct lexSpecs *specs = (struct lexSpecs *) clientData;
+
+    if (!args)
+        return;
+    if (strlen(args) >= sizeof(specs->type))
+    {
+        /* %.40s keeps the log line bounded however long the name is */
+        yaz_log (YLOG_WARN, "regx filter name too long (max %d chars): %.40s...",
+                 (int) sizeof(specs->type) - 1, args);
+        return;
+    }
+    strcpy(specs->type, args);
+}
+
data1_node *grs_read_regx (struct grs_read_info *p)
{
int res;
struct lexSpec **curLexSpec = &specs->spec;
#if REGX_DEBUG
- logf (LOG_LOG, "grs_read_regx");
+ yaz_log (YLOG_LOG, "grs_read_regx");
#endif
- if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
+ if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
{
if (*curLexSpec)
lexSpecDestroy (curLexSpec);
- *curLexSpec = lexSpecCreate (p->type, p->dh);
+ *curLexSpec = lexSpecCreate (specs->type, p->dh);
res = readFileSpec (*curLexSpec);
if (res)
{
return lexRoot (*curLexSpec, p->offset, "main");
}
-static struct recTypeGrs regx_type = {
- "regx",
+/* Extract callback listed in regx_type: hands the request to
+   zebra_grs_extract(), with grs_read_regx as the reader function. */
+static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
+{
+    int status = zebra_grs_extract(clientData, ctrl, grs_read_regx);
+
+    return status;
+}
+
+/* Retrieve callback listed in regx_type: hands the request to
+   zebra_grs_retrieve(), with grs_read_regx as the reader function. */
+static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
+{
+    int status = zebra_grs_retrieve(clientData, ctrl, grs_read_regx);
+
+    return status;
+}
+
+static struct recType regx_type = {
+ "grs.regx",
grs_init,
+ grs_config,
grs_destroy,
- grs_read_regx
+ extract_regx,
+ retrieve_regx,
};
-RecTypeGrs recTypeGrs_regx = &regx_type;
#if HAVE_TCL_H
data1_node *grs_read_tcl (struct grs_read_info *p)
struct lexSpec **curLexSpec = &specs->spec;
#if REGX_DEBUG
- logf (LOG_LOG, "grs_read_tcl");
+ yaz_log (YLOG_LOG, "grs_read_tcl");
#endif
- if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
+ if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
{
Tcl_Interp *tcl_interp;
if (*curLexSpec)
lexSpecDestroy (curLexSpec);
- *curLexSpec = lexSpecCreate (p->type, p->dh);
+ *curLexSpec = lexSpecCreate (specs->type, p->dh);
Tcl_FindExecutable("");
tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
Tcl_Init(tcl_interp);
return lexRoot (*curLexSpec, p->offset, "main");
}
-static struct recTypeGrs tcl_type = {
- "tcl",
+/* Extract callback listed in tcl_type: hands the request to
+   zebra_grs_extract(), with grs_read_tcl as the reader function. */
+static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl)
+{
+    int status = zebra_grs_extract(clientData, ctrl, grs_read_tcl);
+
+    return status;
+}
+
+/* Retrieve callback listed in tcl_type: hands the request to
+   zebra_grs_retrieve(), with grs_read_tcl as the reader function. */
+static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl)
+{
+    int status = zebra_grs_retrieve(clientData, ctrl, grs_read_tcl);
+
+    return status;
+}
+
+static struct recType tcl_type = {
+ "grs.tcl",
grs_init,
+ grs_config,
grs_destroy,
- grs_read_tcl
+ extract_tcl,
+ retrieve_tcl,
};
-RecTypeGrs recTypeGrs_tcl = &tcl_type;
#endif
+
+/*
+ * Table of the record types defined in this file, terminated by 0.
+ * The symbol is named idzebra_filter_grs_regx when IDZEBRA_STATIC_GRS_REGX
+ * is defined and idzebra_filter otherwise.  The Tcl entry is present only
+ * when HAVE_TCL_H is set, matching the #if around tcl_type itself.
+ * NOTE(review): presumably the distinct name is for static linking and the
+ * generic one for loadable modules -- confirm against the filter loader.
+ */
+RecType
+#ifdef IDZEBRA_STATIC_GRS_REGX
+idzebra_filter_grs_regx
+#else
+idzebra_filter
+#endif
+
+[] = {
+    &regx_type,
+#if HAVE_TCL_H
+    &tcl_type,
+#endif
+    0,
+};