2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.24 1999-05-21 11:08:46 adam
8 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
9 * script so that it reads uninstalled Tcl source.
11 * Revision 1.23 1999/05/20 12:57:18 adam
12 * Implemented TCL filter. Updated recctrl system.
14 * Revision 1.22 1998/11/03 16:07:13 adam
17 * Revision 1.21 1998/11/03 15:43:39 adam
18 * Fixed bug introduced by previous commit.
20 * Revision 1.20 1998/11/03 14:51:28 adam
21 * Changed code so that it creates as few data1 nodes as possible.
23 * Revision 1.19 1998/11/03 10:22:39 adam
24 * Fixed memory leak that could occur when large data1 nodes were
25 * concatenated. Data-type data1_nodes may have multiple nodes.
27 * Revision 1.18 1998/10/15 13:11:47 adam
28 * Added support for option -record for "end element". When specified
29 * end element will mark end-of-record when at outer-level.
31 * Revision 1.17 1998/07/01 10:13:51 adam
34 * Revision 1.16 1998/06/30 15:15:09 adam
35 * Tags are trimmed: white space removed before- and after the tag.
37 * Revision 1.15 1998/06/30 12:55:45 adam
40 * Revision 1.14 1998/03/05 08:41:00 adam
41 * Implemented rule contexts.
43 * Revision 1.13 1997/12/12 06:33:58 adam
44 * Fixed bug that showed up when multiple filters were used.
45 * Made one routine thread-safe.
47 * Revision 1.12 1997/11/18 10:03:24 adam
48 * Member num_children removed from data1_node.
50 * Revision 1.11 1997/11/06 11:41:01 adam
51 * Implemented "begin variant" for the sgml.regx filter.
53 * Revision 1.10 1997/10/31 12:36:12 adam
54 * Minor change that avoids compiler warning.
56 * Revision 1.9 1997/09/29 09:02:49 adam
57 * Fixed small bug (introduced by previous commit).
59 * Revision 1.8 1997/09/17 12:19:22 adam
60 * Zebra version corresponds to YAZ version 1.4.
61 * Changed Zebra server so that it doesn't depend on global common_resource.
63 * Revision 1.7 1997/07/15 16:33:07 adam
64 * Check for zero length in execData.
66 * Revision 1.6 1997/02/24 10:41:51 adam
67 * Cleanup of code and commented out the "end element-end-record" code.
69 * Revision 1.5 1997/02/19 16:22:33 adam
70 * Fixed "end element" to terminate record in outer-most level.
72 * Revision 1.4 1997/02/12 20:42:58 adam
73 * Changed some log messages.
75 * Revision 1.3 1996/11/08 14:05:33 adam
76 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
78 * Revision 1.2 1996/10/29 14:02:09 adam
79 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
80 * data1_get_tabpath is used.
82 * Revision 1.1 1996/10/11 10:57:30 adam
83 * New module recctrl. Used to manage records (extract/retrieval).
85 * Revision 1.24 1996/06/17 14:25:31 adam
86 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
88 * Revision 1.23 1996/06/04 10:19:00 adam
89 * Minor changes - removed include of ctype.h.
91 * Revision 1.22 1996/06/03 15:23:13 adam
92 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
94 * Revision 1.21 1996/05/14 16:58:38 adam
97 * Revision 1.20 1996/05/01 13:46:36 adam
98 * First work on multiple records in one file.
99 * New option, -offset, to the "unread" command in the filter module.
101 * Revision 1.19 1996/02/12 16:18:20 adam
102 * Yet another bug fix in implementation of unread command.
104 * Revision 1.18 1996/02/12 16:07:54 adam
105 * Bug fix in new unread command.
107 * Revision 1.17 1996/02/12 15:56:11 adam
108 * New code command: unread.
110 * Revision 1.16 1996/01/17 14:57:51 adam
111 * Prototype changed for reader functions in extract/retrieve. File
112 * is identified by 'void *' instead of 'int'.
114 * Revision 1.15 1996/01/08 19:15:47 adam
115 * New input filter that works!
117 * Revision 1.14 1996/01/08 09:10:38 adam
118 * Yet another complete rework on this module.
120 * Revision 1.13 1995/12/15 17:21:50 adam
121 * This version is able to set data.formatted_text in data1-nodes.
123 * Revision 1.12 1995/12/15 16:20:10 adam
124 * The filter files (*.flt) are read from the path given by data1_tabpath.
126 * Revision 1.11 1995/12/15 12:35:16 adam
129 * Revision 1.10 1995/12/15 10:35:36 adam
132 * Revision 1.9 1995/12/14 16:38:48 adam
133 * Completely new attempt to make regular expression parsing.
135 * Revision 1.8 1995/12/13 17:16:59 adam
138 * Revision 1.7 1995/12/13 16:51:58 adam
139 * Modified to set last_child in data1_nodes.
140 * Uses destroy handler to free up data text nodes.
142 * Revision 1.6 1995/12/13 13:45:37 quinn
143 * Changed data1 to use nmem.
145 * Revision 1.5 1995/12/11 09:12:52 adam
146 * The rec_get function returns NULL if record doesn't exist - will
147 * happen in the server if the result set records have been deleted since
148 * the creation of the set (i.e. the search).
149 * The server saves a result temporarily if it is 'volatile', i.e. the
150 * set is register dependent.
152 * Revision 1.4 1995/12/05 16:57:40 adam
153 * More work on regular patterns.
155 * Revision 1.3 1995/12/05 09:37:09 adam
156 * One malloc was renamed to xmalloc.
158 * Revision 1.2 1995/12/04 17:59:24 adam
159 * More work on regular expression conversion.
161 * Revision 1.1 1995/12/04 14:25:30 adam
162 * Started work on regular expression parsed input to structured records.
171 #include <zebrautl.h>
/* Sentinel position that signals end of input: f_win_advance, tryMatch and
 * lexNode all compare their read position against it. */
181 #define F_WIN_EOF 2000000000
/* Token / action kind codes (only two of the REGX_* values appear in this
 * excerpt). readOneSpec compares readParseToken's result with REGX_CONTEXT. */
185 #define REGX_PATTERN 1
190 #define REGX_CONTEXT 6
/* One entry of an action list; entries are chained through `next`. */
197 struct lexRuleAction {
201 struct DFA *dfa; /* REGX_PATTERN */
204 struct regxCode *code; /* REGX_CODE */
206 struct lexRuleAction *next;
/* Rule bookkeeping: a rule's info carries its action list and rules are
 * chained through `next` (see the rp->info / rp->next uses in readOneSpec). */
211 struct lexRuleAction *actionList;
215 struct lexRuleInfo info;
216 struct lexRule *next;
/* Per-context data: the rule list, a table indexed by rule number (filled in
 * by readFileSpec) and the begin / end / init action lists. Contexts are
 * chained through `next`. */
222 struct lexRule *rules;
223 struct lexRuleInfo **fastRule;
227 struct lexRuleAction *beginActionList;
228 struct lexRuleAction *endActionList;
229 struct lexRuleAction *initActionList;
230 struct lexContext *next;
/* Per-level text accumulation buffer; execData uses its buf / len / max. */
233 struct lexConcatBuf {
/* Filter state. `context` heads the list of known contexts;
 * context_stack[context_stack_top] is the context currently in effect
 * (that is the entry lexNode reads). */
241 struct lexContext *context;
243 struct lexContext **context_stack;
244 int context_stack_size;
245 int context_stack_top;
/* Non-NULL only for the Tcl flavour of the filter (set in grs_read_tcl). */
251 Tcl_Interp *tcl_interp;
/* Read window over the input plus the I/O callbacks; grs_read_regx and
 * grs_read_tcl assign them from the grs_read_info (endf, readf, seekf).
 * NOTE(review): the window offsets are int while the callbacks use off_t;
 * confirm inputs never exceed the int range. */
254 void (*f_win_ef)(void *, off_t);
256 int f_win_start; /* first byte of buffer is this file offset */
257 int f_win_end; /* last byte of buffer is this offset - 1 */
258 int f_win_size; /* size of buffer */
259 char *f_win_buf; /* buffer itself */
260 int (*f_win_rf)(void *, char *, size_t);
261 off_t (*f_win_sf)(void *, off_t);
/* One concat buffer and one "current node" slot per nesting level; both
 * arrays are sized by maxLevel in lexSpecCreate. */
263 struct lexConcatBuf **concatBuf;
265 data1_node **d1_stack;
/* The spec held by the filter's client data (accessed as specs->spec). */
276 struct lexSpec *spec;
/*
 * f_win_get: return a pointer into the read window (spec->f_win_buf) that
 * holds the input bytes for the range start_pos .. end_pos, and store the
 * number of bytes available there in *size.
 *
 * Cases visible in this excerpt:
 *  1. The range already lies inside the window: return the buffer address
 *     at offset (start_pos - f_win_start) with *size = end_pos - start_pos.
 *  2. start_pos is before the window or at/after its end: seek with
 *     f_win_sf, allocate the buffer if it does not exist yet, refill it
 *     with f_win_rf and clamp *size to the requested length.
 *  3. Otherwise the range starts inside the window but runs past it: the
 *     bytes from start_pos onwards are moved to the front of the buffer,
 *     more data is read after them, and *size is clamped as above.
 *
 * The returned pointer refers to the shared window buffer, so a later call
 * that refills or shifts the window overwrites what it points at.
 *
 * NOTE(review): positions arrive as off_t but are stored in int variables
 * (off, f_win_start, f_win_end); confirm this cannot truncate.
 */
279 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
282 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: requested range is entirely inside the current window. */
284 if (off >= 0 && end_pos <= spec->f_win_end)
286 *size = end_pos - start_pos;
287 return spec->f_win_buf + off;
/* Case 2: no overlap with the window; reposition and refill from scratch. */
289 if (off < 0 || start_pos >= spec->f_win_end)
291 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
292 spec->f_win_start = start_pos;
294 if (!spec->f_win_buf)
295 spec->f_win_buf = xmalloc (spec->f_win_size);
296 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
298 spec->f_win_end = spec->f_win_start + *size;
300 if (*size > end_pos - start_pos)
301 *size = end_pos - start_pos;
302 return spec->f_win_buf;
/* Case 3: keep the tail of the window (forward byte copy, so the overlap is
 * safe) and append freshly read data behind it. */
304 for (i = 0; i<spec->f_win_end - start_pos; i++)
305 spec->f_win_buf[i] = spec->f_win_buf[i + off];
306 r = (*spec->f_win_rf)(spec->f_win_fh,
308 spec->f_win_size - i);
309 spec->f_win_start = start_pos;
310 spec->f_win_end += r;
312 if (*size > end_pos - start_pos)
313 *size = end_pos - start_pos;
314 return spec->f_win_buf;
/*
 * f_win_advance: fetch the input byte at position *pos and step *pos forward.
 *
 * Fast path: when *pos lies inside the current window the byte is read
 * straight from f_win_buf and *pos is post-incremented. Otherwise, a
 * position equal to F_WIN_EOF is tested for, and f_win_get is asked for
 * the one-byte range starting at *pos. How the EOF case and the f_win_get
 * result are turned into a return value is on lines not shown here.
 */
317 static int f_win_advance (struct lexSpec *spec, int *pos)
322 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
323 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
324 if (*pos == F_WIN_EOF)
326 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: dispose of the regxCode object referenced through pp
 * (called from actionListDel for code actions). Only the fetch of *pp is
 * visible in this excerpt; the actual release statements are not shown.
 */
336 static void regxCodeDel (struct regxCode **pp)
338 struct regxCode *p = *pp;
/*
 * regxCodeMk: build a regxCode object holding a private copy of the len
 * bytes starting at buf.
 *
 * len+1 bytes are allocated for the copy, presumably to leave room for a
 * terminating NUL; the terminator write and the store into *pp are not
 * visible in this excerpt.
 */
347 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
351 p = xmalloc (sizeof(*p));
352 p->str = xmalloc (len+1);
353 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: create a DFA object configured for the pattern syntax used in
 * filter specifications.
 *
 * The parser's character map is adjusted: the entries for blank and tab are
 * removed and '/' is added with value 0. Presumably this makes '/' end a
 * pattern (readOneSpec expects '/' after dfa_parse returns); confirm against
 * the dfa_parse_cmap_* documentation.
 */
358 static struct DFA *lexSpecDFA (void)
363 dfa_parse_cmap_del (dfa, ' ');
364 dfa_parse_cmap_del (dfa, '\t');
365 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: release every action in the list whose head is *rap.
 *
 * The loop advances through ra1, so the successor must be saved before the
 * current element is released. Pattern actions delete their DFA; code
 * actions delete their regxCode.
 */
369 static void actionListDel (struct lexRuleAction **rap)
371 struct lexRuleAction *ra1, *ra;
373 for (ra = *rap; ra; ra = ra1)
379 dfa_delete (&ra->u.pattern.dfa);
382 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a new lexing context called `name`.
 *
 * The name is duplicated, a fresh DFA is created through lexSpecDFA and the
 * begin / end / init action lists start out empty. Initialisation of the
 * remaining members is on lines not shown here.
 */
390 static struct lexContext *lexContextCreate (const char *name)
392 struct lexContext *p = xmalloc (sizeof(*p));
394 p->name = xstrdup (name);
397 p->dfa = lexSpecDFA ();
400 p->beginActionList = NULL;
401 p->endActionList = NULL;
402 p->initActionList = NULL;
/*
 * lexContextDestroy: release a context created by lexContextCreate.
 *
 * Every rule's action list is deleted (iterating via rp1, the saved
 * successor), followed by the context's begin and end action lists.
 *
 * NOTE(review): no release of initActionList is visible in this excerpt;
 * confirm it is freed on a line not shown.
 */
407 static void lexContextDestroy (struct lexContext *p)
409 struct lexRule *rp, *rp1;
412 for (rp = p->rules; rp; rp = rp1)
415 actionListDel (&rp->info.actionList);
418 actionListDel (&p->beginActionList);
419 actionListDel (&p->endActionList);
/*
 * lexSpecCreate: allocate and initialise a filter specification object.
 *
 * Visible set-up: a private copy of `name`, a context stack with room for
 * 100 entries, one empty concat buffer per nesting level and a d1 node stack
 * with maxLevel slots. The assignment of maxLevel and the use of `dh` are on
 * lines not shown here.
 */
424 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
429 p = xmalloc (sizeof(*p));
430 p->name = xmalloc (strlen(name)+1);
431 strcpy (p->name, name);
438 p->context_stack_size = 100;
439 p->context_stack = xmalloc (sizeof(*p->context_stack) *
440 p->context_stack_size);
/* One accumulation buffer per nesting level, each starting out empty. */
444 p->concatBuf = xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
445 for (i = 0; i < p->maxLevel; i++)
447 p->concatBuf[i] = xmalloc (sizeof(**p->concatBuf));
448 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
449 p->concatBuf[i]->buf = 0;
451 p->d1_stack = xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release the specification referenced through pp.
 *
 * Visible steps: free each per-level concat buffer object and the array
 * holding them, destroy every context in the list, delete the Tcl
 * interpreter, and free the read window and the context stack.
 * grs_read_regx / grs_read_tcl call this even when no spec exists yet, so a
 * NULL check is presumably among the lines not shown.
 *
 * NOTE(review): only the lexConcatBuf objects are visibly freed; confirm that
 * each concatBuf[i]->buf (allocated in execData) is released as well.
 */
456 static void lexSpecDestroy (struct lexSpec **pp)
459 struct lexContext *lt;
467 for (i = 0; i < p->maxLevel; i++)
468 xfree (p->concatBuf[i]);
469 xfree (p->concatBuf);
/* The successor is fetched before the context is destroyed. */
474 struct lexContext *lt_next = lt->next;
475 lexContextDestroy (lt);
480 Tcl_DeleteInterp (p->tcl_interp);
483 xfree (p->f_win_buf);
484 xfree (p->context_stack);
/*
 * readParseToken: read the next token of a filter specification line.
 *
 * *cpp is the read position; leading blanks, tabs and newlines are skipped.
 * For keyword tokens the letters are collected in lower case into `cmd`
 * (capped by the size of that buffer) and then compared with "begin",
 * "end", "body", "context" and "init". Unknown keywords and unexpected
 * characters are reported with LOG_WARN. The token codes returned and the
 * meaning of *len are determined on lines not shown here.
 */
490 static int readParseToken (const char **cpp, int *len)
492 const char *cp = *cpp;
496 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
/* Keyword letters are folded to lower case. */
525 if (*cp >= 'a' && *cp <= 'z')
527 else if (*cp >= 'A' && *cp <= 'Z')
528 cmd[i] = *cp + 'a' - 'A';
531 if (i < sizeof(cmd)-2)
538 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
540 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
546 if (!strcmp (cmd, "begin"))
548 else if (!strcmp (cmd, "end"))
550 else if (!strcmp (cmd, "body"))
552 else if (!strcmp (cmd, "context"))
554 else if (!strcmp (cmd, "init"))
558 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from `s` and build the
 * corresponding action list at *ap.
 *
 * Tokens are taken from readParseToken until it returns 0. Code tokens
 * become regxCode objects; pattern tokens get their own DFA, which is parsed
 * and turned into states (the preceding BODY marker is recorded in
 * u.pattern.body). BEGIN and INIT are rejected with a warning.
 *
 * NOTE(review): "%.*s" takes an int precision but is given `s-s0`; if both
 * are pointers the argument is a ptrdiff_t. Confirm and cast if so.
 */
564 static int actionListMk (struct lexSpec *spec, const char *s,
565 struct lexRuleAction **ap)
571 while ((tok = readParseToken (&s, &len)))
579 *ap = xmalloc (sizeof(**ap));
581 regxCodeMk (&(*ap)->u.code, s, len);
585 *ap = xmalloc (sizeof(**ap));
587 (*ap)->u.pattern.body = bodyMark;
589 (*ap)->u.pattern.dfa = lexSpecDFA ();
591 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
596 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
599 dfa_mkstate ((*ap)->u.pattern.dfa);
603 logf (LOG_WARN, "cannot use BEGIN here");
606 logf (LOG_WARN, "cannot use INIT here");
609 *ap = xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line `s` of a filter specification.
 *
 * Visible handling, keyed on the first token:
 *  - CONTEXT <name>: create a new context and link it in front of the
 *    current context list;
 *  - otherwise a context named "main" is created (the condition guarding
 *    this is not shown);
 *  - BEGIN / END / INIT: replace the corresponding action list of the
 *    current context;
 *  - a pattern: parse it into the context's DFA, require '/' after it,
 *    then prepend a new rule (numbered from ruleNo) and build its actions.
 *
 * NOTE(review): context_name has room for 32 bytes and is filled with `len`
 * bytes plus a terminator; confirm `len` is capped on a line not shown.
 * NOTE(review): the results of the actionListMk calls are not visibly used.
 */
619 int readOneSpec (struct lexSpec *spec, const char *s)
623 struct lexContext *lc;
625 tok = readParseToken (&s, &len);
626 if (tok == REGX_CONTEXT)
628 char context_name[32];
629 tok = readParseToken (&s, &len);
630 if (tok != REGX_CODE)
632 logf (LOG_WARN, "missing name after CONTEXT keyword");
637 memcpy (context_name, s, len);
638 context_name[len] = '\0';
639 lc = lexContextCreate (context_name);
640 lc->next = spec->context;
645 spec->context = lexContextCreate ("main");
/* BEGIN / END / INIT: any previous list is discarded before rebuilding. */
650 actionListDel (&spec->context->beginActionList);
651 actionListMk (spec, s, &spec->context->beginActionList);
654 actionListDel (&spec->context->endActionList);
655 actionListMk (spec, s, &spec->context->endActionList);
658 actionListDel (&spec->context->initActionList);
659 actionListMk (spec, s, &spec->context->initActionList);
663 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
665 r = dfa_parse (spec->context->dfa, &s);
668 logf (LOG_WARN, "regular expression error. r=%d", r);
673 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* New rules are pushed at the head of the context's rule list. */
677 rp = xmalloc (sizeof(*rp));
678 rp->info.no = spec->context->ruleNo++;
679 rp->next = spec->context->rules;
680 spec->context->rules = rp;
681 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter specification file for `spec`.
 *
 * The file is looked up along the data1 tab path. When a Tcl interpreter is
 * attached, "<name>.tflt" is tried; "<name>.flt" is also tried (the exact
 * fallback conditions are on lines not shown). Lines are assembled in
 * lineBuf and handed to readOneSpec; comment / blank handling and line
 * continuation are only partly visible. Afterwards every context gets its
 * fastRule table (indexed by rule number) and its DFA states are built.
 *
 * NOTE(review): the file name is written into lineBuf with sprintf; confirm
 * lineSize is large enough for spec->name plus the extension.
 * NOTE(review): the "reading regx filter %s.flt" message is given lineBuf,
 * which after the sprintf calls above would already contain an extension.
 * NOTE(review): the result of readOneSpec is not visibly checked even though
 * an `errors` counter exists.
 */
686 int readFileSpec (struct lexSpec *spec)
688 struct lexContext *lc;
691 int c, i, errors = 0;
694 lineBuf = xmalloc (1+lineSize);
696 if (spec->tcl_interp)
698 sprintf (lineBuf, "%s.tflt", spec->name);
699 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
704 sprintf (lineBuf, "%s.flt", spec->name);
705 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
709 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
713 logf (LOG_LOG, "reading regx filter %s.flt", lineBuf);
714 sprintf (lineBuf, "%s.flt", spec->name);
715 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
718 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
723 if (spec->tcl_interp)
724 logf (LOG_LOG, "Tcl enabled");
/* Lines starting with '#', newline, blank or tab are treated specially here;
 * the visible loop consumes the rest of such a line. */
731 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
733 while (c != '\n' && c != EOF)
752 if (c != ' ' && c != '\t')
761 readOneSpec (spec, lineBuf);
762 spec->lineNo += addLine;
771 debug_dfa_followpos = 1;
/* Build the rule-number -> rule-info lookup table for each context. */
774 for (lc = spec->context; lc; lc = lc->next)
777 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
778 for (i = 0; i < lc->ruleNo; i++)
779 lc->fastRule[i] = NULL;
780 for (rp = lc->rules; rp; rp = rp->next)
781 lc->fastRule[rp->info.no] = &rp->info;
782 dfa_mkstate (lc->dfa);
/* File-scope spec pointer.
 * NOTE(review): grs_read_regx and grs_read_tcl declare locals with the same
 * name that shadow this variable, and no other use is visible in this
 * excerpt; it may be a leftover. */
791 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text (ebuf) to the data node at the current
 * nesting level.
 *
 * If the node on top of the stack at d1_level is already a data node its
 * current length is taken as the append offset; otherwise a new text data
 * node is created under the parent (d1_stack[d1_level-1]) and linked in as
 * the latest sibling / last child. The bytes are then appended to the
 * level's concat buffer, which is enlarged (new size = needed + 256) when
 * too small, and the node's length is increased by elen. The node's own data
 * pointer is filled in later by tagDataRelease.
 *
 * formatted_text is stored in a newly created node. A zero elen is tested
 * for at the top, and so is d1_level <= 1; what those tests lead to is on
 * lines not shown.
 *
 * NOTE(review): several statements below assign res->u.data.data in
 * different ways (nmem_malloc / lbuf / 0); presumably only one variant is
 * active, selected by preprocessor conditionals not visible here.
 */
794 static void execData (struct lexSpec *spec,
795 const char *ebuf, int elen, int formatted_text)
797 struct data1_node *res, *parent;
800 if (elen == 0) /* shouldn't happen, but it does! */
804 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
805 ebuf, 15, ebuf + elen-15);
807 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
809 logf (LOG_DEBUG, "data (%d bytes)", elen);
812 if (spec->d1_level <= 1)
815 parent = spec->d1_stack[spec->d1_level -1];
/* Continue an existing data node when there is one at this level. */
818 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
819 org_len = res->u.data.len;
824 res = data1_mk_node (spec->dh, spec->m);
825 res->parent = parent;
826 res->which = DATA1N_data;
827 res->u.data.what = DATA1I_text;
829 res->u.data.formatted_text = formatted_text;
831 if (elen > DATA1_LOCALDATA)
832 res->u.data.data = nmem_malloc (spec->m, elen);
834 res->u.data.data = res->lbuf;
835 memcpy (res->u.data.data, ebuf, elen);
837 res->u.data.data = 0;
839 res->root = parent->root;
841 parent->last_child = res;
842 if (spec->d1_stack[spec->d1_level])
843 spec->d1_stack[spec->d1_level]->next = res;
846 spec->d1_stack[spec->d1_level] = res;
/* Grow the per-level accumulation buffer, keeping the first org_len bytes. */
848 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
850 char *old_buf, *new_buf;
852 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
853 new_buf = xmalloc (spec->concatBuf[spec->d1_level]->max);
854 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
856 memcpy (new_buf, old_buf, org_len);
859 spec->concatBuf[spec->d1_level]->buf = new_buf;
861 assert (spec->concatBuf[spec->d1_level]);
862 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
863 res->u.data.len += elen;
/*
 * execDataP: entry point used by lexNode for unmatched input text. In the
 * lines visible here it forwards all arguments unchanged to execData.
 */
866 static void execDataP (struct lexSpec *spec,
867 const char *ebuf, int elen, int formatted_text)
869 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalise the text data node at the current level, if any.
 *
 * When the node on top of the stack is a text data node, its data pointer
 * (asserted to be still unset, with a positive length) is given storage:
 * the node-local lbuf, or nmem memory when the text is longer than
 * DATA1_LOCALDATA. The accumulated text is then copied there from the
 * level's concat buffer.
 */
872 static void tagDataRelease (struct lexSpec *spec)
876 if ((res = spec->d1_stack[spec->d1_level]) &&
877 res->which == DATA1N_data &&
878 res->u.data.what == DATA1I_text)
880 assert (!res->u.data.data);
881 assert (res->u.data.len > 0);
882 if (res->u.data.len > DATA1_LOCALDATA)
883 res->u.data.data = nmem_malloc (spec->m, res->u.data.len);
885 res->u.data.data = res->lbuf;
886 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node for the (class, type, value) triple.
 *
 * The class and type strings are copied into local buffers, truncated to
 * DATA1_MAX_SYMBOL-1 characters, and resolved to a variant type through
 * data1_getvartypebyct using the root's abstract syntax. If the parent is
 * not itself a variant node, an empty variant node (type and value 0) is
 * created first and pushed. The enclosing variant nodes are then scanned for
 * one with the same type (the action taken on a hit is not shown). Finally a
 * variant node carrying the type and a copy of the value (truncated to
 * DATA1_LOCALDATA-1 characters, stored in lbuf) is created, linked under the
 * node on top of the stack and pushed as a new level.
 *
 * NOTE(review): `parent` is initialised from d1_stack[d1_level-1] before the
 * d1_level == 0 test below, so index -1 is read when no record is open.
 * NOTE(review): the d1_level increments are not visibly checked against the
 * maxLevel size of d1_stack.
 */
891 static void variantBegin (struct lexSpec *spec,
892 const char *class_str, int class_len,
893 const char *type_str, int type_len,
894 const char *value_str, int value_len)
896 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
897 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
902 if (spec->d1_level == 0)
904 logf (LOG_WARN, "in variant begin. No record type defined");
/* Bounded copies of the class and type names. */
907 if (class_len >= DATA1_MAX_SYMBOL)
908 class_len = DATA1_MAX_SYMBOL-1;
909 memcpy (tclass, class_str, class_len);
910 tclass[class_len] = '\0';
912 if (type_len >= DATA1_MAX_SYMBOL)
913 type_len = DATA1_MAX_SYMBOL-1;
914 memcpy (ttype, type_str, type_len);
915 ttype[type_len] = '\0';
918 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
923 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* First variant below a non-variant parent: insert an empty variant node. */
927 if (parent->which != DATA1N_variant)
929 res = data1_mk_node (spec->dh, spec->m);
930 res->parent = parent;
931 res->which = DATA1N_variant;
932 res->u.variant.type = 0;
933 res->u.variant.value = 0;
934 res->root = parent->root;
936 parent->last_child = res;
937 if (spec->d1_stack[spec->d1_level])
939 tagDataRelease (spec);
940 spec->d1_stack[spec->d1_level]->next = res;
944 spec->d1_stack[spec->d1_level] = res;
945 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Look through the enclosing variant nodes for one of the same type. */
947 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
948 if (spec->d1_stack[i]->u.variant.type == tp)
955 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
957 parent = spec->d1_stack[spec->d1_level-1];
958 res = data1_mk_node (spec->dh, spec->m);
959 res->parent = parent;
960 res->which = DATA1N_variant;
961 res->root = parent->root;
962 res->u.variant.type = tp;
/* The value string lives in the node's local buffer. */
964 if (value_len >= DATA1_LOCALDATA)
965 value_len =DATA1_LOCALDATA-1;
966 memcpy (res->lbuf, value_str, value_len);
967 res->lbuf[value_len] = '\0';
969 res->u.variant.value = res->lbuf;
971 parent->last_child = res;
972 if (spec->d1_stack[spec->d1_level])
974 tagDataRelease (spec);
975 spec->d1_stack[spec->d1_level]->next = res;
979 spec->d1_stack[spec->d1_level] = res;
980 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space from both ends of the tag given by
 * (*tag, *len). The first loop scans backwards over trailing white space,
 * the second forwards over leading white space; the updates of *tag and *len
 * are on lines not shown here.
 *
 * NOTE(review): isspace is called with a plain char value; for bytes outside
 * the unsigned char range this is undefined, so a cast to unsigned char is
 * advisable.
 */
983 static void tagStrip (const char **tag, int *len)
987 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
990 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element (tag node) named by the len bytes at `tag`.
 *
 * The name is stripped of surrounding white space and copied into the new
 * node (lbuf when it fits, nmem memory otherwise). The element definition is
 * looked up with data1_getelementbytagname using the root's abstract syntax;
 * when the parent is a variant node the parent tag's element is consulted
 * first. The node is linked in as sibling / child and pushed as a new level.
 *
 * NOTE(review): `parent` (and partag, derived from it) is initialised from
 * d1_stack[d1_level-1] before the d1_level == 0 test, so index -1 is read
 * when no record is open.
 * NOTE(review): the d1_level increment is not visibly checked against the
 * maxLevel size of d1_stack.
 */
996 static void tagBegin (struct lexSpec *spec,
997 const char *tag, int len)
999 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1000 data1_element *elem = NULL;
1001 data1_node *partag = get_parent_tag(spec->dh, parent);
1003 data1_element *e = NULL;
1006 if (spec->d1_level == 0)
1008 logf (LOG_WARN, "in element begin. No record type defined");
1011 tagStrip (&tag, &len);
1013 res = data1_mk_node (spec->dh, spec->m);
1014 res->parent = parent;
1015 res->which = DATA1N_tag;
1016 res->u.tag.get_bytes = -1;
/* Long tag names get nmem storage; short ones use the node-local buffer. */
1018 if (len >= DATA1_LOCALDATA)
1019 res->u.tag.tag = nmem_malloc (spec->m, len+1);
1021 res->u.tag.tag = res->lbuf;
1023 memcpy (res->u.tag.tag, tag, len);
1024 res->u.tag.tag[len] = '\0';
1027 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1029 if (parent->which == DATA1N_variant)
1032 if (!(e = partag->u.tag.element))
1035 elem = data1_getelementbytagname (spec->dh,
1036 spec->d1_stack[0]->u.root.absyn,
1038 res->u.tag.element = elem;
1039 res->u.tag.node_selected = 0;
1040 res->u.tag.make_variantlist = 0;
1041 res->u.tag.no_data_requested = 0;
1042 res->root = parent->root;
1044 parent->last_child = res;
1045 if (spec->d1_stack[spec->d1_level])
1047 tagDataRelease (spec);
1048 spec->d1_stack[spec->d1_level]->next = res;
1051 parent->child = res;
1052 spec->d1_stack[spec->d1_level] = res;
1053 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close open elements down towards min_level.
 *
 * While the current level is above min_level the pending text data is
 * released and a level is examined; the loop looks for a tag node whose name
 * has the same length and bytes as `tag` (after stripping white space). The
 * level decrement and the exact stop conditions (including the role of a
 * NULL tag, as passed by some callers) are on lines not shown here.
 */
1056 static void tagEnd (struct lexSpec *spec, int min_level,
1057 const char *tag, int len)
1059 tagStrip (&tag, &len);
1060 while (spec->d1_level > min_level)
1062 tagDataRelease (spec);
1064 if (spec->d1_level == 0)
1066 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1068 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1070 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1074 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run `dfa` over the input starting at *pptr and look for a match.
 *
 * Characters are fetched with f_win_advance and the DFA is followed through
 * its transition ranges (t->ch[0] .. t->ch[1]); the rule number of the most
 * recent accepting state is remembered in last_rule. When a match is
 * reported, *mptr receives the position where it starts and *pptr the
 * position just after its end. The return values and the handling of
 * end-of-input (ptr == F_WIN_EOF) are on lines not shown here; callers treat
 * a zero return as "no match".
 */
1079 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1082 struct DFA_state *state = dfa->states[0];
1085 unsigned char c_prev = 0;
1086 int ptr = *pptr; /* current pointer */
1087 int start_ptr = *pptr; /* first char of match */
1088 int last_ptr = 0; /* last char of match */
1089 int last_rule = 0; /* rule number of current match */
1094 c = f_win_advance (spec, &ptr);
1095 if (ptr == F_WIN_EOF)
1112 *mptr = start_ptr; /* match starts here */
1113 *pptr = last_ptr; /* match end here (+1) */
/* The automaton is reset to its initial state here. */
1116 state = dfa->states[0];
1121 else if (c >= t->ch[0] && c <= t->ch[1])
1123 state = dfa->states[t->to];
1128 last_rule = state->rule_no;
1133 last_rule = state->rule_nno;
/*
 * execTok: read the next token of an action code string.
 *
 * *src is the read position; leading blanks and tabs are skipped. Visible
 * token forms:
 *  - "$" followed by digits: a reference to matched argument n, resolved to
 *    the input text through f_win_get (special handling exists for
 *    arg_no == 0 and for n >= arg_no);
 *  - a double-quoted string;
 *  - newline or ';' (command separators);
 *  - any other run of characters up to blank, tab, newline or ';'.
 * *tokBuf / *tokLen describe the token text. The numeric return codes are on
 * lines not shown; execCode compares the result with 3 when scanning
 * "-option" tokens.
 */
1145 static int execTok (struct lexSpec *spec, const char **src,
1146 const char **tokBuf, int *tokLen)
1148 const char *s = *src;
1150 while (*s == ' ' || *s == '\t')
1154 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1158 while (*s >= '0' && *s <= '9')
1159 n = n*10 + (*s++ -'0');
1160 if (spec->arg_no == 0)
1167 if (n >= spec->arg_no)
1169 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1173 else if (*s == '\"')
1176 while (*s && *s != '\"')
1178 *tokLen = s - *tokBuf;
1183 else if (*s == '\n' || *s == ';')
1191 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1193 *tokLen = s - *tokBuf;
1200 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1202 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy `len` bytes from `src` into the caller's buffer `str`.
 * Callers use the result as a C string (e.g. with strcmp), so presumably the
 * copy is NUL-terminated and `str` is returned; both happen on lines not
 * shown here.
 *
 * NOTE(review): no limit on `len` is visible; confirm it is bounded by the
 * size of the destination buffer.
 */
1208 static char *regxStrz (const char *src, int len, char *str)
1212 memcpy (str, src, len);
/*
 * cmd_tcl_begin: implementation of the Tcl command "begin".
 *
 * clientData is the lexSpec. Visible sub-commands:
 *   begin record <absyn>                  create a root node for the named
 *                                         abstract syntax and push it
 *   begin element <tag>                   tagBegin
 *   begin variant <class> <type> <value>  variantBegin
 *   begin context <name>                  push the named context on the
 *                                         context stack (warns when unknown)
 *
 * NOTE(review): argv[1] is compared before argc is tested in each branch;
 * confirm an argc check precedes on a line not shown.
 * NOTE(review): the root node keeps argv[2] itself as its type string;
 * confirm that string outlives the node.
 * NOTE(review): the context push is not visibly checked against
 * context_stack_size.
 */
1218 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1219 int argc, char **argv)
1221 struct lexSpec *spec = clientData;
1224 if (!strcmp(argv[1], "record") && argc == 3)
1226 char *absynName = argv[2];
1230 logf (LOG_DEBUG, "begin record %s", absynName);
1232 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1233 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1238 res = data1_mk_node (spec->dh, spec->m);
1239 res->which = DATA1N_root;
1240 res->u.root.type = absynName;
1241 res->u.root.absyn = absyn;
1244 spec->d1_stack[spec->d1_level] = res;
1245 spec->d1_stack[++(spec->d1_level)] = NULL;
1248 else if (!strcmp(argv[1], "element") && argc == 3)
1250 tagBegin (spec, argv[2], strlen(argv[2]));
1252 else if (!strcmp (argv[1], "variant") && argc == 5)
1254 variantBegin (spec, argv[2], strlen(argv[2]),
1255 argv[3], strlen(argv[3]),
1256 argv[4], strlen(argv[4]));
1258 else if (!strcmp (argv[1], "context") && argc == 3)
1260 struct lexContext *lc = spec->context;
1262 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* Linear search of the context list by name. */
1264 while (lc && strcmp (argv[2], lc->name))
1268 spec->context_stack[++(spec->context_stack_top)] = lc;
1271 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: implementation of the Tcl command "end".
 *
 * Visible sub-commands:
 *   end record              release pending data on every open level and
 *                           set stop_flag
 *   end element [-record]   close elements through tagEnd; stop_flag is
 *                           also set in this branch near the d1_level == 0
 *                           test (presumably tied to -record; the exact
 *                           condition is not shown)
 *   end context             pop the context stack unless already at the
 *                           bottom entry
 *
 * NOTE(review): argv[1] / argv[2] are used without a visible argc check.
 */
1278 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1279 int argc, char **argv)
1281 struct lexSpec *spec = clientData;
1285 if (!strcmp (argv[1], "record"))
1287 while (spec->d1_level)
1289 tagDataRelease (spec);
1293 logf (LOG_DEBUG, "end record");
1295 spec->stop_flag = 1;
1297 else if (!strcmp (argv[1], "element"))
1301 if (!strcmp(argv[2], "-record"))
1312 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1313 if (spec->d1_level == 0)
1316 logf (LOG_DEBUG, "end element end records");
1318 spec->stop_flag = 1;
1321 else if (!strcmp (argv[1], "context"))
1324 logf (LOG_DEBUG, "end context");
1326 if (spec->context_stack_top)
1327 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: implementation of the Tcl command "data".
 *
 * Options recognised in the visible lines are "-text" and "-element <name>".
 * When an element name was given the data is wrapped: tagBegin(element),
 * execData with the data argument, then tagEnd down to level 1. The
 * conditions guarding these calls are on lines not shown.
 *
 * NOTE(review): argv[argi] is used without a visible bounds check against
 * argc.
 */
1334 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1335 int argc, char **argv)
1339 const char *element = 0;
1340 struct lexSpec *spec = clientData;
1344 if (!strcmp("-text", argv[argi]))
1349 else if (!strcmp("-element", argv[argi]))
1353 element = argv[argi++];
1359 tagBegin (spec, element, strlen(element));
1363 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1367 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: implementation of the Tcl command "unread".
 *
 * Moves the scan position (spec->ptr) back to the start of matched argument
 * `no`, plus an optional "-offset <n>" displacement. An index at or above
 * arg_no is clamped to the last argument.
 *
 * NOTE(review): both numbers are parsed with atoi, so malformed input is not
 * detected, and a negative index is not visibly rejected before it is used
 * to index arg_start.
 */
1371 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1372 int argc, char **argv)
1374 struct lexSpec *spec = clientData;
1381 if (!strcmp("-offset", argv[argi]))
1386 offset = atoi(argv[argi]);
1395 no = atoi(argv[argi]);
1396 if (no >= spec->arg_no)
1397 no = spec->arg_no - 1;
1398 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run an action's code as a Tcl script.
 *
 * Before evaluation each matched argument i is exported as the Tcl variable
 * named by its decimal index ("0", "1", ...). The argument text is obtained
 * from the read window and NUL-terminated in place for the duration of the
 * Tcl_SetVar call; the overwritten byte is restored afterwards.
 *
 * NOTE(review): the temporary terminator is written at var_buf[var_len],
 * one byte past the range returned by f_win_get; confirm that byte is always
 * inside the window buffer.
 * NOTE(review): the result of Tcl_Eval is not visibly checked here.
 */
1402 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1405 for (i = 0; i < spec->arg_no; i++)
1407 char var_name[10], *var_buf;
1410 sprintf (var_name, "%d", i);
1411 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* Terminate in place, hand the string to Tcl, then undo the change. */
1415 ch = var_buf[var_len];
1416 var_buf[var_len] = '\0';
1417 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1418 var_buf[var_len] = ch;
1421 Tcl_Eval (spec->tcl_interp, code->str);
/*
 * execCode: interpret an action's code string with the built-in command
 * language (the non-Tcl path).
 *
 * Tokens are read with execTok and the first word of each command is
 * dispatched on. Commands visible in this excerpt:
 *   begin record <absyn> | element <tag> | variant <class> <type> <value>
 *         | context <name>
 *   end record | element [-record] [<tag>] | context
 *   data [-text] [-element <name>] <item> ...
 *   unread [-offset <n>] <index>
 *   context <name>   (replaces the entry on top of the context stack)
 * Unknown commands and stray tokens are reported with LOG_WARN.
 *
 * NOTE(review): the abstract syntax name is kept in a function-static
 * 64-byte buffer that the root node points at, so the name is shared
 * between records and the function is not reentrant; confirm that cmd_len
 * is bounded before the memcpy into it.
 * NOTE(review): the context pushes are not visibly checked against
 * context_stack_size.
 */
1426 static void execCode (struct lexSpec *spec, struct regxCode *code)
1428 const char *s = code->str;
1430 const char *cmd_str;
1432 r = execTok (spec, &s, &cmd_str, &cmd_len);
1439 r = execTok (spec, &s, &cmd_str, &cmd_len);
1442 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1443 if (!strcmp (p, "begin"))
1445 r = execTok (spec, &s, &cmd_str, &cmd_len);
1448 logf (LOG_WARN, "missing keyword after 'begin'");
1451 p = regxStrz (cmd_str, cmd_len, ptmp);
1452 if (!strcmp (p, "record"))
1454 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* A root node is only created when no record is open yet. */
1457 if (spec->d1_level == 0)
1459 static char absynName[64];
1464 memcpy (absynName, cmd_str, cmd_len);
1465 absynName[cmd_len] = '\0';
1468 logf (LOG_DEBUG, "begin record %s", absynName);
1470 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1471 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1476 res = data1_mk_node (spec->dh, spec->m);
1477 res->which = DATA1N_root;
1478 res->u.root.type = absynName;
1479 res->u.root.absyn = absyn;
1482 spec->d1_stack[spec->d1_level] = res;
1483 spec->d1_stack[++(spec->d1_level)] = NULL;
1486 r = execTok (spec, &s, &cmd_str, &cmd_len);
1488 else if (!strcmp (p, "element"))
1490 r = execTok (spec, &s, &cmd_str, &cmd_len);
1493 tagBegin (spec, cmd_str, cmd_len);
1494 r = execTok (spec, &s, &cmd_str, &cmd_len);
1496 else if (!strcmp (p, "variant"))
1499 const char *class_str = NULL;
1501 const char *type_str = NULL;
1503 const char *value_str = NULL;
1504 r = execTok (spec, &s, &cmd_str, &cmd_len);
1507 class_str = cmd_str;
1508 class_len = cmd_len;
1509 r = execTok (spec, &s, &cmd_str, &cmd_len);
1515 r = execTok (spec, &s, &cmd_str, &cmd_len);
1518 value_str = cmd_str;
1519 value_len = cmd_len;
1521 variantBegin (spec, class_str, class_len,
1522 type_str, type_len, value_str, value_len);
1525 r = execTok (spec, &s, &cmd_str, &cmd_len);
1527 else if (!strcmp (p, "context"))
1531 struct lexContext *lc = spec->context;
1532 r = execTok (spec, &s, &cmd_str, &cmd_len);
1533 p = regxStrz (cmd_str, cmd_len, ptmp);
1535 logf (LOG_DEBUG, "begin context %s", p);
1537 while (lc && strcmp (p, lc->name))
/* "begin context" pushes a new entry on the context stack. */
1540 spec->context_stack[++(spec->context_stack_top)] = lc;
1542 logf (LOG_WARN, "unknown context %s", p);
1545 r = execTok (spec, &s, &cmd_str, &cmd_len);
1549 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1552 else if (!strcmp (p, "end"))
1554 r = execTok (spec, &s, &cmd_str, &cmd_len);
1557 logf (LOG_WARN, "missing keyword after 'end'");
1560 p = regxStrz (cmd_str, cmd_len, ptmp);
1561 if (!strcmp (p, "record"))
1563 while (spec->d1_level)
1565 tagDataRelease (spec);
1568 r = execTok (spec, &s, &cmd_str, &cmd_len);
1570 logf (LOG_DEBUG, "end record");
1572 spec->stop_flag = 1;
1574 else if (!strcmp (p, "element"))
/* Option tokens are those for which execTok returns 3. */
1577 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1579 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1584 tagEnd (spec, min_level, cmd_str, cmd_len);
1585 r = execTok (spec, &s, &cmd_str, &cmd_len);
1588 tagEnd (spec, min_level, NULL, 0);
1589 if (spec->d1_level == 0)
1592 logf (LOG_DEBUG, "end element end records");
1594 spec->stop_flag = 1;
1598 else if (!strcmp (p, "context"))
1601 logf (LOG_DEBUG, "end context");
/* The bottom entry of the context stack is never popped. */
1603 if (spec->context_stack_top)
1604 (spec->context_stack_top)--;
1605 r = execTok (spec, &s, &cmd_str, &cmd_len);
1608 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1610 else if (!strcmp (p, "data"))
1614 const char *element_str = NULL;
1616 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1618 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1620 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1622 r = execTok (spec, &s, &element_str, &element_len);
1627 logf (LOG_WARN, "bad data option: %.*s",
1632 logf (LOG_WARN, "missing data item after data");
1636 tagBegin (spec, element_str, element_len);
1639 execData (spec, cmd_str, cmd_len,textFlag);
1640 r = execTok (spec, &s, &cmd_str, &cmd_len);
1643 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1645 else if (!strcmp (p, "unread"))
1648 r = execTok (spec, &s, &cmd_str, &cmd_len);
1649 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1651 r = execTok (spec, &s, &cmd_str, &cmd_len);
1654 logf (LOG_WARN, "missing number after -offset");
1657 p = regxStrz (cmd_str, cmd_len, ptmp);
1659 r = execTok (spec, &s, &cmd_str, &cmd_len);
1665 logf (LOG_WARN, "missing index after unread command");
/* The argument index must be a single decimal digit. */
1668 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1670 logf (LOG_WARN, "bad index after unread command");
1675 no = *cmd_str - '0';
1676 if (no >= spec->arg_no)
1677 no = spec->arg_no - 1;
1678 spec->ptr = spec->arg_start[no] + offset;
1680 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: switch context without pushing ---- */
1682 else if (!strcmp (p, "context"))
1686 struct lexContext *lc = spec->context;
1687 r = execTok (spec, &s, &cmd_str, &cmd_len);
1688 p = regxStrz (cmd_str, cmd_len, ptmp);
1690 while (lc && strcmp (p, lc->name))
1693 spec->context_stack[spec->context_stack_top] = lc;
1695 logf (LOG_WARN, "unknown context %s", p);
1698 r = execTok (spec, &s, &cmd_str, &cmd_len);
1702 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1703 r = execTok (spec, &s, &cmd_str, &cmd_len);
1708 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1710 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list `ap` for a rule whose match starts at
 * start_ptr; *pptr is the current input position and is advanced by pattern
 * actions.
 *
 * The local arg_start / arg_end arrays record the input range of each
 * argument (argument 0 starts at start_ptr) and are published through
 * spec->arg_start / spec->arg_end so that code actions can refer to them.
 *
 * Pattern actions call tryMatch:
 *   - with the BODY flag, the text before the match and the match itself
 *     become two consecutive arguments (F_WIN_EOF marks ranges that run to
 *     end of input when nothing matches);
 *   - without it, the match has to begin exactly at the current position.
 * Code actions run through execTcl when a Tcl interpreter is attached and
 * through execCode otherwise; processing stops once stop_flag is set.
 * Return values are on lines not shown; callers test the result for zero.
 */
1717 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1718 int start_ptr, int *pptr)
1727 arg_start[0] = start_ptr;
1729 spec->arg_start = arg_start;
1730 spec->arg_end = arg_end;
1737 if (ap->u.pattern.body)
1739 arg_start[arg_no] = *pptr;
1740 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1742 arg_end[arg_no] = F_WIN_EOF;
1744 arg_start[arg_no] = F_WIN_EOF;
1745 arg_end[arg_no] = F_WIN_EOF;
1750 arg_end[arg_no] = sptr;
1752 arg_start[arg_no] = sptr;
1753 arg_end[arg_no] = *pptr;
1758 arg_start[arg_no] = *pptr;
1759 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1761 if (sptr != arg_start[arg_no])
1763 arg_end[arg_no] = *pptr;
/* Make the argument count visible to the code about to run. */
1768 spec->arg_no = arg_no;
1771 if (spec->tcl_interp)
1772 execTcl(spec, ap->u.code);
1774 execCode (spec, ap->u.code);
1776 execCode (spec, ap->u.code);
1779 if (spec->stop_flag)
1783 arg_start[arg_no] = *pptr;
1784 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: execute rule number ruleNo of `context` by running its action
 * list (looked up through the fastRule table) via execAction, and return
 * execAction's result.
 */
1793 static int execRule (struct lexSpec *spec, struct lexContext *context,
1794 int ruleNo, int start_ptr, int *pptr)
1797 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1799 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop of the filter.
 *
 * Starting at *ptr, the DFA of the context on top of the context stack is
 * run over the input. skip_ptr marks the start of text not yet consumed by
 * any rule, start_ptr the start of the current match attempt. When a rule
 * has matched, the unmatched text in front of it is first emitted as data
 * (execDataP) and the rule is then run through execRule. In the visible
 * lines a zero result from execRule is followed by the end-of-record
 * callback f_win_ef (when set and not at end of input). The context is
 * re-read from the stack after a rule has run because rules may change it,
 * and the automaton restarts from its initial state.
 *
 * The value returned is on lines not shown here.
 */
1803 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1805 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1806 struct DFA_state *state = context->dfa->states[0];
1809 unsigned char c_prev = '\n';
1811 int last_rule = 0; /* rule number of current match */
1812 int last_ptr = *ptr; /* last char of match */
1813 int start_ptr = *ptr; /* first char of match */
1814 int skip_ptr = *ptr; /* first char of run */
1818 c = f_win_advance (spec, ptr);
1819 if (*ptr == F_WIN_EOF)
1821 /* end of file met */
1824 /* there was a match */
1825 if (skip_ptr < start_ptr)
1827 /* deal with chars that didn't match */
1830 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1831 execDataP (spec, buf, size, 0);
1833 /* restore pointer */
1836 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1838 /* restore skip pointer */
1842 else if (skip_ptr < *ptr)
1844 /* deal with chars that didn't match */
1847 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1848 execDataP (spec, buf, size, 0);
1850 if (*ptr == F_WIN_EOF)
1857 { /* no transition for character c ... */
1860 if (skip_ptr < start_ptr)
1862 /* deal with chars that didn't match */
1865 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1866 execDataP (spec, buf, size, 0);
1868 /* restore pointer */
1870 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1872 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1875 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1877 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* A rule may have switched context; pick up the current one again. */
1881 context = spec->context_stack[spec->context_stack_top];
1884 last_ptr = start_ptr = *ptr;
1888 c_prev = f_win_advance (spec, &start_ptr);
1893 c_prev = f_win_advance (spec, &start_ptr);
1896 state = context->dfa->states[0];
1899 else if (c >= t->ch[0] && c <= t->ch[1])
1900 { /* transition ... */
1901 state = context->dfa->states[t->to];
1906 last_rule = state->rule_no;
1909 else if (state->rule_nno)
1911 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record starting at `offset` with the context called
 * context_name as the initial context, and return the root of the resulting
 * node tree (d1_stack[0]).
 *
 * The stop flag and the context stack are reset and the named context is
 * looked up (a warning is logged when it is missing). The context's init and
 * begin actions are executed, then the main scanner (lexNode), after which
 * pending data on every still-open level is released and the end actions
 * run. Conditions guarding the init / begin calls are on lines not shown.
 *
 * NOTE(review): lexNode's return value is not used here.
 */
1923 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1924 const char *context_name)
1926 struct lexContext *lt = spec->context;
1929 spec->stop_flag = 0;
1931 spec->context_stack_top = 0;
1934 if (!strcmp (lt->name, context_name))
1940 logf (LOG_WARN, "cannot find context %s", context_name);
1943 spec->context_stack[spec->context_stack_top] = lt;
1944 spec->d1_stack[spec->d1_level] = NULL;
1949 execAction (spec, lt->initActionList, ptr, &ptr);
1952 execAction (spec, lt->beginActionList, ptr, &ptr);
1953 lexNode (spec, &ptr);
1954 while (spec->d1_level)
1956 tagDataRelease (spec);
1959 execAction (spec, lt->endActionList, ptr, &ptr);
1960 return spec->d1_stack[0];
/*
 * grs_destroy: tear down the filter's client data (a struct lexSpecs): the
 * spec it holds is destroyed through lexSpecDestroy. Release of the
 * container itself is not visible in this excerpt.
 */
1963 void grs_destroy(void *clientData)
1965 struct lexSpecs *specs = clientData;
1968 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the filter's client data (a struct lexSpecs). The
 * initialisation of its members and the return statement are on lines not
 * shown here.
 */
1973 void *grs_init(void)
1975 struct lexSpecs *specs = xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the plain regx filter.
 *
 * The specification is cached in the client data: it is (re)loaded only
 * when no spec exists yet or the cached spec's name differs from p->type.
 * When loading fails the spec is destroyed again (the return taken on that
 * path is not shown). The input callbacks, file handle and memory handle
 * from `p` are then bound to the spec, the read window is reset (size
 * 500000 bytes) and parsing starts at p->offset in the "main" context.
 *
 * Returns the result of lexRoot.
 *
 * NOTE(review): the local curLexSpec shadows the file-scope variable of the
 * same name.
 */
1980 data1_node *grs_read_regx (struct grs_read_info *p)
1983 struct lexSpecs *specs = p->clientData;
1984 struct lexSpec **curLexSpec = &specs->spec;
1987 logf (LOG_DEBUG, "grs_read_regx");
1989 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1992 lexSpecDestroy (curLexSpec);
1993 *curLexSpec = lexSpecCreate (p->type, p->dh);
1994 res = readFileSpec (*curLexSpec);
1997 lexSpecDestroy (curLexSpec);
2001 (*curLexSpec)->dh = p->dh;
/* Start with an empty window and bind the caller's I/O callbacks. */
2004 (*curLexSpec)->f_win_start = 0;
2005 (*curLexSpec)->f_win_end = 0;
2006 (*curLexSpec)->f_win_rf = p->readf;
2007 (*curLexSpec)->f_win_sf = p->seekf;
2008 (*curLexSpec)->f_win_fh = p->fh;
2009 (*curLexSpec)->f_win_ef = p->endf;
2010 (*curLexSpec)->f_win_size = 500000;
2012 (*curLexSpec)->m = p->mem;
2013 return lexRoot (*curLexSpec, p->offset, "main");
/* Handler table describing the plain regx filter (its initialiser entries
 * are on lines not shown here). */
2016 static struct recTypeGrs regx_type = {
/* Exported handle for the regx filter: the address of the table above.
 * The address-of expression had been corrupted into a registered-trademark
 * sign (a mis-decoded "&reg" entity); it is restored to the same form as
 * the Tcl counterpart, recTypeGrs_tcl = &tcl_type. */
2023 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl-enabled flavour of the filter.
 *
 * Works like grs_read_regx, except that when the specification is (re)loaded
 * a Tcl interpreter is created and the commands "begin", "end", "data" and
 * "unread" are registered in it with the spec as client data. The spec file
 * is then read; on failure the spec is destroyed again (the return taken on
 * that path is not shown). Finally the input callbacks, file handle and
 * memory handle from `p` are bound, the read window is reset (size 500000
 * bytes) and parsing starts at p->offset in the "main" context.
 *
 * Returns the result of lexRoot.
 *
 * NOTE(review): the local curLexSpec shadows the file-scope variable of the
 * same name.
 */
2026 data1_node *grs_read_tcl (struct grs_read_info *p)
2029 struct lexSpecs *specs = p->clientData;
2030 struct lexSpec **curLexSpec = &specs->spec;
2033 logf (LOG_DEBUG, "grs_read_tcl");
2035 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2037 Tcl_Interp *tcl_interp;
2039 lexSpecDestroy (curLexSpec);
2040 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* The interpreter is owned by the spec and deleted in lexSpecDestroy. */
2041 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2042 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2043 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2044 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2045 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2047 res = readFileSpec (*curLexSpec);
2050 lexSpecDestroy (curLexSpec);
2054 (*curLexSpec)->dh = p->dh;
/* Start with an empty window and bind the caller's I/O callbacks. */
2057 (*curLexSpec)->f_win_start = 0;
2058 (*curLexSpec)->f_win_end = 0;
2059 (*curLexSpec)->f_win_rf = p->readf;
2060 (*curLexSpec)->f_win_sf = p->seekf;
2061 (*curLexSpec)->f_win_fh = p->fh;
2062 (*curLexSpec)->f_win_ef = p->endf;
2063 (*curLexSpec)->f_win_size = 500000;
2065 (*curLexSpec)->m = p->mem;
2066 return lexRoot (*curLexSpec, p->offset, "main");
/* Handler table describing the Tcl-enabled filter (its initialiser entries
 * are on lines not shown here). */
2069 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl filter: the address of the table above. */
2076 RecTypeGrs recTypeGrs_tcl = &tcl_type;