2 * Copyright (C) 1994-2001, Index Data
6 * Revision 1.36 2001-05-22 21:02:26 adam
7 * Fixes for Tcl UTF8 character handling.
9 * Revision 1.35 2001/03/29 21:31:31 adam
10 * Fixed "record begin" for Tcl filter.
12 * Revision 1.34 2000/11/29 14:24:01 adam
13 * Script configure uses yaz pthreads options. Added locking for
14 * zebra_register_{lock,unlock}.
16 * Revision 1.33 1999/11/30 13:48:04 adam
17 * Improved installation. Updated for inclusion of YAZ header files.
19 * Revision 1.32 1999/09/07 07:19:21 adam
20 * Work on character mapping. Implemented replace rules.
22 * Revision 1.31 1999/07/14 13:05:29 adam
23 * Tcl filter works with objects when TCL is version 8 or later; filter
24 * works with strings otherwise (slow).
26 * Revision 1.30 1999/07/14 10:55:28 adam
29 * Revision 1.29 1999/07/12 07:27:54 adam
30 * Improved speed of Tcl processing. Fixed one memory leak.
32 * Revision 1.28 1999/07/06 12:26:04 adam
33 * Fixed filters so that MS-DOS CR is ignored.
35 * Revision 1.27 1999/06/28 13:25:40 quinn
36 * Improved diagnostics for Tcl
38 * Revision 1.26 1999/05/26 07:49:14 adam
41 * Revision 1.25 1999/05/25 12:33:32 adam
42 * Fixed bug in Tcl filter.
44 * Revision 1.24 1999/05/21 11:08:46 adam
45 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
46 * script so that it reads uninstalled Tcl source.
48 * Revision 1.23 1999/05/20 12:57:18 adam
49 * Implemented TCL filter. Updated recctrl system.
51 * Revision 1.22 1998/11/03 16:07:13 adam
54 * Revision 1.21 1998/11/03 15:43:39 adam
55 * Fixed bug introduced by previous commit.
57 * Revision 1.20 1998/11/03 14:51:28 adam
58 * Changed code so that it creates as few data1 nodes as possible.
60 * Revision 1.19 1998/11/03 10:22:39 adam
61 * Fixed memory leak that could occur when large data1 nodes were
62 * concatenated. Data-type data1_nodes may have multiple nodes.
64 * Revision 1.18 1998/10/15 13:11:47 adam
65 * Added support for option -record for "end element". When specified
66 * end element will mark end-of-record when at outer-level.
68 * Revision 1.17 1998/07/01 10:13:51 adam
71 * Revision 1.16 1998/06/30 15:15:09 adam
72 * Tags are trimmed: white space removed before- and after the tag.
74 * Revision 1.15 1998/06/30 12:55:45 adam
77 * Revision 1.14 1998/03/05 08:41:00 adam
78 * Implemented rule contexts.
80 * Revision 1.13 1997/12/12 06:33:58 adam
81 * Fixed bug that showed up when multiple filters were used.
82 * Made one routine thread-safe.
84 * Revision 1.12 1997/11/18 10:03:24 adam
85 * Member num_children removed from data1_node.
87 * Revision 1.11 1997/11/06 11:41:01 adam
88 * Implemented "begin variant" for the sgml.regx filter.
90 * Revision 1.10 1997/10/31 12:36:12 adam
91 * Minor change that avoids compiler warning.
93 * Revision 1.9 1997/09/29 09:02:49 adam
94 * Fixed small bug (introduced by previous commit).
96 * Revision 1.8 1997/09/17 12:19:22 adam
97 * Zebra version corresponds to YAZ version 1.4.
98 * Changed Zebra server so that it doesn't depend on global common_resource.
100 * Revision 1.7 1997/07/15 16:33:07 adam
101 * Check for zero length in execData.
103 * Revision 1.6 1997/02/24 10:41:51 adam
104 * Cleanup of code and commented out the "end element-end-record" code.
106 * Revision 1.5 1997/02/19 16:22:33 adam
107 * Fixed "end element" to terminate record in outer-most level.
109 * Revision 1.4 1997/02/12 20:42:58 adam
110 * Changed some log messages.
112 * Revision 1.3 1996/11/08 14:05:33 adam
113 * Bug fix: data1 node member u.tag.get_bytes weren't initialized.
115 * Revision 1.2 1996/10/29 14:02:09 adam
116 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
117 * data1_get_tabpath is used.
119 * Revision 1.1 1996/10/11 10:57:30 adam
120 * New module recctrl. Used to manage records (extract/retrieval).
122 * Revision 1.24 1996/06/17 14:25:31 adam
123 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
125 * Revision 1.23 1996/06/04 10:19:00 adam
126 * Minor changes - removed include of ctype.h.
128 * Revision 1.22 1996/06/03 15:23:13 adam
129 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
131 * Revision 1.21 1996/05/14 16:58:38 adam
134 * Revision 1.20 1996/05/01 13:46:36 adam
135 * First work on multiple records in one file.
136 * New option, -offset, to the "unread" command in the filter module.
138 * Revision 1.19 1996/02/12 16:18:20 adam
139 * Yet another bug fix in implementation of unread command.
141 * Revision 1.18 1996/02/12 16:07:54 adam
142 * Bug fix in new unread command.
144 * Revision 1.17 1996/02/12 15:56:11 adam
145 * New code command: unread.
147 * Revision 1.16 1996/01/17 14:57:51 adam
148 * Prototype changed for reader functions in extract/retrieve. File
149 * is identified by 'void *' instead of 'int'.
151 * Revision 1.15 1996/01/08 19:15:47 adam
152 * New input filter that works!
154 * Revision 1.14 1996/01/08 09:10:38 adam
155 * Yet another complete rework on this module.
157 * Revision 1.13 1995/12/15 17:21:50 adam
158 * This version is able to set data.formatted_text in data1-nodes.
160 * Revision 1.12 1995/12/15 16:20:10 adam
161 * The filter files (*.flt) are read from the path given by data1_tabpath.
163 * Revision 1.11 1995/12/15 12:35:16 adam
166 * Revision 1.10 1995/12/15 10:35:36 adam
169 * Revision 1.9 1995/12/14 16:38:48 adam
170 * Completely new attempt to make regular expression parsing.
172 * Revision 1.8 1995/12/13 17:16:59 adam
175 * Revision 1.7 1995/12/13 16:51:58 adam
176 * Modified to set last_child in data1_nodes.
177 * Uses destroy handler to free up data text nodes.
179 * Revision 1.6 1995/12/13 13:45:37 quinn
180 * Changed data1 to use nmem.
182 * Revision 1.5 1995/12/11 09:12:52 adam
183 * The rec_get function returns NULL if record doesn't exist - will
184 * happen in the server if the result set records have been deleted since
185 * the creation of the set (i.e. the search).
186 * The server saves a result temporarily if it is 'volatile', i.e. the
187 * set is register dependent.
189 * Revision 1.4 1995/12/05 16:57:40 adam
190 * More work on regular patterns.
192 * Revision 1.3 1995/12/05 09:37:09 adam
193 * One malloc was renamed to xmalloc.
195 * Revision 1.2 1995/12/04 17:59:24 adam
196 * More work on regular expression conversion.
198 * Revision 1.1 1995/12/04 14:25:30 adam
199 * Started work on regular expression parsed input to structured records.
207 #include <yaz/tpath.h>
208 #include <zebrautl.h>
/* Selects the Tcl object based code paths (regxCode.tcl_obj and
 * Tcl_GlobalEvalObj) when the version macro reports 8 or later.
 * NOTE(review): the macro is defined without a value, yet execTcl tests it
 * with "#if HAVE_TCL_OBJECTS", which needs an expression; confirm that this
 * compiles when the branch is taken (defining it as 1 would be safe).
 * NOTE(review): cmd_tcl_data tests TCL_MAJOR_VERSION; confirm that
 * MAJOR_VERSION is really the macro intended here. */
215 #if MAJOR_VERSION >= 8
216 #define HAVE_TCL_OBJECTS
/* Sentinel file position: f_win_advance, tryMatch and lexNode compare
 * positions against it, and execAction stores it in arg_start/arg_end. */
222 #define F_WIN_EOF 2000000000
/* Token / action kinds; readOneSpec compares readParseToken's result
 * against REGX_CONTEXT. */
226 #define REGX_PATTERN 1
231 #define REGX_CONTEXT 6
/* Data structures of the regx filter: rule actions, rules, lexical contexts,
 * per-level concatenation buffers and the filter specification itself.
 * NOTE(review): closing braces and several members are not visible in this
 * view, so which struct each member belongs to is inferred from later use
 * (ra->u.pattern.dfa, rp->info.actionList, lc->fastRule, spec->context,
 * spec->concatBuf[i].buf) and should be confirmed against the full file. */
241 struct lexRuleAction {
/* compiled pattern; accessed as u.pattern.dfa by the code below */
245 struct DFA *dfa; /* REGX_PATTERN */
/* code fragment; accessed as u.code by the code below */
248 struct regxCode *code; /* REGX_CODE */
250 struct lexRuleAction *next;
255 struct lexRuleAction *actionList;
259 struct lexRuleInfo info;
260 struct lexRule *next;
/* linked list of rules; readFileSpec indexes them by info.no in fastRule */
266 struct lexRule *rules;
267 struct lexRuleInfo **fastRule;
271 struct lexRuleAction *beginActionList;
272 struct lexRuleAction *endActionList;
273 struct lexRuleAction *initActionList;
274 struct lexContext *next;
277 struct lexConcatBuf {
/* head of the list of contexts (readOneSpec prepends new ones) */
284 struct lexContext *context;
/* stack of active contexts; lexSpecCreate sizes it to context_stack_size */
286 struct lexContext **context_stack;
287 int context_stack_size;
288 int context_stack_top;
294 Tcl_Interp *tcl_interp;
297 void (*f_win_ef)(void *, off_t);
/* NOTE(review): window offsets are int while f_win_get takes off_t
 * positions; presumably inputs are assumed to fit in int - confirm. */
299 int f_win_start; /* first byte of buffer is this file offset */
300 int f_win_end; /* last byte of buffer is this offset - 1 */
301 int f_win_size; /* size of buffer */
302 char *f_win_buf; /* buffer itself */
/* read callback: called as (fh, buffer, size); result is used as a byte count */
303 int (*f_win_rf)(void *, char *, size_t);
/* seek callback: called as (fh, position) */
304 off_t (*f_win_sf)(void *, off_t);
/* one buffer per nesting level, indexed by d1_level in execData */
306 struct lexConcatBuf *concatBuf;
/* per-level stack of data1 nodes, indexed by d1_level */
308 data1_node **d1_stack;
319 struct lexSpec *spec;
/* Returns a pointer to the input bytes starting at start_pos and stores in
 * *size how many bytes (at most end_pos - start_pos) are available there.
 * Data is served from the window buffer spec->f_win_buf, which holds file
 * offsets f_win_start .. f_win_end - 1; the window is refilled through the
 * f_win_sf (seek) and f_win_rf (read) callbacks when needed.
 * NOTE(review): some lines of this function are not visible in this view. */
322 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
/* off is the offset of start_pos inside the current window (may be negative) */
325 int i, r, off = start_pos - spec->f_win_start;
/* case 1: the whole requested range is already inside the window */
327 if (off >= 0 && end_pos <= spec->f_win_end)
329 *size = end_pos - start_pos;
330 return spec->f_win_buf + off;
/* case 2: the range starts outside the window: seek and refill from start_pos */
332 if (off < 0 || start_pos >= spec->f_win_end)
334 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
335 spec->f_win_start = start_pos;
/* the buffer is allocated lazily on first use */
337 if (!spec->f_win_buf)
338 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
339 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
341 spec->f_win_end = spec->f_win_start + *size;
/* never report more than the caller asked for */
343 if (*size > end_pos - start_pos)
344 *size = end_pos - start_pos;
345 return spec->f_win_buf;
/* case 3: the range starts inside the window but runs past its end: move
 * the tail of the window to the front, then read more data behind it */
347 for (i = 0; i<spec->f_win_end - start_pos; i++)
348 spec->f_win_buf[i] = spec->f_win_buf[i + off];
349 r = (*spec->f_win_rf)(spec->f_win_fh,
351 spec->f_win_size - i);
352 spec->f_win_start = start_pos;
353 spec->f_win_end += r;
355 if (*size > end_pos - start_pos)
356 *size = end_pos - start_pos;
357 return spec->f_win_buf;
/* Reads the byte at position *pos and advances *pos.
 * NOTE(review): the end-of-input handling is on lines not visible here. */
360 static int f_win_advance (struct lexSpec *spec, int *pos)
/* fast path: the position lies inside the current window */
365 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
366 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
367 if (*pos == F_WIN_EOF)
/* slow path: ask f_win_get for the single byte at *pos */
369 buf = f_win_get (spec, *pos, *pos+1, &size);
/* Releases the regxCode object that *pp points to.
 * NOTE(review): only the Tcl reference release is visible in this view. */
379 static void regxCodeDel (struct regxCode **pp)
381 struct regxCode *p = *pp;
/* drops the reference taken in regxCodeMk */
386 Tcl_DecrRefCount (p->tcl_obj);
/* Creates a regxCode object holding a copy of the len bytes at buf. */
394 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
398 p = (struct regxCode *) xmalloc (sizeof(*p));
/* len+1 bytes are allocated, leaving room for one byte after the copy */
399 p->str = (char *) xmalloc (len+1);
400 memcpy (p->str, buf, len);
/* the same text is also kept as a Tcl string object; the reference taken
 * here is released by regxCodeDel */
403 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
405 Tcl_IncrRefCount (p->tcl_obj);
/* Sets up a DFA whose pattern parser character map is adjusted for filter
 * specs: the entries for space and tab are deleted and '/' is added with
 * value 0.
 * NOTE(review): the exact meaning of these map entries is defined by the dfa
 * module; presumably '/' then terminates a pattern - confirm. */
410 static struct DFA *lexSpecDFA (void)
415 dfa_parse_cmap_del (dfa, ' ');
416 dfa_parse_cmap_del (dfa, '\t');
417 dfa_parse_cmap_add (dfa, '/', 0);
/* Frees the action list starting at *rap. */
421 static void actionListDel (struct lexRuleAction **rap)
/* ra1 is the loop's "next" pointer, so it must be set before ra is released */
423 struct lexRuleAction *ra1, *ra;
425 for (ra = *rap; ra; ra = ra1)
/* pattern actions own a DFA */
431 dfa_delete (&ra->u.pattern.dfa);
/* code actions own a regxCode */
434 regxCodeDel (&ra->u.code);
/* Allocates a lexical context called name: the name is duplicated, a fresh
 * DFA is obtained from lexSpecDFA and the begin/end/init action lists start
 * out empty. */
442 static struct lexContext *lexContextCreate (const char *name)
444 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
446 p->name = xstrdup (name);
449 p->dfa = lexSpecDFA ();
452 p->beginActionList = NULL;
453 p->endActionList = NULL;
454 p->initActionList = NULL;
/* Releases what a lexical context owns: its DFA, the action list of every
 * rule and the begin/end/init action lists.
 * NOTE(review): freeing of the rule structs and of p itself is not visible
 * in this view. */
459 static void lexContextDestroy (struct lexContext *p)
/* rp1 is the loop's "next" pointer */
461 struct lexRule *rp, *rp1;
463 dfa_delete (&p->dfa);
465 for (rp = p->rules; rp; rp = rp1)
468 actionListDel (&rp->info.actionList);
471 actionListDel (&p->beginActionList);
472 actionListDel (&p->endActionList);
473 actionListDel (&p->initActionList);
/* Allocates a filter specification called name.
 * NOTE(review): the initialisation of maxLevel and of the other members is
 * on lines not visible here. */
478 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
483 p = (struct lexSpec *) xmalloc (sizeof(*p));
/* private copy of the name */
484 p->name = (char *) xmalloc (strlen(name)+1);
485 strcpy (p->name, name);
/* room for 100 nested contexts.
 * NOTE(review): the "begin context" handlers push onto this stack without a
 * visible bounds check - confirm. */
492 p->context_stack_size = 100;
493 p->context_stack = (struct lexContext **)
494 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
/* one concatenation buffer per nesting level, initially empty */
498 p->concatBuf = (struct lexConcatBuf *)
499 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
500 for (i = 0; i < p->maxLevel; i++)
502 p->concatBuf[i].max = 0;
503 p->concatBuf[i].buf = 0;
/* node stack with one slot per nesting level */
505 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/* Releases a filter specification: the per-level concatenation buffers,
 * every lexical context, the Tcl interpreter, the window buffer and the
 * context stack.
 * NOTE(review): freeing of the remaining members and the reset of *pp are
 * not visible in this view. */
510 static void lexSpecDestroy (struct lexSpec **pp)
513 struct lexContext *lt;
521 for (i = 0; i < p->maxLevel; i++)
522 xfree (p->concatBuf[i].buf);
523 xfree (p->concatBuf);
/* the next pointer is saved before the context is destroyed */
528 struct lexContext *lt_next = lt->next;
529 lexContextDestroy (lt);
534 Tcl_DeleteInterp (p->tcl_interp);
537 xfree (p->f_win_buf);
538 xfree (p->context_stack);
/* Scans the next token of a filter spec line at *cpp and returns its kind;
 * readOneSpec compares the result against REGX_CONTEXT and REGX_CODE, and
 * actionListMk stops when it is 0.
 * NOTE(review): the handling of pattern and code tokens and all return
 * statements are on lines not visible here. */
544 static int readParseToken (const char **cpp, int *len)
546 const char *cp = *cpp;
/* skip leading white space, including line ends */
550 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
/* keywords are collected into cmd, folded to lower case */
579 if (*cp >= 'a' && *cp <= 'z')
581 else if (*cp >= 'A' && *cp <= 'Z')
582 cmd[i] = *cp + 'a' - 'A';
/* guards the write index so that cmd cannot overflow */
585 if (i < (int) sizeof(cmd)-2)
592 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* after a bad character, skip to the next white space */
594 while (*cp && *cp != ' ' && *cp != '\t' &&
595 *cp != '\n' && *cp != '\r')
/* map the collected keyword to its token kind */
601 if (!strcmp (cmd, "begin"))
603 else if (!strcmp (cmd, "end"))
605 else if (!strcmp (cmd, "body"))
607 else if (!strcmp (cmd, "context"))
609 else if (!strcmp (cmd, "init"))
613 logf (LOG_WARN, "bad command %s", cmd);
/* Parses the action part s of a rule and appends one lexRuleAction per token
 * to the list at ap: code tokens become regxCode objects, pattern tokens are
 * compiled into a DFA, and BEGIN / INIT are rejected with a warning.
 * NOTE(review): the token switch, list linking and return statements are on
 * lines not visible here. */
619 static int actionListMk (struct lexSpec *spec, const char *s,
620 struct lexRuleAction **ap)
/* loop until readParseToken reports 0 */
626 while ((tok = readParseToken (&s, &len)))
/* code action */
634 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
636 regxCodeMk (&(*ap)->u.code, s, len);
/* pattern action; bodyMark records whether a BODY keyword preceded it
 * (presumably - the assignment of bodyMark is not visible) */
640 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
642 (*ap)->u.pattern.body = bodyMark;
644 (*ap)->u.pattern.dfa = lexSpecDFA ();
646 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): the precision argument of %.*s must be an int; if s and s0
 * are both pointers, s-s0 is a ptrdiff_t - confirm and cast if so. */
651 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
654 dfa_mkstate ((*ap)->u.pattern.dfa);
658 logf (LOG_WARN, "cannot use BEGIN here");
661 logf (LOG_WARN, "cannot use INIT here");
664 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* Processes one logical line s of a filter spec: a CONTEXT declaration, a
 * BEGIN / END / INIT action list for the current context, or a pattern rule
 * followed by its actions.
 * NOTE(review): the token dispatch and the return statements are on lines
 * not visible here. */
674 int readOneSpec (struct lexSpec *spec, const char *s)
678 struct lexContext *lc;
680 tok = readParseToken (&s, &len);
681 if (tok == REGX_CONTEXT)
683 char context_name[32];
/* the context name is expected as the next (code) token */
684 tok = readParseToken (&s, &len);
685 if (tok != REGX_CODE)
687 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): requires len < 32; a clamp may exist on the lines not
 * visible between the warning and this copy - confirm. */
692 memcpy (context_name, s, len);
693 context_name[len] = '\0';
/* the new context is put at the head of the context list */
694 lc = lexContextCreate (context_name);
695 lc->next = spec->context;
/* a context called "main" is created here (presumably when none exists) */
700 spec->context = lexContextCreate ("main");
/* BEGIN / END / INIT: replace the corresponding action list.
 * NOTE(review): the results of these actionListMk calls are not used. */
705 actionListDel (&spec->context->beginActionList);
706 actionListMk (spec, s, &spec->context->beginActionList);
709 actionListDel (&spec->context->endActionList);
710 actionListMk (spec, s, &spec->context->endActionList);
713 actionListDel (&spec->context->initActionList);
714 actionListMk (spec, s, &spec->context->initActionList);
718 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
/* pattern rule: add the pattern to the context DFA */
720 r = dfa_parse (spec->context->dfa, &s);
723 logf (LOG_WARN, "regular expression error. r=%d", r);
728 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* the new rule takes the next rule number and goes to the head of the list */
732 rp = (struct lexRule *) xmalloc (sizeof(*rp));
733 rp->info.no = spec->context->ruleNo++;
734 rp->next = spec->context->rules;
735 spec->context->rules = rp;
736 actionListMk (spec, s, &rp->info.actionList);
/* Loads the filter specification file for spec->name from the data1 table
 * path, feeds each logical line to readOneSpec and finally builds, for every
 * context, the fastRule lookup table and the DFA states.
 * NOTE(review): the character reading loop and the return statements are on
 * lines not visible here. */
741 int readFileSpec (struct lexSpec *spec)
743 struct lexContext *lc;
744 int c, i, errors = 0;
/* with a Tcl interpreter the file "<name>.tflt" is tried.
 * NOTE(review): sprintf into fname is unbounded; the declaration of fname is
 * not visible, so confirm it is large enough for any spec->name. */
750 if (spec->tcl_interp)
752 sprintf (fname, "%s.tflt", spec->name);
753 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
/* "<name>.flt" is the other candidate file name */
758 sprintf (fname, "%s.flt", spec->name);
759 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
763 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
766 logf (LOG_LOG, "reading regx filter %s", fname);
768 if (spec->tcl_interp)
769 logf (LOG_LOG, "Tcl enabled");
/* lineBuf accumulates one logical line at a time */
771 lineBuf = wrbuf_alloc();
776 wrbuf_rewind (lineBuf);
/* lines starting with '#', white space or a line end are skipped up to
 * the end of the line */
777 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
779 while (c != '\n' && c != EOF)
792 wrbuf_putc(lineBuf, c);
/* presumably a following line that starts with space or tab continues the
 * current logical line - the surrounding lines are not visible */
800 if (c != ' ' && c != '\t')
/* terminate the collected text and hand it to the parser.
 * NOTE(review): the result of readOneSpec is not used here. */
805 wrbuf_putc(lineBuf, '\0');
806 readOneSpec (spec, wrbuf_buf(lineBuf));
807 spec->lineNo += addLine;
811 wrbuf_free(lineBuf, 1);
816 debug_dfa_followpos = 1;
/* per context: table that maps a rule number to its lexRuleInfo */
819 for (lc = spec->context; lc; lc = lc->next)
822 lc->fastRule = (struct lexRuleInfo **)
823 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
824 for (i = 0; i < lc->ruleNo; i++)
825 lc->fastRule[i] = NULL;
826 for (rp = lc->rules; rp; rp = rp->next)
827 lc->fastRule[rp->info.no] = &rp->info;
828 dfa_mkstate (lc->dfa);
/* File-scope pointer to a filter specification, initially NULL.
 * NOTE(review): no use of it is visible in this part of the file. */
837 static struct lexSpec *curLexSpec = NULL;
/* Adds elen bytes of text at ebuf to the record being built. The text is
 * collected in the concatenation buffer of the current level
 * (spec->concatBuf[spec->d1_level]); a data node is created when the current
 * node at this level is not already a data node. tagDataRelease later copies
 * the collected text into the node.
 * NOTE(review): several lines (early returns, else branches, preprocessor
 * conditionals) are not visible in this view. */
840 static void execData (struct lexSpec *spec,
841 const char *ebuf, int elen, int formatted_text)
843 struct data1_node *res, *parent;
846 if (elen == 0) /* shouldn't happen, but it does! */
/* diagnostic output variants (presumably selected at compile time) */
850 logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen,
851 ebuf, 15, ebuf + elen-15);
853 logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf);
855 logf (LOG_LOG, "data (%d bytes)", elen);
/* levels 0 and 1 are treated specially (the action is not visible here) */
858 if (spec->d1_level <= 1)
861 parent = spec->d1_stack[spec->d1_level -1];
/* if the current node at this level is already a data node, append to it;
 * org_len is the amount of text collected so far */
864 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
865 org_len = res->u.data.len;
/* otherwise make a new text data node below parent */
870 res = data1_mk_node (spec->dh, spec->m);
871 res->parent = parent;
872 res->which = DATA1N_data;
873 res->u.data.what = DATA1I_text;
875 res->u.data.formatted_text = formatted_text;
/* NOTE(review): the direct copy below and the "data = 0" assignment after
 * it look like alternatives chosen by a conditional that is not visible;
 * tagDataRelease asserts that data is still unset. */
877 if (elen > DATA1_LOCALDATA)
878 res->u.data.data = nmem_malloc (spec->m, elen);
880 res->u.data.data = res->lbuf;
881 memcpy (res->u.data.data, ebuf, elen);
883 res->u.data.data = 0;
885 res->root = parent->root;
/* link the node behind the previous sibling at this level, if there is one */
887 parent->last_child = res;
888 if (spec->d1_stack[spec->d1_level])
889 spec->d1_stack[spec->d1_level]->next = res;
892 spec->d1_stack[spec->d1_level] = res;
/* grow the concatenation buffer when the new text does not fit; 256 bytes
 * of slack are added to reduce the number of reallocations */
894 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
896 char *old_buf, *new_buf;
898 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
899 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
900 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
902 memcpy (new_buf, old_buf, org_len);
905 spec->concatBuf[spec->d1_level].buf = new_buf;
/* append the new text and account for it in the node's length */
907 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
908 res->u.data.len += elen;
/* Forwards its arguments unchanged to execData; lexNode uses it for input
 * that no rule matched. */
911 static void execDataP (struct lexSpec *spec,
912 const char *ebuf, int elen, int formatted_text)
914 execData (spec, ebuf, elen, formatted_text);
/* Finalises the text data node at the current level, if there is one: the
 * text collected in the level's concatenation buffer is copied into the node. */
917 static void tagDataRelease (struct lexSpec *spec)
921 if ((res = spec->d1_stack[spec->d1_level]) &&
922 res->which == DATA1N_data &&
923 res->u.data.what == DATA1I_text)
/* the node must not have been finalised before and must hold some text */
925 assert (!res->u.data.data);
926 assert (res->u.data.len > 0);
/* short texts use the node's local buffer, longer ones come from nmem */
927 if (res->u.data.len > DATA1_LOCALDATA)
928 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
930 res->u.data.data = res->lbuf;
931 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/* Starts a variant in the record being built: looks up the variant type for
 * the given class and type names, makes sure a variant list node exists
 * below the parent, and pushes a variant node carrying value_str.
 * The three strings are given with explicit lengths and need not be NUL
 * terminated.
 * NOTE(review): several lines (returns, else branches, loop bodies) are not
 * visible in this view. */
936 static void variantBegin (struct lexSpec *spec,
937 const char *class_str, int class_len,
938 const char *type_str, int type_len,
939 const char *value_str, int value_len)
/* NOTE(review): this initialiser reads d1_stack[d1_level - 1] before the
 * "d1_level == 0" check below, so it indexes element -1 when no record has
 * been started - consider assigning parent after the check. */
941 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
942 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
947 if (spec->d1_level == 0)
949 logf (LOG_WARN, "in variant begin. No record type defined");
/* class and type names are truncated to fit the local buffers and are
 * NUL terminated */
952 if (class_len >= DATA1_MAX_SYMBOL)
953 class_len = DATA1_MAX_SYMBOL-1;
954 memcpy (tclass, class_str, class_len);
955 tclass[class_len] = '\0';
957 if (type_len >= DATA1_MAX_SYMBOL)
958 type_len = DATA1_MAX_SYMBOL-1;
959 memcpy (ttype, type_str, type_len);
960 ttype[type_len] = '\0';
963 logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype,
/* look the variant type up in the variant set of the record's abstract syntax */
968 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* the parent is not a variant node yet: insert a variant node without type
 * and value first and push it */
972 if (parent->which != DATA1N_variant)
974 res = data1_mk_node (spec->dh, spec->m);
975 res->parent = parent;
976 res->which = DATA1N_variant;
977 res->u.variant.type = 0;
978 res->u.variant.value = 0;
979 res->root = parent->root;
981 parent->last_child = res;
982 if (spec->d1_stack[spec->d1_level])
984 tagDataRelease (spec);
985 spec->d1_stack[spec->d1_level]->next = res;
/* push: the new node becomes current and the next level starts empty.
 * NOTE(review): no check against maxLevel is visible here. */
989 spec->d1_stack[spec->d1_level] = res;
990 spec->d1_stack[++(spec->d1_level)] = NULL;
/* walk up through the enclosing variant nodes looking for one that has
 * the same type */
992 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
993 if (spec->d1_stack[i]->u.variant.type == tp)
1000 logf (LOG_LOG, "variant node (%d)", spec->d1_level);
/* create the variant node that carries the type and the value */
1002 parent = spec->d1_stack[spec->d1_level-1];
1003 res = data1_mk_node (spec->dh, spec->m);
1004 res->parent = parent;
1005 res->which = DATA1N_variant;
1006 res->root = parent->root;
1007 res->u.variant.type = tp;
/* the value is truncated to fit the node's local buffer */
1009 if (value_len >= DATA1_LOCALDATA)
1010 value_len =DATA1_LOCALDATA-1;
1011 memcpy (res->lbuf, value_str, value_len);
1012 res->lbuf[value_len] = '\0';
1014 res->u.variant.value = res->lbuf;
1016 parent->last_child = res;
1017 if (spec->d1_stack[spec->d1_level])
1019 tagDataRelease (spec);
1020 spec->d1_stack[spec->d1_level]->next = res;
1023 parent->child = res;
1024 spec->d1_stack[spec->d1_level] = res;
1025 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Trims white space from both ends of the tag given by *tag and *len.
 * NOTE(review): the statements that store the trimmed values are on lines
 * not visible here.
 * NOTE(review): isspace is called with a plain char; values outside the
 * unsigned char range are undefined behaviour, so a cast to unsigned char is
 * advisable. */
1028 static void tagStrip (const char **tag, int *len)
/* scan backwards over trailing white space */
1032 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
/* scan forwards over leading white space */
1035 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* Starts an element called tag (len bytes, not necessarily NUL terminated)
 * below the current parent node and pushes it onto the node stack.
 * NOTE(review): several lines (returns, else branches, parts of the element
 * lookup) are not visible in this view. */
1041 static void tagBegin (struct lexSpec *spec,
1042 const char *tag, int len)
1044 struct data1_node *parent;
1045 data1_element *elem = NULL;
1048 data1_element *e = NULL;
/* an element needs a record (root node) to live in */
1051 if (spec->d1_level == 0)
1053 logf (LOG_WARN, "in element begin. No record type defined");
1056 tagStrip (&tag, &len);
1058 parent = spec->d1_stack[spec->d1_level -1];
1059 partag = get_parent_tag(spec->dh, parent);
1061 res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag);
1062 res->parent = parent;
/* short names live in the node's local buffer, longer ones come from nmem */
1064 if (len >= DATA1_LOCALDATA)
1065 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1067 res->u.tag.tag = res->lbuf;
1069 memcpy (res->u.tag.tag, tag, len);
1070 res->u.tag.tag[len] = '\0';
1073 logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1075 if (parent->which == DATA1N_variant)
1078 if (!(e = partag->u.tag.element))
/* find the element definition in the abstract syntax of the root node */
1081 elem = data1_getelementbytagname (spec->dh,
1082 spec->d1_stack[0]->u.root.absyn,
1084 res->u.tag.element = elem;
1085 res->root = parent->root;
/* link the node behind the previous sibling at this level, finalising
 * pending text first */
1087 parent->last_child = res;
1088 if (spec->d1_stack[spec->d1_level])
1090 tagDataRelease (spec);
1091 spec->d1_stack[spec->d1_level]->next = res;
1094 parent->child = res;
/* push: the new node becomes current and the next level starts empty.
 * NOTE(review): no check against maxLevel is visible here. */
1095 spec->d1_stack[spec->d1_level] = res;
1096 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Ends elements: unwinds the node stack while the level is above min_level.
 * A tag name (tag, len) is compared with the name of the element being
 * closed; callers in this file also pass NULL and 0.
 * NOTE(review): the level decrement and the loop exits are on lines not
 * visible here. */
1099 static void tagEnd (struct lexSpec *spec, int min_level,
1100 const char *tag, int len)
1102 tagStrip (&tag, &len);
1103 while (spec->d1_level > min_level)
/* finalise pending text at this level before leaving it */
1105 tagDataRelease (spec);
1107 if (spec->d1_level == 0)
/* test whether the node at this level is a tag node whose name has the
 * same length and bytes as the requested tag */
1109 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1111 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1113 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1117 logf (LOG_LOG, "end tag (%d)", spec->d1_level);
/* Runs the DFA dfa over the input from position *pptr. On a match, *mptr
 * receives the position where the match starts and *pptr the position just
 * after its end; execAction treats a zero result as "no match".
 * NOTE(review): the loop structure, the transition scan and the return
 * statements are on lines not visible here. */
1122 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
/* start in DFA state 0 */
1125 struct DFA_state *state = dfa->states[0];
1128 unsigned char c_prev = 0;
1129 int ptr = *pptr; /* current pointer */
1130 int start_ptr = *pptr; /* first char of match */
1131 int last_ptr = 0; /* last char of match */
1132 int last_rule = 0; /* rule number of current match */
1137 c = f_win_advance (spec, &ptr);
/* f_win_advance reports end of input through the position */
1138 if (ptr == F_WIN_EOF)
1155 *mptr = start_ptr; /* match starts here */
1156 *pptr = last_ptr; /* match end here (+1) */
/* restart from state 0 */
1159 state = dfa->states[0];
/* a transition applies when c lies in its character range ch[0]..ch[1] */
1164 else if (c >= t->ch[0] && c <= t->ch[1])
1166 state = dfa->states[t->to];
/* remember the rule of the state just entered; rule_no and rule_nno are
 * two alternatives (the selecting condition is not visible here) */
1171 last_rule = state->rule_no;
1176 last_rule = state->rule_nno;
/* Scans the next token of an action code string at *src and reports it via
 * *tokBuf and *tokLen. Callers in this file treat a result of 3 as an option
 * token such as -text.
 * NOTE(review): the return statements and the update of *src are on lines
 * not visible here. */
1188 static int execTok (struct lexSpec *spec, const char **src,
1189 const char **tokBuf, int *tokLen)
1191 const char *s = *src;
/* skip blanks and tabs */
1193 while (*s == ' ' || *s == '\t')
/* "$<number>": refers to a match argument; its text is fetched from the
 * input window */
1197 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1201 while (*s >= '0' && *s <= '9')
1202 n = n*10 + (*s++ -'0');
1203 if (spec->arg_no == 0)
/* indexes at or beyond arg_no are handled separately (not visible here) */
1210 if (n >= spec->arg_no)
1212 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
/* double-quoted string: runs up to the closing quote or the end of the text */
1216 else if (*s == '\"')
1219 while (*s && *s != '\"')
1221 *tokLen = s - *tokBuf;
/* a line end or ';' is a token of its own (presumably a separator) */
1226 else if (*s == '\n' || *s == ';')
/* remaining token forms run up to the next white space */
1234 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1237 *tokLen = s - *tokBuf;
1244 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1247 *tokLen = s - *tokBuf;
/* Copies len bytes from src into the caller supplied buffer str; callers
 * pass the result to strcmp.
 * NOTE(review): any length clamp, the NUL termination and the return
 * statement are on lines not visible here. */
1253 static char *regxStrz (const char *src, int len, char *str)
1257 memcpy (str, src, len);
/* Tcl command callback for "begin": handles the sub-commands
 * "record <absyn>", "element <tag>", "variant <class> <type> <value>" and
 * "context <name>". clientData is the lexSpec of the filter.
 * NOTE(review): argv[1] is used in every branch; a check that argc is at
 * least 2 may exist on lines not visible here - confirm. */
1263 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1264 int argc, char **argv)
1266 struct lexSpec *spec = (struct lexSpec *) clientData;
/* begin record: create the root node for the named abstract syntax */
1269 if (!strcmp(argv[1], "record") && argc == 3)
1271 char *absynName = argv[2];
1275 logf (LOG_LOG, "begin record %s", absynName);
1277 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1278 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1283 res = data1_mk_node (spec->dh, spec->m);
1284 res->which = DATA1N_root;
/* the name is handed to data1_insert_string (argv belongs to Tcl) */
1286 data1_insert_string(spec->dh, res, spec->m, absynName);
1287 res->u.root.absyn = absyn;
/* push the root node; the next level starts empty */
1290 spec->d1_stack[spec->d1_level] = res;
1291 spec->d1_stack[++(spec->d1_level)] = NULL;
1294 else if (!strcmp(argv[1], "element") && argc == 3)
1296 tagBegin (spec, argv[2], strlen(argv[2]));
1298 else if (!strcmp (argv[1], "variant") && argc == 5)
1300 variantBegin (spec, argv[2], strlen(argv[2]),
1301 argv[3], strlen(argv[3]),
1302 argv[4], strlen(argv[4]));
/* begin context: find the named context and push it on the context stack */
1304 else if (!strcmp (argv[1], "context") && argc == 3)
1306 struct lexContext *lc = spec->context;
1308 logf (LOG_LOG, "begin context %s",argv[2]);
1310 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check against context_stack_size is visible here. */
1314 spec->context_stack[++(spec->context_stack_top)] = lc;
1317 logf (LOG_WARN, "unknown context %s", argv[2]);
/* Tcl command callback for "end": handles the sub-commands "record",
 * "element" (optionally with -record) and "context". clientData is the
 * lexSpec of the filter.
 * NOTE(review): argument count checks and several statements are on lines
 * not visible here. */
1324 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1325 int argc, char **argv)
1327 struct lexSpec *spec = (struct lexSpec *) clientData;
/* end record: unwind all levels and ask the lexer to stop */
1331 if (!strcmp (argv[1], "record"))
1333 while (spec->d1_level)
1335 tagDataRelease (spec);
1339 logf (LOG_LOG, "end record");
1341 spec->stop_flag = 1;
1343 else if (!strcmp (argv[1], "element"))
1347 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* element may be NULL, in which case a length of 0 is passed */
1356 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
/* reaching level 0 ends the record as well */
1357 if (spec->d1_level == 0)
1360 logf (LOG_LOG, "end element end records");
1362 spec->stop_flag = 1;
/* end context: pop the context stack, but never below its bottom entry */
1365 else if (!strcmp (argv[1], "context"))
1368 logf (LOG_LOG, "end context");
1370 if (spec->context_stack_top)
1371 (spec->context_stack_top)--;
/* Tcl command callback for "data": accepts the options -text and
 * -element <tag> and adds the data argument to the record, wrapped in the
 * given element when -element was used. clientData is the lexSpec.
 * NOTE(review): the option loop, argument checks and conditions around the
 * tagBegin and tagEnd calls are on lines not visible here. */
1378 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1379 int argc, char **argv)
1383 const char *element = 0;
1384 struct lexSpec *spec = (struct lexSpec *) clientData;
1388 if (!strcmp("-text", argv[argi]))
1393 else if (!strcmp("-element", argv[argi]))
1397 element = argv[argi++];
1403 tagBegin (spec, element, strlen(element));
/* newer Tcl versions pass strings as UTF-8; they are converted to the
 * external encoding before being stored */
1407 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1409 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1410 execData (spec, native, strlen(native), textFlag);
1411 Tcl_DStringFree (&ds);
/* alternative without conversion */
1413 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1418 tagEnd (spec, 1, NULL, 0);
/* Tcl command callback for "unread": moves the read position (spec->ptr)
 * back to the start of match argument <no>, plus an optional -offset.
 * clientData is the lexSpec.
 * NOTE(review): argument count checks are on lines not visible here. */
1422 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1423 int argc, char **argv)
1425 struct lexSpec *spec = (struct lexSpec *) clientData;
1432 if (!strcmp("-offset", argv[argi]))
/* NOTE(review): atoi gives no error indication for malformed numbers. */
1437 offset = atoi(argv[argi]);
1446 no = atoi(argv[argi]);
/* indexes that are too large are clamped to the last argument.
 * NOTE(review): no lower bound check is visible; a negative no, or
 * arg_no == 0, would index arg_start below 0 - confirm. */
1447 if (no >= spec->arg_no)
1448 no = spec->arg_no - 1;
1449 spec->ptr = spec->arg_start[no] + offset;
/* Runs the Tcl code of an action: each match argument i is first published
 * as the Tcl variable named by its decimal index, then the code is evaluated
 * at global level; failures are logged.
 * NOTE(review): some lines are not visible in this view. */
1453 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1457 for (i = 0; i < spec->arg_no; i++)
1459 char var_name[10], *var_buf;
1462 sprintf (var_name, "%d", i);
1463 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* the text is NUL terminated in place for Tcl_SetVar and the overwritten
 * byte is restored afterwards.
 * NOTE(review): this writes one byte past the range that f_win_get
 * returned; confirm that this byte is always inside the window buffer. */
1467 ch = var_buf[var_len];
1468 var_buf[var_len] = '\0';
1469 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1470 var_buf[var_len] = ch;
/* evaluate the Tcl object when object support is enabled, otherwise the
 * plain string.
 * NOTE(review): HAVE_TCL_OBJECTS is defined without a value near the top of
 * the file, which makes this directive malformed when it is defined. */
1473 #if HAVE_TCL_OBJECTS
1474 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1476 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
/* error report: includes Tcl's errorInfo variable when it is set */
1480 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1481 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1482 spec->tcl_interp->errorLine,
1483 spec->tcl_interp->result,
1484 err ? err : "[NO ERRORINFO]");
/* Interprets the built-in (non Tcl) action language in code->str. Tokens
 * come from execTok; the recognised commands are
 *   begin record|element|variant|context ...
 *   end record|element|context ...
 *   data [-text] [-element <tag>] <text> ...
 *   unread [-offset <n>] <index>
 *   context <name>
 * Anything else is reported with a warning.
 * NOTE(review): the loop structure, the checks on r and the else branches
 * are on lines not visible in this view. */
1490 static void execCode (struct lexSpec *spec, struct regxCode *code)
1492 const char *s = code->str;
1494 const char *cmd_str;
1496 r = execTok (spec, &s, &cmd_str, &cmd_len);
1503 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* p is the command word as a C string for the strcmp dispatch below */
1506 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1507 if (!strcmp (p, "begin"))
1509 r = execTok (spec, &s, &cmd_str, &cmd_len);
1512 logf (LOG_WARN, "missing keyword after 'begin'");
1515 p = regxStrz (cmd_str, cmd_len, ptmp);
/* begin record <absyn>: only acted upon when no record is open */
1516 if (!strcmp (p, "record"))
1518 r = execTok (spec, &s, &cmd_str, &cmd_len);
1521 if (spec->d1_level == 0)
/* NOTE(review): the copy requires cmd_len < 64; a clamp may exist on
 * the lines not visible before it - confirm. */
1523 static char absynName[64];
1528 memcpy (absynName, cmd_str, cmd_len);
1529 absynName[cmd_len] = '\0';
1532 logf (LOG_LOG, "begin record %s", absynName);
1534 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1535 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1540 res = data1_mk_node (spec->dh, spec->m);
1541 res->which = DATA1N_root;
/* NOTE(review): the root's type points at the static buffer above, which
 * the next "begin record" overwrites; cmd_tcl_begin uses
 * data1_insert_string instead. */
1542 res->u.root.type = absynName;
1543 res->u.root.absyn = absyn;
/* push the root node; the next level starts empty */
1546 spec->d1_stack[spec->d1_level] = res;
1547 spec->d1_stack[++(spec->d1_level)] = NULL;
1550 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* begin element <tag> */
1552 else if (!strcmp (p, "element"))
1554 r = execTok (spec, &s, &cmd_str, &cmd_len);
1557 tagBegin (spec, cmd_str, cmd_len);
1558 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* begin variant <class> <type> <value> */
1560 else if (!strcmp (p, "variant"))
1563 const char *class_str = NULL;
1565 const char *type_str = NULL;
1567 const char *value_str = NULL;
1568 r = execTok (spec, &s, &cmd_str, &cmd_len);
1571 class_str = cmd_str;
1572 class_len = cmd_len;
1573 r = execTok (spec, &s, &cmd_str, &cmd_len);
1579 r = execTok (spec, &s, &cmd_str, &cmd_len);
1582 value_str = cmd_str;
1583 value_len = cmd_len;
1585 variantBegin (spec, class_str, class_len,
1586 type_str, type_len, value_str, value_len);
1589 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* begin context <name>: find the context and push it.
 * NOTE(review): no check against context_stack_size is visible here. */
1591 else if (!strcmp (p, "context"))
1595 struct lexContext *lc = spec->context;
1596 r = execTok (spec, &s, &cmd_str, &cmd_len);
1597 p = regxStrz (cmd_str, cmd_len, ptmp);
1599 logf (LOG_LOG, "begin context %s", p);
1601 while (lc && strcmp (p, lc->name))
1604 spec->context_stack[++(spec->context_stack_top)] = lc;
1606 logf (LOG_WARN, "unknown context %s", p);
1609 r = execTok (spec, &s, &cmd_str, &cmd_len);
1613 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1616 else if (!strcmp (p, "end"))
1618 r = execTok (spec, &s, &cmd_str, &cmd_len);
1621 logf (LOG_WARN, "missing keyword after 'end'");
1624 p = regxStrz (cmd_str, cmd_len, ptmp);
/* end record: unwind all levels and ask the lexer to stop */
1625 if (!strcmp (p, "record"))
1627 while (spec->d1_level)
1629 tagDataRelease (spec);
1632 r = execTok (spec, &s, &cmd_str, &cmd_len);
1634 logf (LOG_LOG, "end record");
1636 spec->stop_flag = 1;
/* end element [-record] [<tag>]: option tokens (r == 3) come first */
1638 else if (!strcmp (p, "element"))
1641 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1643 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
/* with an explicit tag name */
1648 tagEnd (spec, min_level, cmd_str, cmd_len);
1649 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* without a tag name */
1652 tagEnd (spec, min_level, NULL, 0);
/* reaching level 0 ends the record as well */
1653 if (spec->d1_level == 0)
1656 logf (LOG_LOG, "end element end records");
1658 spec->stop_flag = 1;
/* end context: pop the context stack, but never below its bottom entry */
1662 else if (!strcmp (p, "context"))
1665 logf (LOG_LOG, "end context");
1667 if (spec->context_stack_top)
1668 (spec->context_stack_top)--;
1669 r = execTok (spec, &s, &cmd_str, &cmd_len);
1672 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data [-text] [-element <tag>] <text> ... ---- */
1674 else if (!strcmp (p, "data"))
1678 const char *element_str = NULL;
1680 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1682 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1684 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1686 r = execTok (spec, &s, &element_str, &element_len);
1691 logf (LOG_WARN, "bad data option: %.*s",
1696 logf (LOG_WARN, "missing data item after data");
/* with -element the data is wrapped in that element */
1700 tagBegin (spec, element_str, element_len);
1703 execData (spec, cmd_str, cmd_len,textFlag);
1704 r = execTok (spec, &s, &cmd_str, &cmd_len);
1707 tagEnd (spec, 1, NULL, 0);
/* ---- unread [-offset <n>] <index> ---- */
1709 else if (!strcmp (p, "unread"))
1712 r = execTok (spec, &s, &cmd_str, &cmd_len);
1713 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1715 r = execTok (spec, &s, &cmd_str, &cmd_len);
1718 logf (LOG_WARN, "missing number after -offset");
1721 p = regxStrz (cmd_str, cmd_len, ptmp);
1723 r = execTok (spec, &s, &cmd_str, &cmd_len);
1729 logf (LOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1732 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1734 logf (LOG_WARN, "bad index after unread command");
/* indexes that are too large are clamped to the last argument; the read
 * position then moves to that argument's start plus the offset */
1739 no = *cmd_str - '0';
1740 if (no >= spec->arg_no)
1741 no = spec->arg_no - 1;
1742 spec->ptr = spec->arg_start[no] + offset;
1744 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: replaces the top of the context stack (no push) ---- */
1746 else if (!strcmp (p, "context"))
1750 struct lexContext *lc = spec->context;
1751 r = execTok (spec, &s, &cmd_str, &cmd_len);
1752 p = regxStrz (cmd_str, cmd_len, ptmp);
1754 while (lc && strcmp (p, lc->name))
1757 spec->context_stack[spec->context_stack_top] = lc;
1759 logf (LOG_WARN, "unknown context %s", p);
1762 r = execTok (spec, &s, &cmd_str, &cmd_len);
1766 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1767 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* left-over tokens of a command are reported and skipped */
1772 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1774 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* Executes the action list ap of a rule whose match starts at start_ptr;
 * *pptr is the current input position and is advanced by further pattern
 * actions. Pattern actions extend the argument arrays (arg_start / arg_end),
 * code actions are run through execTcl or execCode; lexNode treats a zero
 * result as "stop".
 * NOTE(review): the loop over the actions, the switch on the action kind and
 * the return statements are on lines not visible here. */
1781 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1782 int start_ptr, int *pptr)
/* argument 0 starts where the rule's own match starts */
1791 arg_start[0] = start_ptr;
/* publish the local argument arrays through the spec */
1793 spec->arg_start = arg_start;
1794 spec->arg_end = arg_end;
/* "body" pattern: the text before the match and the match itself are
 * recorded as arguments (as far as the visible assignments show) */
1801 if (ap->u.pattern.body)
1803 arg_start[arg_no] = *pptr;
1804 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* no match: the ranges are marked with the end-of-input sentinel */
1806 arg_end[arg_no] = F_WIN_EOF;
1808 arg_start[arg_no] = F_WIN_EOF;
1809 arg_end[arg_no] = F_WIN_EOF;
/* match: one range ends at the match start, the next covers the match */
1814 arg_end[arg_no] = sptr;
1816 arg_start[arg_no] = sptr;
1817 arg_end[arg_no] = *pptr;
/* plain pattern: presumably has to match exactly at the current position
 * (sptr is compared with the recorded start) */
1822 arg_start[arg_no] = *pptr;
1823 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1825 if (sptr != arg_start[arg_no])
1827 arg_end[arg_no] = *pptr;
/* code action: the number of arguments collected so far becomes visible */
1832 spec->arg_no = arg_no;
/* Tcl code when an interpreter exists, the built-in language otherwise */
1835 if (spec->tcl_interp)
1836 execTcl(spec, ap->u.code);
1838 execCode (spec, ap->u.code);
1840 execCode (spec, ap->u.code);
/* the code may have asked the lexer to stop (end record) */
1843 if (spec->stop_flag)
1847 arg_start[arg_no] = *pptr;
1848 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: execute rule number ruleNo of the given context.  Logs the rule
 * number, then delegates to execAction with the action list found through
 * context->fastRule[ruleNo]; execAction's result is returned as is.
 */
1857 static int execRule (struct lexSpec *spec, struct lexContext *context,
1858 int ruleNo, int start_ptr, int *pptr)
1861 logf (LOG_LOG, "exec rule %d", ruleNo);
1863 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan the input starting at *ptr with the DFA of the context that
 * is on top of spec->context_stack, beginning in DFA state 0.
 * Characters are fetched with f_win_advance.  Input that lies between
 * skip_ptr and the start of a match is passed on as plain data through
 * execDataP; a recorded match (last_rule) is executed through execRule.
 * After a rule has been executed the context is read again from the context
 * stack and the DFA is reset to its start state.
 * NOTE(review): no return statement is visible among these statements, so
 * the returned data1_node is not described here -- confirm.
 */
1867 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1869 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1870 struct DFA_state *state = context->dfa->states[0];
/* previous character; starts out as newline, presumably so that the start of
   the input is treated like the start of a line -- confirm */
1873 unsigned char c_prev = '\n';
1875 int last_rule = 0; /* rule number of current match */
1876 int last_ptr = *ptr; /* last char of match */
1877 int start_ptr = *ptr; /* first char of match */
1878 int skip_ptr = *ptr; /* first char of run */
1882 c = f_win_advance (spec, ptr);
1883 if (*ptr == F_WIN_EOF)
1885 /* end of file met */
1888 /* there was a match */
1889 if (skip_ptr < start_ptr)
1891 /* deal with chars that didn't match */
1894 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1895 execDataP (spec, buf, size, 0);
1897 /* restore pointer */
1900 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1902 /* restore skip pointer */
1906 else if (skip_ptr < *ptr)
1908 /* deal with chars that didn't match */
1911 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1912 execDataP (spec, buf, size, 0);
1914 if (*ptr == F_WIN_EOF)
1921 { /* no transition for character c ... */
1924 if (skip_ptr < start_ptr)
1926 /* deal with chars that didn't match */
1929 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1930 execDataP (spec, buf, size, 0);
1932 /* restore pointer */
1934 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* the optional end-of-record callback is only invoked when one is installed
   and the position is not F_WIN_EOF */
1936 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1939 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1941 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the active context: the "context" code command stores into
   spec->context_stack, so the executed rule may have changed it */
1945 context = spec->context_stack[spec->context_stack_top];
1948 last_ptr = start_ptr = *ptr;
1952 c_prev = f_win_advance (spec, &start_ptr);
1957 c_prev = f_win_advance (spec, &start_ptr);
/* restart matching from the DFA start state of the (possibly new) context */
1960 state = context->dfa->states[0];
1963 else if (c >= t->ch[0] && c <= t->ch[1])
1964 { /* transition ... */
1965 state = context->dfa->states[t->to];
/* remember the rule attached to the new state; NOTE(review): the condition
   choosing between rule_no and rule_nno is not visible here -- confirm */
1970 last_rule = state->rule_no;
1973 else if (state->rule_nno)
1975 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver for lexing one record.
 * Clears spec->stop_flag, resets the context stack, looks up the context
 * whose name equals context_name (a warning is logged with
 * "cannot find context" when the lookup fails) and installs it at the
 * current top of the context stack.  It then runs the context's
 * initActionList and beginActionList, lexes the input with lexNode, releases
 * open tag data while spec->d1_level is non-zero, runs endActionList and
 * returns spec->d1_stack[0].
 * NOTE(review): any conditions guarding the individual execAction calls, and
 * how offset is turned into the start position ptr, are not visible among
 * these statements -- confirm.
 */
1987 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1988 const char *context_name)
1990 struct lexContext *lt = spec->context;
1993 spec->stop_flag = 0;
1995 spec->context_stack_top = 0;
1998 if (!strcmp (lt->name, context_name))
2004 logf (LOG_WARN, "cannot find context %s", context_name);
2007 spec->context_stack[spec->context_stack_top] = lt;
2008 spec->d1_stack[spec->d1_level] = NULL;
2013 execAction (spec, lt->initActionList, ptr, &ptr);
2016 execAction (spec, lt->beginActionList, ptr, &ptr);
2017 lexNode (spec, &ptr);
/* unwind whatever element levels are still open after lexing */
2018 while (spec->d1_level)
2020 tagDataRelease (spec);
2023 execAction (spec, lt->endActionList, ptr, &ptr);
2024 return spec->d1_stack[0];
/*
 * grs_destroy: tear down filter state.  clientData is treated as a
 * struct lexSpecs and the lexSpec it holds is destroyed with lexSpecDestroy.
 */
2027 void grs_destroy(void *clientData)
2029 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2032 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the per-filter state, a struct lexSpecs obtained from
 * xmalloc.  NOTE(review): presumably this pointer is what later arrives as
 * clientData in grs_destroy and the grs_read_* functions -- confirm.
 */
2037 void *grs_init(void)
2039 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter.
 * The lexSpec kept in p->clientData is reused when its name equals p->type;
 * otherwise it is destroyed, a new one is created with lexSpecCreate and
 * filled by readFileSpec.  The I/O callbacks, file handle and memory handle
 * from p are then installed on the spec and lexing starts through lexRoot at
 * p->offset in the context named "main".
 */
2044 data1_node *grs_read_regx (struct grs_read_info *p)
2047 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2048 struct lexSpec **curLexSpec = &specs->spec;
2051 logf (LOG_LOG, "grs_read_regx");
/* (re)build the spec when none is cached or the cached one has another name */
2053 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2056 lexSpecDestroy (curLexSpec);
2057 *curLexSpec = lexSpecCreate (p->type, p->dh);
2058 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec reported failure -- confirm */
2061 lexSpecDestroy (curLexSpec);
2065 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and hook up the caller's I/O callbacks */
2068 (*curLexSpec)->f_win_start = 0;
2069 (*curLexSpec)->f_win_end = 0;
2070 (*curLexSpec)->f_win_rf = p->readf;
2071 (*curLexSpec)->f_win_sf = p->seekf;
2072 (*curLexSpec)->f_win_fh = p->fh;
2073 (*curLexSpec)->f_win_ef = p->endf;
/* hard-coded window size; presumably a byte count -- confirm */
2074 (*curLexSpec)->f_win_size = 500000;
2076 (*curLexSpec)->m = p->mem;
2077 return lexRoot (*curLexSpec, p->offset, "main");
/* Descriptor of type struct recTypeGrs for the regx filter; its address is
   exported through recTypeGrs_regx.  NOTE(review): presumably wires up the
   regx callbacks defined in this file -- confirm against the initializer. */
2080 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type: points at the file-local
   regx_type descriptor (mirrors recTypeGrs_tcl / tcl_type). */
2087 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl variant of the filter.
 * Works like grs_read_regx, except that a freshly created lexSpec also gets
 * its own Tcl interpreter (stored in the spec's tcl_interp field) with the
 * commands "begin", "end", "data" and "unread" registered on it before the
 * spec file is read.  Lexing then starts through lexRoot at p->offset in the
 * context named "main".
 */
2090 data1_node *grs_read_tcl (struct grs_read_info *p)
2093 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2094 struct lexSpec **curLexSpec = &specs->spec;
2097 logf (LOG_LOG, "grs_read_tcl");
/* (re)build the spec when none is cached or the cached one has another name */
2099 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2101 Tcl_Interp *tcl_interp;
2103 lexSpecDestroy (curLexSpec);
2104 *curLexSpec = lexSpecCreate (p->type, p->dh);
2105 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* each command receives the lexSpec as its client data */
2106 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2107 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2108 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2109 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2111 res = readFileSpec (*curLexSpec);
/* presumably reached only when readFileSpec reported failure -- confirm */
2114 lexSpecDestroy (curLexSpec);
2118 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and hook up the caller's I/O callbacks */
2121 (*curLexSpec)->f_win_start = 0;
2122 (*curLexSpec)->f_win_end = 0;
2123 (*curLexSpec)->f_win_rf = p->readf;
2124 (*curLexSpec)->f_win_sf = p->seekf;
2125 (*curLexSpec)->f_win_fh = p->fh;
2126 (*curLexSpec)->f_win_ef = p->endf;
/* hard-coded window size; presumably a byte count -- confirm */
2127 (*curLexSpec)->f_win_size = 500000;
2129 (*curLexSpec)->m = p->mem;
2130 return lexRoot (*curLexSpec, p->offset, "main");
/* Descriptor of type struct recTypeGrs for the Tcl filter.
   NOTE(review): presumably wires up the Tcl callbacks defined in this file
   -- confirm against the initializer. */
2133 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type: address of tcl_type. */
2140 RecTypeGrs recTypeGrs_tcl = &tcl_type;