1 /* $Id: regxread.c,v 1.50.2.4 2006-10-30 14:14:20 adam Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
29 #include <yaz/tpath.h>
38 #if MAJOR_VERSION >= 8
39 #define HAVE_TCL_OBJECTS
45 #define F_WIN_EOF 2000000000
49 #define REGX_PATTERN 1
54 #define REGX_CONTEXT 6
64 struct lexRuleAction {
68 struct DFA *dfa; /* REGX_PATTERN */
71 struct regxCode *code; /* REGX_CODE */
73 struct lexRuleAction *next;
78 struct lexRuleAction *actionList;
82 struct lexRuleInfo info;
89 struct lexRule *rules;
90 struct lexRuleInfo **fastRule;
94 struct lexRuleAction *beginActionList;
95 struct lexRuleAction *endActionList;
96 struct lexRuleAction *initActionList;
97 struct lexContext *next;
100 struct lexConcatBuf {
107 struct lexContext *context;
109 struct lexContext **context_stack;
110 int context_stack_size;
111 int context_stack_top;
117 Tcl_Interp *tcl_interp;
120 void (*f_win_ef)(void *, off_t);
122 int f_win_start; /* first byte of buffer is this file offset */
123 int f_win_end; /* last byte of buffer is this offset - 1 */
124 int f_win_size; /* size of buffer */
125 char *f_win_buf; /* buffer itself */
126 int (*f_win_rf)(void *, char *, size_t);
127 off_t (*f_win_sf)(void *, off_t);
129 struct lexConcatBuf *concatBuf;
131 data1_node **d1_stack;
142 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the input bytes between file offsets
 * start_pos and end_pos, served from the window buffer held in spec
 * (f_win_buf, which covers offsets f_win_start up to f_win_end).
 * The byte count made available is stored through size; on the two refill
 * paths it is limited to end_pos - start_pos.
 */
145 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
148 int i, r, off = start_pos - spec->f_win_start;
/* Requested range already lies inside the window: no I/O needed. */
150 if (off >= 0 && end_pos <= spec->f_win_end)
152 *size = end_pos - start_pos;
153 return spec->f_win_buf + off;
/* Start lies before the window or at/after its end: seek and refill. */
155 if (off < 0 || start_pos >= spec->f_win_end)
157 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
158 spec->f_win_start = start_pos;
/* The buffer is allocated on first use with f_win_size bytes. */
160 if (!spec->f_win_buf)
161 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
162 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
164 spec->f_win_end = spec->f_win_start + *size;
166 if (*size > end_pos - start_pos)
167 *size = end_pos - start_pos;
168 return spec->f_win_buf;
/*
 * Remaining case: start is inside the window but the range runs past its
 * end.  Move the bytes from start_pos onwards to the front of the buffer,
 * then read further input (at most f_win_size - i bytes).
 */
170 for (i = 0; i<spec->f_win_end - start_pos; i++)
171 spec->f_win_buf[i] = spec->f_win_buf[i + off];
172 r = (*spec->f_win_rf)(spec->f_win_fh,
174 spec->f_win_size - i);
175 spec->f_win_start = start_pos;
176 spec->f_win_end += r;
178 if (*size > end_pos - start_pos)
179 *size = end_pos - start_pos;
180 return spec->f_win_buf;
/*
 * f_win_advance: fetch the input byte at file offset *pos.
 * When the offset is inside the current window the byte is taken directly
 * from f_win_buf and *pos is stepped forward by one.  Otherwise *pos is
 * compared with F_WIN_EOF and a one-byte range [*pos, *pos+1) is requested
 * through f_win_get.
 */
183 static int f_win_advance (struct lexSpec *spec, int *pos)
188 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
189 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
190 if (*pos == F_WIN_EOF)
192 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: works on the regxCode object that *pp refers to.  The code
 * visible here gives up one reference on its Tcl object (p->tcl_obj).
 */
202 static void regxCodeDel (struct regxCode **pp)
204 struct regxCode *p = *pp;
209 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: build a regxCode from the len bytes at buf.
 * A new struct is allocated, the bytes are copied into p->str (len+1 bytes
 * are reserved), and a Tcl string object is created from the same bytes
 * with its reference count incremented.
 */
217 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
221 p = (struct regxCode *) xmalloc (sizeof(*p));
222 p->str = (char *) xmalloc (len+1);
223 memcpy (p->str, buf, len);
226 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
228 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: set up a DFA for patterns written in the filter spec syntax.
 * The parser character map has its entries for space and tab deleted and
 * an entry for '/' added with value 0.
 * NOTE(review): presumably this makes blanks ordinary characters and lets
 * '/' end a pattern -- confirm against the dfa module.
 */
233 static struct DFA *lexSpecDFA (void)
238 dfa_parse_cmap_del (dfa, ' ');
239 dfa_parse_cmap_del (dfa, '\t');
240 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list that starts at *rap.
 * ra1 carries the successor of the node being handled.  dfa_delete is
 * applied to u.pattern.dfa and regxCodeDel to u.code.
 */
244 static void actionListDel (struct lexRuleAction **rap)
246 struct lexRuleAction *ra1, *ra;
248 for (ra = *rap; ra; ra = ra1)
254 dfa_delete (&ra->u.pattern.dfa);
257 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexContext called name.
 * The name is duplicated, the context gets its own DFA from lexSpecDFA,
 * and the begin/end/init action lists start out empty.
 */
265 static struct lexContext *lexContextCreate (const char *name)
267 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
269 p->name = xstrdup (name);
272 p->dfa = lexSpecDFA ();
275 p->beginActionList = NULL;
276 p->endActionList = NULL;
277 p->initActionList = NULL;
/*
 * lexContextDestroy: release what a lexContext holds.
 * The context DFA is deleted, the action list of every rule is deleted,
 * and then the begin, end and init action lists are deleted.
 */
282 static void lexContextDestroy (struct lexContext *p)
284 struct lexRule *rp, *rp1;
286 dfa_delete (&p->dfa);
/* rp1 carries the next rule while the current one is handled. */
288 for (rp = p->rules; rp; rp = rp1)
291 actionListDel (&rp->info.actionList);
294 actionListDel (&p->beginActionList);
295 actionListDel (&p->endActionList);
296 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a lexSpec for the filter called name.
 * The name is copied, a context stack with room for 100 entries is
 * allocated, and per-level arrays (concatBuf and d1_stack, maxLevel entries
 * each) are allocated.  Every concatBuf entry starts with max 0 and no
 * buffer.
 */
301 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
306 p = (struct lexSpec *) xmalloc (sizeof(*p));
307 p->name = (char *) xmalloc (strlen(name)+1);
308 strcpy (p->name, name);
315 p->context_stack_size = 100;
316 p->context_stack = (struct lexContext **)
317 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
321 p->concatBuf = (struct lexConcatBuf *)
322 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
323 for (i = 0; i < p->maxLevel; i++)
325 p->concatBuf[i].max = 0;
326 p->concatBuf[i].buf = 0;
328 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release the resources of the lexSpec behind *pp.
 * Frees each per-level concat buffer and the concatBuf array, destroys the
 * contexts one by one, deletes the Tcl interpreter, and frees the window
 * buffer and the context stack.
 */
333 static void lexSpecDestroy (struct lexSpec **pp)
336 struct lexContext *lt;
344 for (i = 0; i < p->maxLevel; i++)
345 xfree (p->concatBuf[i].buf);
346 xfree (p->concatBuf);
/* The successor is read before the context itself is destroyed. */
351 struct lexContext *lt_next = lt->next;
352 lexContextDestroy (lt);
357 Tcl_DeleteInterp (p->tcl_interp);
360 xfree (p->f_win_buf);
361 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a spec line starting at *cpp.
 * Leading blanks, tabs, newlines and carriage returns are skipped.
 * For command words the letters are collected in lower case in cmd and
 * then compared with "begin", "end", "body", "context" and "init";
 * any other word is reported as "bad command".
 */
367 static int readParseToken (const char **cpp, int *len)
369 const char *cp = *cpp;
373 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
402 if (*cp >= 'a' && *cp <= 'z')
/* Upper-case letters are folded to lower case. */
404 else if (*cp >= 'A' && *cp <= 'Z')
405 cmd[i] = *cp + 'a' - 'A';
/* Keeps the index within cmd. */
408 if (i < (int) sizeof(cmd)-2)
415 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* Skip ahead to the next white-space character (or end of string). */
417 while (*cp && *cp != ' ' && *cp != '\t' &&
418 *cp != '\n' && *cp != '\r')
424 if (!strcmp (cmd, "begin"))
426 else if (!strcmp (cmd, "end"))
428 else if (!strcmp (cmd, "body"))
430 else if (!strcmp (cmd, "context"))
432 else if (!strcmp (cmd, "init"))
436 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from s and build a list of
 * lexRuleAction nodes through ap.
 * Tokens are read with readParseToken until it returns 0.  Code tokens are
 * stored with regxCodeMk.  Pattern tokens get a fresh DFA from lexSpecDFA
 * that is parsed with dfa_parse and finished with dfa_mkstate; a parse
 * problem is logged as "regular expression error".  BEGIN and INIT are
 * rejected here with a warning.
 */
442 static int actionListMk (struct lexSpec *spec, const char *s,
443 struct lexRuleAction **ap)
449 while ((tok = readParseToken (&s, &len)))
457 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
459 regxCodeMk (&(*ap)->u.code, s, len);
463 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* Records whether a BODY marker preceded this pattern. */
465 (*ap)->u.pattern.body = bodyMark;
467 (*ap)->u.pattern.dfa = lexSpecDFA ();
469 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
475 logf (LOG_WARN, "regular expression error '%.*s'", sz, s0);
481 printf ("pattern: %.*s\n", sz, s0);
483 dfa_mkstate ((*ap)->u.pattern.dfa);
487 logf (LOG_WARN, "cannot use BEGIN here");
490 logf (LOG_WARN, "cannot use INIT here");
493 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line s of a filter specification.
 * - CONTEXT: the following token must be a name (REGX_CODE); a context of
 *   that name is created and linked in front of spec->context.
 * - A context named "main" is created by lexContextCreate ("main").
 * - BEGIN / END / INIT: the corresponding action list of the current
 *   context is deleted and rebuilt from s.
 * - Otherwise the line is a pattern rule: the pattern is parsed into the
 *   context DFA, a lexRule numbered with the context's ruleNo counter is
 *   pushed on the front of the rule list, and its actions are built with
 *   actionListMk.
 */
503 int readOneSpec (struct lexSpec *spec, const char *s)
507 struct lexContext *lc;
509 tok = readParseToken (&s, &len);
510 if (tok == REGX_CONTEXT)
512 char context_name[32];
513 tok = readParseToken (&s, &len);
514 if (tok != REGX_CODE)
516 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* The name is copied into the fixed 32-byte buffer and terminated. */
521 memcpy (context_name, s, len);
522 context_name[len] = '\0';
523 lc = lexContextCreate (context_name);
524 lc->next = spec->context;
529 spec->context = lexContextCreate ("main");
534 actionListDel (&spec->context->beginActionList);
535 actionListMk (spec, s, &spec->context->beginActionList);
538 actionListDel (&spec->context->endActionList);
539 actionListMk (spec, s, &spec->context->endActionList);
542 actionListDel (&spec->context->initActionList);
543 actionListMk (spec, s, &spec->context->initActionList);
547 logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s);
549 r = dfa_parse (spec->context->dfa, &s);
552 logf (LOG_WARN, "regular expression error. r=%d", r);
557 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
561 rp = (struct lexRule *) xmalloc (sizeof(*rp));
562 rp->info.no = spec->context->ruleNo++;
563 rp->next = spec->context->rules;
564 spec->context->rules = rp;
565 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter specification file for spec.
 * With a Tcl interpreter attached "<name>.tflt" is opened; "<name>.flt" is
 * the other file name used.  The file is read character by character into
 * the WRBUF lineBuf; each assembled line is passed to readOneSpec.
 * Afterwards every context gets a fastRule table that maps a rule number
 * to its lexRuleInfo, and its DFA is finished with dfa_mkstate.
 */
570 int readFileSpec (struct lexSpec *spec)
572 struct lexContext *lc;
573 int c, i, errors = 0;
579 if (spec->tcl_interp)
/* NOTE(review): sprintf into fname is unbounded -- confirm fname can hold
   spec->name plus the suffix. */
581 sprintf (fname, "%s.tflt", spec->name);
582 spec_inf = data1_path_fopen (spec->dh, fname, "r");
587 sprintf (fname, "%s.flt", spec->name);
588 spec_inf = data1_path_fopen (spec->dh, fname, "r");
592 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
595 logf (LOG_LOG, "reading regx filter %s", fname);
597 if (spec->tcl_interp)
598 logf (LOG_LOG, "Tcl enabled");
604 debug_dfa_followpos = 0;
608 lineBuf = wrbuf_alloc();
613 wrbuf_rewind (lineBuf);
/* Lines starting with '#' or white space are consumed up to end of line. */
614 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
616 while (c != '\n' && c != EOF)
629 wrbuf_putc(lineBuf, c);
637 if (c != ' ' && c != '\t')
/* Terminate the collected text so it can be used as a C string. */
642 wrbuf_putc(lineBuf, '\0');
643 readOneSpec (spec, wrbuf_buf(lineBuf));
644 spec->lineNo += addLine;
648 wrbuf_free(lineBuf, 1);
650 for (lc = spec->context; lc; lc = lc->next)
653 lc->fastRule = (struct lexRuleInfo **)
654 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
655 for (i = 0; i < lc->ruleNo; i++)
656 lc->fastRule[i] = NULL;
657 for (rp = lc->rules; rp; rp = rp->next)
658 lc->fastRule[rp->info.no] = &rp->info;
659 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL.  grs_read_regx and
   grs_read_tcl declare locals named curLexSpec that shadow this variable. */
668 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes at ebuf to the record tree being built.
 * With an attribute name (attribute_str/attribute_len) the text becomes
 * the value of that attribute on a tag node: a new attribute is created
 * when no attribute of that name exists, otherwise the text is appended to
 * the existing value.
 * Without an attribute the text is appended to the data node at the
 * current level (a new DATA1N_data node is made when there is none); the
 * bytes are gathered in the per-level concatBuf.
 */
671 static void execData (struct lexSpec *spec,
672 const char *ebuf, int elen, int formatted_text,
673 const char *attribute_str, int attribute_len)
675 struct data1_node *res, *parent;
678 if (elen == 0) /* shouldn't happen, but it does! */
682 logf (LOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
683 ebuf, 40, ebuf + elen-40);
684 else if (elen == 1 && ebuf[0] == '\n')
686 logf (LOG_LOG, "data(new line)");
689 logf (LOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
691 logf (LOG_LOG, "data(%d bytes)", elen);
694 if (spec->d1_level <= 1)
697 parent = spec->d1_stack[spec->d1_level -1];
704 if (res->which != DATA1N_tag)
706 /* sweep through existing attributes.. */
707 for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
708 if (strlen((*ap)->name) == attribute_len &&
709 !memcmp((*ap)->name, attribute_str, attribute_len))
713 /* new attribute. Create it with name + value */
714 *ap = nmem_malloc(spec->m, sizeof(**ap));
716 (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
717 memcpy((*ap)->name, attribute_str, attribute_len);
718 (*ap)->name[attribute_len] = '\0';
720 (*ap)->value = nmem_malloc(spec->m, elen+1);
721 memcpy((*ap)->value, ebuf, elen);
722 (*ap)->value[elen] = '\0';
727 /* append to value if attribute already exists */
728 char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
729 strcpy(nv, (*ap)->value);
730 memcpy (nv + strlen(nv), ebuf, elen);
731 nv[strlen(nv)+elen] = '\0';
/* An existing data node at this level is extended; org_len is the
   amount of text it already has. */
737 if ((res = spec->d1_stack[spec->d1_level]) &&
738 res->which == DATA1N_data)
739 org_len = res->u.data.len;
744 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
745 res->u.data.what = DATA1I_text;
747 res->u.data.formatted_text = formatted_text;
748 res->u.data.data = 0;
750 if (spec->d1_stack[spec->d1_level])
751 spec->d1_stack[spec->d1_level]->next = res;
752 spec->d1_stack[spec->d1_level] = res;
/* Grow the concat buffer (with 256 bytes of slack) when the combined
   text no longer fits; the old contents are carried over. */
754 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
756 char *old_buf, *new_buf;
758 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
759 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
760 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
762 memcpy (new_buf, old_buf, org_len);
765 spec->concatBuf[spec->d1_level].buf = new_buf;
767 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
768 res->u.data.len += elen;
/* execDataP: add plain data; forwards to execData with no attribute
   (attribute_str 0, attribute_len 0). */
772 static void execDataP (struct lexSpec *spec,
773 const char *ebuf, int elen, int formatted_text)
775 execData (spec, ebuf, elen, formatted_text, 0, 0);
/*
 * tagDataRelease: finish the text data node at the current level, if any.
 * For a DATA1N_data node holding DATA1I_text the gathered bytes are copied
 * out of the per-level concatBuf into the node: into nmem storage when the
 * length exceeds DATA1_LOCALDATA, otherwise into the node's own lbuf.
 * The node must not have data attached yet and must be non-empty (asserts).
 */
778 static void tagDataRelease (struct lexSpec *spec)
782 if ((res = spec->d1_stack[spec->d1_level]) &&
783 res->which == DATA1N_data &&
784 res->u.data.what == DATA1I_text)
786 assert (!res->u.data.data);
787 assert (res->u.data.len > 0);
788 if (res->u.data.len > DATA1_LOCALDATA)
789 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
791 res->u.data.data = res->lbuf;
792 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant node described by class, type and value.
 * class and type are copied into local buffers (truncated to
 * DATA1_MAX_SYMBOL-1 characters) and looked up with data1_getvartypebyct in
 * the varset of the record's abstract syntax.  When the parent is not a
 * variant node an enclosing DATA1N_variant node is created first.  A new
 * variant node is then created, given the type and a copy of value
 * (truncated to DATA1_LOCALDATA-1 characters, stored in lbuf), and pushed
 * on d1_stack.
 */
797 static void variantBegin (struct lexSpec *spec,
798 const char *class_str, int class_len,
799 const char *type_str, int type_len,
800 const char *value_str, int value_len)
/* NOTE(review): parent is initialised from d1_stack[d1_level - 1] before
   the d1_level == 0 guard below, so the guard does not cover this read. */
802 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
803 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
808 if (spec->d1_level == 0)
810 logf (LOG_WARN, "in variant begin. No record type defined");
813 if (class_len >= DATA1_MAX_SYMBOL)
814 class_len = DATA1_MAX_SYMBOL-1;
815 memcpy (tclass, class_str, class_len);
816 tclass[class_len] = '\0';
818 if (type_len >= DATA1_MAX_SYMBOL)
819 type_len = DATA1_MAX_SYMBOL-1;
820 memcpy (ttype, type_str, type_len);
821 ttype[type_len] = '\0';
824 logf (LOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
829 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
833 if (parent->which != DATA1N_variant)
835 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
/* Pending text at this level is finished before the slot is reused. */
836 if (spec->d1_stack[spec->d1_level])
837 tagDataRelease (spec);
838 spec->d1_stack[spec->d1_level] = res;
839 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Scan downwards over the variant nodes on the stack for one that has
   the same type. */
841 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
842 if (spec->d1_stack[i]->u.variant.type == tp)
849 logf (LOG_LOG, "variant node(%d)", spec->d1_level);
851 parent = spec->d1_stack[spec->d1_level-1];
852 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
853 res->u.variant.type = tp;
855 if (value_len >= DATA1_LOCALDATA)
856 value_len =DATA1_LOCALDATA-1;
857 memcpy (res->lbuf, value_str, value_len);
858 res->lbuf[value_len] = '\0';
860 res->u.variant.value = res->lbuf;
862 if (spec->d1_stack[spec->d1_level])
863 tagDataRelease (spec);
864 spec->d1_stack[spec->d1_level] = res;
865 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space (as classified by isspace) from a tag name
 * given as pointer *tag and length *len.  The first loop scans back over
 * trailing white space, the second scans forward over leading white space.
 */
868 static void tagStrip (const char **tag, int *len)
872 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
875 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element named by tag/len below the current parent.
 * A warning is logged when no record has been started (d1_level == 0).
 * The name is stripped of surrounding white space, pending text at the
 * current level is finished, a tag node is created with data1_mk_tag_n
 * under d1_stack[d1_level - 1], and the stack is deepened by one level
 * with an empty slot on top.
 */
881 static void tagBegin (struct lexSpec *spec,
882 const char *tag, int len)
884 if (spec->d1_level == 0)
886 logf (LOG_WARN, "in element begin. No record type defined");
889 tagStrip (&tag, &len);
890 if (spec->d1_stack[spec->d1_level])
891 tagDataRelease (spec);
894 logf (LOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
897 spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
898 spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
899 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while the stack is deeper than min_level.
 * The name is stripped of surrounding white space first.  On each round
 * pending text is finished with tagDataRelease; the node at the current
 * level is compared with tag/len (tag node, equal length, equal bytes).
 */
902 static void tagEnd (struct lexSpec *spec, int min_level,
903 const char *tag, int len)
905 tagStrip (&tag, &len);
906 while (spec->d1_level > min_level)
908 tagDataRelease (spec);
910 if (spec->d1_level == 0)
912 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
914 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
916 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
920 logf (LOG_LOG, "end tag(%d)", spec->d1_level);
/*
 * tryMatch: run dfa over the input, starting at file offset *pptr.
 * Characters are fetched with f_win_advance.  When a match is reported
 * *mptr receives the offset where it starts and *pptr the offset just past
 * its end.  A state's rule_no is taken when the previous character was a
 * newline; otherwise its rule_nno is taken when set.
 */
925 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
926 struct DFA *dfa, int greedy)
928 struct DFA_state *state = dfa->states[0];
931 unsigned char c_prev = 0;
932 int ptr = *pptr; /* current pointer */
933 int start_ptr = *pptr; /* first char of match */
934 int last_ptr = 0; /* last char of match */
935 int last_rule = 0; /* rule number of current match */
942 c = f_win_advance (spec, &ptr);
946 if (dfa->states[0] == state)
951 c = f_win_advance (spec, &ptr);
953 if (ptr == F_WIN_EOF)
967 if (--i < 0) /* no transition for character c */
971 *mptr = start_ptr; /* match starts here */
972 *pptr = last_ptr; /* match end here (+1) */
/* Back to the initial DFA state. */
975 state = dfa->states[0];
978 c = f_win_advance (spec, &ptr);
/* c lies in the character range of transition t: follow it. */
984 else if (c >= t->ch[0] && c <= t->ch[1])
986 state = dfa->states[t->to];
987 if (state->rule_no && c_prev == '\n')
989 last_rule = state->rule_no;
992 else if (state->rule_nno)
994 last_rule = state->rule_nno;
/*
 * execTok: read the next token of an action code string at *src.
 * Blanks and tabs are skipped.  Recognised forms:
 *  - '$' followed by digits: the digits form an argument number n and the
 *    token text is fetched with f_win_get from arg_start[n]..arg_end[n];
 *  - a double-quoted string;
 *  - a newline or ';';
 *  - a word that runs up to the next white-space character.
 * The token text and length are handed back through tokBuf and tokLen.
 */
1005 static int execTok (struct lexSpec *spec, const char **src,
1006 const char **tokBuf, int *tokLen)
1008 const char *s = *src;
1010 while (*s == ' ' || *s == '\t')
1014 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1018 while (*s >= '0' && *s <= '9')
1019 n = n*10 + (*s++ -'0');
1020 if (spec->arg_no == 0)
1027 if (n >= spec->arg_no)
1029 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1033 else if (*s == '\"')
1036 while (*s && *s != '\"')
1038 *tokLen = s - *tokBuf;
1043 else if (*s == '\n' || *s == ';')
1051 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1054 *tokLen = s - *tokBuf;
1061 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1064 *tokLen = s - *tokBuf;
/* regxStrz: copy the len bytes at src into the caller-supplied buffer str.
   NOTE(review): execCode passes the result to strcmp, so the buffer is
   presumably NUL-terminated as well -- confirm. */
1070 static char *regxStrz (const char *src, int len, char *str)
1074 memcpy (str, src, len);
/*
 * cmd_tcl_begin: implementation of the Tcl command "begin".
 * clientData is the lexSpec.  Sub-commands selected by argv[1]:
 *  - record <absyn>   (argc 3): create a root node and a tag node named
 *    after the abstract syntax and push both on d1_stack;
 *  - element <tag>    (argc 3): tagBegin;
 *  - variant <class> <type> <value> (argc 5): variantBegin;
 *  - context <name>   (argc 3): look the context up by name and push it on
 *    the context stack, or warn "unknown context".
 */
1080 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1081 int argc, char **argv)
1083 struct lexSpec *spec = (struct lexSpec *) clientData;
1086 if (!strcmp(argv[1], "record") && argc == 3)
1088 char *absynName = argv[2];
1092 logf (LOG_LOG, "begin record %s", absynName);
1094 res = data1_mk_root (spec->dh, spec->m, absynName);
1098 spec->d1_stack[spec->d1_level++] = res;
1100 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1102 spec->d1_stack[spec->d1_level++] = res;
1104 spec->d1_stack[spec->d1_level] = NULL;
1106 else if (!strcmp(argv[1], "element") && argc == 3)
1108 tagBegin (spec, argv[2], strlen(argv[2]));
1110 else if (!strcmp (argv[1], "variant") && argc == 5)
1112 variantBegin (spec, argv[2], strlen(argv[2]),
1113 argv[3], strlen(argv[3]),
1114 argv[4], strlen(argv[4]));
1116 else if (!strcmp (argv[1], "context") && argc == 3)
1118 struct lexContext *lc = spec->context;
1120 logf (LOG_LOG, "begin context %s",argv[2]);
/* Linear search of the context list by name. */
1122 while (lc && strcmp (argv[2], lc->name))
1126 spec->context_stack[++(spec->context_stack_top)] = lc;
1129 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: implementation of the Tcl command "end".
 * clientData is the lexSpec.  Sub-commands selected by argv[1]:
 *  - record: finish pending data at every level while d1_level is non-zero
 *    and set stop_flag;
 *  - element [-record] [name]: tagEnd; when the level has dropped to 1 or
 *    below, stop_flag is set as well;
 *  - context: pop the context stack unless it is already at the bottom.
 */
1136 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1137 int argc, char **argv)
1139 struct lexSpec *spec = (struct lexSpec *) clientData;
1143 if (!strcmp (argv[1], "record"))
1145 while (spec->d1_level)
1147 tagDataRelease (spec);
1151 logf (LOG_LOG, "end record");
1153 spec->stop_flag = 1;
1155 else if (!strcmp (argv[1], "element"))
1159 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* A missing element name is passed on as length 0. */
1168 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1169 if (spec->d1_level <= 1)
1172 logf (LOG_LOG, "end element end records");
1174 spec->stop_flag = 1;
1177 else if (!strcmp (argv[1], "context"))
1180 logf (LOG_LOG, "end context");
1182 if (spec->context_stack_top)
1183 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: implementation of the Tcl command "data".
 * Options handled: -text, -element <name>, -attribute <name>.
 * With -element the data is wrapped: tagBegin before, tagEnd (min level 2)
 * after.  On Tcl newer than 8.0 the data argument is converted from UTF-8
 * to the external encoding before being passed to execData; otherwise it
 * is passed as is.
 */
1190 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1191 int argc, char **argv)
1195 const char *element = 0;
1196 const char *attribute = 0;
1197 struct lexSpec *spec = (struct lexSpec *) clientData;
1201 if (!strcmp("-text", argv[argi]))
1206 else if (!strcmp("-element", argv[argi]))
1210 element = argv[argi++];
1212 else if (!strcmp("-attribute", argv[argi]))
1216 attribute = argv[argi++];
1222 tagBegin (spec, element, strlen(element));
1226 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1228 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1229 execData (spec, native, strlen(native), textFlag, attribute,
1230 attribute ? strlen(attribute) : 0);
/* native lives in ds; release it once execData has consumed the text. */
1231 Tcl_DStringFree (&ds);
1233 execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
1234 attribute ? strlen(attribute) : 0);
1239 tagEnd (spec, 2, NULL, 0);
/*
 * cmd_tcl_unread: implementation of the Tcl command "unread".
 * Accepts an optional "-offset <n>" followed by an argument index.  The
 * read position spec->ptr is moved to the start of that match argument
 * plus the offset.  An index at or beyond arg_no is replaced by the last
 * valid index.
 */
1243 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1244 int argc, char **argv)
1246 struct lexSpec *spec = (struct lexSpec *) clientData;
1253 if (!strcmp("-offset", argv[argi]))
1258 offset = atoi(argv[argi]);
/* NOTE(review): no comes from atoi and is only limited from above here; a
   negative value would index before arg_start -- confirm this cannot
   happen. */
1267 no = atoi(argv[argi]);
1268 if (no >= spec->arg_no)
1269 no = spec->arg_no - 1;
1270 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl code of an action.
 * Each match argument i is first published as the Tcl variable named by
 * its decimal index ("0", "1", ...).  The code is then evaluated at global
 * level, as a Tcl object when HAVE_TCL_OBJECTS is set and as a string
 * otherwise.  A failure is logged together with errorInfo.
 */
1274 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1278 for (i = 0; i < spec->arg_no; i++)
1280 char var_name[10], *var_buf;
1283 sprintf (var_name, "%d", i);
1284 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* The window text is NUL-terminated in place for Tcl_SetVar and the
   overwritten byte is put back afterwards. */
1288 ch = var_buf[var_len];
1289 var_buf[var_len] = '\0';
1290 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1291 var_buf[var_len] = ch;
1294 #if HAVE_TCL_OBJECTS
1295 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1297 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1301 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1302 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1303 spec->tcl_interp->errorLine,
1304 spec->tcl_interp->result,
1305 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret the built-in (non-Tcl) action language in code->str.
 * Tokens are read with execTok.  Commands handled:
 *   begin record <absyn> | element <tag> | variant <class> <type> <value>
 *         | context <name>
 *   end   record | element [-record] [<tag>] | context
 *   data  [-text] [-element <tag>] [-attribute <name>] <text>
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands, bad keywords and stray tokens are logged as warnings.
 */
1311 static void execCode (struct lexSpec *spec, struct regxCode *code)
1313 const char *s = code->str;
1315 const char *cmd_str;
1317 r = execTok (spec, &s, &cmd_str, &cmd_len);
1324 r = execTok (spec, &s, &cmd_str, &cmd_len);
1327 p = regxStrz (cmd_str, cmd_len, ptmp);
1328 if (!strcmp (p, "begin"))
1330 r = execTok (spec, &s, &cmd_str, &cmd_len);
1333 logf (LOG_WARN, "missing keyword after 'begin'");
1336 p = regxStrz (cmd_str, cmd_len, ptmp);
1337 if (!strcmp (p, "record"))
1339 r = execTok (spec, &s, &cmd_str, &cmd_len);
1342 if (spec->d1_level <= 1)
1344 static char absynName[64];
1349 memcpy (absynName, cmd_str, cmd_len);
1350 absynName[cmd_len] = '\0';
1352 logf (LOG_LOG, "begin record %s", absynName);
/* Root node first, then a tag node named after the abstract syntax;
   both are pushed on d1_stack. */
1354 res = data1_mk_root (spec->dh, spec->m, absynName);
1358 spec->d1_stack[spec->d1_level++] = res;
1360 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1362 spec->d1_stack[spec->d1_level++] = res;
1364 spec->d1_stack[spec->d1_level] = NULL;
1366 r = execTok (spec, &s, &cmd_str, &cmd_len);
1368 else if (!strcmp (p, "element"))
1370 r = execTok (spec, &s, &cmd_str, &cmd_len);
1373 tagBegin (spec, cmd_str, cmd_len);
1374 r = execTok (spec, &s, &cmd_str, &cmd_len);
1376 else if (!strcmp (p, "variant"))
1379 const char *class_str = NULL;
1381 const char *type_str = NULL;
1383 const char *value_str = NULL;
1384 r = execTok (spec, &s, &cmd_str, &cmd_len);
1387 class_str = cmd_str;
1388 class_len = cmd_len;
1389 r = execTok (spec, &s, &cmd_str, &cmd_len);
1395 r = execTok (spec, &s, &cmd_str, &cmd_len);
1398 value_str = cmd_str;
1399 value_len = cmd_len;
1401 variantBegin (spec, class_str, class_len,
1402 type_str, type_len, value_str, value_len);
1405 r = execTok (spec, &s, &cmd_str, &cmd_len);
1407 else if (!strcmp (p, "context"))
1411 struct lexContext *lc = spec->context;
1412 r = execTok (spec, &s, &cmd_str, &cmd_len);
1413 p = regxStrz (cmd_str, cmd_len, ptmp);
1415 logf (LOG_LOG, "begin context %s", p);
1417 while (lc && strcmp (p, lc->name))
/* "begin context" pushes the context on the stack. */
1420 spec->context_stack[++(spec->context_stack_top)] = lc;
1422 logf (LOG_WARN, "unknown context %s", p);
1425 r = execTok (spec, &s, &cmd_str, &cmd_len);
1429 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1432 else if (!strcmp (p, "end"))
1434 r = execTok (spec, &s, &cmd_str, &cmd_len);
1437 logf (LOG_WARN, "missing keyword after 'end'");
1440 p = regxStrz (cmd_str, cmd_len, ptmp);
1441 if (!strcmp (p, "record"))
1443 while (spec->d1_level)
1445 tagDataRelease (spec);
1448 r = execTok (spec, &s, &cmd_str, &cmd_len);
1450 logf (LOG_LOG, "end record");
1452 spec->stop_flag = 1;
1454 else if (!strcmp (p, "element"))
1457 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1459 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1464 tagEnd (spec, min_level, cmd_str, cmd_len);
1465 r = execTok (spec, &s, &cmd_str, &cmd_len);
1468 tagEnd (spec, min_level, NULL, 0);
1469 if (spec->d1_level <= 1)
1472 logf (LOG_LOG, "end element end records");
1474 spec->stop_flag = 1;
1478 else if (!strcmp (p, "context"))
1481 logf (LOG_LOG, "end context");
/* Pop the context stack, but never below its bottom entry. */
1483 if (spec->context_stack_top)
1484 (spec->context_stack_top)--;
1485 r = execTok (spec, &s, &cmd_str, &cmd_len);
1488 logf (LOG_WARN, "bad keyword '%s' after end", p);
1490 else if (!strcmp (p, "data"))
1494 const char *element_str = NULL;
1496 const char *attribute_str = NULL;
1498 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1500 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1502 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1504 r = execTok (spec, &s, &element_str, &element_len);
1508 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1511 r = execTok (spec, &s, &attribute_str, &attribute_len);
1516 logf (LOG_WARN, "bad data option: %.*s",
1521 logf (LOG_WARN, "missing data item after data");
1525 tagBegin (spec, element_str, element_len);
1528 execData (spec, cmd_str, cmd_len, textFlag,
1529 attribute_str, attribute_len);
1530 r = execTok (spec, &s, &cmd_str, &cmd_len);
1533 tagEnd (spec, 2, NULL, 0);
1535 else if (!strcmp (p, "unread"))
1538 r = execTok (spec, &s, &cmd_str, &cmd_len);
1539 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1541 r = execTok (spec, &s, &cmd_str, &cmd_len);
1544 logf (LOG_WARN, "missing number after -offset");
1547 p = regxStrz (cmd_str, cmd_len, ptmp);
1549 r = execTok (spec, &s, &cmd_str, &cmd_len);
1555 logf (LOG_WARN, "missing index after unread command");
/* The index must be a single decimal digit. */
1558 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1560 logf (LOG_WARN, "bad index after unread command");
1565 no = *cmd_str - '0';
1566 if (no >= spec->arg_no)
1567 no = spec->arg_no - 1;
1568 spec->ptr = spec->arg_start[no] + offset;
1570 r = execTok (spec, &s, &cmd_str, &cmd_len);
1572 else if (!strcmp (p, "context"))
1576 struct lexContext *lc = spec->context;
1577 r = execTok (spec, &s, &cmd_str, &cmd_len);
1578 p = regxStrz (cmd_str, cmd_len, ptmp);
1580 while (lc && strcmp (p, lc->name))
/* Plain "context" replaces the top of the context stack; unlike
   "begin context" it does not push. */
1583 spec->context_stack[spec->context_stack_top] = lc;
1585 logf (LOG_WARN, "unknown context %s", p);
1588 r = execTok (spec, &s, &cmd_str, &cmd_len);
1592 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1593 r = execTok (spec, &s, &cmd_str, &cmd_len);
1598 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1600 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: run the action list ap for a rule whose match began at
 * start_ptr; *pptr is the current read position.
 * Match arguments are recorded in the local arg_start/arg_end arrays, which
 * are published through spec->arg_start / spec->arg_end (argument 0 starts
 * at start_ptr).  Pattern actions call tryMatch and record a further
 * argument; code actions are run with execTcl when a Tcl interpreter is
 * attached and with execCode otherwise.  spec->stop_flag is checked after
 * code has run.
 */
1607 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1608 int start_ptr, int *pptr)
1617 arg_start[0] = start_ptr;
1619 spec->arg_start = arg_start;
1620 spec->arg_end = arg_end;
/* Pattern marked as BODY. */
1627 if (ap->u.pattern.body)
1629 arg_start[arg_no] = *pptr;
1630 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1632 arg_end[arg_no] = F_WIN_EOF;
1634 arg_start[arg_no] = F_WIN_EOF;
1635 arg_end[arg_no] = F_WIN_EOF;
1636 yaz_log(LOG_DEBUG, "Pattern match rest of record");
1641 arg_end[arg_no] = sptr;
1643 arg_start[arg_no] = sptr;
1644 arg_end[arg_no] = *pptr;
/* Pattern without BODY marker: tryMatch is called with greedy = 1. */
1649 arg_start[arg_no] = *pptr;
1650 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1652 if (sptr != arg_start[arg_no])
1654 arg_end[arg_no] = *pptr;
1659 spec->arg_no = arg_no;
1662 if (spec->tcl_interp)
1663 execTcl(spec, ap->u.code);
1665 execCode (spec, ap->u.code);
1667 execCode (spec, ap->u.code);
1670 if (spec->stop_flag)
1674 arg_start[arg_no] = *pptr;
1675 arg_end[arg_no] = F_WIN_EOF;
/* execRule: run the actions of rule number ruleNo in context; the rule is
   found through the context's fastRule table and handed to execAction. */
1684 static int execRule (struct lexSpec *spec, struct lexContext *context,
1685 int ruleNo, int start_ptr, int *pptr)
1688 logf (LOG_LOG, "exec rule %d", ruleNo);
1690 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop.  Runs the DFA of the context on top of the
 * context stack over the input, starting at *ptr.
 * Input that is skipped without matching any rule (between skip_ptr and
 * the start of a match) is passed to execDataP as plain data.  When a rule
 * has matched, its actions are run with execRule.  After a rule has run
 * the context is re-read from the context stack.  When an end-of-record
 * callback (f_win_ef) is set it is called with the current offset.
 */
1694 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1696 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1697 struct DFA_state *state = context->dfa->states[0];
/* Scanning starts as if a newline had just been seen. */
1700 unsigned char c_prev = '\n';
1702 int last_rule = 0; /* rule number of current match */
1703 int last_ptr = *ptr; /* last char of match */
1704 int start_ptr = *ptr; /* first char of match */
1705 int skip_ptr = *ptr; /* first char of run */
1709 c = f_win_advance (spec, ptr);
1710 if (*ptr == F_WIN_EOF)
1712 /* end of file met */
1715 /* there was a match */
1716 if (skip_ptr < start_ptr)
1718 /* deal with chars that didn't match */
1721 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1722 execDataP (spec, buf, size, 0);
1724 /* restore pointer */
1727 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1729 /* restore skip pointer */
1733 else if (skip_ptr < *ptr)
1735 /* deal with chars that didn't match */
1738 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1739 execDataP (spec, buf, size, 0);
1741 state = context->dfa->states[0];
1742 if (*ptr == F_WIN_EOF)
1749 { /* no transition for character c ... */
1752 if (skip_ptr < start_ptr)
1754 /* deal with chars that didn't match */
1757 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1758 execDataP (spec, buf, size, 0);
1760 /* restore pointer */
1762 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1764 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1767 logf (LOG_LOG, "regx: endf ptr=%d", *ptr);
1769 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* Re-read the context from the top of the context stack. */
1773 context = spec->context_stack[spec->context_stack_top];
1776 last_ptr = start_ptr = *ptr;
1780 c_prev = f_win_advance (spec, &start_ptr);
1785 c_prev = f_win_advance (spec, &start_ptr);
1788 state = context->dfa->states[0];
1791 else if (c >= t->ch[0] && c <= t->ch[1])
1792 { /* transition ... */
1793 state = context->dfa->states[t->to];
1798 last_rule = state->rule_no;
1801 else if (state->rule_nno)
1803 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record and return the root of its node tree
 * (d1_stack[0]).
 * The context named context_name is looked up in spec->context; a warning
 * is logged when it cannot be found.  The stop flag and the context stack
 * are reset, the context's init and begin action lists are executed,
 * lexNode scans the input, any levels still open are finished with
 * tagDataRelease, and finally the end action list is executed.
 */
1815 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1816 const char *context_name)
1818 struct lexContext *lt = spec->context;
1821 spec->stop_flag = 0;
1823 spec->context_stack_top = 0;
1826 if (!strcmp (lt->name, context_name))
1832 logf (LOG_WARN, "cannot find context %s", context_name);
1835 spec->context_stack[spec->context_stack_top] = lt;
1836 spec->d1_stack[spec->d1_level] = NULL;
1841 execAction (spec, lt->initActionList, ptr, &ptr);
1844 execAction (spec, lt->beginActionList, ptr, &ptr);
1845 lexNode (spec, &ptr);
1846 while (spec->d1_level)
1848 tagDataRelease (spec);
1851 execAction (spec, lt->endActionList, ptr, &ptr);
1852 return spec->d1_stack[0];
/* grs_destroy: tear down the filter state held in clientData (a struct
   lexSpecs); its lexSpec is destroyed with lexSpecDestroy. */
1855 void grs_destroy(void *clientData)
1857 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1860 lexSpecDestroy(&specs->spec);
/* grs_init: allocate the filter state, a struct lexSpecs.  The grs_read_*
   functions and grs_destroy receive it back as clientData. */
1865 void *grs_init(void)
1867 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter named p->type.
 * The parsed specification is cached in the client data: it is (re)built
 * with lexSpecCreate + readFileSpec only when there is none yet or when
 * its name differs from p->type, and destroyed again when reading the
 * spec file fails.  The input window is then reset and wired to the
 * read/seek/end callbacks and file handle from p, and the record is parsed
 * with lexRoot starting in context "main".
 */
1872 data1_node *grs_read_regx (struct grs_read_info *p)
1875 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* Local of the same name as the file-scope curLexSpec; it shadows it. */
1876 struct lexSpec **curLexSpec = &specs->spec;
1879 logf (LOG_LOG, "grs_read_regx");
1881 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1884 lexSpecDestroy (curLexSpec);
1885 *curLexSpec = lexSpecCreate (p->type, p->dh);
1886 res = readFileSpec (*curLexSpec);
1889 lexSpecDestroy (curLexSpec);
1893 (*curLexSpec)->dh = p->dh;
1896 (*curLexSpec)->f_win_start = 0;
1897 (*curLexSpec)->f_win_end = 0;
1898 (*curLexSpec)->f_win_rf = p->readf;
1899 (*curLexSpec)->f_win_sf = p->seekf;
1900 (*curLexSpec)->f_win_fh = p->fh;
1901 (*curLexSpec)->f_win_ef = p->endf;
1902 (*curLexSpec)->f_win_size = 500000;
1904 (*curLexSpec)->m = p->mem;
1905 return lexRoot (*curLexSpec, p->offset, "main");
1908 static struct recTypeGrs regx_type = {
/* Public handle for the regx record type; points at the regx_type
   descriptor defined just above. */
1915 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl-enabled filter named p->type.
 * Works like grs_read_regx, except that when the specification has to be
 * (re)built a Tcl interpreter is created and initialised first and the
 * commands "begin", "end", "data" and "unread" are registered in it with
 * the lexSpec as client data.  The record is parsed with lexRoot starting
 * in context "main".
 */
1918 data1_node *grs_read_tcl (struct grs_read_info *p)
1921 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* Local of the same name as the file-scope curLexSpec; it shadows it. */
1922 struct lexSpec **curLexSpec = &specs->spec;
1925 logf (LOG_LOG, "grs_read_tcl");
1927 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1929 Tcl_Interp *tcl_interp;
1931 lexSpecDestroy (curLexSpec);
1932 *curLexSpec = lexSpecCreate (p->type, p->dh);
1933 Tcl_FindExecutable("");
1934 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1935 Tcl_Init(tcl_interp);
1936 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1937 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1938 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1939 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1941 res = readFileSpec (*curLexSpec);
1944 lexSpecDestroy (curLexSpec);
1948 (*curLexSpec)->dh = p->dh;
1951 (*curLexSpec)->f_win_start = 0;
1952 (*curLexSpec)->f_win_end = 0;
1953 (*curLexSpec)->f_win_rf = p->readf;
1954 (*curLexSpec)->f_win_sf = p->seekf;
1955 (*curLexSpec)->f_win_fh = p->fh;
1956 (*curLexSpec)->f_win_ef = p->endf;
1957 (*curLexSpec)->f_win_size = 500000;
1959 (*curLexSpec)->m = p->mem;
1960 return lexRoot (*curLexSpec, p->offset, "main");
1963 static struct recTypeGrs tcl_type = {
1970 RecTypeGrs recTypeGrs_tcl = &tcl_type;