X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=recctrl%2Fregxread.c;h=6fdbd89b2ce47bd18af85a7bc04fd4922f903b54;hb=5adb31268b057741850de38eafd42baf8dc947ea;hp=cc74152915daf333d1fc733b78c97ce4ff370004;hpb=ef696645cc3b7e0f4027008d1dc589c0f0f90c1f;p=idzebra-moved-to-github.git diff --git a/recctrl/regxread.c b/recctrl/regxread.c index cc74152..6fdbd89 100644 --- a/recctrl/regxread.c +++ b/recctrl/regxread.c @@ -1,10 +1,44 @@ /* - * Copyright (C) 1994-1999, Index Data + * Copyright (C) 1994-2001, Index Data * All rights reserved. - * Sebastian Hammer, Adam Dickmeiss * * $Log: regxread.c,v $ - * Revision 1.26 1999-05-26 07:49:14 adam + * Revision 1.37 2001-05-29 08:51:59 adam + * More fixes for character encodings. + * + * Revision 1.36 2001/05/22 21:02:26 adam + * Fixes for Tcl UTF8 character handling. + * + * Revision 1.35 2001/03/29 21:31:31 adam + * Fixed "record begin" for Tcl filter. + * + * Revision 1.34 2000/11/29 14:24:01 adam + * Script configure uses yaz pthreads options. Added locking for + * zebra_register_{lock,unlock}. + * + * Revision 1.33 1999/11/30 13:48:04 adam + * Improved installation. Updated for inclusion of YAZ header files. + * + * Revision 1.32 1999/09/07 07:19:21 adam + * Work on character mapping. Implemented replace rules. + * + * Revision 1.31 1999/07/14 13:05:29 adam + * Tcl filter works with objects when TCL is version 8 or later; filter + * works with strings otherwise (slow). + * + * Revision 1.30 1999/07/14 10:55:28 adam + * Fixed memory leak. + * + * Revision 1.29 1999/07/12 07:27:54 adam + * Improved speed of Tcl processing. Fixed one memory leak. + * + * Revision 1.28 1999/07/06 12:26:04 adam + * Fixed filters so that MS-DOS CR is ignored. + * + * Revision 1.27 1999/06/28 13:25:40 quinn + * Improved diagnostics for Tcl + * + * Revision 1.26 1999/05/26 07:49:14 adam * C++ compilation. * * Revision 1.25 1999/05/25 12:33:32 adam @@ -173,13 +207,17 @@ #include #include -#include +#include #include #include #include "grsread.h" #if HAVE_TCL_H #include + +#if MAJOR_VERSION >= 8 +#define HAVE_TCL_OBJECTS +#endif #endif #define REGX_DEBUG 0 @@ -198,6 +236,9 @@ struct regxCode { char *str; +#if HAVE_TCL_OBJECTS + Tcl_Obj *tcl_obj; +#endif }; struct lexRuleAction { @@ -237,7 +278,6 @@ struct lexContext { }; struct lexConcatBuf { - int len; int max; char *buf; }; @@ -266,7 +306,7 @@ struct lexSpec { int (*f_win_rf)(void *, char *, size_t); off_t (*f_win_sf)(void *, off_t); - struct lexConcatBuf **concatBuf; + struct lexConcatBuf *concatBuf; int maxLevel; data1_node **d1_stack; int d1_level; @@ -344,6 +384,10 @@ static void regxCodeDel (struct regxCode **pp) struct regxCode *p = *pp; if (p) { +#if HAVE_TCL_OBJECTS + if (p->tcl_obj) + Tcl_DecrRefCount (p->tcl_obj); +#endif xfree (p->str); xfree (p); *pp = NULL; @@ -358,6 +402,11 @@ static void regxCodeMk (struct regxCode **pp, const char *buf, int len) p->str = (char *) xmalloc (len+1); memcpy (p->str, buf, len); p->str[len] = '\0'; +#if HAVE_TCL_OBJECTS + p->tcl_obj = Tcl_NewStringObj ((char *) buf, len); + if (p->tcl_obj) + Tcl_IncrRefCount (p->tcl_obj); +#endif *pp = p; } @@ -414,6 +463,7 @@ static void lexContextDestroy (struct lexContext *p) { struct lexRule *rp, *rp1; + dfa_delete (&p->dfa); xfree (p->fastRule); for (rp = p->rules; rp; rp = rp1) { @@ -423,6 +473,7 @@ static void lexContextDestroy (struct lexContext *p) } actionListDel (&p->beginActionList); actionListDel (&p->endActionList); + actionListDel (&p->initActionList); xfree (p->name); xfree (p); } @@ -447,14 +498,12 @@ static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh) p->f_win_buf = NULL; p->maxLevel = 128; - p->concatBuf = (struct lexConcatBuf **) + p->concatBuf = (struct lexConcatBuf *) xmalloc (sizeof(*p->concatBuf) * p->maxLevel); for (i = 0; i < p->maxLevel; i++) { - p->concatBuf[i] = (struct lexConcatBuf *) - xmalloc (sizeof(**p->concatBuf)); - p->concatBuf[i]->len = p->concatBuf[i]->max = 0; - p->concatBuf[i]->buf = 0; + p->concatBuf[i].max = 0; + p->concatBuf[i].buf = 0; } p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel); p->d1_level = 0; @@ -473,7 +522,7 @@ static void lexSpecDestroy (struct lexSpec **pp) return ; for (i = 0; i < p->maxLevel; i++) - xfree (p->concatBuf[i]); + xfree (p->concatBuf[i].buf); xfree (p->concatBuf); lt = p->context; @@ -483,7 +532,7 @@ static void lexSpecDestroy (struct lexSpec **pp) lexContextDestroy (lt); lt = lt_next; } -#if HAVE_TCL_H +#if HAVE_TCL_OBJECTS if (p->tcl_interp) Tcl_DeleteInterp (p->tcl_interp); #endif @@ -501,7 +550,7 @@ static int readParseToken (const char **cpp, int *len) char cmd[32]; int i, level; - while (*cp == ' ' || *cp == '\t' || *cp == '\n') + while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r') cp++; switch (*cp) { @@ -545,7 +594,8 @@ static int readParseToken (const char **cpp, int *len) { logf (LOG_WARN, "bad character %d %c", *cp, *cp); cp++; - while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n') + while (*cp && *cp != ' ' && *cp != '\t' && + *cp != '\n' && *cp != '\r') cp++; *cpp = cp; return 0; @@ -668,7 +718,7 @@ int readOneSpec (struct lexSpec *spec, const char *s) break; case REGX_PATTERN: #if REGX_DEBUG - logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s); + logf (LOG_LOG, "rule %d %s", spec->context->ruleNo, s); #endif r = dfa_parse (spec->context->dfa, &s); if (r) @@ -694,41 +744,40 @@ int readOneSpec (struct lexSpec *spec, const char *s) int readFileSpec (struct lexSpec *spec) { struct lexContext *lc; - char *lineBuf; - int lineSize = 512; int c, i, errors = 0; FILE *spec_inf = 0; + WRBUF lineBuf; + char fname[256]; - lineBuf = (char *) xmalloc (1+lineSize); #if HAVE_TCL_H if (spec->tcl_interp) { - sprintf (lineBuf, "%s.tflt", spec->name); - spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r"); + sprintf (fname, "%s.tflt", spec->name); + spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r"); } #endif if (!spec_inf) { - sprintf (lineBuf, "%s.flt", spec->name); - spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r"); + sprintf (fname, "%s.flt", spec->name); + spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r"); } if (!spec_inf) { logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name); - xfree (lineBuf); return -1; } - logf (LOG_LOG, "reading regx filter %s", lineBuf); + logf (LOG_LOG, "reading regx filter %s", fname); #if HAVE_TCL_H if (spec->tcl_interp) logf (LOG_LOG, "Tcl enabled"); #endif + lineBuf = wrbuf_alloc(); spec->lineNo = 0; c = getc (spec_inf); while (c != EOF) { - int off = 0; - if (c == '#' || c == '\n' || c == ' ' || c == '\t') + wrbuf_rewind (lineBuf); + if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r') { while (c != '\n' && c != EOF) c = getc (spec_inf); @@ -739,12 +788,14 @@ int readFileSpec (struct lexSpec *spec) else { int addLine = 0; - - lineBuf[off++] = c; + while (1) { int c1 = c; + wrbuf_putc(lineBuf, c); c = getc (spec_inf); + while (c == '\r') + c = getc (spec_inf); if (c == EOF) break; if (c1 == '\n') @@ -753,17 +804,14 @@ int readFileSpec (struct lexSpec *spec) break; addLine++; } - lineBuf[off] = c; - if (off < lineSize) - off++; } - lineBuf[off] = '\0'; - readOneSpec (spec, lineBuf); + wrbuf_putc(lineBuf, '\0'); + readOneSpec (spec, wrbuf_buf(lineBuf)); spec->lineNo += addLine; } } fclose (spec_inf); - xfree (lineBuf); + wrbuf_free(lineBuf, 1); #if 0 debug_dfa_trav = 1; @@ -802,12 +850,12 @@ static void execData (struct lexSpec *spec, return ; #if REGX_DEBUG if (elen > 40) - logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen, + logf (LOG_LOG, "data (%d bytes) %.15s ... %.*s", elen, ebuf, 15, ebuf + elen-15); else if (elen > 0) - logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf); + logf (LOG_LOG, "data (%d bytes) %.*s", elen, elen, ebuf); else - logf (LOG_DEBUG, "data (%d bytes)", elen); + logf (LOG_LOG, "data (%d bytes)", elen); #endif if (spec->d1_level <= 1) @@ -846,21 +894,20 @@ static void execData (struct lexSpec *spec, parent->child = res; spec->d1_stack[spec->d1_level] = res; } - if (org_len + elen >= spec->concatBuf[spec->d1_level]->max) + if (org_len + elen >= spec->concatBuf[spec->d1_level].max) { char *old_buf, *new_buf; - spec->concatBuf[spec->d1_level]->max = org_len + elen + 256; - new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level]->max); - if ((old_buf = spec->concatBuf[spec->d1_level]->buf)) + spec->concatBuf[spec->d1_level].max = org_len + elen + 256; + new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max); + if ((old_buf = spec->concatBuf[spec->d1_level].buf)) { memcpy (new_buf, old_buf, org_len); xfree (old_buf); } - spec->concatBuf[spec->d1_level]->buf = new_buf; + spec->concatBuf[spec->d1_level].buf = new_buf; } - assert (spec->concatBuf[spec->d1_level]); - memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen); + memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen); res->u.data.len += elen; } @@ -884,7 +931,7 @@ static void tagDataRelease (struct lexSpec *spec) res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len); else res->u.data.data = res->lbuf; - memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf, + memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf, res->u.data.len); } } @@ -916,7 +963,7 @@ static void variantBegin (struct lexSpec *spec, ttype[type_len] = '\0'; #if REGX_DEBUG - logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, + logf (LOG_LOG, "variant begin %s %s (%d)", tclass, ttype, spec->d1_level); #endif @@ -953,7 +1000,7 @@ static void variantBegin (struct lexSpec *spec, } #if REGX_DEBUG - logf (LOG_DEBUG, "variant node (%d)", spec->d1_level); + logf (LOG_LOG, "variant node (%d)", spec->d1_level); #endif parent = spec->d1_stack[spec->d1_level-1]; res = data1_mk_node (spec->dh, spec->m); @@ -997,9 +1044,9 @@ static void tagStrip (const char **tag, int *len) static void tagBegin (struct lexSpec *spec, const char *tag, int len) { - struct data1_node *parent = spec->d1_stack[spec->d1_level -1]; + struct data1_node *parent; data1_element *elem = NULL; - data1_node *partag = get_parent_tag(spec->dh, parent); + data1_node *partag; data1_node *res; data1_element *e = NULL; int localtag = 0; @@ -1010,11 +1057,12 @@ static void tagBegin (struct lexSpec *spec, return ; } tagStrip (&tag, &len); + + parent = spec->d1_stack[spec->d1_level -1]; + partag = get_parent_tag(spec->dh, parent); - res = data1_mk_node (spec->dh, spec->m); + res = data1_mk_node_type (spec->dh, spec->m, DATA1N_tag); res->parent = parent; - res->which = DATA1N_tag; - res->u.tag.get_bytes = -1; if (len >= DATA1_LOCALDATA) res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1); @@ -1025,7 +1073,7 @@ static void tagBegin (struct lexSpec *spec, res->u.tag.tag[len] = '\0'; #if REGX_DEBUG - logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level); + logf (LOG_LOG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level); #endif if (parent->which == DATA1N_variant) return ; @@ -1037,9 +1085,6 @@ static void tagBegin (struct lexSpec *spec, spec->d1_stack[0]->u.root.absyn, e, res->u.tag.tag); res->u.tag.element = elem; - res->u.tag.node_selected = 0; - res->u.tag.make_variantlist = 0; - res->u.tag.no_data_requested = 0; res->root = parent->root; parent->last_child = res; @@ -1072,7 +1117,7 @@ static void tagEnd (struct lexSpec *spec, int min_level, break; } #if REGX_DEBUG - logf (LOG_DEBUG, "end tag (%d)", spec->d1_level); + logf (LOG_LOG, "end tag (%d)", spec->d1_level); #endif } @@ -1189,7 +1234,8 @@ static int execTok (struct lexSpec *spec, const char **src, else if (*s == '-') { *tokBuf = s++; - while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';') + while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' && + *s != ';') s++; *tokLen = s - *tokBuf; *src = s; @@ -1198,7 +1244,8 @@ static int execTok (struct lexSpec *spec, const char **src, else { *tokBuf = s++; - while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';') + while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' && + *s != ';') s++; *tokLen = s - *tokBuf; } @@ -1228,7 +1275,7 @@ static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp, data1_absyn *absyn; #if REGX_DEBUG - logf (LOG_DEBUG, "begin record %s", absynName); + logf (LOG_LOG, "begin record %s", absynName); #endif if (!(absyn = data1_get_absyn (spec->dh, absynName))) logf (LOG_WARN, "Unknown tagset: %s", absynName); @@ -1238,7 +1285,8 @@ static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp, res = data1_mk_node (spec->dh, spec->m); res->which = DATA1N_root; - res->u.root.type = absynName; + res->u.root.type = + data1_insert_string(spec->dh, res, spec->m, absynName); res->u.root.absyn = absyn; res->root = res; @@ -1260,7 +1308,7 @@ static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp, { struct lexContext *lc = spec->context; #if REGX_DEBUG - logf (LOG_DEBUG, "begin context %s",argv[2]); + logf (LOG_LOG, "begin context %s",argv[2]); #endif while (lc && strcmp (argv[2], lc->name)) lc = lc->next; @@ -1291,7 +1339,7 @@ static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp, (spec->d1_level)--; } #if REGX_DEBUG - logf (LOG_DEBUG, "end record"); + logf (LOG_LOG, "end record"); #endif spec->stop_flag = 1; } @@ -1312,7 +1360,7 @@ static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp, if (spec->d1_level == 0) { #if REGX_DEBUG - logf (LOG_DEBUG, "end element end records"); + logf (LOG_LOG, "end element end records"); #endif spec->stop_flag = 1; } @@ -1320,7 +1368,7 @@ static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp, else if (!strcmp (argv[1], "context")) { #if REGX_DEBUG - logf (LOG_DEBUG, "end context"); + logf (LOG_LOG, "end context"); #endif if (spec->context_stack_top) (spec->context_stack_top)--; @@ -1359,7 +1407,14 @@ static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp, while (argi < argc) { +#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0) + Tcl_DString ds; + char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds); + execData (spec, native, strlen(native), textFlag); + Tcl_DStringFree (&ds); +#else execData (spec, argv[argi], strlen(argv[argi]), textFlag); +#endif argi++; } if (element) @@ -1401,6 +1456,7 @@ static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp, static void execTcl (struct lexSpec *spec, struct regxCode *code) { int i; + int ret; for (i = 0; i < spec->arg_no; i++) { char var_name[10], *var_buf; @@ -1417,7 +1473,19 @@ static void execTcl (struct lexSpec *spec, struct regxCode *code) var_buf[var_len] = ch; } } - Tcl_Eval (spec->tcl_interp, code->str); +#if HAVE_TCL_OBJECTS + ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj); +#else + ret = Tcl_GlobalEval (spec->tcl_interp, code->str); +#endif + if (ret != TCL_OK) + { + const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0); + logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s", + spec->tcl_interp->errorLine, + spec->tcl_interp->result, + err ? err : "[NO ERRORINFO]"); + } } /* HAVE_TCL_H */ #endif @@ -1464,7 +1532,7 @@ static void execCode (struct lexSpec *spec, struct regxCode *code) absynName[cmd_len] = '\0'; #if REGX_DEBUG - logf (LOG_DEBUG, "begin record %s", absynName); + logf (LOG_LOG, "begin record %s", absynName); #endif if (!(absyn = data1_get_absyn (spec->dh, absynName))) logf (LOG_WARN, "Unknown tagset: %s", absynName); @@ -1531,7 +1599,7 @@ static void execCode (struct lexSpec *spec, struct regxCode *code) r = execTok (spec, &s, &cmd_str, &cmd_len); p = regxStrz (cmd_str, cmd_len, ptmp); #if REGX_DEBUG - logf (LOG_DEBUG, "begin context %s", p); + logf (LOG_LOG, "begin context %s", p); #endif while (lc && strcmp (p, lc->name)) lc = lc->next; @@ -1566,7 +1634,7 @@ static void execCode (struct lexSpec *spec, struct regxCode *code) } r = execTok (spec, &s, &cmd_str, &cmd_len); #if REGX_DEBUG - logf (LOG_DEBUG, "end record"); + logf (LOG_LOG, "end record"); #endif spec->stop_flag = 1; } @@ -1588,7 +1656,7 @@ static void execCode (struct lexSpec *spec, struct regxCode *code) if (spec->d1_level == 0) { #if REGX_DEBUG - logf (LOG_DEBUG, "end element end records"); + logf (LOG_LOG, "end element end records"); #endif spec->stop_flag = 1; } @@ -1597,7 +1665,7 @@ static void execCode (struct lexSpec *spec, struct regxCode *code) else if (!strcmp (p, "context")) { #if REGX_DEBUG - logf (LOG_DEBUG, "end context"); + logf (LOG_LOG, "end context"); #endif if (spec->context_stack_top) (spec->context_stack_top)--; @@ -1793,7 +1861,7 @@ static int execRule (struct lexSpec *spec, struct lexContext *context, int ruleNo, int start_ptr, int *pptr) { #if REGX_DEBUG - logf (LOG_DEBUG, "exec rule %d", ruleNo); + logf (LOG_LOG, "exec rule %d", ruleNo); #endif return execAction (spec, context->fastRule[ruleNo]->actionList, start_ptr, pptr); @@ -1871,7 +1939,7 @@ data1_node *lexNode (struct lexSpec *spec, int *ptr) if (spec->f_win_ef && *ptr != F_WIN_EOF) { #if REGX_DEBUG - logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr); + logf (LOG_LOG, "regx: endf ptr=%d", *ptr); #endif (*spec->f_win_ef)(spec->f_win_fh, *ptr); } @@ -1983,7 +2051,7 @@ data1_node *grs_read_regx (struct grs_read_info *p) struct lexSpec **curLexSpec = &specs->spec; #if REGX_DEBUG - logf (LOG_DEBUG, "grs_read_regx"); + logf (LOG_LOG, "grs_read_regx"); #endif if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type)) { @@ -2029,7 +2097,7 @@ data1_node *grs_read_tcl (struct grs_read_info *p) struct lexSpec **curLexSpec = &specs->spec; #if REGX_DEBUG - logf (LOG_DEBUG, "grs_read_tcl"); + logf (LOG_LOG, "grs_read_tcl"); #endif if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type)) { @@ -2037,7 +2105,9 @@ data1_node *grs_read_tcl (struct grs_read_info *p) if (*curLexSpec) lexSpecDestroy (curLexSpec); *curLexSpec = lexSpecCreate (p->type, p->dh); + Tcl_FindExecutable(""); tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp(); + Tcl_Init(tcl_interp); Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0); Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0); Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);