X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=dfa%2Fdfa.c;h=b20650f41de2fc9f4ce703ca2226a92e7e3c615d;hp=9ce116b0da5813e51e050724fe1181681cf49f74;hb=732870c555c7d32c5d2b6a4914c6fa7232eb26b2;hpb=b879b04a092d5b00cc866cf16f755e55053d2e89 diff --git a/dfa/dfa.c b/dfa/dfa.c index 9ce116b..b20650f 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -1,21 +1,26 @@ -/* - * Copyright (C) 1994, Index Data I/S - * All rights reserved. - * Sebastian Hammer, Adam Dickmeiss - * - * $Log: dfa.c,v $ - * Revision 1.3 1995-09-04 12:33:26 adam - * Various cleanup. YAZ util used instead. - * - * Revision 1.2 1995/01/25 11:30:50 adam - * Simple error reporting when parsing regular expressions. - * Memory usage reduced. - * - * Revision 1.1 1995/01/24 16:02:52 adam - * New private header file in dfa module (dfap.h). - * Module no longer uses yacc to parse regular expressions. - * - */ +/* This file is part of the Zebra server. + Copyright (C) 1994-2011 Index Data + +Zebra is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +Zebra is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*/ + + +#if HAVE_CONFIG_H +#include +#endif #include #include @@ -23,7 +28,7 @@ #include #include -#include +#include #include "dfap.h" #include "imalloc.h" @@ -43,8 +48,8 @@ struct Tnode { } u; unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */ unsigned nullable : 1; - Set firstpos; /* first positions */ - Set lastpos; /* last positions */ + DFASet firstpos; /* first positions */ + DFASet lastpos; /* last positions */ }; struct Tblock { /* Tnode bucket (block) */ @@ -60,41 +65,29 @@ int debug_dfa_trav = 0; int debug_dfa_tran = 0; int debug_dfa_followpos = 0; int dfa_verbose = 0; -int yydebug = 0; -static struct DFA_parse *parse_info = NULL; - -static int err_code; -static int inside_string; -static const unsigned char *expr_ptr; -static unsigned short *ctrl_chars; -static struct Tnode **posar; - -static SetType poset; -static Set *followpos; - -static struct Tnode *mk_Tnode (void); -static struct Tnode *mk_Tnode_cset (BSet charset); -static void term_Tnode (void); +static struct Tnode *mk_Tnode (struct DFA_parse *parse_info); +static struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info, + BSet charset); +static void term_Tnode (struct DFA_parse *parse_info); static void - del_followpos (void), - init_pos (void), - del_pos (void), - mk_dfa_tran (struct DFA_states *dfas), - add_follow (Set lastpos, Set firstpos), - dfa_trav (struct Tnode *n), - init_followpos (void), - mk_dfa_tran (struct DFA_states *dfas), - pr_tran (struct DFA_states *dfas), - pr_verbose (struct DFA_states *dfas), - pr_followpos (void), + del_followpos (struct DFA_parse *parse_info), + init_pos (struct DFA_parse *parse_info), + del_pos (struct DFA_parse *parse_info), + mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas), + add_follow (struct DFA_parse *parse_info, DFASet lastpos, DFASet firstpos), + dfa_trav (struct DFA_parse *parse_info, struct Tnode *n), + init_followpos (struct DFA_parse *parse_info), + pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas), + pr_verbose (struct DFA_parse *parse_info, struct DFA_states *dfas), + pr_followpos (struct DFA_parse *parse_info), out_char (int c), - lex (void); + lex (struct DFA_parse *parse_info); static int - nextchar (int *esc), - read_charset (void); + nextchar (struct DFA_parse *parse_info, int *esc), + read_charset (struct DFA_parse *parse_info); static const char *str_char (unsigned c); @@ -113,28 +106,25 @@ static const char #define L_END 12 #define L_START 13 -static int lookahead; -static unsigned look_ch; -static BSet look_chars; -static struct Tnode *expr_1 (void), - *expr_2 (void), - *expr_3 (void), - *expr_4 (void); +static struct Tnode *expr_1 (struct DFA_parse *parse_info), + *expr_2 (struct DFA_parse *parse_info), + *expr_3 (struct DFA_parse *parse_info), + *expr_4 (struct DFA_parse *parse_info); -static struct Tnode *expr_1 (void) +static struct Tnode *expr_1 (struct DFA_parse *parse_info) { struct Tnode *t1, *t2, *tn; - if (!(t1 = expr_2 ())) + if (!(t1 = expr_2 (parse_info))) return t1; - while (lookahead == L_ALT) + while (parse_info->lookahead == L_ALT) { - lex (); - if (!(t2 = expr_2 ())) + lex (parse_info); + if (!(t2 = expr_2 (parse_info))) return t2; - tn = mk_Tnode (); + tn = mk_Tnode (parse_info); tn->pos = OR; tn->u.p[0] = t1; tn->u.p[1] = t2; @@ -143,23 +133,23 @@ static struct Tnode *expr_1 (void) return t1; } -static struct Tnode *expr_2 (void) +static struct Tnode *expr_2 (struct DFA_parse *parse_info) { struct Tnode *t1, *t2, *tn; - if (!(t1 = expr_3 ())) + if (!(t1 = expr_3 (parse_info))) return t1; - while (lookahead == L_WILD || - lookahead == L_ANYZ || - lookahead == L_ANY || - lookahead == L_LP || - lookahead == L_CHAR || - lookahead == L_CHARS) + while (parse_info->lookahead == L_WILD || + parse_info->lookahead == L_ANYZ || + parse_info->lookahead == L_ANY || + parse_info->lookahead == L_LP || + parse_info->lookahead == L_CHAR || + parse_info->lookahead == L_CHARS) { - if (!(t2 = expr_3 ())) + if (!(t2 = expr_3 (parse_info))) return t2; - tn = mk_Tnode (); + tn = mk_Tnode (parse_info); tn->pos = CAT; tn->u.p[0] = t1; tn->u.p[1] = t2; @@ -168,249 +158,281 @@ static struct Tnode *expr_2 (void) return t1; } -static struct Tnode *expr_3 (void) +static struct Tnode *expr_3 (struct DFA_parse *parse_info) { struct Tnode *t1, *tn; - if (!(t1 = expr_4 ())) + if (!(t1 = expr_4 (parse_info))) return t1; - if (lookahead == L_CLOS0) + if (parse_info->lookahead == L_CLOS0) { - lex (); - tn = mk_Tnode (); + lex (parse_info); + tn = mk_Tnode (parse_info); tn->pos = STAR; tn->u.p[0] = t1; t1 = tn; } - else if (lookahead == L_CLOS1) + else if (parse_info->lookahead == L_CLOS1) { - lex (); - tn = mk_Tnode (); + lex (parse_info); + tn = mk_Tnode (parse_info); tn->pos = PLUS; tn->u.p[0] = t1; t1 = tn; } - else if (lookahead == L_QUEST) + else if (parse_info->lookahead == L_QUEST) { - lex (); - tn = mk_Tnode(); + lex (parse_info); + tn = mk_Tnode(parse_info); tn->pos = OR; tn->u.p[0] = t1; - tn->u.p[1] = mk_Tnode(); + tn->u.p[1] = mk_Tnode(parse_info); tn->u.p[1]->pos = EPSILON; t1 = tn; } return t1; } -static struct Tnode *expr_4 (void) +static struct Tnode *expr_4 (struct DFA_parse *parse_info) { struct Tnode *t1; - switch (lookahead) + switch (parse_info->lookahead) { case L_LP: - lex (); - if (!(t1 = expr_1 ())) + lex (parse_info); + if (!(t1 = expr_1 (parse_info))) return t1; - if (lookahead == L_RP) - lex (); + if (parse_info->lookahead == L_RP) + lex (parse_info); else return NULL; break; case L_CHAR: - t1 = mk_Tnode(); - t1->pos = ++(parse_info->position); - t1->u.ch[1] = t1->u.ch[0] = look_ch; - lex (); + t1 = mk_Tnode(parse_info); + t1->pos = ++parse_info->position; + t1->u.ch[1] = t1->u.ch[0] = parse_info->look_ch; + lex (parse_info); break; case L_CHARS: - t1 = mk_Tnode_cset (look_chars); - lex (); + t1 = mk_Tnode_cset (parse_info, parse_info->look_chars); + lex (parse_info); break; case L_ANY: - t1 = mk_Tnode_cset (parse_info->anyset); - lex (); + t1 = mk_Tnode_cset (parse_info, parse_info->anyset); + lex (parse_info); break; case L_ANYZ: - t1 = mk_Tnode(); + t1 = mk_Tnode(parse_info); t1->pos = OR; - t1->u.p[0] = mk_Tnode_cset (parse_info->anyset); - t1->u.p[1] = mk_Tnode(); + t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset); + t1->u.p[1] = mk_Tnode(parse_info); t1->u.p[1]->pos = EPSILON; - lex (); + lex (parse_info); break; case L_WILD: - t1 = mk_Tnode(); + t1 = mk_Tnode(parse_info); t1->pos = STAR; - t1->u.p[0] = mk_Tnode_cset (parse_info->anyset); - lex (); + t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset); + lex (parse_info); default: t1 = NULL; } return t1; } -static void do_parse (dfap, s, cc, tnp) -struct DFA_parse *dfap; -char **s; -const unsigned short *cc; -struct Tnode **tnp; +static void do_parse (struct DFA_parse *parse_info, const char **s, + struct Tnode **tnp) { - int i; - struct Tnode *t1, *t2; + int start_anchor_flag = 0; + struct Tnode *t1, *t2, *tn; - for (i=0; cc[i]; i +=2) - ; - ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars)); - for (i=0; (ctrl_chars[i] = cc[i]); i ++) - switch (cc[++i]) + parse_info->err_code = 0; + parse_info->expr_ptr = * (const unsigned char **) s; + + parse_info->inside_string = 0; + lex (parse_info); + if (parse_info->lookahead == L_START) + { + start_anchor_flag = 1; + lex (parse_info); + } + if (parse_info->lookahead == L_END) + { + t1 = mk_Tnode (parse_info); + t1->pos = ++parse_info->position; + t1->u.ch[1] = t1->u.ch[0] = '\n'; + lex (parse_info); + } + else + { + t1 = expr_1 (parse_info); + if (t1 && parse_info->lookahead == L_END) { - case '(': - ctrl_chars[i] = L_LP; - break; - case ')': - ctrl_chars[i] = L_RP; - break; - case '.': - ctrl_chars[i] = L_ANY; - break; - case '|': - ctrl_chars[i] = L_ALT; - break; - case '?': - ctrl_chars[i] = L_QUEST; - break; - case '+': - ctrl_chars[i] = L_CLOS1; - break; - case '*': - ctrl_chars[i] = L_CLOS0; - break; - case '$': - ctrl_chars[i] = L_END; - break; - case '^': - ctrl_chars[i] = L_START; - break; - case '!': - ctrl_chars[i] = L_ANY; - break; - case '#': - ctrl_chars[i] = L_ANYZ; - break; - case '@': - ctrl_chars[i] = L_WILD; - break; - default: - ctrl_chars[i] = 0; + t2 = mk_Tnode (parse_info); + t2->pos = ++parse_info->position; + t2->u.ch[1] = t2->u.ch[0] = '\n'; + + tn = mk_Tnode (parse_info); + tn->pos = CAT; + tn->u.p[0] = t1; + tn->u.p[1] = t2; + t1 = tn; + + lex (parse_info); } - parse_info = dfap; - err_code = 0; - expr_ptr = (unsigned char *) *s; - - inside_string = 0; - lex (); - t1 = expr_1 (); - if (t1 && lookahead == 0) + } + if (t1 && parse_info->lookahead == 0) { - t2 = mk_Tnode(); + t2 = mk_Tnode(parse_info); t2->pos = ++parse_info->position; t2->u.ch[0] = -(++parse_info->rule); + t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule); - *tnp = mk_Tnode(); + *tnp = mk_Tnode(parse_info); (*tnp)->pos = CAT; (*tnp)->u.p[0] = t1; (*tnp)->u.p[1] = t2; } else { - if (!err_code) + if (!parse_info->err_code) { - if (lookahead == L_RP) - err_code = DFA_ERR_RP; - else if (lookahead == L_LP) - err_code = DFA_ERR_LP; + if (parse_info->lookahead == L_RP) + parse_info->err_code = DFA_ERR_RP; + else if (parse_info->lookahead == L_LP) + parse_info->err_code = DFA_ERR_LP; else - err_code = DFA_ERR_SYNTAX; + parse_info->err_code = DFA_ERR_SYNTAX; } } - *s = (char *) expr_ptr; - ifree (ctrl_chars); + *s = (const char *) parse_info->expr_ptr; } -static int nextchar (int *esc) +static int nextchar (struct DFA_parse *parse_info, int *esc) { *esc = 0; - if (*expr_ptr == '\0' || isspace(*expr_ptr)) + if (*parse_info->expr_ptr == '\0') return 0; - else if (*expr_ptr != '\\') - return *expr_ptr++; + else if (*parse_info->expr_ptr != '\\') + return *parse_info->expr_ptr++; *esc = 1; - switch (*++expr_ptr) + switch (*++parse_info->expr_ptr) { case '\r': case '\n': - case '\t': case '\0': return '\\'; + case '\t': + ++parse_info->expr_ptr; + return ' '; case 'n': - ++expr_ptr; + ++parse_info->expr_ptr; return '\n'; case 't': - ++expr_ptr; + ++parse_info->expr_ptr; return '\t'; case 'r': - ++expr_ptr; + ++parse_info->expr_ptr; return '\r'; case 'f': - ++expr_ptr; + ++parse_info->expr_ptr; return '\f'; default: - return *expr_ptr++; + return *parse_info->expr_ptr++; } } -static int nextchar_set (int *esc) +static int nextchar_set (struct DFA_parse *parse_info, int *esc) { - if (*expr_ptr == ' ') + if (*parse_info->expr_ptr == ' ') { *esc = 0; - return *expr_ptr++; + return *parse_info->expr_ptr++; } - return nextchar (esc); + return nextchar (parse_info, esc); } -static int read_charset (void) +static int read_charset (struct DFA_parse *parse_info) { - int i, ch0, ch1, esc0, esc1, cc = 0; - look_chars = mk_BSet (&parse_info->charset); - res_BSet (parse_info->charset, look_chars); + int i, ch0, esc0, cc = 0; + parse_info->look_chars = mk_BSet (&parse_info->charset); + res_BSet (parse_info->charset, parse_info->look_chars); - ch0 = nextchar_set (&esc0); + ch0 = nextchar_set (parse_info, &esc0); if (!esc0 && ch0 == '^') { cc = 1; - ch0 = nextchar_set (&esc0); + ch0 = nextchar_set (parse_info, &esc0); } + /** + ch0 is last met character + ch1 is "next" char + */ while (ch0 != 0) { + int ch1, esc1; if (!esc0 && ch0 == ']') break; - add_BSet (parse_info->charset, look_chars, ch0); - ch1 = nextchar_set (&esc1); + if (!esc0 && ch0 == '-') + { + ch1 = ch0; + esc1 = esc0; + ch0 = 1; + add_BSet (parse_info->charset, parse_info->look_chars, ch0); + } + else + { + if (ch0 == 1) + { + ch0 = nextchar(parse_info, &esc0); + } + else + { + if (parse_info->cmap) + { + const char **mapto; + char mapfrom[2]; + const char *mcp = mapfrom; + mapfrom[0] = ch0; + mapto = parse_info->cmap(parse_info->cmap_data, &mcp, 1); + assert (mapto); + ch0 = mapto[0][0]; + } + } + add_BSet (parse_info->charset, parse_info->look_chars, ch0); + ch1 = nextchar_set (parse_info, &esc1); + } if (!esc1 && ch1 == '-') { - if ((ch1 = nextchar_set (&esc1)) == 0) + int open_range = 0; + if ((ch1 = nextchar_set (parse_info, &esc1)) == 0) break; if (!esc1 && ch1 == ']') { - add_BSet (parse_info->charset, look_chars, '-'); - break; + ch1 = 255; + open_range = 1; + } + else if (ch1 == 1) + { + ch1 = nextchar(parse_info, &esc1); } - for (i=ch0; ++i<=ch1;) - add_BSet (parse_info->charset, look_chars, i); - ch0 = nextchar_set (&esc0); + else if (parse_info->cmap) + { + const char **mapto; + char mapfrom[2]; + const char *mcp = mapfrom; + mapfrom[0] = ch1; + mapto = (*parse_info->cmap) (parse_info->cmap_data, &mcp, 1); + assert (mapto); + ch1 = mapto[0][0]; + } + for (i = ch0; ++i <= ch1;) + add_BSet (parse_info->charset, parse_info->look_chars, i); + + if (open_range) + break; + ch0 = nextchar_set (parse_info, &esc0); } else { @@ -419,69 +441,73 @@ static int read_charset (void) } } if (cc) - com_BSet (parse_info->charset, look_chars); + com_BSet (parse_info->charset, parse_info->look_chars); return L_CHARS; } -unsigned short dfa_thompson_chars[] = +static int map_l_char (struct DFA_parse *parse_info) { - '*', '*', - '+', '+', - '|', '|', - '^', '^', - '$', '$', - '?', '?', - '.', '.', - '(', '(', - ')', ')', - 0 -}; + const char **mapto; + const char *cp0 = (const char *) (parse_info->expr_ptr-1); + int i = 0, len = strlen(cp0); -unsigned short dfa_ccl_chars[] = -{ - '#', '#', - '!', '!', - '?', '@', - 0 -}; + if (cp0[0] == 1 && cp0[1]) + { + parse_info->expr_ptr++; + parse_info->look_ch = ((unsigned char *) cp0)[1]; + return L_CHAR; + } + if (!parse_info->cmap) + return L_CHAR; -static int lex_sub(void) + mapto = (*parse_info->cmap) (parse_info->cmap_data, &cp0, len); + assert (mapto); + + parse_info->expr_ptr = (const unsigned char *) cp0; + parse_info->look_ch = ((unsigned char **) mapto)[i][0]; + yaz_log (YLOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch); + return L_CHAR; +} + +static int lex_sub(struct DFA_parse *parse_info) { int esc; - const unsigned short *cc; - while ((look_ch = nextchar (&esc)) != 0) - if (look_ch == '\"') + while ((parse_info->look_ch = nextchar (parse_info, &esc)) != 0) + if (parse_info->look_ch == '\"') { if (esc) - return L_CHAR; - inside_string = !inside_string; + return map_l_char (parse_info); + parse_info->inside_string = !parse_info->inside_string; } - else if (esc || inside_string) - return L_CHAR; - else if (look_ch == '[') - return read_charset(); - else if (look_ch == ' ') - return 0; - else + else if (esc || parse_info->inside_string) + return map_l_char (parse_info); + else if (parse_info->look_ch == '[') + return read_charset(parse_info); + else { - for (cc = ctrl_chars; *cc; cc += 2) - if (*cc == look_ch) + const int *cc; + for (cc = parse_info->charMap; *cc; cc += 2) + if (*cc == (int) (parse_info->look_ch)) + { + if (!cc[1]) + --parse_info->expr_ptr; return cc[1]; - return L_CHAR; + } + return map_l_char (parse_info); } return 0; } -static void lex (void) +static void lex (struct DFA_parse *parse_info) { - lookahead = lex_sub (); + parse_info->lookahead = lex_sub (parse_info); } static const char *str_char (unsigned c) { static char s[6]; s[0] = '\\'; - if (c < 32) + if (c < 32 || c >= 127) switch (c) { case '\r': @@ -521,7 +547,7 @@ static void out_char (int c) printf ("%s", str_char (c)); } -static void term_Tnode (void) +static void term_Tnode (struct DFA_parse *parse_info) { struct Tblock *t, *t1; for (t = parse_info->start; (t1 = t);) @@ -532,7 +558,7 @@ static void term_Tnode (void) } } -static struct Tnode *mk_Tnode (void) +static struct Tnode *mk_Tnode (struct DFA_parse *parse_info) { struct Tblock *tnew; @@ -553,9 +579,9 @@ static struct Tnode *mk_Tnode (void) return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD); } -struct Tnode *mk_Tnode_cset (BSet charset) +struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info, BSet charset) { - struct Tnode *tn1, *tn0 = mk_Tnode(); + struct Tnode *tn1, *tn0 = mk_Tnode(parse_info); int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0); if (ch0 == -1) tn0->pos = EPSILON; @@ -571,11 +597,11 @@ struct Tnode *mk_Tnode_cset (BSet charset) tn0->u.ch[1] = ch1-1; while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1) { - tn1 = mk_Tnode(); + tn1 = mk_Tnode(parse_info); tn1->pos = OR; tn1->u.p[0] = tn0; tn0 = tn1; - tn1 = tn0->u.p[1] = mk_Tnode(); + tn1 = tn0->u.p[1] = mk_Tnode(parse_info); tn1->u.ch[0] = ch0; tn1->pos = ++(parse_info->position); if ((ch1 = travi_BSet (parse_info->charset, charset, @@ -591,109 +617,115 @@ struct Tnode *mk_Tnode_cset (BSet charset) return tn0; } -static void del_followpos (void) +static void del_followpos (struct DFA_parse *parse_info) { - ifree (followpos); + ifree (parse_info->followpos); } -static void init_pos (void) +static void init_pos (struct DFA_parse *parse_info) { - posar = (struct Tnode **) imalloc (sizeof(struct Tnode*) + parse_info->posar = (struct Tnode **) imalloc (sizeof(struct Tnode*) * (1+parse_info->position)); } -static void del_pos (void) +static void del_pos (struct DFA_parse *parse_info) { - ifree (posar); + ifree (parse_info->posar); } -static void add_follow (Set lastpos, Set firstpos) +static void add_follow (struct DFA_parse *parse_info, + DFASet lastpos, DFASet firstpos) { while (lastpos) { - followpos[lastpos->value] = - union_Set (poset, followpos[lastpos->value], firstpos); + parse_info->followpos[lastpos->value] = + union_DFASet (parse_info->poset, + parse_info->followpos[lastpos->value], firstpos); lastpos = lastpos->next; } } -static void dfa_trav (struct Tnode *n) +static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n) { + struct Tnode **posar = parse_info->posar; + DFASetType poset = parse_info->poset; + switch (n->pos) { case CAT: - dfa_trav (n->u.p[0]); - dfa_trav (n->u.p[1]); + dfa_trav (parse_info, n->u.p[0]); + dfa_trav (parse_info, n->u.p[1]); n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable; - n->firstpos = mk_Set (poset); - n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos); + n->firstpos = mk_DFASet (poset); + n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[0]->firstpos); if (n->u.p[0]->nullable) - n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos); - n->lastpos = mk_Set (poset); - n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos); + n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[1]->firstpos); + n->lastpos = mk_DFASet (poset); + n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[1]->lastpos); if (n->u.p[1]->nullable) - n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos); + n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[0]->lastpos); - add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos); + add_follow (parse_info, n->u.p[0]->lastpos, n->u.p[1]->firstpos); - n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos); - n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos); - n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos); - n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos); + n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos); + n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos); + n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos); + n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos); if (debug_dfa_trav) printf ("CAT"); break; case OR: - dfa_trav (n->u.p[0]); - dfa_trav (n->u.p[1]); + dfa_trav (parse_info, n->u.p[0]); + dfa_trav (parse_info, n->u.p[1]); n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable; - n->firstpos = merge_Set (poset, n->u.p[0]->firstpos, + n->firstpos = merge_DFASet (poset, n->u.p[0]->firstpos, n->u.p[1]->firstpos); - n->lastpos = merge_Set (poset, n->u.p[0]->lastpos, + n->lastpos = merge_DFASet (poset, n->u.p[0]->lastpos, n->u.p[1]->lastpos); - n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos); - n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos); - n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos); - n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos); + n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos); + n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos); + n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos); + n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos); if (debug_dfa_trav) printf ("OR"); break; case PLUS: - dfa_trav (n->u.p[0]); + dfa_trav (parse_info, n->u.p[0]); n->nullable = n->u.p[0]->nullable; n->firstpos = n->u.p[0]->firstpos; n->lastpos = n->u.p[0]->lastpos; - add_follow (n->lastpos, n->firstpos); + add_follow (parse_info, n->lastpos, n->firstpos); if (debug_dfa_trav) printf ("PLUS"); break; case STAR: - dfa_trav (n->u.p[0]); + dfa_trav (parse_info, n->u.p[0]); n->nullable = 1; n->firstpos = n->u.p[0]->firstpos; n->lastpos = n->u.p[0]->lastpos; - add_follow (n->lastpos, n->firstpos); + add_follow (parse_info, n->lastpos, n->firstpos); if (debug_dfa_trav) printf ("STAR"); break; case EPSILON: n->nullable = 1; - n->lastpos = mk_Set (poset); - n->firstpos = mk_Set (poset); + n->lastpos = mk_DFASet (poset); + n->firstpos = mk_DFASet (poset); if (debug_dfa_trav) printf ("EPSILON"); break; default: posar[n->pos] = n; n->nullable = 0; - n->firstpos = mk_Set (poset); - n->firstpos = add_Set (poset, n->firstpos, n->pos); - n->lastpos = mk_Set (poset); - n->lastpos = add_Set (poset, n->lastpos, n->pos); + n->firstpos = mk_DFASet (poset); + n->firstpos = add_DFASet (poset, n->firstpos, n->pos); + n->lastpos = mk_DFASet (poset); + n->lastpos = add_DFASet (poset, n->lastpos, n->pos); if (debug_dfa_trav) + { if (n->u.ch[0] < 0) - printf ("#%d", -n->u.ch[0]); + printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]); else if (n->u.ch[1] > n->u.ch[0]) { putchar ('['); @@ -705,80 +737,99 @@ static void dfa_trav (struct Tnode *n) } else out_char (n->u.ch[0]); + } } if (debug_dfa_trav) { printf ("\n nullable : %c\n", n->nullable ? '1' : '0'); printf (" firstpos :"); - pr_Set (poset, n->firstpos); + pr_DFASet (poset, n->firstpos); printf (" lastpos :"); - pr_Set (poset, n->lastpos); + pr_DFASet (poset, n->lastpos); } } -static void init_followpos (void) +static void init_followpos (struct DFA_parse *parse_info) { - Set *fa; + DFASet *fa; int i; - followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position)); + parse_info->followpos = fa = + (DFASet *) imalloc (sizeof(DFASet) * (1+parse_info->position)); for (i = parse_info->position+1; --i >= 0; fa++) - *fa = mk_Set (poset); + *fa = mk_DFASet (parse_info->poset); } -static void mk_dfa_tran (struct DFA_states *dfas) +static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) { - int i, c; + int i, j, c; int max_char; short *pos, *pos_i; - Set tran_set; + DFASet tran_set; int char_0, char_1; struct DFA_state *dfa_from, *dfa_to; + struct Tnode **posar; + DFASetType poset; + DFASet *followpos; assert (parse_info); + + posar = parse_info->posar; max_char = parse_info->charset->size; pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1)); - tran_set = cp_Set (poset, parse_info->root->firstpos); + tran_set = cp_DFASet (parse_info->poset, parse_info->root->firstpos); i = add_DFA_state (dfas, &tran_set, &dfa_from); assert (i); dfa_from->rule_no = 0; + poset = parse_info->poset; + followpos = parse_info->followpos; while ((dfa_from = get_DFA_state (dfas))) { pos_i = pos; - i = 0; + j = i = 0; for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next) if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char) *pos_i++ = tran_set->value; - else if (c < 0 && (i == 0 || c > i)) - i = c; + else if (c < 0) + { + if (i == 0 || c > i) + i = c; + c = posar[tran_set->value]->u.ch[1]; + if (j == 0 || c > j) + j = c; + } *pos_i = -1; dfa_from->rule_no = -i; + dfa_from->rule_nno = -j; for (char_1 = 0; char_1 <= max_char; char_1++) { char_0 = max_char+1; for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) - if (posar[i]->u.ch[1] >= char_1) - if ((c=posar[i]->u.ch[0]) < char_0) - if (c < char_1) - char_0 = char_1; - else - char_0 = c; + if (posar[i]->u.ch[1] >= char_1 + && (c=posar[i]->u.ch[0]) < char_0) + { + if (c < char_1) + char_0 = char_1; + else + char_0 = c; + } - char_1 = max_char; if (char_0 > max_char) break; - tran_set = mk_Set (poset); + + char_1 = max_char; + + tran_set = mk_DFASet (poset); for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) - if ((c=posar[i]->u.ch[1]) >= char_0) - if (posar[i]->u.ch[0] <= char_0) - { - if (c < char_1) - char_1 = c; - tran_set = union_Set (poset, tran_set, followpos[i]); - } - else if (c <= char_1) - char_1 = c-1; + { + if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1) + char_1 = c - 1; /* forward chunk */ + else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1) + char_1 = c; /* backward chunk */ + if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0) + tran_set = union_DFASet (poset, tran_set, followpos[i]); + } if (tran_set) { add_DFA_state (dfas, &tran_set, &dfa_to); @@ -790,7 +841,7 @@ static void mk_dfa_tran (struct DFA_states *dfas) sort_DFA_states (dfas); } -static void pr_tran (struct DFA_states *dfas) +static void pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) { struct DFA_state *s; struct DFA_tran *tran; @@ -803,8 +854,11 @@ static void pr_tran (struct DFA_states *dfas) printf ("DFA state %d", no); if (s->rule_no) printf ("#%d", s->rule_no); + if (s->rule_nno) + printf (" #%d", s->rule_nno); + putchar (':'); - pr_Set (poset, s->set); + pr_DFASet (parse_info->poset, s->set); prev_no = -1; for (i=s->tran_no, tran=s->trans; --i >= 0; tran++) { @@ -824,28 +878,29 @@ static void pr_tran (struct DFA_states *dfas) } } -static void pr_verbose (struct DFA_states *dfas) +static void pr_verbose (struct DFA_parse *parse_info, struct DFA_states *dfas) { long i, j; int k; - printf ("%d/%d tree nodes used, %d bytes each\n", - parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode)); + printf ("%d/%d tree nodes used, %ld bytes each\n", + parse_info->use_Tnode, parse_info->max_Tnode, (long) sizeof (struct Tnode)); k = inf_BSetHandle (parse_info->charset, &i, &j); - printf ("%ld/%ld character sets, %d bytes each\n", - i/k, j/k, k*sizeof(BSetWord)); - k = inf_SetType (poset, &i, &j); + printf ("%ld/%ld character sets, %ld bytes each\n", + i/k, j/k, (long) k*sizeof(BSetWord)); + k = inf_DFASetType (parse_info->poset, &i, &j); printf ("%ld/%ld poset items, %d bytes each\n", i, j, k); printf ("%d DFA states\n", dfas->no); } -static void pr_followpos (void) +static void pr_followpos (struct DFA_parse *parse_info) { + struct Tnode **posar = parse_info->posar; int i; printf ("\nfollowsets:\n"); for (i=1; i <= parse_info->position; i++) { printf ("%3d:", i); - pr_Set (poset, followpos[i]); + pr_DFASet (parse_info->poset, parse_info->followpos[i]); putchar ('\t'); if (posar[i]->u.ch[0] < 0) @@ -866,29 +921,138 @@ static void pr_followpos (void) putchar ('\n'); } +void dfa_parse_cmap_clean (struct DFA *d) +{ + struct DFA_parse *dfa = d->parse_info; + + assert (dfa); + if (!dfa->charMap) + { + dfa->charMapSize = 7; + dfa->charMap = (int *) + imalloc (dfa->charMapSize * sizeof(*dfa->charMap)); + } + dfa->charMap[0] = 0; +} + +void dfa_parse_cmap_new (struct DFA *d, const int *cmap) +{ + struct DFA_parse *dfa = d->parse_info; + const int *cp; + int size; + + assert (dfa); + for (cp = cmap; *cp; cp += 2) + ; + size = cp - cmap + 1; + if (size > dfa->charMapSize) + { + if (dfa->charMap) + ifree (dfa->charMap); + dfa->charMapSize = size; + dfa->charMap = (int *) imalloc (size * sizeof(*dfa->charMap)); + } + memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap)); +} + +void dfa_parse_cmap_del (struct DFA *d, int from) +{ + struct DFA_parse *dfa = d->parse_info; + int *cc; + + assert (dfa); + for (cc = dfa->charMap; *cc; cc += 2) + if (*cc == from) + { + while ((cc[0] = cc[2])) + { + cc[1] = cc[3]; + cc += 2; + } + break; + } +} + +void dfa_parse_cmap_add (struct DFA *d, int from, int to) +{ + struct DFA_parse *dfa = d->parse_info; + int *cc; + int indx, size; + + assert (dfa); + for (cc = dfa->charMap; *cc; cc += 2) + if (*cc == from) + { + cc[1] = to; + return ; + } + indx = cc - dfa->charMap; + size = dfa->charMapSize; + if (indx >= size) + { + int *cn = (int *) imalloc ((size+16) * sizeof(*dfa->charMap)); + memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap)); + ifree (dfa->charMap); + dfa->charMap = cn; + dfa->charMapSize = size+16; + } + dfa->charMap[indx] = from; + dfa->charMap[indx+1] = to; + dfa->charMap[indx+2] = 0; +} + +void dfa_parse_cmap_thompson (struct DFA *d) +{ + static int thompson_chars[] = + { + '*', L_CLOS0, + '+', L_CLOS1, + '|', L_ALT, + '^', L_START, + '$', L_END, + '?', L_QUEST, + '.', L_ANY, + '(', L_LP, + ')', L_RP, + ' ', 0, + '\t', 0, + '\n', 0, + 0 + }; + dfa_parse_cmap_new (d, thompson_chars); +} + static struct DFA_parse *dfa_parse_init (void) { - parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse)); + struct DFA_parse *parse_info = + (struct DFA_parse *) imalloc (sizeof (struct DFA_parse)); parse_info->charset = mk_BSetHandle (255, 20); parse_info->position = 0; parse_info->rule = 0; parse_info->root = NULL; + /* initialize the anyset which by default does not include \n */ parse_info->anyset = mk_BSet (&parse_info->charset); res_BSet (parse_info->charset, parse_info->anyset); add_BSet (parse_info->charset, parse_info->anyset, '\n'); com_BSet (parse_info->charset, parse_info->anyset); + parse_info->use_Tnode = parse_info->max_Tnode = 0; + parse_info->start = parse_info->end = NULL; + parse_info->charMap = NULL; + parse_info->charMapSize = 0; + parse_info->cmap = NULL; return parse_info; } static void rm_dfa_parse (struct DFA_parse **dfap) { - parse_info = *dfap; + struct DFA_parse *parse_info = *dfap; assert (parse_info); - term_Tnode(); + term_Tnode(parse_info); rm_BSetHandle (&parse_info->charset); + ifree (parse_info->charMap); ifree (parse_info); *dfap = NULL; } @@ -896,27 +1060,30 @@ static void rm_dfa_parse (struct DFA_parse **dfap) static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk) { struct DFA_states *dfas; + struct DFA_parse *parse_info = dfap; assert (poset_chunk > 10); assert (dfap); - parse_info = dfap; - poset = mk_SetType (poset_chunk); - init_pos(); - init_followpos(); - dfa_trav (parse_info->root); + parse_info->poset = mk_DFASetType (poset_chunk); + init_pos(parse_info); + init_followpos(parse_info); + assert (parse_info->root); + dfa_trav (parse_info, parse_info->root); if (debug_dfa_followpos) - pr_followpos(); - init_DFA_states (&dfas, poset, STATE_HASH); - mk_dfa_tran (dfas); + pr_followpos(parse_info); + init_DFA_states (&dfas, parse_info->poset, (int) (STATE_HASH)); + mk_dfa_tran (parse_info, dfas); if (debug_dfa_tran) - pr_tran (dfas); + { + pr_tran (parse_info, dfas); + } if (dfa_verbose) - pr_verbose (dfas); - del_pos(); - del_followpos(); - rm_SetType (poset); + pr_verbose (parse_info, dfas); + del_pos(parse_info); + del_followpos(parse_info); + rm_DFASetType (parse_info->poset); return dfas; } @@ -924,28 +1091,50 @@ struct DFA *dfa_init (void) { struct DFA *dfa; - dfa = imalloc (sizeof(*dfa)); + dfa = (struct DFA *) imalloc (sizeof(*dfa)); dfa->parse_info = dfa_parse_init (); dfa->state_info = NULL; dfa->states = NULL; + dfa_parse_cmap_thompson (dfa); return dfa; } -int dfa_parse (struct DFA *dfa, char **pattern) +void dfa_anyset_includes_nl(struct DFA *dfa) +{ + add_BSet (dfa->parse_info->charset, dfa->parse_info->anyset, '\n'); +} + +void dfa_set_cmap (struct DFA *dfa, void *vp, + const char **(*cmap)(void *vp, const char **from, int len)) +{ + dfa->parse_info->cmap = cmap; + dfa->parse_info->cmap_data = vp; +} + +int dfa_get_last_rule (struct DFA *dfa) +{ + return dfa->parse_info->rule; +} + +int dfa_parse (struct DFA *dfa, const char **pattern) { struct Tnode *top; + struct DFA_parse *parse_info; assert (dfa); - do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top); - if (err_code) - return err_code; + assert (dfa->parse_info); + parse_info = dfa->parse_info; + + do_parse (parse_info, pattern, &top); + if (parse_info->err_code) + return parse_info->err_code; if ( !dfa->parse_info->root ) dfa->parse_info->root = top; else { struct Tnode *n; - n = mk_Tnode (); + n = mk_Tnode (parse_info); n->pos = OR; n->u.p[0] = dfa->parse_info->root; n->u.p[1] = top; @@ -970,7 +1159,17 @@ void dfa_delete (struct DFA **dfap) assert (*dfap); if ((*dfap)->parse_info) rm_dfa_parse (&(*dfap)->parse_info); - rm_DFA_states (&(*dfap)->state_info); + if ((*dfap)->state_info) + rm_DFA_states (&(*dfap)->state_info); ifree (*dfap); *dfap = NULL; } +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ +