X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=dfa%2Fdfa.c;h=4c7c864d072a888c166c9fa8579eee1eef8cd087;hb=c332d3fe0ad55f5dbf6705ece399eaf8a7e9ac96;hp=8d41a37ca220075960aaefe73faef9a40522d410;hpb=41194374db152313aa9a6c2d34053dc33aab180e;p=idzebra-moved-to-github.git diff --git a/dfa/dfa.c b/dfa/dfa.c index 8d41a37..4c7c864 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -4,7 +4,39 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: dfa.c,v $ - * Revision 1.1 1995-01-24 16:02:52 adam + * Revision 1.11 1996-01-08 19:15:24 adam + * Allow single $ in expressions. + * + * Revision 1.10 1996/01/08 09:09:17 adam + * Function dfa_parse got 'const' string argument. + * New functions to define char mappings made public. + * + * Revision 1.9 1995/12/06 12:24:58 adam + * Removed verbatim mode code. + * + * Revision 1.8 1995/12/06 09:09:58 adam + * Work on left and right anchors. + * + * Revision 1.7 1995/11/27 09:23:02 adam + * New berbatim hook in regular expressions. "[]n ..". + * + * Revision 1.6 1995/10/16 09:31:25 adam + * Bug fix. + * + * Revision 1.5 1995/10/02 15:17:58 adam + * Bug fix in dfa_delete. + * + * Revision 1.4 1995/09/28 09:18:52 adam + * Removed various preprocessor defines. + * + * Revision 1.3 1995/09/04 12:33:26 adam + * Various cleanup. YAZ util used instead. + * + * Revision 1.2 1995/01/25 11:30:50 adam + * Simple error reporting when parsing regular expressions. + * Memory usage reduced. + * + * Revision 1.1 1995/01/24 16:02:52 adam * New private header file in dfa module (dfap.h). * Module no longer uses yacc to parse regular expressions. * @@ -16,7 +48,7 @@ #include #include -#include +#include #include "dfap.h" #include "imalloc.h" @@ -45,19 +77,20 @@ struct Tblock { /* Tnode bucket (block) */ struct Tnode *tarray; /* array of nodes in bucket */ }; -#define TADD 256 +#define TADD 64 +#define STATE_HASH 199 +#define POSET_CHUNK 100 int debug_dfa_trav = 0; int debug_dfa_tran = 0; int debug_dfa_followpos = 0; int dfa_verbose = 0; -int yydebug = 0; static struct DFA_parse *parse_info = NULL; +static int err_code; static int inside_string; static const unsigned char *expr_ptr; -static unsigned short *ctrl_chars; static struct Tnode **posar; static SetType poset; @@ -75,7 +108,6 @@ static void add_follow (Set lastpos, Set firstpos), dfa_trav (struct Tnode *n), init_followpos (void), - mk_dfa_tran (struct DFA_states *dfas), pr_tran (struct DFA_states *dfas), pr_verbose (struct DFA_states *dfas), pr_followpos (void), @@ -116,11 +148,13 @@ static struct Tnode *expr_1 (void) { struct Tnode *t1, *t2, *tn; - t1 = expr_2 (); + if (!(t1 = expr_2 ())) + return t1; while (lookahead == L_ALT) { lex (); - t2 = expr_2 (); + if (!(t2 = expr_2 ())) + return t2; tn = mk_Tnode (); tn->pos = OR; @@ -134,7 +168,9 @@ static struct Tnode *expr_1 (void) static struct Tnode *expr_2 (void) { struct Tnode *t1, *t2, *tn; - t1 = expr_3 (); + + if (!(t1 = expr_3 ())) + return t1; while (lookahead == L_WILD || lookahead == L_ANYZ || lookahead == L_ANY || @@ -142,7 +178,8 @@ static struct Tnode *expr_2 (void) lookahead == L_CHAR || lookahead == L_CHARS) { - t2 = expr_3 (); + if (!(t2 = expr_3 ())) + return t2; tn = mk_Tnode (); tn->pos = CAT; @@ -157,7 +194,8 @@ static struct Tnode *expr_3 (void) { struct Tnode *t1, *tn; - t1 = expr_4 (); + if (!(t1 = expr_4 ())) + return t1; if (lookahead == L_CLOS0) { lex (); @@ -190,17 +228,21 @@ static struct Tnode *expr_3 (void) static struct Tnode *expr_4 (void) { struct Tnode *t1; + switch (lookahead) { case L_LP: lex (); - t1 = expr_1 (); + if (!(t1 = expr_1 ())) + return t1; if (lookahead == L_RP) lex (); + else + return NULL; break; case L_CHAR: t1 = mk_Tnode(); - t1->pos = ++(parse_info->position); + t1->pos = ++parse_info->position; t1->u.ch[1] = t1->u.ch[0] = look_ch; lex (); break; @@ -227,89 +269,83 @@ static struct Tnode *expr_4 (void) lex (); default: t1 = NULL; - assert (0); - lex (); } return t1; } -static int parse (dfap, s, cc) -struct DFA_parse *dfap; -char **s; -const unsigned short *cc; +static void do_parse (struct DFA_parse *dfap, const char **s, + struct Tnode **tnp) { - int i; - struct Tnode *t1, *t2; + int start_anchor_flag = 0; + struct Tnode *t1, *t2, *tn; - for (i=0; cc[i]; i +=2) - ; - ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars)); - for (i=0; (ctrl_chars[i] = cc[i]); i ++) - switch (cc[++i]) - { - case '(': - ctrl_chars[i] = L_LP; - break; - case ')': - ctrl_chars[i] = L_RP; - break; - case '.': - ctrl_chars[i] = L_ANY; - break; - case '|': - ctrl_chars[i] = L_ALT; - break; - case '?': - ctrl_chars[i] = L_QUEST; - break; - case '+': - ctrl_chars[i] = L_CLOS1; - break; - case '*': - ctrl_chars[i] = L_CLOS0; - break; - case '$': - ctrl_chars[i] = L_END; - break; - case '^': - ctrl_chars[i] = L_START; - break; - case '!': - ctrl_chars[i] = L_ANY; - break; - case '#': - ctrl_chars[i] = L_ANYZ; - break; - case '@': - ctrl_chars[i] = L_WILD; - break; - default: - ctrl_chars[i] = 0; - } parse_info = dfap; - expr_ptr = (unsigned char *) *s; + err_code = 0; + expr_ptr = (const unsigned char *) *s; inside_string = 0; lex (); - t1 = expr_1 (); - t2 = mk_Tnode(); - t2->pos = ++parse_info->position; - t2->u.ch[0] = -(++parse_info->rule); - - parse_info->top = mk_Tnode(); - parse_info->top->pos = CAT; - parse_info->top->u.p[0] = t1; - parse_info->top->u.p[1] = t2; - - *s = (char *) expr_ptr; - ifree (ctrl_chars); - return 0; + if (lookahead == L_START) + { + start_anchor_flag = 1; + lex (); + } + if (lookahead == L_END) + { + t1 = mk_Tnode (); + t1->pos = ++parse_info->position; + t1->u.ch[1] = t1->u.ch[0] = '\n'; + lex (); + } + else + { + t1 = expr_1 (); + if (t1 && lookahead == L_END) + { + t2 = mk_Tnode (); + t2->pos = ++parse_info->position; + t2->u.ch[1] = t2->u.ch[0] = '\n'; + + tn = mk_Tnode (); + tn->pos = CAT; + tn->u.p[0] = t1; + tn->u.p[1] = t2; + t1 = tn; + + lex (); + } + } + if (t1 && lookahead == 0) + { + t2 = mk_Tnode(); + t2->pos = ++parse_info->position; + t2->u.ch[0] = -(++parse_info->rule); + t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule); + + *tnp = mk_Tnode(); + (*tnp)->pos = CAT; + (*tnp)->u.p[0] = t1; + (*tnp)->u.p[1] = t2; + } + else + { + if (!err_code) + { + if (lookahead == L_RP) + err_code = DFA_ERR_RP; + else if (lookahead == L_LP) + err_code = DFA_ERR_LP; + else + err_code = DFA_ERR_SYNTAX; + } + } + *s = (const char *) expr_ptr; } static int nextchar (int *esc) { *esc = 0; - if (*expr_ptr == '\0' || isspace(*expr_ptr)) + if (*expr_ptr == '\0') return 0; else if (*expr_ptr != '\\') return *expr_ptr++; @@ -318,9 +354,11 @@ static int nextchar (int *esc) { case '\r': case '\n': - case '\t': case '\0': return '\\'; + case '\t': + ++expr_ptr; + return ' '; case 'n': ++expr_ptr; return '\n'; @@ -338,27 +376,37 @@ static int nextchar (int *esc) } } +static int nextchar_set (int *esc) +{ + if (*expr_ptr == ' ') + { + *esc = 0; + return *expr_ptr++; + } + return nextchar (esc); +} + static int read_charset (void) { int i, ch0, ch1, esc0, esc1, cc = 0; look_chars = mk_BSet (&parse_info->charset); res_BSet (parse_info->charset, look_chars); - ch0 = nextchar (&esc0); + ch0 = nextchar_set (&esc0); if (!esc0 && ch0 == '^') { cc = 1; - ch0 = nextchar (&esc0); + ch0 = nextchar_set (&esc0); } while (ch0 != 0) { if (!esc0 && ch0 == ']') break; add_BSet (parse_info->charset, look_chars, ch0); - ch1 = nextchar (&esc1); + ch1 = nextchar_set (&esc1); if (!esc1 && ch1 == '-') { - if ((ch1 = nextchar (&esc1)) == 0) + if ((ch1 = nextchar_set (&esc1)) == 0) break; if (!esc1 && ch1 == ']') { @@ -367,7 +415,7 @@ static int read_charset (void) } for (i=ch0; ++i<=ch1;) add_BSet (parse_info->charset, look_chars, i); - ch0 = nextchar (&esc0); + ch0 = nextchar_set (&esc0); } else { @@ -380,32 +428,9 @@ static int read_charset (void) return L_CHARS; } -unsigned short dfa_thompson_chars[] = -{ - '*', '*', - '+', '+', - '|', '|', - '^', '^', - '$', '$', - '?', '?', - '.', '.', - '(', '(', - ')', ')', - 0 -}; - -unsigned short dfa_ccl_chars[] = -{ - '#', '#', - '!', '!', - '?', '@', - 0 -}; - static int lex_sub(void) { int esc; - const unsigned short *cc; while ((look_ch = nextchar (&esc)) != 0) if (look_ch == '\"') { @@ -417,13 +442,16 @@ static int lex_sub(void) return L_CHAR; else if (look_ch == '[') return read_charset(); - else if (look_ch == ' ') - return 0; - else + else { - for (cc = ctrl_chars; *cc; cc += 2) + const int *cc; + for (cc = parse_info->charMap; *cc; cc += 2) if (*cc == look_ch) + { + if (!cc[1]) + --expr_ptr; return cc[1]; + } return L_CHAR; } return 0; @@ -650,7 +678,7 @@ static void dfa_trav (struct Tnode *n) n->lastpos = add_Set (poset, n->lastpos, n->pos); if (debug_dfa_trav) if (n->u.ch[0] < 0) - printf ("#%d", -n->u.ch[0]); + printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]); else if (n->u.ch[1] > n->u.ch[0]) { putchar ('['); @@ -684,7 +712,7 @@ static void init_followpos (void) static void mk_dfa_tran (struct DFA_states *dfas) { - int i, c; + int i, j, c; int max_char; short *pos, *pos_i; Set tran_set; @@ -702,40 +730,48 @@ static void mk_dfa_tran (struct DFA_states *dfas) while ((dfa_from = get_DFA_state (dfas))) { pos_i = pos; - i = 0; + j = i = 0; for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next) if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char) *pos_i++ = tran_set->value; - else if (c < 0 && (i == 0 || c > i)) - i = c; + else if (c < 0) + { + if (i == 0 || c > i) + i = c; + c = posar[tran_set->value]->u.ch[1]; + if (j == 0 || c > j) + j = c; + } *pos_i = -1; dfa_from->rule_no = -i; + dfa_from->rule_nno = -j; for (char_1 = 0; char_1 <= max_char; char_1++) { char_0 = max_char+1; for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) - if (posar[i]->u.ch[1] >= char_1) - if ((c=posar[i]->u.ch[0]) < char_0) - if (c < char_1) - char_0 = char_1; - else - char_0 = c; + if (posar[i]->u.ch[1] >= char_1 + && (c=posar[i]->u.ch[0]) < char_0) + if (c < char_1) + char_0 = char_1; + else + char_0 = c; - char_1 = max_char; if (char_0 > max_char) break; + + char_1 = max_char; + tran_set = mk_Set (poset); for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) - if ((c=posar[i]->u.ch[1]) >= char_0) - if (posar[i]->u.ch[0] <= char_0) - { - if (c < char_1) - char_1 = c; - tran_set = union_Set (poset, tran_set, followpos[i]); - } - else if (c <= char_1) - char_1 = c-1; + { + if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1) + char_1 = c - 1; /* forward chunk */ + else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1) + char_1 = c; /* backward chunk */ + if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0) + tran_set = union_Set (poset, tran_set, followpos[i]); + } if (tran_set) { add_DFA_state (dfas, &tran_set, &dfa_to); @@ -760,6 +796,9 @@ static void pr_tran (struct DFA_states *dfas) printf ("DFA state %d", no); if (s->rule_no) printf ("#%d", s->rule_no); + if (s->rule_nno) + printf (" #%d", s->rule_nno); + putchar (':'); pr_Set (poset, s->set); prev_no = -1; @@ -823,6 +862,106 @@ static void pr_followpos (void) putchar ('\n'); } +void dfa_parse_cmap_clean (struct DFA *d) +{ + struct DFA_parse *dfa = d->parse_info; + + assert (dfa); + if (!dfa->charMap) + { + dfa->charMapSize = 7; + dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap)); + } + dfa->charMap[0] = 0; +} + +void dfa_parse_cmap_new (struct DFA *d, const int *cmap) +{ + struct DFA_parse *dfa = d->parse_info; + const int *cp; + int size; + + assert (dfa); + for (cp = cmap; *cp; cp += 2) + ; + size = cp - cmap + 1; + if (size > dfa->charMapSize) + { + if (dfa->charMap) + ifree (dfa->charMap); + dfa->charMapSize = size; + dfa->charMap = imalloc (size * sizeof(*dfa->charMap)); + } + memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap)); +} + +void dfa_parse_cmap_del (struct DFA *d, int from) +{ + struct DFA_parse *dfa = d->parse_info; + int *cc; + + assert (dfa); + for (cc = dfa->charMap; *cc; cc += 2) + if (*cc == from) + { + while ((cc[0] = cc[2])) + { + cc[1] = cc[3]; + cc += 2; + } + break; + } +} + +void dfa_parse_cmap_add (struct DFA *d, int from, int to) +{ + struct DFA_parse *dfa = d->parse_info; + int *cc; + int indx, size; + + assert (dfa); + for (cc = dfa->charMap; *cc; cc += 2) + if (*cc == from) + { + cc[1] = to; + return ; + } + indx = cc - dfa->charMap; + size = dfa->charMapSize; + if (indx >= size) + { + int *cn = imalloc ((size+16) * sizeof(*dfa->charMap)); + memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap)); + ifree (dfa->charMap); + dfa->charMap = cn; + dfa->charMapSize = size+16; + } + dfa->charMap[indx] = from; + dfa->charMap[indx+1] = to; + dfa->charMap[indx+2] = 0; +} + +void dfa_parse_cmap_thompson (struct DFA *d) +{ + static int thompson_chars[] = + { + '*', L_CLOS0, + '+', L_CLOS1, + '|', L_ALT, + '^', L_START, + '$', L_END, + '?', L_QUEST, + '.', L_ANY, + '(', L_LP, + ')', L_RP, + ' ', 0, + '\t', 0, + '\n', 0, + 0 + }; + dfa_parse_cmap_new (d, thompson_chars); +} + static struct DFA_parse *dfa_parse_init (void) { parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse)); @@ -837,6 +976,8 @@ static struct DFA_parse *dfa_parse_init (void) add_BSet (parse_info->charset, parse_info->anyset, '\n'); com_BSet (parse_info->charset, parse_info->anyset); parse_info->use_Tnode = parse_info->max_Tnode = 0; + parse_info->charMap = NULL; + parse_info->charMapSize = 0; return parse_info; } @@ -846,6 +987,7 @@ static void rm_dfa_parse (struct DFA_parse **dfap) assert (parse_info); term_Tnode(); rm_BSetHandle (&parse_info->charset); + ifree (parse_info->charMap); ifree (parse_info); *dfap = NULL; } @@ -861,11 +1003,12 @@ static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk) poset = mk_SetType (poset_chunk); init_pos(); init_followpos(); + assert (parse_info->root); dfa_trav (parse_info->root); if (debug_dfa_followpos) pr_followpos(); - init_DFA_states (&dfas, poset, 401); + init_DFA_states (&dfas, poset, STATE_HASH); mk_dfa_tran (dfas); if (debug_dfa_tran) pr_tran (dfas); @@ -885,19 +1028,21 @@ struct DFA *dfa_init (void) dfa->parse_info = dfa_parse_init (); dfa->state_info = NULL; dfa->states = NULL; + dfa_parse_cmap_thompson (dfa); return dfa; } -int dfa_parse (struct DFA *dfa, char **pattern) +int dfa_parse (struct DFA *dfa, const char **pattern) { - int i; + struct Tnode *top; assert (dfa); - i = parse (dfa->parse_info, pattern, dfa_thompson_chars); - if (i) - return i; + assert (dfa->parse_info); + do_parse (dfa->parse_info, pattern, &top); + if (err_code) + return err_code; if ( !dfa->parse_info->root ) - dfa->parse_info->root = dfa->parse_info->top; + dfa->parse_info->root = top; else { struct Tnode *n; @@ -905,7 +1050,7 @@ int dfa_parse (struct DFA *dfa, char **pattern) n = mk_Tnode (); n->pos = OR; n->u.p[0] = dfa->parse_info->root; - n->u.p[1] = dfa->parse_info->top; + n->u.p[1] = top; dfa->parse_info->root = n; } return 0; @@ -915,7 +1060,7 @@ void dfa_mkstate (struct DFA *dfa) { assert (dfa); - dfa->state_info = mk_dfas (dfa->parse_info, 200); + dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK); dfa->no_states = dfa->state_info->no; dfa->states = dfa->state_info->sortarray; rm_dfa_parse (&dfa->parse_info); @@ -927,7 +1072,8 @@ void dfa_delete (struct DFA **dfap) assert (*dfap); if ((*dfap)->parse_info) rm_dfa_parse (&(*dfap)->parse_info); - rm_DFA_states (&(*dfap)->state_info); + if ((*dfap)->state_info) + rm_DFA_states (&(*dfap)->state_info); ifree (*dfap); *dfap = NULL; }