X-Git-Url: http://git.indexdata.com/?p=idzebra-moved-to-github.git;a=blobdiff_plain;f=dfa%2Fdfa.c;h=8ea7fc1f86b668f9348757b6ab8ac4de34dfd0f5;hp=340cffc0ed5db3e76e42b23d5c5a78f066d66582;hb=dcda88860b03641b6900d43135ca769f005105e8;hpb=58e3c5132f9fe86fefbf2e130275ab9980eeed1e diff --git a/dfa/dfa.c b/dfa/dfa.c index 340cffc..8ea7fc1 100644 --- a/dfa/dfa.c +++ b/dfa/dfa.c @@ -1,8 +1,5 @@ -/* $Id: dfa.c,v 1.29 2003-06-18 11:46:33 adam Exp $ - Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 - Index Data Aps - -This file is part of the Zebra server. +/* This file is part of the Zebra server. + Copyright (C) Index Data Zebra is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free @@ -15,12 +12,15 @@ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Zebra; see the file LICENSE.zebra. If not, write to the -Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA -02111-1307, USA. +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#if HAVE_CONFIG_H +#include +#endif #include #include @@ -28,12 +28,10 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include -#include +#include #include "dfap.h" #include "imalloc.h" -#define DFA_OPEN_RANGE 1 - #define CAT 16000 #define OR 16001 #define STAR 16002 @@ -50,8 +48,8 @@ struct Tnode { } u; unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */ unsigned nullable : 1; - Set firstpos; /* first positions */ - Set lastpos; /* last positions */ + DFASet firstpos; /* first positions */ + DFASet lastpos; /* last positions */ }; struct Tblock { /* Tnode bucket (block) */ @@ -73,12 +71,12 @@ static struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info, BSet charset); static void term_Tnode (struct DFA_parse *parse_info); -static void - del_followpos (struct DFA_parse *parse_info), - init_pos (struct DFA_parse *parse_info), +static void + del_followpos (struct DFA_parse *parse_info), + init_pos (struct DFA_parse *parse_info), del_pos (struct DFA_parse *parse_info), mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas), - add_follow (struct DFA_parse *parse_info, Set lastpos, Set firstpos), + add_follow (struct DFA_parse *parse_info, DFASet lastpos, DFASet firstpos), dfa_trav (struct DFA_parse *parse_info, struct Tnode *n), init_followpos (struct DFA_parse *parse_info), pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas), @@ -91,7 +89,7 @@ static int nextchar (struct DFA_parse *parse_info, int *esc), read_charset (struct DFA_parse *parse_info); -static const char +static const char *str_char (unsigned c); #define L_LP 1 @@ -115,7 +113,7 @@ static struct Tnode *expr_1 (struct DFA_parse *parse_info), *expr_4 (struct DFA_parse *parse_info); static struct Tnode *expr_1 (struct DFA_parse *parse_info) -{ +{ struct Tnode *t1, *t2, *tn; if (!(t1 = expr_2 (parse_info))) @@ -125,7 +123,7 @@ static struct Tnode *expr_1 (struct DFA_parse *parse_info) lex (parse_info); if (!(t2 = expr_2 (parse_info))) return t2; - + tn = mk_Tnode (parse_info); tn->pos = OR; tn->u.p[0] = t1; @@ -138,7 +136,7 @@ static struct Tnode *expr_1 (struct DFA_parse *parse_info) static struct Tnode *expr_2 (struct DFA_parse *parse_info) { struct Tnode *t1, *t2, *tn; - + if (!(t1 = expr_3 (parse_info))) return t1; while (parse_info->lookahead == L_WILD || @@ -150,7 +148,7 @@ static struct Tnode *expr_2 (struct DFA_parse *parse_info) { if (!(t2 = expr_3 (parse_info))) return t2; - + tn = mk_Tnode (parse_info); tn->pos = CAT; tn->u.p[0] = t1; @@ -274,13 +272,13 @@ static void do_parse (struct DFA_parse *parse_info, const char **s, t2 = mk_Tnode (parse_info); t2->pos = ++parse_info->position; t2->u.ch[1] = t2->u.ch[0] = '\n'; - + tn = mk_Tnode (parse_info); tn->pos = CAT; tn->u.p[0] = t1; tn->u.p[1] = t2; t1 = tn; - + lex (parse_info); } } @@ -290,7 +288,7 @@ static void do_parse (struct DFA_parse *parse_info, const char **s, t2->pos = ++parse_info->position; t2->u.ch[0] = -(++parse_info->rule); t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule); - + *tnp = mk_Tnode(parse_info); (*tnp)->pos = CAT; (*tnp)->u.p[0] = t1; @@ -357,7 +355,7 @@ static int nextchar_set (struct DFA_parse *parse_info, int *esc) static int read_charset (struct DFA_parse *parse_info) { - int i, ch0, ch1, esc0, esc1, cc = 0; + int i, ch0, esc0, cc = 0; parse_info->look_chars = mk_BSet (&parse_info->charset); res_BSet (parse_info->charset, parse_info->look_chars); @@ -367,8 +365,13 @@ static int read_charset (struct DFA_parse *parse_info) cc = 1; ch0 = nextchar_set (parse_info, &esc0); } + /** + ch0 is last met character + ch1 is "next" char + */ while (ch0 != 0) { + int ch1, esc1; if (!esc0 && ch0 == ']') break; if (!esc0 && ch0 == '-') @@ -380,16 +383,23 @@ static int read_charset (struct DFA_parse *parse_info) } else { - if (parse_info->cmap) - { - const char **mapto; - char mapfrom[2]; - const char *mcp = mapfrom; - mapfrom[0] = ch0; - mapto = (*parse_info->cmap)(parse_info->cmap_data, &mcp, 1); - assert (mapto); - ch0 = mapto[0][0]; - } + if (ch0 == 1) + { + ch0 = nextchar(parse_info, &esc0); + } + else + { + if (parse_info->cmap) + { + const char **mapto; + char mapfrom[2]; + const char *mcp = mapfrom; + mapfrom[0] = ch0; + mapto = parse_info->cmap(parse_info->cmap_data, &mcp, 1); + assert (mapto); + ch0 = mapto[0][0]; + } + } add_BSet (parse_info->charset, parse_info->look_chars, ch0); ch1 = nextchar_set (parse_info, &esc1); } @@ -398,20 +408,16 @@ static int read_charset (struct DFA_parse *parse_info) int open_range = 0; if ((ch1 = nextchar_set (parse_info, &esc1)) == 0) break; -#if DFA_OPEN_RANGE if (!esc1 && ch1 == ']') { ch1 = 255; open_range = 1; } -#else - if (!esc1 && ch1 == ']') + else if (ch1 == 1) { - add_BSet (parse_info->charset, parse_info->look_chars, '-'); - break; + ch1 = nextchar(parse_info, &esc1); } -#endif - if (!open_range && parse_info->cmap) + else if (parse_info->cmap) { const char **mapto; char mapfrom[2]; @@ -421,12 +427,12 @@ static int read_charset (struct DFA_parse *parse_info) assert (mapto); ch1 = mapto[0][0]; } - for (i=ch0; ++i<=ch1;) + for (i = ch0; ++i <= ch1;) add_BSet (parse_info->charset, parse_info->look_chars, i); - if (!open_range) - ch0 = nextchar_set (parse_info, &esc0); - else + + if (open_range) break; + ch0 = nextchar_set (parse_info, &esc0); } else { @@ -456,10 +462,10 @@ static int map_l_char (struct DFA_parse *parse_info) mapto = (*parse_info->cmap) (parse_info->cmap_data, &cp0, len); assert (mapto); - + parse_info->expr_ptr = (const unsigned char *) cp0; parse_info->look_ch = ((unsigned char **) mapto)[i][0]; - logf (LOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch); + yaz_log (YLOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch); return L_CHAR; } @@ -477,7 +483,7 @@ static int lex_sub(struct DFA_parse *parse_info) return map_l_char (parse_info); else if (parse_info->look_ch == '[') return read_charset(parse_info); - else + else { const int *cc; for (cc = parse_info->charMap; *cc; cc += 2) @@ -618,7 +624,7 @@ static void del_followpos (struct DFA_parse *parse_info) static void init_pos (struct DFA_parse *parse_info) { - parse_info->posar = (struct Tnode **) imalloc (sizeof(struct Tnode*) + parse_info->posar = (struct Tnode **) imalloc (sizeof(struct Tnode*) * (1+parse_info->position)); } @@ -628,43 +634,43 @@ static void del_pos (struct DFA_parse *parse_info) } static void add_follow (struct DFA_parse *parse_info, - Set lastpos, Set firstpos) + DFASet lastpos, DFASet firstpos) { while (lastpos) { - parse_info->followpos[lastpos->value] = - union_Set (parse_info->poset, + parse_info->followpos[lastpos->value] = + union_DFASet (parse_info->poset, parse_info->followpos[lastpos->value], firstpos); lastpos = lastpos->next; - } + } } static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n) { struct Tnode **posar = parse_info->posar; - SetType poset = parse_info->poset; - + DFASetType poset = parse_info->poset; + switch (n->pos) { case CAT: dfa_trav (parse_info, n->u.p[0]); dfa_trav (parse_info, n->u.p[1]); n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable; - n->firstpos = mk_Set (poset); - n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos); + n->firstpos = mk_DFASet (poset); + n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[0]->firstpos); if (n->u.p[0]->nullable) - n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos); - n->lastpos = mk_Set (poset); - n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos); + n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[1]->firstpos); + n->lastpos = mk_DFASet (poset); + n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[1]->lastpos); if (n->u.p[1]->nullable) - n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos); + n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[0]->lastpos); add_follow (parse_info, n->u.p[0]->lastpos, n->u.p[1]->firstpos); - n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos); - n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos); - n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos); - n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos); + n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos); + n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos); + n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos); + n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos); if (debug_dfa_trav) printf ("CAT"); break; @@ -673,14 +679,14 @@ static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n) dfa_trav (parse_info, n->u.p[1]); n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable; - n->firstpos = merge_Set (poset, n->u.p[0]->firstpos, + n->firstpos = merge_DFASet (poset, n->u.p[0]->firstpos, n->u.p[1]->firstpos); - n->lastpos = merge_Set (poset, n->u.p[0]->lastpos, + n->lastpos = merge_DFASet (poset, n->u.p[0]->lastpos, n->u.p[1]->lastpos); - n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos); - n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos); - n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos); - n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos); + n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos); + n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos); + n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos); + n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos); if (debug_dfa_trav) printf ("OR"); break; @@ -704,18 +710,18 @@ static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n) break; case EPSILON: n->nullable = 1; - n->lastpos = mk_Set (poset); - n->firstpos = mk_Set (poset); + n->lastpos = mk_DFASet (poset); + n->firstpos = mk_DFASet (poset); if (debug_dfa_trav) printf ("EPSILON"); break; default: posar[n->pos] = n; n->nullable = 0; - n->firstpos = mk_Set (poset); - n->firstpos = add_Set (poset, n->firstpos, n->pos); - n->lastpos = mk_Set (poset); - n->lastpos = add_Set (poset, n->lastpos, n->pos); + n->firstpos = mk_DFASet (poset); + n->firstpos = add_DFASet (poset, n->firstpos, n->pos); + n->lastpos = mk_DFASet (poset); + n->lastpos = add_DFASet (poset, n->lastpos, n->pos); if (debug_dfa_trav) { if (n->u.ch[0] < 0) @@ -737,20 +743,20 @@ static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n) { printf ("\n nullable : %c\n", n->nullable ? '1' : '0'); printf (" firstpos :"); - pr_Set (poset, n->firstpos); + pr_DFASet (poset, n->firstpos); printf (" lastpos :"); - pr_Set (poset, n->lastpos); + pr_DFASet (poset, n->lastpos); } } static void init_followpos (struct DFA_parse *parse_info) { - Set *fa; + DFASet *fa; int i; parse_info->followpos = fa = - (Set *) imalloc (sizeof(Set) * (1+parse_info->position)); + (DFASet *) imalloc (sizeof(DFASet) * (1+parse_info->position)); for (i = parse_info->position+1; --i >= 0; fa++) - *fa = mk_Set (parse_info->poset); + *fa = mk_DFASet (parse_info->poset); } static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) @@ -758,12 +764,12 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) int i, j, c; int max_char; short *pos, *pos_i; - Set tran_set; + DFASet tran_set; int char_0, char_1; struct DFA_state *dfa_from, *dfa_to; struct Tnode **posar; - SetType poset; - Set *followpos; + DFASetType poset; + DFASet *followpos; assert (parse_info); @@ -771,7 +777,7 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) max_char = parse_info->charset->size; pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1)); - tran_set = cp_Set (parse_info->poset, parse_info->root->firstpos); + tran_set = cp_DFASet (parse_info->poset, parse_info->root->firstpos); i = add_DFA_state (dfas, &tran_set, &dfa_from); assert (i); dfa_from->rule_no = 0; @@ -782,7 +788,7 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) pos_i = pos; j = i = 0; for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next) - if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char) + if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char) *pos_i++ = tran_set->value; else if (c < 0) { @@ -800,7 +806,7 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) { char_0 = max_char+1; for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) - if (posar[i]->u.ch[1] >= char_1 + if (posar[i]->u.ch[1] >= char_1 && (c=posar[i]->u.ch[0]) < char_0) { if (c < char_1) @@ -813,8 +819,8 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) break; char_1 = max_char; - - tran_set = mk_Set (poset); + + tran_set = mk_DFASet (poset); for (pos_i = pos; (i = *pos_i) != -1; ++pos_i) { if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1) @@ -822,7 +828,7 @@ static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1) char_1 = c; /* backward chunk */ if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0) - tran_set = union_Set (poset, tran_set, followpos[i]); + tran_set = union_DFASet (poset, tran_set, followpos[i]); } if (tran_set) { @@ -852,7 +858,7 @@ static void pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas) printf (" #%d", s->rule_nno); putchar (':'); - pr_Set (parse_info->poset, s->set); + pr_DFASet (parse_info->poset, s->set); prev_no = -1; for (i=s->tran_no, tran=s->trans; --i >= 0; tran++) { @@ -876,12 +882,12 @@ static void pr_verbose (struct DFA_parse *parse_info, struct DFA_states *dfas) { long i, j; int k; - printf ("%d/%d tree nodes used, %d bytes each\n", - parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode)); + printf ("%d/%d tree nodes used, %ld bytes each\n", + parse_info->use_Tnode, parse_info->max_Tnode, (long) sizeof (struct Tnode)); k = inf_BSetHandle (parse_info->charset, &i, &j); - printf ("%ld/%ld character sets, %d bytes each\n", - i/k, j/k, k*sizeof(BSetWord)); - k = inf_SetType (parse_info->poset, &i, &j); + printf ("%ld/%ld character sets, %ld bytes each\n", + i/k, j/k, (long) k*sizeof(BSetWord)); + k = inf_DFASetType (parse_info->poset, &i, &j); printf ("%ld/%ld poset items, %d bytes each\n", i, j, k); printf ("%d DFA states\n", dfas->no); } @@ -894,7 +900,7 @@ static void pr_followpos (struct DFA_parse *parse_info) for (i=1; i <= parse_info->position; i++) { printf ("%3d:", i); - pr_Set (parse_info->poset, parse_info->followpos[i]); + pr_DFASet (parse_info->poset, parse_info->followpos[i]); putchar ('\t'); if (posar[i]->u.ch[0] < 0) @@ -960,7 +966,7 @@ void dfa_parse_cmap_del (struct DFA *d, int from) { while ((cc[0] = cc[2])) { - cc[1] = cc[3]; + cc[1] = cc[3]; cc += 2; } break; @@ -1018,7 +1024,7 @@ void dfa_parse_cmap_thompson (struct DFA *d) static struct DFA_parse *dfa_parse_init (void) { - struct DFA_parse *parse_info = + struct DFA_parse *parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse)); parse_info->charset = mk_BSetHandle (255, 20); @@ -1026,9 +1032,12 @@ static struct DFA_parse *dfa_parse_init (void) parse_info->rule = 0; parse_info->root = NULL; + /* initialize the anyset which by default does not include \n */ parse_info->anyset = mk_BSet (&parse_info->charset); res_BSet (parse_info->charset, parse_info->anyset); + add_BSet (parse_info->charset, parse_info->anyset, '\n'); com_BSet (parse_info->charset, parse_info->anyset); + parse_info->use_Tnode = parse_info->max_Tnode = 0; parse_info->start = parse_info->end = NULL; parse_info->charMap = NULL; @@ -1056,7 +1065,7 @@ static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk) assert (poset_chunk > 10); assert (dfap); - parse_info->poset = mk_SetType (poset_chunk); + parse_info->poset = mk_DFASetType (poset_chunk); init_pos(parse_info); init_followpos(parse_info); assert (parse_info->root); @@ -1068,14 +1077,13 @@ static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk) mk_dfa_tran (parse_info, dfas); if (debug_dfa_tran) { - printf ("PR_TRAN\n"); pr_tran (parse_info, dfas); } if (dfa_verbose) pr_verbose (parse_info, dfas); del_pos(parse_info); del_followpos(parse_info); - rm_SetType (parse_info->poset); + rm_DFASetType (parse_info->poset); return dfas; } @@ -1091,6 +1099,11 @@ struct DFA *dfa_init (void) return dfa; } +void dfa_anyset_includes_nl(struct DFA *dfa) +{ + add_BSet (dfa->parse_info->charset, dfa->parse_info->anyset, '\n'); +} + void dfa_set_cmap (struct DFA *dfa, void *vp, const char **(*cmap)(void *vp, const char **from, int len)) { @@ -1098,6 +1111,11 @@ void dfa_set_cmap (struct DFA *dfa, void *vp, dfa->parse_info->cmap_data = vp; } +int dfa_get_last_rule (struct DFA *dfa) +{ + return dfa->parse_info->rule; +} + int dfa_parse (struct DFA *dfa, const char **pattern) { struct Tnode *top; @@ -1107,12 +1125,6 @@ int dfa_parse (struct DFA *dfa, const char **pattern) assert (dfa->parse_info); parse_info = dfa->parse_info; - if (!parse_info->cmap) - { - res_BSet (parse_info->charset, parse_info->anyset); - add_BSet (parse_info->charset, parse_info->anyset, '\n'); - com_BSet (parse_info->charset, parse_info->anyset); - } do_parse (parse_info, pattern, &top); if (parse_info->err_code) return parse_info->err_code; @@ -1152,3 +1164,12 @@ void dfa_delete (struct DFA **dfap) ifree (*dfap); *dfap = NULL; } +/* + * Local variables: + * c-basic-offset: 4 + * c-file-style: "Stroustrup" + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ +