-/*
- * Copyright (C) 1994-1999, Index Data
- * All rights reserved.
- * Sebastian Hammer, Adam Dickmeiss
- *
- * $Log: dfa.c,v $
- * Revision 1.26 1999-05-26 07:49:12 adam
- * C++ compilation.
- *
- * Revision 1.25 1999/02/02 14:50:05 adam
- * Updated WIN32 code specific sections. Changed header.
- *
- * Revision 1.24 1998/10/28 10:48:55 adam
- * Added type cast to prevent warning.
- *
- * Revision 1.23 1998/09/02 14:15:28 adam
- * Zebra uses GNU Configure.
- *
- * Revision 1.22 1998/06/24 12:16:10 adam
- * Support for relations on text operands. Open range support in
- * DFA module (i.e. [-j], [g-]).
- *
- * Revision 1.21 1998/06/22 11:33:39 adam
- * Added two type casts.
- *
- * Revision 1.20 1998/06/08 14:40:44 adam
- * Fixed problem with signed character(s) in regular expressions.
- *
- * Revision 1.19 1998/01/12 14:39:39 adam
- * Fixed bug in term_Tnode.
- *
- * Revision 1.18 1997/09/29 09:05:17 adam
- * Thread safe DFA module. We simply had to put a few static vars to
- * the DFA_parse structure.
- *
- * Revision 1.17 1997/09/18 08:59:17 adam
- * Extra generic handle for the character mapping routines.
- *
- * Revision 1.16 1997/09/05 15:29:57 adam
- * Changed prototype for chr_map_input - added const.
- * Added support for C++, headers uses extern "C" for public definitions.
- *
- * Revision 1.15 1997/02/10 10:19:20 adam
- * Added facility for open character sets, eg [a-].
- *
- * Revision 1.14 1996/10/29 13:57:22 adam
- * Include of zebrautl.h instead of alexutil.h.
- *
- * Revision 1.13 1996/06/17 14:24:08 adam
- * Bug fix: read_charset didn't handle character mapping.
- *
- * Revision 1.12 1996/06/04 10:20:02 adam
- * Added support for character mapping.
- *
- * Revision 1.11 1996/01/08 19:15:24 adam
- * Allow single $ in expressions.
- *
- * Revision 1.10 1996/01/08 09:09:17 adam
- * Function dfa_parse got 'const' string argument.
- * New functions to define char mappings made public.
- *
- * Revision 1.9 1995/12/06 12:24:58 adam
- * Removed verbatim mode code.
- *
- * Revision 1.8 1995/12/06 09:09:58 adam
- * Work on left and right anchors.
- *
- * Revision 1.7 1995/11/27 09:23:02 adam
- * New berbatim hook in regular expressions. "[]n ..".
- *
- * Revision 1.6 1995/10/16 09:31:25 adam
- * Bug fix.
- *
- * Revision 1.5 1995/10/02 15:17:58 adam
- * Bug fix in dfa_delete.
- *
- * Revision 1.4 1995/09/28 09:18:52 adam
- * Removed various preprocessor defines.
- *
- * Revision 1.3 1995/09/04 12:33:26 adam
- * Various cleanup. YAZ util used instead.
- *
- * Revision 1.2 1995/01/25 11:30:50 adam
- * Simple error reporting when parsing regular expressions.
- * Memory usage reduced.
- *
- * Revision 1.1 1995/01/24 16:02:52 adam
- * New private header file in dfa module (dfap.h).
- * Module no longer uses yacc to parse regular expressions.
- *
- */
+/* $Id: dfa.c,v 1.40 2007-01-15 15:10:15 adam Exp $
+ Copyright (C) 1995-2007
+ Index Data ApS
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*/
+
+
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
-#include <zebrautl.h>
+#include <idzebra/util.h>
#include "dfap.h"
#include "imalloc.h"
-#define DFA_OPEN_RANGE 1
-
#define CAT 16000
#define OR 16001
#define STAR 16002
} u;
unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
unsigned nullable : 1;
- Set firstpos; /* first positions */
- Set lastpos; /* last positions */
+ DFASet firstpos; /* first positions */
+ DFASet lastpos; /* last positions */
};
struct Tblock { /* Tnode bucket (block) */
init_pos (struct DFA_parse *parse_info),
del_pos (struct DFA_parse *parse_info),
mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas),
- add_follow (struct DFA_parse *parse_info, Set lastpos, Set firstpos),
+ add_follow (struct DFA_parse *parse_info, DFASet lastpos, DFASet firstpos),
dfa_trav (struct DFA_parse *parse_info, struct Tnode *n),
init_followpos (struct DFA_parse *parse_info),
pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas),
static int read_charset (struct DFA_parse *parse_info)
{
- int i, ch0, ch1, esc0, esc1, cc = 0;
+ int i, ch0, esc0, cc = 0;
parse_info->look_chars = mk_BSet (&parse_info->charset);
res_BSet (parse_info->charset, parse_info->look_chars);
cc = 1;
ch0 = nextchar_set (parse_info, &esc0);
}
+ /**
+ ch0 is last met character
+ ch1 is "next" char
+ */
while (ch0 != 0)
{
+ int ch1, esc1;
if (!esc0 && ch0 == ']')
break;
if (!esc0 && ch0 == '-')
}
else
{
- if (parse_info->cmap)
- {
- const char **mapto;
- char mapfrom[2];
- const char *mcp = mapfrom;
- mapfrom[0] = ch0;
- mapto = (*parse_info->cmap)(parse_info->cmap_data, &mcp, 1);
- assert (mapto);
- ch0 = mapto[0][0];
- }
+ if (ch0 == 1)
+ {
+ ch0 = nextchar(parse_info, &esc0);
+ }
+ else
+ {
+ if (parse_info->cmap)
+ {
+ const char **mapto;
+ char mapfrom[2];
+ const char *mcp = mapfrom;
+ mapfrom[0] = ch0;
+ mapto = parse_info->cmap(parse_info->cmap_data, &mcp, 1);
+ assert (mapto);
+ ch0 = mapto[0][0];
+ }
+ }
add_BSet (parse_info->charset, parse_info->look_chars, ch0);
ch1 = nextchar_set (parse_info, &esc1);
}
int open_range = 0;
if ((ch1 = nextchar_set (parse_info, &esc1)) == 0)
break;
-#if DFA_OPEN_RANGE
if (!esc1 && ch1 == ']')
{
ch1 = 255;
open_range = 1;
}
-#else
- if (!esc1 && ch1 == ']')
+ else if (ch1 == 1)
{
- add_BSet (parse_info->charset, parse_info->look_chars, '-');
- break;
+ ch1 = nextchar(parse_info, &esc1);
}
-#endif
- if (!open_range && parse_info->cmap)
+ else if (parse_info->cmap)
{
const char **mapto;
char mapfrom[2];
assert (mapto);
ch1 = mapto[0][0];
}
- for (i=ch0; ++i<=ch1;)
+ for (i = ch0; ++i <= ch1;)
add_BSet (parse_info->charset, parse_info->look_chars, i);
- if (!open_range)
- ch0 = nextchar_set (parse_info, &esc0);
- else
+
+ if (open_range)
break;
+ ch0 = nextchar_set (parse_info, &esc0);
}
else
{
parse_info->expr_ptr = (const unsigned char *) cp0;
parse_info->look_ch = ((unsigned char **) mapto)[i][0];
- logf (LOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch);
+ yaz_log (YLOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch);
return L_CHAR;
}
}
static void add_follow (struct DFA_parse *parse_info,
- Set lastpos, Set firstpos)
+ DFASet lastpos, DFASet firstpos)
{
while (lastpos)
{
parse_info->followpos[lastpos->value] =
- union_Set (parse_info->poset,
+ union_DFASet (parse_info->poset,
parse_info->followpos[lastpos->value], firstpos);
lastpos = lastpos->next;
}
static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n)
{
struct Tnode **posar = parse_info->posar;
- SetType poset = parse_info->poset;
+ DFASetType poset = parse_info->poset;
switch (n->pos)
{
dfa_trav (parse_info, n->u.p[0]);
dfa_trav (parse_info, n->u.p[1]);
n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
- n->firstpos = mk_Set (poset);
- n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
+ n->firstpos = mk_DFASet (poset);
+ n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[0]->firstpos);
if (n->u.p[0]->nullable)
- n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
- n->lastpos = mk_Set (poset);
- n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
+ n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[1]->firstpos);
+ n->lastpos = mk_DFASet (poset);
+ n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[1]->lastpos);
if (n->u.p[1]->nullable)
- n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
+ n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[0]->lastpos);
add_follow (parse_info, n->u.p[0]->lastpos, n->u.p[1]->firstpos);
- n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
- n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
- n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
- n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
+ n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos);
+ n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos);
+ n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos);
+ n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos);
if (debug_dfa_trav)
printf ("CAT");
break;
dfa_trav (parse_info, n->u.p[1]);
n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
- n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
+ n->firstpos = merge_DFASet (poset, n->u.p[0]->firstpos,
n->u.p[1]->firstpos);
- n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
+ n->lastpos = merge_DFASet (poset, n->u.p[0]->lastpos,
n->u.p[1]->lastpos);
- n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
- n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
- n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
- n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
+ n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos);
+ n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos);
+ n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos);
+ n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos);
if (debug_dfa_trav)
printf ("OR");
break;
break;
case EPSILON:
n->nullable = 1;
- n->lastpos = mk_Set (poset);
- n->firstpos = mk_Set (poset);
+ n->lastpos = mk_DFASet (poset);
+ n->firstpos = mk_DFASet (poset);
if (debug_dfa_trav)
printf ("EPSILON");
break;
default:
posar[n->pos] = n;
n->nullable = 0;
- n->firstpos = mk_Set (poset);
- n->firstpos = add_Set (poset, n->firstpos, n->pos);
- n->lastpos = mk_Set (poset);
- n->lastpos = add_Set (poset, n->lastpos, n->pos);
+ n->firstpos = mk_DFASet (poset);
+ n->firstpos = add_DFASet (poset, n->firstpos, n->pos);
+ n->lastpos = mk_DFASet (poset);
+ n->lastpos = add_DFASet (poset, n->lastpos, n->pos);
if (debug_dfa_trav)
{
if (n->u.ch[0] < 0)
{
printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
printf (" firstpos :");
- pr_Set (poset, n->firstpos);
+ pr_DFASet (poset, n->firstpos);
printf (" lastpos :");
- pr_Set (poset, n->lastpos);
+ pr_DFASet (poset, n->lastpos);
}
}
static void init_followpos (struct DFA_parse *parse_info)
{
- Set *fa;
+ DFASet *fa;
int i;
parse_info->followpos = fa =
- (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
+ (DFASet *) imalloc (sizeof(DFASet) * (1+parse_info->position));
for (i = parse_info->position+1; --i >= 0; fa++)
- *fa = mk_Set (parse_info->poset);
+ *fa = mk_DFASet (parse_info->poset);
}
static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas)
int i, j, c;
int max_char;
short *pos, *pos_i;
- Set tran_set;
+ DFASet tran_set;
int char_0, char_1;
struct DFA_state *dfa_from, *dfa_to;
struct Tnode **posar;
- SetType poset;
- Set *followpos;
+ DFASetType poset;
+ DFASet *followpos;
assert (parse_info);
max_char = parse_info->charset->size;
pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
- tran_set = cp_Set (parse_info->poset, parse_info->root->firstpos);
+ tran_set = cp_DFASet (parse_info->poset, parse_info->root->firstpos);
i = add_DFA_state (dfas, &tran_set, &dfa_from);
assert (i);
dfa_from->rule_no = 0;
char_1 = max_char;
- tran_set = mk_Set (poset);
+ tran_set = mk_DFASet (poset);
for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
{
if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
char_1 = c; /* backward chunk */
if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
- tran_set = union_Set (poset, tran_set, followpos[i]);
+ tran_set = union_DFASet (poset, tran_set, followpos[i]);
}
if (tran_set)
{
printf (" #%d", s->rule_nno);
putchar (':');
- pr_Set (parse_info->poset, s->set);
+ pr_DFASet (parse_info->poset, s->set);
prev_no = -1;
for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
{
{
long i, j;
int k;
- printf ("%d/%d tree nodes used, %d bytes each\n",
- parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
+ printf ("%d/%d tree nodes used, %ld bytes each\n",
+ parse_info->use_Tnode, parse_info->max_Tnode, (long) sizeof (struct Tnode));
k = inf_BSetHandle (parse_info->charset, &i, &j);
- printf ("%ld/%ld character sets, %d bytes each\n",
- i/k, j/k, k*sizeof(BSetWord));
- k = inf_SetType (parse_info->poset, &i, &j);
+ printf ("%ld/%ld character sets, %ld bytes each\n",
+ i/k, j/k, (long) k*sizeof(BSetWord));
+ k = inf_DFASetType (parse_info->poset, &i, &j);
printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
printf ("%d DFA states\n", dfas->no);
}
for (i=1; i <= parse_info->position; i++)
{
printf ("%3d:", i);
- pr_Set (parse_info->poset, parse_info->followpos[i]);
+ pr_DFASet (parse_info->poset, parse_info->followpos[i]);
putchar ('\t');
if (posar[i]->u.ch[0] < 0)
parse_info->anyset = mk_BSet (&parse_info->charset);
res_BSet (parse_info->charset, parse_info->anyset);
- add_BSet (parse_info->charset, parse_info->anyset, '\n');
com_BSet (parse_info->charset, parse_info->anyset);
parse_info->use_Tnode = parse_info->max_Tnode = 0;
parse_info->start = parse_info->end = NULL;
assert (poset_chunk > 10);
assert (dfap);
- parse_info->poset = mk_SetType (poset_chunk);
+ parse_info->poset = mk_DFASetType (poset_chunk);
init_pos(parse_info);
init_followpos(parse_info);
assert (parse_info->root);
init_DFA_states (&dfas, parse_info->poset, (int) (STATE_HASH));
mk_dfa_tran (parse_info, dfas);
if (debug_dfa_tran)
+ {
pr_tran (parse_info, dfas);
+ }
if (dfa_verbose)
pr_verbose (parse_info, dfas);
del_pos(parse_info);
del_followpos(parse_info);
- rm_SetType (parse_info->poset);
+ rm_DFASetType (parse_info->poset);
return dfas;
}
dfa->parse_info->cmap_data = vp;
}
+int dfa_get_last_rule (struct DFA *dfa)
+{
+ return dfa->parse_info->rule;
+}
+
int dfa_parse (struct DFA *dfa, const char **pattern)
{
struct Tnode *top;
assert (dfa);
assert (dfa->parse_info);
parse_info = dfa->parse_info;
+
+ if (!parse_info->cmap)
+ {
+ res_BSet (parse_info->charset, parse_info->anyset);
+ add_BSet (parse_info->charset, parse_info->anyset, '\n');
+ com_BSet (parse_info->charset, parse_info->anyset);
+ }
do_parse (parse_info, pattern, &top);
if (parse_info->err_code)
return parse_info->err_code;
ifree (*dfap);
*dfap = NULL;
}
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+