/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: nfa.c,v 1.1 2006-05-03 09:04:33 heikki Exp $
+ * $Id: nfa.c,v 1.4 2006-05-03 13:47:35 heikki Exp $
*
* This is a simple NFA-based system for character set normalizing
* in yaz and zebra. Unlike standard NFA's, this operates on ranges of
yaz_nfa_state *firststate; /* the initial state */
yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
+ int lastmatch; /* the result of last match */
};
struct yaz_nfa_state {
n->nmem=my_nmem;
n->nstates=0;
n->laststate=0;
+ n->firststate=0;
+ n->nbackrefs=0;
+ n->curr_backrefs=0;
+ n->best_backrefs=0;
+ n->lastmatch= YAZ_NFA_NOMATCH;
return n;
}
return s->result;
}
-int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int backref_number,
int is_start ){
if (is_start) {
if (s->backref_start)
return 1;
s->backref_start=backref_number;
- if (n->nbackrefs<backref_number)
- n->nbackrefs=backref_number;
+ if (n->nbackrefs<=backref_number) {
+ n->nbackrefs=backref_number+1;
+ n->curr_backrefs=0;
+ n->best_backrefs=0;
+ /* clear them just in case we have already matched on */
+ /* with this nfa, and created a too small backref table */
+ /* we will reallocate when matching. */
+ }
} else {
if (s->backref_end)
return 1;
+ if (n->nbackrefs<backref_number)
+ return 2; /* can't end a backref that has not been started */
s->backref_end=backref_number;
}
return 0; /* ok */
}
-int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int is_start ) {
if (!s)
return 0;
* * * * * * */
struct matcher {
+ yaz_nfa *n;
yaz_nfa_char *longest;
int bestnode;
void *result;
};
/* Finds the longest match. In case of ties, chooses the one with the
- * lowest numbered state */
-static void match_state(yaz_nfa_state *s,
+ * lowest numbered state. Keep track of the back references. Recursively
+ * traverses the NFA. Keeps track of the best hit it has found. */
+
+static void match_state(
+ yaz_nfa_state *s,
yaz_nfa_char *inchar,
size_t incharsleft,
struct matcher *m ) {
yaz_nfa_transition *t=s->lasttrans;
-
+ if (s->backref_start)
+ m->n->curr_backrefs[s->backref_start].start=inchar;
+ if (s->backref_end)
+ m->n->curr_backrefs[s->backref_end].end=inchar;
if (t) {
if (incharsleft) {
do {
t=t->next;
if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
m->empties=0;
+ if (t->range_start!=t->range_end){
+ /* backref 0 is special: the last range operation */
+ m->n->curr_backrefs[0].start=inchar;
+ m->n->curr_backrefs[0].end=inchar;
+ }
match_state(t->to_state, inchar+1,incharsleft-1,m);
/* yes, descent to all matching nodes, even if overrun, */
/* since we can find a better one later */
if ( (m->longest < inchar) || /* longer result */
((m->longest == inchar)&&(m->bestnode<s->num)) ){
/* or as long, but with lower node number. Still better */
+ int i;
m->longest=inchar;
m->bestnode=s->num;
m->result=s->result;
+ if (m->n->curr_backrefs)
+ for (i=0; i<m->n->nbackrefs;i++) {
+ m->n->best_backrefs[i]=m->n->curr_backrefs[i];
+ }
}
}
+ if (s->backref_start)
+ m->n->curr_backrefs[s->backref_start].start=0;
+ if (s->backref_end)
+ m->n->curr_backrefs[s->backref_end].end=0;
+ m->n->curr_backrefs[0].start=0;
+ m->n->curr_backrefs[0].end=0;
} /* match_state */
int yaz_nfa_match(yaz_nfa *n,
size_t incharsleft,
void **result ){
struct matcher m;
- if (!n->firststate)
- return YAZ_NFA_NOMATCH;
+ int sz;
+ int i;
+ if (!n->firststate) {
+ n->lastmatch=YAZ_NFA_NOMATCH;
+ return n->lastmatch;
+ }
+ m.n=n;
m.longest=*inbuff;
m.bestnode=n->nstates;
m.result=0;
m.errorcode=0;
+ sz=sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
if (!n->curr_backrefs) {
- int sz=sizeof( struct yaz_nfa_backref_info);
- n->curr_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz);
- n->best_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz);
+ n->curr_backrefs=nmem_malloc( n->nmem, sz);
+ n->best_backrefs=nmem_malloc( n->nmem, sz);
+ }
+ for (i=0;i<n->nbackrefs;i++) {
+ n->curr_backrefs[i].start=0;
+ n->curr_backrefs[i].end=0;
+ n->best_backrefs[i].start=0;
+ n->best_backrefs[i].end=0;
}
-
match_state(n->firststate, *inbuff, incharsleft, &m);
if (m.result) {
*result=m.result;
*inbuff=m.longest;
if (m.errorcode)
- return m.errorcode;
+ n->lastmatch=m.errorcode;
else
- return YAZ_NFA_SUCCESS;
+ n->lastmatch= YAZ_NFA_SUCCESS;
+ return n->lastmatch;
}
- return YAZ_NFA_NOMATCH;
+ n->lastmatch=YAZ_NFA_NOMATCH;
+ return n->lastmatch;
}
+int yaz_nfa_get_backref( yaz_nfa *n,
+ int backref_no,
+ yaz_nfa_char **start,
+ yaz_nfa_char **end) {
+ if (backref_no>=n->nbackrefs)
+ return 2;
+ if (backref_no<0)
+ return 2;
+ if (n->lastmatch== YAZ_NFA_NOMATCH)
+ return 1; /* accept other errors, they return partial matches*/
+
+ *start=n->best_backrefs[backref_no].start;
+ *end=n->best_backrefs[backref_no].end;
+ return 0;
+}
/* * * * * * * * *
static void dump_trans(FILE *F, yaz_nfa_transition *t ) {
char c1;
char c2;
+ char *e;
c1=t->range_start;
c2=t->range_end;
- char *e="";
+ e="";
if ( (t->range_start <= ' ') || (t->range_start>'z'))
c1='?';
if ( (t->range_end <= ' ') || (t->range_end>'z'))
yaz_nfa_state *s;
if (!F) /* lazy programmers can just pass 0 for F */
F=stdout;
- fprintf(F,"The NFA has %d states \n",n->nstates);
+ fprintf(F,"The NFA has %d states and %d backrefs\n",
+ n->nstates, n->nbackrefs);
s=n->laststate;
if (s) {
do {