X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=src%2Fnfa.c;h=ef6f3f41c43319550f0046969b39f0a0fabd6b3e;hb=68175726e7a40ecd8bd16e63605b2196fbeffb9e;hp=5e8d56e6b797b5417c56a745d1e7081664f671ac;hpb=4d994a119a1949522fe229270983ba1b1b399c6a;p=yaz-moved-to-github.git diff --git a/src/nfa.c b/src/nfa.c index 5e8d56e..ef6f3f4 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfa.c,v 1.1 2006-05-03 09:04:33 heikki Exp $ + * $Id: nfa.c,v 1.4 2006-05-03 13:47:35 heikki Exp $ * * This is a simple NFA-based system for character set normalizing * in yaz and zebra. Unlike standard NFA's, this operates on ranges of @@ -41,6 +41,7 @@ struct yaz_nfa { yaz_nfa_state *firststate; /* the initial state */ yaz_backref_info *curr_backrefs; /* the backrefs we are matching */ yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/ + int lastmatch; /* the result of last match */ }; struct yaz_nfa_state { @@ -81,6 +82,11 @@ yaz_nfa *yaz_nfa_init() { n->nmem=my_nmem; n->nstates=0; n->laststate=0; + n->firststate=0; + n->nbackrefs=0; + n->curr_backrefs=0; + n->best_backrefs=0; + n->lastmatch= YAZ_NFA_NOMATCH; return n; } @@ -125,24 +131,32 @@ void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) { return s->result; } -int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s, int backref_number, int is_start ){ if (is_start) { if (s->backref_start) return 1; s->backref_start=backref_number; - if (n->nbackrefsnbackrefs=backref_number; + if (n->nbackrefs<=backref_number) { + n->nbackrefs=backref_number+1; + n->curr_backrefs=0; + n->best_backrefs=0; + /* clear them just in case we have already matched on */ + /* with this nfa, and created a too small backref table */ + /* we will reallocate when matching. */ + } } else { if (s->backref_end) return 1; + if (n->nbackrefsbackref_end=backref_number; } return 0; /* ok */ } -int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s, int is_start ) { if (!s) return 0; @@ -245,6 +259,7 @@ yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n, * * * * * * */ struct matcher { + yaz_nfa *n; yaz_nfa_char *longest; int bestnode; void *result; @@ -253,19 +268,30 @@ struct matcher { }; /* Finds the longest match. In case of ties, chooses the one with the - * lowest numbered state */ -static void match_state(yaz_nfa_state *s, + * lowest numbered state. Keep track of the back references. Recursively + * traverses the NFA. Keeps track of the best hit it has found. */ + +static void match_state( + yaz_nfa_state *s, yaz_nfa_char *inchar, size_t incharsleft, struct matcher *m ) { yaz_nfa_transition *t=s->lasttrans; - + if (s->backref_start) + m->n->curr_backrefs[s->backref_start].start=inchar; + if (s->backref_end) + m->n->curr_backrefs[s->backref_end].end=inchar; if (t) { if (incharsleft) { do { t=t->next; if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){ m->empties=0; + if (t->range_start!=t->range_end){ + /* backref 0 is special: the last range operation */ + m->n->curr_backrefs[0].start=inchar; + m->n->curr_backrefs[0].end=inchar; + } match_state(t->to_state, inchar+1,incharsleft-1,m); /* yes, descent to all matching nodes, even if overrun, */ /* since we can find a better one later */ @@ -284,11 +310,22 @@ static void match_state(yaz_nfa_state *s, if ( (m->longest < inchar) || /* longer result */ ((m->longest == inchar)&&(m->bestnodenum)) ){ /* or as long, but with lower node number. Still better */ + int i; m->longest=inchar; m->bestnode=s->num; m->result=s->result; + if (m->n->curr_backrefs) + for (i=0; in->nbackrefs;i++) { + m->n->best_backrefs[i]=m->n->curr_backrefs[i]; + } } } + if (s->backref_start) + m->n->curr_backrefs[s->backref_start].start=0; + if (s->backref_end) + m->n->curr_backrefs[s->backref_end].end=0; + m->n->curr_backrefs[0].start=0; + m->n->curr_backrefs[0].end=0; } /* match_state */ int yaz_nfa_match(yaz_nfa *n, @@ -296,32 +333,59 @@ int yaz_nfa_match(yaz_nfa *n, size_t incharsleft, void **result ){ struct matcher m; - if (!n->firststate) - return YAZ_NFA_NOMATCH; + int sz; + int i; + if (!n->firststate) { + n->lastmatch=YAZ_NFA_NOMATCH; + return n->lastmatch; + } + m.n=n; m.longest=*inbuff; m.bestnode=n->nstates; m.result=0; m.errorcode=0; + sz=sizeof( struct yaz_nfa_backref_info) * n->nbackrefs; if (!n->curr_backrefs) { - int sz=sizeof( struct yaz_nfa_backref_info); - n->curr_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz); - n->best_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz); + n->curr_backrefs=nmem_malloc( n->nmem, sz); + n->best_backrefs=nmem_malloc( n->nmem, sz); + } + for (i=0;inbackrefs;i++) { + n->curr_backrefs[i].start=0; + n->curr_backrefs[i].end=0; + n->best_backrefs[i].start=0; + n->best_backrefs[i].end=0; } - match_state(n->firststate, *inbuff, incharsleft, &m); if (m.result) { *result=m.result; *inbuff=m.longest; if (m.errorcode) - return m.errorcode; + n->lastmatch=m.errorcode; else - return YAZ_NFA_SUCCESS; + n->lastmatch= YAZ_NFA_SUCCESS; + return n->lastmatch; } - return YAZ_NFA_NOMATCH; + n->lastmatch=YAZ_NFA_NOMATCH; + return n->lastmatch; } +int yaz_nfa_get_backref( yaz_nfa *n, + int backref_no, + yaz_nfa_char **start, + yaz_nfa_char **end) { + if (backref_no>=n->nbackrefs) + return 2; + if (backref_no<0) + return 2; + if (n->lastmatch== YAZ_NFA_NOMATCH) + return 1; /* accept other errors, they return partial matches*/ + + *start=n->best_backrefs[backref_no].start; + *end=n->best_backrefs[backref_no].end; + return 0; +} /* * * * * * * * * @@ -347,9 +411,10 @@ yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){ static void dump_trans(FILE *F, yaz_nfa_transition *t ) { char c1; char c2; + char *e; c1=t->range_start; c2=t->range_end; - char *e=""; + e=""; if ( (t->range_start <= ' ') || (t->range_start>'z')) c1='?'; if ( (t->range_end <= ' ') || (t->range_end>'z')) @@ -400,7 +465,8 @@ void yaz_nfa_dump(FILE *F, yaz_nfa *n, yaz_nfa_state *s; if (!F) /* lazy programmers can just pass 0 for F */ F=stdout; - fprintf(F,"The NFA has %d states \n",n->nstates); + fprintf(F,"The NFA has %d states and %d backrefs\n", + n->nstates, n->nbackrefs); s=n->laststate; if (s) { do {