From: Heikki Levanto Date: Wed, 3 May 2006 13:47:35 +0000 (+0000) Subject: The NFA seems to work now (as far as I can see). X-Git-Tag: YAZ.2.1.20~88 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=b77f4b5c2b191b83cf3f213faa8b6b263e36fea4;ds=sidebyside The NFA seems to work now (as far as I can see). Still needs to do the output side of it. --- diff --git a/include/yaz/nfa.h b/include/yaz/nfa.h index a61b5c1..6f3ba4e 100644 --- a/include/yaz/nfa.h +++ b/include/yaz/nfa.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfa.h,v 1.2 2006-05-03 11:09:59 heikki Exp $ + * $Id: nfa.h,v 1.3 2006-05-03 13:47:35 heikki Exp $ */ /** @@ -86,12 +86,13 @@ void *yaz_nfa_get_result( yaz_nfa *n /** The NFA itself */, yaz_nfa_state *s /** The state whose result you want */); -/** \brief Set the backref number to a state. +/** \brief Set a backref point to a state. * - * Each state can be the beginning and/or ending of a backref - * sequence. This call sets those flags in the states. After matching, - * we can get hold of the backrefs that matched, and use them in our - * translations. The backrefs start at 1, not zero! + * Each state can be the beginning and/or ending point of a backref + * sequence. This call sets one of those flags in the state. After + * matching, we can get hold of the backrefs that matched, and use + * them in our translations. The numbering of backrefs start at 1, + * not zero! * * \param n the nfa * \param s the state to add to @@ -103,11 +104,11 @@ void *yaz_nfa_get_result( * */ -int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s, int backref_number, int is_start ); -/** \brief Get the backref number of a state. +/** \brief Get the backref point of a state * * \param n the nfa * \param s the state to add to @@ -115,7 +116,7 @@ int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, * \return the backref number associated with the state, or 0 if none. */ -int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s, int is_start ); /** \brief Add a transition to the NFA. @@ -196,6 +197,31 @@ int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t incharsleft, #define YAZ_NFA_OVERRUN 2 #define YAZ_NFA_LOOP 3 +/** \brief Get a back reference after a successfull match. + * + * \param n the nfa + * \param backref_no the number of the backref to get + * \param begin beginning of the matching substring + * \param end end of the matching substring + * + * Returns pointers to the beginning and end of a backref, or null + * pointers if one endpoint not met. Those pointers point to the + * original buffer that was matched, so the caller will not have to + * worry about freeing anything special. + * + * It is technically possible to create NFAs that meet the start but + * not the end of a backref. It is up to the caller to decide how + * to handle such a situation. + * + * \retval 0 OK + * \retval 1 no match + * \retval 2 no such backref + */ + +int yaz_nfa_get_backref( yaz_nfa *n, + int backref_no, + yaz_nfa_char **start, + yaz_nfa_char **end ); /** \brief Get the first state of the NFA. * diff --git a/src/nfa.c b/src/nfa.c index 4d0e8a2..ef6f3f4 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfa.c,v 1.3 2006-05-03 11:36:14 mike Exp $ + * $Id: nfa.c,v 1.4 2006-05-03 13:47:35 heikki Exp $ * * This is a simple NFA-based system for character set normalizing * in yaz and zebra. Unlike standard NFA's, this operates on ranges of @@ -41,6 +41,7 @@ struct yaz_nfa { yaz_nfa_state *firststate; /* the initial state */ yaz_backref_info *curr_backrefs; /* the backrefs we are matching */ yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/ + int lastmatch; /* the result of last match */ }; struct yaz_nfa_state { @@ -85,6 +86,7 @@ yaz_nfa *yaz_nfa_init() { n->nbackrefs=0; n->curr_backrefs=0; n->best_backrefs=0; + n->lastmatch= YAZ_NFA_NOMATCH; return n; } @@ -129,15 +131,15 @@ void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) { return s->result; } -int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s, int backref_number, int is_start ){ if (is_start) { if (s->backref_start) return 1; s->backref_start=backref_number; - if (n->nbackrefsnbackrefs=backref_number; + if (n->nbackrefs<=backref_number) { + n->nbackrefs=backref_number+1; n->curr_backrefs=0; n->best_backrefs=0; /* clear them just in case we have already matched on */ @@ -154,7 +156,7 @@ int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, return 0; /* ok */ } -int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s, +int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s, int is_start ) { if (!s) return 0; @@ -285,6 +287,11 @@ static void match_state( t=t->next; if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){ m->empties=0; + if (t->range_start!=t->range_end){ + /* backref 0 is special: the last range operation */ + m->n->curr_backrefs[0].start=inchar; + m->n->curr_backrefs[0].end=inchar; + } match_state(t->to_state, inchar+1,incharsleft-1,m); /* yes, descent to all matching nodes, even if overrun, */ /* since we can find a better one later */ @@ -308,14 +315,17 @@ static void match_state( m->bestnode=s->num; m->result=s->result; if (m->n->curr_backrefs) - for (i=0; in->nbackrefs;i++) + for (i=0; in->nbackrefs;i++) { m->n->best_backrefs[i]=m->n->curr_backrefs[i]; + } } } if (s->backref_start) m->n->curr_backrefs[s->backref_start].start=0; if (s->backref_end) m->n->curr_backrefs[s->backref_end].end=0; + m->n->curr_backrefs[0].start=0; + m->n->curr_backrefs[0].end=0; } /* match_state */ int yaz_nfa_match(yaz_nfa *n, @@ -323,33 +333,59 @@ int yaz_nfa_match(yaz_nfa *n, size_t incharsleft, void **result ){ struct matcher m; - if (!n->firststate) - return YAZ_NFA_NOMATCH; + int sz; + int i; + if (!n->firststate) { + n->lastmatch=YAZ_NFA_NOMATCH; + return n->lastmatch; + } m.n=n; m.longest=*inbuff; m.bestnode=n->nstates; m.result=0; m.errorcode=0; - if ((!n->curr_backrefs) && (n->nbackrefs)){ - int sz=sizeof( struct yaz_nfa_backref_info) * (n->nbackrefs+1); + sz=sizeof( struct yaz_nfa_backref_info) * n->nbackrefs; + if (!n->curr_backrefs) { n->curr_backrefs=nmem_malloc( n->nmem, sz); n->best_backrefs=nmem_malloc( n->nmem, sz); } - + for (i=0;inbackrefs;i++) { + n->curr_backrefs[i].start=0; + n->curr_backrefs[i].end=0; + n->best_backrefs[i].start=0; + n->best_backrefs[i].end=0; + } match_state(n->firststate, *inbuff, incharsleft, &m); if (m.result) { *result=m.result; *inbuff=m.longest; if (m.errorcode) - return m.errorcode; + n->lastmatch=m.errorcode; else - return YAZ_NFA_SUCCESS; + n->lastmatch= YAZ_NFA_SUCCESS; + return n->lastmatch; } - return YAZ_NFA_NOMATCH; + n->lastmatch=YAZ_NFA_NOMATCH; + return n->lastmatch; } +int yaz_nfa_get_backref( yaz_nfa *n, + int backref_no, + yaz_nfa_char **start, + yaz_nfa_char **end) { + if (backref_no>=n->nbackrefs) + return 2; + if (backref_no<0) + return 2; + if (n->lastmatch== YAZ_NFA_NOMATCH) + return 1; /* accept other errors, they return partial matches*/ + + *start=n->best_backrefs[backref_no].start; + *end=n->best_backrefs[backref_no].end; + return 0; +} /* * * * * * * * * @@ -429,7 +465,8 @@ void yaz_nfa_dump(FILE *F, yaz_nfa *n, yaz_nfa_state *s; if (!F) /* lazy programmers can just pass 0 for F */ F=stdout; - fprintf(F,"The NFA has %d states \n",n->nstates); + fprintf(F,"The NFA has %d states and %d backrefs\n", + n->nstates, n->nbackrefs); s=n->laststate; if (s) { do { diff --git a/test/nfatest1.c b/test/nfatest1.c index efafb8d..c37fc4f 100644 --- a/test/nfatest1.c +++ b/test/nfatest1.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfatest1.c,v 1.1 2006-05-03 09:04:33 heikki Exp $ + * $Id: nfatest1.c,v 1.2 2006-05-03 13:47:35 heikki Exp $ * */ @@ -24,28 +24,47 @@ void test_match(yaz_nfa *n, yaz_nfa_char *buf, int buflen, int expcode, char *expstr) { yaz_nfa_char *c=buf; + yaz_nfa_char *cp1,*cp2; void *resptr=0; - int i; + int i,bi; i=yaz_nfa_match(n,&c,buflen,&resptr); #if VERBOSE printf("\n'%s' returned %d. Moved c by %d, and resulted in '%s'\n", expstr, i, (c-buf),(char*)resptr); #endif YAZ_CHECK_EQ(i,expcode); - YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr),0); + if (i!=1) + YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr),0); + i=0; + bi=0; + while(bi!=2){ + bi=yaz_nfa_get_backref(n,i,&cp1,&cp2); + if (bi==0 && ( cp1 || cp2 ) ) { +#if VERBOSE + printf(" got backref %d of %d chars (%p to %p): '", + i, cp2-cp1+1, cp1, cp2); + while (cp2-cp1 >= 0 ) + printf("%c", *cp1++); + printf("'\n"); +#endif + } + i++; + } } void construction_test() { yaz_nfa* n= yaz_nfa_init(); + yaz_nfa_char *cp, *cp1, *cp2; yaz_nfa_state *s,*s0,*s1,*s2,*s3,*s4,*s5; int i; yaz_nfa_char seq1[]={'p','r','e','f','i','x',0}; yaz_nfa_char seq2[]={'p','r','e','l','i','m',0}; - yaz_nfa_char tst1[]={'c','0'}; - yaz_nfa_char tst2[]={'c','k','0'}; - yaz_nfa_char tst3[]={'c','x','0'}; - yaz_nfa_char tst4[]={'z','k','0'}; - yaz_nfa_char tst5[]={'y','k','k','k','k','k','k','y','0'}; + yaz_nfa_char tst1[]={'c',0}; + yaz_nfa_char tst2[]={'c','k',0}; + yaz_nfa_char tst3[]={'c','x',0}; + yaz_nfa_char tst4[]={'z','k',0}; + yaz_nfa_char tst5[]={'y','k','l','k','k','l','k','d',0}; + yaz_nfa_char tst6[]={'x','z','k','a','b',0}; void *p; YAZ_CHECK(n); @@ -106,20 +125,35 @@ void construction_test() { yaz_nfa_set_result(n,s,"loop"); s=yaz_nfa_add_range(n, 0, 'y','y' ); + yaz_nfa_set_backref_point(n,s,1,1); s1=yaz_nfa_add_state(n); - yaz_nfa_set_backref(n,s1,1,1); yaz_nfa_add_empty_transition(n,s,s1); s=s1; - yaz_nfa_add_transition(n,s,s,'k','k'); - s=yaz_nfa_add_range(n, s, 'y','y' ); - yaz_nfa_set_result(n,s,"y k+ y"); - yaz_nfa_set_backref(n,s,1,0); + yaz_nfa_add_transition(n,s,s,'k','l'); + s=yaz_nfa_add_range(n, s, 'd','d' ); + yaz_nfa_set_result(n,s,"y k+ d"); + yaz_nfa_set_backref_point(n,s,1,0); s=yaz_nfa_add_sequence(n, 0, seq1 ); yaz_nfa_set_result(n,s,"PREFIX"); s=yaz_nfa_add_sequence(n, 0, seq2 ); yaz_nfa_set_result(n,s,"PRELIM"); + s=yaz_nfa_add_range(n, 0, 'x','x' ); + yaz_nfa_set_backref_point(n,s,2,1); + s1=yaz_nfa_add_sequence(n,s,tst4); + yaz_nfa_set_backref_point(n,s1,2,0); + yaz_nfa_set_result(n,s1,"xzk"); + + /* check return codes before doing any matches */ + i=yaz_nfa_get_backref(n, 0, &cp1, &cp2 ); + YAZ_CHECK_EQ(i,1); + i=yaz_nfa_get_backref(n, 3, &cp1, &cp2 ); + YAZ_CHECK_EQ(i,2); + i=yaz_nfa_get_backref(n, 1, &cp1, &cp2 ); + YAZ_CHECK_EQ(i,1); + + #if VERBOSE yaz_nfa_dump(0,n, printfunc); #endif @@ -130,7 +164,17 @@ void construction_test() { test_match(n,tst2,3,YAZ_NFA_SUCCESS,"first"); test_match(n,tst3,3,YAZ_NFA_SUCCESS,"a-k,x-z"); test_match(n,tst4,9,YAZ_NFA_LOOP,"loop"); - test_match(n,tst5,9,YAZ_NFA_SUCCESS,"y k+ y"); + test_match(n,tst5,9,YAZ_NFA_SUCCESS,"y k+ d"); + + cp=tst6; + i=yaz_nfa_match(n,&cp,8,&p); + YAZ_CHECK_EQ(i,YAZ_NFA_SUCCESS); + i=yaz_nfa_get_backref(n, 2, &cp1, &cp2 ); + YAZ_CHECK_EQ(i,0); +#if VERBOSE + printf("backref from %p to %p is %d long\n", + cp1,cp2, cp2-cp1 ); +#endif yaz_nfa_destroy(n); }