X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=blobdiff_plain;f=src%2Fnfa.c;fp=src%2Fnfa.c;h=5e8d56e6b797b5417c56a745d1e7081664f671ac;hp=0000000000000000000000000000000000000000;hb=4d994a119a1949522fe229270983ba1b1b399c6a;hpb=a0e27aac0589d493172c73f6660b844fc6460d7c diff --git a/src/nfa.c b/src/nfa.c new file mode 100644 index 0000000..5e8d56e --- /dev/null +++ b/src/nfa.c @@ -0,0 +1,421 @@ +/* Copyright (C) 2006, Index Data ApS + * See the file LICENSE for details. + * + * $Id: nfa.c,v 1.1 2006-05-03 09:04:33 heikki Exp $ + * + * This is a simple NFA-based system for character set normalizing + * in yaz and zebra. Unlike standard NFA's, this operates on ranges of + * characters at a time, so as to keep the size small. + * + * All characters are internally handled as unsigned 32-bit ints, so + * this works well with unicode. Translating to and from utf-8 is trivial, if + * need be. + * + * The NFA stores a translation-thing in the terminal nodes. It does not + * concern itself with what that thing is, for us it is juts a void* + * + * The module consists of two parts: Building the NFA, and translating + * strings with it. + */ + +#include +#include +#include + +/* * * * * * * * * * + * Data structures + * * * * * * * * * */ + +typedef struct yaz_nfa_backref_info yaz_backref_info; + +struct yaz_nfa_backref_info { + yaz_nfa_char *start; + yaz_nfa_char *end; +}; + +struct yaz_nfa { + NMEM nmem; + int nstates; /* how many states do we have */ + int nbackrefs; /* how many backrefs do we have */ + yaz_nfa_state *laststate; /* points to the last in the circular list */ + yaz_nfa_state *firststate; /* the initial state */ + yaz_backref_info *curr_backrefs; /* the backrefs we are matching */ + yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/ +}; + +struct yaz_nfa_state { + int num; /* state number. used for resoving ambiguities, and for dumping */ + void *result; /* signals a terminal state, and gives the result */ + yaz_nfa_transition *lasttrans; /* circular list of transitions */ + yaz_nfa_state *next; /* Circular linked list */ + int backref_start; /* which backreference starts here. 0 if none */ + int backref_end; /* which backreference ends here. 0 if none */ +} ; + +struct yaz_nfa_transition { + yaz_nfa_state *to_state; /* where to */ + yaz_nfa_char range_start; + yaz_nfa_char range_end; + yaz_nfa_transition *next; /* a linked list of them */ +} ; + +/* a range from 1 down to 0 is impossible, and is used to denote an */ +/* empty (epsilon) transition */ +#define EMPTY_START 1 +#define EMPTY_END 0 + +/* a limit for the number of empty transitions we can have in a row before */ +/* we declare this a loop */ +#define LOOP_LIMIT 100 + + + + +/* * * * * * * + * Initializing and destroying whole nfa's + * * * * * * */ + +yaz_nfa *yaz_nfa_init() { + NMEM my_nmem = nmem_create(); + yaz_nfa *n=nmem_malloc(my_nmem, sizeof(yaz_nfa)); + n->nmem=my_nmem; + n->nstates=0; + n->laststate=0; + return n; +} + +void yaz_nfa_destroy(yaz_nfa *n) { + nmem_destroy(n->nmem); +} + + +/* * * * * * + * Low-level interface to building the NFA + * * * * * */ + +yaz_nfa_state *yaz_nfa_add_state(yaz_nfa *n) { + yaz_nfa_state *s = nmem_malloc(n->nmem,sizeof(yaz_nfa_state)); + s->num = (n->nstates)++; + s->result=0; + s->lasttrans=0; + s->backref_start=0; + s->backref_end=0; + if (n->laststate) { + s->next=n->laststate->next; + n->laststate->next=s; + n->laststate=s; + } else { /* first state */ + n->laststate=s; + n->firststate=s; + s->next=s; + } + return s; +} + +int yaz_nfa_set_result(yaz_nfa *n, yaz_nfa_state *s,void *result) { + if ((s->result)&&result) + return 1; + s->result=result; + return 0; +} + +void *yaz_nfa_get_result(yaz_nfa *n, yaz_nfa_state *s) { + if (!s) + return 0; + return s->result; +} + +int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s, + int backref_number, + int is_start ){ + if (is_start) { + if (s->backref_start) + return 1; + s->backref_start=backref_number; + if (n->nbackrefsnbackrefs=backref_number; + } else { + if (s->backref_end) + return 1; + s->backref_end=backref_number; + } + return 0; /* ok */ +} + +int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s, + int is_start ) { + if (!s) + return 0; + if (is_start) + return s->backref_start; + else + return s->backref_end; +} + +void yaz_nfa_add_transition(yaz_nfa *n, + yaz_nfa_state *from_state, + yaz_nfa_state *to_state, + yaz_nfa_char range_start, + yaz_nfa_char range_end) { + yaz_nfa_transition *t=nmem_malloc(n->nmem,sizeof(yaz_nfa_transition)); + t->range_start=range_start; + t->range_end=range_end; + t->to_state=to_state; + if (from_state->lasttrans) { + t->next= from_state->lasttrans->next; + from_state->lasttrans->next=t; + from_state->lasttrans=t; + } else { /* first trans */ + from_state->lasttrans=t; + t->next=t; + } +} + +void yaz_nfa_add_empty_transition( yaz_nfa *n, + yaz_nfa_state *from_state, + yaz_nfa_state *to_state) { + yaz_nfa_add_transition(n,from_state,to_state, + EMPTY_START,EMPTY_END); +} + +/* * * * * * * * + * Medium-level interface + * * * * * * * */ + +/* Finds a transition from s where the range is exactly c..c */ +/* There should only be one such transition */ +static yaz_nfa_state *find_single_trans( + yaz_nfa_state *s, + yaz_nfa_char range_start, + yaz_nfa_char range_end) { + yaz_nfa_transition *t=s->lasttrans; + if (!t) + return 0; + do { + t=t->next; + if ( ( t->range_start == range_start ) && ( t->range_end == range_end) ) + return t->to_state; + } while (t != s->lasttrans ); + return 0; +} + + +yaz_nfa_state *yaz_nfa_add_range(yaz_nfa *n, + yaz_nfa_state *s, + yaz_nfa_char range_start, + yaz_nfa_char range_end) { + yaz_nfa_state *nextstate; + if (!s) /* default to top-level of the nfa */ + s=n->firststate; + nextstate=find_single_trans(s,range_start,range_end); + if (!nextstate) { + nextstate = yaz_nfa_add_state(n); + yaz_nfa_add_transition(n, s, nextstate, range_start, range_end); + } + return nextstate; +} + +yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n, + yaz_nfa_state *s, + yaz_nfa_char *seq ){ + yaz_nfa_state *nextstate; + if (!s) /* default to top-level of the nfa */ + s=n->firststate; + nextstate=find_single_trans(s,*seq,*seq); + if (nextstate) { + seq++; + if (!*seq) /* whole sequence matched */ + return nextstate; + else + return yaz_nfa_add_sequence(n, nextstate, seq); + } else { /* no next state, build the rest */ + while (*seq) { + s=yaz_nfa_add_range(n,s, *seq, *seq); + seq++; + } + return s; + } + return 0; /* compiler shut up, we return somewhere above */ +} + + + +/* * * * * * * + * Searching the NFA + * * * * * * */ + +struct matcher { + yaz_nfa_char *longest; + int bestnode; + void *result; + int errorcode; + int empties; /* count how many consecutive empty transitions */ +}; + +/* Finds the longest match. In case of ties, chooses the one with the + * lowest numbered state */ +static void match_state(yaz_nfa_state *s, + yaz_nfa_char *inchar, + size_t incharsleft, + struct matcher *m ) { + yaz_nfa_transition *t=s->lasttrans; + + if (t) { + if (incharsleft) { + do { + t=t->next; + if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){ + m->empties=0; + match_state(t->to_state, inchar+1,incharsleft-1,m); + /* yes, descent to all matching nodes, even if overrun, */ + /* since we can find a better one later */ + } else if (( t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) { + if ( m->empties++ > LOOP_LIMIT ) + m->errorcode= YAZ_NFA_LOOP; + else + match_state(t->to_state, inchar,incharsleft,m); + } + } while (t != s->lasttrans ); + } else { + m->errorcode=YAZ_NFA_OVERRUN; + } + } + if (s->result) { /* terminal node */ + if ( (m->longest < inchar) || /* longer result */ + ((m->longest == inchar)&&(m->bestnodenum)) ){ + /* or as long, but with lower node number. Still better */ + m->longest=inchar; + m->bestnode=s->num; + m->result=s->result; + } + } +} /* match_state */ + +int yaz_nfa_match(yaz_nfa *n, + yaz_nfa_char **inbuff, + size_t incharsleft, + void **result ){ + struct matcher m; + if (!n->firststate) + return YAZ_NFA_NOMATCH; + m.longest=*inbuff; + m.bestnode=n->nstates; + m.result=0; + m.errorcode=0; + if (!n->curr_backrefs) { + int sz=sizeof( struct yaz_nfa_backref_info); + n->curr_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz); + n->best_backrefs=nmem_malloc( n->nmem, n->nbackrefs* sz); + } + + + match_state(n->firststate, *inbuff, incharsleft, &m); + if (m.result) { + *result=m.result; + *inbuff=m.longest; + if (m.errorcode) + return m.errorcode; + else + return YAZ_NFA_SUCCESS; + } + return YAZ_NFA_NOMATCH; +} + + + + +/* * * * * * * * * + * Debug routines + * * * * * * */ + +yaz_nfa_state *yaz_nfa_get_first(yaz_nfa *n){ + if (!n) + return 0; + return n->firststate; +} + +yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s){ + if (n && s) { + if (s==n->laststate) + return 0; + return s->next; + } + return 0; +} + + +static void dump_trans(FILE *F, yaz_nfa_transition *t ) { + char c1; + char c2; + c1=t->range_start; + c2=t->range_end; + char *e=""; + if ( (t->range_start <= ' ') || (t->range_start>'z')) + c1='?'; + if ( (t->range_end <= ' ') || (t->range_end>'z')) + c2='?'; + if ((t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) { + e="(empty)"; + } + fprintf(F," -> [%d] %s '%c' %x - '%c' %x \n", + t->to_state->num, e, + c1, t->range_start, + c2, t->range_end ); +} + +static void dump_state(FILE *F, yaz_nfa_state *s, + char *(*strfunc)(void *) ) { + yaz_nfa_transition *t; + char *resultstring=""; + if (s->result) { + if (strfunc) { + resultstring=(*strfunc)(s->result); + } + else + resultstring=s->result; + } + fprintf(F," state [%d] %s %s", + s->num, s->result?"(FINAL)":"",resultstring ); + if (s->backref_start) { + fprintf(F," start-%d",s->backref_start); + } + if (s->backref_end) { + fprintf(F," end-%d",s->backref_end); + } + fprintf(F,"\n"); + t=s->lasttrans; + if (!t) { + fprintf(F," (no transitions)\n"); + } else { + do { + t=t->next; + dump_trans(F,t); + } while (t != s->lasttrans); + } + +} + +void yaz_nfa_dump(FILE *F, yaz_nfa *n, + char *(*strfunc)(void *) ) { + yaz_nfa_state *s; + if (!F) /* lazy programmers can just pass 0 for F */ + F=stdout; + fprintf(F,"The NFA has %d states \n",n->nstates); + s=n->laststate; + if (s) { + do { + s=s->next; + dump_state(F,s, strfunc); + } while (s != n->laststate); + } +} + + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */