From: Heikki Levanto Date: Fri, 5 May 2006 14:02:27 +0000 (+0000) Subject: Range converter for actually converting character ranges X-Git-Tag: YAZ.2.1.20~80 X-Git-Url: http://git.indexdata.com/?p=yaz-moved-to-github.git;a=commitdiff_plain;h=065092597b2e0234d65b25b31930cbe98ec59738 Range converter for actually converting character ranges Fixed a nasty off-by-one in the endpoints of backrefs. Added tests for them. --- diff --git a/include/yaz/nfa.h b/include/yaz/nfa.h index 76c131e..6880c70 100644 --- a/include/yaz/nfa.h +++ b/include/yaz/nfa.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfa.h,v 1.5 2006-05-05 09:14:42 heikki Exp $ + * $Id: nfa.h,v 1.6 2006-05-05 14:02:27 heikki Exp $ */ /** @@ -258,6 +258,24 @@ yaz_nfa_converter *yaz_nfa_create_backref_converter ( yaz_nfa *n, int backref_no ); +/** \brief Create a charcater range converter + * \param n the nfa + * \param backref_no The backreference to reproduce + * \param from_char the first character of the original range + * \param to_char the first character of the target range + * + * This converter takes a backreference, and shifts the characters + * by a constant value. For example, translating a-z to A-Z. + * Note that backref 0 is always the last character that matched a + * range, even if no backrefs were defined in teh nfa. This makes + * it pretty useful with this converter. + * + */ +yaz_nfa_converter *yaz_nfa_create_range_converter ( + yaz_nfa *n, int backref_no, + yaz_nfa_char from_char, + yaz_nfa_char to_char); + /** \brief Connects converters in a chain. * \param n the nfa (mostly for nmem access) diff --git a/src/nfa.c b/src/nfa.c index 863af23..126f940 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfa.c,v 1.6 2006-05-05 09:14:42 heikki Exp $ + * $Id: nfa.c,v 1.7 2006-05-05 14:02:27 heikki Exp $ */ /** @@ -78,7 +78,8 @@ struct yaz_nfa_transition { typedef enum { conv_none, conv_string, - conv_backref + conv_backref, + conv_range } yaz_nfa_converter_type; struct yaz_nfa_converter { @@ -282,7 +283,7 @@ yaz_nfa_state *yaz_nfa_add_sequence(yaz_nfa *n, struct matcher { yaz_nfa *n; yaz_nfa_char *longest; - int bestnode; + int bestnode; void *result; int errorcode; int empties; /* count how many consecutive empty transitions */ @@ -294,14 +295,15 @@ struct matcher { static void match_state( yaz_nfa_state *s, - yaz_nfa_char *inchar, - size_t incharsleft, - struct matcher *m ) { + yaz_nfa_char *prevmatch, + yaz_nfa_char *inchar, + size_t incharsleft, + struct matcher *m ) { yaz_nfa_transition *t = s->lasttrans; if (s->backref_start) m->n->curr_backrefs[s->backref_start].start = inchar; if (s->backref_end) - m->n->curr_backrefs[s->backref_end].end = inchar; + m->n->curr_backrefs[s->backref_end].end = prevmatch; if (t) { if (incharsleft) { do { @@ -313,14 +315,15 @@ static void match_state( m->n->curr_backrefs[0].start = inchar; m->n->curr_backrefs[0].end = inchar; } - match_state(t->to_state, inchar+1, incharsleft-1, m); - /* yes, descent to all matching nodes, even if overrun, */ - /* since we can find a better one later */ - } else if (( t->range_start==EMPTY_START) && (t->range_end==EMPTY_END)) { + match_state(t->to_state, inchar, + inchar+1, incharsleft-1, m); + } else if (( t->range_start==EMPTY_START) && + (t->range_end==EMPTY_END)) { if ( m->empties++ > LOOP_LIMIT ) m->errorcode= YAZ_NFA_LOOP; else - match_state(t->to_state, inchar, incharsleft, m); + match_state(t->to_state, prevmatch, + inchar, incharsleft, m); } } while (t != s->lasttrans ); } else { @@ -378,7 +381,7 @@ int yaz_nfa_match(yaz_nfa *n, n->best_backrefs[i].end = 0; } - match_state(n->firststate, *inbuff, *incharsleft, &m); + match_state(n->firststate, *inbuff, *inbuff, *incharsleft, &m); if (m.result) { *incharsleft -= (m.longest-*inbuff); *result = m.result; @@ -450,6 +453,19 @@ yaz_nfa_converter *yaz_nfa_create_backref_converter ( return c; } +yaz_nfa_converter *yaz_nfa_create_range_converter ( + yaz_nfa *n, int backref_no, + yaz_nfa_char from_char, + yaz_nfa_char to_char){ + yaz_nfa_converter *c; + c=create_null_converter(n); + c->type=conv_range; + c->backref_no=backref_no; + c->char_diff=to_char - from_char; + return c; + +} + void yaz_nfa_append_converter ( yaz_nfa *n, @@ -487,7 +503,7 @@ static int backref_convert ( return 0; if (i==1) /* no match in dfa */ return 1; /* should not happen */ - while (cp2>cp1) { + while (cp2>=cp1) { if ((*outcharsleft)-- <= 0) return 2; **outbuff=*cp1++; @@ -496,6 +512,30 @@ static int backref_convert ( return 0; } +static int range_convert ( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft){ + yaz_nfa_char *cp1,*cp2; + int i; + i = yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2); + printf ("range_convert: i=%d d=%d, cp1=%p cp2=%p \n",i,c->char_diff,cp1,cp2); + if (i == 2) /* no backref, produce no output, not ok */ + return 1; /* should not happen */ + if (i == 1) /* no match in dfa */ + return 1; /* should not happen */ + while (cp2 >= cp1) { + if ((*outcharsleft)-- <= 0) + return 2; + printf(" range_convert: %d '%c' -> ",*cp1,*cp1); + **outbuff=(*cp1++) + c->char_diff ; + printf("%d '%c'\n",**outbuff, **outbuff); + (*outbuff)++; + } + return 0; +} + int yaz_nfa_run_converters( yaz_nfa *n, @@ -503,7 +543,6 @@ int yaz_nfa_run_converters( yaz_nfa_char **outbuff, size_t *outcharsleft){ int rc=0; - // yaz_nfa_char *bufstart=*outbuff; while (c && !rc) { switch(c->type) { case conv_string: @@ -512,6 +551,9 @@ int yaz_nfa_run_converters( case conv_backref: rc=backref_convert(n,c,outbuff,outcharsleft); break; + case conv_range: + rc=range_convert(n,c,outbuff,outcharsleft); + break; default: rc=3; /* internal error */ } diff --git a/test/nfatest1.c b/test/nfatest1.c index 7ae8b29..3d14b30 100644 --- a/test/nfatest1.c +++ b/test/nfatest1.c @@ -1,7 +1,7 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. * - * $Id: nfatest1.c,v 1.4 2006-05-05 09:14:42 heikki Exp $ + * $Id: nfatest1.c,v 1.5 2006-05-05 14:02:27 heikki Exp $ * */ @@ -12,7 +12,7 @@ #include #include -#define VERBOSE 1 +#define VERBOSE 0 char *printfunc(void *result) { static char buf[200]; @@ -175,15 +175,18 @@ void construction_test() { test_match(n, tst4, 9, YAZ_NFA_LOOP, "loop"); test_match(n, tst5, 9, YAZ_NFA_SUCCESS, "y k+ d"); - cp = tst6; + cp = tst6; /* xzkab */ sz = 8; i = yaz_nfa_match(n, &cp, &sz, &p); YAZ_CHECK_EQ(i, YAZ_NFA_SUCCESS); i = yaz_nfa_get_backref(n, 2, &cp1, &cp2 ); YAZ_CHECK_EQ(i, 0); + YAZ_CHECK_EQ(cp2-cp1+1,2); + YAZ_CHECK_EQ(*cp1, 'z' ); + YAZ_CHECK_EQ(*cp2, 'k' ); #if VERBOSE - printf("backref from %p to %p is %d long. sz is now %d\n", - cp1,cp2, cp2-cp1,sz ); + printf("backref from %p '%c' to %p '%c' is %d long. sz is now %d\n", + cp1, *cp1, cp2, *cp2, cp2-cp1+1, sz ); #endif yaz_nfa_destroy(n); @@ -191,12 +194,12 @@ void construction_test() { void converter_test() { yaz_nfa* n= yaz_nfa_init(); - yaz_nfa_converter *c1,*c2; + yaz_nfa_converter *c1, *c2, *c3; yaz_nfa_char str1[]={'a','b','c'}; yaz_nfa_char seq1[]={'A','B','C',0}; yaz_nfa_char seq2[]={'k','m','n','m','x','P','Q','X',0}; yaz_nfa_char outbuf[1024]; - yaz_nfa_char *outp,*cp, *cp1, *cp2; + yaz_nfa_char *outp, *cp, *cp1, *cp2; yaz_nfa_state *s, *s2; void *vp; int i; @@ -251,7 +254,7 @@ void converter_test() { yaz_nfa_set_result(n,s,c1); yaz_nfa_set_backref_point(n,s,1,0); - /* [k-o][m-n]*x -> m-n sequence */ + /* ([k-o][m-n]*)x -> \1 */ s=yaz_nfa_add_state(n); yaz_nfa_add_empty_transition(n,0,s); yaz_nfa_set_backref_point(n,s,2,1); @@ -275,8 +278,12 @@ void converter_test() { c2=vp; YAZ_CHECK_EQ(i,YAZ_NFA_SUCCESS); i=yaz_nfa_get_backref(n, 2, &cp1, &cp2 ); +#if VERBOSE + printf("backref from %p '%c' to %p '%c' is %d long. sz is now %d\n", + cp1, *cp1, cp2, *cp2, cp2-cp1+1, sz ); +#endif YAZ_CHECK_EQ(i,0); - YAZ_CHECK_EQ((int)c1,(int)c2); + YAZ_CHECK_EQ((int)c1,(int)c2); /* got our pointer back from nfa */ for(i=0;i<1024;i++) outbuf[i]=10000+i; outp=outbuf; @@ -291,6 +298,21 @@ void converter_test() { YAZ_CHECK_EQ(outbuf[5],10000+5); YAZ_CHECK_EQ(sz,11-5); + c3=yaz_nfa_create_range_converter(n,2, 'a', 'A' ); + for(i=0;i<1024;i++) + outbuf[i]=10000+i; + outp=outbuf; + sz=11; + i=yaz_nfa_run_converters(n, c3, &outp, &sz); + YAZ_CHECK_EQ(i,0); + YAZ_CHECK_EQ(outbuf[0],'K'); + YAZ_CHECK_EQ(outbuf[1],'M'); + YAZ_CHECK_EQ(outbuf[2],'N'); + YAZ_CHECK_EQ(outbuf[3],'M'); + YAZ_CHECK_EQ(outbuf[4],'X'); + YAZ_CHECK_EQ(outbuf[5],10000+5); + YAZ_CHECK_EQ(sz,11-5); + yaz_nfa_destroy(n); }