X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=include%2Fyaz%2Fnfa.h;h=ddce4af69578c537a80633e8ded19b45703d5035;hb=d58b3c28b5efc7106f6ed65cd52c8ad56de7b6b9;hp=7e256af771daa1ca74e95857f0a2239de8f52e05;hpb=68175726e7a40ecd8bd16e63605b2196fbeffb9e;p=yaz-moved-to-github.git diff --git a/include/yaz/nfa.h b/include/yaz/nfa.h index 7e256af..ddce4af 100644 --- a/include/yaz/nfa.h +++ b/include/yaz/nfa.h @@ -1,6 +1,6 @@ /* Copyright (C) 2006, Index Data ApS * See the file LICENSE for details. - * $Id: nfa.h,v 1.4 2006-05-04 18:58:24 adam Exp $ + * $Id: nfa.h,v 1.7 2006-05-10 13:58:46 heikki Exp $ */ /** @@ -15,6 +15,13 @@ * possible sequence of input characters that match the ranges in the * conditions, and that leads into a terminal state. * + * Separate from this we have converters. Those can often be used + * together with a NFA (think match-pattern and replace-pattern). + * + * A converter is a routine that produces some output. It can translate a + * range of characters into another range, emit a constant string, or + * something like that. + * */ #ifndef YAZ_NFA_H @@ -24,7 +31,37 @@ YAZ_BEGIN_CDECL -/** \brief Internal character type */ +/** \name return codes and data types*/ +/* \{ */ +/** \brief Success */ +#define YAZ_NFA_SUCCESS 0 + +/** \brief no match found */ +#define YAZ_NFA_NOMATCH 1 + +/** \brief Need more input */ +#define YAZ_NFA_OVERRUN 2 + +/** \brief The NFA is looping */ +#define YAZ_NFA_LOOP 3 + +/** \brief no room in output buffer */ +#define YAZ_NFA_NOSPACE 4 + +/** \brief tryig to set a result when one already exists*/ +#define YAZ_NFA_ALREADY 5 + +/** \brief Attempting to set an end to a backref that has not been started */ +#define YAZ_NFA_NOSTART 6 + +/** \brief Asking for a non-existing backref */ +#define YAZ_NFA_NOSUCHBACKREF 7 + +/** \brief Internal error, should never happen */ +#define YAZ_NFA_INTERNAL 8 + + +/** \brief Internal character type. 32-bit unicode! */ typedef unsigned int yaz_nfa_char; @@ -38,6 +75,13 @@ typedef struct yaz_nfa_state yaz_nfa_state; /** \brief Transition from one state to another */ typedef struct yaz_nfa_transition yaz_nfa_transition; +/** \brief A converter produces some output to a buffer */ +typedef struct yaz_nfa_converter yaz_nfa_converter; + +/* \} */ + +/** \name Low-level interface to building the NFA */ +/* \{ */ /** \brief Initialize the NFA without any states in it * @@ -64,17 +108,19 @@ yaz_nfa_state *yaz_nfa_add_state( /** \brief Sets the result pointer to a state * - * Call with null to clear the pointer. + * \param n the NFA itself + * \param s the state to which the result will be added + * \param result the result pointer + * + * Sets the result pointer of a state. If already set, returns an error. Call + * with a NULL pointer to clear the result, before setting a new one. * - * \retval 0 ok - * \retval 1 The state already has a result! + * \retval YAZ_NFA_SUCCESS ok + * \retval YAZ_NFA_ALREADY The state already has a result! */ int yaz_nfa_set_result( - /** The NFA itsef */ yaz_nfa *n, - /** The state to which the result is added */ yaz_nfa_state *s, - /** The result. The NFA does not care what it is, just stores it */ void *result ); @@ -96,11 +142,12 @@ void *yaz_nfa_get_result( * * \param n the nfa * \param s the state to add to - * \param backref_number is the number of the back reference. 0 for clearing + * \param backref_number is the number of the back reference. * \param is_start is 1 for start of the backref, 0 for end - * \retval 0 for OK - * \retval 1 if the backref is already set - * \retval 2 for ending a backref that has not been started + * + * \retval YAZ_NFA_SUCCESS for OK + * \retval YAZ_NFA_ALREADY if the backref is already set + * \retval YAZ_NFA_NOSTART for ending a backref that has not been started * */ @@ -124,7 +171,7 @@ int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s, * Add a transition between two existing states. The condition * is (as always) a range of yaz_nfa_chars. * \param n the nfa - * \param from_state which state the transition is from + * \param from_state which state the transition is from. null=initial * \param to_state where the transition goes to * \param range_start is the beginning of the range of values * \param range_end is the end of the range of values @@ -163,41 +210,49 @@ yaz_nfa_state *yaz_nfa_add_range( yaz_nfa *n, /** \brief Add a sequence of transitions and states. * + * \param n the nfa + * \param s the state to add this to. If null, adds to the initial state + * \param seq is a sequence of yaz_fna_chars. + * \param seq_len is the length of the sequence + * \Return the final state + * * Starting from state s (or from the initial state, if s is * null), finds as much of seq as possible and inserts the rest. - * \Return the final state */ yaz_nfa_state *yaz_nfa_add_sequence( yaz_nfa *n, yaz_nfa_state *s, - yaz_nfa_char *seq ); + yaz_nfa_char *seq, + size_t seq_len ); +/** \} */ + +/** \name Low-level interface for mathcing the NFA. */ +/* + * These do the actual matching. They know nothing of + * the type of the result pointers + */ +/** \{ */ /** \brief Find the longest possible match. * * \param n the nfa itself * \param inbuff buffer of input data. Will be incremented when match - * \param incharsleft max number of inchars to use from inbuff + * \param incharsleft max number of inchars to use from inbuff. decrements. * \param result the result pointer from the nfa (what ever that is) * * In case of errors, returns the best match so far, * which the caller is free to ignore. * - * \retval 0 success - * \retval 1 no match found - * \retval 2 overrun'of input buffer - * \retval 3 looping too far + * \retval YAZ_NFA_SUCCESS success + * \retval YAZ_NFA_NOMATCH no match found + * \retval YAZ_NFA_OVERRUN overrun of input buffer + * \retval YAZ_NFA_LOOP looping too far * */ -int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t incharsleft, +int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t *incharsleft, void **result ); -/** yaz_nfa_match return codes */ -#define YAZ_NFA_SUCCESS 0 -#define YAZ_NFA_NOMATCH 1 -#define YAZ_NFA_OVERRUN 2 -#define YAZ_NFA_LOOP 3 - /** \brief Get a back reference after a successfull match. * * \param n the nfa @@ -214,9 +269,9 @@ int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t incharsleft, * not the end of a backref. It is up to the caller to decide how * to handle such a situation. * - * \retval 0 OK - * \retval 1 no match - * \retval 2 no such backref + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_NOMATCH The NFA hasn't matched anything, no backref + * \retval YAZ_NFA_NOSUCHBACKREF no such backref */ int yaz_nfa_get_backref( yaz_nfa *n, @@ -224,6 +279,224 @@ int yaz_nfa_get_backref( yaz_nfa *n, yaz_nfa_char **start, yaz_nfa_char **end ); +/* \} */ + +/** \name Low-level interface to the converters */ +/* These produce some output text into a buffer. There are a few + * kinds of converters, each producing different type of output. + */ +/* \{ */ + +/** \brief Create a string converter. + * \param n the nfa + * \param string the string to output + * \param length how many chars in the string + * + * This converter produces a constant string in the output + */ +yaz_nfa_converter *yaz_nfa_create_string_converter ( + yaz_nfa *n, + yaz_nfa_char *string, + size_t length ); + +/** \brief Create a backref converter + * \param n the nfa + * \param backref_no The backreference to reproduce + * + * This converter copies a backref into the output buffer + */ +yaz_nfa_converter *yaz_nfa_create_backref_converter ( + yaz_nfa *n, int backref_no ); + + +/** \brief Create a charcater range converter + * \param n the nfa + * \param backref_no The backreference to reproduce + * \param from_char the first character of the original range + * \param to_char the first character of the target range + * + * This converter takes a backreference, and shifts the characters + * by a constant value. For example, translating a-z to A-Z. + * Note that backref 0 is always the last character that matched a + * range, even if no backrefs were defined in teh nfa. This makes + * it pretty useful with this converter. + * + */ +yaz_nfa_converter *yaz_nfa_create_range_converter ( + yaz_nfa *n, int backref_no, + yaz_nfa_char from_char, + yaz_nfa_char to_char); + + +/** \brief Connects converters in a chain. + * \param n the nfa (mostly for nmem access) + * \param startpoint the first converter in the chain + * \param newconverter + * + * Places the new converter at the end of the chain that starts from + * startpoint. + * + */ +void yaz_nfa_append_converter ( + yaz_nfa *n, + yaz_nfa_converter *startpoint, + yaz_nfa_converter *newconverter); + +/** brief Runs the chain of converters. + * \param n the nfa (mostly for nmem access) + * \param c the first converter in a chain + * \param outbuff buffer to write the output in. Increments the ptr. + * \param outcharsleft how many may we write + * + * Runs the converters in the chain, placing output into outbuff + * (and incrementing the pointer). + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_NOMATCH no match to get backrefs from + * \retval YAZ_NFA_NOSPACE no room in outbuf + * \retval YAZ_NFA_INTERNAL Should never happen + * + */ +int yaz_nfa_run_converters( + yaz_nfa *n, + yaz_nfa_converter *c, + yaz_nfa_char **outbuff, + size_t *outcharsleft); + +/** \} */ + +/** \name High-level interface to the NFA */ +/* This interface combines the NFA and converters, for ease of + * access. There are a few calls to build a complete system, and a call + * to do the actual conversion. + */ +/* \{ */ + +/** \brief Add a rule that converts one string to another ('IX' -> '9') + * + * \param n The nfa itself + * \param from_string the string to match + * \param from_length length of the from_string + * \param to_string the string to write in the output + * \param to_length length of the to_string + * + * Adds a matching rule and a string converter to the NFA. + * Can be used for converting strings into nothing, for example, + * to remove markup. + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_ALREADY Conflict with some other rule + * + */ +int yaz_nfa_add_string_rule( yaz_nfa *n, + yaz_nfa_char *from_string, + size_t from_length, + yaz_nfa_char *to_string, + size_t to_length); + +/** brief Just like yaz_nfa_add_string_rule, but takes the strings in ascii + * + * \param n The nfa itself + * \param from_string the string to match + * \param to_string the string to write in the output + * + * Like yaz_nfa_add_string_rule, this adds a rule to translate a string + * into another. The only difference is that this one takes the strings as + * normal char *, which means that no high-valued unicodes can be handled, + * and that this one uses null-terminated strings. In short, this is a + * simplified version mostly intended for tests and other small uses. + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_ALREADY Conflict with some other rule + */ +int yaz_nfa_add_ascii_string_rule( yaz_nfa *n, + char *from_string, + char *to_string); + + +/** \brief Add a rule that converts a character range + * + * \param n The nfa itself + * \param range_start Where the matching range starts + * \param range_end Where the matching range ends + * \param output_range_start Where the resulting range starts + * + * + * Adds a character range rule to the NFA. The range to be converted + * is defined by the range_start and range_end parameters. The output + * range starts at output_range_start, and is automatically as long + * as the input range. + * + * Useful for alphabet normalizing [a-z] -> [A-Z] + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_ALREADY Conflict with some other rule + */ +int yaz_nfa_add_char_range_rule (yaz_nfa *n, + yaz_nfa_char range_start, + yaz_nfa_char range_end, + yaz_nfa_char output_range_start); + +/** \brief Add a rule that converts a character range to a string + * + * \param n The nfa itself + * \param range_start Where the matching range starts + * \param range_end Where the matching range ends + * \param to_string the string to write in the output + * \param to_length length of the to_string + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_ALREADY Conflict with some other rule + * + * Adds a character range match rule, and a string converter. + * + * Useful in converting a range of special characters into (short?) + * strings of whitespace, or even to nothing at all. + */ +int yaz_nfa_add_char_string_rule (yaz_nfa *n, + yaz_nfa_char range_start, + yaz_nfa_char range_end, + yaz_nfa_char* to_string, + size_t to_length); + +/** \brief Converts one 'slice' that is, the best matching rule. + * + * \param n the nfa itself + * \param inbuff buffer of input data. Will be incremented when match + * \param incharsleft max number of inchars to use from inbuff. decrements. + * \param outbuff buffer for output data. Will be incremented when match + * \param outcharsleft max number of chars to write to outbuff. + * + * \retval YAZ_NFA_SUCCESS OK + * \retval YAZ_NFA_OVERRUN No more input data, some pattern could match + * \retval YAZ_NFA_NOSPACE No room in the putput buffer + * \retval YAZ_NFA_NOSUCHBACKREF NFA refers to a non-existing backref + * + * Finds the best match at the beginning of inbuf, and fires its converter(s) + * to produce output in outbuff. Increments both inbuf and outbuf pointers and + * decrements the *charsleft values, so all is ready for calling again, until + * the buffer is exhausted. That loop is left to the caller, so he can load + * more data in the buffer in good time. + * + * If no match is found, converts one character into itself. If the matcher + * returns any sort of error, leaves the pointers where they were. + */ +int yaz_nfa_convert_slice (yaz_nfa *n, + yaz_nfa_char **inbuff, + size_t *incharsleft, + yaz_nfa_char **outbuff, + size_t *outcharsleft); + + +/* \} */ + +/** \name Debug routines */ +/* These provide a method for traversing all the states defined + * in the NFA, for example to release memory allocated in the results, + * and a simple debug routine to dump the NFA */ +/* \{ */ + + /** \brief Get the first state of the NFA. * * \param n the nfa @@ -258,6 +531,9 @@ yaz_nfa_state *yaz_nfa_get_next(yaz_nfa *n, yaz_nfa_state *s); */ void yaz_nfa_dump(FILE *F, yaz_nfa *n, char *(*strfunc)(void *) ); +/* \} */ + + YAZ_END_CDECL