From e9b13c0966913e073e98f9c5d7c5f4deed7c4c46 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 3 Oct 1994 17:23:01 +0000 Subject: [PATCH] First version of dictionary lookup with regular expressions and errors. --- dict/Makefile | 7 +- dict/dicttest.c | 28 +++- dict/lookgrep.c | 395 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/dict.h | 28 ++-- 4 files changed, 443 insertions(+), 15 deletions(-) create mode 100644 dict/lookgrep.c diff --git a/dict/Makefile b/dict/Makefile index 48b276d..c81758c 100644 --- a/dict/Makefile +++ b/dict/Makefile @@ -1,7 +1,7 @@ # Copyright (C) 1994, Index Data I/S # All rights reserved. # Sebastian Hammer, Adam Dickmeiss -# $Id: Makefile,v 1.11 1994-09-26 10:17:23 adam Exp $ +# $Id: Makefile,v 1.12 1994-10-03 17:23:01 adam Exp $ SHELL=/bin/sh INCLUDE=-I../include @@ -10,11 +10,14 @@ TPROG2=dictext CFLAGS=-g -Wall -pedantic DEFS=$(INCLUDE) LIB=../lib/dict.a -PO = dopen.o dclose.o drdwr.o open.o close.o insert.o lookup.o lookupec.o +PO = dopen.o dclose.o drdwr.o open.o close.o insert.o lookup.o \ + lookupec.o lookgrep.o CPP=cc -E all: $(LIB) +alll: $(LIB) $(TPROG1) $(TPROG2) + $(TPROG1): $(TPROG1).o $(LIB) $(CC) $(CFLAGS) -o $(TPROG1) $(TPROG1).o $(LIB) ../lib/bfile.a ../lib/dfa.a ../lib/util.a diff --git a/dict/dicttest.c b/dict/dicttest.c index c461b84..bc34640 100644 --- a/dict/dicttest.c +++ b/dict/dicttest.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: dicttest.c,v $ - * Revision 1.11 1994-09-28 13:07:09 adam + * Revision 1.12 1994-10-03 17:23:03 adam + * First version of dictionary lookup with regular expressions and errors. + * + * Revision 1.11 1994/09/28 13:07:09 adam * Use log_mask_str now. * * Revision 1.10 1994/09/26 10:17:24 adam @@ -63,6 +66,13 @@ static int lookup_handle (Dict_char *name) return 0; } +static int grep_handle (Dict_char *name, char *info) +{ + look_hits++; + printf ("%s\n", name); + return 0; +} + int main (int argc, char **argv) { const char *name = NULL; @@ -74,6 +84,7 @@ int main (int argc, char **argv) int cache = 10; int ret; int unique = 0; + char *grep_pattern = NULL; char *arg; int no_of_iterations = 0; int no_of_new = 0, no_of_same = 0, no_of_change = 0; @@ -84,12 +95,12 @@ int main (int argc, char **argv) if (argc < 2) { fprintf (stderr, "usage:\n " - " %s [-r n] [-u] [-s n] [-v n] [-i f] [-w] [-c n]" + " %s [-r n] [-u] [-g pat] [-s n] [-v n] [-i f] [-w] [-c n]" " base file\n", prog); exit (1); } - while ((ret = options ("r:us:v:i:wc:", argv, argc, &arg)) != -2) + while ((ret = options ("r:ug:s:v:i:wc:", argv, argc, &arg)) != -2) { if (ret == 0) { @@ -103,6 +114,10 @@ int main (int argc, char **argv) exit (1); } } + else if (ret == 'g') + { + grep_pattern = arg; + } else if (ret == 'r') { range = atoi (arg); @@ -240,6 +255,13 @@ int main (int argc, char **argv) log (LOG_LOG, "No of hits.... %d", no_of_hits); log (LOG_LOG, "No of misses.. %d", no_of_misses); } + if (grep_pattern) + { + if (range < 0) + range = 0; + log (LOG_LOG, "Grepping '%s'", grep_pattern); + dict_lookup_grep (dict, grep_pattern, range, grep_handle); + } dict_close (dict); res_close (common_resource); return 0; diff --git a/dict/lookgrep.c b/dict/lookgrep.c new file mode 100644 index 0000000..b71a484 --- /dev/null +++ b/dict/lookgrep.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 1994, Index Data I/S + * All rights reserved. + * Sebastian Hammer, Adam Dickmeiss + * + * $Log: lookgrep.c,v $ + * Revision 1.1 1994-10-03 17:23:04 adam + * First version of dictionary lookup with regular expressions and errors. + * + */ + +#include +#include +#include +#include + +#include +#include + +typedef unsigned MatchWord; +#define WORD_BITS 32 + +typedef struct { + int n; /* no of MatchWord needed */ + int range; /* max no. of errors */ + MatchWord *Sc; /* Mask Sc */ +} MatchContext; + +#define INLINE + +static INLINE void set_bit (MatchContext *mc, MatchWord *m, int ch, int state) +{ + int off = state & (WORD_BITS-1); + int wno = state / WORD_BITS; + + m[mc->n * ch + wno] |= 1<n * ch + wno] &= ~(1<n * ch + wno] & (1<n = (dfas->no+WORD_BITS) / WORD_BITS; + mc->range = range; + mc->Sc = xcalloc (sizeof(*mc->Sc) * 256 * mc->n, 1); + + for (i=0; ino; i++) + { + int j; + DFA_state *state = dfas->sortarray[i]; + + for (j=0; jtran_no; j++) + { + int ch; + int ch0 = state->trans[j].ch[0]; + int ch1 = state->trans[j].ch[1]; + assert (ch0 >= 0 && ch1 >= 0); + + for (ch = ch0; ch <= ch1; ch++) + set_bit (mc, mc->Sc, ch, i); + } + } + return mc; +} + + +static void mask_shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, + DFA_states *dfas, int ch) +{ + int j, s = 0; + MatchWord *Rsrc_p = Rsrc, mask; + +#if 1 + for (j = 0; jn; j++) + Rdst[j] = 0; +#else + Rdst[0] = 1; + for (j = 1; jn; j++) + Rdst[j] = 0; +#endif + while (1) + { + mask = *Rsrc_p++; + for (j = 0; jsortarray[s]; + int i = state->tran_no; + while (--i >= 0) + if (ch >= state->trans[i].ch[0] && + ch <= state->trans[i].ch[1]) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 2) + { + DFA_state *state = dfas->sortarray[s+1]; + int i = state->tran_no; + while (--i >= 0) + if (ch >= state->trans[i].ch[0] && + ch <= state->trans[i].ch[1]) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 4) + { + DFA_state *state = dfas->sortarray[s+2]; + int i = state->tran_no; + while (--i >= 0) + if (ch >= state->trans[i].ch[0] && + ch <= state->trans[i].ch[1]) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 8) + { + DFA_state *state = dfas->sortarray[s+3]; + int i = state->tran_no; + while (--i >= 0) + if (ch >= state->trans[i].ch[0] && + ch <= state->trans[i].ch[1]) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + } + s += 4; + if (s >= dfas->no) + return; + mask >>= 4; + } + } +} + +static void shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, + DFA_states *dfas) +{ + int j, s = 0; + MatchWord *Rsrc_p = Rsrc, mask; + for (j = 0; jn; j++) + Rdst[j] = 0; + while (1) + { + mask = *Rsrc_p++; + for (j = 0; jsortarray[s]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 2) + { + DFA_state *state = dfas->sortarray[s+1]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 4) + { + DFA_state *state = dfas->sortarray[s+2]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + if (mask & 8) + { + DFA_state *state = dfas->sortarray[s+3]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rdst, 0, state->trans[i].to); + } + } + s += 4; + if (s >= dfas->no) + return; + mask >>= 4; + } + } +} + +static void or (MatchContext *mc, MatchWord *Rdst, + MatchWord *Rsrc1, MatchWord *Rsrc2) +{ + int i; + for (i = 0; in; i++) + Rdst[i] = Rsrc1[i] | Rsrc2[i]; +} + +static int move (MatchContext *mc, MatchWord *Rj1, MatchWord *Rj, + Dict_char ch, DFA_states *dfas, MatchWord *Rtmp) +{ + int d; + MatchWord *Rj_a = Rtmp; + MatchWord *Rj_b = Rtmp + mc->n; + MatchWord *Rj_c = Rtmp + 2*mc->n; + + mask_shift (mc, Rj1, Rj, dfas, ch); + for (d = 1; d <= mc->range; d++) + { + mask_shift (mc, Rj_b, Rj+d*mc->n, dfas, ch); /* 1 */ + + or (mc, Rj_a, Rj+(d-1)*mc->n, Rj1+(d-1)*mc->n); /* 2,3 */ + + shift (mc, Rj_c, Rj_a, dfas); + + or (mc, Rj_a, Rj_b, Rj_c); /* 1,2,3*/ + + or (mc, Rj1+d*mc->n, Rj_a, Rj+(d-1)*mc->n); /* 1,2,3,4 */ + } + return 1; + +} + + +static int dict_grep (Dict dict, Dict_ptr ptr, MatchContext *mc, + MatchWord *Rj, int pos, + int (*userfunc)(Dict_char *name, char *info), + Dict_char *prefix, DFA_states *dfas) +{ + int lo, hi; + void *p; + short *indxp; + char *info; + + dict_bf_readp (dict->dbf, ptr, &p); + lo = 0; + hi = DICT_nodir(p)-1; + indxp = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + while (lo <= hi) + { + if (indxp[-lo] > 0) + { + /* string (Dict_char *) DICT_EOS terminated */ + /* unsigned char length of information */ + /* char * information */ + int j; + int was_match = 0; + info = (char*)p + indxp[-lo]; + for (j=0; ; j++) + { + Dict_char ch; + int s, d; + MatchWord *Rj0 = Rj + j *(1+mc->range)*mc->n; + MatchWord *Rj1 = Rj + (j+1)*(1+mc->range)*mc->n; + MatchWord *Rj_tmp = Rj + (j+2)*(1+mc->range)*mc->n; + + memcpy (&ch, info+j*sizeof(Dict_char), sizeof(Dict_char)); + prefix[pos+j] = ch; + if (ch == DICT_EOS) + { + if (was_match) + (*userfunc)(prefix, info+(j+1)*sizeof(Dict_char)); + break; + } + + was_match = 0; + move (mc, Rj1, Rj0, ch, dfas, Rj_tmp); + for (d = mc->n; --d >= 0; ) + if (Rj1[mc->range*mc->n + d]) + break; + if (d < 0) + break; + for (s = 0; sno; s++) + { + if (dfas->sortarray[s]->rule_no) + if (get_bit (mc, Rj1+mc->range*mc->n, 0, s)) + was_match = 1; + } + for (d = 0; d <= mc->range; d++) + reset_bit (mc, Rj1+d*mc->n, 0, dfas->no); + } + } + else + { + MatchWord *Rj1 = Rj+ (1+mc->range)*mc->n; + MatchWord *Rj_tmp = Rj+2*(1+mc->range)*mc->n; + Dict_char ch; + int d; + + /* Dict_ptr subptr */ + /* Dict_char sub char */ + /* unsigned char length of information */ + /* char * information */ + info = (char*)p - indxp[-lo]; + memcpy (&ch, info+sizeof(Dict_ptr), sizeof(Dict_char)); + prefix[pos] = ch; + + move (mc, Rj1, Rj, ch, dfas, Rj_tmp); + for (d = mc->n; --d >= 0; ) + if (Rj1[mc->range*mc->n + d]) + break; + if (d >= 0) + { + Dict_ptr subptr; + if (info[sizeof(Dict_ptr)+sizeof(Dict_char)]) + { + int s; + for (s = 0; sno; s++) + if (dfas->sortarray[s]->rule_no) + if (get_bit (mc, Rj1+mc->range*mc->n, 0, s)) + { + prefix[pos+1] = DICT_EOS; + (*userfunc)(prefix, info+sizeof(Dict_ptr)+ + sizeof(Dict_char)); + break; + } + } + memcpy (&subptr, info, sizeof(Dict_ptr)); + if (subptr) + { + dict_grep (dict, subptr, mc, Rj1, pos+1, + userfunc, prefix, dfas); + dict_bf_readp (dict->dbf, ptr, &p); + indxp = (short*) ((char*) p+DICT_PAGESIZE-sizeof(short)); + } + } + } + lo++; + } + return 0; +} + +int dict_lookup_grep (Dict dict, Dict_char *pattern, int range, + int (*userfunc)(Dict_char *name, char *info)) +{ + MatchWord *Rj; + Dict_char prefix[2048]; + char *this_pattern = pattern; + DFA_states *dfas; + MatchContext *mc; + DFA *dfa = init_dfa(); + int i, d; + + i = parse_dfa (dfa, &this_pattern, dfa_thompson_chars); + if (i || *this_pattern) + { + rm_dfa (&dfa); + return -1; + } + dfa->root = dfa->top; + dfas = mk_dfas (dfa, 200); + rm_dfa (&dfa); + + mc = mk_MatchContext (dfas, range); + + Rj = xcalloc ((1000+dict_strlen(pattern)+range+2)*(range+1)*mc->n, + sizeof(*Rj)); + + set_bit (mc, Rj, 0, 0); + for (d = 1; d<=mc->range; d++) + { + int s; + memcpy (Rj + mc->n * d, Rj + mc->n * (d-1), mc->n * sizeof(*Rj)); + for (s = 0; sno; s++) + { + if (get_bit (mc, Rj, d-1, s)) + { + DFA_state *state = dfas->sortarray[s]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rj, d, state->trans[i].to); + } + } + } + i = dict_grep (dict, 1, mc, Rj, 0, userfunc, prefix, dfas); + + rm_dfas (&dfas); + xfree (Rj); + return i; +} + + + diff --git a/include/dict.h b/include/dict.h index 4dd87ca..0deb357 100644 --- a/include/dict.h +++ b/include/dict.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: dict.h,v $ - * Revision 1.7 1994-09-22 10:44:47 adam + * Revision 1.8 1994-10-03 17:23:11 adam + * First version of dictionary lookup with regular expressions and errors. + * + * Revision 1.7 1994/09/22 10:44:47 adam * Don't remember what changed!! * * Revision 1.6 1994/09/16 15:39:21 adam @@ -57,15 +60,15 @@ typedef struct Dict_file_struct { int cache; BFile bf; - + struct Dict_file_block *all_blocks; struct Dict_file_block *free_list; struct Dict_file_block **hash_array; - + struct Dict_file_block *lru_back, *lru_front; int hash_size; void *all_data; - + int block_size; int hits; int misses; @@ -75,24 +78,29 @@ typedef struct Dict_struct { int rw; Dict_BFile dbf; struct Dict_head head; -} *Dict; +} +*Dict; + +#define DICT_MAGIC "dict00" + +#define DICT_PAGESIZE 8192 int dict_bf_readp (Dict_BFile bf, int no, void **bufp); + int dict_bf_newp (Dict_BFile bf, int no, void **bufp); int dict_bf_touch (Dict_BFile bf, int no); void dict_bf_flush_blocks (Dict_BFile bf, int no_to_flush); Dict_BFile dict_bf_open (const char *name, int block_size, int cache, int rw); int dict_bf_close (Dict_BFile dbf); -#define DICT_MAGIC "dict00" - -#define DICT_PAGESIZE 8192 - + Dict dict_open (const char *name, int cache, int rw); int dict_close (Dict dict); int dict_insert (Dict dict, const Dict_char *p, int userlen, void *userinfo); char *dict_lookup (Dict dict, Dict_char *p); int dict_lookup_ec (Dict dict, Dict_char *p, int range, - int (*f)(Dict_char*name)); + int (*f)(Dict_char *name)); +int dict_lookup_grep (Dict dict, Dict_char *p, int range, + int (*f)(Dict_char *name, char *info)); int dict_strcmp (const Dict_char *s1, const Dict_char *s2); int dict_strlen (const Dict_char *s); -- 1.7.10.4