From 25711769a3fb5c6bf0ab3eb9634bf74ba07dc48d Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 27 Sep 1994 16:31:17 +0000 Subject: [PATCH] First version of grepper: grep with error correction. --- dfa/Makefile | 15 ++- dfa/agrep.c | 141 +++++++++++++------------ dfa/grepper.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ dfa/imalloc.c | 7 +- dfa/lexer.c | 67 ++++++------ 5 files changed, 446 insertions(+), 109 deletions(-) create mode 100644 dfa/grepper.c diff --git a/dfa/Makefile b/dfa/Makefile index 5c5c02e..b0ad65d 100644 --- a/dfa/Makefile +++ b/dfa/Makefile @@ -1,13 +1,14 @@ # Copyright (C) 1994, Index Data I/S # All rights reserved. # Sebastian Hammer, Adam Dickmeiss -# $Id: Makefile,v 1.2 1994-09-26 16:30:55 adam Exp $ +# $Id: Makefile,v 1.3 1994-09-27 16:31:17 adam Exp $ SHELL=/bin/sh INCLUDE=-I../include TPROG1=agrep TPROG2=lexer -CFLAGS=-g -Wall -pedantic +TPROG3=grepper +CFLAGS=-g -Wall DEFS=$(INCLUDE) -DYACC -DYYDEBUG=1 -DMEMDEBUG=1 LIB=../lib/dfa.a PO = regexp.o imalloc.o states.o set.o bset.o @@ -16,12 +17,16 @@ YACC=yacc all: $(LIB) +alll: $(LIB) $(TPROG1) $(TPROG2) $(TPROG3) + $(TPROG1): $(TPROG1).o $(LIB) $(CC) $(CFLAGS) -o $(TPROG1) $(TPROG1).o $(LIB) ../lib/util.a $(TPROG2): $(TPROG2).o readfile.o $(LIB) - $(CC) $(CFLAGS) -o $(TPROG2) $(TPROG2).o readfile.o \ - $(LIB) ../lib/util.a + $(CC) $(CFLAGS) -o $(TPROG2) $(TPROG2).o readfile.o $(LIB) ../lib/util.a + +$(TPROG3): $(TPROG3).o $(LIB) + $(CC) $(CFLAGS) -o $(TPROG3) $(TPROG3).o $(LIB) ../lib/util.a $(LIB): $(PO) rm -f $(LIB) @@ -37,7 +42,7 @@ $(LIB): $(PO) mv y.tab.o $*.o clean: - rm -f *.[oa] $(TPROG1) $(TPROG2) core mon.out gmon.out errlist y.tab.c + rm -f *.[oa] $(TPROG1) $(TPROG2) $(TPROG3) core mon.out gmon.out errlist y.tab* depend: depend2 diff --git a/dfa/agrep.c b/dfa/agrep.c index 3c4e242..c8cdf7e 100644 --- a/dfa/agrep.c +++ b/dfa/agrep.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: agrep.c,v $ - * Revision 1.2 1994-09-26 16:30:56 adam + * Revision 1.3 1994-09-27 16:31:18 adam + * First version of grepper: grep with error correction. + * + * Revision 1.2 1994/09/26 16:30:56 adam * Minor changes. imalloc uses xmalloc now. * * Revision 1.1 1994/09/26 10:16:52 adam @@ -34,14 +37,14 @@ static char *prog; -void error( const char *format, ... ) +void error (const char *format, ...) { va_list argptr; - va_start( argptr, format ); - fprintf( stderr, "%s error: ", prog ); - (void) vfprintf( stderr, format, argptr ); - putc( '\n', stderr ); - exit( 1 ); + va_start (argptr, format); + fprintf (stderr, "%s error: ", prog); + (void) vfprintf (stderr, format, argptr); + putc ('\n', stderr); + exit (1); } #ifdef YYDEBUG @@ -54,18 +57,18 @@ extern int alexdebug; static int show_lines = 0; -int agrep_options( argc, argv ) +int agrep_options (argc, argv) int argc; char **argv; { - while( --argc > 0 ) - if( **++argv == '-' ) - while( *++*argv ) + while (--argc > 0) + if (**++argv == '-') + while (*++*argv) { - switch( **argv ) + switch (**argv) { case 'V': - fprintf( stderr, "%s: %s %s\n", prog, __DATE__, __TIME__ ); + fprintf (stderr, "%s: %s %s\n", prog, __DATE__, __TIME__); continue; case 'v': dfa_verbose = 1; @@ -83,7 +86,7 @@ char **argv; continue; #endif case 'd': - switch( *++*argv ) + switch (*++*argv) { case 's': debug_dfa_tran = 1; @@ -102,7 +105,7 @@ char **argv; } continue; default: - fprintf( stderr, "%s: unknown option `-%s'\n", prog, *argv ); + fprintf (stderr, "%s: unknown option `-%s'\n", prog, *argv); return 1; } break; @@ -115,21 +118,21 @@ static char *inf_buf; static char *inf_ptr, *inf_flsh; static int inf_eof, line_no; -static int inf_flush( fd ) +static int inf_flush (fd) int fd; { char *p; unsigned b, r; r = (unsigned) (inf_buf+INF_BUF_SIZE - inf_ptr); /* no of `wrap' bytes */ - if( r ) - memcpy( inf_buf, inf_ptr, r ); + if (r) + memcpy (inf_buf, inf_ptr, r); inf_ptr = p = inf_buf + r; b = INF_BUF_SIZE - r; do - if( (r = read( fd, p, b ) ) == (unsigned) -1 ) + if ((r = read (fd, p, b)) == (unsigned) -1) return -1; - else if( r ) + else if (r) p += r; else { @@ -137,36 +140,36 @@ int fd; inf_eof = 1; break; } - while( (b -= r) > 0 ); - while( p != inf_buf && *--p != '\n' ) + while ((b -= r) > 0); + while (p != inf_buf && *--p != '\n') ; - while( p != inf_buf && *--p != '\n' ) + while (p != inf_buf && *--p != '\n') ; inf_flsh = p+1; return 0; } -static char *prline( p ) +static char *prline (p) char *p; { char *p0; --p; - while( p != inf_buf && p[-1] != '\n' ) + while (p != inf_buf && p[-1] != '\n') --p; p0 = p; - while( *p++ != '\n' ) + while (*p++ != '\n') ; p[-1] = '\0'; - if( show_lines ) - printf( "%5d:\t%s\n", line_no, p0 ); + if (show_lines) + printf ("%5d:\t%s\n", line_no, p0); else - puts( p0 ); + puts (p0); p[-1] = '\n'; return p; } -static int go( fd, dfaar ) +static int go (fd, dfaar) int fd; DFA_state **dfaar; { @@ -176,40 +179,40 @@ DFA_state **dfaar; int i; unsigned char c; - while( 1 ) + while (1) { - for( c = *inf_ptr++, t=s->trans, i=s->tran_no; --i >= 0; t++ ) - if( c >= t->ch[0] && c <= t->ch[1] ) + for (c = *inf_ptr++, t=s->trans, i=s->tran_no; --i >= 0; t++) + if (c >= t->ch[0] && c <= t->ch[1]) { p = inf_ptr; do { - if( (s = dfaar[t->to] )->rule_no ) + if ((s = dfaar[t->to])->rule_no) { - inf_ptr = prline( inf_ptr ); + inf_ptr = prline (inf_ptr); c = '\n'; break; } - for( t=s->trans, i=s->tran_no; --i >= 0; t++ ) - if( (unsigned) *p >= t->ch[0] - && (unsigned) *p <= t->ch[1] ) + for (t=s->trans, i=s->tran_no; --i >= 0; t++) + if ((unsigned) *p >= t->ch[0] + && (unsigned) *p <= t->ch[1]) break; p++; - } while( i >= 0 ); + } while (i >= 0); s = dfaar[0]; break; } - if( c == '\n' ) + if (c == '\n') { ++line_no; - if( inf_ptr == inf_flsh ) + if (inf_ptr == inf_flsh) { - if( inf_eof ) + if (inf_eof) break; ++line_no; - if( inf_flush( fd ) ) + if (inf_flush (fd)) { - fprintf( stderr, "%s: read error\n", prog ); + fprintf (stderr, "%s: read error\n", prog); return -1; } } @@ -218,24 +221,24 @@ DFA_state **dfaar; return 0; } -int agrep( dfas, fd ) +int agrep (dfas, fd) DFA_states *dfas; int fd; { - inf_buf = imalloc( sizeof(char)*INF_BUF_SIZE ); + inf_buf = imalloc (sizeof(char)*INF_BUF_SIZE); inf_eof = 0; inf_ptr = inf_buf+INF_BUF_SIZE; - inf_flush( fd ); + inf_flush (fd); line_no = 1; - go( fd, dfas->sortarray); + go (fd, dfas->sortarray); - ifree( inf_buf ); + ifree (inf_buf); return 0; } -int main( argc, argv ) +int main (argc, argv) int argc; char **argv; { @@ -253,42 +256,42 @@ char **argv; alexdebug = 0; #endif #endif - setbuf( stdout, outbuf ); - i = agrep_options( argc, argv ); - if( i ) + setbuf (stdout, outbuf); + i = agrep_options (argc, argv); + if (i) return i; - while( --argc > 0 ) - if( **++argv != '-' && **argv ) - if( !pattern ) + while (--argc > 0) + if (**++argv != '-' && **argv) + if (!pattern) { pattern = *argv; - i = parse_dfa( dfa, &pattern, dfa_thompson_chars ); - if( i || *pattern ) + i = parse_dfa (dfa, &pattern, dfa_thompson_chars); + if (i || *pattern) { - fprintf( stderr, "%s: illegal pattern\n", prog ); + fprintf (stderr, "%s: illegal pattern\n", prog); return 1; } dfa->root = dfa->top; - dfas = mk_dfas( dfa, 200 ); - rm_dfa( &dfa ); + dfas = mk_dfas (dfa, 200); + rm_dfa (&dfa); } else { ++no; - fd = open( *argv, O_RDONLY | O_BINARY); - if( fd == -1 ) + fd = open (*argv, O_RDONLY | O_BINARY); + if (fd == -1) { - fprintf( stderr, "%s: couldn't open `%s'\n", prog, *argv ); + fprintf (stderr, "%s: couldn't open `%s'\n", prog, *argv); return 1; } - i = agrep( dfas, fd ); - close( fd ); - if( i ) + i = agrep (dfas, fd); + close (fd); + if (i) return i; } - if( !no ) + if (!no) { - fprintf( stderr, "%s: no files specified\n", prog ); + fprintf (stderr, "%s: no files specified\n", prog); return 2; } fflush(stdout); diff --git a/dfa/grepper.c b/dfa/grepper.c new file mode 100644 index 0000000..afc8927 --- /dev/null +++ b/dfa/grepper.c @@ -0,0 +1,325 @@ +/* + * Copyright (C) 1994, Index Data I/S + * All rights reserved. + * Sebastian Hammer, Adam Dickmeiss + * + * $Log: grepper.c,v $ + * Revision 1.1 1994-09-27 16:31:18 adam + * First version of grepper: grep with error correction. + * + */ +#include +#include +#include +#include +#include + +#include +#include +#include "imalloc.h" + +char *prog; +static int show_line = 0; + +typedef unsigned MatchWord; +#define WORD_BITS 32 + +typedef struct { + int n; /* no of MatchWord needed */ + int range; /* max no. of errors */ + MatchWord *Sc; /* Mask Sc */ +} MatchContext; + +#define INFBUF_SIZE 16384 + +static inline void set_bit (MatchContext *mc, MatchWord *m, int ch, int state) +{ + int off = state & (WORD_BITS-1); + int wno = state / WORD_BITS; + + m[mc->n * ch + wno] |= 1<n * ch + wno] &= ~(1<n * ch + wno] & (1<n = (dfas->no+WORD_BITS) / WORD_BITS; + mc->range = range; + mc->Sc = icalloc (sizeof(*mc->Sc) * 256 * mc->n); + + for (i=0; ino; i++) + { + int j; + DFA_state *state = dfas->sortarray[i]; + + for (j=0; jtran_no; j++) + { + int ch; + int ch0 = state->trans[j].ch[0]; + int ch1 = state->trans[j].ch[1]; + assert (ch0 >= 0 && ch1 >= 0); + + for (ch = ch0; ch <= ch1; ch++) + set_bit (mc, mc->Sc, ch, i); + } + } + return mc; +} + +static void mask_shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, + DFA_states *dfas, int ch) +{ + int i, s; + Rdst[0] = 1; + for (i = 1; in; i++) + Rdst[i] = 0; + for (s = 0; sno; s++) + if (get_bit (mc, Rsrc, 0, s)) + { + DFA_state *state = dfas->sortarray[s]; + int i = state->tran_no; + while (--i >= 0) + if (ch >= state->trans[i].ch[0] && ch <= state->trans[i].ch[1]) + set_bit (mc, Rdst, 0, state->trans[i].to); + } +} + +static void shift (MatchContext *mc, MatchWord *Rdst, MatchWord *Rsrc, + DFA_states *dfas) +{ + int i, s; + for (i = 0; in; i++) + Rdst[i] = 0; + for (s = 0; sno; s++) + if (get_bit (mc, Rsrc, 0, s)) + { + DFA_state *state = dfas->sortarray[s]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rdst, 0, state->trans[i].to); + } +} + +static void or (MatchContext *mc, MatchWord *Rdst, + MatchWord *Rsrc1, MatchWord *Rsrc2) +{ + int i; + for (i = 0; in; i++) + Rdst[i] = Rsrc1[i] | Rsrc2[i]; +} + + +static int go (MatchContext *mc, DFA_states *dfas, FILE *inf) +{ + MatchWord *Rj, *Rj1, *Rj_a, *Rj_b; + int s, d, ch; + int lineno = 1; + char *infbuf; + int inf_ptr = 1; + int no_match = 0; + + infbuf = imalloc (INFBUF_SIZE); + infbuf[0] = '\n'; + Rj = icalloc (mc->n * (mc->range+1) * sizeof(*Rj)); + Rj1 = icalloc (mc->n * (mc->range+1) * sizeof(*Rj)); + Rj_a = icalloc (mc->n * sizeof(*Rj)); + Rj_b = icalloc (mc->n * sizeof(*Rj)); + + set_bit (mc, Rj, 0, 0); + for (d = 1; d<=mc->range; d++) + { + int s; + memcpy (Rj + mc->n * d, Rj + mc->n * (d-1), mc->n * sizeof(*Rj)); + for (s = 0; sno; s++) + { + if (get_bit (mc, Rj, d-1, s)) + { + DFA_state *state = dfas->sortarray[s]; + int i = state->tran_no; + while (--i >= 0) + set_bit (mc, Rj, d, state->trans[i].to); + } + } + } + while ((ch = getc (inf)) != EOF) + { + MatchWord *Rj_t; + if (ch == '\n') + { + if (no_match) + { + int i = inf_ptr; + if (show_line) + printf ("%5d:", lineno); + while (infbuf[i] != '\n') + { + if (--i < 0) + i = INFBUF_SIZE-1; + } + do + { + if (++i == INFBUF_SIZE) + i = 0; + putchar (infbuf[i]); + } while (infbuf[i] != '\n'); + no_match = 0; + } + lineno++; + } + infbuf[inf_ptr++] = ch; + if (inf_ptr == INFBUF_SIZE) + inf_ptr = 0; + mask_shift (mc, Rj1, Rj, dfas, ch); + for (d = 1; d <= mc->range; d++) + { + mask_shift (mc, Rj_b, Rj+d*mc->n, dfas, ch);/* 1 */ + + shift (mc, Rj_a, Rj+(d-1)*mc->n, dfas); /* 2 */ + + or (mc, Rj_a, Rj_a, Rj_b); /* 1,2 */ + + shift (mc, Rj_b, Rj1+(d-1)*mc->n, dfas); /* 3 */ + + or (mc, Rj_a, Rj_a, Rj_b); /* 1,2,3 */ + + or (mc, Rj1+d*mc->n, Rj_a, Rj+(d-1)*mc->n); /* 1,2,3,4 */ + } + for (s = 0; sno; s++) + { + if (dfas->sortarray[s]->rule_no) + if (get_bit (mc, Rj1+mc->range*mc->n, 0, s)) + no_match++; + } + for (d = 0; d <= mc->range; d++) + reset_bit (mc, Rj1+d*mc->n, 0, dfas->no); + Rj_t = Rj1; + Rj1 = Rj; + Rj = Rj_t; + } + ifree (Rj); + ifree (Rj1); + ifree (Rj_a); + ifree (Rj_b); + ifree (infbuf); + return 0; +} + +static int grep_file (DFA_states *dfas, const char *fname, int range) +{ + FILE *inf; + MatchContext *mc; + + if (fname) + { + inf = fopen (fname, "r"); + if (!inf) + { + log (LOG_FATAL|LOG_ERRNO, "cannot open `%s'", fname); + exit (1); + } + } + else + inf = stdin; + + mc = mk_MatchContext (dfas, range); + + go (mc, dfas, inf); + + if (fname) + fclose (inf); + return 0; +} + +int main (int argc, char **argv) +{ + int ret; + int range = 0; + char *arg; + char *pattern = NULL; + DFA_states *dfas = NULL; + int no_files = 0; + + prog = argv[0]; + while ((ret = options ("nr:dsv:", argv, argc, &arg)) != -2) + { + if (ret == 0) + { + if (!pattern) + { + int i; + DFA *dfa = init_dfa(); + pattern = arg; + i = parse_dfa (dfa, &pattern, dfa_thompson_chars); + if (i || *pattern) + { + fprintf (stderr, "%s: illegal pattern\n", prog); + return 1; + } + dfa->root = dfa->top; + dfas = mk_dfas (dfa, 200); + rm_dfa (&dfa); + } + else + { + no_files++; + grep_file (dfas, arg, range); + } + } + else if (ret == 'v') + { + log_init (atoi(arg), prog, NULL); + } + else if (ret == 's') + { + dfa_verbose = 1; + } + else if (ret == 'd') + { + debug_dfa_tran = 1; + debug_dfa_followpos = 1; + debug_dfa_trav = 1; + } + else if (ret == 'r') + { + range = atoi (arg); + } + else if (ret == 'n') + { + show_line = 1; + } + else + { + log (LOG_FATAL, "unknown option"); + exit (1); + } + } + if (!pattern) + { + fprintf (stderr, "usage:\n " + " %s [-d] [-n] [-r n] [-s] [-v n] pattern file ..\n", prog); + exit (1); + } + else if (no_files == 0) + { + grep_file (dfas, NULL, range); + } + return 0; +} diff --git a/dfa/imalloc.c b/dfa/imalloc.c index 7e9ec71..a0a98f1 100644 --- a/dfa/imalloc.c +++ b/dfa/imalloc.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: imalloc.c,v $ - * Revision 1.2 1994-09-26 16:30:56 adam + * Revision 1.3 1994-09-27 16:31:19 adam + * First version of grepper: grep with error correction. + * + * Revision 1.2 1994/09/26 16:30:56 adam * Minor changes. imalloc uses xmalloc now. * * Revision 1.1 1994/09/26 10:16:54 adam @@ -73,7 +76,7 @@ void *icalloc (size_t size) ++alloc_calls; return (void *)p; #else - void p = (void) xcalloc( size, 1 ); + void *p = (void) xcalloc( size, 1 ); if( !p ) log (LOG_FATAL, "Out of memory (icalloc)" ); return p; diff --git a/dfa/lexer.c b/dfa/lexer.c index 98337d1..9729d9e 100644 --- a/dfa/lexer.c +++ b/dfa/lexer.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: lexer.c,v $ - * Revision 1.1 1994-09-26 10:16:55 adam + * Revision 1.2 1994-09-27 16:31:20 adam + * First version of grepper: grep with error correction. + * + * Revision 1.1 1994/09/26 10:16:55 adam * First version of dfa module in alex. This version uses yacc to parse * regular expressions. This should be hand-made instead. * @@ -27,14 +30,14 @@ static char *prog; -void error( const char *format, ... ) +void error (const char *format, ...) { va_list argptr; - va_start( argptr, format ); - fprintf( stderr, "%s error: ", prog ); - (void) vfprintf( stderr, format, argptr ); - putc( '\n', stderr ); - exit( 1 ); + va_start (argptr, format); + fprintf (stderr, "%s error: ", prog); + (void) vfprintf (stderr, format, argptr); + putc ('\n', stderr); + exit (1); } #ifdef YACC @@ -44,21 +47,17 @@ extern int alexdebug; #endif int ccluse = 0; -static int alex_options (int argc, char **argv); - -static int alex_options (int argc, char **argv) +static int lexer_options (int argc, char **argv) { - while( --argc > 0 ) - if( **++argv == '-' ) - while( *++*argv ) + while (--argc > 0) + if (**++argv == '-') + while (*++*argv) { - switch( **argv ) + switch (**argv) { -#ifdef __STDC__ case 'V': - fprintf( stderr, "%s: %s %s\n", prog, __DATE__, __TIME__ ); + fprintf (stderr, "%s: %s %s\n", prog, __DATE__, __TIME__); continue; -#endif case 'v': dfa_verbose = 1; continue; @@ -73,7 +72,7 @@ static int alex_options (int argc, char **argv) ccluse = 1; continue; case 'd': - switch( *++*argv ) + switch (*++*argv) { case 's': debug_dfa_tran = 1; @@ -92,7 +91,8 @@ static int alex_options (int argc, char **argv) } continue; default: - fprintf( stderr, "%s: unknown option `-%s'\n", prog, *argv ); + fprintf (stderr, "%s: unknown option `-%s'\n", + prog, *argv); return 1; } break; @@ -100,7 +100,7 @@ static int alex_options (int argc, char **argv) return 0; } -int main (int argc, char **argv ) +int main (int argc, char **argv) { int i, no = 0; DFA *dfa; @@ -112,32 +112,33 @@ int main (int argc, char **argv ) #else alexdebug = 0; #endif - i = alex_options( argc, argv ); - if( i ) + i = lexer_options (argc, argv); + if (i) return i; - if( argc < 2 ) + if (argc < 2) { - fprintf( stderr, "%s: usage %s -cVvt -d[stf]\n", prog, prog ); + fprintf (stderr, "usage\n %s [-c] [-V] [-v] [-t] [-d[stf]] file\n", + prog); return 1; } - else while( --argc > 0 ) - if( **++argv != '-' && **argv ) + else while (--argc > 0) + if (**++argv != '-' && **argv) { ++no; - i = read_file( *argv, &dfa ); - if( i ) + i = read_file (*argv, &dfa); + if (i) return i; - dfas = mk_dfas( dfa, 2000 ); - rm_dfa( &dfa ); - rm_dfas( &dfas ); + dfas = mk_dfas (dfa, 2000); + rm_dfa (&dfa); + rm_dfas (&dfas); } #ifdef MEMDEBUG imemstat(); #endif - if( !no ) + if (!no) { - fprintf( stderr, "%s: no files specified\n", prog ); + fprintf (stderr, "%s: no files specified\n", prog); return 2; } return 0; -- 1.7.10.4