From 09a35650e85178961deead32f1772f61e3ef0ed3 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 15 Oct 1998 12:30:59 +0000 Subject: [PATCH] Bug fixes. Robot saves body of text without tags and JavaScript sections. --- Makefile.in | 15 +++++++++------ configure.in | 28 +++++++++++++++++++++++----- hswitch.c | 31 +++++++++++++++++++++++-------- init.c | 4 ++-- robot.tcl | 42 ++++++++++++++++++++++++++++++++---------- tclmain.c | 6 +++--- tclrobot.h | 4 ++-- 7 files changed, 94 insertions(+), 36 deletions(-) diff --git a/Makefile.in b/Makefile.in index 7729bb4..019d606 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,5 +1,5 @@ # Makefile for Tcl Web Robot -# $Id: Makefile.in,v 1.1 1996/08/06 14:04:22 adam Exp $ +# $Id: Makefile.in,v 1.2 1998/10/15 12:30:59 adam Exp $ SHELL=/bin/sh # Version @@ -27,7 +27,11 @@ INSTALL=@INSTALL@ INSTALL_PROGRAM=@INSTALL_PROGRAM@ INSTALL_DATA=@INSTALL_DATA@ RANLIB=@RANLIB@ -SHLIB_LD=@SHLIB_LD@ + +SHLIB_LD = @SHLIB_LD@ +SHLIB_CFLAGS = @SHLIB_CFLAGS@ +SHLIB_SUFFIX = @SHLIB_SUFFIX@ +SHLIB_VERSION = @SHLIB_VERSION@ O=hswitch.o init.o @@ -39,12 +43,11 @@ tclrobot.a: $(O) ar cr tclrobot.a $(O) $(RANLIB) tclrobot.a -libtclrobot.so: $(O) - $(SHLIB_LD) -o libtclrobot.so $(O) - $(RANLIB) libtclrobot.so +tclrobot$(SHLIB_SUFFIX): $(O) + $(SHLIB_LD) -o tclrobot$(SHLIB_SUFFIX) $(O) .c.o: - $(CC) -c $(CFLAGS) $(DEFS) $< + $(CC) -c $(CFLAGS) $(SHLIB_CFLAGS) $(DEFS) $< clean: rm -f tclrobot core *.out *.o *.a *.so config.* diff --git a/configure.in b/configure.in index b755929..010f9d5 100644 --- a/configure.in +++ b/configure.in @@ -1,7 +1,7 @@ dnl Web robot toolkit for tcl -dnl (c) Index Data 1996 +dnl (c) Index Data 1996-1998 dnl See the file LICENSE for details. 
-dnl $Id: configure.in,v 1.1 1996/08/06 14:04:22 adam Exp $ +dnl $Id: configure.in,v 1.2 1998/10/15 12:31:00 adam Exp $ AC_INIT(tclrobot.h) CC=${CC-cc} dnl ------ Substitutions @@ -10,11 +10,26 @@ AC_SUBST(TCLLIB) AC_SUBST(TKLIB) AC_SUBST(TCLINC) AC_SUBST(TKINC) +AC_SUBST(SHLIB_CFLAGS) AC_SUBST(SHLIB_LD) +AC_SUBST(SHLIB_SUFFIX) +AC_SUBST(SHLIB_VERSION) AC_SUBST(RANLIB) dnl ------ Preliminary settings AC_PROG_INSTALL AC_PREFIX_PROGRAM(tclsh) +if test "x$prefix" = xNONE; then + AC_PREFIX_PROGRAM(tclsh8.1) +fi +if test "x$prefix" = xNONE; then + AC_PREFIX_PROGRAM(tclsh8.0) +fi +if test "x$prefix" = xNONE; then + AC_PREFIX_PROGRAM(tclsh7.6) +fi +if test "x$prefix" = xNONE; then + AC_PREFIX_PROGRAM(tclsh7.5) +fi AC_STDC_HEADERS if test "$ac_cv_header_stdc" = no; then AC_MSG_WARN(Your system doesn't seem to support ANSI C) @@ -27,11 +42,14 @@ else fi if test -r ${tryprefix}/lib/tclConfig.sh; then AC_MSG_CHECKING(for Tcl) - source ${tryprefix}/lib/tclConfig.sh + . ${tryprefix}/lib/tclConfig.sh TCLLIB="${TCL_LIB_SPEC} ${TCL_LIBS}" TCLINC=-I${TCL_PREFIX}/include RANLIB=${TCL_RANLIB} - SHLIB_LD=${TCL_SHLIB_LD} + SHLIB_CFLAGS=$TCL_SHLIB_CFLAGS + SHLIB_LD=$TCL_SHLIB_LD + SHLIB_SUFFIX=$TCL_SHLIB_SUFFIX + SHLIB_VERSION=$TCL_SHLIB_VERSION AC_MSG_RESULT($TCL_VERSION) else AC_MSG_WARN(Didn't find Tcl) @@ -39,7 +57,7 @@ fi dnl ------ look for Tk AC_MSG_CHECKING(for Tk) if test -r ${tryprefix}/lib/tkConfig.sh; then - source ${tryprefix}/lib/tkConfig.sh + . 
${tryprefix}/lib/tkConfig.sh AC_MSG_RESULT($TK_VERSION) TKINC=${TK_XINCLUDES} TKLIB="${TK_PREFIX}/lib/${TK_LIB_FILE} ${TK_LIBS}" diff --git a/hswitch.c b/hswitch.c index e631c1c..3a05f5e 100644 --- a/hswitch.c +++ b/hswitch.c @@ -1,5 +1,5 @@ /* - * $Id: hswitch.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + * $Id: hswitch.c,v 1.2 1998/10/15 12:31:01 adam Exp $ */ #include #include @@ -12,6 +12,8 @@ #define SPACECHR " \t\r\n\f" +#define DEBUG(x) + static int skipSpace (const char *cp) { int i = 0; @@ -23,10 +25,15 @@ static int skipSpace (const char *cp) static int skipTag (const char *cp, char *dst) { int i; - - for (i=0; i=", cp[i]); i++) - dst[i] = tolower(cp[i]); - dst[i] = '\0'; + int j = 0; + + for (i=0; cp[i] && !strchr (SPACECHR "/>=", cp[i]); i++) + if (j < TAG_MAX_LEN-1) + { + dst[j] = tolower(cp[j]); + j++; + } + dst[j] = '\0'; return i; } @@ -102,6 +109,7 @@ static int tagStart (struct tagInfo *tag, const char *tagString, if (tag && !tag->level) { strcpy (tag->name, tagString); + DEBUG(printf ("------ consuming this %s\n", tag->name)); tag->tagParms = NULL; nParms = &tag->tagParms; } @@ -109,15 +117,20 @@ static int tagStart (struct tagInfo *tag, const char *tagString, i = skipSpace (cp); while (cp[i] && cp[i] != '>') { - int nor = skipParm (cp+i, parm_name, &parm_value); + int nor = skipParm (cp+i, parm_name, &parm_value); i += nor; + if (nor && tag) + { + DEBUG(printf ("parm_name=%s parm_value=%s\n", parm_name, parm_value)); + } if (nor && tag && !tag->level) { *nParms = malloc (sizeof(**nParms)); assert (*nParms); - (*nParms)->next = NULL; strcpy ((*nParms)->name, parm_name); (*nParms)->value = parm_value; + (*nParms)->next = NULL; + nParms = &(*nParms)->next; } else { @@ -164,6 +177,7 @@ static int tagEnd (Tcl_Interp *interp, struct tagInfo *tag, struct tagParm *tp0 = tp; sprintf (vname, "parm(%s)", tp->name); + DEBUG(printf ("vname=%s\n", vname)); Tcl_SetVar (interp, vname, tp->value ? 
tp->value : "",0); tp = tp->next; @@ -209,9 +223,10 @@ int htmlSwitch (ClientData clientData, Tcl_Interp *interp, cp++; cp += skipTag (cp, tagStr); tagI = tagLookup (tags, noTags, tagStr); + DEBUG(printf ("tagStr = %s tagI = %d\n", tagStr, tagI)); cp += tagStart (tagI >= 0 ? tags+tagI : NULL, tagStr, cp); } - else if (cp[0] == '<') /* end tag */ + else if (cp[0] == '<' && cp[1] == '/')/* end tag */ { char tagStr[TAG_MAX_LEN]; const char *body_end = cp; diff --git a/init.c b/init.c index b3f7509..eb737fa 100644 --- a/init.c +++ b/init.c @@ -1,9 +1,9 @@ /* - * $Id: init.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + * $Id: init.c,v 1.2 1998/10/15 12:31:02 adam Exp $ */ #include "tclrobot.h" -int TclRobot_Init (Tcl_Interp *interp) +int Tclrobot_Init (Tcl_Interp *interp) { Tcl_CreateCommand (interp, "htmlSwitch", htmlSwitch, (ClientData) NULL, (Tcl_CmdDeleteProc *) NULL); diff --git a/robot.tcl b/robot.tcl index b2a7224..c942418 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ -# -# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $ +#!/usr/bin/tclsh +# $Id: robot.tcl,v 1.2 1998/10/15 12:31:03 adam Exp $ # proc RobotFileNext {area} { - if {[catch {set ns [glob $area/*]}]} { + if {[catch {set ns [glob ${area}/*]}]} { return {} } set off [string first / $area] @@ -122,16 +122,25 @@ proc RobotSave {url} { set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] set ti 0 if {[info exists URL($url,line)]} { - set htmlContent [join $URL($url,line)] + set htmlContent [join $URL($url,line) \n] htmlSwitch $htmlContent \ - title { + title { if {!$ti} { headSave $url $out $body set ti 1 } + } body { + regsub -all -nocase {} $body {} abody + regsub -all {<[^\>]+>} $abody {} nbody + puts $out "" + puts $out $nbody + puts $out "" } a { - if {![info exists parm(href)]} continue + if {![info exists parm(href)]} { + puts "no href" + continue + } if {!$ti} { headSave $url $out "untitled" set ti 1 @@ -145,7 +154,7 @@ proc RobotSave {url} { set host $URL($url,host) set path $hpath 
} - if {![regexp {\.dk$} $host]} continue + if {![regexp {\.indexdata\.dk$} $host]} continue } else { continue } @@ -153,7 +162,6 @@ proc RobotSave {url} { set host $URL($url,host) set method http } else { - puts " href=$parm(href)" set ext [file extension $URL($url,path)] if {[string compare $ext {}]} { set dpart [file dirname $URL($url,path)] @@ -243,7 +251,7 @@ proc RobotRead {url sock} { } } } else { - set URL($url,state) skip + set URL($url,state) html if {[info exists URL($url,head,Content-type)]} { if {![string compare $URL($url,head,Content-type) text/html]} { set URL($url,state) html @@ -289,7 +297,21 @@ proc RobotGetUrl {url phost} { return 0 } -#RobotGetUrl http://www.dtv.dk/ {} +if {![llength [info commands htmlSwitch]]} { + set e [info sharedlibextension] + if {[catch {load ./tclrobot$e}]} { + load tclrobot$e + } +} + +if {![llength $argv]} { + puts "Tclrobot: specify one or more sites." + exit 1 +} +foreach site $argv { + set x [RobotFileOpen unvisited $site /] + close $x +} RobotRestart vwait forever diff --git a/tclmain.c b/tclmain.c index e99c71e..7f90b31 100644 --- a/tclmain.c +++ b/tclmain.c @@ -1,5 +1,5 @@ /* - * $Id: tclmain.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + * $Id: tclmain.c,v 1.2 1998/10/15 12:31:04 adam Exp $ */ #include "tclrobot.h" @@ -24,10 +24,10 @@ int Tcl_AppInit(Tcl_Interp *interp) return TCL_ERROR; } - if (TclRobot_Init(interp) == TCL_ERROR) { + if (Tclrobot_Init(interp) == TCL_ERROR) { return TCL_ERROR; } - Tcl_StaticPackage(interp, "TclRobot", TclRobot_Init, + Tcl_StaticPackage(interp, "TclRobot", Tclrobot_Init, (Tcl_PackageInitProc *) NULL); /* diff --git a/tclrobot.h b/tclrobot.h index a582f62..25840cf 100644 --- a/tclrobot.h +++ b/tclrobot.h @@ -1,9 +1,9 @@ /* - * $Id: tclrobot.h,v 1.1 1996/08/06 14:04:22 adam Exp $ + * $Id: tclrobot.h,v 1.2 1998/10/15 12:31:05 adam Exp $ */ #include int htmlSwitch (ClientData clientData, Tcl_Interp *interp, int argc, char **argv); -int TclRobot_Init (Tcl_Interp *interp); +int 
Tclrobot_Init (Tcl_Interp *interp); -- 1.7.10.4