From: Adam Dickmeiss Date: Tue, 6 Aug 1996 14:04:22 +0000 (+0000) Subject: Initial revision X-Git-Tag: index00~1 X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=commitdiff_plain;h=2c4a844e7d87397d31d29bd3bfc56c97a5f1618b Initial revision --- 2c4a844e7d87397d31d29bd3bfc56c97a5f1618b diff --git a/Makefile.in b/Makefile.in new file mode 100644 index 0000000..7729bb4 --- /dev/null +++ b/Makefile.in @@ -0,0 +1,50 @@ +# Makefile for Tcl Web Robot +# $Id: Makefile.in,v 1.1 1996/08/06 14:04:22 adam Exp $ +SHELL=/bin/sh + +# Version +VERSION=0.0 + +# Directory prefix wich machine independent files +prefix=@prefix@ + +# Directory prefix with machine dependent files +exec_prefix=@exec_prefix@ + +BINDIR=$(exec_prefix)/bin +LIBDIR=$(exec_prefix)/lib +LIBRARY=@(prefix)/lib/tclrobot + +TCLLIB=@TCLLIB@ +TCLINC=@TCLINC@ +TKLIB=@TKLIB@ +TKINC=@TKINC@ + +INCLUDE=$(TCLINC) +DEFS=$(INCLUDE) + +INSTALL=@INSTALL@ +INSTALL_PROGRAM=@INSTALL_PROGRAM@ +INSTALL_DATA=@INSTALL_DATA@ +RANLIB=@RANLIB@ +SHLIB_LD=@SHLIB_LD@ + +O=hswitch.o init.o + +tclrobot: tclrobot.a tclmain.o + $(CC) -o tclrobot $(CFLAGS) tclmain.o tclrobot.a $(TCLLIB) + +tclrobot.a: $(O) + rm -f tclrobot.a + ar cr tclrobot.a $(O) + $(RANLIB) tclrobot.a + +libtclrobot.so: $(O) + $(SHLIB_LD) -o libtclrobot.so $(O) + $(RANLIB) libtclrobot.so + +.c.o: + $(CC) -c $(CFLAGS) $(DEFS) $< + +clean: + rm -f tclrobot core *.out *.o *.a *.so config.* diff --git a/configure.in b/configure.in new file mode 100644 index 0000000..b755929 --- /dev/null +++ b/configure.in @@ -0,0 +1,49 @@ +dnl Web robot toolkit for tcl +dnl (c) Index Data 1996 +dnl See the file LICENSE for details. +dnl $Id: configure.in,v 1.1 1996/08/06 14:04:22 adam Exp $ +AC_INIT(tclrobot.h) +CC=${CC-cc} +dnl ------ Substitutions +AC_SUBST(CC) +AC_SUBST(TCLLIB) +AC_SUBST(TKLIB) +AC_SUBST(TCLINC) +AC_SUBST(TKINC) +AC_SUBST(SHLIB_LD) +AC_SUBST(RANLIB) +dnl ------ Preliminary settings +AC_PROG_INSTALL +AC_PREFIX_PROGRAM(tclsh) +AC_STDC_HEADERS +if test "$ac_cv_header_stdc" = no; then + AC_MSG_WARN(Your system doesn't seem to support ANSI C) +fi +dnl ------ look for Tcl +if test "x$prefix" = xNONE; then + tryprefix=/usr/local +else + tryprefix=${prefix} +fi +if test -r ${tryprefix}/lib/tclConfig.sh; then + AC_MSG_CHECKING(for Tcl) + source ${tryprefix}/lib/tclConfig.sh + TCLLIB="${TCL_LIB_SPEC} ${TCL_LIBS}" + TCLINC=-I${TCL_PREFIX}/include + RANLIB=${TCL_RANLIB} + SHLIB_LD=${TCL_SHLIB_LD} + AC_MSG_RESULT($TCL_VERSION) +else + AC_MSG_WARN(Didn't find Tcl) +fi +dnl ------ look for Tk +AC_MSG_CHECKING(for Tk) +if test -r ${tryprefix}/lib/tkConfig.sh; then + source ${tryprefix}/lib/tkConfig.sh + AC_MSG_RESULT($TK_VERSION) + TKINC=${TK_XINCLUDES} + TKLIB="${TK_PREFIX}/lib/${TK_LIB_FILE} ${TK_LIBS}" +else + AC_MSG_WARN(Didn't find Tk) +fi +AC_OUTPUT(Makefile) diff --git a/hswitch.c b/hswitch.c new file mode 100644 index 0000000..e631c1c --- /dev/null +++ b/hswitch.c @@ -0,0 +1,233 @@ +/* + * $Id: hswitch.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + */ +#include +#include +#include +#include + +#include "tclrobot.h" + +#define TAG_MAX_LEN 32 + +#define SPACECHR " \t\r\n\f" + +static int skipSpace (const char *cp) +{ + int i = 0; + while (strchr (SPACECHR, cp[i])) + i++; + return i; +} + +static int skipTag (const char *cp, char *dst) +{ + int i; + + for (i=0; i=", cp[i]); i++) + dst[i] = tolower(cp[i]); + dst[i] = '\0'; + return i; +} + +static int skipParm (const char *cp, char *name, char **value) +{ + int i = skipTag (cp, name); + *value = NULL; + if (!i) + return skipSpace (cp); + i += skipSpace (cp + i); + if (cp[i] == '=') + { + int v0, v1; + i++; + i += skipSpace (cp + i); + if (cp[i] == '\"') + { + v0 = ++i; + while (cp[i] != '\"' && cp[i]) + i++; + v1 = i; + if (cp[i]) + i++; + } + else + { + v0 = i; + while (cp[i] && !strchr (SPACECHR ">", cp[i])) + i++; + v1 = i; + } + *value = malloc (v1 - v0 + 1); + memcpy (*value, cp + v0, v1-v0); + (*value)[v1-v0] = '\0'; + } + i += skipSpace (cp + i); + return i; +} + +struct tagParm { + char name[TAG_MAX_LEN]; + char *value; + struct tagParm *next; +}; + +struct tagInfo { + int level; + char *pattern; + char *code; + + char name[TAG_MAX_LEN]; + const char *body_start; + struct tagParm *tagParms; +}; + +static int tagLookup (struct tagInfo *tags, int tagNo, const char *tagString) +{ + int i; + for (i = 0; ilevel) + { + strcpy (tag->name, tagString); + tag->tagParms = NULL; + nParms = &tag->tagParms; + } + + i = skipSpace (cp); + while (cp[i] && cp[i] != '>') + { + int nor = skipParm (cp+i, parm_name, &parm_value); + i += nor; + if (nor && tag && !tag->level) + { + *nParms = malloc (sizeof(**nParms)); + assert (*nParms); + (*nParms)->next = NULL; + strcpy ((*nParms)->name, parm_name); + (*nParms)->value = parm_value; + } + else + { + if (!nor) + i++; + free (parm_value); + } + } + if (cp[i]) + i++; + if (tag) + { + if (!tag->level) + tag->body_start = cp+i; + ++(tag->level); + } + return i; +} + +static int tagEnd (Tcl_Interp *interp, struct tagInfo *tag, + const char *tagString, const char *cp, const char *body_end) +{ + int i = 0; + + if (cp[i] == '>') + i++; + + if (tag && tag->level) + { + -- (tag->level); + if (!tag->level) + { + struct tagParm *tp = tag->tagParms; + char *value = malloc (body_end - tag->body_start + 1); + + assert (value); + memcpy (value, tag->body_start, body_end - tag->body_start); + value[body_end - tag->body_start] = '\0'; + Tcl_SetVar (interp, "body", value, 0); + free (value); + while (tp) + { + char vname[TAG_MAX_LEN+30]; + struct tagParm *tp0 = tp; + + sprintf (vname, "parm(%s)", tp->name); + + Tcl_SetVar (interp, vname, tp->value ? tp->value : "",0); + tp = tp->next; + free (tp0); + } + Tcl_Eval (interp, tag->code); + } + } + return i; +} + +int htmlSwitch (ClientData clientData, Tcl_Interp *interp, + int argc, char **argv) +{ + struct tagInfo *tags; + int noTags; + const char *cp; + int i, argi = 1; + + cp = argv[argi++]; + noTags = (argc - argi)/2; + if (noTags < 1) + { + interp->result = + "wrong # args: should be ?switches? string pattern body ..."; + return TCL_ERROR; + } + tags = malloc (sizeof(*tags) * noTags); + assert (tags); + for (i = 0; i= 0 ? tags+tagI : NULL, tagStr, cp); + } + else if (cp[0] == '<') /* end tag */ + { + char tagStr[TAG_MAX_LEN]; + const char *body_end = cp; + int tagI; + + cp += 2; + cp += skipTag (cp, tagStr); + tagI = tagLookup (tags, noTags, tagStr); + cp += tagEnd (interp, tagI >= 0 ? tags+tagI : NULL, + tagStr, cp, body_end); + } + else /* no tag */ + cp++; + } + free (tags); + return TCL_OK; +} + + diff --git a/init.c b/init.c new file mode 100644 index 0000000..b3f7509 --- /dev/null +++ b/init.c @@ -0,0 +1,11 @@ +/* + * $Id: init.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + */ +#include "tclrobot.h" + +int TclRobot_Init (Tcl_Interp *interp) +{ + Tcl_CreateCommand (interp, "htmlSwitch", htmlSwitch, (ClientData) NULL, + (Tcl_CmdDeleteProc *) NULL); + return TCL_OK; +} diff --git a/install-sh b/install-sh new file mode 100755 index 0000000..89fc9b0 --- /dev/null +++ b/install-sh @@ -0,0 +1,238 @@ +#! /bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. +# + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +tranformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/robot.tcl b/robot.tcl new file mode 100755 index 0000000..b2a7224 --- /dev/null +++ b/robot.tcl @@ -0,0 +1,295 @@ +# +# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $ +# +proc RobotFileNext {area} { + if {[catch {set ns [glob $area/*]}]} { + return {} + } + set off [string first / $area] + incr off + foreach n $ns { + if {[file isfile $n]} { + if {[string first :.html $n] > 0} { + return http://[string range $area/ $off end] + } + return http://[string range $n $off end] + } + if {[file isdirectory $n]} { + set sb [RobotFileNext $n] + if {[string length $sb]} { + return $sb + } + } + } + return {} +} + +proc RobotFileExist {area host path} { + set comp [split $area/$host$path /] + set l [llength $comp] + incr l -1 + if {![string length [lindex $comp $l]]} { + set comp [split $area/$host$path:.html /] + } + return [file exists [join $comp /]] +} + +proc RobotFileUnlink {area host path} { + set comp [split $area/$host$path /] + set l [llength $comp] + incr l -1 + if {![string length [lindex $comp $l]]} { + set comp [split $area/$host$path:.html /] + } + if {[catch {exec rm [join $comp /]}]} return + incr l -1 + for {set i $l} {$i > 0} {incr i -1} { + set path [join [lrange $comp 0 $i] /] + if {![catch {glob $path/*}]} return + exec rmdir ./$path + } +} + +proc RobotFileOpen {area host path} { + set orgPwd [pwd] + + set comp [split $area/$host$path /] + set len [llength $comp] + incr len -1 + for {set i 0} {$i < $len} {incr i} { + set d [lindex $comp $i] + if {[catch {cd ./$d}]} { + exec mkdir $d + cd ./$d + } + } + set d [lindex $comp $len] + if {[string length $d]} { + set out [open $d w] + } else { + set out [open :.html w] + } + cd $orgPwd + return $out +} + +proc RobotRestart {} { + global URL + + while {1} { + set url [RobotFileNext unvisited] + if {![string length $url]} break + set r [RobotGetUrl $url {}] + if {!$r} { + return + } else { + RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + } + } + exit 0 +} + +proc headSave {url out title} { + global URL + + puts $out {} + puts $out " $title" + if {[info exists URL($url,head,Last-modified)]} { + puts $out " $URL($url,head,Last-modified)" + } + puts $out {} + if {[info exists URL($url,head,Date)]} { + puts $out " $URL($url,head,Date)" + } + if {[info exists URL($url,head,Content-length)]} { + puts $out " $URL($url,head,Content-length)" + } + if {[info exists URL($url,head,Server)]} { + puts $out " $URL($url,head,Server)" + } + puts $out {} + puts $out {} + puts $out " $url" + if {[info exists URL($url,head,Content-type)]} { + puts $out " $URL($url,head,Content-type)" + } + puts $out {} +} + +proc RobotSave {url} { + global URL + + set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] + set ti 0 + if {[info exists URL($url,line)]} { + set htmlContent [join $URL($url,line)] + + htmlSwitch $htmlContent \ + title { + if {!$ti} { + headSave $url $out $body + set ti 1 + } + } a { + if {![info exists parm(href)]} continue + if {!$ti} { + headSave $url $out "untitled" + set ti 1 + } + + if {[regexp {^\#} $parm(href)]} { + continue + } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} { + if {![string compare $method http]} { + if {![regexp {^//([^/]+)(.*)} $hpath x host path]} { + set host $URL($url,host) + set path $hpath + } + if {![regexp {\.dk$} $host]} continue + } else { + continue + } + } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} { + set host $URL($url,host) + set method http + } else { + puts " href=$parm(href)" + set ext [file extension $URL($url,path)] + if {[string compare $ext {}]} { + set dpart [file dirname $URL($url,path)] + } else { + set dpart $URL($url,path) + } + regexp {^([^#]+)} $parm(href) x path + set host $URL($url,host) + set path [string trimright $dpart /]/$path + set method http + } + set ext [file extension $path] + if {![string length $ext]} { + set path [string trimright $path /]/ + } else { + set path [string trimright $path /] + } + set c [split $path /] + set i [llength $c] + incr i -1 + set path [lindex $c $i] + incr i -1 + while {$i >= 0} { + switch -- [lindex $c $i] { + .. { + incr i -2 + } + . { + incr i -1 + } + default { + set path [lindex $c $i]/$path + incr i -1 + } + } + } + set href "$method://$host$path" + + puts $out "" + puts $out "
  • $href" + puts $out " $body" + puts $out "" + + if {![regexp {/.*bin/} $href)]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + close $outf + } + } + } + } + if {!$ti} { + headSave $url $out "untitled" + set ti 1 + } + puts $out "" + close $out + RobotFileUnlink unvisited $URL($url,host) $URL($url,path) +} + +proc RobotRead {url sock} { + global URL + + set readCount [gets $sock line] + if {$readCount < 0} { + if [eof $sock] { + close $sock + RobotSave $url + RobotRestart + } + } elseif {$readCount > 0} { + switch $URL($url,state) { + head { + puts "head: $line" + if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} { + set URL($url,head,$name) $value + } + } + html { + lappend URL($url,line) $line +# puts "body: $line" + } + skip { + close $sock + RobotSave $url + RobotRestart + } + } + } else { + set URL($url,state) skip + if {[info exists URL($url,head,Content-type)]} { + if {![string compare $URL($url,head,Content-type) text/html]} { + set URL($url,state) html + } + } + } +} + +proc RobotConnect {url sock} { + global URL + + fileevent $sock readable [list RobotRead $url $sock] + puts $sock "GET $URL($url,path) HTTP/1.0" + puts $sock "" + flush $sock +} + +proc RobotNop {} { + +} + +proc RobotGetUrl {url phost} { + global URL + set port 80 + puts "---------" + puts $url + if {[regexp {([^:]+)://([^/]+)([^ ?]*)} $url x method host path]} { + puts "method=$method host=$host path=$path" + } else { + return -1 + } + set URL($url,method) $method + set URL($url,host) $host + set URL($url,port) $port + set URL($url,path) $path + set URL($url,state) head + if [catch {set sock [socket -async $host $port]}] { + return -1 + } + fconfigure $sock -translation {auto crlf} + RobotConnect $url $sock + + return 0 +} + +#RobotGetUrl http://www.dtv.dk/ {} +RobotRestart +vwait forever + diff --git a/tclmain.c b/tclmain.c new file mode 100644 index 0000000..e99c71e --- /dev/null +++ b/tclmain.c @@ -0,0 +1,58 @@ +/* + * $Id: tclmain.c,v 1.1 1996/08/06 14:04:22 adam Exp $ + */ + +#include "tclrobot.h" + +/* + * The following variable is a special hack that is needed in order for + * Sun shared libraries to be used for Tcl. + */ + +extern int matherr(); +int *tclDummyMathPtr = (int *) matherr; + +int main(int argc, char **argv) +{ + Tcl_Main(argc, argv, Tcl_AppInit); + return 0; /* Needed only to prevent compiler warning. */ +} + +int Tcl_AppInit(Tcl_Interp *interp) +{ + if (Tcl_Init(interp) == TCL_ERROR) { + return TCL_ERROR; + } + + if (TclRobot_Init(interp) == TCL_ERROR) { + return TCL_ERROR; + } + Tcl_StaticPackage(interp, "TclRobot", TclRobot_Init, + (Tcl_PackageInitProc *) NULL); + + /* + * Call the init procedures for included packages. Each call should + * look like this: + * + * if (Mod_Init(interp) == TCL_ERROR) { + * return TCL_ERROR; + * } + * + * where "Mod" is the name of the module. + */ + + /* + * Call Tcl_CreateCommand for application-specific commands, if + * they weren't already created by the init procedures called above. + */ + + /* + * Specify a user-specific startup file to invoke if the application + * is run interactively. Typically the startup file is "~/.apprc" + * where "app" is the name of the application. If this line is deleted + * then no user-specific startup file will be run under any conditions. + */ + + Tcl_SetVar(interp, "tcl_rcFileName", "~/.tclshrc", TCL_GLOBAL_ONLY); + return TCL_OK; +} diff --git a/tclrobot.h b/tclrobot.h new file mode 100644 index 0000000..a582f62 --- /dev/null +++ b/tclrobot.h @@ -0,0 +1,9 @@ +/* + * $Id: tclrobot.h,v 1.1 1996/08/06 14:04:22 adam Exp $ + */ +#include + +int htmlSwitch (ClientData clientData, Tcl_Interp *interp, + int argc, char **argv); + +int TclRobot_Init (Tcl_Interp *interp);