-#
-# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $
+#!/usr/bin/tclsh
+# $Id: robot.tcl,v 1.3 1998/10/15 13:27:19 adam Exp $
#
proc RobotFileNext {area} {
- if {[catch {set ns [glob $area/*]}]} {
+ if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
set off [string first / $area]
proc RobotSave {url} {
global URL
+ global domains
set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
set ti 0
if {[info exists URL($url,line)]} {
- set htmlContent [join $URL($url,line)]
+ set htmlContent [join $URL($url,line) \n]
htmlSwitch $htmlContent \
- title {
+ title {
if {!$ti} {
headSave $url $out $body
set ti 1
}
+ } body {
+ regsub -all -nocase {<script.*</script>} $body {} abody
+ regsub -all {<[^\>]+>} $abody {} nbody
+ puts $out "<body>"
+ puts $out $nbody
+ puts $out "</body>"
} a {
- if {![info exists parm(href)]} continue
+ if {![info exists parm(href)]} {
+ puts "no href"
+ continue
+ }
if {!$ti} {
headSave $url $out "untitled"
set ti 1
if {[regexp {^\#} $parm(href)]} {
continue
} elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
+ set ok 0
if {![string compare $method http]} {
if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
set host $URL($url,host)
set path $hpath
}
- if {![regexp {\.dk$} $host]} continue
- } else {
- continue
+ foreach domain $domains {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
+ }
}
+ if {!$ok} continue
} elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
set host $URL($url,host)
set method http
} else {
- puts " href=$parm(href)"
set ext [file extension $URL($url,path)]
if {[string compare $ext {}]} {
set dpart [file dirname $URL($url,path)]
}
}
} else {
- set URL($url,state) skip
+ set URL($url,state) html
if {[info exists URL($url,head,Content-type)]} {
if {![string compare $URL($url,head,Content-type) text/html]} {
set URL($url,state) html
set port 80
puts "---------"
puts $url
- if {[regexp {([^:]+)://([^/]+)([^ ?]*)} $url x method host path]} {
+ if {[regexp {([^:]+)://([^/]+)([^ ]*)} $url x method host path]} {
puts "method=$method host=$host path=$path"
} else {
return -1
return 0
}
-#RobotGetUrl http://www.dtv.dk/ {}
# Startup script: load the robot extension if needed, read the command-line
# arguments, seed the crawl with the start site, then run the event loop.
#
# If no command named htmlSwitch is defined yet, load the tclrobot shared
# library: first from the current directory, and if that load fails, by its
# bare file name (tclrobot plus the platform's shared-library extension).
# NOTE(review): presumably the library is what provides htmlSwitch - confirm.
+if {![llength [info commands htmlSwitch]]} {
+ set e [info sharedlibextension]
+ if {[catch {load ./tclrobot$e}]} {
+ load tclrobot$e
+ }
+}
+
# At least two arguments are required; otherwise print the usage line and
# exit with status 1.
+if {[llength $argv] < 2} {
+ puts "Tclrobot: usage <domain> <start>"
+ exit 1
+}
# domains (first argument) is iterated as a list by RobotSave, which compares
# each element against a link's host with [string match], so elements are
# glob-style patterns. site (second argument) is the start host.
+set domains [lindex $argv 0]
+set site [lindex $argv 1]
# For a non-empty site, open the root path "/" in the "unvisited" area and
# close the returned channel straight away.
# NOTE(review): presumably this creates the entry that the robot picks up
# first - RobotFileOpen is not visible here, confirm.
+if {[string length $site]} {
+ set x [RobotFileOpen unvisited $site /]
+ close $x
+}
+
# Start the robot (RobotRestart is defined elsewhere in the file), then enter
# the Tcl event loop; vwait returns only once the variable "forever" is set.
RobotRestart
vwait forever