2 # $Id: robot.tcl,v 1.3 1998/10/15 13:27:19 adam Exp $
4 proc RobotFileNext {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
11 if {[file isfile $n]} {
12 if {[string first :.html $n] > 0} {
13 return http://[string range $area/ $off end]
15 return http://[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set sb [RobotFileNext $n]
19 if {[string length $sb]} {
27 proc RobotFileExist {area host path} {
28 set comp [split $area/$host$path /]
31 if {![string length [lindex $comp $l]]} {
32 set comp [split $area/$host$path:.html /]
34 return [file exists [join $comp /]]
37 proc RobotFileUnlink {area host path} {
38 set comp [split $area/$host$path /]
41 if {![string length [lindex $comp $l]]} {
42 set comp [split $area/$host$path:.html /]
44 if {[catch {exec rm [join $comp /]}]} return
46 for {set i $l} {$i > 0} {incr i -1} {
47 set path [join [lrange $comp 0 $i] /]
48 if {![catch {glob $path/*}]} return
53 proc RobotFileOpen {area host path} {
56 set comp [split $area/$host$path /]
57 set len [llength $comp]
59 for {set i 0} {$i < $len} {incr i} {
60 set d [lindex $comp $i]
61 if {[catch {cd ./$d}]} {
66 set d [lindex $comp $len]
67 if {[string length $d]} {
70 set out [open :.html w]
76 proc RobotRestart {} {
80 set url [RobotFileNext unvisited]
81 if {![string length $url]} break
82 set r [RobotGetUrl $url {}]
86 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
92 proc headSave {url out title} {
96 puts $out "<ti> $title"
97 if {[info exists URL($url,head,Last-modified)]} {
98 puts $out "<dm> $URL($url,head,Last-modified)"
101 if {[info exists URL($url,head,Date)]} {
102 puts $out " <lc> $URL($url,head,Date)"
104 if {[info exists URL($url,head,Content-length)]} {
105 puts $out " <by> $URL($url,head,Content-length)"
107 if {[info exists URL($url,head,Server)]} {
108 puts $out " <srvr> $URL($url,head,Server)"
112 puts $out " <avli> $url"
113 if {[info exists URL($url,head,Content-type)]} {
114 puts $out " <ty> $URL($url,head,Content-type)"
119 proc RobotSave {url} {
123 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
125 if {[info exists URL($url,line)]} {
126 set htmlContent [join $URL($url,line) \n]
128 htmlSwitch $htmlContent \
131 headSave $url $out $body
135 regsub -all -nocase {<script.*</script>} $body {} abody
136 regsub -all {<[^\>]+>} $abody {} nbody
141 if {![info exists parm(href)]} {
146 headSave $url $out "untitled"
150 if {[regexp {^\#} $parm(href)]} {
152 } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
154 if {![string compare $method http]} {
155 if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
156 set host $URL($url,host)
159 foreach domain $domains {
160 if {[string match $domain $host]} {
167 } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
168 set host $URL($url,host)
171 set ext [file extension $URL($url,path)]
172 if {[string compare $ext {}]} {
173 set dpart [file dirname $URL($url,path)]
175 set dpart $URL($url,path)
177 regexp {^([^#]+)} $parm(href) x path
178 set host $URL($url,host)
179 set path [string trimright $dpart /]/$path
182 set ext [file extension $path]
183 if {![string length $ext]} {
184 set path [string trimright $path /]/
186 set path [string trimright $path /]
188 set c [split $path /]
191 set path [lindex $c $i]
194 switch -- [lindex $c $i] {
202 set path [lindex $c $i]/$path
207 set href "$method://$host$path"
210 puts $out "<li> $href"
211 puts $out "<cp> $body"
214 if {![regexp {/.*bin/} $href)]} {
215 if {![RobotFileExist visited $host $path]} {
216 set outf [RobotFileOpen unvisited $host $path]
223 headSave $url $out "untitled"
228 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
231 proc RobotRead {url sock} {
234 set readCount [gets $sock line]
235 if {$readCount < 0} {
241 } elseif {$readCount > 0} {
242 switch $URL($url,state) {
245 if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
246 set URL($url,head,$name) $value
250 lappend URL($url,line) $line
260 set URL($url,state) html
261 if {[info exists URL($url,head,Content-type)]} {
262 if {![string compare $URL($url,head,Content-type) text/html]} {
263 set URL($url,state) html
269 proc RobotConnect {url sock} {
272 fileevent $sock readable [list RobotRead $url $sock]
273 puts $sock "GET $URL($url,path) HTTP/1.0"
282 proc RobotGetUrl {url phost} {
287 if {[regexp {([^:]+)://([^/]+)([^ ]*)} $url x method host path]} {
288 puts "method=$method host=$host path=$path"
292 set URL($url,method) $method
293 set URL($url,host) $host
294 set URL($url,port) $port
295 set URL($url,path) $path
296 set URL($url,state) head
297 if [catch {set sock [socket -async $host $port]}] {
300 fconfigure $sock -translation {auto crlf}
301 RobotConnect $url $sock
306 if {![llength [info commands htmlSwitch]]} {
307 set e [info sharedlibextension]
308 if {[catch {load ./tclrobot$e}]} {
313 if {[llength $argv] < 2} {
314 puts "Tclrobot: usage <domain> <start>"
317 set domains [lindex $argv 0]
318 set site [lindex $argv 1]
319 if {[string length $site]} {
320 set x [RobotFileOpen unvisited $site /]