+#
+# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $
+#
+proc RobotFileNext {area} {
+ if {[catch {set ns [glob $area/*]}]} {
+ return {}
+ }
+ set off [string first / $area]
+ incr off
+ foreach n $ns {
+ if {[file isfile $n]} {
+ if {[string first :.html $n] > 0} {
+ return http://[string range $area/ $off end]
+ }
+ return http://[string range $n $off end]
+ }
+ if {[file isdirectory $n]} {
+ set sb [RobotFileNext $n]
+ if {[string length $sb]} {
+ return $sb
+ }
+ }
+ }
+ return {}
+}
+
+proc RobotFileExist {area host path} {
+ set comp [split $area/$host$path /]
+ set l [llength $comp]
+ incr l -1
+ if {![string length [lindex $comp $l]]} {
+ set comp [split $area/$host$path:.html /]
+ }
+ return [file exists [join $comp /]]
+}
+
+proc RobotFileUnlink {area host path} {
+ set comp [split $area/$host$path /]
+ set l [llength $comp]
+ incr l -1
+ if {![string length [lindex $comp $l]]} {
+ set comp [split $area/$host$path:.html /]
+ }
+ if {[catch {exec rm [join $comp /]}]} return
+ incr l -1
+ for {set i $l} {$i > 0} {incr i -1} {
+ set path [join [lrange $comp 0 $i] /]
+ if {![catch {glob $path/*}]} return
+ exec rmdir ./$path
+ }
+}
+
+proc RobotFileOpen {area host path} {
+ set orgPwd [pwd]
+
+ set comp [split $area/$host$path /]
+ set len [llength $comp]
+ incr len -1
+ for {set i 0} {$i < $len} {incr i} {
+ set d [lindex $comp $i]
+ if {[catch {cd ./$d}]} {
+ exec mkdir $d
+ cd ./$d
+ }
+ }
+ set d [lindex $comp $len]
+ if {[string length $d]} {
+ set out [open $d w]
+ } else {
+ set out [open :.html w]
+ }
+ cd $orgPwd
+ return $out
+}
+
+proc RobotRestart {} {
+ global URL
+
+ while {1} {
+ set url [RobotFileNext unvisited]
+ if {![string length $url]} break
+ set r [RobotGetUrl $url {}]
+ if {!$r} {
+ return
+ } else {
+ RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+ }
+ }
+ exit 0
+}
+
+proc headSave {url out title} {
+ global URL
+
+ puts $out {<nwi>}
+ puts $out "<ti> $title"
+ if {[info exists URL($url,head,Last-modified)]} {
+ puts $out "<dm> $URL($url,head,Last-modified)"
+ }
+ puts $out {<si>}
+ if {[info exists URL($url,head,Date)]} {
+ puts $out " <lc> $URL($url,head,Date)"
+ }
+ if {[info exists URL($url,head,Content-length)]} {
+ puts $out " <by> $URL($url,head,Content-length)"
+ }
+ if {[info exists URL($url,head,Server)]} {
+ puts $out " <srvr> $URL($url,head,Server)"
+ }
+ puts $out {</si>}
+ puts $out {<av>}
+ puts $out " <avli> $url"
+ if {[info exists URL($url,head,Content-type)]} {
+ puts $out " <ty> $URL($url,head,Content-type)"
+ }
+ puts $out {</av>}
+}
+
+proc RobotSave {url} {
+ global URL
+
+ set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+ set ti 0
+ if {[info exists URL($url,line)]} {
+ set htmlContent [join $URL($url,line)]
+
+ htmlSwitch $htmlContent \
+ title {
+ if {!$ti} {
+ headSave $url $out $body
+ set ti 1
+ }
+ } a {
+ if {![info exists parm(href)]} continue
+ if {!$ti} {
+ headSave $url $out "untitled"
+ set ti 1
+ }
+
+ if {[regexp {^\#} $parm(href)]} {
+ continue
+ } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
+ if {![string compare $method http]} {
+ if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
+ set host $URL($url,host)
+ set path $hpath
+ }
+ if {![regexp {\.dk$} $host]} continue
+ } else {
+ continue
+ }
+ } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
+ set host $URL($url,host)
+ set method http
+ } else {
+ puts " href=$parm(href)"
+ set ext [file extension $URL($url,path)]
+ if {[string compare $ext {}]} {
+ set dpart [file dirname $URL($url,path)]
+ } else {
+ set dpart $URL($url,path)
+ }
+ regexp {^([^#]+)} $parm(href) x path
+ set host $URL($url,host)
+ set path [string trimright $dpart /]/$path
+ set method http
+ }
+ set ext [file extension $path]
+ if {![string length $ext]} {
+ set path [string trimright $path /]/
+ } else {
+ set path [string trimright $path /]
+ }
+ set c [split $path /]
+ set i [llength $c]
+ incr i -1
+ set path [lindex $c $i]
+ incr i -1
+ while {$i >= 0} {
+ switch -- [lindex $c $i] {
+ .. {
+ incr i -2
+ }
+ . {
+ incr i -1
+ }
+ default {
+ set path [lindex $c $i]/$path
+ incr i -1
+ }
+ }
+ }
+ set href "$method://$host$path"
+
+ puts $out "<cr>"
+ puts $out "<li> $href"
+ puts $out "<cp> $body"
+ puts $out "</cr>"
+
+ if {![regexp {/.*bin/} $href)]} {
+ if {![RobotFileExist visited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ close $outf
+ }
+ }
+ }
+ }
+ if {!$ti} {
+ headSave $url $out "untitled"
+ set ti 1
+ }
+ puts $out "</nwi>"
+ close $out
+ RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+}
+
+proc RobotRead {url sock} {
+ global URL
+
+ set readCount [gets $sock line]
+ if {$readCount < 0} {
+ if [eof $sock] {
+ close $sock
+ RobotSave $url
+ RobotRestart
+ }
+ } elseif {$readCount > 0} {
+ switch $URL($url,state) {
+ head {
+ puts "head: $line"
+ if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
+ set URL($url,head,$name) $value
+ }
+ }
+ html {
+ lappend URL($url,line) $line
+# puts "body: $line"
+ }
+ skip {
+ close $sock
+ RobotSave $url
+ RobotRestart
+ }
+ }
+ } else {
+ set URL($url,state) skip
+ if {[info exists URL($url,head,Content-type)]} {
+ if {![string compare $URL($url,head,Content-type) text/html]} {
+ set URL($url,state) html
+ }
+ }
+ }
+}
+
+proc RobotConnect {url sock} {
+ global URL
+
+ fileevent $sock readable [list RobotRead $url $sock]
+ puts $sock "GET $URL($url,path) HTTP/1.0"
+ puts $sock ""
+ flush $sock
+}
+
+proc RobotNop {} {
+
+}
+
+proc RobotGetUrl {url phost} {
+ global URL
+ set port 80
+ puts "---------"
+ puts $url
+ if {[regexp {([^:]+)://([^/]+)([^ ?]*)} $url x method host path]} {
+ puts "method=$method host=$host path=$path"
+ } else {
+ return -1
+ }
+ set URL($url,method) $method
+ set URL($url,host) $host
+ set URL($url,port) $port
+ set URL($url,path) $path
+ set URL($url,state) head
+ if [catch {set sock [socket -async $host $port]}] {
+ return -1
+ }
+ fconfigure $sock -translation {auto crlf}
+ RobotConnect $url $sock
+
+ return 0
+}
+
+#RobotGetUrl http://www.dtv.dk/ {}
+RobotRestart
+vwait forever
+