X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;ds=sidebyside;f=robot.tcl;h=ab3cef4b8b50fe5773d31c671be66ad3cde707a5;hb=87b050c8552f5b45c870b8c942ca67fe1da363a5;hp=c539a0404eca88168513eb046aa9fddb79b0de85;hpb=af8cb4656cb382dfd6fdb932778a5e59379c59b3;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index c539a04..ab3cef4 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.4 1999/02/04 20:37:25 perhans Exp $ +# $Id: robot.tcl,v 1.5 1999/12/27 11:49:31 adam Exp $ # proc RobotFileNext {area} { if {[catch {set ns [glob ${area}/*]}]} { @@ -7,6 +7,7 @@ proc RobotFileNext {area} { } set off [string first / $area] incr off + foreach n $ns { if {[file isfile $n]} { if {[string first :.html $n] > 0} { @@ -30,6 +31,10 @@ proc RobotFileExist {area host path} { incr l -1 if {![string length [lindex $comp $l]]} { set comp [split $area/$host$path:.html /] + } elseif {[file exists [join $comp /]]} { + return 1 + } else { + set comp [split $area/$host$path/:.html /] } return [file exists [join $comp /]] } @@ -52,7 +57,14 @@ proc RobotFileUnlink {area host path} { proc RobotFileOpen {area host path} { set orgPwd [pwd] + global workdir + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path" + if {[string compare $orgPwd $workdir]} { + puts "workdir = $workdir" + puts "pwd = $orgPwd" + exit 1 + } set comp [split $area/$host$path /] set len [llength $comp] incr len -1 @@ -70,17 +82,22 @@ proc RobotFileOpen {area host path} { set out [open :.html w] } cd $orgPwd + #puts "RobotFileStop" return $out } proc RobotRestart {} { global URL - + while {1} { set url [RobotFileNext unvisited] - if {![string length $url]} break + if {![string length $url]} { + puts "No more unvisited" + break + } set r [RobotGetUrl $url {}] if {!$r} { + puts "RobotGetUrl returned 0 on url=$url" return } else { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) @@ -91,7 +108,7 @@ proc RobotRestart {} { proc headSave {url out title} { global URL - + puts $out {} puts $out "$title" if {[info exists URL($url,head,last-modified)]} { @@ -116,13 +133,122 @@ proc headSave {url out title} { puts $out {} } -proc RobotSave {url} { +proc RobotHref {url hrefx hostx pathx} { + global URL domains + upvar $hrefx href + upvar $hostx host + upvar $pathx path + + # puts "Ref url = $url href=$href" + # get method (if any) + if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { + set hpath $href + set method http + } else { + if {[string compare $method http]} { + return 0 + } + } + # get host (if any) + if {![regexp {^//([^/]+)(.*)} $hpath x host epath]} { + set epath $hpath + set host $URL($url,host) + } else { + if {![string length $epath]} { + set epath / + } + set ok 0 + foreach domain $domains { + if {[string match $domain $host]} { + set ok 1 + break + } + } + if {!$ok} { + return 0 + } + } + if {[regexp {^(\#|\?)} $epath]} { + # within page + return 0 + } elseif {![regexp {^([/][^\#?]*)} $epath x path]} { + # relative path + set ext [file extension $URL($url,path)] + if {[string compare $ext {}]} { + set dpart [file dirname $URL($url,path)] + } else { + set dpart $URL($url,path) + } + regexp {^([^\#?]+)} $epath x path + set path [string trimright $dpart /]/$path + } + set c [split $path /] + set i [llength $c] + incr i -1 + set path [lindex $c $i] + incr i -1 + while {$i >= 0} { + switch -- [lindex $c $i] { + .. { + incr i -2 + } + . { + incr i -1 + } + default { + set path [lindex $c $i]/$path + incr i -1 + } + } + } + set href "$method://$host$path" + # puts "Ref href = $href" + return 1 +} + +proc Robot401 {url} { + global URL + + puts "Bad link $url" + RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + if {![RobotFileExist forbidden $URL($url,host) $URL($url,path)]} { + set outf [RobotFileOpen forbidden $URL($url,host) $URL($url,path)] + close $outf + } +} + +proc Robot404 {url} { + global URL + + puts "Bad link $url" + RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] + close $outf + } +} + +proc Robot301 {url tourl} { + global URL + + puts "Redirecting from $url to $tourl" + RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + if {[RobotHref $url tourl host path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + close $outf + } + } +} + +proc Robot200 {url} { global URL domains + # puts "Parsing $url" set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] set ti 0 - if {[info exists URL($url,line)]} { - set htmlContent [join $URL($url,line) \n] + if {[info exists URL($url,buf)]} { + set htmlContent $URL($url,buf) htmlSwitch $htmlContent \ title { @@ -145,130 +271,136 @@ proc RobotSave {url} { headSave $url $out "untitled" set ti 1 } - - if {[regexp {^\#} $parm(href)]} { - continue - } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} { - set ok 0 - if {![string compare $method http]} { - if {![regexp {^//([^/]+)(.*)} $hpath x host path]} { - set host $URL($url,host) - set path $hpath - } - foreach domain $domains { - if {[string match $domain $host]} { - set ok 1 - break - } - } - } - if {!$ok} continue - } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} { - set host $URL($url,host) - set method http - } else { - set ext [file extension $URL($url,path)] - if {[string compare $ext {}]} { - set dpart [file dirname $URL($url,path)] - } else { - set dpart $URL($url,path) - } - regexp {^([^#]+)} $parm(href) x path - set host $URL($url,host) - set path [string trimright $dpart /]/$path - set method http - } - set ext [file extension $path] - if {![string length $ext]} { - set path [string trimright $path /]/ - } else { - set path [string trimright $path /] - } - set c [split $path /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { - .. { - incr i -2 - } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 - } - } - } - set href "$method://$host$path" - - puts $out "" - puts $out "$href" - puts $out "$body" - puts $out "" - - if {![regexp {/.*bin/} $href)]} { - if {![RobotFileExist visited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] - close $outf - } - } + if {1} { + set href $parm(href) + if {![RobotHref $url href host path]} continue + + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} { + puts "--- Error $msg" + exit 1 + } + close $outf + } + } } } if {!$ti} { - headSave $url $out "untitled" - set ti 1 + headSave $url $out "untitled" + set ti 1 } puts $out "" close $out + # puts "Parsing done" RobotFileUnlink unvisited $URL($url,host) $URL($url,path) } -proc RobotRead {url sock} { +proc RobotReadBody {url sock} { global URL - set readCount [gets $sock line] - if {$readCount < 0} { - if [eof $sock] { - close $sock - RobotSave $url - RobotRestart - } - } elseif {$readCount > 0} { - switch $URL($url,state) { - head { - puts "head: $line" - if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} { - set URL($url,head,[string tolower $name]) $value - } - } - html { - lappend URL($url,line) $line - } - skip { - close $sock - RobotSave $url - RobotRestart - } - } + set buffer [read $sock 16384] + set readCount [string length $buffer] + + if {$readCount <= 0} { + close $sock + Robot200 $url + RobotRestart } else { - set URL($url,state) html - if {[info exists URL($url,head,content-type)]} { - if {![string compare $URL($url,head,content-type) text/html]} { - set URL($url,state) html - } - } + # puts "Got $readCount bytes" + set URL($url,buf) $URL($url,buf)$buffer + } +} + +proc RobotReadHead {url sock} { + global URL + + set buffer [read $sock 8192] + set readCount [string length $buffer] + + if {$readCount <= 0} { + Robot404 $url + close $sock + RobotRestart + } else { + # puts "Got $readCount bytes" + set URL($url,buf) $URL($url,buf)$buffer + + set n [string first \n\n $URL($url,buf)] + if {$n > 1} { + set code 0 + set version {} + set headbuf [string range $URL($url,buf) 0 $n] + incr n + incr n + set URL($url,buf) [string range $URL($url,buf) $n end] + + regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code + set lines [split $headbuf \n] + foreach line $lines { + if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} { + set URL($url,head,[string tolower $name]) $value + } + } + puts "code = $code" + set URL($url,state) skip + switch $code { + 301 { + Robot301 $url $URL($url,head,location) + close $sock + RobotRestart + } + 302 { + Robot301 $url $URL($url,head,location) + close $sock + RobotRestart + } + 404 { + Robot404 $url + close $sock + RobotRestart + } + 401 { + Robot401 $url + close $sock + RobotRestart + } + 200 { + if {[info exists URL($url,head,content-type)]} { + if {![string compare $URL($url,head,content-type) text/html]} { + set URL($url,state) html + } + } + if {[string compare $URL($url,state) html]} { + close $sock + Robot200 $url + RobotRestart + } else { + fileevent $sock readable [list RobotReadBody $url $sock] + } + } + default { + Robot404 $url + close $sock + RobotRestart + } + } + } } } proc RobotConnect {url sock} { global URL - fileevent $sock readable [list RobotRead $url $sock] + fconfigure $sock -translation {auto crlf} -blocking 0 + puts "Reading $url" + fileevent $sock readable [list RobotReadHead $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" + puts $sock "Host: $URL($url,host)" puts $sock "" flush $sock } @@ -279,23 +411,24 @@ proc RobotNop {} { proc RobotGetUrl {url phost} { global URL - set port 80 puts "---------" puts $url - if {[regexp {([^:]+)://([^/]+)([^ ]*)} $url x method host path]} { - puts "method=$method host=$host path=$path" - } else { + if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} { return -1 } + if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { + set port 80 + set host $hostport + } set URL($url,method) $method set URL($url,host) $host set URL($url,port) $port set URL($url,path) $path set URL($url,state) head + set URL($url,buf) {} if [catch {set sock [socket -async $host $port]}] { return -1 } - fconfigure $sock -translation {auto crlf} RobotConnect $url $sock return 0 @@ -304,7 +437,7 @@ proc RobotGetUrl {url phost} { if {![llength [info commands htmlSwitch]]} { set e [info sharedlibextension] if {[catch {load ./tclrobot$e}]} { - load tclrobot$e + load tclrobot$e } } @@ -313,6 +446,9 @@ if {[llength $argv] < 2} { puts " Example: '*.dk' www.indexdata.dk" exit 1 } + +set workdir [pwd] + set domains [lindex $argv 0] set site [lindex $argv 1] if {[string length $site]} { @@ -320,5 +456,6 @@ if {[string length $site]} { close $x } + RobotRestart vwait forever