From: Adam Dickmeiss Date: Tue, 23 Jan 2001 09:20:32 +0000 (+0000) Subject: Multiple http connections. Bug fixes. X-Git-Tag: ZMBOT.0.1~31 X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=commitdiff_plain;h=4adc245769f17af7801647a0314dda7dfefe1dba Multiple http connections. Bug fixes. --- diff --git a/robot.tcl b/robot.tcl index 82e5c28..ddbfb82 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.9 2000/12/11 17:11:03 adam Exp $ +# $Id: robot.tcl,v 1.10 2001/01/23 09:20:32 adam Exp $ # proc RobotFileNext1 {area lead} { puts "RobotFileNext1 area=$area lead=$lead" @@ -26,35 +26,49 @@ proc RobotFileNext1 {area lead} { return {} } -proc RobotFileWait {} { - global robotSeq - set robotSeq 0 +proc RobotWriteRecord {outf fromurl distance} { + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + set distance [string trim [gets $inf]] + puts "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq + global robotSeq global idleTime ns + puts "RobotFileNext robotSeq=$robotSeq" - if {[catch {set ns [glob ${area}/*]}]} { - return {} + if {$robotSeq < 0} { + return {} + } + if {$robotSeq == 0} { + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } } set off [string length $area] incr off - set n [lindex $ns $robotSeq] if {![string length $n]} { + set robotSeq -1 flush stdout puts "------------ N E X T R O U N D --------" - set robotSeq -1 - after 60000 RobotFileWait - vwait robotSeq - - set n [lindex $ns $robotSeq] - if {![string length $n]} { - puts "robotSeq = $robotSeq" - puts "ns=$ns" - puts "no more work at index" - return {} - } + return wait } incr robotSeq if {[file isfile $n/frobots.txt]} { @@ -73,16 +87,14 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin" - puts "area=$area host=$host path=$path" + puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" - puts "RobotFileExist end" + puts "RobotFileExist end npath=$npath" return [file exists $npath] } @@ -140,7 +152,6 @@ proc RobotFileOpen {area host path {mode w}} { } if {[catch {cd ./$d}]} { exec mkdir $d - puts "creating $d" cd ./$d if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { set out [open frobots.txt w] @@ -153,39 +164,68 @@ proc RobotFileOpen {area host path {mode w}} { if {[string length $d]} { if {[file isdirectory $d]} { set out [open $d/f $mode] - puts "1" } else { set out [open f$d $mode] - puts "2" } } else { set out [open f $mode] - puts "3" } cd $orgPwd - #puts "RobotFileStop" return $out } -proc RobotRestart {sock} { - global URL - global robotMoreWork - +proc RobotRR {} { + global robotSeq robotsRunning + + incr robotsRunning -1 + while {$robotsRunning} { + vwait robotsRunning + } + set robotSeq 0 + RobotStart +} + +proc RobotRestart {url sock} { + global URL robotsRunning + close $sock after cancel $URL($sock,cancel) - while {1} { + + foreach v [array names URL $url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart +} + +proc RobotStart {} { + global URL + global robotsRunning robotsMax idleTime + + puts "RobotStart" + while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { - break + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idleTime RobotRR + return } set r [RobotGetUrl $url {}] if {!$r} { - return + if {$robotsRunning >= $robotsMax} return } else { - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - } + incr robotsRunning -1 + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileClose $outf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + } } - incr robotMoreWork -1 } proc headSave {url out} { @@ -246,7 +286,7 @@ proc RobotHref {url hrefx hostx pathx} { } } else { regexp {^([^\#]*)} $hpath x surl - set host $URL($url,host) + set host $URL($url,hostport) } if {![string length $surl]} { return 0 @@ -284,18 +324,9 @@ proc RobotHref {url hrefx hostx pathx} { } } regsub -all {~} $path {%7E} path - set ok 1 - if {[info exists URL($host,robots)]} { - foreach l $URL($host,robots) { - if {[string first [lindex $l 1] $path] == 0} { - set ok [lindex $l 0] - break - } - } - } set href "$method://$host$path" - puts "Ref href = $href, ok=$ok" - return $ok + puts "Ref href = $href" + return 1 } proc RobotError {url code} { @@ -303,16 +334,16 @@ proc RobotError {url code} { puts "Bad URL $url, $code" set fromurl {} - if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] - close $inf - } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url $code" - puts $outf "Reference $fromurl" + set distance -1 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } @@ -322,35 +353,57 @@ proc RobotRedirect {url tourl code} { puts "Redirecting from $url to $tourl" + set distance {} set fromurl {} - if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance RobotFileClose $inf } - if {[catch {RobotFileUnlink unvisited $URL($url,host) $URL($url,path)}]} { - puts "unlink failed" - exit 1 - } - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url to $tourl $code" - puts $outf "Reference $fromurl" + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } if {[RobotHref $url tourl host path]} { - if {![RobotFileExist unvisited $host $path]} { - puts "Mark as unvisited" - set outf [RobotFileOpen unvisited $host $path] - puts $outf $code - RobotFileClose $outf + if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + puts "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } } } + if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + puts "unlink failed" + exit 1 + } } proc RobotTextHtml {url out} { - global URL + global URL maxDistance + set distance 0 + if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + set distance [expr $URL($url,dist) + 1] + } htmlSwitch $URL($url,buf) \ title { puts $out "$body" @@ -374,22 +427,47 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {1} { - set href $parm(href) + if {[expr $distance <= $maxDistance]} { + set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue puts $out "" puts $out "$href" puts $out "$body" puts $out "" - + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 if {![RobotFileExist bad $host $path]} { - if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} { - puts "--- Error $msg" - exit 1 - } - puts $outf $url + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance RobotFileClose $outf } } @@ -400,10 +478,13 @@ proc RobotTextHtml {url out} { proc RobotsTxt {url} { global agent URL - set v URL($URL($url,host),robots) + RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf) +} + +proc RobotsTxt0 {v buf} { + global URL agent set section 0 - foreach l [split $URL($url,buf) \n] { - puts $l + foreach l [split $buf \n] { if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} { puts "cmd=$cmd arg=$arg" switch $cmd { @@ -444,13 +525,26 @@ proc RobotTextPlain {url out} { proc Robot200 {url} { global URL domains - puts "Parsing $url" - set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] puts $out "" + + set distance 1000 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + set URL($url,dist) $distance + puts $out "" + puts $out " $distance" + puts $out "" headSave $url $out + puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { text/html { - RobotTextHtml $url $out + if {[string length $distance]} { + RobotTextHtml $url $out + } } text/plain { RobotTextPlain $url $out @@ -464,21 +558,22 @@ proc Robot200 {url} { puts $out "" RobotFileClose $out # puts "Parsing done" - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) } proc RobotReadContent {url sock binary} { global URL + puts "RobotReadContent $url" set buffer [read $sock 16384] set readCount [string length $buffer] if {$readCount <= 0} { Robot200 $url - RobotRestart $sock + RobotRestart $url $sock } elseif {!$binary && [string first \0 $buffer] >= 0} { Robot200 $url - RobotRestart $sock + RobotRestart $url $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -491,13 +586,13 @@ proc RobotReadHeader {url sock} { puts "RobotReadHeader $url" if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 - RobotRestart $sock + RobotRestart $url $sock } set readCount [string length $buffer] if {$readCount <= 0} { RobotError $url 404 - RobotRestart $sock + RobotRestart $url $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -522,11 +617,11 @@ proc RobotReadHeader {url sock} { switch $code { 301 { RobotRedirect $url $URL($url,head,location) 301 - RobotRestart $sock + RobotRestart $url $sock } 302 { RobotRedirect $url $URL($url,head,location) 302 - RobotRestart $sock + RobotRestart $url $sock } 200 { if {![info exists URL($url,head,content-type)]} { @@ -542,18 +637,18 @@ proc RobotReadHeader {url sock} { } default { RobotError $url $code - RobotRestart $sock + RobotRestart $url $sock } } } } } -proc RobotSockCancel {sock url} { +proc RobotSockCancel {url sock} { puts "RobotSockCancel sock=$sock url=$url" RobotError $url 401 - RobotRestart $sock + RobotRestart $url $sock } proc RobotConnect {url sock} { @@ -566,7 +661,7 @@ proc RobotConnect {url sock} { puts $sock "User-Agent: $agent" puts $sock "" flush $sock - set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]] + set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] } proc RobotNop {} { @@ -574,11 +669,10 @@ proc RobotNop {} { } proc RobotGetUrl {url phost} { - global URL + global URL robotsRunning flush stdout - puts "---------" - puts $url - if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} { + puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { @@ -587,10 +681,36 @@ proc RobotGetUrl {url phost} { } set URL($url,method) $method set URL($url,host) $host - set URL($url,port) $port + set URL($url,hostport) $hostport set URL($url,path) $path set URL($url,state) head set URL($url,buf) {} + + if {[string compare $path /robots.txt]} { + set ok 1 + if {![info exists URL($hostport,robots)]} { + puts "READING robots.txt for host $hostport" + if {[RobotFileExist visited $hostport /robots.txt]} { + set inf [RobotFileOpen visited $hostport /robots.txt r] + set buf [read $inf 32768] + close $inf + } else { + set buf "User-Agent: *\nAllow: /\n" + } + RobotsTxt0 URL($hostport,robots) $buf + } + if {[info exists URL($hostport,robots)]} { + foreach l $URL($hostport,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } + if {!$ok} { + return -1 + } + } if [catch {set sock [socket -async $host $port]}] { return -1 } @@ -618,25 +738,32 @@ proc bgerror {m} { puts $errorInfo } -set robotMoreWork 0 +set robotsRunning 0 +set robotsMax 5 set robotSeq 0 set workdir [pwd] +set idleTime 60000 if {[llength $argv] < 2} { - puts "Tclrobot: usage " - puts " Example: '*.indexdata.dk' http://www.indexdata.dk/" + puts "Tclrobot: usage " + puts " Example: 3 '*.indexdata.dk' http://www.indexdata.dk/" exit 1 } -set domains [lindex $argv 0] -foreach site [lindex $argv 1] { - incr robotMoreWork - if [RobotGetUrl $site {}] { - incr robotMoreWork -1 - puts "Couldn't process $site" +set maxDistance [lindex $argv 0] +set domains [lindex $argv 1] +foreach href [lindex $argv 2] { + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $href 0 + RobotFileClose $outf + } } } -while {$robotMoreWork} { - vwait robotMoreWork +RobotStart + +while {$robotsRunning} { + vwait robotsRunning }