From: Adam Dickmeiss
Date: Fri, 8 Dec 2000 22:46:53 +0000 (+0000)
Subject: File robots.txt is now read for each domain.
X-Git-Tag: ZMBOT.0.1~34
X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=commitdiff_plain;h=04f32b20fae8795aab0c1f8b703394056f3efea3

File robots.txt is now read for each domain. Pages are now fetched in
round-robin fashion.
---

diff --git a/dcdot.tcl b/dcdot.tcl
index af0bef7..a29e6f5 100755
--- a/dcdot.tcl
+++ b/dcdot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: dcdot.tcl,v 1.2 2000/12/08 08:55:35 adam Exp $
+# $Id: dcdot.tcl,v 1.3 2000/12/08 22:46:53 adam Exp $
 #

 proc RobotRestart {} {
@@ -99,7 +99,6 @@ proc RobotReadHeader {url sock} {
         }
     }
     default {
-        Robot404 $url
         close $sock
         RobotRestart
     }
@@ -170,4 +169,7 @@ if {$argc == 1} {
             puts $m
         }
     }
+    foreach v [array names URL $url,head,*] {
+        puts "$v = $URL($v)"
+    }
 }
diff --git a/robot.tcl b/robot.tcl
index 93c4541..15468bf 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,7 +1,7 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.6 2000/12/07 20:16:11 adam Exp $
+# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
 #
-proc RobotFileNext {area} {
+proc RobotFileNext1 {area} {
     if {[catch {set ns [glob ${area}/*]}]} {
         return {}
     }
@@ -18,7 +18,7 @@ proc RobotFileNext {area} {
     }
     foreach n $ns {
         if {[file isdirectory $n]} {
-            set sb [RobotFileNext $n]
+            set sb [RobotFileNext1 $n]
             if {[string length $sb]} {
                 return $sb
             }
@@ -27,6 +27,45 @@
     return {}
 }

+proc RobotFileWait {} {
+    global robotSeq
+    set robotSeq 0
+}
+
+proc RobotFileNext {area} {
+    global robotSeq
+    if {[catch {set ns [glob ${area}/*]}]} {
+        return {}
+    }
+    set off [string length $area]
+    incr off
+
+    set n [lindex $ns $robotSeq]
+    if {![string length $n]} {
+        puts "------------ N E X T R O U N D --------"
+        set robotSeq -1
+        after 2000 RobotFileWait
+        vwait robotSeq
+
+        set n [lindex $ns $robotSeq]
+        if {![string length $n]} {
+            return {}
+        }
+    }
+    incr robotSeq
+    if {[file isfile $n/robots.txt]} {
+        puts "ok returning http://[string range $n $off end]/robots.txt"
+        return http://[string range $n $off end]/robots.txt
+    } elseif {[file isdirectory $n]} {
+        set sb [RobotFileNext1 $n]
+        if {[string length $sb]} {
+            return $sb
+        }
+    }
+    return {}
+}
+
+
 proc RobotFileExist {area host path} {
     set comp [split $area/$host$path /]
     set l [llength $comp]
@@ -84,6 +123,11 @@ proc RobotFileOpen {area host path {mode w}} {
         if {[catch {cd ./$d}]} {
             exec mkdir $d
             cd ./$d
+            if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
+                set out [open robots.txt w]
+                puts "creating robots.txt in $d"
+                close $out
+            }
         }
     }
     set d [lindex $comp $len]
@@ -541,25 +585,8 @@ proc bgerror {m} {
     puts "BGERROR $m"
 }

-if {0} {
-    proc RobotRestart {} {
-        global robotMoreWork
-        set robotMoreWork 0
-        puts "myrestart"
-    }
-    set robotMoreWork 1
-    set url {http://www.indexdata.dk/zap/}
-    RobotGetUrl $url {}
-    while {$robotMoreWork} {
-        vwait robotMoreWork
-    }
-    puts "-----------"
-    puts $URL($url,buf)
-    puts "-----------"
-    exit 1
-}
-
 set robotMoreWork 0
+set robotSeq 0
 set workdir [pwd]

 if {[llength $argv] < 2} {
@@ -575,9 +602,6 @@
 if {[string length $site]} {
     if [RobotGetUrl $site {}] {
         set robotMoreWork 0
         puts "Couldn't process $site"
-    } else {
-        #set x [RobotFileOpen unvisited $site /robots.txt]
-        #RobotFileClose $x
     }
 }
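
The RobotFileOpen hunk seeds each newly created host directory under
unvisited with an empty robots.txt, so the scheduler hands out
http://<host>/robots.txt before any page of that host. A rough standalone
sketch of the per-host check the new RobotFileNext performs; the directory
walk and puts output here are illustrative, not taken from the patch:

#!/usr/bin/tclsh
# Sketch only: inspect the per-host queue directories the way the new
# RobotFileNext does, preferring a pending robots.txt for each host.
set area unvisited
foreach host [glob -nocomplain $area/*] {
    if {[file isfile $host/robots.txt]} {
        # robots.txt is still queued, so it is fetched first
        puts "fetch http://[file tail $host]/robots.txt"
    } elseif {[file isdirectory $host]} {
        # otherwise fall back to the depth-first page scan
        puts "scan $host for queued pages"
    }
}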
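
Pacing between rounds relies on Tcl's event loop rather than a blocking
sleep: after 2000 RobotFileWait schedules a callback that resets robotSeq,
and vwait robotSeq keeps servicing events (timers and the open HTTP
sockets) until that callback fires. A minimal self-contained sketch of the
same pattern, using an illustrative variable name wakeup rather than the
patch's robotSeq:

#!/usr/bin/tclsh
# Sketch only: pause 2000 ms without blocking the event loop,
# mirroring the after/vwait pair in the new RobotFileNext.
proc Wakeup {} {
    global wakeup
    set wakeup 1      ;# writing the variable releases vwait
}
set wakeup 0
after 2000 Wakeup     ;# run Wakeup once, 2000 ms from now
puts "waiting for the next round ..."
vwait wakeup          ;# process events until wakeup is written
puts "next round"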