From: Adam Dickmeiss Date: Sun, 10 Dec 2000 22:27:48 +0000 (+0000) Subject: Implemented robots.txt rules. X-Git-Tag: ZMBOT.0.1~33 X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=commitdiff_plain;h=563c2ed58045d7cd4c977af162f13d1fa308b4a5 Implemented robots.txt rules. --- diff --git a/robot.tcl b/robot.tcl index 15468bf..4934b92 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $ +# $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $ # proc RobotFileNext1 {area} { if {[catch {set ns [glob ${area}/*]}]} { @@ -44,7 +44,7 @@ proc RobotFileNext {area} { if {![string length $n]} { puts "------------ N E X T R O U N D --------" set robotSeq -1 - after 2000 RobotFileWait + after 30000 RobotFileWait vwait robotSeq set n [lindex $ns $robotSeq] @@ -109,8 +109,9 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path" + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { + puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" puts "pwd = $orgPwd" exit 1 @@ -145,10 +146,12 @@ proc RobotFileOpen {area host path {mode w}} { return $out } -proc RobotRestart {} { +proc RobotRestart {sock} { global URL global robotMoreWork - + + close $sock + after cancel $URL($sock,cancel) while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -156,13 +159,12 @@ proc RobotRestart {} { } set r [RobotGetUrl $url {}] if {!$r} { - puts "RobotGetUrl returned 0 on url=$url" return } else { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) } } - set robotMoreWork 0 + incr robotMoreWork -1 } proc headSave {url out} { @@ -257,18 +259,28 @@ proc RobotHref {url hrefx hostx pathx} { incr i -1 } } - } + } + regsub -all {~} $path {%7E} path + set ok 1 + if {[info exists URL($host,robots)]} { + foreach l $URL($host,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } set href "$method://$host$path" - puts "Ref href = $href" - return 1 + puts "Ref href = $href, ok=$ok" + return $ok } -proc Robot401 {url} { +proc RobotError {url code} { global URL - puts "Bad URL $url" + puts "Bad URL $url, $code" set fromurl {} - catch { + if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] set fromurl [gets $inf] close $inf @@ -276,32 +288,13 @@ proc Robot401 {url} { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 401" + puts $outf "URL=$url $code" puts $outf "Reference $fromurl" RobotFileClose $outf } } -proc Robot404 {url} { - global URL - - puts "Bad URL $url" - set fromurl {} - catch { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] - RobotFileClose $inf - } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 404" - puts $outf "Reference $fromurl" - RobotFileClose $outf - } - } - -proc Robot301 {url tourl} { +proc RobotRedirect {url tourl code} { global URL puts "Redirecting from $url to $tourl" @@ -315,7 +308,7 @@ proc Robot301 {url tourl} { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url to $tourl 301" + puts $outf "URL=$url to $tourl $code" puts $outf "Reference $fromurl" RobotFileClose $outf } @@ -323,7 +316,7 @@ proc Robot301 {url tourl} { if {![RobotFileExist unvisited $host $path]} { puts "Mark as unvisited" set outf [RobotFileOpen unvisited $host $path] - puts $outf 301 + puts $outf $code RobotFileClose $outf } } @@ -396,6 +389,38 @@ proc RobotTextHtml {url out} { puts $out "" } +proc RobotsTxt {url} { + global agent URL + + set v URL($URL($url,host),robots) + set section 0 + foreach l [split $URL($url,buf) \n] { + puts $l + if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} { + puts "cmd=$cmd arg=$arg" + switch $cmd { + User-Agent { + if {$section} break + set pat [string tolower $arg]* + set section [string match $pat $agent] + } + Disallow { + if {$section} { + puts "rule [list 0 $arg]" + lappend $v [list 0 $arg] + } + } + Allow { + if {$section} { + puts "rule [list 1 $arg]" + lappend $v [list 1 $arg] + } + } + } + } + } +} + proc RobotTextPlain {url out} { global URL @@ -404,6 +429,10 @@ proc RobotTextPlain {url out} { puts $out $URL($url,buf) puts $out "" puts $out "" + + if {![string compare $URL($url,path) /robots.txt]} { + RobotsTxt $url + } } proc Robot200 {url} { @@ -433,11 +462,13 @@ proc RobotReadContent {url sock} { set buffer [read $sock 16384] set readCount [string length $buffer] - + if {$readCount <= 0} { - close $sock Robot200 $url - RobotRestart + RobotRestart $sock + } elseif {[string first \0 $buffer] >= 0} { + Robot200 $url + RobotRestart $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -447,13 +478,15 @@ proc RobotReadContent {url sock} { proc RobotReadHeader {url sock} { global URL - set buffer [read $sock 2148] + if {[catch {set buffer [read $sock 2148]}]} { + RobotError $url 404 + RobotRestart $sock + } set readCount [string length $buffer] if {$readCount <= 0} { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -478,24 +511,20 @@ proc RobotReadHeader {url sock} { set URL($url,state) skip switch $code { 301 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 301 + RobotRestart $sock } 302 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 302 + RobotRestart $sock } 404 { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } 401 { - Robot401 $url - close $sock - RobotRestart + RobotError $url 401 + RobotRestart $sock } 200 { if {![info exists URL($url,head,content-type)]} { @@ -509,33 +538,38 @@ proc RobotReadHeader {url sock} { fileevent $sock readable [list RobotReadContent $url $sock] } default { - close $sock Robot200 $url - RobotRestart + RobotRestart $sock } } } default { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } } } } } +proc RobotSockCancel {sock url} { + + puts "RobotSockCancel sock=$sock url=$url" + RobotError $url 401 + RobotRestart $sock +} + proc RobotConnect {url sock} { global URL agent fconfigure $sock -translation {auto crlf} -blocking 0 - puts "Reading $url" fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" puts $sock "User-Agent: $agent" puts $sock "" flush $sock + set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]] } proc RobotNop {} { @@ -574,7 +608,6 @@ if {![llength [info commands htmlSwitch]]} { } } - set agent "zmbot/0.0" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" @@ -582,7 +615,9 @@ if {![catch {set os [exec uname -s -r]}]} { } proc bgerror {m} { + global errorInfo puts "BGERROR $m" + puts $errorInfo } set robotMoreWork 0 @@ -596,11 +631,10 @@ if {[llength $argv] < 2} { } set domains [lindex $argv 0] -set site [lindex $argv 1] -if {[string length $site]} { - set robotMoreWork 1 +foreach site [lindex $argv 1] { + incr robotMoreWork if [RobotGetUrl $site {}] { - set robotMoreWork 0 + incr robotMoreWork -1 puts "Couldn't process $site" } }