X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=c7d85c47a68dfe22d74f029269b06a90de19b5de;hb=8278051059f0aa2849729755d70967d58dddd8a6;hp=15468bfdfdf5bfd5408b783ed5f9b1e4f20f8b33;hpb=04f32b20fae8795aab0c1f8b703394056f3efea3;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 15468bf..c7d85c4 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,24 +1,23 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $ +# $Id: robot.tcl,v 1.13 2001/02/26 22:51:51 adam Exp $ # -proc RobotFileNext1 {area} { +proc RobotFileNext1 {area lead} { + puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } - set off [string first / $area] - incr off - foreach n $ns { if {[file isfile $n]} { - if {[string first :.html $n] > 0} { - return http://[string range $area/ $off end] - } - return http://[string range $n $off end] + set off [string last / $n] + incr off 2 + return $lead/[string range $n $off end] } } foreach n $ns { if {[file isdirectory $n]} { - set sb [RobotFileNext1 $n] + set off [string last / $n] + incr off 2 + set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb } @@ -27,66 +26,91 @@ proc RobotFileNext1 {area} { return {} } -proc RobotFileWait {} { - global robotSeq - set robotSeq 0 +proc RobotWriteRecord {outf fromurl distance} { + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + set distance [string trim [gets $inf]] + puts "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq - if {[catch {set ns [glob ${area}/*]}]} { - return {} + global robotSeq global idleTime ns + + puts "RobotFileNext robotSeq=$robotSeq" + if {$robotSeq < 0} { + return {} + } + if {$robotSeq == 0} { + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } } set off [string length $area] incr off - set n [lindex $ns $robotSeq] if {![string length $n]} { - puts "------------ N E X T R O U N D --------" set robotSeq -1 - after 2000 RobotFileWait - vwait robotSeq - - set n [lindex $ns $robotSeq] - if {![string length $n]} { - return {} - } + flush stdout + puts "------------ N E X T R O U N D --------" + return wait } incr robotSeq - if {[file isfile $n/robots.txt]} { + if {[file isfile $n/frobots.txt]} { puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { - set sb [RobotFileNext1 $n] + set sb [RobotFileNext1 $n http://[string range $n $off end]] if {[string length $sb]} { return $sb } } + puts "no more work at end of RobotFileNext n=$n" + puts "ns=$ns" return {} } proc RobotFileExist {area host path} { - set comp [split $area/$host$path /] - set l [llength $comp] + puts "RobotFileExist begin area=$area host=$host path=$path" + set lpath [split $path /] + set l [llength $lpath] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] - } elseif {[file exists [join $comp /]]} { - return 1 - } else { - set comp [split $area/$host$path/:.html /] - } - return [file exists [join $comp /]] + set t [lindex $lpath $l] + incr l -1 + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + puts "RobotFileExist end npath=$npath" + return [file exists $npath] } proc RobotFileUnlink {area host path} { - set comp [split $area/$host$path /] + puts "RobotFileUnlink begin" + puts "area=$area host=$host path=$path" + set lpath [split $path /] + set l [llength $lpath] + incr l -1 + set t [lindex $lpath $l] + incr l -1 + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + puts "npath=$npath" + set comp [split $npath /] set l [llength $comp] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] - } if {[catch {exec rm [join $comp /]}]} return incr l -1 for {set i $l} {$i > 0} {incr i -1} { @@ -94,6 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } + puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -109,8 +134,9 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path" + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { + puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" puts "pwd = $orgPwd" exit 1 @@ -119,12 +145,16 @@ proc RobotFileOpen {area host path {mode w}} { set len [llength $comp] incr len -1 for {set i 0} {$i < $len} {incr i} { - set d [lindex $comp $i] + if {$i > 1} { + set d "d[lindex $comp $i]" + } else { + set d [lindex $comp $i] + } if {[catch {cd ./$d}]} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { - set out [open robots.txt w] + set out [open frobots.txt w] puts "creating robots.txt in $d" close $out } @@ -133,42 +163,74 @@ proc RobotFileOpen {area host path {mode w}} { set d [lindex $comp $len] if {[string length $d]} { if {[file isdirectory $d]} { - set out [open $d/:.html $mode] + set out [open $d/f $mode] } else { - set out [open $d $mode] + set out [open f$d $mode] } } else { - set out [open :.html $mode] + set out [open f $mode] } cd $orgPwd - #puts "RobotFileStop" return $out } -proc RobotRestart {} { +proc RobotRR {} { + global robotSeq robotsRunning + + incr robotsRunning -1 + while {$robotsRunning} { + vwait robotsRunning + } + set robotSeq 0 + RobotStart +} + +proc RobotRestart {url sock} { + global URL robotsRunning + + close $sock + after cancel $URL($sock,cancel) + + foreach v [array names URL $url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart +} + +proc RobotStart {} { global URL - global robotMoreWork - - while {1} { + global robotsRunning robotsMax idleTime + + puts "RobotStart" + while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { - break + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idleTime RobotRR + return } set r [RobotGetUrl $url {}] if {!$r} { - puts "RobotGetUrl returned 0 on url=$url" - return + if {$robotsRunning >= $robotsMax} return } else { - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - } + incr robotsRunning -1 + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileClose $outf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + } } - set robotMoreWork 0 } proc headSave {url out} { global URL - puts $out {} if {[info exists URL($url,head,last-modified)]} { puts $out "$URL($url,head,last-modified)" } @@ -198,6 +260,16 @@ proc RobotHref {url hrefx hostx pathx} { upvar $pathx path puts "Ref url = $url href=$href" + + if {[string first { } $href] >= 0} { + return 0 + } + if {[string length $href] > 256} { + return 0 + } + if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { + return 0 + } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -212,19 +284,21 @@ proc RobotHref {url hrefx hostx pathx} { if {![string length $surl]} { set surl / } - set ok 0 - foreach domain $domains { - if {[string match $domain $host]} { - set ok 1 - break + if {[info exist domains]} { + set ok 0 + foreach domain $domains { + if {[string match $domain $host]} { + set ok 1 + break + } } - } - if {!$ok} { - return 0 - } + if {!$ok} { + return 0 + } + } } else { regexp {^([^\#]*)} $hpath x surl - set host $URL($url,host) + set host $URL($url,hostport) } if {![string length $surl]} { return 0 @@ -248,6 +322,9 @@ proc RobotHref {url hrefx hostx pathx} { switch -- [lindex $c $i] { .. { incr i -2 + if {$i < 0} { + set i 0 + } } . { incr i -1 @@ -257,94 +334,92 @@ proc RobotHref {url hrefx hostx pathx} { incr i -1 } } - } + } + regsub -all {~} $path {%7E} path set href "$method://$host$path" puts "Ref href = $href" return 1 } -proc Robot401 {url} { +proc RobotError {url code} { global URL - puts "Bad URL $url" + puts "Bad URL $url, $code" set fromurl {} - catch { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] - close $inf - } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 401" - puts $outf "Reference $fromurl" - RobotFileClose $outf - } -} - -proc Robot404 {url} { - global URL - - puts "Bad URL $url" - set fromurl {} - catch { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] + set distance -1 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance RobotFileClose $inf } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 404" - puts $outf "Reference $fromurl" + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } - } +} -proc Robot301 {url tourl} { +proc RobotRedirect {url tourl code} { global URL puts "Redirecting from $url to $tourl" + set distance {} set fromurl {} - catch { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance RobotFileClose $inf } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url to $tourl 301" - puts $outf "Reference $fromurl" + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } if {[RobotHref $url tourl host path]} { - if {![RobotFileExist unvisited $host $path]} { - puts "Mark as unvisited" - set outf [RobotFileOpen unvisited $host $path] - puts $outf 301 - RobotFileClose $outf + if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + puts "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } } } + if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + puts "unlink failed" + exit 1 + } } proc RobotTextHtml {url out} { - global URL + global URL maxDistance - set head 0 + set distance 0 + if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + set distance [expr $URL($url,dist) + 1] + } htmlSwitch $URL($url,buf) \ title { - if {!$head} { - headSave $url $out - set head 1 - } puts $out "$body" } -nonest meta { - if {!$head} { - headSave $url $out - set head 1 - } puts -nonewline $out "" puts $out $nbody puts $out "" - } a { + } -nonest a { if {![info exists parm(href)]} { puts "no href" continue } - if {!$head} { - headSave $url $out - set head 1 - } - if {1} { - set href $parm(href) + if {[expr $distance <= $maxDistance]} { + set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue puts $out "" puts $out "$href" puts $out "$body" puts $out "" - + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 if {![RobotFileExist bad $host $path]} { - if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} { - puts "--- Error $msg" - exit 1 - } - puts $outf $url + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance RobotFileClose $outf } } } } - if {!$head} { - headSave $url $out - set head 1 +} + +proc RobotsTxt {url} { + global agent URL + + RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf) +} + +proc RobotsTxt0 {v buf} { + global URL agent + set section 0 + foreach l [split $buf \n] { + if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} { + puts "cmd=$cmd arg=$arg" + switch $cmd { + User-Agent { + if {$section} break + set pat [string tolower $arg]* + set section [string match $pat $agent] + } + Disallow { + if {$section} { + puts "rule [list 0 $arg]" + lappend $v [list 0 $arg] + } + } + Allow { + if {$section} { + puts "rule [list 1 $arg]" + lappend $v [list 1 $arg] + } + } + } + } } - puts $out "" } proc RobotTextPlain {url out} { global URL - headSave $url $out puts $out "" puts $out $URL($url,buf) puts $out "" - puts $out "" + + if {![string compare $URL($url,path) /robots.txt]} { + RobotsTxt $url + } } proc Robot200 {url} { global URL domains - puts "Parsing $url" - set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] + puts $out "" + + set distance 1000 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + set URL($url,dist) $distance + puts $out "" + puts $out " $distance" + puts $out "" + headSave $url $out + puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { text/html { - RobotTextHtml $url $out + if {[string length $distance]} { + RobotTextHtml $url $out + } } text/plain { RobotTextPlain $url $out } - default { - headSave $url $out - puts $out "" + application/pdf { + set pdff [open test.pdf w] + puts -nonewline $pdff $URL($url,buf) + close $pdff } } + puts $out "" RobotFileClose $out # puts "Parsing done" - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) } -proc RobotReadContent {url sock} { +proc RobotReadContent {url sock binary} { global URL + puts "RobotReadContent $url" set buffer [read $sock 16384] set readCount [string length $buffer] - + if {$readCount <= 0} { - close $sock Robot200 $url - RobotRestart + RobotRestart $url $sock + } elseif {!$binary && [string first \0 $buffer] >= 0} { + Robot200 $url + RobotRestart $url $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -447,95 +595,85 @@ proc RobotReadContent {url sock} { proc RobotReadHeader {url sock} { global URL - set buffer [read $sock 2148] + puts "RobotReadHeader $url" + if {[catch {set buffer [read $sock 2148]}]} { + RobotError $url 404 + RobotRestart $url $sock + } set readCount [string length $buffer] if {$readCount <= 0} { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $url $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer - set n [string first \n\n $URL($url,buf)] + set n [string first \r\n\r\n $URL($url,buf)] if {$n > 1} { set code 0 set version {} set headbuf [string range $URL($url,buf) 0 $n] - incr n - incr n + incr n 4 set URL($url,buf) [string range $URL($url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code set lines [split $headbuf \n] foreach line $lines { if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} { - set URL($url,head,[string tolower $name]) $value + set URL($url,head,[string tolower $name]) [string trim $value] } } puts "code = $code" set URL($url,state) skip switch $code { 301 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 301 + RobotRestart $url $sock } 302 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart - } - 404 { - Robot404 $url - close $sock - RobotRestart - } - 401 { - Robot401 $url - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 302 + RobotRestart $url $sock } 200 { if {![info exists URL($url,head,content-type)]} { set URL($url,head,content-type) {} } + set binary 0 switch $URL($url,head,content-type) { - text/html { - fileevent $sock readable [list RobotReadContent $url $sock] - } - text/plain { - fileevent $sock readable [list RobotReadContent $url $sock] - } - default { - close $sock - Robot200 $url - RobotRestart + application/pdf { + set binary 1 } } + fileevent $sock readable [list RobotReadContent $url $sock $binary] } default { - Robot404 $url - close $sock - RobotRestart + RobotError $url $code + RobotRestart $url $sock } } } } } +proc RobotSockCancel {url sock} { + + puts "RobotSockCancel sock=$sock url=$url" + RobotError $url 401 + RobotRestart $url $sock +} + proc RobotConnect {url sock} { global URL agent - fconfigure $sock -translation {auto crlf} -blocking 0 - puts "Reading $url" + fconfigure $sock -translation {lf crlf} -blocking 0 fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" puts $sock "User-Agent: $agent" puts $sock "" flush $sock + set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] } proc RobotNop {} { @@ -543,10 +681,10 @@ proc RobotNop {} { } proc RobotGetUrl {url phost} { - global URL - puts "---------" - puts $url - if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} { + global URL robotsRunning + flush stdout + puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { @@ -555,10 +693,36 @@ proc RobotGetUrl {url phost} { } set URL($url,method) $method set URL($url,host) $host - set URL($url,port) $port + set URL($url,hostport) $hostport set URL($url,path) $path set URL($url,state) head set URL($url,buf) {} + + if {[string compare $path /robots.txt]} { + set ok 1 + if {![info exists URL($hostport,robots)]} { + puts "READING robots.txt for host $hostport" + if {[RobotFileExist visited $hostport /robots.txt]} { + set inf [RobotFileOpen visited $hostport /robots.txt r] + set buf [read $inf 32768] + close $inf + } else { + set buf "User-Agent: *\nAllow: /\n" + } + RobotsTxt0 URL($hostport,robots) $buf + } + if {[info exists URL($hostport,robots)]} { + foreach l $URL($hostport,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } + if {!$ok} { + return -1 + } + } if [catch {set sock [socket -async $host $port]}] { return -1 } @@ -574,37 +738,91 @@ if {![llength [info commands htmlSwitch]]} { } } - set agent "zmbot/0.0" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" - puts "agent: $agent" } +puts "agent: $agent" + proc bgerror {m} { + global errorInfo puts "BGERROR $m" + puts $errorInfo } -set robotMoreWork 0 +set robotsRunning 0 set robotSeq 0 set workdir [pwd] +set idleTime 60000 + +set i 0 +set l [llength $argv] -if {[llength $argv] < 2} { - puts "Tclrobot: usage " - puts " Example: '*.indexdata.dk' http://www.indexdata.dk/" +if {$l < 2} { + puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} + puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1 } -set domains [lindex $argv 0] -set site [lindex $argv 1] -if {[string length $site]} { - set robotMoreWork 1 - if [RobotGetUrl $site {}] { - set robotMoreWork 0 - puts "Couldn't process $site" +while {$i < $l} { + set arg [lindex $argv $i] + switch -glob -- $arg { + -j* { + set robotsMax [string range $arg 2 end] + if {![string length $robotsMax]} { + set robotsMax [lindex $argv [incr i]] + } + } + -c* { + set maxDistance [string range $arg 2 end] + if {![string length $maxDistance]} { + set maxDistance [lindex $argv [incr i]] + } + } + -d* { + set dom [string range $arg 2 end] + if {![string length $dom]} { + set dom [lindex $argv [incr i]] + } + lappend domains $dom + } + -i* { + set idleTime [string range $arg 2 end] + if {![string length $idleTime]} { + set idleTime [lindex $argv [incr i]] + } + } + default { + set href $arg + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } + } } + incr i +} + +if {![info exist domains]} { + set domains {*} } +if {![info exist maxDistance]} { + set maxDistance 3 +} +if {![info exist robotsMax]} { + set robotsMax 5 +} + +puts "domains=$domains" +puts "max distance=$maxDistance" +puts "max jobs=$robotsMax" + +RobotStart -while {$robotMoreWork} { - vwait robotMoreWork +while {$robotsRunning} { + vwait robotsRunning }