#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
+# $Id: robot.tcl,v 1.11 2001/01/23 11:26:43 adam Exp $
#
-proc RobotFileNext1 {area} {
+proc RobotFileNext1 {area lead} {
+ puts "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
- set off [string first / $area]
- incr off
-
foreach n $ns {
if {[file isfile $n]} {
- if {[string first :.html $n] > 0} {
- return http://[string range $area/ $off end]
- }
- return http://[string range $n $off end]
+ set off [string last / $n]
+ incr off 2
+ return $lead/[string range $n $off end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n]
+ set off [string last / $n]
+ incr off 2
+ set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
return {}
}
-proc RobotFileWait {} {
- global robotSeq
- set robotSeq 0
+# RobotWriteRecord: write one <zmbot> link record to the open channel outf.
+# The record is eight lines: the <zmbot> wrapper, a <distance> element holding
+# distance and a <fromurl> element holding fromurl, with every tag and every
+# value on a line of its own.
+proc RobotWriteRecord {outf fromurl distance} {
+    set record [list "<zmbot>" \
+                     "<distance>" $distance "</distance>" \
+                     "<fromurl>" $fromurl "</fromurl>" \
+                     "</zmbot>"]
+    foreach line $record {
+        puts $outf $line
+    }
+}
+
+# RobotReadRecord: read a link record from channel inf by line position.
+#   fromurlx  - name of the caller's variable that receives the referring URL
+#   distancex - name of the caller's variable that receives the distance
+# Lines 3 and 6 of the record are whitespace-trimmed and stored; lines 1, 2,
+# 4 and 5 are read and discarded. For a record produced by RobotWriteRecord
+# the discarded lines are the <zmbot>, <distance>, </distance> and <fromurl>
+# tags.
+proc RobotReadRecord {inf fromurlx distancex} {
+    upvar $fromurlx fromurl $distancex distance
+
+    # Skip the two lines in front of the distance value.
+    gets $inf
+    gets $inf
+    set rawDistance [gets $inf]
+    set distance [string trim $rawDistance]
+    puts "got distance = $distance"
+
+    # Skip the two lines in front of the URL value.
+    gets $inf
+    gets $inf
+    set rawUrl [gets $inf]
+    set fromurl [string trim $rawUrl]
 }
proc RobotFileNext {area} {
- global robotSeq
- if {[catch {set ns [glob ${area}/*]}]} {
- return {}
+ global robotSeq global idleTime ns
+
+ puts "RobotFileNext robotSeq=$robotSeq"
+ if {$robotSeq < 0} {
+ return {}
+ }
+ if {$robotSeq == 0} {
+ if {[catch {set ns [glob ${area}/*]}]} {
+ return {}
+ }
}
set off [string length $area]
incr off
-
set n [lindex $ns $robotSeq]
if {![string length $n]} {
- puts "------------ N E X T R O U N D --------"
set robotSeq -1
- after 2000 RobotFileWait
- vwait robotSeq
-
- set n [lindex $ns $robotSeq]
- if {![string length $n]} {
- return {}
- }
+ flush stdout
+ puts "------------ N E X T R O U N D --------"
+ return wait
}
incr robotSeq
- if {[file isfile $n/robots.txt]} {
+ if {[file isfile $n/frobots.txt]} {
puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n]
+ set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
}
+ puts "no more work at end of RobotFileNext n=$n"
+ puts "ns=$ns"
return {}
}
proc RobotFileExist {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ set lpath [split $path /]
+ set l [llength $lpath]
incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- } elseif {[file exists [join $comp /]]} {
- return 1
- } else {
- set comp [split $area/$host$path/:.html /]
- }
- return [file exists [join $comp /]]
+ set t [lindex $lpath $l]
+ incr l -1
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ puts "RobotFileExist end npath=$npath"
+ return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
- set comp [split $area/$host$path /]
+ puts "RobotFileUnlink begin"
+ puts "area=$area host=$host path=$path"
+ set lpath [split $path /]
+ set l [llength $lpath]
+ incr l -1
+ set t [lindex $lpath $l]
+ incr l -1
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ puts "npath=$npath"
+ set comp [split $npath /]
set l [llength $comp]
incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- }
if {[catch {exec rm [join $comp /]}]} return
incr l -1
for {set i $l} {$i > 0} {incr i -1} {
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
+ puts "RobotFileUnlink end"
}
proc RobotFileClose {out} {
if {![info exists workdir]} {
return stdout
}
- puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
if {[string compare $orgPwd $workdir]} {
+ puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
puts "pwd = $orgPwd"
exit 1
set len [llength $comp]
incr len -1
for {set i 0} {$i < $len} {incr i} {
- set d [lindex $comp $i]
+ if {$i > 1} {
+ set d "d[lindex $comp $i]"
+ } else {
+ set d [lindex $comp $i]
+ }
if {[catch {cd ./$d}]} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open robots.txt w]
+ set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
}
set d [lindex $comp $len]
if {[string length $d]} {
if {[file isdirectory $d]} {
- set out [open $d/:.html $mode]
+ set out [open $d/f $mode]
} else {
- set out [open $d $mode]
+ set out [open f$d $mode]
}
} else {
- set out [open :.html $mode]
+ set out [open f $mode]
}
cd $orgPwd
- #puts "RobotFileStop"
return $out
}
-proc RobotRestart {} {
+# RobotRR: timer callback that RobotStart schedules (after idleTime ms) when
+# RobotFileNext answers "wait". It gives back the slot RobotStart counted for
+# the wait, stays in the event loop until robotsRunning reaches zero, then
+# resets robotSeq to 0 (which makes RobotFileNext glob the work area again)
+# and calls RobotStart.
+proc RobotRR {} {
+    global robotSeq
+    global robotsRunning
+
+    incr robotsRunning -1
+    while {$robotsRunning != 0} {
+        vwait robotsRunning
+    }
+
+    set robotSeq 0
+    RobotStart
+}
+
+# RobotRestart: finish the transfer of url on socket sock and start new work.
+#   url  - URL whose per-request state is to be discarded
+#   sock - the channel used for the request
+# Closes the socket, cancels the timeout timer stored in URL($sock,cancel),
+# removes that timer entry and every URL element whose key starts with
+# "<url>,", decrements robotsRunning and calls RobotStart.
+proc RobotRestart {url sock} {
+    global URL robotsRunning
+
+    close $sock
+    after cancel $URL($sock,cancel)
+    # Drop the timer id as well, so no stale per-socket entry stays behind.
+    unset URL($sock,cancel)
+
+    # Compare the key prefix literally: a glob pattern built from the URL
+    # would treat ?, *, [ and \ in the URL as wildcards and could miss this
+    # URL's entries or match those of a different URL.
+    set prefix "$url,"
+    foreach v [array names URL] {
+        if {[string first $prefix $v] == 0} {
+            unset URL($v)
+        }
+    }
+
+    incr robotsRunning -1
+    RobotStart
+}
+
+proc RobotStart {} {
global URL
- global robotMoreWork
-
- while {1} {
+ global robotsRunning robotsMax idleTime
+
+ puts "RobotStart"
+ while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
- break
+ return
+ }
+ incr robotsRunning
+ if {[string compare $url wait] == 0} {
+ after $idleTime RobotRR
+ return
}
set r [RobotGetUrl $url {}]
if {!$r} {
- puts "RobotGetUrl returned 0 on url=$url"
- return
+ if {$robotsRunning >= $robotsMax} return
} else {
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- }
+ incr robotsRunning -1
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotFileClose $outf
+ }
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ }
}
- set robotMoreWork 0
}
proc headSave {url out} {
global URL
- puts $out {<zmbot>}
if {[info exists URL($url,head,last-modified)]} {
puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
}
upvar $pathx path
puts "Ref url = $url href=$href"
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+ return 0
+ }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
}
} else {
regexp {^([^\#]*)} $hpath x surl
- set host $URL($url,host)
+ set host $URL($url,hostport)
}
if {![string length $surl]} {
return 0
switch -- [lindex $c $i] {
.. {
incr i -2
+ if {$i < 0} {
+ set i 0
+ }
}
. {
incr i -1
incr i -1
}
}
- }
+ }
+ regsub -all {~} $path {%7E} path
set href "$method://$host$path"
puts "Ref href = $href"
return 1
}
-proc Robot401 {url} {
+proc RobotError {url code} {
global URL
- puts "Bad URL $url"
+ puts "Bad URL $url, $code"
set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
- close $inf
- }
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 401"
- puts $outf "Reference $fromurl"
- RobotFileClose $outf
- }
-}
-
-proc Robot404 {url} {
- global URL
-
- puts "Bad URL $url"
- set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
+ set distance -1
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 404"
- puts $outf "Reference $fromurl"
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
- }
+}
-proc Robot301 {url tourl} {
+proc RobotRedirect {url tourl code} {
global URL
puts "Redirecting from $url to $tourl"
+ set distance {}
set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url to $tourl 301"
- puts $outf "Reference $fromurl"
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
if {[RobotHref $url tourl host path]} {
- if {![RobotFileExist unvisited $host $path]} {
- puts "Mark as unvisited"
- set outf [RobotFileOpen unvisited $host $path]
- puts $outf 301
- RobotFileClose $outf
+ if {![RobotFileExist visited $host $path]} {
+ if {![RobotFileExist unvisited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ } else {
+ set olddistance {}
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[string length $distance] == 0} {
+ set distance 1000
+ }
+ puts "distance=$distance olddistance=$olddistance"
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $tourl $distance
+ RobotFileClose $outf
+ }
}
}
+ if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
+ puts "unlink failed"
+ exit 1
+ }
}
proc RobotTextHtml {url out} {
- global URL
+ global URL maxDistance
- set head 0
+ set distance 0
+ if {$maxDistance < 1000 && [info exists URL($url,dist)]} {
+ set distance [expr $URL($url,dist) + 1]
+ }
htmlSwitch $URL($url,buf) \
title {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts $out "<title>$body</title>"
} -nonest meta {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts -nonewline $out "<meta"
foreach a [array names parm] {
puts -nonewline $out " $a"
puts "no href"
continue
}
- if {!$head} {
- headSave $url $out
- set head 1
- }
- if {1} {
- set href $parm(href)
+ if {[expr $distance <= $maxDistance]} {
+ set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
puts $out "<cr>"
puts $out "<identifier>$href</identifier>"
puts $out "<description>$body</description>"
puts $out "</cr>"
-
+
if {![RobotFileExist visited $host $path]} {
+ set olddistance 1000
if {![RobotFileExist bad $host $path]} {
- if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
- puts "--- Error $msg"
- exit 1
- }
- puts $outf $url
+ if {[RobotFileExist unvisited $host $path]} {
+ set inf [RobotFileOpen unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+ close $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ puts "OK remarking url=$url href=$href"
+ puts "olddistance = $olddistance"
+ puts "newdistance = $distance"
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
RobotFileClose $outf
}
}
}
}
- if {!$head} {
- headSave $url $out
- set head 1
+}
+
+# RobotsTxt: parse the robots.txt body just fetched for url (URL($url,buf))
+# and store the resulting rules in URL(<hostport>,robots), the element that
+# RobotGetUrl consults before fetching other paths on the same host.
+proc RobotsTxt {url} {
+    global agent URL
+
+    # Resolve the host:port first; without the substitution the rules would
+    # land under a key containing the literal text "URL(<url>,hostport)".
+    set hostport $URL($url,hostport)
+    RobotsTxt0 URL($hostport,robots) $URL($url,buf)
+}
+
+# RobotsTxt0: parse robots.txt text and append access rules to a variable.
+#   v   - name of the rule list to extend, e.g. URL(<hostport>,robots); it is
+#         resolved inside this proc, where URL is linked to the global array
+#   buf - robots.txt content
+# Every rule appended is a two-element list {flag prefix}: flag 0 for a
+# Disallow line, 1 for an Allow line. Rules are collected only from the first
+# User-Agent section whose pattern matches the global agent string; parsing
+# stops at the next User-Agent line after that section.
+proc RobotsTxt0 {v buf} {
+    global URL agent
+    set section 0
+    foreach l [split $buf \n] {
+        if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+            # Strip trailing blanks and the CR of CRLF line ends; otherwise
+            # the stored prefix can never match the start of a path.
+            set arg [string trim $arg]
+            puts "cmd=$cmd arg=$arg"
+            # Directive names are matched case-insensitively, so the common
+            # "User-agent" spelling is recognised too.
+            switch -- [string tolower $cmd] {
+                user-agent {
+                    if {$section} break
+                    set pat [string tolower $arg]*
+                    # Lower-case both sides; an empty name selects nothing.
+                    set section [expr {[string length $arg] > 0 && [string match $pat [string tolower $agent]]}]
+                }
+                disallow {
+                    # A rule with an empty prefix could never match a path.
+                    if {$section && [string length $arg]} {
+                        puts "rule [list 0 $arg]"
+                        lappend $v [list 0 $arg]
+                    }
+                }
+                allow {
+                    if {$section && [string length $arg]} {
+                        puts "rule [list 1 $arg]"
+                        lappend $v [list 1 $arg]
+                    }
+                }
+            }
+        }
     }
-    puts $out "</zmbot>"
 }
proc RobotTextPlain {url out} {
global URL
- headSave $url $out
puts $out "<documentcontent>"
puts $out $URL($url,buf)
puts $out "</documentcontent>"
- puts $out "</meta>"
+
+ if {![string compare $URL($url,path) /robots.txt]} {
+ RobotsTxt $url
+ }
}
proc Robot200 {url} {
global URL domains
- puts "Parsing $url"
- set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+ set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
+ puts $out "<zmbot>"
+
+ set distance 1000
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ set URL($url,dist) $distance
+ puts $out "<distance>"
+ puts $out " $distance"
+ puts $out "</distance>"
+ headSave $url $out
+ puts "Parsing $url distance=$distance"
switch $URL($url,head,content-type) {
text/html {
- RobotTextHtml $url $out
+ if {[string length $distance]} {
+ RobotTextHtml $url $out
+ }
}
text/plain {
RobotTextPlain $url $out
}
- default {
- headSave $url $out
- puts $out "</zmbot>"
+ application/pdf {
+ set pdff [open test.pdf w]
+ puts -nonewline $pdff $URL($url,buf)
+ close $pdff
}
}
+ puts $out "</zmbot>"
RobotFileClose $out
# puts "Parsing done"
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
}
-proc RobotReadContent {url sock} {
+proc RobotReadContent {url sock binary} {
global URL
+ puts "RobotReadContent $url"
set buffer [read $sock 16384]
set readCount [string length $buffer]
-
+
if {$readCount <= 0} {
- close $sock
Robot200 $url
- RobotRestart
+ RobotRestart $url $sock
+ } elseif {!$binary && [string first \0 $buffer] >= 0} {
+ Robot200 $url
+ RobotRestart $url $sock
} else {
# puts "Got $readCount bytes"
set URL($url,buf) $URL($url,buf)$buffer
+# RobotReadHeader: readable-event handler (installed by RobotConnect) that
+# collects the HTTP response header for url from socket sock.
+# Each call reads up to 2148 bytes and appends them to URL($url,buf). Once the
+# blank line ending the header (CR LF CR LF) is seen, the header is cut off
+# the buffer, the status code is parsed and every "Name: value" line is stored
+# in URL($url,head,<lower-cased name>). Dispatch on the status code:
+#   301, 302 - RobotRedirect to the Location header, then RobotRestart
+#   200      - switch the handler to RobotReadContent (binary for PDF)
+#   other    - RobotError with that code, then RobotRestart
+# A failed or empty read is recorded with RobotError (code 404) and restarts.
 proc RobotReadHeader {url sock} {
     global URL
-    set buffer [read $sock 2148]
+    puts "RobotReadHeader $url"
+    if {[catch {set buffer [read $sock 2148]}]} {
+        RobotError $url 404
+        RobotRestart $url $sock
+        # The socket is closed and buffer was never set: stop here instead
+        # of falling through to code that reads $buffer.
+        return
+    }
     set readCount [string length $buffer]
     if {$readCount <= 0} {
-        Robot404 $url
-        close $sock
-        RobotRestart
+        RobotError $url 404
+        RobotRestart $url $sock
     } else {
         # puts "Got $readCount bytes"
         set URL($url,buf) $URL($url,buf)$buffer
-        set n [string first \n\n $URL($url,buf)]
+        set n [string first \r\n\r\n $URL($url,buf)]
         if {$n > 1} {
             set code 0
             set version {}
             set headbuf [string range $URL($url,buf) 0 $n]
-            incr n
-            incr n
+            incr n 4
             set URL($url,buf) [string range $URL($url,buf) $n end]
             regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
             set lines [split $headbuf \n]
             foreach line $lines {
                 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
-                    set URL($url,head,[string tolower $name]) $value
+                    set URL($url,head,[string tolower $name]) [string trim $value]
                 }
             }
             puts "code = $code"
             set URL($url,state) skip
             switch $code {
                 301 {
-                    Robot301 $url $URL($url,head,location)
-                    close $sock
-                    RobotRestart
+                    RobotRedirect $url $URL($url,head,location) 301
+                    RobotRestart $url $sock
                 }
                 302 {
-                    Robot301 $url $URL($url,head,location)
-                    close $sock
-                    RobotRestart
-                }
-                404 {
-                    Robot404 $url
-                    close $sock
-                    RobotRestart
-                }
-                401 {
-                    Robot401 $url
-                    close $sock
-                    RobotRestart
+                    RobotRedirect $url $URL($url,head,location) 302
+                    RobotRestart $url $sock
                 }
                 200 {
                     if {![info exists URL($url,head,content-type)]} {
                         set URL($url,head,content-type) {}
                     }
+                    set binary 0
                     switch $URL($url,head,content-type) {
-                        text/html {
-                            fileevent $sock readable [list RobotReadContent $url $sock]
-                        }
-                        text/plain {
-                            fileevent $sock readable [list RobotReadContent $url $sock]
-                        }
-                        default {
-                            close $sock
-                            Robot200 $url
-                            RobotRestart
+                        application/pdf {
+                            set binary 1
                         }
                     }
+                    fileevent $sock readable [list RobotReadContent $url $sock $binary]
                 }
                 default {
-                    Robot404 $url
-                    close $sock
-                    RobotRestart
+                    RobotError $url $code
+                    RobotRestart $url $sock
                 }
             }
         }
     }
 }
+# RobotSockCancel: timeout callback armed by RobotConnect (30000 ms after the
+# request is sent). Logs the socket and URL, records the URL as failed via
+# RobotError with code 401 and hands the socket to RobotRestart.
+proc RobotSockCancel {url sock} {
+    puts [format "RobotSockCancel sock=%s url=%s" $sock $url]
+    RobotError $url 401
+    RobotRestart $url $sock
+}
+
+# RobotConnect: send the HTTP/1.0 GET request for url on socket sock.
+# Puts the socket in non-blocking mode (input untranslated, output CRLF),
+# installs RobotReadHeader as the readable handler, writes the request and
+# arms a 30000 ms timer that calls RobotSockCancel; the timer id is kept in
+# URL($sock,cancel) so RobotRestart can cancel it.
 proc RobotConnect {url sock} {
     global URL agent
-    fconfigure $sock -translation {auto crlf} -blocking 0
-    puts "Reading $url"
+    fconfigure $sock -translation {lf crlf} -blocking 0
     fileevent $sock readable [list RobotReadHeader $url $sock]
     puts $sock "GET $URL($url,path) HTTP/1.0"
-    puts $sock "Host: $URL($url,host)"
+    # Send host:port as written in the URL so that servers on a non-default
+    # port see the port in the Host header; equal to the host when no port
+    # was given.
+    puts $sock "Host: $URL($url,hostport)"
     puts $sock "User-Agent: $agent"
     puts $sock ""
     flush $sock
+    set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
 }
proc RobotNop {} {
}
proc RobotGetUrl {url phost} {
- global URL
- puts "---------"
- puts $url
- if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
+ global URL robotsRunning
+ flush stdout
+ puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url"
+ if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
return -1
}
if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
}
set URL($url,method) $method
set URL($url,host) $host
- set URL($url,port) $port
+ set URL($url,hostport) $hostport
set URL($url,path) $path
set URL($url,state) head
set URL($url,buf) {}
+
+ if {[string compare $path /robots.txt]} {
+ set ok 1
+ if {![info exists URL($hostport,robots)]} {
+ puts "READING robots.txt for host $hostport"
+ if {[RobotFileExist visited $hostport /robots.txt]} {
+ set inf [RobotFileOpen visited $hostport /robots.txt r]
+ set buf [read $inf 32768]
+ close $inf
+ } else {
+ set buf "User-Agent: *\nAllow: /\n"
+ }
+ RobotsTxt0 URL($hostport,robots) $buf
+ }
+ if {[info exists URL($hostport,robots)]} {
+ foreach l $URL($hostport,robots) {
+ if {[string first [lindex $l 1] $path] == 0} {
+ set ok [lindex $l 0]
+ break
+ }
+ }
+ }
+ if {!$ok} {
+ return -1
+ }
+ }
if [catch {set sock [socket -async $host $port]}] {
return -1
}
}
}
-
set agent "zmbot/0.0"
if {![catch {set os [exec uname -s -r]}]} {
set agent "$agent ($os)"
- puts "agent: $agent"
}
+puts "agent: $agent"
+
proc bgerror {m} {
+ global errorInfo
puts "BGERROR $m"
+ puts $errorInfo
}
-set robotMoreWork 0
+set robotsRunning 0
set robotSeq 0
set workdir [pwd]
+set idleTime 60000
-if {[llength $argv] < 2} {
- puts "Tclrobot: usage <domain> <start>"
- puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
+set i 0
+set l [llength $argv]
+
+if {$l < 2} {
+ puts {tclrobot: usage [-j jobs] [-c count] [-d domain] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
exit 1
}
-set domains [lindex $argv 0]
-set site [lindex $argv 1]
-if {[string length $site]} {
- set robotMoreWork 1
- if [RobotGetUrl $site {}] {
- set robotMoreWork 0
- puts "Couldn't process $site"
+# Command-line parsing. Options take their value either attached (-j5) or as
+# the following argument (-j 5):
+#   -j jobs    sets robotsMax
+#   -c count   sets maxDistance
+#   -d domain  appended to domains (may be repeated)
+# Any other argument is a start URL: it is normalised with RobotHref and,
+# unless already present in the visited area, queued in the unvisited area
+# with distance 0.
+while {$i < $l} {
+    set arg [lindex $argv $i]
+    switch -glob -- $arg {
+        -j* {
+            set robotsMax [string range $arg 2 end]
+            if {![string length $robotsMax]} {
+                set robotsMax [lindex $argv [incr i]]
+            }
+        }
+        -c* {
+            set maxDistance [string range $arg 2 end]
+            if {![string length $maxDistance]} {
+                set maxDistance [lindex $argv [incr i]]
+            }
+        }
+        -d* {
+            set dom [string range $arg 2 end]
+            if {![string length $dom]} {
+                set dom [lindex $argv [incr i]]
+            }
+            lappend domains $dom
+        }
+        default {
+            set href $arg
+            if {[RobotHref http://www.indexdata.dk/ href host path]} {
+                if {![RobotFileExist visited $host $path]} {
+                    set outf [RobotFileOpen unvisited $host $path]
+                    # Store the URL itself as the referrer, not the literal
+                    # word "href".
+                    RobotWriteRecord $outf $href 0
+                    RobotFileClose $outf
+                }
+            }
+        }
     }
+    incr i
+}
+
+if {![info exist domains]} {
+ set domains {*}
}
+if {![info exist maxDistance]} {
+ set maxDistance 3
+}
+if {![info exist robotsMax]} {
+ set robotsMax 5
+}
+
+puts "domains=$domains"
+puts "max distance=$maxDistance"
+puts "max jobs=$robotsMax"
+
+RobotStart
-while {$robotMoreWork} {
- vwait robotMoreWork
+while {$robotsRunning} {
+ vwait robotsRunning
}