+if {![llength [info commands htmlSwitch]]} {
+ set e [info sharedlibextension]
+ if {[catch {load ./tclrobot$e}]} {
+ load tclrobot$e
+ }
+}
+
+set agent "zmbot/0.2"
+if {![catch {set os [exec uname -s -r]}]} {
+ set agent "$agent ($os)"
+}
+
+puts "agent: $agent"
+
+proc bgerror {m} {
+ global errorInfo
+ puts "BGERROR $m"
+ puts $errorInfo
+}
+
+set robotsRunning 0
+set workdir [pwd]
+set idletime 30000
+set acceptLanguage {}
+set debuglevel 0
+
+# Rules: allow, deny, url
+
+proc checkrule {task type this} {
+ global control
+ global debuglevel
+
+ set default_ret 1
+
+ if {$debuglevel > 3} {
+ puts "CHECKRULE $type $this"
+ }
+ if {[info exist control($task,alrules)]} {
+ foreach l $control($task,alrules) {
+ if {$debuglevel > 3} {
+ puts "consider $l"
+ }
+ # consider type
+ if {[lindex $l 1] != $type} continue
+ # consider mask (! negates)
+ set masks [lindex $l 2]
+ set ok 0
+ set default_ret 0
+ foreach mask $masks {
+ if {$debuglevel > 4} {
+ puts "consider single mask $mask"
+ }
+ if {[string index $mask 0] == "!"} {
+ set mask [string range $mask 1 end]
+ if {[string match $mask $this]} continue
+ } else {
+ if {![string match $mask $this]} continue
+ }
+ set ok 1
+ }
+ if {$debuglevel > 4} {
+ puts "ok = $ok"
+ }
+ if {!$ok} continue
+ # OK, we have a match
+ if {[lindex $l 0] == "allow"} {
+ if {$debuglevel > 3} {
+ puts "CHECKRULE MATCH OK"
+ }
+ return 1
+ } else {
+ if {$debuglevel > 3} {
+ puts "CHECKFULE MATCH FAIL"
+ }
+ return 0
+ }
+ }
+ }
+ if {$debuglevel > 3} {
+ puts "CHECKRULE MATCH DEFAULT $default_ret"
+ }
+ return $default_ret
+}
+
+
+proc url {href} {
+ global debuglevel task
+
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
+}
+
+proc deny {type stuff} {
+ global control task
+
+ lappend control($task,alrules) [list deny $type $stuff]
+}
+
+proc allow {type stuff} {
+ global control task
+
+ lappend control($task,alrules) [list allow $type $stuff]
+}
+
+proc debug {level} {
+ global debuglevel
+
+ set debuglevel $level
+}
+
+proc task {t} {
+ global tasks task status control
+
+ set task $t
+
+ if {[info exists tasks]} {
+ if {[lsearch -exact $tasks $t] >= 0} {
+ return 0
+ }
+ }
+
+ lappend tasks $t
+ set status($t,unvisited) 0
+ set status($t,visited) 0
+ set status($t,bad) 0
+ set status($t,raw) 0
+ set status($t,active) 1
+ set control($t,seq) 0
+ set control($t,distance) 10
+ return 1
+}
+
+# Little utility that ensures that at least one task is present (main).
+proc chktask {} {
+ global tasks
+ if {![info exist tasks]} {
+ task main
+ }
+}
+
+
+# Parse options
+
+set i 0
+set l [llength $argv]
+
+if {$l < 2} {
+ puts {tclrobot: usage:}
+ puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+ exit 1
+}
+
+
+while {$i < $l} {
+ set arg [lindex $argv $i]
+ switch -glob -- $arg {
+ -t* {
+ set t [string range $arg 2 end]
+ if {![string length $t]} {
+ set t [lindex $argv [incr i]]
+ }
+ task $t
+ }
+ -D* {
+ set dir [string range $arg 2 end]
+ if {![string length $dir]} {
+ set dir [lindex $argv [incr i]]
+ }
+ lappend daemon_dir $dir
+ }
+ -j* {
+ set robotsMax [string range $arg 2 end]
+ if {![string length $robotsMax]} {
+ set robotsMax [lindex $argv [incr i]]
+ }
+ }
+ -c* {
+ chktask
+ set control($task,distance) [string range $arg 2 end]
+ if {![string length $control($task,distance)]} {
+ set control($task,distance) [lindex $argv [incr i]]
+ }
+ }
+ -d* {
+ chktask
+ set dom [string range $arg 2 end]
+ if {![string length $dom]} {
+ set dom [lindex $argv [incr i]]
+ }
+ lappend control($task,domains) $dom
+ }
+ -i* {
+ set idletime [string range $arg 2 end]
+ if {![string length $idletime]} {
+ set idletime [lindex $argv [incr i]]
+ }
+ }
+ -l* {
+ chktask
+ set acceptLanguage [string range $arg 2 end]
+ if {![string length $acceptLanguage]} {
+ set acceptLanguage [lindex $argv [incr i]]
+ }
+ }
+ -r* {
+ chktask
+ set rfile [string range $arg 2 end]
+ if {![string length $rfile]} {
+ set rfile [lindex $argv [incr i]]
+ }
+ catch {unset maxdistance}
+ source $rfile
+ if {[info exists maxdistance]} {
+ set control($task,distance) $maxdistance
+ }
+ }
+ default {
+ chktask
+ set href $arg
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
+ }
+ }
+ incr i
+}
+
+if {![info exist robotsMax]} {
+ set robotsMax 5
+}
+
+if {[info exist daemon_dir]} {
+ RobotDaemonLoop
+} else {
+ foreach t $tasks {
+ puts "task $t"
+ puts "max distance=$control($t,distance)"
+ if {[info exists control($t,domains)]} {
+ puts "domains=$control($t,domains)"
+ }
+ }
+ puts "max jobs=$robotsMax"
+
+ foreach t $tasks {
+ RobotStart $t
+ }
+
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+
+ if {[info exists tasks]} {
+ foreach t $tasks {
+ set statusfile [open $t/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ }
+ }
+}