+# Start-up defaults. idletime and acceptLanguage may be replaced by the
+# -i and -l command-line options; debuglevel is changed through "debug".
+set idletime 60000
+set acceptLanguage ""
+set debuglevel 0
+# Every entry of the status array starts out at 0.
+array set status {unvisited 0 visited 0 bad 0 raw 0}
+
+
+# Rules: allow, deny, url
+
+# checkrule type this
+#
+# Decide whether the string $this passes the rules registered for $type.
+#
+# The global list alrules holds entries of the form {action type masks},
+# as appended by the "allow" and "deny" procs. Entries are examined in
+# order and the first one that applies decides the outcome:
+#   - entries whose type differs from $type are skipped;
+#   - an entry applies when at least one of its masks accepts $this.
+#     A plain mask accepts when it glob-matches $this; a mask starting
+#     with "!" accepts when the remainder does NOT glob-match $this.
+#
+# Returns 1 when the deciding entry is "allow", 0 for any other action.
+# Returns 1 as well when alrules is unset or no entry applies.
+# Trace output is written with puts when debuglevel exceeds 3 (or 4 for
+# the per-mask detail).
+proc checkrule {type this} {
+    global alrules
+    global debuglevel
+
+    if {$debuglevel > 3} {
+        puts "CHECKRULE $type $this"
+    }
+    if {[info exists alrules]} {
+        foreach l $alrules {
+            if {$debuglevel > 3} {
+                puts "consider $l"
+            }
+            # consider type
+            if {[lindex $l 1] != $type} continue
+            # consider mask (! negates)
+            set masks [lindex $l 2]
+            set ok 0
+            foreach mask $masks {
+                if {$debuglevel > 4} {
+                    puts "consider single mask $mask"
+                }
+                if {[string index $mask 0] == "!"} {
+                    set mask [string range $mask 1 end]
+                    if {[string match $mask $this]} continue
+                } else {
+                    if {![string match $mask $this]} continue
+                }
+                # This mask accepts; keep looping so every mask is traced.
+                set ok 1
+            }
+            if {$debuglevel > 4} {
+                puts "ok = $ok"
+            }
+            if {!$ok} continue
+            # OK, we have a match
+            if {[lindex $l 0] == "allow"} {
+                if {$debuglevel > 3} {
+                    puts "CHECKRULE MATCH OK"
+                }
+                return 1
+            } else {
+                if {$debuglevel > 3} {
+                    puts "CHECKRULE MATCH FAIL"
+                }
+                return 0
+            }
+        }
+    }
+    # No rule list, or no rule applied: allow by default.
+    if {$debuglevel > 3} {
+        puts "CHECKRULE MATCH OK"
+    }
+    return 1
+}
+
+
+# url href
+#
+# Rule-file command. Passes href through RobotHref (with the fixed base
+# http://www.indexdata.dk/); when that succeeds and no "visited" file
+# exists for the resulting host/path, writes a record for it into the
+# "unvisited" area.
+proc url {href} {
+    global debuglevel
+
+    # RobotHref receives the variable names href, host and path.
+    if {![RobotHref http://www.indexdata.dk/ href host path]} {
+        return
+    }
+    if {[RobotFileExist visited $host $path]} {
+        return
+    }
+    set fh [RobotFileOpen unvisited $host $path]
+    RobotWriteRecord $fh href 0
+    RobotFileClose $fh
+}
+
+# deny type stuff
+#
+# Rule-file command: append a {deny type stuff} entry to the global
+# rule list alrules, where checkrule will find it.
+proc deny {type stuff} {
+    upvar #0 alrules rules
+
+    lappend rules [list deny $type $stuff]
+}
+
+# allow type stuff
+#
+# Rule-file command: append an {allow type stuff} entry to the global
+# rule list alrules, where checkrule will find it.
+proc allow {type stuff} {
+    upvar #0 alrules rules
+
+    lappend rules [list allow $type $stuff]
+}
+
+# debug level
+#
+# Rule-file command: store level in the global debuglevel and return it.
+proc debug {level} {
+    upvar #0 debuglevel current
+
+    set current $level
+}
+
+# Parse options
+
+# i indexes into argv, l is the number of arguments.
+set i 0
+set l [llength $argv]
+
+# With fewer than two arguments, print the usage text and exit with
+# status 1.
+# NOTE(review): a single bare URL (one argument) is also rejected here
+# even though every option is shown as optional - confirm this is intended.
+if {$l < 2} {
+    puts {tclrobot: usage:}
+    puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-l language] [-r rules] [url ..]}
+    puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+    exit 1
+}
+# Walk argv. Every option accepts its value either glued to the flag
+# ("-c3") or as the following argument ("-c 3"). Anything that matches
+# no option pattern is handled as a start URL.
+while {$i < $l} {
+    set arg [lindex $argv $i]
+    switch -glob -- $arg {
+        -j* {
+            # -j sets robotsMax
+            set robotsMax [string range $arg 2 end]
+            if {[string length $robotsMax] == 0} {
+                incr i
+                set robotsMax [lindex $argv $i]
+            }
+        }
+        -c* {
+            # -c sets maxdistance
+            set maxdistance [string range $arg 2 end]
+            if {[string length $maxdistance] == 0} {
+                incr i
+                set maxdistance [lindex $argv $i]
+            }
+        }
+        -d* {
+            # -d may be repeated; each value is appended to domains
+            set dom [string range $arg 2 end]
+            if {[string length $dom] == 0} {
+                incr i
+                set dom [lindex $argv $i]
+            }
+            lappend domains $dom
+        }
+        -i* {
+            # -i sets idletime
+            set idletime [string range $arg 2 end]
+            if {[string length $idletime] == 0} {
+                incr i
+                set idletime [lindex $argv $i]
+            }
+        }
+        -l* {
+            # -l sets acceptLanguage
+            set acceptLanguage [string range $arg 2 end]
+            if {[string length $acceptLanguage] == 0} {
+                incr i
+                set acceptLanguage [lindex $argv $i]
+            }
+        }
+        -r* {
+            # -r names a Tcl file that is sourced immediately
+            set rfile [string range $arg 2 end]
+            if {[string length $rfile] == 0} {
+                incr i
+                set rfile [lindex $argv $i]
+            }
+            source $rfile
+        }
+        default {
+            # Start URL: record it as unvisited unless a visited file exists
+            set href $arg
+            if {[RobotHref http://www.indexdata.dk/ href host path] && ![RobotFileExist visited $host $path]} {
+                set outf [RobotFileOpen unvisited $host $path]
+                RobotWriteRecord $outf href 0
+                RobotFileClose $outf
+            }
+        }
+    }
+    incr i
+}
+
+# Supply defaults for whatever the command line left unset.
+if {![info exists domains]}     { set domains {*} }
+if {![info exists maxdistance]} { set maxdistance 50 }
+if {![info exists robotsMax]}   { set robotsMax 5 }
+
+# Report the effective settings, then call RobotStart.
+puts "domains=${domains}"
+puts "max distance=${maxdistance}"
+puts "max jobs=${robotsMax}"
+
+RobotStart
+