X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=5bd9f82b3231c2dae6861bbef2438c82198545d5;hb=bb21c575e5bf08f8daf9d33218f6358e63d36c62;hp=ddbfb82f3193e87d70eb249a91866b728d9cfaee;hpb=4adc245769f17af7801647a0314dda7dfefe1dba;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index ddbfb82..5bd9f82 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.10 2001/01/23 09:20:32 adam Exp $
+# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
#
proc RobotFileNext1 {area lead} {
puts "RobotFileNext1 area=$area lead=$lead"
@@ -260,6 +260,16 @@ proc RobotHref {url hrefx hostx pathx} {
upvar $pathx path
puts "Ref url = $url href=$href"
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+ if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+ return 0
+ }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
@@ -274,16 +284,18 @@ proc RobotHref {url hrefx hostx pathx} {
if {![string length $surl]} {
set surl /
}
- set ok 0
- foreach domain $domains {
- if {[string match $domain $host]} {
- set ok 1
- break
+ if {[info exist domains]} {
+ set ok 0
+ foreach domain $domains {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
}
- }
- if {!$ok} {
- return 0
- }
+ if {!$ok} {
+ return 0
+ }
+ }
} else {
regexp {^([^\#]*)} $hpath x surl
set host $URL($url,hostport)
@@ -422,7 +434,7 @@ proc RobotTextHtml {url out} {
puts $out ""
puts $out $nbody
puts $out ""
- } a {
+ } -nonest a {
if {![info exists parm(href)]} {
puts "no href"
continue
@@ -608,7 +620,7 @@ proc RobotReadHeader {url sock} {
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
set lines [split $headbuf \n]
foreach line $lines {
- if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+ if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
set URL($url,head,[string tolower $name]) [string trim $value]
}
}
@@ -729,9 +741,10 @@ if {![llength [info commands htmlSwitch]]} {
set agent "zmbot/0.0"
if {![catch {set os [exec uname -s -r]}]} {
set agent "$agent ($os)"
- puts "agent: $agent"
}
+puts "agent: $agent"
+
proc bgerror {m} {
global errorInfo
puts "BGERROR $m"
@@ -739,28 +752,74 @@ proc bgerror {m} {
}
set robotsRunning 0
-set robotsMax 5
set robotSeq 0
set workdir [pwd]
set idleTime 60000
-if {[llength $argv] < 2} {
- puts "Tclrobot: usage "
- puts " Example: 3 '*.indexdata.dk' http://www.indexdata.dk/"
+set i 0
+set l [llength $argv]
+
+if {$l < 1} { ;# every option has a default below, so a lone URL is a valid command line
+ puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
exit 1
}
-set maxDistance [lindex $argv 0]
-set domains [lindex $argv 1]
-foreach href [lindex $argv 2] {
- if {[RobotHref http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist visited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $href 0
- RobotFileClose $outf
+while {$i < $l} {
+ set arg [lindex $argv $i]
+ switch -glob -- $arg {
+ -j* {
+ set robotsMax [string range $arg 2 end]
+ if {![string length $robotsMax]} {
+ set robotsMax [lindex $argv [incr i]]
+ }
+ }
+ -c* {
+ set maxDistance [string range $arg 2 end]
+ if {![string length $maxDistance]} {
+ set maxDistance [lindex $argv [incr i]]
+ }
+ }
+ -d* {
+ set dom [string range $arg 2 end]
+ if {![string length $dom]} {
+ set dom [lindex $argv [incr i]]
+ }
+ lappend domains $dom
+ }
+ -i* {
+ set idleTime [string range $arg 2 end]
+ if {![string length $idleTime]} {
+ set idleTime [lindex $argv [incr i]]
+ }
+ }
+ default {
+ set href $arg
+ if {[RobotHref http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist visited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $href 0 ;# pass the URL value as the replaced code did, not the bare word href
+ RobotFileClose $outf
+ }
+ }
+ }
}
+ incr i
+}
+
+if {![info exist domains]} {
+ set domains {*}
+}
+if {![info exist maxDistance]} {
+ set maxDistance 50
}
+if {![info exist robotsMax]} {
+ set robotsMax 5
+}
+
+puts "domains=$domains"
+puts "max distance=$maxDistance"
+puts "max jobs=$robotsMax"
RobotStart