X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=a90d6e8d91c801e0ed392f4b48797c9542c2be46;hb=1eb62bcb2a33b6e5bf29ec82e2fed329953bbf9a;hp=55e7f2a2b0d066f92df90daa8156db2277420c39;hpb=5c476d6b3055153cfcb6972965b6e450f685ddeb;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index 55e7f2a..a90d6e8 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.35 2003/06/10 11:43:52 adam Exp $
+# $Id: robot.tcl,v 1.43 2003/06/11 09:40:22 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} {
foreach n $ns {
if {[file isfile $n]} {
set off [string last / $n]
- incr off 2
- return $lead/[string range $n $off end]
+ # skip /
+ incr off
+ set end [string length $n]
+ # skip _.tkl
+ incr end -6
+ return $lead/[string range $n $off $end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
set off [string last / $n]
- incr off 2
+ # skip /
+ incr off
set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
@@ -50,15 +55,15 @@ proc RobotReadRecord {inf fromurlx distancex} {
}
proc RobotFileNext {task area} {
- global robotSeq
+ global control
global idletime ns
global status
- # puts "RobotFileNext robotSeq=$robotSeq($task)"
- if {$robotSeq($task) < 0} {
+ # puts "RobotFileNext seq=$control($task,seq)"
+ if {$control($task,seq) < 0} {
return {}
}
- if {$robotSeq($task) == 0} {
+ if {$control($task,seq) == 0} {
if {[catch {set ns($task) [glob $task/$area/*]}]} {
return done
}
@@ -66,19 +71,19 @@ proc RobotFileNext {task area} {
# puts "ns=$ns($task)"
set off [string length $task/$area]
incr off
- set n [lindex $ns($task) $robotSeq($task)]
+ set n [lindex $ns($task) $control($task,seq)]
# puts "n=$n"
if {![string length $n]} {
- set robotSeq($task) -1
+ set control($task,seq) -1
flush stdout
set statusfile [open $task/status w]
puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
close $statusfile
return wait
}
- incr robotSeq($task)
- if {[file isfile $n/frobots.txt]} {
- puts "ok returning http://[string range $n $off end]/robots.txt"
+ incr control($task,seq)
+ if {[file isfile $n/robots.txt_.tkl]} {
+ # puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
set sb [RobotFileNext1 $n http://[string range $n $off end]]
@@ -98,40 +103,25 @@ proc RobotFileExist {task area host path} {
if {$debuglevel > 3} {
puts "RobotFileExist begin area=$area host=$host path=$path"
}
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
- if {$debuglevel > 3} {
- puts "RobotFileExist end npath=$npath"
- }
- return [file exists $npath]
+ return [file exists $task/$area/$host${path}_.tkl]
}
proc RobotFileUnlink {task area host path} {
global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ set npath $task/$area/$host${path}_.tkl
# puts "npath=$npath"
set comp [split $npath /]
- if {[catch {exec rm [join $comp /]}]} return
+ if {[catch {exec rm $npath}]} return
set l [llength $comp]
- incr l -1
- incr l -1
+ incr l -2
incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
- exec rmdir ./$path
+ exec rmdir $path
}
# puts "RobotFileUnlink end"
}
@@ -148,6 +138,8 @@ proc RobotFileOpen {task area host path {mode w}} {
global status
global debuglevel
+ # puts "RobotFileOpen task=$task path=$path"
+
if {![info exists workdir]} {
return stdout
}
@@ -169,12 +161,14 @@ proc RobotFileOpen {task area host path {mode w}} {
for {set i 0} {$i <= $len} {incr i} {
set d [lindex $comp $i]
- if {[catch {cd $d}]} {
+ if {[string length $d] == 0} {
+ cd /
+ } elseif {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
if {[string compare $path /robots.txt]} {
- set out [open frobots.txt w]
+ set out [open robots.txt_.tkl w]
puts "creating robots.txt in $d"
close $out
incr status($task,unvisited)
@@ -190,8 +184,8 @@ proc RobotFileOpen {task area host path {mode w}} {
# puts "2 path=$path comp=$comp"
for {set i 0} {$i < $len} {incr i} {
- set d "d[lindex $comp $i]"
- if {[string length $d] > 1} {
+ set d [lindex $comp $i]
+ if {[string length $d] > 0} {
if {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
@@ -199,11 +193,7 @@ proc RobotFileOpen {task area host path {mode w}} {
}
}
set d [lindex $comp $len]
- if {[string length $d]} {
- set out [open f$d $mode]
- } else {
- set out [open f $mode]
- }
+ set out [open ${d}_.tkl $mode]
if {$mode == "w"} {
incr status($task,$area)
}
@@ -211,16 +201,16 @@ proc RobotFileOpen {task area host path {mode w}} {
return $out
}
-
proc RobotStartJob {fname t} {
global control
-
set f [open $fname r]
set xml [read $f]
puts "Reading $fname"
- regexp {([^<]*)} $xml x status
close $f
+ if {![regexp {([^<]*)} $xml x status]} {
+ return
+ }
if {$status == "done"} {
puts "already done"
return
@@ -300,7 +290,7 @@ proc RobotScanDir {} {
}
proc RobotRR {task} {
- global robotSeq robotsRunning tasks robotsMax status
+ global control robotsRunning tasks robotsMax status
puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
incr robotsRunning -1
@@ -322,7 +312,7 @@ proc RobotRR {task} {
set statusfile [open $t/status w]
puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
close $statusfile
- set robotSeq($t) 0
+ set control($t,seq) 0
RobotStart $t
}
}
@@ -345,7 +335,7 @@ proc RobotDaemonLoop {} {
if {[info exists tasks]} {
puts "daemon loop tasks $tasks"
foreach t $tasks {
- set robotSeq($t) 0
+ set control($t,seq) 0
RobotStart $t
}
while {$robotsRunning} {
@@ -1058,6 +1048,8 @@ proc checkrule {task type this} {
global control
global debuglevel
+ set default_ret 1
+
if {$debuglevel > 3} {
puts "CHECKRULE $type $this"
}
@@ -1071,6 +1063,7 @@ proc checkrule {task type this} {
# consider mask (! negates)
set masks [lindex $l 2]
set ok 0
+ set default_ret 0
foreach mask $masks {
if {$debuglevel > 4} {
puts "consider single mask $mask"
@@ -1102,9 +1095,9 @@ proc checkrule {task type this} {
}
}
if {$debuglevel > 3} {
- puts "CHECKRULE MATCH OK"
+ puts "CHECKRULE MATCH DEFAULT $default_ret"
}
- return 1
+ return $default_ret
}
@@ -1139,7 +1132,7 @@ proc debug {level} {
}
proc task {t} {
- global tasks task status robotSeq control
+ global tasks task status control
set task $t
@@ -1155,7 +1148,7 @@ proc task {t} {
set status($t,bad) 0
set status($t,raw) 0
set status($t,active) 1
- set robotSeq($t) 0
+ set control($t,seq) 0
set control($t,distance) 10
return 1
}
@@ -1176,7 +1169,7 @@ set l [llength $argv]
if {$l < 2} {
puts {tclrobot: usage:}
- puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
+ puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
exit 1