X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=a90d6e8d91c801e0ed392f4b48797c9542c2be46;hb=1eb62bcb2a33b6e5bf29ec82e2fed329953bbf9a;hp=55e7f2a2b0d066f92df90daa8156db2277420c39;hpb=5c476d6b3055153cfcb6972965b6e450f685ddeb;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 55e7f2a..a90d6e8 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.35 2003/06/10 11:43:52 adam Exp $ +# $Id: robot.tcl,v 1.43 2003/06/11 09:40:22 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} { foreach n $ns { if {[file isfile $n]} { set off [string last / $n] - incr off 2 - return $lead/[string range $n $off end] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] - incr off 2 + # skip / + incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb @@ -50,15 +55,15 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {task area} { - global robotSeq + global control global idletime ns global status - # puts "RobotFileNext robotSeq=$robotSeq($task)" - if {$robotSeq($task) < 0} { + # puts "RobotFileNext seq=$control($task,seq)" + if {$control($task,seq) < 0} { return {} } - if {$robotSeq($task) == 0} { + if {$control($task,seq) == 0} { if {[catch {set ns($task) [glob $task/$area/*]}]} { return done } @@ -66,19 +71,19 @@ proc RobotFileNext {task area} { # puts "ns=$ns($task)" set off [string length $task/$area] incr off - set n [lindex $ns($task) $robotSeq($task)] + set n [lindex $ns($task) $control($task,seq)] # puts "n=$n" if {![string length $n]} { - set robotSeq($task) -1 + set control($task,seq) -1 flush stdout set statusfile [open $task/status w] puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } - incr robotSeq($task) - if {[file isfile $n/frobots.txt]} { - puts "ok returning http://[string range $n $off end]/robots.txt" + incr control($task,seq) + if {[file isfile $n/robots.txt_.tkl]} { + # puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] @@ -98,40 +103,25 @@ proc RobotFileExist {task area host path} { if {$debuglevel > 3} { puts "RobotFileExist begin area=$area host=$host path=$path" } - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t - if {$debuglevel > 3} { - puts "RobotFileExist end npath=$npath" - } - return [file exists $npath] + return [file exists $task/$area/$host${path}_.tkl] } proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host${path}_.tkl # puts "npath=$npath" set comp [split $npath /] - if {[catch {exec rm [join $comp /]}]} return + if {[catch {exec rm $npath}]} return set l [llength $comp] - incr l -1 - incr l -1 + incr l -2 incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return - exec rmdir ./$path + exec rmdir $path } # puts "RobotFileUnlink end" } @@ -148,6 +138,8 @@ proc RobotFileOpen {task area host path {mode w}} { global status global debuglevel + # puts "RobotFileOpen task=$task path=$path" + if {![info exists workdir]} { return stdout } @@ -169,12 +161,14 @@ proc RobotFileOpen {task area host path {mode w}} { for {set i 0} {$i <= $len} {incr i} { set d [lindex $comp $i] - if {[catch {cd $d}]} { + if {[string length $d] == 0} { + cd / + } elseif {[catch {cd $d}]} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { - set out [open frobots.txt w] + set out [open robots.txt_.tkl w] puts "creating robots.txt in $d" close $out incr status($task,unvisited) @@ -190,8 +184,8 @@ proc RobotFileOpen {task area host path {mode w}} { # puts "2 path=$path comp=$comp" for {set i 0} {$i < $len} {incr i} { - set d "d[lindex $comp $i]" - if {[string length $d] > 1} { + set d [lindex $comp $i] + if {[string length $d] > 0} { if {[catch {cd $d}]} { exec mkdir $d cd ./$d @@ -199,11 +193,7 @@ proc RobotFileOpen {task area host path {mode w}} { } } set d [lindex $comp $len] - if {[string length $d]} { - set out [open f$d $mode] - } else { - set out [open f $mode] - } + set out [open ${d}_.tkl $mode] if {$mode == "w"} { incr status($task,$area) } @@ -211,16 +201,16 @@ proc RobotFileOpen {task area host path {mode w}} { return $out } - proc RobotStartJob {fname t} { global control - set f [open $fname r] set xml [read $f] puts "Reading $fname" - regexp {([^<]*)} $xml x status close $f + if {![regexp {([^<]*)} $xml x status]} { + return + } if {$status == "done"} { puts "already done" return @@ -300,7 +290,7 @@ proc RobotScanDir {} { } proc RobotRR {task} { - global robotSeq robotsRunning tasks robotsMax status + global control robotsRunning tasks robotsMax status puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 @@ -322,7 +312,7 @@ proc RobotRR {task} { set statusfile [open $t/status w] puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" close $statusfile - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } } @@ -345,7 +335,7 @@ proc RobotDaemonLoop {} { if {[info exists tasks]} { puts "daemon loop tasks $tasks" foreach t $tasks { - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } while {$robotsRunning} { @@ -1058,6 +1048,8 @@ proc checkrule {task type this} { global control global debuglevel + set default_ret 1 + if {$debuglevel > 3} { puts "CHECKRULE $type $this" } @@ -1071,6 +1063,7 @@ proc checkrule {task type this} { # consider mask (! negates) set masks [lindex $l 2] set ok 0 + set default_ret 0 foreach mask $masks { if {$debuglevel > 4} { puts "consider single mask $mask" @@ -1102,9 +1095,9 @@ proc checkrule {task type this} { } } if {$debuglevel > 3} { - puts "CHECKRULE MATCH OK" + puts "CHECKRULE MATCH DEFAULT $default_ret" } - return 1 + return $default_ret } @@ -1139,7 +1132,7 @@ proc debug {level} { } proc task {t} { - global tasks task status robotSeq control + global tasks task status control set task $t @@ -1155,7 +1148,7 @@ proc task {t} { set status($t,bad) 0 set status($t,raw) 0 set status($t,active) 1 - set robotSeq($t) 0 + set control($t,seq) 0 set control($t,distance) 10 return 1 } @@ -1176,7 +1169,7 @@ set l [llength $argv] if {$l < 2} { puts {tclrobot: usage:} - puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1