X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=73d558aad3893ad0b0b823f929247a5cb5d02dac;hb=4355628830cd0f9e27c059d20254d8e1c30896eb;hp=968e64e14bd24eac406e5a345aad3fc20182dbb0;hpb=5a6e4380b9b39674fb63839a5aeaa23cb68537cb;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 968e64e..73d558a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.36 2003/06/10 11:55:18 adam Exp $ +# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} { foreach n $ns { if {[file isfile $n]} { set off [string last / $n] - incr off 2 - return $lead/[string range $n $off end] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] - incr off 2 + # skip / + incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb @@ -27,6 +32,7 @@ proc RobotFileNext1 {area lead} { } proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} puts $outf "" puts $outf "" puts $outf $distance @@ -42,6 +48,7 @@ proc RobotReadRecord {inf fromurlx distancex} { upvar $distancex distance gets $inf gets $inf + gets $inf set distance [string trim [gets $inf]] # puts "got distance = $distance" gets $inf @@ -50,15 +57,15 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {task area} { - global robotSeq + global control global idletime ns global status - # puts "RobotFileNext robotSeq=$robotSeq($task)" - if {$robotSeq($task) < 0} { + # puts "RobotFileNext seq=$control($task,seq)" + if {$control($task,seq) < 0} { return {} } - if {$robotSeq($task) == 0} { + if {$control($task,seq) == 0} { if {[catch {set ns($task) [glob $task/$area/*]}]} { return done } @@ -66,19 +73,19 @@ proc RobotFileNext {task area} { # puts "ns=$ns($task)" set off [string length $task/$area] incr off - set n [lindex $ns($task) $robotSeq($task)] + set n [lindex $ns($task) $control($task,seq)] # puts "n=$n" if {![string length $n]} { - set robotSeq($task) -1 + set control($task,seq) -1 flush stdout set statusfile [open $task/status w] puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } - incr robotSeq($task) - if {[file isfile $n/frobots.txt]} { - puts "ok returning http://[string range $n $off end]/robots.txt" + incr control($task,seq) + if {[file isfile $n/robots.txt_.tkl]} { + # puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] @@ -98,40 +105,25 @@ proc RobotFileExist {task area host path} { if {$debuglevel > 3} { puts "RobotFileExist begin area=$area host=$host path=$path" } - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t - if {$debuglevel > 3} { - puts "RobotFileExist end npath=$npath" - } - return [file exists $npath] + return [file exists $task/$area/$host${path}_.tkl] } proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host${path}_.tkl # puts "npath=$npath" set comp [split $npath /] - if {[catch {exec rm [join $comp /]}]} return + if {[catch {exec rm $npath}]} return set l [llength $comp] - incr l -1 - incr l -1 + incr l -2 incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return - exec rmdir ./$path + exec rmdir $path } # puts "RobotFileUnlink end" } @@ -148,6 +140,8 @@ proc RobotFileOpen {task area host path {mode w}} { global status global debuglevel + # puts "RobotFileOpen task=$task path=$path" + if {![info exists workdir]} { return stdout } @@ -169,12 +163,14 @@ proc RobotFileOpen {task area host path {mode w}} { for {set i 0} {$i <= $len} {incr i} { set d [lindex $comp $i] - if {[catch {cd $d}]} { + if {[string length $d] == 0} { + cd / + } elseif {[catch {cd $d}]} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { - set out [open frobots.txt w] + set out [open robots.txt_.tkl w] puts "creating robots.txt in $d" close $out incr status($task,unvisited) @@ -190,8 +186,8 @@ proc RobotFileOpen {task area host path {mode w}} { # puts "2 path=$path comp=$comp" for {set i 0} {$i < $len} {incr i} { - set d "d[lindex $comp $i]" - if {[string length $d] > 1} { + set d [lindex $comp $i] + if {[string length $d] > 0} { if {[catch {cd $d}]} { exec mkdir $d cd ./$d @@ -199,11 +195,7 @@ proc RobotFileOpen {task area host path {mode w}} { } } set d [lindex $comp $len] - if {[string length $d]} { - set out [open f$d $mode] - } else { - set out [open f $mode] - } + set out [open ${d}_.tkl $mode] if {$mode == "w"} { incr status($task,$area) } @@ -211,16 +203,16 @@ proc RobotFileOpen {task area host path {mode w}} { return $out } - proc RobotStartJob {fname t} { global control - set f [open $fname r] set xml [read $f] puts "Reading $fname" - regexp {([^<]*)} $xml x status close $f + if {![regexp {([^<]*)} $xml x status]} { + return + } if {$status == "done"} { puts "already done" return @@ -300,7 +292,7 @@ proc RobotScanDir {} { } proc RobotRR {task} { - global robotSeq robotsRunning tasks robotsMax status + global control robotsRunning tasks robotsMax status puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 @@ -322,7 +314,7 @@ proc RobotRR {task} { set statusfile [open $t/status w] puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" close $statusfile - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } } @@ -345,7 +337,7 @@ proc RobotDaemonLoop {} { if {[info exists tasks]} { puts "daemon loop tasks $tasks" foreach t $tasks { - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } while {$robotsRunning} { @@ -618,6 +610,15 @@ proc RobotRedirect {task url tourl code} { } } +proc wellform {body} { + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} body + regsub -all {<[^\>]+>} $body {} abody + regsub -all { } $abody { } body + regsub -all {&} $body {&} abody + return $abody +} + proc link {task url out href body distance} { global URL control if {[expr $distance > $control($task,distance)]} return @@ -626,7 +627,8 @@ proc link {task url out href body distance} { puts $out "" puts $out "$href" - puts $out "$body" + set abody [wellform $body] + puts $out "$abody" puts $out "" if {![RobotFileExist $task visited $host $path]} { @@ -722,11 +724,9 @@ proc RobotTextHtml {task url out} { # don't print title of document content if noindex is used if {!$noindex} { puts $out "$title" - regsub -all {} $body { } abody - regsub -all -nocase {} $abody {} bbody - regsub -all {<[^\>]+>} $bbody {} nbody + set bbody [wellform $body] puts $out "" - puts $out $nbody + puts $out $bbody puts $out "" } } -nonest base { @@ -811,6 +811,8 @@ proc RobotTextPlain {task url out} { proc RobotWriteMetadata {task url out} { global URL + set charset $URL($task,$url,charset) + puts $out "" puts $out "" set distance 1000 @@ -896,6 +898,7 @@ proc RobotReadHeader {task url sock} { set version {} set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 + set URL($task,$url,charset) ISO-8859-1 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code @@ -904,6 +907,7 @@ proc RobotReadHeader {task url sock} { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($task,$url,head,[string tolower $name]) [string trim $value] } + regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) } puts "HTTP CODE $code" set URL($task,$url,state) skip @@ -1058,6 +1062,8 @@ proc checkrule {task type this} { global control global debuglevel + set default_ret 1 + if {$debuglevel > 3} { puts "CHECKRULE $type $this" } @@ -1071,6 +1077,7 @@ proc checkrule {task type this} { # consider mask (! negates) set masks [lindex $l 2] set ok 0 + set default_ret 0 foreach mask $masks { if {$debuglevel > 4} { puts "consider single mask $mask" @@ -1102,9 +1109,9 @@ proc checkrule {task type this} { } } if {$debuglevel > 3} { - puts "CHECKRULE MATCH OK" + puts "CHECKRULE MATCH DEFAULT $default_ret" } - return 1 + return $default_ret } @@ -1139,7 +1146,7 @@ proc debug {level} { } proc task {t} { - global tasks task status robotSeq control + global tasks task status control set task $t @@ -1155,7 +1162,7 @@ proc task {t} { set status($t,bad) 0 set status($t,raw) 0 set status($t,active) 1 - set robotSeq($t) 0 + set control($t,seq) 0 set control($t,distance) 10 return 1 }