X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=73d558aad3893ad0b0b823f929247a5cb5d02dac;hb=4355628830cd0f9e27c059d20254d8e1c30896eb;hp=2d9ea64253d2dcf8b24e82ff07888f570d48f1cf;hpb=e0e42a2b8319ccf7b548ba425d42ba4c82b97ce0;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 2d9ea64..73d558a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.41 2003/06/10 13:16:16 adam Exp $ +# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} { foreach n $ns { if {[file isfile $n]} { set off [string last / $n] - incr off 2 - return $lead/[string range $n $off end] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] - incr off 2 + # skip / + incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb @@ -27,6 +32,7 @@ proc RobotFileNext1 {area lead} { } proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} puts $outf "" puts $outf "" puts $outf $distance @@ -42,6 +48,7 @@ proc RobotReadRecord {inf fromurlx distancex} { upvar $distancex distance gets $inf gets $inf + gets $inf set distance [string trim [gets $inf]] # puts "got distance = $distance" gets $inf @@ -50,15 +57,15 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {task area} { - global robotSeq + global control global idletime ns global status - # puts "RobotFileNext robotSeq=$robotSeq($task)" - if {$robotSeq($task) < 0} { + # puts "RobotFileNext seq=$control($task,seq)" + if {$control($task,seq) < 0} { return {} } - if {$robotSeq($task) == 0} { + if {$control($task,seq) == 0} { if {[catch {set ns($task) [glob $task/$area/*]}]} { return done } @@ -66,19 +73,19 @@ proc RobotFileNext {task area} { # puts "ns=$ns($task)" set off [string length $task/$area] incr off - set n [lindex $ns($task) $robotSeq($task)] + set n [lindex $ns($task) $control($task,seq)] # puts "n=$n" if {![string length $n]} { - set robotSeq($task) -1 + set control($task,seq) -1 flush stdout set statusfile [open $task/status w] puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } - incr robotSeq($task) - if {[file isfile $n/frobots.txt]} { - puts "ok returning http://[string range $n $off end]/robots.txt" + incr control($task,seq) + if {[file isfile $n/robots.txt_.tkl]} { + # puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] @@ -98,35 +105,20 @@ proc RobotFileExist {task area host path} { if {$debuglevel > 3} { puts "RobotFileExist begin area=$area host=$host path=$path" } - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t - if {$debuglevel > 3} { - puts "RobotFileExist end npath=$npath" - } - return [file exists $npath] + return [file exists $task/$area/$host${path}_.tkl] } proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host${path}_.tkl # puts "npath=$npath" set comp [split $npath /] - if {[catch {exec rm [join $comp /]}]} return + if {[catch {exec rm $npath}]} return set l [llength $comp] - incr l -1 - incr l -1 + incr l -2 incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] @@ -148,7 +140,7 @@ proc RobotFileOpen {task area host path {mode w}} { global status global debuglevel - puts "RobotFileOpen task=$task path=$path" + # puts "RobotFileOpen task=$task path=$path" if {![info exists workdir]} { return stdout @@ -174,12 +166,11 @@ proc RobotFileOpen {task area host path {mode w}} { if {[string length $d] == 0} { cd / } elseif {[catch {cd $d}]} { - puts "mkdir (1) d=$d" exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { - set out [open frobots.txt w] + set out [open robots.txt_.tkl w] puts "creating robots.txt in $d" close $out incr status($task,unvisited) @@ -195,21 +186,16 @@ proc RobotFileOpen {task area host path {mode w}} { # puts "2 path=$path comp=$comp" for {set i 0} {$i < $len} {incr i} { - set d "d[lindex $comp $i]" - if {[string length $d] > 1} { + set d [lindex $comp $i] + if {[string length $d] > 0} { if {[catch {cd $d}]} { - puts "mkdir (2) d=$d" exec mkdir $d cd ./$d } } } set d [lindex $comp $len] - if {[string length $d]} { - set out [open f$d $mode] - } else { - set out [open f $mode] - } + set out [open ${d}_.tkl $mode] if {$mode == "w"} { incr status($task,$area) } @@ -217,11 +203,9 @@ proc RobotFileOpen {task area host path {mode w}} { return $out } - proc RobotStartJob {fname t} { global control - set f [open $fname r] set xml [read $f] puts "Reading $fname" @@ -308,7 +292,7 @@ proc RobotScanDir {} { } proc RobotRR {task} { - global robotSeq robotsRunning tasks robotsMax status + global control robotsRunning tasks robotsMax status puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 @@ -330,7 +314,7 @@ proc RobotRR {task} { set statusfile [open $t/status w] puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" close $statusfile - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } } @@ -353,7 +337,7 @@ proc RobotDaemonLoop {} { if {[info exists tasks]} { puts "daemon loop tasks $tasks" foreach t $tasks { - set robotSeq($t) 0 + set control($t,seq) 0 RobotStart $t } while {$robotsRunning} { @@ -626,6 +610,15 @@ proc RobotRedirect {task url tourl code} { } } +proc wellform {body} { + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} body + regsub -all {<[^\>]+>} $body {} abody + regsub -all { } $abody { } body + regsub -all {&} $body {&} abody + return $abody +} + proc link {task url out href body distance} { global URL control if {[expr $distance > $control($task,distance)]} return @@ -634,7 +627,8 @@ proc link {task url out href body distance} { puts $out "" puts $out "$href" - puts $out "$body" + set abody [wellform $body] + puts $out "$abody" puts $out "" if {![RobotFileExist $task visited $host $path]} { @@ -730,11 +724,9 @@ proc RobotTextHtml {task url out} { # don't print title of document content if noindex is used if {!$noindex} { puts $out "$title" - regsub -all {} $body { } abody - regsub -all -nocase {} $abody {} bbody - regsub -all {<[^\>]+>} $bbody {} nbody + set bbody [wellform $body] puts $out "" - puts $out $nbody + puts $out $bbody puts $out "" } } -nonest base { @@ -819,6 +811,8 @@ proc RobotTextPlain {task url out} { proc RobotWriteMetadata {task url out} { global URL + set charset $URL($task,$url,charset) + puts $out "" puts $out "" set distance 1000 @@ -904,6 +898,7 @@ proc RobotReadHeader {task url sock} { set version {} set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 + set URL($task,$url,charset) ISO-8859-1 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code @@ -912,6 +907,7 @@ proc RobotReadHeader {task url sock} { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($task,$url,head,[string tolower $name]) [string trim $value] } + regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) } puts "HTTP CODE $code" set URL($task,$url,state) skip @@ -1150,7 +1146,7 @@ proc debug {level} { } proc task {t} { - global tasks task status robotSeq control + global tasks task status control set task $t @@ -1166,7 +1162,7 @@ proc task {t} { set status($t,bad) 0 set status($t,raw) 0 set status($t,active) 1 - set robotSeq($t) 0 + set control($t,seq) 0 set control($t,distance) 10 return 1 }