X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=3ab1d816a01651c5850c8d311d394da721bc3c98;hb=64d3a6a403795bb44f89ffad04463fd7a8863184;hp=141ecacb41c1ad160478481120e7b482abda5f8d;hpb=78397776b89d9df48192574dadfa04488203d1e0;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 141ecac..3ab1d81 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.42 2003/06/11 08:49:09 adam Exp $ +# $Id: robot.tcl,v 1.44 2003/06/11 10:11:39 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} { foreach n $ns { if {[file isfile $n]} { set off [string last / $n] - incr off 2 - return $lead/[string range $n $off end] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] - incr off 2 + # skip / + incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb @@ -27,6 +32,7 @@ proc RobotFileNext1 {area lead} { } proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} puts $outf "" puts $outf "" puts $outf $distance @@ -42,6 +48,7 @@ proc RobotReadRecord {inf fromurlx distancex} { upvar $distancex distance gets $inf gets $inf + gets $inf set distance [string trim [gets $inf]] # puts "got distance = $distance" gets $inf @@ -77,8 +84,8 @@ proc RobotFileNext {task area} { return wait } incr control($task,seq) - if {[file isfile $n/frobots.txt]} { - puts "ok returning http://[string range $n $off end]/robots.txt" + if {[file isfile $n/robots.txt_.tkl]} { + # puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] @@ -98,35 +105,20 @@ proc RobotFileExist {task area host path} { if {$debuglevel > 3} { puts "RobotFileExist begin area=$area host=$host path=$path" } - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t - if {$debuglevel > 3} { - puts "RobotFileExist end npath=$npath" - } - return [file exists $npath] + return [file exists $task/$area/$host${path}_.tkl] } proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host${path}_.tkl # puts "npath=$npath" set comp [split $npath /] - if {[catch {exec rm [join $comp /]}]} return + if {[catch {exec rm $npath}]} return set l [llength $comp] - incr l -1 - incr l -1 + incr l -2 incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] @@ -178,7 +170,7 @@ proc RobotFileOpen {task area host path {mode w}} { cd ./$d if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { - set out [open frobots.txt w] + set out [open robots.txt_.tkl w] puts "creating robots.txt in $d" close $out incr status($task,unvisited) @@ -194,8 +186,8 @@ proc RobotFileOpen {task area host path {mode w}} { # puts "2 path=$path comp=$comp" for {set i 0} {$i < $len} {incr i} { - set d "d[lindex $comp $i]" - if {[string length $d] > 1} { + set d [lindex $comp $i] + if {[string length $d] > 0} { if {[catch {cd $d}]} { exec mkdir $d cd ./$d @@ -203,11 +195,7 @@ proc RobotFileOpen {task area host path {mode w}} { } } set d [lindex $comp $len] - if {[string length $d]} { - set out [open f$d $mode] - } else { - set out [open f $mode] - } + set out [open ${d}_.tkl $mode] if {$mode == "w"} { incr status($task,$area) } @@ -215,7 +203,6 @@ proc RobotFileOpen {task area host path {mode w}} { return $out } - proc RobotStartJob {fname t} { global control @@ -816,6 +803,8 @@ proc RobotTextPlain {task url out} { proc RobotWriteMetadata {task url out} { global URL + set charset $URL($task,$url,charset) + puts $out "" puts $out "" set distance 1000 @@ -901,6 +890,7 @@ proc RobotReadHeader {task url sock} { set version {} set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 + set URL($task,$url,charset) ISO-8859-1 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code @@ -909,6 +899,7 @@ proc RobotReadHeader {task url sock} { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($task,$url,head,[string tolower $name]) [string trim $value] } + regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) } puts "HTTP CODE $code" set URL($task,$url,state) skip