#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.42 2003/06/11 08:49:09 adam Exp $
+# $Id: robot.tcl,v 1.44 2003/06/11 10:11:39 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
foreach n $ns {
if {[file isfile $n]} {
set off [string last / $n]
- incr off 2
- return $lead/[string range $n $off end]
+ # skip /
+ incr off
+ set end [string length $n]
+ # drop the 5-character "_.tkl" suffix: length-6 is the index of the last character kept
+ incr end -6
+ return $lead/[string range $n $off $end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
set off [string last / $n]
- incr off 2
+ # skip /
+ incr off
set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
proc RobotWriteRecord {outf fromurl distance} {
+ puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
puts $outf "<zmbot>"
puts $outf "<distance>"
puts $outf $distance
upvar $distancex distance
gets $inf
gets $inf
+ gets $inf
set distance [string trim [gets $inf]]
# puts "got distance = $distance"
gets $inf
return wait
}
incr control($task,seq)
- if {[file isfile $n/frobots.txt]} {
- puts "ok returning http://[string range $n $off end]/robots.txt"
+ if {[file isfile $n/robots.txt_.tkl]} {
+ # puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {$debuglevel > 3} {
puts "RobotFileExist begin area=$area host=$host path=$path"
}
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
- if {$debuglevel > 3} {
- puts "RobotFileExist end npath=$npath"
- }
- return [file exists $npath]
+ return [file exists $task/$area/$host${path}_.tkl]
}
proc RobotFileUnlink {task area host path} {
global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ set npath $task/$area/$host${path}_.tkl
# puts "npath=$npath"
set comp [split $npath /]
- if {[catch {exec rm [join $comp /]}]} return
+ if {[catch {exec rm $npath}]} return
set l [llength $comp]
- incr l -1
- incr l -1
+ incr l -2
incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
cd ./$d
if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
if {[string compare $path /robots.txt]} {
- set out [open frobots.txt w]
+ set out [open robots.txt_.tkl w]
puts "creating robots.txt in $d"
close $out
incr status($task,unvisited)
# puts "2 path=$path comp=$comp"
for {set i 0} {$i < $len} {incr i} {
- set d "d[lindex $comp $i]"
- if {[string length $d] > 1} {
+ set d [lindex $comp $i]
+ if {[string length $d] > 0} {
if {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
}
}
set d [lindex $comp $len]
- if {[string length $d]} {
- set out [open f$d $mode]
- } else {
- set out [open f $mode]
- }
+ set out [open ${d}_.tkl $mode]
if {$mode == "w"} {
incr status($task,$area)
}
return $out
}
-
proc RobotStartJob {fname t} {
global control
proc RobotWriteMetadata {task url out} {
global URL
+ set charset $URL($task,$url,charset)
+ puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
puts $out "<zmbot>"
set distance 1000
set version {}
set headbuf [string range $URL($task,$url,buf) 0 $n]
incr n 4
+ set URL($task,$url,charset) ISO-8859-1
set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
set URL($task,$url,head,[string tolower $name]) [string trim $value]
}
+ regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
}
puts "HTTP CODE $code"
set URL($task,$url,state) skip