X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=73d558aad3893ad0b0b823f929247a5cb5d02dac;hb=4355628830cd0f9e27c059d20254d8e1c30896eb;hp=b2a7224d00d706046fb15d19dc868e1022123b8b;hpb=2c4a844e7d87397d31d29bd3bfc56c97a5f1618b;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index b2a7224..73d558a 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,21 +1,28 @@
+#!/usr/bin/tclsh
+# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $
#
-# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $
-#
-proc RobotFileNext {area} {
- if {[catch {set ns [glob $area/*]}]} {
+proc RobotFileNext1 {area lead} {
+ # puts "RobotFileNext1 area=$area lead=$lead"
+ if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
- set off [string first / $area]
- incr off
foreach n $ns {
if {[file isfile $n]} {
- if {[string first :.html $n] > 0} {
- return http://[string range $area/ $off end]
- }
- return http://[string range $n $off end]
+ set off [string last / $n]
+ # skip /
+ incr off
+ set end [string length $n]
+ # skip _.tkl
+ incr end -6
+ return $lead/[string range $n $off $end]
}
- if {[file isdirectory $n]} {
- set sb [RobotFileNext $n]
+ }
+ foreach n $ns {
+ if {[file isdirectory $n]} {
+ set off [string last / $n]
+ # skip /
+ incr off
+ set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
@@ -24,272 +31,1273 @@ proc RobotFileNext {area} {
return {}
}
-proc RobotFileExist {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
- incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
+proc RobotWriteRecord {outf fromurl distance} {
+ puts $outf {}
+ puts $outf ""
+ puts $outf ""
+ puts $outf $distance
+ puts $outf ""
+ puts $outf ""
+ puts $outf $fromurl
+ puts $outf ""
+ puts $outf ""
+}
+
+proc RobotReadRecord {inf fromurlx distancex} {
+ upvar $fromurlx fromurl
+ upvar $distancex distance
+ gets $inf
+ gets $inf
+ gets $inf
+ set distance [string trim [gets $inf]]
+ # puts "got distance = $distance"
+ gets $inf
+ gets $inf
+ set fromurl [string trim [gets $inf]]
+}
+
+proc RobotFileNext {task area} {
+ global control
+ global idletime ns
+ global status
+
+ # puts "RobotFileNext seq=$control($task,seq)"
+ if {$control($task,seq) < 0} {
+ return {}
}
- return [file exists [join $comp /]]
+ if {$control($task,seq) == 0} {
+ if {[catch {set ns($task) [glob $task/$area/*]}]} {
+ return done
+ }
+ }
+ # puts "ns=$ns($task)"
+ set off [string length $task/$area]
+ incr off
+ set n [lindex $ns($task) $control($task,seq)]
+ # puts "n=$n"
+ if {![string length $n]} {
+ set control($task,seq) -1
+ flush stdout
+ set statusfile [open $task/status w]
+ puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
+ close $statusfile
+ return wait
+ }
+ incr control($task,seq)
+ if {[file isfile $n/robots.txt_.tkl]} {
+ # puts "ok returning http://[string range $n $off end]/robots.txt"
+ return http://[string range $n $off end]/robots.txt
+ } elseif {[file isdirectory $n]} {
+ set sb [RobotFileNext1 $n http://[string range $n $off end]]
+ if {[string length $sb]} {
+ return $sb
+ }
+ }
+ puts "no more work at end of RobotFileNext n=$n"
+ puts "ns=$ns($task)"
+ return {}
}
-proc RobotFileUnlink {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
- incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
+
+proc RobotFileExist {task area host path} {
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
}
- if {[catch {exec rm [join $comp /]}]} return
- incr l -1
+ return [file exists $task/$area/$host${path}_.tkl]
+}
+
+proc RobotFileUnlink {task area host path} {
+ global status
+ # puts "RobotFileUnlink begin"
+ # puts "area=$area host=$host path=$path"
+ set npath $task/$area/$host${path}_.tkl
+ # puts "npath=$npath"
+ set comp [split $npath /]
+ if {[catch {exec rm $npath}]} return
+
+ set l [llength $comp]
+ incr l -2
+ incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
- exec rmdir ./$path
+ exec rmdir $path
+ }
+ # puts "RobotFileUnlink end"
+}
+
+proc RobotFileClose {out} {
+ if [string compare $out stdout] {
+ close $out
}
}
-proc RobotFileOpen {area host path} {
+proc RobotFileOpen {task area host path {mode w}} {
set orgPwd [pwd]
+ global workdir
+ global status
+ global debuglevel
+
+ # puts "RobotFileOpen task=$task path=$path"
- set comp [split $area/$host$path /]
+ if {![info exists workdir]} {
+ return stdout
+ }
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
+ if {[string compare $orgPwd $workdir]} {
+ puts "ooops. RobotFileOpen failed"
+ puts "workdir = $workdir"
+ puts "pwd = $orgPwd"
+ exit 1
+ }
+
+ set comp [split $task/$area/$host /]
set len [llength $comp]
incr len -1
- for {set i 0} {$i < $len} {incr i} {
+
+ # puts "1 comp=$comp"
+
+ for {set i 0} {$i <= $len} {incr i} {
set d [lindex $comp $i]
- if {[catch {cd ./$d}]} {
+ if {[string length $d] == 0} {
+ cd /
+ } elseif {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
+ if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
+ if {[string compare $path /robots.txt]} {
+ set out [open robots.txt_.tkl w]
+ puts "creating robots.txt in $d"
+ close $out
+ incr status($task,unvisited)
+ }
+ }
+ }
+ }
+
+ set comp [split $path /]
+ set len [llength $comp]
+ incr len -1
+
+ # puts "2 path=$path comp=$comp"
+
+ for {set i 0} {$i < $len} {incr i} {
+ set d [lindex $comp $i]
+ if {[string length $d] > 0} {
+ if {[catch {cd $d}]} {
+ exec mkdir $d
+ cd ./$d
+ }
}
}
set d [lindex $comp $len]
- if {[string length $d]} {
- set out [open $d w]
- } else {
- set out [open :.html w]
+ set out [open ${d}_.tkl $mode]
+ if {$mode == "w"} {
+ incr status($task,$area)
}
cd $orgPwd
return $out
}
-proc RobotRestart {} {
+proc RobotStartJob {fname t} {
+ global control
+
+ set f [open $fname r]
+ set xml [read $f]
+ puts "Reading $fname"
+ close $f
+ if {![regexp {([^<]*)} $xml x status]} {
+ return
+ }
+ if {$status == "done"} {
+ puts "already done"
+ return
+ }
+ puts "status = $status"
+ if {![task $t]} {
+ return
+ }
+ htmlSwitch $xml \
+ url {
+ url $body
+ } filter {
+ set type $parm(type)
+ set action $parm(action)
+ if {$type == "domain"} {
+ $action url http://$body/*
+ }
+ if {$type == "url"} {
+ $action url $body
+ }
+ if {$type == "mime"} {
+ $action mime $body
+ }
+ } distance {
+ set control($t,distance) $body
+ } status {
+ set control($t,filestatus) $body
+ }
+ if {$status == "pending"} {
+ regsub {[^<]*} $xml {running} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+ }
+}
+
+proc RobotDoneJob {t} {
+ global daemon_dir
+
+ if {![info exists daemon_dir]} {
+ return
+ }
+
+ set fname $t.tkl
+
+ set f [open $fname r]
+ set xml [read $f]
+ puts "Reading $fname"
+ regexp {([^<]*)} $xml x status
+ puts "------"
+ puts "status = $status"
+ close $f
+
+ regsub {[^<]*} $xml {done} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+}
+
+proc RobotScanDir {} {
+ global daemon_dir
+
+ if {![info exists daemon_dir]} {
+ return
+ }
+ foreach d $daemon_dir {
+ if {[catch {set files [glob $d/*.tkl]}]} {
+ return
+ }
+ foreach fname $files {
+ if {[file isfile $fname] && [file readable $fname]} {
+ set t [file rootname $fname]
+ RobotStartJob $fname $t
+ }
+ }
+ }
+}
+
+proc RobotRR {task} {
+ global control robotsRunning tasks robotsMax status
+
+ puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
+ incr robotsRunning -1
+
+ # only one task gets through...
+ if {[string compare [lindex $tasks 0] $task]} {
+ return
+ }
+ puts "RobotRR. task = $task"
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+ puts "Scan"
+ if {[catch {RobotScanDir} msg]} {
+ puts "RobotScanDir failed"
+ puts $msg
+ }
+ foreach t $tasks {
+ set statusfile [open $t/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ set control($t,seq) 0
+ RobotStart $t
+ }
+}
+
+proc RobotDaemonSig {} {
+ global daemon_cnt
+
+ incr daemon_cnt
+}
+
+proc RobotDaemonLoop {} {
+ global daemon_cnt tasks robotsRunning status
+
+ set daemon_cnt 0
+ while 1 {
+ puts $daemon_cnt
+
+ RobotScanDir
+
+ if {[info exists tasks]} {
+ puts "daemon loop tasks $tasks"
+ foreach t $tasks {
+ set control($t,seq) 0
+ RobotStart $t
+ }
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+ }
+ after 30000 RobotDaemonSig
+ vwait daemon_cnt
+ }
+}
+
+proc RobotRestart {task url sock} {
+ global URL robotsRunning
+
+ close $sock
+ after cancel $URL($sock,cancel)
+
+ foreach v [array names URL $task,$url,*] {
+ unset URL($v)
+ }
+
+ incr robotsRunning -1
+ RobotStart $task
+}
+
+proc RobotStart {task} {
global URL
+ global robotsRunning robotsMax idletime status tasks
+
+ # puts "RobotStart $task running=$robotsRunning"
+ while {1} {
+ set url [RobotFileNext $task unvisited]
+ if {[string compare $url done] == 0} {
+ puts "In RobotStart task $task done"
- while {1} {
- set url [RobotFileNext unvisited]
- if {![string length $url]} break
- set r [RobotGetUrl $url {}]
- if {!$r} {
+ catch {unset ntasks}
+ foreach t $tasks {
+ if {[string compare $t $task]} {
+ lappend ntasks $t
+ } else {
+ puts "task $t done"
+ }
+ }
+ if {![info exists ntasks]} {
+ unset tasks
+ puts "all done"
+ } else {
+ set tasks $ntasks
+ }
+ RobotDoneJob $task
return
+ }
+ if {![string length $url]} {
+ return
+ }
+ incr robotsRunning
+ if {[string compare $url wait] == 0} {
+ after $idletime [list RobotRR $task]
+ return
+ }
+ set r [RobotGetUrl $task $url {}]
+ if {!$r} {
+ if {$robotsRunning >= $robotsMax} return
} else {
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- }
+ incr robotsRunning -1
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotFileClose $outf
+ }
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+ }
}
- exit 0
}
-proc headSave {url out title} {
+proc headSave {task url out} {
global URL
-
- puts $out {}
- puts $out " $title"
- if {[info exists URL($url,head,Last-modified)]} {
- puts $out " $URL($url,head,Last-modified)"
+
+ if {[info exists URL($task,$url,head,last-modified)]} {
+ puts $out "$URL($task,$url,head,last-modified)"
}
puts $out {}
- if {[info exists URL($url,head,Date)]} {
- puts $out " $URL($url,head,Date)"
+ if {[info exists URL($task,$url,head,date)]} {
+ puts $out " $URL($task,$url,head,date)"
}
- if {[info exists URL($url,head,Content-length)]} {
- puts $out " $URL($url,head,Content-length)"
+ if {[info exists URL($task,$url,head,content-length)]} {
+ puts $out " $URL($task,$url,head,content-length)"
}
- if {[info exists URL($url,head,Server)]} {
- puts $out " $URL($url,head,Server)"
+ if {[info exists URL($task,$url,head,server)]} {
+ puts $out " $URL($task,$url,head,server)"
}
puts $out {}
- puts $out {}
- puts $out " $url"
- if {[info exists URL($url,head,Content-type)]} {
- puts $out " $URL($url,head,Content-type)"
+ puts $out {}
+ puts $out " $url"
+ if {[info exists URL($task,$url,head,content-type)]} {
+ puts $out " $URL($task,$url,head,content-type)"
}
- puts $out {}
+ puts $out {}
}
-proc RobotSave {url} {
+proc RobotHref {task url hrefx hostx pathx} {
+ global URL control debuglevel
+ upvar $hrefx href
+ upvar $hostx host
+ upvar $pathx path
+
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+
+# Skip pages that have ? in them
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
+ # get method (if any)
+ if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
+ set hpath $href
+ set method http
+ } else {
+ if {[string compare $method http]} {
+ return 0
+ }
+ }
+ # get host (if any)
+ if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+ if {![string length $surl]} {
+ set surl /
+ }
+ if {[info exist control($task,domains)]} {
+ set ok 0
+ foreach domain $control($task,domains) {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
+ }
+ if {!$ok} {
+ return 0
+ }
+ }
+ } else {
+ regexp {^([^\#]*)} $hpath x surl
+ set host $URL($task,$url,hostport)
+ }
+ if {![string length $surl]} {
+ return 0
+ }
+ if {[string first / $surl]} {
+ # relative path
+ set curpath $URL($task,$url,path)
+ if {[info exists URL($task,$url,bpath)]} {
+ set curpath $URL($task,$url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
+ set l [string last / $dpart]
+ if {[expr $l >= 0]} {
+ set surl [string range $dpart 0 $l]$surl
+ } else {
+ set surl $dpart/$surl
+ }
+ }
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
+ .. {
+ if {$pathl > 1} {
+ incr pathl -2
+ set path [lrange $path 0 $pathl]
+ incr pathl
+ }
+ }
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
+ }
+ }
+ }
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
+ }
+ regsub -all {~} $path {%7E} path
+ set href "$method://$host$path"
+
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
+ }
+ return [checkrule $task url $href]
+}
+
+proc RobotError {task url code} {
+ global URL
+
+ puts "Bad URL $url (code $code)"
+ set fromurl {}
+ set distance -1
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+}
+
+proc RobotRedirect {task url tourl code} {
global URL
+
+ puts "Redirecting from $url to $tourl"
+
+ set distance {}
+ set fromurl {}
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ if {[RobotHref $task $url tourl host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ if {![RobotFileExist $task unvisited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ } else {
+ set olddistance {}
+ set inf [RobotFileOpen $task visited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[string length $distance] == 0} {
+ set distance 1000
+ }
+ puts "distance=$distance olddistance=$olddistance"
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $tourl $distance
+ RobotFileClose $outf
+ }
+ }
+ }
+ if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
+ puts "unlink failed"
+ exit 1
+ }
+}
+
+proc wellform {body} {
+ regsub -all {} $body { } abody
+ regsub -all -nocase {