X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=b6466c347186d9954fbdda74c78ed7054fab76c5;hb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;hp=15468bfdfdf5bfd5408b783ed5f9b1e4f20f8b33;hpb=04f32b20fae8795aab0c1f8b703394056f3efea3;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index 15468bf..b6466c3 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,24 +1,23 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
+# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
#
-proc RobotFileNext1 {area} {
+proc RobotFileNext1 {area lead} {
+ # puts "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
- set off [string first / $area]
- incr off
-
foreach n $ns {
if {[file isfile $n]} {
- if {[string first :.html $n] > 0} {
- return http://[string range $area/ $off end]
- }
- return http://[string range $n $off end]
+ set off [string last / $n]
+ incr off 2
+ return $lead/[string range $n $off end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n]
+ set off [string last / $n]
+ incr off 2
+ set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
@@ -27,73 +26,112 @@ proc RobotFileNext1 {area} {
return {}
}
-proc RobotFileWait {} {
- global robotSeq
- set robotSeq 0
+proc RobotWriteRecord {outf fromurl distance} {
+ puts $outf ""
+ puts $outf ""
+ puts $outf $distance
+ puts $outf ""
+ puts $outf ""
+ puts $outf $fromurl
+ puts $outf ""
+ puts $outf ""
+}
+
+proc RobotReadRecord {inf fromurlx distancex} {
+ upvar $fromurlx fromurl
+ upvar $distancex distance
+ gets $inf
+ gets $inf
+ set distance [string trim [gets $inf]]
+ # puts "got distance = $distance"
+ gets $inf
+ gets $inf
+ set fromurl [string trim [gets $inf]]
}
proc RobotFileNext {area} {
global robotSeq
- if {[catch {set ns [glob ${area}/*]}]} {
- return {}
+ global idletime ns
+ global status
+
+ # puts "RobotFileNext robotSeq=$robotSeq"
+ if {$robotSeq < 0} {
+ return {}
+ }
+ if {$robotSeq == 0} {
+ if {[catch {set ns [glob ${area}/*]}]} {
+ return {}
+ }
}
set off [string length $area]
incr off
-
set n [lindex $ns $robotSeq]
if {![string length $n]} {
- puts "------------ N E X T R O U N D --------"
set robotSeq -1
- after 2000 RobotFileWait
- vwait robotSeq
-
- set n [lindex $ns $robotSeq]
- if {![string length $n]} {
- return {}
- }
+ flush stdout
+ set statusfile [open status w]
+ puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ close $statusfile
+ return wait
}
incr robotSeq
- if {[file isfile $n/robots.txt]} {
+ if {[file isfile $n/frobots.txt]} {
puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n]
+ set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
}
+ puts "no more work at end of RobotFileNext n=$n"
+ puts "ns=$ns"
return {}
}
proc RobotFileExist {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ }
+ set lpath [split $path /]
+ set l [llength $lpath]
incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- } elseif {[file exists [join $comp /]]} {
- return 1
- } else {
- set comp [split $area/$host$path/:.html /]
+ set t [lindex $lpath $l]
+ incr l -1
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ if {$debuglevel > 3} {
+ puts "RobotFileExist end npath=$npath"
}
- return [file exists [join $comp /]]
+ return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
+ global status
+ # puts "RobotFileUnlink begin"
+ # puts "area=$area host=$host path=$path"
+ set lpath [split $path /]
+ set l [llength $lpath]
incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- }
+ set t [lindex $lpath $l]
+ incr l -1
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ # puts "npath=$npath"
+ set comp [split $npath /]
if {[catch {exec rm [join $comp /]}]} return
+
+ set l [llength $comp]
+ incr l -1
incr l -1
+ incr status($area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
+ # puts "RobotFileUnlink end"
}
proc RobotFileClose {out} {
@@ -105,12 +143,17 @@ proc RobotFileClose {out} {
proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
+ global status
+ global debuglevel
if {![info exists workdir]} {
return stdout
}
- puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
if {[string compare $orgPwd $workdir]} {
+ puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
puts "pwd = $orgPwd"
exit 1
@@ -119,56 +162,99 @@ proc RobotFileOpen {area host path {mode w}} {
set len [llength $comp]
incr len -1
for {set i 0} {$i < $len} {incr i} {
- set d [lindex $comp $i]
+ if {$i > 1} {
+ set d "d[lindex $comp $i]"
+ } else {
+ set d [lindex $comp $i]
+ }
if {[catch {cd ./$d}]} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open robots.txt w]
+ set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
+ incr status(unvisited)
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- if {[file isdirectory $d]} {
- set out [open $d/:.html $mode]
- } else {
- set out [open $d $mode]
- }
+ set out [open f$d $mode]
+ if {0} {
+ if {[file isfile $d/f]} {
+ set out [open $d/f $mode]
+ } else {
+ set out [open f$d $mode]
+ }
+ }
} else {
- set out [open :.html $mode]
+ set out [open f $mode]
+ }
+ if {$mode == "w"} {
+ incr status($area)
}
cd $orgPwd
- #puts "RobotFileStop"
return $out
}
-proc RobotRestart {} {
+proc RobotRR {} {
+ global robotSeq robotsRunning
+
+ incr robotsRunning -1
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+ set robotSeq 0
+ RobotStart
+}
+
+proc RobotRestart {url sock} {
+ global URL robotsRunning
+
+ close $sock
+ after cancel $URL($sock,cancel)
+
+ foreach v [array names URL $url,*] {
+ unset URL($v)
+ }
+
+ incr robotsRunning -1
+ RobotStart
+}
+
+proc RobotStart {} {
global URL
- global robotMoreWork
-
- while {1} {
+ global robotsRunning robotsMax idletime
+
+ # puts "RobotStart"
+ while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
- break
+ return
+ }
+ incr robotsRunning
+ if {[string compare $url wait] == 0} {
+ after $idletime RobotRR
+ return
}
set r [RobotGetUrl $url {}]
if {!$r} {
- puts "RobotGetUrl returned 0 on url=$url"
- return
+ if {$robotsRunning >= $robotsMax} return
} else {
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- }
+ incr robotsRunning -1
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotFileClose $outf
+ }
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ }
}
- set robotMoreWork 0
}
proc headSave {url out} {
global URL
- puts $out {}
if {[info exists URL($url,head,last-modified)]} {
puts $out "$URL($url,head,last-modified)"
}
@@ -192,12 +278,27 @@ proc headSave {url out} {
}
proc RobotHref {url hrefx hostx pathx} {
- global URL domains
+ global URL domains debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
- puts "Ref url = $url href=$href"
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+ if {[string first {?} $href] >= 0} {
+ return 0
+ }
+ if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+ return 0
+ }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
@@ -212,26 +313,32 @@ proc RobotHref {url hrefx hostx pathx} {
if {![string length $surl]} {
set surl /
}
- set ok 0
- foreach domain $domains {
- if {[string match $domain $host]} {
- set ok 1
- break
+ if {[info exist domains]} {
+ set ok 0
+ foreach domain $domains {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
}
- }
- if {!$ok} {
- return 0
- }
+ if {!$ok} {
+ return 0
+ }
+ }
} else {
regexp {^([^\#]*)} $hpath x surl
- set host $URL($url,host)
+ set host $URL($url,hostport)
}
if {![string length $surl]} {
return 0
}
if {[string first / $surl]} {
# relative path
- regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set curpath $URL($url,path)
+ if {[info exists URL($url,bpath)]} {
+ set curpath $URL($url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
@@ -239,112 +346,172 @@ proc RobotHref {url hrefx hostx pathx} {
set surl $dpart/$surl
}
}
- set c [split $surl /]
- set i [llength $c]
- incr i -1
- set path [lindex $c $i]
- incr i -1
- while {$i >= 0} {
- switch -- [lindex $c $i] {
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
.. {
- incr i -2
- }
- . {
- incr i -1
+ if {$pathl > 1} {
+ incr pathl -2
+ set path [lrange $path 0 $pathl]
+ incr pathl
+ }
}
- default {
- set path [lindex $c $i]/$path
- incr i -1
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
}
}
- }
+ }
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
+ }
+ regsub -all {~} $path {%7E} path
set href "$method://$host$path"
- puts "Ref href = $href"
- return 1
-}
-proc Robot401 {url} {
- global URL
-
- puts "Bad URL $url"
- set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
- close $inf
- }
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 401"
- puts $outf "Reference $fromurl"
- RobotFileClose $outf
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
}
+ return [checkrule url $href]
}
-proc Robot404 {url} {
+proc RobotError {url code} {
global URL
- puts "Bad URL $url"
+ puts "Bad URL $url (code $code)"
set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
+ set distance -1
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 404"
- puts $outf "Reference $fromurl"
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
- }
+}
-proc Robot301 {url tourl} {
+proc RobotRedirect {url tourl code} {
global URL
puts "Redirecting from $url to $tourl"
+ set distance {}
set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url to $tourl 301"
- puts $outf "Reference $fromurl"
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
if {[RobotHref $url tourl host path]} {
- if {![RobotFileExist unvisited $host $path]} {
- puts "Mark as unvisited"
- set outf [RobotFileOpen unvisited $host $path]
- puts $outf 301
- RobotFileClose $outf
+ if {![RobotFileExist visited $host $path]} {
+ if {![RobotFileExist unvisited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ } else {
+ set olddistance {}
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[string length $distance] == 0} {
+ set distance 1000
+ }
+ puts "distance=$distance olddistance=$olddistance"
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $tourl $distance
+ RobotFileClose $outf
+ }
}
}
+ if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
+ puts "unlink failed"
+ exit 1
+ }
+}
+
+proc link {url out href body distance} {
+ global URL maxdistance
+ if {[expr $distance > $maxdistance]} return
+
+ if {![RobotHref $url href host path]} return
+
+ puts $out ""
+ puts $out "$href"
+ puts $out "$body"
+ puts $out ""
+
+ if {![RobotFileExist visited $host $path]} {
+ set olddistance 1000
+ if {![RobotFileExist bad $host $path]} {
+ if {[RobotFileExist unvisited $host $path]} {
+ set inf [RobotFileOpen unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+ close $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ puts "OK remarking url=$url href=$href"
+ puts "olddistance = $olddistance"
+ puts "newdistance = $distance"
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ }
}
proc RobotTextHtml {url out} {
- global URL
+ global URL maxdistance
- set head 0
+ set distance 0
+ set fdistance 0
+ if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
+ set fdistance $URL($url,dist)
+ set distance [expr $fdistance + 1]
+ }
htmlSwitch $URL($url,buf) \
title {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts $out "$body"
} -nonest meta {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts -nonewline $out "}
} body {
- regsub -all -nocase {} $body {} abody
- regsub -all {<[^\>]+>} $abody {} nbody
+ regsub -all {