X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=b6466c347186d9954fbdda74c78ed7054fab76c5;hb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;hp=6323bc3421614b9ce1eb447fbcb73fdd53445959;hpb=4d94083b545d3665a3ceca7962ebb6788bc62dd3;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index 6323bc3..b6466c3 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.19 2001/06/29 21:47:31 adam Exp $
+# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} {
}
proc RobotFileNext {area} {
- global robotSeq global idleTime ns
+ global robotSeq
+ global idletime ns
+ global status
# puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
@@ -67,7 +69,9 @@ proc RobotFileNext {area} {
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "------------ N E X T R O U N D --------"
+ set statusfile [open status w]
+ puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ close $statusfile
return wait
}
incr robotSeq
@@ -87,18 +91,25 @@ proc RobotFileNext {area} {
proc RobotFileExist {area host path} {
- # puts "RobotFileExist begin area=$area host=$host path=$path"
+ global debuglevel
+ # Tests for $area/$host + leading parts of $path joined by "/d" + "/f<last part>"; traces both ends when debuglevel > 3.
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ }
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- # puts "RobotFileExist end npath=$npath"
+ if {$debuglevel > 3} {
+ puts "RobotFileExist end npath=$npath"
+ }
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
+ global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
set lpath [split $path /]
@@ -109,10 +120,12 @@ proc RobotFileUnlink {area host path} {
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
# puts "npath=$npath"
set comp [split $npath /]
+ if {[catch {exec rm [join $comp /]}]} return
+
set l [llength $comp]
incr l -1
- if {[catch {exec rm [join $comp /]}]} return
incr l -1
+ incr status($area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
@@ -130,11 +143,15 @@ proc RobotFileClose {out} {
proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
+ global status
+ global debuglevel
if {![info exists workdir]} {
return stdout
}
- #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
@@ -157,19 +174,26 @@ proc RobotFileOpen {area host path {mode w}} {
set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
+ incr status(unvisited)
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- if {[file isdirectory $d]} {
- set out [open $d/f $mode]
- } else {
- set out [open f$d $mode]
- }
+ set out [open f$d $mode]
+ if {0} {
+ if {[file isfile $d/f]} {
+ set out [open $d/f $mode]
+ } else {
+ set out [open f$d $mode]
+ }
+ }
} else {
set out [open f $mode]
}
+ if {$mode == "w"} {
+ incr status($area)
+ }
cd $orgPwd
return $out
}
@@ -201,7 +225,7 @@ proc RobotRestart {url sock} {
proc RobotStart {} {
global URL
- global robotsRunning robotsMax idleTime
+ global robotsRunning robotsMax idletime
# puts "RobotStart"
while {1} {
@@ -211,7 +235,7 @@ proc RobotStart {} {
}
incr robotsRunning
if {[string compare $url wait] == 0} {
- after $idleTime RobotRR
+ after $idletime RobotRR
return
}
set r [RobotGetUrl $url {}]
@@ -254,12 +278,14 @@ proc headSave {url out} {
}
proc RobotHref {url hrefx hostx pathx} {
- global URL domains
+ global URL domains debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
- puts "Ref url = $url href=$href"
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
if {[string first { } $href] >= 0} {
return 0
@@ -308,7 +334,11 @@ proc RobotHref {url hrefx hostx pathx} {
}
if {[string first / $surl]} {
# relative path
- regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set curpath $URL($url,path)
+ if {[info exists URL($url,bpath)]} {
+ set curpath $URL($url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
@@ -322,9 +352,10 @@ proc RobotHref {url hrefx hostx pathx} {
foreach c $surllist {
switch -- $c {
.. {
- if {$pathl > 0} {
- incr pathl -1
+ if {$pathl > 1} {
+ incr pathl -2
set path [lrange $path 0 $pathl]
+ incr pathl
}
}
. {
@@ -336,21 +367,26 @@ proc RobotHref {url hrefx hostx pathx} {
}
}
}
- if {$pathl} {
- set path [join $path /]
- } else {
- set path ""
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
}
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
- puts "Ref href = $href"
- return 1
+
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
+ }
+ return [checkrule url $href]
}
proc RobotError {url code} {
global URL
- puts "Bad URL $url, $code"
+ puts "Bad URL $url (code $code)"
set fromurl {}
set distance -1
if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
@@ -415,12 +451,62 @@ proc RobotRedirect {url tourl code} {
}
}
+proc link {url out href body distance} {
+    # Write a record for one link to $out, then queue its target in "unvisited" when this route is the shortest seen.
+    # url: page carrying the link; href: link target (RobotHref rewrites it); body: text written after it; distance: value stored with the target.
+    global URL maxdistance
+    if {$distance > $maxdistance} return
+    # RobotHref updates href in place and fills host/path; a false result means the link is not followed.
+    if {![RobotHref $url href host path]} return
+    puts $out ""
+    puts $out "$href"
+    puts $out "$body"
+    puts $out ""
+    if {![RobotFileExist visited $host $path]} {
+        set olddistance 1000
+        if {![RobotFileExist bad $host $path]} {
+            if {[RobotFileExist unvisited $host $path]} {
+                set inf [RobotFileOpen unvisited $host $path r]
+                RobotReadRecord $inf oldurl olddistance
+                RobotFileClose $inf
+            }
+        } else {
+            # Target is filed under "bad": 0 stops any non-negative distance from queueing it again.
+            set olddistance 0
+        }
+        # An empty stored distance is treated as 1000, i.e. any closer route wins.
+        if {![string length $olddistance]} {set olddistance 1000}
+        if {$distance < $olddistance} {
+            set outf [RobotFileOpen unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    } elseif {[string compare $href $url]} {
+        # Visited target other than the page itself: queue it again only for a shorter route.
+        set inf [RobotFileOpen visited $host $path r]
+        RobotReadRecord $inf xurl olddistance
+        # Close via RobotFileClose like the other call sites; RobotFileOpen returns stdout when workdir is unset.
+        RobotFileClose $inf
+        if {![string length $olddistance]} {set olddistance 1000}
+        if {$distance < $olddistance} {
+            puts "OK remarking url=$url href=$href"
+            puts "olddistance = $olddistance"
+            puts "newdistance = $distance"
+            set outf [RobotFileOpen unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    }
+}
+
proc RobotTextHtml {url out} {
- global URL maxDistance
+ global URL maxdistance
set distance 0
- if {$maxDistance < 1000 && [info exists URL($url,dist)]} {
- set distance [expr $URL($url,dist) + 1]
+ set fdistance 0
+ if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
+ set fdistance $URL($url,dist)
+ set distance [expr $fdistance + 1]
}
htmlSwitch $URL($url,buf) \
title {
@@ -435,111 +521,34 @@ proc RobotTextHtml {url out} {
}
puts $out {>}
} body {
- regsub -all -nocase {} $body {} abody
- regsub -all {<[^\>]+>} $abody {} nbody
+ regsub -all {