X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=bfe875f339e32169de55f564fdd4aec58d213643;hb=bd463f7d1f1610a3c7a3d9e678f5c4ff27f9d546;hp=ee70b9afc44461c421fb18f837fb5fcc502e62af;hpb=7476a63e6732f7f51eea10bf38daaea4a31be57f;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index ee70b9a..bfe875f 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $
+# $Id: robot.tcl,v 1.32 2002/03/25 16:11:08 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@ -69,7 +69,9 @@ proc RobotFileNext {area} {
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)"
+ set statusfile [open status w]
+ puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ close $statusfile
return wait
}
incr robotSeq
@@ -89,14 +91,20 @@ proc RobotFileNext {area} {
proc RobotFileExist {area host path} {
- # puts "RobotFileExist begin area=$area host=$host path=$path"
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ }
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- # puts "RobotFileExist end npath=$npath"
+ if {$debuglevel > 3} {
+ puts "RobotFileExist end npath=$npath"
+ }
return [file exists $npath]
}
@@ -136,11 +144,14 @@ proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
global status
+ global debuglevel
if {![info exists workdir]} {
return stdout
}
- #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
@@ -160,20 +171,18 @@ proc RobotFileOpen {area host path {mode w}} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open frobots.txt w]
- puts "creating robots.txt in $d"
- close $out
- incr status(unvisited)
+ if {[string compare $path /robots.txt]} {
+ set out [open frobots.txt w]
+ puts "creating robots.txt in $d"
+ close $out
+ incr status(unvisited)
+ }
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- if {[file isdirectory $d]} {
- set out [open $d/f $mode]
- } else {
- set out [open f$d $mode]
- }
+ set out [open f$d $mode]
} else {
set out [open f $mode]
}
@@ -279,12 +288,12 @@ proc RobotHref {url hrefx hostx pathx} {
if {[string length $href] > 256} {
return 0
}
- if {[string first {?} $href] >= 0} {
- return 0
- }
- if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
- return 0
- }
+# if {[string first {?} $href] >= 0} {
+# return 0
+# }
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
@@ -320,7 +329,11 @@ proc RobotHref {url hrefx hostx pathx} {
}
if {[string first / $surl]} {
# relative path
- regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set curpath $URL($url,path)
+ if {[info exists URL($url,bpath)]} {
+ set curpath $URL($url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
@@ -433,182 +446,142 @@ proc RobotRedirect {url tourl code} {
}
}
+proc link {url out href body distance} {
+ global URL maxdistance
+ if {[expr $distance > $maxdistance]} return
+
+ if {![RobotHref $url href host path]} return
+
+ puts $out "