X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=2e175d623cb41f9c62a06a7b0d318a29dca11b45;hb=833faf12d797d629cae34abc8e84e88a6044eb7f;hp=b8db6c45d17b4e570a217921833f1c5a49b931b7;hpb=9b7fbb227055fc0ad4eeb10d482fd9e8ada7ce4e;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index b8db6c4..2e175d6 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.18 2001/06/07 08:17:00 adam Exp $ +# $Id: robot.tcl,v 1.25 2001/11/07 11:50:07 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -50,7 +50,7 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq global idletime ns # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { @@ -67,7 +67,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin" return wait } incr robotSeq @@ -201,7 +201,7 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime # puts "RobotStart" while {1} { @@ -211,7 +211,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +254,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -267,6 +269,9 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } + if {[string first {?} $href] >= 0} { + return 0 + } if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { return 0 } @@ -320,8 +325,9 @@ proc RobotHref {url hrefx hostx pathx} { switch -- $c { .. { if {$pathl > 0} { - incr pathl -1 + incr pathl -2 set path [lrange $path 0 $pathl] + incr pathl } } . { @@ -340,14 +346,17 @@ proc RobotHref {url hrefx hostx pathx} { } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -413,10 +422,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -432,8 +441,9 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {))*} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody + regsub -all {