X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=2e175d623cb41f9c62a06a7b0d318a29dca11b45;hb=833faf12d797d629cae34abc8e84e88a6044eb7f;hp=9b672cd52968a1164de792fed43818eacd4a45d1;hpb=aff790b94e4679bf6e1b6e181e9dc28ecf1c75ba;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 9b672cd..2e175d6 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.12 2001/01/23 14:28:41 adam Exp $ +# $Id: robot.tcl,v 1.25 2001/11/07 11:50:07 adam Exp $ # proc RobotFileNext1 {area lead} { - puts "RobotFileNext1 area=$area lead=$lead" + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -43,16 +43,16 @@ proc RobotReadRecord {inf fromurlx distancex} { gets $inf gets $inf set distance [string trim [gets $inf]] - puts "got distance = $distance" + # puts "got distance = $distance" gets $inf gets $inf set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq global idletime ns - puts "RobotFileNext robotSeq=$robotSeq" + # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { return {} } @@ -67,7 +67,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin" return wait } incr robotSeq @@ -87,27 +87,27 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin area=$area host=$host path=$path" + # puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "RobotFileExist end npath=$npath" + # puts "RobotFileExist end npath=$npath" return [file exists $npath] } proc RobotFileUnlink {area host path} { - puts "RobotFileUnlink begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" + # puts "npath=$npath" set comp [split $npath /] set l [llength $comp] incr l -1 @@ -118,7 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } - puts "RobotFileUnlink end" + # puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -134,7 +134,7 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -201,9 +201,9 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime - puts "RobotStart" + # puts "RobotStart" while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -211,7 +211,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,16 +254,24 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 } + if {[string length $href] > 256} { + return 0 + } + if {[string first {?} $href] >= 0} { + return 0 + } if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { return 0 } @@ -310,38 +318,45 @@ proc RobotHref {url hrefx hostx pathx} { set surl $dpart/$surl } } - set c [split $surl /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - if {$i < 0} { - set i 0 + if {$pathl > 0} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl } } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } } + if {$pathl} { + set path [join $path /] + } else { + set path "" + } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -407,10 +422,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -426,8 +441,9 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody + regsub -all {