X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=blobdiff_plain;f=robot.tcl;h=ee70b9afc44461c421fb18f837fb5fcc502e62af;hp=c9388bcd3f2ddf24a95df708c41265daf9cb8532;hb=7476a63e6732f7f51eea10bf38daaea4a31be57f;hpb=0c2ddebb45112314921d3da60f466622b7e53845 diff --git a/robot.tcl b/robot.tcl index c9388bc..ee70b9a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.20 2001/06/29 22:25:55 adam Exp $ +# $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq + global idletime ns + global status # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { @@ -67,7 +69,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)" return wait } incr robotSeq @@ -99,6 +101,7 @@ proc RobotFileExist {area host path} { } proc RobotFileUnlink {area host path} { + global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" set lpath [split $path /] @@ -109,10 +112,12 @@ proc RobotFileUnlink {area host path} { set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t # puts "npath=$npath" set comp [split $npath /] + if {[catch {exec rm [join $comp /]}]} return + set l [llength $comp] incr l -1 - if {[catch {exec rm [join $comp /]}]} return incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return @@ -130,6 +135,7 @@ proc RobotFileClose {out} { proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global workdir + global status if {![info exists workdir]} { return stdout @@ -157,6 +163,7 @@ proc RobotFileOpen {area host path {mode w}} { set out [open frobots.txt w] puts "creating robots.txt in $d" close $out + incr status(unvisited) } } } @@ -170,6 +177,9 @@ proc RobotFileOpen {area host path {mode w}} { } else { set out [open f $mode] } + if {$mode == "w"} { + incr status($area) + } cd $orgPwd return $out } @@ -201,7 +211,7 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime # puts "RobotStart" while {1} { @@ -211,7 +221,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +264,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -322,7 +334,7 @@ proc RobotHref {url hrefx hostx pathx} { foreach c $surllist { switch -- $c { .. { - if {$pathl > 0} { + if {$pathl > 1} { incr pathl -2 set path [lrange $path 0 $pathl] incr pathl @@ -337,21 +349,26 @@ proc RobotHref {url hrefx hostx pathx} { } } } - if {$pathl} { - set path [join $path /] - } else { - set path "" + if {$debuglevel > 4} { + puts "pathl=$pathl output path=$path" + } + set path [join $path /] + if {![string length $path]} { + set path / } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -417,10 +434,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -436,8 +453,9 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {))*} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody + regsub -all {