X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=2e175d623cb41f9c62a06a7b0d318a29dca11b45;hb=833faf12d797d629cae34abc8e84e88a6044eb7f;hp=82e5c28ce8ee9ba634c85235473b450adaa44eec;hpb=de694c0a797c955ec564b91fee7d0293ce9bd03c;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 82e5c28..2e175d6 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.9 2000/12/11 17:11:03 adam Exp $ +# $Id: robot.tcl,v 1.25 2001/11/07 11:50:07 adam Exp $ # proc RobotFileNext1 {area lead} { - puts "RobotFileNext1 area=$area lead=$lead" + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -26,35 +26,49 @@ proc RobotFileNext1 {area lead} { return {} } -proc RobotFileWait {} { - global robotSeq - set robotSeq 0 +proc RobotWriteRecord {outf fromurl distance} { + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + set distance [string trim [gets $inf]] + # puts "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq - puts "RobotFileNext robotSeq=$robotSeq" - if {[catch {set ns [glob ${area}/*]}]} { - return {} + global robotSeq global idletime ns + + # puts "RobotFileNext robotSeq=$robotSeq" + if {$robotSeq < 0} { + return {} + } + if {$robotSeq == 0} { + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } } set off [string length $area] incr off - set n [lindex $ns $robotSeq] if {![string length $n]} { - flush stdout - puts "------------ N E X T R O U N D --------" set robotSeq -1 - after 60000 RobotFileWait - vwait robotSeq - - set n [lindex $ns $robotSeq] - if {![string length $n]} { - puts "robotSeq = $robotSeq" - puts "ns=$ns" - puts "no more work at index" - return {} - } + flush stdout + puts "Round robin" + return wait } incr robotSeq if {[file isfile $n/frobots.txt]} { @@ -73,29 +87,27 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" - puts "RobotFileExist end" + # puts "RobotFileExist end npath=$npath" return [file exists $npath] } proc RobotFileUnlink {area host path} { - puts "RobotFileUnlink begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" + # puts "npath=$npath" set comp [split $npath /] set l [llength $comp] incr l -1 @@ -106,7 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } - puts "RobotFileUnlink end" + # puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -122,7 +134,7 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -140,7 +152,6 @@ proc RobotFileOpen {area host path {mode w}} { } if {[catch {cd ./$d}]} { exec mkdir $d - puts "creating $d" cd ./$d if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { set out [open frobots.txt w] @@ -153,39 +164,68 @@ proc RobotFileOpen {area host path {mode w}} { if {[string length $d]} { if {[file isdirectory $d]} { set out [open $d/f $mode] - puts "1" } else { set out [open f$d $mode] - puts "2" } } else { set out [open f $mode] - puts "3" } cd $orgPwd - #puts "RobotFileStop" return $out } -proc RobotRestart {sock} { - global URL - global robotMoreWork - +proc RobotRR {} { + global robotSeq robotsRunning + + incr robotsRunning -1 + while {$robotsRunning} { + vwait robotsRunning + } + set robotSeq 0 + RobotStart +} + +proc RobotRestart {url sock} { + global URL robotsRunning + close $sock after cancel $URL($sock,cancel) - while {1} { + + foreach v [array names URL $url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart +} + +proc RobotStart {} { + global URL + global robotsRunning robotsMax idletime + + # puts "RobotStart" + while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { - break + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idletime RobotRR + return } set r [RobotGetUrl $url {}] if {!$r} { - return + if {$robotsRunning >= $robotsMax} return } else { - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - } + incr robotsRunning -1 + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileClose $outf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + } } - incr robotMoreWork -1 } proc headSave {url out} { @@ -214,12 +254,27 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } + + if {[string first { } $href] >= 0} { + return 0 + } + if {[string length $href] > 256} { + return 0 + } + if {[string first {?} $href] >= 0} { + return 0 + } + if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { + return 0 + } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -234,19 +289,21 @@ proc RobotHref {url hrefx hostx pathx} { if {![string length $surl]} { set surl / } - set ok 0 - foreach domain $domains { - if {[string match $domain $host]} { - set ok 1 - break + if {[info exist domains]} { + set ok 0 + foreach domain $domains { + if {[string match $domain $host]} { + set ok 1 + break + } } - } - if {!$ok} { - return 0 - } + if {!$ok} { + return 0 + } + } } else { regexp {^([^\#]*)} $hpath x surl - set host $URL($url,host) + set host $URL($url,hostport) } if {![string length $surl]} { return 0 @@ -261,58 +318,56 @@ proc RobotHref {url hrefx hostx pathx} { set surl $dpart/$surl } } - set c [split $surl /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - if {$i < 0} { - set i 0 + if {$pathl > 0} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl } } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } } - regsub -all {~} $path {%7E} path - set ok 1 - if {[info exists URL($host,robots)]} { - foreach l $URL($host,robots) { - if {[string first [lindex $l 1] $path] == 0} { - set ok [lindex $l 0] - break - } - } + if {$pathl} { + set path [join $path /] + } else { + set path "" } + regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href, ok=$ok" - return $ok + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} - if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] - close $inf - } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url $code" - puts $outf "Reference $fromurl" + set distance -1 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } @@ -322,35 +377,57 @@ proc RobotRedirect {url tourl code} { puts "Redirecting from $url to $tourl" + set distance {} set fromurl {} - if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance RobotFileClose $inf } - if {[catch {RobotFileUnlink unvisited $URL($url,host) $URL($url,path)}]} { - puts "unlink failed" - exit 1 - } - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url to $tourl $code" - puts $outf "Reference $fromurl" + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } if {[RobotHref $url tourl host path]} { - if {![RobotFileExist unvisited $host $path]} { - puts "Mark as unvisited" - set outf [RobotFileOpen unvisited $host $path] - puts $outf $code - RobotFileClose $outf + if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + puts "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } } } + if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + puts "unlink failed" + exit 1 + } } proc RobotTextHtml {url out} { - global URL + global URL maxdistance + set distance 0 + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { + set distance [expr $URL($url,dist) + 1] + } htmlSwitch $URL($url,buf) \ title { puts $out "$body" @@ -364,32 +441,108 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody + regsub -all {