X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=b6466c347186d9954fbdda74c78ed7054fab76c5;hb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;hp=f6d7900efe8be4dafaddf520ddc46886719b8676;hpb=8c6278f1761ad232bdc02ac2dffbeaf3a9258e59;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index f6d7900..b6466c3 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.26 2001/11/08 13:49:06 adam Exp $ +# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idletime ns + global robotSeq + global idletime ns + global status # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { @@ -67,7 +69,9 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "Round robin" + set statusfile [open status w] + puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + close $statusfile return wait } incr robotSeq @@ -87,18 +91,25 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - # puts "RobotFileExist begin area=$area host=$host path=$path" + global debuglevel + + if {$debuglevel > 3} { + puts "RobotFileExist begin area=$area host=$host path=$path" + } set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - # puts "RobotFileExist end npath=$npath" + if {$debuglevel > 3} { + puts "RobotFileExist end npath=$npath" + } return [file exists $npath] } proc RobotFileUnlink {area host path} { + global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" set lpath [split $path /] @@ -109,10 +120,12 @@ proc RobotFileUnlink {area host path} { set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t # puts "npath=$npath" set comp [split $npath /] + if {[catch {exec rm [join $comp /]}]} return + set l [llength $comp] incr l -1 - if {[catch {exec rm [join $comp /]}]} return incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return @@ -130,11 +143,15 @@ proc RobotFileClose {out} { proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global workdir + global status + global debuglevel if {![info exists workdir]} { return stdout } - #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + if {$debuglevel > 3} { + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -157,19 +174,26 @@ proc RobotFileOpen {area host path {mode w}} { set out [open frobots.txt w] puts "creating robots.txt in $d" close $out + incr status(unvisited) } } } set d [lindex $comp $len] if {[string length $d]} { - if {[file isdirectory $d]} { - set out [open $d/f $mode] - } else { - set out [open f$d $mode] - } + set out [open f$d $mode] + if {0} { + if {[file isfile $d/f]} { + set out [open $d/f $mode] + } else { + set out [open f$d $mode] + } + } } else { set out [open f $mode] } + if {$mode == "w"} { + incr status($area) + } cd $orgPwd return $out } @@ -310,7 +334,11 @@ proc RobotHref {url hrefx hostx pathx} { } if {[string first / $surl]} { # relative path - regexp {^([^\#?]*)} $URL($url,path) x dpart + set curpath $URL($url,path) + if {[info exists URL($url,bpath)]} { + set curpath $URL($url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] if {[expr $l >= 0]} { set surl [string range $dpart 0 $l]$surl @@ -423,12 +451,62 @@ proc RobotRedirect {url tourl code} { } } +proc link {url out href body distance} { + global URL maxdistance + if {[expr $distance > $maxdistance]} return + + if {![RobotHref $url href host path]} return + + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + proc RobotTextHtml {url out} { global URL maxdistance set distance 0 + set fdistance 0 if {$maxdistance < 1000 && [info exists URL($url,dist)]} { - set distance [expr $URL($url,dist) + 1] + set fdistance $URL($url,dist) + set distance [expr $fdistance + 1] } htmlSwitch $URL($url,buf) \ title { @@ -449,106 +527,28 @@ proc RobotTextHtml {url out} { puts $out "" puts $out $nbody puts $out "" + } -nonest base { + if {![info exists parm(href)]} { + continue + } + set href [string trim $parm(href)] + if {![RobotHref $url href host path]} continue + set URL($url,bpath) $path } -nonest a { if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxdistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "$body" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + link $url $out [string trim $parm(href)] $body $distance } -nonest area { if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxdistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + link $url $out [string trim $parm(href)] $body $distance + } -nonest frame { + if {![info exists parm(src)]} { + continue + } + link $url $out [string trim $parm(src)] $body $fdistance } } @@ -626,11 +626,6 @@ proc RobotWriteMetadata {url out} { text/plain { RobotTextPlain $url $out } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } } puts $out "" } @@ -642,10 +637,6 @@ proc Robot200 {url} { puts -nonewline $out $URL($url,buf) RobotFileClose $out - if {![checkrule mime $URL($url,head,content-type)]} { - RobotError $url mimedeny - return - } set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] RobotWriteMetadata $url $out RobotFileClose $out @@ -680,6 +671,7 @@ proc RobotReadHeader {url sock} { if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock + return } set readCount [string length $buffer] @@ -720,12 +712,19 @@ proc RobotReadHeader {url sock} { if {![info exists URL($url,head,content-type)]} { set URL($url,head,content-type) {} } - set binary 0 - switch $URL($url,head,content-type) { - application/pdf { - set binary 1 + set binary 1 + switch -glob -- $URL($url,head,content-type) { + text/* { + set binary 0 } } + if {![regexp {/robots.txt$} $url]} { + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + RobotRestart $url $sock + return + } + } fileevent $sock readable [list RobotReadContent $url $sock $binary] } default { @@ -842,6 +841,10 @@ set workdir [pwd] set idletime 60000 set acceptLanguage {} set debuglevel 0 +set status(unvisited) 0 +set status(visited) 0 +set status(bad) 0 +set status(raw) 0 # Rules: allow, deny, url @@ -1011,9 +1014,15 @@ puts "domains=$domains" puts "max distance=$maxdistance" puts "max jobs=$robotsMax" + RobotStart while {$robotsRunning} { vwait robotsRunning } + +set statusfile [open status w] +puts $statusfile "$status(unvisited) $status(bad) $status(visited)" +close $statusfile +