X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=bfe875f339e32169de55f564fdd4aec58d213643;hb=bd463f7d1f1610a3c7a3d9e678f5c4ff27f9d546;hp=ee70b9afc44461c421fb18f837fb5fcc502e62af;hpb=7476a63e6732f7f51eea10bf38daaea4a31be57f;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index ee70b9a..bfe875f 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $ +# $Id: robot.tcl,v 1.32 2002/03/25 16:11:08 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -69,7 +69,9 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)" + set statusfile [open status w] + puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + close $statusfile return wait } incr robotSeq @@ -89,14 +91,20 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - # puts "RobotFileExist begin area=$area host=$host path=$path" + global debuglevel + + if {$debuglevel > 3} { + puts "RobotFileExist begin area=$area host=$host path=$path" + } set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - # puts "RobotFileExist end npath=$npath" + if {$debuglevel > 3} { + puts "RobotFileExist end npath=$npath" + } return [file exists $npath] } @@ -136,11 +144,14 @@ proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global workdir global status + global debuglevel if {![info exists workdir]} { return stdout } - #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + if {$debuglevel > 3} { + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -160,20 +171,18 @@ proc RobotFileOpen {area host path {mode w}} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { - set out [open frobots.txt w] - puts "creating robots.txt in $d" - close $out - incr status(unvisited) + if {[string compare $path /robots.txt]} { + set out [open frobots.txt w] + puts "creating robots.txt in $d" + close $out + incr status(unvisited) + } } } } set d [lindex $comp $len] if {[string length $d]} { - if {[file isdirectory $d]} { - set out [open $d/f $mode] - } else { - set out [open f$d $mode] - } + set out [open f$d $mode] } else { set out [open f $mode] } @@ -279,12 +288,12 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } - if {[string first {?} $href] >= 0} { - return 0 - } - if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { - return 0 - } +# if {[string first {?} $href] >= 0} { +# return 0 +# } +# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { +# return 0 +# } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -320,7 +329,11 @@ proc RobotHref {url hrefx hostx pathx} { } if {[string first / $surl]} { # relative path - regexp {^([^\#?]*)} $URL($url,path) x dpart + set curpath $URL($url,path) + if {[info exists URL($url,bpath)]} { + set curpath $URL($url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] if {[expr $l >= 0]} { set surl [string range $dpart 0 $l]$surl @@ -433,182 +446,142 @@ proc RobotRedirect {url tourl code} { } } +proc link {url out href body distance} { + global URL maxdistance + if {[expr $distance > $maxdistance]} return + + if {![RobotHref $url href host path]} return + + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + proc RobotTextHtml {url out} { global URL maxdistance + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 + set distance 0 + set fdistance 0 if {$maxdistance < 1000 && [info exists URL($url,dist)]} { - set distance [expr $URL($url,dist) + 1] + set fdistance $URL($url,dist) + set distance [expr $fdistance + 1] } htmlSwitch $URL($url,buf) \ title { - puts $out "$body" + set title $body } -nonest meta { + # collect metadata and save NAME= CONTENT=.. + set metaname {} + set metacontent {} puts -nonewline $out "" + # go through robots directives (af any) + if {![string compare $metaname robots]} { + set direcs [split [string tolower $metacontent] ,] + if {[lsearch $direcs noindex] >= 0} { + set noindex 1 + } + if {[lsearch $direcs nofollow] >= 0} { + set nofollow 1 + } } - puts $out {>} } body { - regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} bbody + regsub -all {<[^\>]+>} $bbody {} nbody + puts $out "" + puts $out $nbody + puts $out "" + } + } -nonest base { + # if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxdistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "$body" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + set href [string trim $parm(href)] + if {![RobotHref $url href host path]} continue + set URL($url,bpath) $path + } a { + # .. + # we're not using nonest - otherwise body isn't set + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $url $out [string trim $parm(href)] $body $distance } -nonest area { + if {$nofollow} continue if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxdistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + link $url $out [string trim $parm(href)] $body $distance } -nonest frame { if {![info exists parm(src)]} { - puts "no src" continue } - if {[expr $distance <= $maxdistance]} { - set href [string trim $parm(src)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + link $url $out [string trim $parm(src)] $body $fdistance } } @@ -1082,4 +1055,7 @@ while {$robotsRunning} { vwait robotsRunning } -puts "End un,ba,vi=$status(unvisited),$status(bad),$status(visited)" +set statusfile [open status w] +puts $statusfile "$status(unvisited) $status(bad) $status(visited)" +close $statusfile +