X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=793298c6ff9f955b149d72ca8e653e59ce27eb43;hb=2e203397567386e2806359f5ac678491d184b047;hp=b6466c347186d9954fbdda74c78ed7054fab76c5;hpb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;p=tclrobot.git

diff --git a/robot.tcl b/robot.tcl
index b6466c3..793298c 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
+# $Id: robot.tcl,v 1.33 2002/03/25 16:13:21 adam Exp $
 #
 proc RobotFileNext1 {area lead} {
     # puts "RobotFileNext1 area=$area lead=$lead"
@@ -171,23 +171,18 @@ proc RobotFileOpen {area host path {mode w}} {
             exec mkdir $d
             cd ./$d
             if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
-                set out [open frobots.txt w]
-                puts "creating robots.txt in $d"
-                close $out
-                incr status(unvisited)
+                if {[string compare $path /robots.txt]} {
+                    set out [open frobots.txt w]
+                    puts "creating robots.txt in $d"
+                    close $out
+                    incr status(unvisited)
+                }
             }
         }
     }
     set d [lindex $comp $len]
     if {[string length $d]} {
         set out [open f$d $mode]
-        if {0} {
-            if {[file isfile $d/f]} {
-                set out [open $d/f $mode]
-            } else {
-                set out [open f$d $mode]
-            }
-        }
     } else {
         set out [open f $mode]
     }
@@ -293,12 +288,11 @@ proc RobotHref {url hrefx hostx pathx} {
     if {[string length $href] > 256} {
         return 0
     }
-    if {[string first {?} $href] >= 0} {
-        return 0
-    }
-    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
-        return 0
-    }
+
+# Skip pages that have ? in them
+#    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+#        return 0
+#    }
     # get method (if any)
     if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
         set hpath $href
@@ -502,6 +496,13 @@ proc link {url out href body distance} {
 proc RobotTextHtml {url out} {
     global URL maxdistance
 
+    # set title so we can emit it for the body
+    set title {}
+    # if true, nothing will be indexed
+    set noindex 0
+    # if true, nothing will be followed
+    set nofollow 0
+
     set distance 0
     set fdistance 0
     if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
@@ -510,36 +511,67 @@ proc RobotTextHtml {url out} {
     }
     htmlSwitch $URL($url,buf) \
         title {
-            puts $out "$body"
+            set title $body
         } -nonest meta {
+            # collect metadata and save NAME= CONTENT=..
+            set metaname {}
+            set metacontent {}
             puts -nonewline $out ""
+            # go through robots directives (if any)
+            if {![string compare $metaname robots]} {
+                set direcs [split [string tolower $metacontent] ,]
+                if {[lsearch $direcs noindex] >= 0} {
+                    set noindex 1
+                }
+                if {[lsearch $direcs nofollow] >= 0} {
+                    set nofollow 1
+                }
             }
-            puts $out {>}
         } body {
-            regsub -all {} $body { } abody
+            regsub -all -nocase {} $abody {} bbody
+            regsub -all {<[^\>]+>} $bbody {} nbody
+            puts $out ""
+            puts $out $nbody
+            puts $out ""
+        }
         } -nonest base {
+            #
             if {![info exists parm(href)]} { continue }
             set href [string trim $parm(href)]
             if {![RobotHref $url href host path]} continue
            set URL($url,bpath) $path
-        } -nonest a {
+        } a {
+            # ..
+            # we're not using nonest - otherwise body isn't set
+            if {$nofollow} continue
             if {![info exists parm(href)]} { continue }
             link $url $out [string trim $parm(href)] $body $distance
         } -nonest area {
+            if {$nofollow} continue
             if {![info exists parm(href)]} { continue }
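
The RobotFileOpen hunk above implies the robot's on-disk cache layout: every component of the URL path becomes a directory, and the final component is stored as a file prefixed with "f" (a bare "f" when the path ends in "/"). Below is a minimal sketch of that mapping, assuming only what the hunk shows; CachePathFor and the sample host are illustrative and not part of robot.tcl, and the real proc additionally deals with the area argument, the mkdir/cd bookkeeping and the frobots.txt placeholder seen in the diff.

# Sketch of the cache layout suggested by RobotFileOpen above.
# CachePathFor is a hypothetical helper, not code from robot.tcl.
proc CachePathFor {host path} {
    # "/a/b/c.html" -> {{} a b c.html}
    set comp [split $path /]
    set len [expr {[llength $comp] - 1}]
    # everything but the last component is a directory
    set dirs [lrange $comp 1 [expr {$len - 1}]]
    # the last component is stored as a file prefixed with "f"
    set d [lindex $comp $len]
    if {[string length $d]} {
        set fname f$d
    } else {
        set fname f
    }
    return [join [concat [list $host] $dirs [list $fname]] /]
}

puts [CachePathFor www.example.com /a/b/c.html]   ;# -> www.example.com/a/b/fc.html
puts [CachePathFor www.example.com /]             ;# -> www.example.com/f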
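
The meta handling added to RobotTextHtml reduces robots directives to splitting the CONTENT attribute on commas and probing the result with lsearch for noindex and nofollow. The following is a standalone sketch of that check using only core Tcl; RobotsMetaDirectives and the example calls are illustrative, as robot.tcl performs this inline on the metaname and metacontent values it collects.

# Sketch of the robots-directives check introduced in RobotTextHtml above.
# RobotsMetaDirectives is a hypothetical helper, not code from robot.tcl.
proc RobotsMetaDirectives {metaname metacontent} {
    set noindex 0
    set nofollow 0
    # only a meta tag named "robots" carries crawl directives
    if {![string compare [string tolower $metaname] robots]} {
        # CONTENT is a comma-separated list of directives
        set direcs [split [string tolower $metacontent] ,]
        if {[lsearch $direcs noindex] >= 0} {
            set noindex 1
        }
        if {[lsearch $direcs nofollow] >= 0} {
            set nofollow 1
        }
    }
    # note: split keeps surrounding whitespace, so "noindex, nofollow" with a
    # space would only match noindex here - same behaviour as the inline code
    return [list $noindex $nofollow]
}

puts [RobotsMetaDirectives robots noindex,nofollow]   ;# -> 1 1
puts [RobotsMetaDirectives robots index,follow]       ;# -> 0 0
puts [RobotsMetaDirectives keywords noindex]          ;# -> 0 0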