#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
+# $Id: robot.tcl,v 1.33 2002/03/25 16:13:21 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open frobots.txt w]
- puts "creating robots.txt in $d"
- close $out
- incr status(unvisited)
+ if {[string compare $path /robots.txt]} {
+ set out [open frobots.txt w]
+ puts "creating robots.txt in $d"
+ close $out
+ incr status(unvisited)
+ }
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
set out [open f$d $mode]
- if {0} {
- if {[file isfile $d/f]} {
- set out [open $d/f $mode]
- } else {
- set out [open f$d $mode]
- }
- }
} else {
set out [open f $mode]
}
if {[string length $href] > 256} {
return 0
}
- if {[string first {?} $href] >= 0} {
- return 0
- }
- if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
- return 0
- }
+
+# Skip pages that have ? in them
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
proc RobotTextHtml {url out} {
global URL maxdistance
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
set distance 0
set fdistance 0
if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
}
htmlSwitch $URL($url,buf) \
title {
- puts $out "<title>$body</title>"
+ set title $body
} -nonest meta {
+ # collect metadata and save NAME= CONTENT=..
+ set metaname {}
+ set metacontent {}
puts -nonewline $out "<meta"
foreach a [array names parm] {
- puts -nonewline $out " $a"
+ set al [string tolower $a]
+ puts -nonewline $out " $al"
puts -nonewline $out {="}
puts -nonewline $out $parm($a)
puts -nonewline $out {"}
+ switch -- $al {
+ "name" {
+ set metaname [string tolower $parm($a)]
+ }
+ "content" {
+ set metacontent $parm($a)
+ }
+ }
+ }
+ puts $out "></meta>"
+	# go through robots directives (if any)
+ if {![string compare $metaname robots]} {
+ set direcs [split [string tolower $metacontent] ,]
+ if {[lsearch $direcs noindex] >= 0} {
+ set noindex 1
+ }
+ if {[lsearch $direcs nofollow] >= 0} {
+ set nofollow 1
+ }
}
- puts $out {></meta>}
} body {
- regsub -all {<!--[^-]*->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
- regsub -all {<[^\>]+>} $bbody {} nbody
- puts $out "<documentcontent>"
- puts $out $nbody
- puts $out "</documentcontent>"
+	    # don't print title or document content if noindex is used
+ if {!$noindex} {
+ puts $out "<title>$title</title>"
+ regsub -all {<!--[^-]*-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
+ puts $out "<documentcontent>"
+ puts $out $nbody
+ puts $out "</documentcontent>"
+ }
} -nonest base {
+ # <base href=.. >
if {![info exists parm(href)]} {
continue
}
set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
set URL($url,bpath) $path
- } -nonest a {
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}
link $url $out [string trim $parm(href)] $body $distance
} -nonest area {
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}