projects
/
tclrobot.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
9d3f82c
)
Robot honour robots meta tag
author
Adam Dickmeiss
<adam@indexdata.dk>
Sun, 17 Feb 2002 09:29:18 +0000
(09:29 +0000)
committer
Adam Dickmeiss
<adam@indexdata.dk>
Sun, 17 Feb 2002 09:29:18 +0000
(09:29 +0000)
robot.tcl
patch
|
blob
|
history
diff --git
a/robot.tcl
b/robot.tcl
index
b6466c3
..
ffbfce4
100755
(executable)
--- a/
robot.tcl
+++ b/
robot.tcl
@@
-1,5
+1,5
@@
#!/usr/bin/tclsh
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
+# $Id: robot.tcl,v 1.30 2002/02/17 09:29:18 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@
-502,6
+502,13
@@
proc link {url out href body distance} {
proc RobotTextHtml {url out} {
global URL maxdistance
proc RobotTextHtml {url out} {
global URL maxdistance
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
set distance 0
set fdistance 0
if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
set distance 0
set fdistance 0
if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
@@
-510,36
+517,67
@@
proc RobotTextHtml {url out} {
}
htmlSwitch $URL($url,buf) \
title {
}
htmlSwitch $URL($url,buf) \
title {
- puts $out "<title>$body</title>"
+ set title $body
} -nonest meta {
} -nonest meta {
+ # collect metadata and save NAME= CONTENT=..
+ set metaname {}
+ set metacontent {}
puts -nonewline $out "<meta"
foreach a [array names parm] {
puts -nonewline $out "<meta"
foreach a [array names parm] {
- puts -nonewline $out " $a"
+ set al [string tolower $a]
+ puts -nonewline $out " $al"
puts -nonewline $out {="}
puts -nonewline $out $parm($a)
puts -nonewline $out {"}
puts -nonewline $out {="}
puts -nonewline $out $parm($a)
puts -nonewline $out {"}
+ switch -- $al {
+ "name" {
+ set metaname [string tolower $parm($a)]
+ }
+ "content" {
+ set metacontent $parm($a)
+ }
+ }
+ }
+ puts $out "></meta>"
+ # go through robots directives (if any)
+ if {![string compare $metaname robots]} {
+ set direcs [split [string tolower $metacontent] ,]
+ if {[lsearch $direcs noindex] >= 0} {
+ set noindex 1
+ }
+ if {[lsearch $direcs nofollow] >= 0} {
+ set nofollow 1
+ }
}
}
- puts $out {></meta>}
} body {
} body {
- regsub -all {<!--[^-]*->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
- regsub -all {<[^\>]+>} $bbody {} nbody
- puts $out "<documentcontent>"
- puts $out $nbody
- puts $out "</documentcontent>"
+ # don't print title or document content if noindex is used
+ if {!$noindex} {
+ puts $out "<title>$title</title>"
+ regsub -all {<!--[^-]*->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
+ puts $out "<documentcontent>"
+ puts $out $nbody
+ puts $out "</documentcontent>"
+ }
} -nonest base {
} -nonest base {
+ # <base href=.. >
if {![info exists parm(href)]} {
continue
}
set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
set URL($url,bpath) $path
if {![info exists parm(href)]} {
continue
}
set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
set URL($url,bpath) $path
- } -nonest a {
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}
link $url $out [string trim $parm(href)] $body $distance
} -nonest area {
if {![info exists parm(href)]} {
continue
}
link $url $out [string trim $parm(href)] $body $distance
} -nonest area {
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}
if {![info exists parm(href)]} {
continue
}