projects
/
tclrobot.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
64d3a6a
)
xmlwf output
author
Adam Dickmeiss
<adam@indexdata.dk>
Wed, 11 Jun 2003 10:29:41 +0000
(10:29 +0000)
committer
Adam Dickmeiss
<adam@indexdata.dk>
Wed, 11 Jun 2003 10:29:41 +0000
(10:29 +0000)
robot.tcl
patch
|
blob
|
history
diff --git
a/robot.tcl
b/robot.tcl
index
3ab1d81
..
73d558a
100755
(executable)
--- a/
robot.tcl
+++ b/
robot.tcl
@@
-1,5
+1,5
@@
#!/usr/bin/tclsh
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.44 2003/06/11 10:11:39 adam Exp $
+# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@
-610,6
+610,15
@@
proc RobotRedirect {task url tourl code} {
}
}
}
}
+proc wellform {body} {
+ regsub -all {<!--[^-]*-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} body
+ regsub -all {<[^\>]+>} $body {} abody
+ regsub -all {&nbsp;} $abody { } body
+ regsub -all {&} $body {&amp;} abody
+ return $abody
+}
+
proc link {task url out href body distance} {
global URL control
if {[expr $distance > $control($task,distance)]} return
proc link {task url out href body distance} {
global URL control
if {[expr $distance > $control($task,distance)]} return
@@
-618,7
+627,8
@@
proc link {task url out href body distance} {
puts $out "<cr>"
puts $out "<identifier>$href</identifier>"
puts $out "<cr>"
puts $out "<identifier>$href</identifier>"
- puts $out "<description>$body</description>"
+ set abody [wellform $body]
+ puts $out "<description>$abody</description>"
puts $out "</cr>"
if {![RobotFileExist $task visited $host $path]} {
puts $out "</cr>"
if {![RobotFileExist $task visited $host $path]} {
@@
-714,11
+724,9
@@
proc RobotTextHtml {task url out} {
# don't print title of document content if noindex is used
if {!$noindex} {
puts $out "<title>$title</title>"
# don't print title of document content if noindex is used
if {!$noindex} {
puts $out "<title>$title</title>"
- regsub -all {<!--[^-]*-->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
- regsub -all {<[^\>]+>} $bbody {} nbody
+ set bbody [wellform $body]
puts $out "<documentcontent>"
puts $out "<documentcontent>"
- puts $out $nbody
+ puts $out $bbody
puts $out "</documentcontent>"
}
} -nonest base {
puts $out "</documentcontent>"
}
} -nonest base {