+ } -nonest area {
+ if {![info exists parm(href)]} {
+ puts "no href"
+ continue
+ }
+ if {[expr $distance <= $maxdistance]} {
+ set href [string trim $parm(href)]
+ if {![RobotHref $url href host path]} continue
+
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ puts $out "<description></description>"
+ puts $out "</cr>"
+
+ if {![RobotFileExist visited $host $path]} {
+ set olddistance 1000
+ if {![RobotFileExist bad $host $path]} {
+ if {[RobotFileExist unvisited $host $path]} {
+ set inf [RobotFileOpen unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+ close $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ puts "OK remarking url=$url href=$href"
+ puts "olddistance = $olddistance"
+ puts "newdistance = $distance"
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ }
+ }
+ }
+}
+
+ # RobotsTxt --
+ #
+ #   Hand the buffered body of a fetched robots.txt document to RobotsTxt0
+ #   so that its rules are recorded for the host the document came from.
+ #
+ # Arguments:
+ #   url   key into the global URL array; URL($url,hostport) and
+ #         URL($url,buf) are read.
+ #
+ # Side effects:
+ #   RobotsTxt0 appends rules to the global element URL(<hostport>,robots),
+ #   where <hostport> is the value of URL($url,hostport).
+ proc RobotsTxt {url} {
+     global URL
+
+     # The inner array reference needs its "$": without it the element name
+     # would contain the literal text "URL(<url>,hostport)" instead of the
+     # host:port value.
+     # NOTE(review): assumes readers of the rules look them up under
+     # URL(<hostport>,robots) - confirm against the code that consumes them.
+     RobotsTxt0 URL($URL($url,hostport),robots) $URL($url,buf)
+ }
+
+ # RobotsTxt0 --
+ #
+ #   Scan robots.txt text and collect the Allow/Disallow rules of the first
+ #   record whose User-agent token matches the global agent string.
+ #
+ # Arguments:
+ #   v     name of a global variable (RobotsTxt passes an element of the
+ #         URL array) that receives the rules via lappend
+ #   buf   robots.txt content; lines may end in LF or CRLF
+ #
+ # Results:
+ #   None.  Each rule is appended to the variable named by v as a
+ #   two-element list: 0 (Disallow) or 1 (Allow), followed by the path.
+ #   Progress lines are written to stdout.
+ proc RobotsTxt0 {v buf} {
+     global URL agent
+
+     set section 0
+     # True while the previous line was a User-agent line, so that directly
+     # adjacent User-agent lines are treated as one record header.
+     set prevAgent 0
+     # User-agent tokens are compared case-insensitively.
+     set me [string tolower $agent]
+
+     foreach l [split $buf \n] {
+         # Remove comments first so a commented-out rule is never parsed.
+         regsub {#.*$} $l {} l
+
+         set wasAgent $prevAgent
+         set prevAgent 0
+
+         # field: value, anchored at line start; the value stops at
+         # whitespace or at the CR of a CRLF line ending.
+         if {![regexp {^[ \t]*([-A-Za-z]+)[ \t]*:[ \t]*([^ \t\r]+)} $l match cmd arg]} {
+             continue
+         }
+         puts "cmd=$cmd arg=$arg"
+         switch -- [string tolower $cmd] {
+             user-agent {
+                 set prevAgent 1
+                 if {$section} {
+                     # A new record starts after the matching one: stop.
+                     # An adjacent User-agent line still belongs to the
+                     # header of the matching record, so keep going.
+                     if {!$wasAgent} break
+                 } else {
+                     set pat [string tolower $arg]*
+                     set section [string match $pat $me]
+                 }
+             }
+             disallow {
+                 if {$section} {
+                     puts "rule [list 0 $arg]"
+                     lappend $v [list 0 $arg]
+                 }
+             }
+             allow {
+                 if {$section} {
+                     puts "rule [list 1 $arg]"
+                     lappend $v [list 1 $arg]
+                 }
+             }
+         }
+     }
+ }
+
+ # RobotTextPlain --
+ #
+ #   Write the buffered body of a text/plain document to the output
+ #   channel, wrapped in a <documentcontent> element, and feed the body to
+ #   the robots.txt parser when the document path is /robots.txt.
+ #
+ # Arguments:
+ #   url   key into the global URL array; URL($url,buf) and URL($url,path)
+ #         are read
+ #   out   channel the markup is written to
+ proc RobotTextPlain {url out} {
+     global URL
+
+     # Escape markup characters so the body cannot be mistaken for tags.
+     # "&" must be handled first, otherwise the "&" introduced by the "<"
+     # escape would itself be escaped.  In the substitution spec "\&" is a
+     # literal ampersand.
+     # NOTE(review): assumes the output is consumed as XML - confirm.
+     regsub -all {&} $URL($url,buf) {\&amp;} content
+     regsub -all {<} $content {\&lt;} content
+
+     puts $out "<documentcontent>"
+     puts $out $content
+     puts $out "</documentcontent>"
+
+     if {![string compare $URL($url,path) /robots.txt]} {
+         RobotsTxt $url
+     }
+ }
+
+proc RobotWriteMetadata {url out} {
+ global URL domains
+
+ puts $out "<zmbot>"
+
+ set distance 1000
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ set URL($url,dist) $distance
+ puts $out "<distance>"
+ puts $out " $distance"
+ puts $out "</distance>"
+ headSave $url $out
+ puts "Parsing $url distance=$distance"
+ switch $URL($url,head,content-type) {
+ text/html {
+ if {[string length $distance]} {
+ RobotTextHtml $url $out
+ }
+ }
+ text/plain {
+ RobotTextPlain $url $out
+ }
+ application/pdf {
+ set pdff [open test.pdf w]
+ puts -nonewline $pdff $URL($url,buf)
+ close $pdff