+proc RobotHref {url hrefx hostx pathx} {
+ global URL domains debuglevel
+ upvar $hrefx href
+ upvar $hostx host
+ upvar $pathx path
+
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+ if {[string first {?} $href] >= 0} {
+ return 0
+ }
+ if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+ return 0
+ }
+ # get method (if any)
+ if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
+ set hpath $href
+ set method http
+ } else {
+ if {[string compare $method http]} {
+ return 0
+ }
+ }
+ # get host (if any)
+ if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+ if {![string length $surl]} {
+ set surl /
+ }
+ if {[info exist domains]} {
+ set ok 0
+ foreach domain $domains {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
+ }
+ if {!$ok} {
+ return 0
+ }
+ }
+ } else {
+ regexp {^([^\#]*)} $hpath x surl
+ set host $URL($url,hostport)
+ }
+ if {![string length $surl]} {
+ return 0
+ }
+ if {[string first / $surl]} {
+ # relative path
+ regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set l [string last / $dpart]
+ if {[expr $l >= 0]} {
+ set surl [string range $dpart 0 $l]$surl
+ } else {
+ set surl $dpart/$surl
+ }
+ }
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
+ .. {
+ if {$pathl > 1} {
+ incr pathl -2
+ set path [lrange $path 0 $pathl]
+ incr pathl
+ }
+ }
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
+ }
+ }
+ }
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
+ }
+ regsub -all {~} $path {%7E} path
+ set href "$method://$host$path"
+
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
+ }
+ return [checkrule url $href]
+}
+
+proc RobotError {url code} {
+ global URL
+
+ puts "Bad URL $url (code $code)"
+ set fromurl {}
+ set distance -1
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+}
+
+proc RobotRedirect {url tourl code} {
+ global URL
+
+ puts "Redirecting from $url to $tourl"
+
+ set distance {}
+ set fromurl {}
+ if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
+ set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ if {[RobotHref $url tourl host path]} {
+ if {![RobotFileExist visited $host $path]} {
+ if {![RobotFileExist unvisited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ } else {
+ set olddistance {}
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[string length $distance] == 0} {
+ set distance 1000
+ }
+ puts "distance=$distance olddistance=$olddistance"
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $tourl $distance
+ RobotFileClose $outf
+ }
+ }
+ }
+ if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
+ puts "unlink failed"
+ exit 1
+ }
+}
+
+proc RobotTextHtml {url out} {
+ global URL maxdistance
+
+ set distance 0
+ if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
+ set distance [expr $URL($url,dist) + 1]
+ }
+ htmlSwitch $URL($url,buf) \