Added README. Ignore case in keywords in robots.txt. Also follow links in <area> tags (client-side image maps) and report when a URL is skipped because of robots.txt rules.
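
For context, the keyword match in RobotsTxt0 is now case-insensitive: the keyword is lowercased before the switch, so User-Agent, user-agent and USER-AGENT are all accepted. A minimal standalone sketch of the same idea, using an illustrative helper name parseRobotsLine that does not exist in robot.tcl:

# Sketch only: lowercase the robots.txt keyword before dispatching on it,
# mirroring the switch in RobotsTxt0 below.
proc parseRobotsLine {l} {
    if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
        switch -- [string tolower $cmd] {
            user-agent { return [list agent $arg] }
            disallow   { return [list rule 0 $arg] }
            allow      { return [list rule 1 $arg] }
        }
    }
    return {}
}

puts [parseRobotsLine "User-Agent: *"]   ;# agent *
puts [parseRobotsLine "user-agent: *"]   ;# agent *
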
[tclrobot.git] / robot.tcl
index 5bd9f82..5c2b518 100755 (executable)
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh 
-# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
+# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
 #
 proc RobotFileNext1 {area lead} {
     puts "RobotFileNext1 area=$area lead=$lead"
@@ -484,6 +484,56 @@ proc RobotTextHtml {url out} {
                    }
                }
            }
+        } -nonest area {
+            if {![info exists parm(href)]} {
+               puts "no href"
+               continue
+            }
+           if {[expr $distance <= $maxDistance]} {
+               set href [string trim $parm(href)]
+               if {![RobotHref $url href host path]} continue
+               
+               puts $out "<cr>"
+               puts $out "<identifier>$href</identifier>"
+               puts $out "<description></description>"
+               puts $out "</cr>"
+
+               if {![RobotFileExist visited $host $path]} {
+                   set olddistance 1000
+                   if {![RobotFileExist bad $host $path]} {
+                       if {[RobotFileExist unvisited $host $path]} {
+                           set inf [RobotFileOpen unvisited $host $path r]
+                           RobotReadRecord $inf oldurl olddistance
+                           RobotFileClose $inf
+                       }
+                   } else {
+                       set olddistance 0
+                   }
+                   if {[string length $olddistance] == 0} {
+                       set olddistance 1000
+                   }
+                   if {[expr $distance < $olddistance]} {
+                       set outf [RobotFileOpen unvisited $host $path]
+                       RobotWriteRecord $outf $url $distance
+                       RobotFileClose $outf
+                   }
+               } elseif {[string compare $href $url]} {
+                   set inf [RobotFileOpen visited $host $path r]
+                   RobotReadRecord $inf xurl olddistance
+               RobotFileClose $inf
+                   if {[string length $olddistance] == 0} {
+                       set olddistance 1000
+                   }
+                   if {[expr $distance < $olddistance]} {
+                       puts "OK remarking url=$url href=$href"
+                       puts "olddistance = $olddistance"
+                       puts "newdistance = $distance"
+                       set outf [RobotFileOpen unvisited $host $path]
+                       RobotWriteRecord $outf $url $distance
+                       RobotFileClose $outf
+                   }
+               }
+           }
        }
 }
 
@@ -497,21 +547,21 @@ proc RobotsTxt0 {v buf} {
     global URL agent
     set section 0
     foreach l [split $buf \n] {
-       if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+       if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
            puts "cmd=$cmd arg=$arg"
-           switch $cmd {
-               User-Agent {
+           switch -- [string tolower $cmd] {
+               user-agent {
                    if {$section} break
                    set pat [string tolower $arg]*
                    set section [string match $pat $agent]
                }
-               Disallow {
+               disallow {
                    if {$section} {
                        puts "rule [list 0 $arg]"
                        lappend $v [list 0 $arg]
                    }
                }
-               Allow {
+               allow {
                    if {$section} {
                        puts "rule [list 1 $arg]"
                        lappend $v [list 1 $arg]
@@ -707,7 +757,7 @@ proc RobotGetUrl {url phost} {
                set buf [read $inf 32768]
                close $inf
            } else {
-               set buf "User-Agent: *\nAllow: /\n"
+               set buf "User-agent: *\nAllow: /\n"
            }
            RobotsTxt0 URL($hostport,robots) $buf
        }
@@ -720,6 +770,7 @@ proc RobotGetUrl {url phost} {
            }
        }
        if {!$ok} {
+           puts "skipped due to robots.txt"
            return -1
        }
     }
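
A note on the new <area> handling in RobotTextHtml: a link record is only rewritten when the newly computed distance is smaller than the one already recorded, and a missing record counts as distance 1000. A minimal standalone sketch of that rule, assuming an illustrative proc remarkDistance and a dict in place of the per-host record files that robot.tcl manages with RobotFileOpen/RobotWriteRecord:

proc remarkDistance {distancesVar url distance} {
    upvar $distancesVar distances
    # A URL with no record yet counts as distance 1000.
    set old 1000
    if {[dict exists $distances $url]} {
        set old [dict get $distances $url]
    }
    # Re-mark only when the new path to the URL is shorter.
    if {$distance < $old} {
        dict set distances $url $distance
    }
}

set distances [dict create]
remarkDistance distances http://example.com/a 3
remarkDistance distances http://example.com/a 1   ;# shorter: record updated
remarkDistance distances http://example.com/a 5   ;# longer: ignored
puts [dict get $distances http://example.com/a]   ;# prints 1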