Remove code that skips ?'s in URL

[tclrobot.git] / robot.tcl
diff --git a/robot.tcl b/robot.tcl

index b6466c3..793298c 100755 (executable)
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
  #!/usr/bin/tclsh 
-# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
+# $Id: robot.tcl,v 1.33 2002/03/25 16:13:21 adam Exp $
  #
  proc RobotFileNext1 {area lead} {
      # puts "RobotFileNext1 area=$area lead=$lead"
@@ -171,23 +171,18 @@ proc RobotFileOpen {area host path {mode w}} {
              exec mkdir $d
              cd ./$d
             if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
-               set out [open frobots.txt w]
-               puts "creating robots.txt in $d"
-               close $out
-                incr status(unvisited)
+               if {[string compare $path /robots.txt]} {
+                   set out [open frobots.txt w]
+                   puts "creating robots.txt in $d"
+                   close $out
+                    incr status(unvisited)
+               }
             }
          }
      }
      set d [lindex $comp $len]
      if {[string length $d]} {
          set out [open f$d $mode]
-        if {0} {
-            if {[file isfile $d/f]} {
-                set out [open $d/f $mode]
-            } else {
-                set out [open f$d $mode]
-            }
-        }
      } else {
          set out [open f $mode]
      }
@@ -293,12 +288,11 @@ proc RobotHref {url hrefx hostx pathx} {
      if {[string length $href] > 256} {
         return 0
      }
-    if {[string first {?} $href] >= 0} {
-       return 0
-    }
-    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
-       return 0
-    }
+
+#   Skip pages that have ? in them
+#    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+#      return 0
+#    }
      # get method (if any)
      if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
         set hpath $href
@@ -502,6 +496,13 @@ proc link {url out href body distance} {
  proc RobotTextHtml {url out} {
      global URL maxdistance
  
+    # set title so we can emit it for the body
+    set title {}
+    # if true, nothing will be indexed
+    set noindex 0
+    # if true, nothing will be followed
+    set nofollow 0
+
      set distance 0
      set fdistance 0
      if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
@@ -510,36 +511,67 @@ proc RobotTextHtml {url out} {
      }
      htmlSwitch $URL($url,buf) \
          title {
-           puts $out "<title>$body</title>"
+            set title $body
          } -nonest meta {
+            # collect metadata and save NAME= CONTENT=..
+            set metaname {}
+            set metacontent {}
              puts -nonewline $out "<meta"
              foreach a [array names parm] {
-               puts -nonewline $out " $a"
+                set al [string tolower $a]
+               puts -nonewline $out " $al"
                  puts -nonewline $out {="}
                  puts -nonewline $out $parm($a)
                  puts -nonewline $out {"}
+                switch -- $al {
+                    "name" {
+                        set metaname [string tolower $parm($a)]
+                    }
+                    "content" {
+                        set metacontent $parm($a)
+                    }
+                }
+            }
+           puts $out "></meta>"
+            # go through robots directives (if any)
+            if {![string compare $metaname robots]} {
+                set direcs [split [string tolower $metacontent] ,]
+                if {[lsearch $direcs noindex] >= 0} {
+                    set noindex 1
+                }
+                if {[lsearch $direcs nofollow] >= 0} {
+                    set nofollow 1
+                }
              }
-           puts $out {></meta>}
         } body {
-           regsub -all {<!--[^-]*->} $body { } abody
-           regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
-           regsub -all {<[^\>]+>} $bbody {} nbody
-           puts $out "<documentcontent>"
-            puts $out $nbody
-            puts $out "</documentcontent>"
+            # don't print title or document content if noindex is used
+            if {!$noindex} {
+                puts $out "<title>$title</title>"
+                regsub -all {<!--[^-]*-->} $body { } abody
+                regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+                regsub -all {<[^\>]+>} $bbody {} nbody
+                puts $out "<documentcontent>"
+                puts $out $nbody
+                puts $out "</documentcontent>"
+            }
          } -nonest base {
+            # <base href=.. >
              if {![info exists parm(href)]} {
                 continue
              }
              set href [string trim $parm(href)]
              if {![RobotHref $url href host path]} continue
              set URL($url,bpath) $path
-        } -nonest a {
+        } a {
+            # <a href="...."> .. </a> 
+            # we're not using nonest - otherwise body isn't set
+            if {$nofollow} continue
              if {![info exists parm(href)]} {
                 continue
              }
              link $url $out [string trim $parm(href)] $body $distance
          } -nonest area {
+            if {$nofollow} continue
              if {![info exists parm(href)]} {
                 continue
              }