From 9d508bb1bb6e7479fb9e6753797fc78151cfc0e4 Mon Sep 17 00:00:00 2001
From: Adam Dickmeiss
Date: Wed, 6 Jun 2001 07:10:31 +0000
Subject: [PATCH] Added README. Ignore case in keywords in robots.txt.

---
 README    | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 robot.tcl | 65 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 129 insertions(+), 7 deletions(-)
 create mode 100644 README

diff --git a/README b/README
new file mode 100644
index 0000000..e7b1184
--- /dev/null
+++ b/README
@@ -0,0 +1,71 @@
+zmbot: a simple Web harvesting robot for Z'mbol.
+
+Introduction
+
+  zmbot is a simple web harvester written in Tcl. The following
+  summarizes its features:
+
+  o Simple administration. One script does the job and no external
+    database is required to operate.
+
+  o Interruptible. Harvesting may safely be stopped/interrupted at any
+    point.
+
+  o Gentle harvesting. By default a site is visited once per minute,
+    and robots.txt is honored.
+
+  o Concurrent harvesting (jobs) in one process and one thread.
+
+  o Inspects the Content-Type header to determine the structure of a page.
+
+  o Written in Tcl and quite portable. (Some may not consider this a
+    feature; a Perl version is welcome!)
+
+  o Creates simple XML output. One file per URL.
+
+  The robot is started from the command line and takes one or more URLs
+  as parameters. Options, prefixed with a minus, alter the behaviour of
+  the harvesting. The following options are supported:
+
+  -j jobs    The maximum number of concurrent HTTP sessions; default 5 jobs.
+
+  -i idle    Idle time in milliseconds between visits to the same site;
+             default 60000 = 60 seconds.
+
+  -c count   Maximum distance from the original URL as given on the command
+             line; default 50.
+
+  -d domain  Only sites matching domain are visited. The domain given is
+             a Tcl glob expression (e.g. *.somewhere.com). Remember to
+             quote the domain when given on the command line so that your
+             shell doesn't expand it. This option may be repeated, thus
+             allowing you to specify several "allowed" domains.
+
+  Example 1: Harvest three links away from www.somewhere.com, world-wide:
+      ./robot.tcl -c 3 http://www.somewhere.com/
+
+  Example 2: Harvest the site www.somewhere.com only:
+      ./robot.tcl -d www.somewhere.com http://www.somewhere.com/
+
+  Example 3: Harvest up to two clicks from www.a.dk and www.b.dk in the dk domain:
+      ./robot.tcl -d '*.dk' -c 2 http://www.a.dk/ http://www.b.dk/
+
+  The zmbot robot creates three directories, visited, unvisited and bad,
+  for visited pages, unvisited pages and bad pages respectively. The
+  visited area holds keywords and metadata for all successfully retrieved
+  pages. The unvisited area serves as a "todo" list of pages to be visited
+  in the future. The bad area holds pages that for some reason cannot be
+  retrieved: non-existent, permission denied, robots.txt disallow, etc.
+
+Installation:
+
+  $ ./configure
+  $ make
+
+  The configure script looks for the Tcl shell, tclsh, to determine the
+  location of Tcl and its configuration file tclConfig.sh. To manually
+  specify Tcl's location, add --with-tclconfig and specify the directory
+  where tclConfig.sh is installed.
+  For example:
+      ./configure --with-tclconfig=/usr/local/lib
+
diff --git a/robot.tcl b/robot.tcl
index 5bd9f82..5c2b518 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.15 2001/06/05 08:44:50 adam Exp $
+# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $
 #
 proc RobotFileNext1 {area lead} {
     puts "RobotFileNext1 area=$area lead=$lead"
@@ -484,6 +484,56 @@ proc RobotTextHtml {url out} {
 		}
 	    }
 	}
+    } -nonest area {
+	if {![info exists parm(href)]} {
+	    puts "no href"
+	    continue
+	}
+	if {[expr $distance <= $maxDistance]} {
+	    set href [string trim $parm(href)]
+	    if {![RobotHref $url href host path]} continue
+
+	    puts $out ""
+	    puts $out "$href"
+	    puts $out ""
+	    puts $out ""
+
+	    if {![RobotFileExist visited $host $path]} {
+		set olddistance 1000
+		if {![RobotFileExist bad $host $path]} {
+		    if {[RobotFileExist unvisited $host $path]} {
+			set inf [RobotFileOpen unvisited $host $path r]
+			RobotReadRecord $inf oldurl olddistance
+			RobotFileClose $inf
+		    }
+		} else {
+		    set olddistance 0
+		}
+		if {[string length $olddistance] == 0} {
+		    set olddistance 1000
+		}
+		if {[expr $distance < $olddistance]} {
+		    set outf [RobotFileOpen unvisited $host $path]
+		    RobotWriteRecord $outf $url $distance
+		    RobotFileClose $outf
+		}
+	    } elseif {[string compare $href $url]} {
+		set inf [RobotFileOpen visited $host $path r]
+		RobotReadRecord $inf xurl olddistance
+		close $inf
+		if {[string length $olddistance] == 0} {
+		    set olddistance 1000
+		}
+		if {[expr $distance < $olddistance]} {
+		    puts "OK remarking url=$url href=$href"
+		    puts "olddistance = $olddistance"
+		    puts "newdistance = $distance"
+		    set outf [RobotFileOpen unvisited $host $path]
+		    RobotWriteRecord $outf $url $distance
+		    RobotFileClose $outf
+		}
+	    }
+	}
     }
 }
 
@@ -497,21 +547,21 @@ proc RobotsTxt0 {v buf} {
     global URL agent
     set section 0
     foreach l [split $buf \n] {
-	if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+	if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
 	    puts "cmd=$cmd arg=$arg"
-	    switch $cmd {
-		User-Agent {
+	    switch -- [string tolower $cmd] {
+		user-agent {
 		    if {$section} break
 		    set pat [string tolower $arg]*
 		    set section [string match $pat $agent]
 		}
-		Disallow {
+		disallow {
 		    if {$section} {
 			puts "rule [list 0 $arg]"
 			lappend $v [list 0 $arg]
 		    }
 		}
-		Allow {
+		allow {
 		    if {$section} {
 			puts "rule [list 1 $arg]"
 			lappend $v [list 1 $arg]
@@ -707,7 +757,7 @@ proc RobotGetUrl {url phost} {
 	    set buf [read $inf 32768]
 	    close $inf
 	} else {
-	    set buf "User-Agent: *\nAllow: /\n"
+	    set buf "User-agent: *\nAllow: /\n"
 	}
 	RobotsTxt0 URL($hostport,robots) $buf
     }
@@ -720,6 +770,7 @@ proc RobotGetUrl {url phost} {
 	}
     }
     if {!$ok} {
+	puts "skipped due to robots.txt"
 	return -1
     }
 }
-- 
1.7.10.4
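
Note on the -d option in the README above: the domain is a Tcl glob expression,
so its behaviour can be previewed with string match. A minimal stand-alone
sketch follows; the host names are made up for illustration, and comparing the
pattern against the host name is an assumption about how the robot applies it,
not code taken from robot.tcl:

    #!/usr/bin/tclsh
    # Illustration only: how a -d pattern such as '*.dk' behaves as a Tcl
    # glob expression when checked against candidate host names.
    foreach host {www.a.dk www.b.dk www.somewhere.com} {
        if {[string match *.dk $host]} {
            puts "$host: visit"
        } else {
            puts "$host: skip"
        }
    }
    # Output:
    #   www.a.dk: visit
    #   www.b.dk: visit
    #   www.somewhere.com: skip

Left unquoted, the shell may expand *.dk itself before the robot sees it,
which is why the README recommends quoting the pattern.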
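
The robot.tcl change above switches on [string tolower $cmd], so robots.txt
keywords are matched case-insensitively. The following stand-alone sketch
mirrors that parsing logic; the upvar-based rule list and the agent value
"zmbot" are assumptions made for the sketch (RobotsTxt0 itself appends to the
global URL($hostport,robots) array):

    #!/usr/bin/tclsh
    # Sketch of the patched robots.txt keyword parsing: "User-Agent",
    # "user-agent" and "DISALLOW" are all recognized because the switch is
    # done on the lower-cased keyword.  Rules are collected as
    # {allow-flag path} pairs, mirroring [list 0 $arg] / [list 1 $arg].
    set agent "zmbot"

    proc ParseRobotsTxt {buf rulesVar} {
        global agent
        upvar $rulesVar rules
        set section 0
        foreach l [split $buf \n] {
            if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} {
                switch -- [string tolower $cmd] {
                    user-agent {
                        if {$section} break
                        set pat [string tolower $arg]*
                        set section [string match $pat $agent]
                    }
                    disallow {
                        if {$section} { lappend rules [list 0 $arg] }
                    }
                    allow {
                        if {$section} { lappend rules [list 1 $arg] }
                    }
                }
            }
        }
    }

    # Mixed-case keywords are handled the same as the canonical spelling.
    set buf "USER-AGENT: *\ndisallow: /cgi-bin/\nAllow: /\n"
    set rules {}
    ParseRobotsTxt $buf rules
    puts $rules   ;# prints: {0 /cgi-bin/} {1 /}

Before this patch the switch compared the keyword verbatim, so a robots.txt
spelled "User-agent:" (the spelling now used for the default buffer in
RobotGetUrl) would never start a matching section and its Disallow rules would
have been ignored.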