#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.3 1998/10/15 13:27:19 adam Exp $
+# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
#
-proc RobotFileNext {area} {
+proc RobotFileNext1 {area} {
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
set off [string first / $area]
incr off
+
foreach n $ns {
if {[file isfile $n]} {
if {[string first :.html $n] > 0} {
}
return http://[string range $n $off end]
}
- if {[file isdirectory $n]} {
- set sb [RobotFileNext $n]
+ }
+ foreach n $ns {
+ if {[file isdirectory $n]} {
+ set sb [RobotFileNext1 $n]
if {[string length $sb]} {
return $sb
}
return {}
}
+# RobotFileWait --
+#   Write 0 to the global variable robotSeq.  The write is what releases a
+#   pending "vwait robotSeq".
+#
+# Results:
+#   Returns 0 (the value stored).
+proc RobotFileWait {} {
+    set ::robotSeq 0
+}
+
+# RobotFileNext --
+#   Pick the next URL to fetch from the directory tree below $area.
+#   The global robotSeq is used as a cursor into the glob listing of
+#   ${area}/*, so successive calls walk the entries one at a time.
+#
+# Arguments:
+#   area   name of the top-level directory to scan (e.g. "unvisited")
+#
+# Results:
+#   An http:// URL, or {} when the glob fails, when no entry is available,
+#   or when the selected entry yields nothing.
+proc RobotFileNext {area} {
+ global robotSeq
+ if {[catch {set ns [glob ${area}/*]}]} {
+ return {}
+ }
+ # offset of the first character after the "$area/" prefix
+ set off [string length $area]
+ incr off
+
+ set n [lindex $ns $robotSeq]
+ if {![string length $n]} {
+ # Cursor ran past the end of the listing: pause until RobotFileWait,
+ # scheduled 2000 ms from now, writes robotSeq (it sets it to 0).
+ puts "------------ N E X T R O U N D --------"
+ set robotSeq -1
+ after 2000 RobotFileWait
+ vwait robotSeq
+
+ # NOTE(review): the listing in ns is the one taken before the wait;
+ # it is not re-globbed here.
+ set n [lindex $ns $robotSeq]
+ if {![string length $n]} {
+ return {}
+ }
+ }
+ incr robotSeq
+ # An entry containing a robots.txt file is reported as that file's URL;
+ # otherwise a directory entry is searched with RobotFileNext1.
+ if {[file isfile $n/robots.txt]} {
+ puts "ok returning http://[string range $n $off end]/robots.txt"
+ return http://[string range $n $off end]/robots.txt
+ } elseif {[file isdirectory $n]} {
+ set sb [RobotFileNext1 $n]
+ if {[string length $sb]} {
+ return $sb
+ }
+ }
+ return {}
+}
+
+
+# RobotFileExist --
+#   Test whether an entry for $host$path exists below directory $area.
+#   A path ending in "/" is looked up as "<path>:.html".  Any other path
+#   counts as present if it exists as given; failing that, "<path>/:.html"
+#   is tested.
+#
+# Results:
+#   1 if the entry exists, 0 otherwise.
proc RobotFileExist {area host path} {
set comp [split $area/$host$path /]
set l [llength $comp]
incr l -1
if {![string length [lindex $comp $l]]} {
set comp [split $area/$host$path:.html /]
+ } elseif {[file exists [join $comp /]]} {
+ return 1
+ } else {
+ set comp [split $area/$host$path/:.html /]
}
return [file exists [join $comp /]]
}
}
}
}
-proc RobotFileOpen {area host path} {
+# RobotFileClose --
+#   Close channel $out unless it is stdout, which is left open.
+#
+# Arguments:
+#   out   channel identifier
+proc RobotFileClose {out} {
+    if {![string compare $out stdout]} {
+        return
+    }
+    close $out
+}
+
+proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
+ global workdir
+ if {![info exists workdir]} {
+ return stdout
+ }
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+ if {[string compare $orgPwd $workdir]} {
+ puts "workdir = $workdir"
+ puts "pwd = $orgPwd"
+ exit 1
+ }
set comp [split $area/$host$path /]
set len [llength $comp]
incr len -1
if {[catch {cd ./$d}]} {
exec mkdir $d
cd ./$d
+ if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
+ set out [open robots.txt w]
+ puts "creating robots.txt in $d"
+ close $out
+ }
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- set out [open $d w]
+ if {[file isdirectory $d]} {
+ set out [open $d/:.html $mode]
+ } else {
+ set out [open $d $mode]
+ }
} else {
- set out [open :.html w]
+ set out [open :.html $mode]
}
cd $orgPwd
+ #puts "RobotFileStop"
return $out
}
+# RobotRestart --
+#   Start fetching the next unvisited URL.  RobotGetUrl returns 0 once a
+#   request has been started, in which case this proc returns and the
+#   socket event handlers carry on.  On a non-zero result the entry is
+#   removed from "unvisited" and the next candidate is tried.  When
+#   RobotFileNext has nothing left, robotMoreWork is set to 0.
proc RobotRestart {} {
global URL
-
+ global robotMoreWork
+
while {1} {
set url [RobotFileNext unvisited]
- if {![string length $url]} break
+ if {![string length $url]} {
+ break
+ }
set r [RobotGetUrl $url {}]
if {!$r} {
+ puts "RobotGetUrl returned 0 on url=$url"
return
} else {
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
}
- exit 0
+ set robotMoreWork 0
}
-proc headSave {url out title} {
+# headSave --
+#   Write the opening <zmbot> tag and the header part of a record for $url
+#   to channel $out.  Each HTTP header field is emitted only if it is
+#   present in URL($url,head,<name>); names are looked up in lower case.
+#   The <zmbot> element is left open for the caller to close.
+#
+# Arguments:
+#   url   URL whose headers are stored in the global URL array
+#   out   channel to write to
+proc headSave {url out} {
global URL
-
- puts $out {<nwi>}
- puts $out "<ti> $title"
- if {[info exists URL($url,head,Last-modified)]} {
- puts $out "<dm> $URL($url,head,Last-modified)"
+
+ puts $out {<zmbot>}
+ if {[info exists URL($url,head,last-modified)]} {
+ puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
}
puts $out {<si>}
- if {[info exists URL($url,head,Date)]} {
- puts $out " <lc> $URL($url,head,Date)"
+ if {[info exists URL($url,head,date)]} {
+ puts $out " <date>$URL($url,head,date)</date>"
}
- if {[info exists URL($url,head,Content-length)]} {
- puts $out " <by> $URL($url,head,Content-length)"
+ if {[info exists URL($url,head,content-length)]} {
+ puts $out " <by>$URL($url,head,content-length)</by>"
}
- if {[info exists URL($url,head,Server)]} {
- puts $out " <srvr> $URL($url,head,Server)"
+ if {[info exists URL($url,head,server)]} {
+ # NOTE(review): the Server header is written inside a <format> element.
+ puts $out " <format>$URL($url,head,server)</format>"
}
puts $out {</si>}
- puts $out {<av>}
- puts $out " <avli> $url"
- if {[info exists URL($url,head,Content-type)]} {
- puts $out " <ty> $URL($url,head,Content-type)"
+ puts $out {<publisher>}
+ puts $out " <identifier>$url</identifier>"
+ if {[info exists URL($url,head,content-type)]} {
+ puts $out " <type>$URL($url,head,content-type)</type>"
}
- puts $out {</av>}
+ puts $out {</publisher>}
}
-proc RobotSave {url} {
+# RobotHref --
+#   Resolve a hyperlink found in the document at $url into an absolute
+#   http URL with "." and ".." path segments removed.
+#
+# Arguments:
+#   url    URL of the referring document; URL($url,host) and
+#          URL($url,path) are read from the global URL array.
+#   hrefx  name of the caller's variable holding the raw href; on success
+#          it is overwritten with the absolute URL.
+#   hostx  name of the caller's variable that receives the target host.
+#   pathx  name of the caller's variable that receives the target path.
+#
+# Results:
+#   1 if the link was accepted, 0 if it uses a scheme other than http,
+#   names a host matching none of the glob patterns in the global list
+#   "domains", or has an empty path.
+proc RobotHref {url hrefx hostx pathx} {
+    global URL domains
+    upvar $hrefx href
+    upvar $hostx host
+    upvar $pathx path
+
+    puts "Ref url = $url href=$href"
+    # get method (if any); a link without one is taken to be http
+    if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
+        set hpath $href
+        set method http
+    } elseif {[string compare $method http]} {
+        return 0
+    }
+    # get host (if any); everything from "#" on is dropped
+    if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+        if {![string length $surl]} {
+            set surl /
+        }
+        set ok 0
+        foreach domain $domains {
+            if {[string match $domain $host]} {
+                set ok 1
+                break
+            }
+        }
+        if {!$ok} {
+            return 0
+        }
+    } else {
+        regexp {^([^\#]*)} $hpath x surl
+        set host $URL($url,host)
+    }
+    if {![string length $surl]} {
+        return 0
+    }
+    if {[string first / $surl]} {
+        # relative path: resolve against the directory part of the
+        # referring document's path (query and fragment stripped)
+        regexp {^([^\#?]*)} $URL($url,path) x dpart
+        set l [string last / $dpart]
+        if {$l >= 0} {
+            set surl [string range $dpart 0 $l]$surl
+        } else {
+            set surl $dpart/$surl
+        }
+    }
+    # Rebuild the path from right to left.  Each ".." seen adds one to
+    # "up"; each ordinary segment met while up > 0 is dropped and pays one
+    # off.  Counting (instead of jumping two slots) makes runs such as
+    # "../.." and "./.." come out right.  Component 0 is the empty string
+    # in front of the leading "/" and is always kept, so ".." can never
+    # climb above the root or strip the leading slash.
+    set c [split $surl /]
+    set i [llength $c]
+    incr i -1
+    set path [lindex $c $i]
+    incr i -1
+    set up 0
+    while {$i >= 0} {
+        set seg [lindex $c $i]
+        switch -- $seg {
+            .. {
+                incr up
+            }
+            . {
+            }
+            default {
+                if {$up > 0 && $i > 0} {
+                    incr up -1
+                } else {
+                    set path $seg/$path
+                }
+            }
+        }
+        incr i -1
+    }
+    set href "$method://$host$path"
+    puts "Ref href = $href"
+    return 1
+}
+
+# Robot401 --
+#   Handle a 401 response for $url: read the first line of its
+#   "unvisited" entry (the referring URL, if the entry can be opened),
+#   remove the entry, and record the URL under "bad" unless it is already
+#   recorded there.
+#
+# Arguments:
+#   url   URL that was requested; URL($url,host) and URL($url,path) are
+#         read from the global URL array.
+proc Robot401 {url} {
global URL
- global domains
-
- set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
- set ti 0
- if {[info exists URL($url,line)]} {
- set htmlContent [join $URL($url,line) \n]
-
- htmlSwitch $htmlContent \
+
+    puts "Bad URL $url"
+    set fromurl {}
+    # Best effort: a missing or unreadable entry leaves fromurl empty.
+    catch {
+        set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+        set fromurl [gets $inf]
+        # RobotFileClose rather than close, as in Robot404 and Robot301:
+        # RobotFileOpen can return stdout, which must stay open.
+        RobotFileClose $inf
+    }
+    RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+    if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
+        set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
+        puts $outf "URL=$url 401"
+        puts $outf "Reference $fromurl"
+        RobotFileClose $outf
+    }
+}
+
+# Robot404 --
+#   Handle a failed fetch of $url: read the first line of its "unvisited"
+#   entry (the referring URL, if the entry can be opened), remove the
+#   entry, and record the URL under "bad" unless it is already there.
+#
+# Arguments:
+#   url   URL that was requested; URL($url,host) and URL($url,path) are
+#         read from the global URL array.
+proc Robot404 {url} {
+    global URL
+
+    puts "Bad URL $url"
+    set badHost $URL($url,host)
+    set badPath $URL($url,path)
+
+    # Best effort: a missing or unreadable entry leaves the referrer empty.
+    set referrer {}
+    catch {
+        set in [RobotFileOpen unvisited $badHost $badPath r]
+        set referrer [gets $in]
+        RobotFileClose $in
+    }
+    RobotFileUnlink unvisited $badHost $badPath
+
+    if {[RobotFileExist bad $badHost $badPath]} {
+        return
+    }
+    set report [RobotFileOpen bad $badHost $badPath]
+    puts $report "URL=$url 404"
+    puts $report "Reference $referrer"
+    RobotFileClose $report
+}
+
+# Robot301 --
+#   Handle a redirect from $url to $tourl.  The source URL is removed from
+#   "unvisited" and recorded under "bad" (unless already there).  If
+#   RobotHref accepts the target and it has no "unvisited" entry yet, one
+#   is created containing the line "301".
+#
+# Arguments:
+#   url     URL that was requested
+#   tourl   redirect target as given by the server
+proc Robot301 {url tourl} {
+    global URL
+
+    puts "Redirecting from $url to $tourl"
+
+    set fromHost $URL($url,host)
+    set fromPath $URL($url,path)
+
+    # Best effort: a missing or unreadable entry leaves the referrer empty.
+    set referrer {}
+    catch {
+        set in [RobotFileOpen unvisited $fromHost $fromPath r]
+        set referrer [gets $in]
+        RobotFileClose $in
+    }
+    RobotFileUnlink unvisited $fromHost $fromPath
+    if {![RobotFileExist bad $fromHost $fromPath]} {
+        set report [RobotFileOpen bad $fromHost $fromPath]
+        puts $report "URL=$url to $tourl 301"
+        puts $report "Reference $referrer"
+        RobotFileClose $report
+    }
+
+    # RobotHref rewrites tourl in place and sets host and path.
+    if {![RobotHref $url tourl host path]} {
+        return
+    }
+    if {[RobotFileExist unvisited $host $path]} {
+        return
+    }
+    puts "Mark as unvisited"
+    set queued [RobotFileOpen unvisited $host $path]
+    puts $queued 301
+    RobotFileClose $queued
+}
+
+# RobotTextHtml --
+#   Write a zmbot record for the HTML document held in URL($url,buf) to
+#   channel $out.  The buffer is handed to htmlSwitch (defined outside
+#   this section) with one script per element: title, meta, body and a.
+#   headSave is called once, before the first element is written, and
+#   again at the end if no element triggered it.  For every anchor that
+#   RobotHref accepts, a <cr> entry is written, and the target gets an
+#   "unvisited" entry containing $url unless it is already present under
+#   "visited" or "bad".
+#   NOTE(review): the variables body and parm used inside the scripts are
+#   presumably set by htmlSwitch - confirm against its definition.
+#
+# Arguments:
+#   url   URL of the document being parsed
+#   out   channel the record is written to
+proc RobotTextHtml {url out} {
+ global URL
+
+ set head 0
+ htmlSwitch $URL($url,buf) \
title {
- if {!$ti} {
- headSave $url $out $body
- set ti 1
+ if {!$head} {
+ headSave $url $out
+ set head 1
+ }
+ puts $out "<title>$body</title>"
+ } -nonest meta {
+ if {!$head} {
+ headSave $url $out
+ set head 1
}
+ puts -nonewline $out "<meta"
+ foreach a [array names parm] {
+ puts -nonewline $out " $a"
+ puts -nonewline $out {="}
+ puts -nonewline $out $parm($a)
+ puts -nonewline $out {"}
+ }
+ puts $out {></meta>}
} body {
regsub -all -nocase {<script.*</script>} $body {} abody
regsub -all {<[^\>]+>} $abody {} nbody
- puts $out "<body>"
+ puts $out "<documentcontent>"
puts $out $nbody
- puts $out "</body>"
+ puts $out "</documentcontent>"
} a {
if {![info exists parm(href)]} {
puts "no href"
continue
}
- if {!$ti} {
- headSave $url $out "untitled"
- set ti 1
- }
-
- if {[regexp {^\#} $parm(href)]} {
- continue
- } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
- set ok 0
- if {![string compare $method http]} {
- if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
- set host $URL($url,host)
- set path $hpath
- }
- foreach domain $domains {
- if {[string match $domain $host]} {
- set ok 1
- break
- }
- }
- }
- if {!$ok} continue
- } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
- set host $URL($url,host)
- set method http
- } else {
- set ext [file extension $URL($url,path)]
- if {[string compare $ext {}]} {
- set dpart [file dirname $URL($url,path)]
- } else {
- set dpart $URL($url,path)
- }
- regexp {^([^#]+)} $parm(href) x path
- set host $URL($url,host)
- set path [string trimright $dpart /]/$path
- set method http
+ if {!$head} {
+ headSave $url $out
+ set head 1
}
- set ext [file extension $path]
- if {![string length $ext]} {
- set path [string trimright $path /]/
- } else {
- set path [string trimright $path /]
- }
- set c [split $path /]
- set i [llength $c]
- incr i -1
- set path [lindex $c $i]
- incr i -1
- while {$i >= 0} {
- switch -- [lindex $c $i] {
- .. {
- incr i -2
- }
- . {
- incr i -1
- }
- default {
- set path [lindex $c $i]/$path
- incr i -1
+ if {1} {
+ set href $parm(href)
+ if {![RobotHref $url href host path]} continue
+
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ puts $out "<description>$body</description>"
+ puts $out "</cr>"
+
+ if {![RobotFileExist visited $host $path]} {
+ if {![RobotFileExist bad $host $path]} {
+ if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
+ puts "--- Error $msg"
+ exit 1
+ }
+ puts $outf $url
+ RobotFileClose $outf
}
}
}
- set href "$method://$host$path"
-
- puts $out "<cr>"
- puts $out "<li> $href"
- puts $out "<cp> $body"
- puts $out "</cr>"
-
- if {![regexp {/.*bin/} $href)]} {
- if {![RobotFileExist visited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
- close $outf
- }
- }
- }
+ }
+ # A document in which no element fired still gets its header.
+ if {!$head} {
+ headSave $url $out
+ set head 1
}
- if {!$ti} {
- headSave $url $out "untitled"
- set ti 1
+ puts $out "</zmbot>"
+}
+
+# RobotTextPlain --
+#   Write a zmbot record for a plain-text document to channel $out: the
+#   header from headSave, then the raw buffer URL($url,buf) wrapped in
+#   <documentcontent>, then the closing tag of the record.
+#
+# Arguments:
+#   url   URL of the document
+#   out   channel the record is written to
+proc RobotTextPlain {url out} {
+    global URL
+
+    headSave $url $out
+    puts $out "<documentcontent>"
+    puts $out $URL($url,buf)
+    puts $out "</documentcontent>"
+    # headSave opened the record with <zmbot>, so that is the element to
+    # close here (the HTML and default paths close it the same way).
+    puts $out "</zmbot>"
+}
+
+# Robot200 --
+#   Store the fetched document for $url under "visited" and remove its
+#   "unvisited" entry.  The record body is produced by RobotTextHtml or
+#   RobotTextPlain according to the content type; any other type gets a
+#   header-only record.
+#
+# Arguments:
+#   url   URL that was fetched; URL($url,head,content-type) must be set.
+proc Robot200 {url} {
+ global URL domains
+
+ puts "Parsing $url"
+ set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+ # NOTE(review): the match is exact, so a content type that carries
+ # parameters (e.g. "text/html; charset=...") takes the default arm.
+ switch $URL($url,head,content-type) {
+ text/html {
+ RobotTextHtml $url $out
+ }
+ text/plain {
+ RobotTextPlain $url $out
+ }
+ default {
+ headSave $url $out
+ puts $out "</zmbot>"
+ }
}
- puts $out "</nwi>"
- close $out
+ RobotFileClose $out
+ # puts "Parsing done"
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
-proc RobotRead {url sock} {
+# RobotReadContent --
+#   Readable-event handler for the body of a response.  Each call appends
+#   up to 16384 characters from $sock to URL($url,buf).  At end of file
+#   the socket is closed, the document is stored with Robot200 and the
+#   next URL is started with RobotRestart.
+#
+# Arguments:
+#   url    URL being fetched
+#   sock   socket in non-blocking mode
+proc RobotReadContent {url sock} {
global URL
- set readCount [gets $sock line]
- if {$readCount < 0} {
- if [eof $sock] {
- close $sock
- RobotSave $url
- RobotRestart
- }
- } elseif {$readCount > 0} {
- switch $URL($url,state) {
- head {
- puts "head: $line"
- if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
- set URL($url,head,$name) $value
- }
- }
- html {
- lappend URL($url,line) $line
-# puts "body: $line"
- }
- skip {
- close $sock
- RobotSave $url
- RobotRestart
- }
- }
+    set buffer [read $sock 16384]
+    set readCount [string length $buffer]
+
+    if {$readCount <= 0} {
+        if {![eof $sock]} {
+            # The socket is non-blocking: an empty read only means that
+            # nothing is available right now.  Wait for the next event
+            # instead of treating the document as complete.
+            return
+        }
+        close $sock
+        Robot200 $url
+        RobotRestart
} else {
- set URL($url,state) html
- if {[info exists URL($url,head,Content-type)]} {
- if {![string compare $URL($url,head,Content-type) text/html]} {
- set URL($url,state) html
- }
- }
+        # puts "Got $readCount bytes"
+        append URL($url,buf) $buffer
}
}
-proc RobotConnect {url sock} {
+# RobotReadHeader --
+#   Readable-event handler for the header part of a response.  Input is
+#   accumulated in URL($url,buf) until a blank line is seen.  The status
+#   code is then taken from the first line, every "Name: value" line is
+#   stored in URL($url,head,<lower-case name>), and the rest of the
+#   buffer is kept as the start of the body.  Dispatch by status code:
+#     301, 302  Robot301 with the Location header (Robot404 if absent)
+#     401       Robot401
+#     200       text/html and text/plain switch the handler to
+#               RobotReadContent; other types are stored at once
+#     other     Robot404
+#   Every path except "keep reading" closes the socket and calls
+#   RobotRestart.
+#
+# Arguments:
+#   url    URL being fetched
+#   sock   socket in non-blocking mode
+proc RobotReadHeader {url sock} {
global URL
- fileevent $sock readable [list RobotRead $url $sock]
+    set buffer [read $sock 2148]
+    set readCount [string length $buffer]
+
+    if {$readCount <= 0} {
+        if {![eof $sock]} {
+            # The socket is non-blocking: an empty read only means that
+            # nothing is available right now, not that the server gave up.
+            return
+        }
+        Robot404 $url
+        close $sock
+        RobotRestart
+    } else {
+        # puts "Got $readCount bytes"
+        append URL($url,buf) $buffer
+
+        set n [string first \n\n $URL($url,buf)]
+        if {$n > 1} {
+            set code 0
+            set version {}
+            set headbuf [string range $URL($url,buf) 0 $n]
+            # skip the two newlines that end the header
+            incr n
+            incr n
+            set URL($url,buf) [string range $URL($url,buf) $n end]
+
+            regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+            set lines [split $headbuf \n]
+            foreach line $lines {
+                if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+                    set URL($url,head,[string tolower $name]) $value
+                }
+            }
+            puts "code = $code"
+            set URL($url,state) skip
+            switch $code {
+                301 -
+                302 {
+                    # A redirect without a Location header cannot be
+                    # followed; treat it as a failed fetch rather than
+                    # raising an error that would leave the socket open
+                    # and the crawl stalled.
+                    if {[info exists URL($url,head,location)]} {
+                        Robot301 $url $URL($url,head,location)
+                    } else {
+                        Robot404 $url
+                    }
+                    close $sock
+                    RobotRestart
+                }
+                404 {
+                    Robot404 $url
+                    close $sock
+                    RobotRestart
+                }
+                401 {
+                    Robot401 $url
+                    close $sock
+                    RobotRestart
+                }
+                200 {
+                    if {![info exists URL($url,head,content-type)]} {
+                        set URL($url,head,content-type) {}
+                    }
+                    # NOTE(review): the match is exact, so a content type
+                    # that carries parameters (e.g. "text/html;
+                    # charset=...") takes the default arm.
+                    switch $URL($url,head,content-type) {
+                        text/html {
+                            fileevent $sock readable [list RobotReadContent $url $sock]
+                        }
+                        text/plain {
+                            fileevent $sock readable [list RobotReadContent $url $sock]
+                        }
+                        default {
+                            close $sock
+                            Robot200 $url
+                            RobotRestart
+                        }
+                    }
+                }
+                default {
+                    Robot404 $url
+                    close $sock
+                    RobotRestart
+                }
+            }
+        }
+    }
+}
+
+# RobotConnect --
+#   Prepare $sock for the request and send it.  The socket is switched to
+#   non-blocking mode with auto/crlf end-of-line translation,
+#   RobotReadHeader is installed as the readable handler, and an HTTP/1.0
+#   GET for URL($url,path) is written with Host and User-Agent headers.
+#
+# Arguments:
+#   url    URL being fetched
+#   sock   socket opened by the caller
+proc RobotConnect {url sock} {
+ global URL agent
+
+ fconfigure $sock -translation {auto crlf} -blocking 0
+ puts "Reading $url"
+ fileevent $sock readable [list RobotReadHeader $url $sock]
puts $sock "GET $URL($url,path) HTTP/1.0"
+ puts $sock "Host: $URL($url,host)"
+ puts $sock "User-Agent: $agent"
puts $sock ""
flush $sock
}
+# RobotGetUrl --
+#   Split $url into method, host, port (80 if none is given) and path,
+#   store them in the global URL array together with an empty receive
+#   buffer, open an asynchronous socket to the host and hand it to
+#   RobotConnect.
+#
+# Arguments:
+#   url     URL to fetch
+#   phost   not used in this proc
+#
+# Results:
+#   0 if the request was started, -1 if the URL could not be parsed or
+#   the socket could not be created.
proc RobotGetUrl {url phost} {
global URL
- set port 80
puts "---------"
puts $url
- if {[regexp {([^:]+)://([^/]+)([^ ]*)} $url x method host path]} {
- puts "method=$method host=$host path=$path"
- } else {
+ if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
return -1
}
+ # "host:port" carries an explicit port; anything else uses port 80
+ if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+ set port 80
+ set host $hostport
+ }
set URL($url,method) $method
set URL($url,host) $host
set URL($url,port) $port
set URL($url,path) $path
set URL($url,state) head
+ set URL($url,buf) {}
if [catch {set sock [socket -async $host $port]}] {
return -1
}
- fconfigure $sock -translation {auto crlf}
RobotConnect $url $sock
return 0
}
}
+
+# User-Agent string sent with every request.  The output of "uname -s -r"
+# is appended in parentheses when that command can be run.
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+    append agent " ($os)"
+    puts "agent: $agent"
+}
+
+# bgerror --
+#   Report an error raised in an event handler by printing it to stdout.
+#
+# Arguments:
+#   m   error message
+proc bgerror {m} {
+    puts stdout "BGERROR $m"
+}
+
+# Global state: robotMoreWork is non-zero while the crawl is running,
+# robotSeq is the cursor used by RobotFileNext, and workdir records the
+# directory the script was started in.
+set robotMoreWork 0
+set robotSeq 0
+set workdir [pwd]
+
if {[llength $argv] < 2} {
puts "Tclrobot: usage <domain> <start>"
+ puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
exit 1
}
+
+# First argument: list of host glob patterns; second: the start URL.
set domains [lindex $argv 0]
set site [lindex $argv 1]
if {[string length $site]} {
- set x [RobotFileOpen unvisited $site /]
- close $x
+ set robotMoreWork 1
+ # RobotGetUrl returns non-zero when the request could not be started
+ if [RobotGetUrl $site {}] {
+ set robotMoreWork 0
+ puts "Couldn't process $site"
+ }
}
-RobotRestart
-vwait forever
-
+# Run the event loop until RobotRestart clears robotMoreWork.
+while {$robotMoreWork} {
+ vwait robotMoreWork
+}