2 # $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $
4 proc RobotFileNext1 {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
12 if {[file isfile $n]} {
13 if {[string first :.html $n] > 0} {
14 return http://[string range $area/ $off end]
16 return http://[string range $n $off end]
20 if {[file isdirectory $n]} {
21 set sb [RobotFileNext1 $n]
22 if {[string length $sb]} {
30 proc RobotFileWait {} {
35 proc RobotFileNext {area} {
37 if {[catch {set ns [glob ${area}/*]}]} {
40 set off [string length $area]
43 set n [lindex $ns $robotSeq]
44 if {![string length $n]} {
45 puts "------------ N E X T R O U N D --------"
47 after 30000 RobotFileWait
50 set n [lindex $ns $robotSeq]
51 if {![string length $n]} {
56 if {[file isfile $n/robots.txt]} {
57 puts "ok returning http://[string range $n $off end]/robots.txt"
58 return http://[string range $n $off end]/robots.txt
59 } elseif {[file isdirectory $n]} {
60 set sb [RobotFileNext1 $n]
61 if {[string length $sb]} {
69 proc RobotFileExist {area host path} {
70 set comp [split $area/$host$path /]
73 if {![string length [lindex $comp $l]]} {
74 set comp [split $area/$host$path:.html /]
75 } elseif {[file exists [join $comp /]]} {
78 set comp [split $area/$host$path/:.html /]
80 return [file exists [join $comp /]]
83 proc RobotFileUnlink {area host path} {
84 set comp [split $area/$host$path /]
87 if {![string length [lindex $comp $l]]} {
88 set comp [split $area/$host$path:.html /]
90 if {[catch {exec rm [join $comp /]}]} return
92 for {set i $l} {$i > 0} {incr i -1} {
93 set path [join [lrange $comp 0 $i] /]
94 if {![catch {glob $path/*}]} return
99 proc RobotFileClose {out} {
100 if [string compare $out stdout] {
105 proc RobotFileOpen {area host path {mode w}} {
109 if {![info exists workdir]} {
112 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
113 if {[string compare $orgPwd $workdir]} {
114 puts "ooops. RobotFileOpen failed"
115 puts "workdir = $workdir"
119 set comp [split $area/$host$path /]
120 set len [llength $comp]
122 for {set i 0} {$i < $len} {incr i} {
123 set d [lindex $comp $i]
124 if {[catch {cd ./$d}]} {
127 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
128 set out [open robots.txt w]
129 puts "creating robots.txt in $d"
134 set d [lindex $comp $len]
135 if {[string length $d]} {
136 if {[file isdirectory $d]} {
137 set out [open $d/:.html $mode]
139 set out [open $d $mode]
142 set out [open :.html $mode]
145 #puts "RobotFileStop"
149 proc RobotRestart {sock} {
154 after cancel $URL($sock,cancel)
156 set url [RobotFileNext unvisited]
157 if {![string length $url]} {
160 set r [RobotGetUrl $url {}]
164 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
167 incr robotMoreWork -1
170 proc headSave {url out} {
174 if {[info exists URL($url,head,last-modified)]} {
175 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
178 if {[info exists URL($url,head,date)]} {
179 puts $out " <date>$URL($url,head,date)</date>"
181 if {[info exists URL($url,head,content-length)]} {
182 puts $out " <by>$URL($url,head,content-length)</by>"
184 if {[info exists URL($url,head,server)]} {
185 puts $out " <format>$URL($url,head,server)</format>"
188 puts $out {<publisher>}
189 puts $out " <identifier>$url</identifier>"
190 if {[info exists URL($url,head,content-type)]} {
191 puts $out " <type>$URL($url,head,content-type)</type>"
193 puts $out {</publisher>}
196 proc RobotHref {url hrefx hostx pathx} {
202 puts "Ref url = $url href=$href"
203 # get method (if any)
204 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
208 if {[string compare $method http]} {
213 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
214 if {![string length $surl]} {
218 foreach domain $domains {
219 if {[string match $domain $host]} {
228 regexp {^([^\#]*)} $hpath x surl
229 set host $URL($url,host)
231 if {![string length $surl]} {
234 if {[string first / $surl]} {
236 regexp {^([^\#?]*)} $URL($url,path) x dpart
237 set l [string last / $dpart]
238 if {[expr $l >= 0]} {
239 set surl [string range $dpart 0 $l]$surl
241 set surl $dpart/$surl
244 set c [split $surl /]
247 set path [lindex $c $i]
250 switch -- [lindex $c $i] {
258 set path [lindex $c $i]/$path
263 regsub -all {~} $path {%7E} path
265 if {[info exists URL($host,robots)]} {
266 foreach l $URL($host,robots) {
267 if {[string first [lindex $l 1] $path] == 0} {
273 set href "$method://$host$path"
274 puts "Ref href = $href, ok=$ok"
278 proc RobotError {url code} {
281 puts "Bad URL $url, $code"
283 if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
284 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
285 set fromurl [gets $inf]
288 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
289 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
290 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
291 puts $outf "URL=$url $code"
292 puts $outf "Reference $fromurl"
297 proc RobotRedirect {url tourl code} {
300 puts "Redirecting from $url to $tourl"
304 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
305 set fromurl [gets $inf]
308 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
309 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
310 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
311 puts $outf "URL=$url to $tourl $code"
312 puts $outf "Reference $fromurl"
315 if {[RobotHref $url tourl host path]} {
316 if {![RobotFileExist unvisited $host $path]} {
317 puts "Mark as unvisited"
318 set outf [RobotFileOpen unvisited $host $path]
325 proc RobotTextHtml {url out} {
329 htmlSwitch $URL($url,buf) \
335 puts $out "<title>$body</title>"
341 puts -nonewline $out "<meta"
342 foreach a [array names parm] {
343 puts -nonewline $out " $a"
344 puts -nonewline $out {="}
345 puts -nonewline $out $parm($a)
346 puts -nonewline $out {"}
350 regsub -all -nocase {<script.*</script>} $body {} abody
351 regsub -all {<[^\>]+>} $abody {} nbody
352 puts $out "<documentcontent>"
354 puts $out "</documentcontent>"
356 if {![info exists parm(href)]} {
366 if {![RobotHref $url href host path]} continue
369 puts $out "<identifier>$href</identifier>"
370 puts $out "<description>$body</description>"
373 if {![RobotFileExist visited $host $path]} {
374 if {![RobotFileExist bad $host $path]} {
375 if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
376 puts "--- Error $msg"
392 proc RobotsTxt {url} {
395 set v URL($URL($url,host),robots)
397 foreach l [split $URL($url,buf) \n] {
399 if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
400 puts "cmd=$cmd arg=$arg"
404 set pat [string tolower $arg]*
405 set section [string match $pat $agent]
409 puts "rule [list 0 $arg]"
410 lappend $v [list 0 $arg]
415 puts "rule [list 1 $arg]"
416 lappend $v [list 1 $arg]
424 proc RobotTextPlain {url out} {
428 puts $out "<documentcontent>"
429 puts $out $URL($url,buf)
430 puts $out "</documentcontent>"
433 if {![string compare $URL($url,path) /robots.txt]} {
438 proc Robot200 {url} {
442 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
443 switch $URL($url,head,content-type) {
445 RobotTextHtml $url $out
448 RobotTextPlain $url $out
456 # puts "Parsing done"
457 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
460 proc RobotReadContent {url sock} {
463 set buffer [read $sock 16384]
464 set readCount [string length $buffer]
466 if {$readCount <= 0} {
469 } elseif {[string first \0 $buffer] >= 0} {
473 # puts "Got $readCount bytes"
474 set URL($url,buf) $URL($url,buf)$buffer
478 proc RobotReadHeader {url sock} {
481 if {[catch {set buffer [read $sock 2148]}]} {
485 set readCount [string length $buffer]
487 if {$readCount <= 0} {
491 # puts "Got $readCount bytes"
492 set URL($url,buf) $URL($url,buf)$buffer
494 set n [string first \n\n $URL($url,buf)]
498 set headbuf [string range $URL($url,buf) 0 $n]
501 set URL($url,buf) [string range $URL($url,buf) $n end]
503 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
504 set lines [split $headbuf \n]
505 foreach line $lines {
506 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
507 set URL($url,head,[string tolower $name]) $value
511 set URL($url,state) skip
514 RobotRedirect $url $URL($url,head,location) 301
518 RobotRedirect $url $URL($url,head,location) 302
530 if {![info exists URL($url,head,content-type)]} {
531 set URL($url,head,content-type) {}
533 switch $URL($url,head,content-type) {
535 fileevent $sock readable [list RobotReadContent $url $sock]
538 fileevent $sock readable [list RobotReadContent $url $sock]
555 proc RobotSockCancel {sock url} {
557 puts "RobotSockCancel sock=$sock url=$url"
562 proc RobotConnect {url sock} {
565 fconfigure $sock -translation {auto crlf} -blocking 0
566 fileevent $sock readable [list RobotReadHeader $url $sock]
567 puts $sock "GET $URL($url,path) HTTP/1.0"
568 puts $sock "Host: $URL($url,host)"
569 puts $sock "User-Agent: $agent"
572 set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]]
579 proc RobotGetUrl {url phost} {
583 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
586 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
590 set URL($url,method) $method
591 set URL($url,host) $host
592 set URL($url,port) $port
593 set URL($url,path) $path
594 set URL($url,state) head
596 if [catch {set sock [socket -async $host $port]}] {
599 RobotConnect $url $sock
604 if {![llength [info commands htmlSwitch]]} {
605 set e [info sharedlibextension]
606 if {[catch {load ./tclrobot$e}]} {
611 set agent "zmbot/0.0"
612 if {![catch {set os [exec uname -s -r]}]} {
613 set agent "$agent ($os)"
627 if {[llength $argv] < 2} {
628 puts "Tclrobot: usage <domain> <start>"
629 puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
633 set domains [lindex $argv 0]
634 foreach site [lindex $argv 1] {
636 if [RobotGetUrl $site {}] {
637 incr robotMoreWork -1
638 puts "Couldn't process $site"
642 while {$robotMoreWork} {