2 # $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
29 proc RobotWriteRecord {outf fromurl distance} {
31 puts $outf "<distance>"
33 puts $outf "</distance>"
34 puts $outf "<fromurl>"
36 puts $outf "</fromurl>"
40 proc RobotReadRecord {inf fromurlx distancex} {
41 upvar $fromurlx fromurl
42 upvar $distancex distance
45 set distance [string trim [gets $inf]]
46 # puts "got distance = $distance"
49 set fromurl [string trim [gets $inf]]
52 proc RobotFileNext {area} {
57 # puts "RobotFileNext robotSeq=$robotSeq"
62 if {[catch {set ns [glob ${area}/*]}]} {
66 set off [string length $area]
68 set n [lindex $ns $robotSeq]
69 if {![string length $n]} {
72 puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)"
76 if {[file isfile $n/frobots.txt]} {
77 puts "ok returning http://[string range $n $off end]/robots.txt"
78 return http://[string range $n $off end]/robots.txt
79 } elseif {[file isdirectory $n]} {
80 set sb [RobotFileNext1 $n http://[string range $n $off end]]
81 if {[string length $sb]} {
85 puts "no more work at end of RobotFileNext n=$n"
91 proc RobotFileExist {area host path} {
92 # puts "RobotFileExist begin area=$area host=$host path=$path"
93 set lpath [split $path /]
94 set l [llength $lpath]
96 set t [lindex $lpath $l]
98 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
99 # puts "RobotFileExist end npath=$npath"
100 return [file exists $npath]
103 proc RobotFileUnlink {area host path} {
105 # puts "RobotFileUnlink begin"
106 # puts "area=$area host=$host path=$path"
107 set lpath [split $path /]
108 set l [llength $lpath]
110 set t [lindex $lpath $l]
112 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
113 # puts "npath=$npath"
114 set comp [split $npath /]
115 if {[catch {exec rm [join $comp /]}]} return
117 set l [llength $comp]
120 incr status($area) -1
121 for {set i $l} {$i > 0} {incr i -1} {
122 set path [join [lrange $comp 0 $i] /]
123 if {![catch {glob $path/*}]} return
126 # puts "RobotFileUnlink end"
129 proc RobotFileClose {out} {
130 if [string compare $out stdout] {
135 proc RobotFileOpen {area host path {mode w}} {
140 if {![info exists workdir]} {
143 #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
144 if {[string compare $orgPwd $workdir]} {
145 puts "ooops. RobotFileOpen failed"
146 puts "workdir = $workdir"
150 set comp [split $area/$host$path /]
151 set len [llength $comp]
153 for {set i 0} {$i < $len} {incr i} {
155 set d "d[lindex $comp $i]"
157 set d [lindex $comp $i]
159 if {[catch {cd ./$d}]} {
162 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
163 set out [open frobots.txt w]
164 puts "creating robots.txt in $d"
166 incr status(unvisited)
170 set d [lindex $comp $len]
171 if {[string length $d]} {
172 if {[file isdirectory $d]} {
173 set out [open $d/f $mode]
175 set out [open f$d $mode]
178 set out [open f $mode]
188 global robotSeq robotsRunning
190 incr robotsRunning -1
191 while {$robotsRunning} {
198 proc RobotRestart {url sock} {
199 global URL robotsRunning
202 after cancel $URL($sock,cancel)
204 foreach v [array names URL $url,*] {
208 incr robotsRunning -1
214 global robotsRunning robotsMax idletime
218 set url [RobotFileNext unvisited]
219 if {![string length $url]} {
223 if {[string compare $url wait] == 0} {
224 after $idletime RobotRR
227 set r [RobotGetUrl $url {}]
229 if {$robotsRunning >= $robotsMax} return
231 incr robotsRunning -1
232 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
233 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
236 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
241 proc headSave {url out} {
244 if {[info exists URL($url,head,last-modified)]} {
245 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
248 if {[info exists URL($url,head,date)]} {
249 puts $out " <date>$URL($url,head,date)</date>"
251 if {[info exists URL($url,head,content-length)]} {
252 puts $out " <by>$URL($url,head,content-length)</by>"
254 if {[info exists URL($url,head,server)]} {
255 puts $out " <format>$URL($url,head,server)</format>"
258 puts $out {<publisher>}
259 puts $out " <identifier>$url</identifier>"
260 if {[info exists URL($url,head,content-type)]} {
261 puts $out " <type>$URL($url,head,content-type)</type>"
263 puts $out {</publisher>}
266 proc RobotHref {url hrefx hostx pathx} {
267 global URL domains debuglevel
272 if {$debuglevel > 1} {
273 puts "Ref input url = $url href=$href"
276 if {[string first { } $href] >= 0} {
279 if {[string length $href] > 256} {
282 if {[string first {?} $href] >= 0} {
285 if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
288 # get method (if any)
289 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
293 if {[string compare $method http]} {
298 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
299 if {![string length $surl]} {
302 if {[info exist domains]} {
304 foreach domain $domains {
305 if {[string match $domain $host]} {
315 regexp {^([^\#]*)} $hpath x surl
316 set host $URL($url,hostport)
318 if {![string length $surl]} {
321 if {[string first / $surl]} {
323 regexp {^([^\#?]*)} $URL($url,path) x dpart
324 set l [string last / $dpart]
325 if {[expr $l >= 0]} {
326 set surl [string range $dpart 0 $l]$surl
328 set surl $dpart/$surl
331 set surllist [split $surl /]
334 foreach c $surllist {
339 set path [lrange $path 0 $pathl]
352 if {$debuglevel > 4} {
353 puts "pathl=$pathl output path=$path"
355 set path [join $path /]
356 if {![string length $path]} {
359 regsub -all {~} $path {%7E} path
360 set href "$method://$host$path"
362 if {$debuglevel > 1} {
363 puts "Ref result = $href"
365 return [checkrule url $href]
368 proc RobotError {url code} {
371 puts "Bad URL $url (code $code)"
374 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
375 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
376 RobotReadRecord $inf fromurl distance
379 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
380 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
381 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
382 RobotWriteRecord $outf $fromurl $distance
387 proc RobotRedirect {url tourl code} {
390 puts "Redirecting from $url to $tourl"
394 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
395 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
396 RobotReadRecord $inf fromurl distance
399 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
400 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
401 RobotWriteRecord $outf $fromurl $distance
404 if {[RobotHref $url tourl host path]} {
405 if {![RobotFileExist visited $host $path]} {
406 if {![RobotFileExist unvisited $host $path]} {
407 set outf [RobotFileOpen unvisited $host $path]
408 RobotWriteRecord $outf $fromurl $distance
413 set inf [RobotFileOpen visited $host $path r]
414 RobotReadRecord $inf oldurl olddistance
416 if {[string length $olddistance] == 0} {
419 if {[string length $distance] == 0} {
422 puts "distance=$distance olddistance=$olddistance"
423 if {[expr $distance < $olddistance]} {
424 set outf [RobotFileOpen unvisited $host $path]
425 RobotWriteRecord $outf $tourl $distance
430 if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
436 proc RobotTextHtml {url out} {
437 global URL maxdistance
440 if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
441 set distance [expr $URL($url,dist) + 1]
443 htmlSwitch $URL($url,buf) \
445 puts $out "<title>$body</title>"
447 puts -nonewline $out "<meta"
448 foreach a [array names parm] {
449 puts -nonewline $out " $a"
450 puts -nonewline $out {="}
451 puts -nonewline $out $parm($a)
452 puts -nonewline $out {"}
456 regsub -all {<!--[^-]*->} $body { } abody
457 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
458 regsub -all {<[^\>]+>} $bbody {} nbody
459 puts $out "<documentcontent>"
461 puts $out "</documentcontent>"
463 if {![info exists parm(href)]} {
467 if {[expr $distance <= $maxdistance]} {
468 set href [string trim $parm(href)]
469 if {![RobotHref $url href host path]} continue
472 puts $out "<identifier>$href</identifier>"
473 puts $out "<description>$body</description>"
476 if {![RobotFileExist visited $host $path]} {
478 if {![RobotFileExist bad $host $path]} {
479 if {[RobotFileExist unvisited $host $path]} {
480 set inf [RobotFileOpen unvisited $host $path r]
481 RobotReadRecord $inf oldurl olddistance
487 if {[string length $olddistance] == 0} {
490 if {[expr $distance < $olddistance]} {
491 set outf [RobotFileOpen unvisited $host $path]
492 RobotWriteRecord $outf $url $distance
495 } elseif {[string compare $href $url]} {
496 set inf [RobotFileOpen visited $host $path r]
497 RobotReadRecord $inf xurl olddistance
499 if {[string length $olddistance] == 0} {
502 if {[expr $distance < $olddistance]} {
503 puts "OK remarking url=$url href=$href"
504 puts "olddistance = $olddistance"
505 puts "newdistance = $distance"
506 set outf [RobotFileOpen unvisited $host $path]
507 RobotWriteRecord $outf $url $distance
513 if {![info exists parm(href)]} {
517 if {[expr $distance <= $maxdistance]} {
518 set href [string trim $parm(href)]
519 if {![RobotHref $url href host path]} continue
522 puts $out "<identifier>$href</identifier>"
523 puts $out "<description></description>"
526 if {![RobotFileExist visited $host $path]} {
528 if {![RobotFileExist bad $host $path]} {
529 if {[RobotFileExist unvisited $host $path]} {
530 set inf [RobotFileOpen unvisited $host $path r]
531 RobotReadRecord $inf oldurl olddistance
537 if {[string length $olddistance] == 0} {
540 if {[expr $distance < $olddistance]} {
541 set outf [RobotFileOpen unvisited $host $path]
542 RobotWriteRecord $outf $url $distance
545 } elseif {[string compare $href $url]} {
546 set inf [RobotFileOpen visited $host $path r]
547 RobotReadRecord $inf xurl olddistance
549 if {[string length $olddistance] == 0} {
552 if {[expr $distance < $olddistance]} {
553 puts "OK remarking url=$url href=$href"
554 puts "olddistance = $olddistance"
555 puts "newdistance = $distance"
556 set outf [RobotFileOpen unvisited $host $path]
557 RobotWriteRecord $outf $url $distance
563 if {![info exists parm(src)]} {
567 if {[expr $distance <= $maxdistance]} {
568 set href [string trim $parm(src)]
569 if {![RobotHref $url href host path]} continue
572 puts $out "<identifier>$href</identifier>"
573 puts $out "<description></description>"
576 if {![RobotFileExist visited $host $path]} {
578 if {![RobotFileExist bad $host $path]} {
579 if {[RobotFileExist unvisited $host $path]} {
580 set inf [RobotFileOpen unvisited $host $path r]
581 RobotReadRecord $inf oldurl olddistance
587 if {[string length $olddistance] == 0} {
590 if {[expr $distance < $olddistance]} {
591 set outf [RobotFileOpen unvisited $host $path]
592 RobotWriteRecord $outf $url $distance
595 } elseif {[string compare $href $url]} {
596 set inf [RobotFileOpen visited $host $path r]
597 RobotReadRecord $inf xurl olddistance
599 if {[string length $olddistance] == 0} {
602 if {[expr $distance < $olddistance]} {
603 puts "OK remarking url=$url href=$href"
604 puts "olddistance = $olddistance"
605 puts "newdistance = $distance"
606 set outf [RobotFileOpen unvisited $host $path]
607 RobotWriteRecord $outf $url $distance
615 proc RobotsTxt {url} {
618 RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf)
621 proc RobotsTxt0 {v buf} {
624 foreach l [split $buf \n] {
625 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
626 puts "cmd=$cmd arg=$arg"
627 switch -- [string tolower $cmd] {
630 set pat [string tolower $arg]*
631 set section [string match $pat $agent]
635 puts "rule [list 0 $arg]"
636 lappend $v [list 0 $arg]
641 puts "rule [list 1 $arg]"
642 lappend $v [list 1 $arg]
650 proc RobotTextPlain {url out} {
653 puts $out "<documentcontent>"
654 regsub -all {<} $URL($url,buf) {\<} content
656 puts $out "</documentcontent>"
658 if {![string compare $URL($url,path) /robots.txt]} {
663 proc RobotWriteMetadata {url out} {
669 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
670 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
671 RobotReadRecord $inf fromurl distance
674 set URL($url,dist) $distance
675 puts $out "<distance>"
676 puts $out " $distance"
677 puts $out "</distance>"
679 puts "Parsing $url distance=$distance"
680 switch $URL($url,head,content-type) {
682 if {[string length $distance]} {
683 RobotTextHtml $url $out
687 RobotTextPlain $url $out
693 proc Robot200 {url} {
696 set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
697 puts -nonewline $out $URL($url,buf)
700 set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
701 RobotWriteMetadata $url $out
704 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
707 proc RobotReadContent {url sock binary} {
710 set buffer [read $sock 16384]
711 set readCount [string length $buffer]
713 if {$readCount <= 0} {
715 RobotRestart $url $sock
716 } elseif {!$binary && [string first \0 $buffer] >= 0} {
718 RobotRestart $url $sock
720 # puts "Got $readCount bytes"
721 set URL($url,buf) $URL($url,buf)$buffer
725 proc RobotReadHeader {url sock} {
726 global URL debuglevel
728 if {$debuglevel > 1} {
729 puts "HTTP head $url"
731 if {[catch {set buffer [read $sock 2148]}]} {
733 RobotRestart $url $sock
736 set readCount [string length $buffer]
738 if {$readCount <= 0} {
740 RobotRestart $url $sock
742 # puts "Got $readCount bytes"
743 set URL($url,buf) $URL($url,buf)$buffer
745 set n [string first \r\n\r\n $URL($url,buf)]
749 set headbuf [string range $URL($url,buf) 0 $n]
751 set URL($url,buf) [string range $URL($url,buf) $n end]
753 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
754 set lines [split $headbuf \n]
755 foreach line $lines {
756 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
757 set URL($url,head,[string tolower $name]) [string trim $value]
760 puts "HTTP CODE $code"
761 set URL($url,state) skip
764 RobotRedirect $url $URL($url,head,location) 301
765 RobotRestart $url $sock
768 RobotRedirect $url $URL($url,head,location) 302
769 RobotRestart $url $sock
772 if {![info exists URL($url,head,content-type)]} {
773 set URL($url,head,content-type) {}
776 switch -glob -- $URL($url,head,content-type) {
781 if {![regexp {/robots.txt$} $url]} {
782 if {![checkrule mime $URL($url,head,content-type)]} {
783 RobotError $url mimedeny
784 RobotRestart $url $sock
788 fileevent $sock readable [list RobotReadContent $url $sock $binary]
791 RobotError $url $code
792 RobotRestart $url $sock
799 proc RobotSockCancel {url sock} {
801 puts "RobotSockCancel sock=$sock url=$url"
803 RobotRestart $url $sock
806 proc RobotConnect {url sock} {
807 global URL agent acceptLanguage
809 fconfigure $sock -translation {lf crlf} -blocking 0
810 fileevent $sock readable [list RobotReadHeader $url $sock]
811 puts $sock "GET $URL($url,path) HTTP/1.0"
812 puts $sock "Host: $URL($url,host)"
813 puts $sock "User-Agent: $agent"
814 if {[string length $acceptLanguage]} {
815 puts $sock "Accept-Language: $acceptLanguage"
819 set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
826 proc RobotGetUrl {url phost} {
827 global URL robotsRunning
829 puts "Retrieve $robotsRunning url=$url"
830 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
833 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
837 set URL($url,method) $method
838 set URL($url,host) $host
839 set URL($url,hostport) $hostport
840 set URL($url,path) $path
841 set URL($url,state) head
844 if {[string compare $path /robots.txt]} {
846 if {![info exists URL($hostport,robots)]} {
847 puts "READING robots.txt for host $hostport"
848 if {[RobotFileExist visited $hostport /robots.txt]} {
849 set inf [RobotFileOpen visited $hostport /robots.txt r]
850 set buf [read $inf 32768]
853 set buf "User-agent: *\nAllow: /\n"
855 RobotsTxt0 URL($hostport,robots) $buf
857 if {[info exists URL($hostport,robots)]} {
858 foreach l $URL($hostport,robots) {
859 if {[string first [lindex $l 1] $path] == 0} {
866 puts "skipped due to robots.txt"
870 if [catch {set sock [socket -async $host $port]}] {
873 RobotConnect $url $sock
878 if {![llength [info commands htmlSwitch]]} {
879 set e [info sharedlibextension]
880 if {[catch {load ./tclrobot$e}]} {
885 set agent "zmbot/0.1"
886 if {![catch {set os [exec uname -s -r]}]} {
887 set agent "$agent ($os)"
902 set acceptLanguage {}
904 set status(unvisited) 0
905 set status(visited) 0
910 # Rules: allow, deny, url
912 proc checkrule {type this} {
916 if {$debuglevel > 3} {
917 puts "CHECKRULE $type $this"
919 if {[info exist alrules]} {
921 if {$debuglevel > 3} {
925 if {[lindex $l 1] != $type} continue
926 # consider mask (! negates)
927 set masks [lindex $l 2]
929 foreach mask $masks {
930 if {$debuglevel > 4} {
931 puts "consider single mask $mask"
933 if {[string index $mask 0] == "!"} {
934 set mask [string range $mask 1 end]
935 if {[string match $mask $this]} continue
937 if {![string match $mask $this]} continue
941 if {$debuglevel > 4} {
945 # OK, we have a match
946 if {[lindex $l 0] == "allow"} {
947 if {$debuglevel > 3} {
948 puts "CHECKRULE MATCH OK"
952 if {$debuglevel > 3} {
953 puts "CHECKFULE MATCH FAIL"
959 if {$debuglevel > 3} {
960 puts "CHECKRULE MATCH OK"
969 if {[RobotHref http://www.indexdata.dk/ href host path]} {
970 if {![RobotFileExist visited $host $path]} {
971 set outf [RobotFileOpen unvisited $host $path]
972 RobotWriteRecord $outf href 0
978 proc deny {type stuff} {
981 lappend alrules [list deny $type $stuff]
984 proc allow {type stuff} {
987 lappend alrules [list allow $type $stuff]
993 set debuglevel $level
999 set l [llength $argv]
1002 puts {tclrobot: usage:}
1003 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
1004 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1009 set arg [lindex $argv $i]
1010 switch -glob -- $arg {
1012 set robotsMax [string range $arg 2 end]
1013 if {![string length $robotsMax]} {
1014 set robotsMax [lindex $argv [incr i]]
1018 set maxdistance [string range $arg 2 end]
1019 if {![string length $maxdistance]} {
1020 set maxdistance [lindex $argv [incr i]]
1024 set dom [string range $arg 2 end]
1025 if {![string length $dom]} {
1026 set dom [lindex $argv [incr i]]
1028 lappend domains $dom
1031 set idletime [string range $arg 2 end]
1032 if {![string length $idletime]} {
1033 set idletime [lindex $argv [incr i]]
1037 set acceptLanguage [string range $arg 2 end]
1038 if {![string length $acceptLanguage]} {
1039 set acceptLanguage [lindex $argv [incr i]]
1043 set rfile [string range $arg 2 end]
1044 if {![string length $rfile]} {
1045 set rfile [lindex $argv [incr i]]
1051 if {[RobotHref http://www.indexdata.dk/ href host path]} {
1052 if {![RobotFileExist visited $host $path]} {
1053 set outf [RobotFileOpen unvisited $host $path]
1054 RobotWriteRecord $outf href 0
1055 RobotFileClose $outf
1063 if {![info exist domains]} {
1066 if {![info exist maxdistance]} {
1069 if {![info exist robotsMax]} {
1073 puts "domains=$domains"
1074 puts "max distance=$maxdistance"
1075 puts "max jobs=$robotsMax"
1081 while {$robotsRunning} {
1085 puts "End un,ba,vi=$status(unvisited),$status(bad),$status(visited)"