2 # $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $
20 #proc dbgmsg {level msg} {
22 # if {[expr $debuglevel >= $level]} {
28 if {[expr $debuglevel >= 0]} {
32 # dbgmsg is always called with just one string!
35 proc fnameEncode {fname} {
36 regsub -all {&} $fname {%38} fname
37 regsub -all {<} $fname {%3C} fname
38 regsub -all {>} $fname {%3E} fname
39 regsub -all {\?} $fname {%3F} fname
40 regsub -all {\*} $fname {%2A} fname
44 proc fnameDecode {fname} {
45 regsub -all {%38} $fname {&} fname
46 regsub -all {%3C} $fname {<} fname
47 regsub -all {%3E} $fname {>} fname
48 regsub -all {%3F} $fname {?} fname
49 regsub -all {%2A} $fname {*} fname
53 proc RobotFileNext1 {area lead} {
54 # dbgmsg "RobotFileNext1 area=$area lead=$lead"
55 if {[catch {set ns [glob ${area}/*]}]} {
59 if {[file isfile $n]} {
60 set off [string last / $n]
63 set end [string length $n]
66 return $lead/[string range $n $off $end]
70 if {[file isdirectory $n]} {
71 set off [string last / $n]
74 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
75 if {[string length $sb]} {
83 proc RobotWriteRecord {outf fromurl distance} {
84 puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
86 puts $outf "<distance>"
88 puts $outf "</distance>"
89 puts $outf "<fromurl>"
91 puts $outf "</fromurl>"
95 proc RobotReadRecord {inf fromurlx distancex} {
96 upvar $fromurlx fromurl
97 upvar $distancex distance
101 set distance [string trim [gets $inf]]
102 # dbgmsg "got distance = $distance"
105 set fromurl [string trim [gets $inf]]
108 proc RobotFileNext {task area} {
113 # dbgmsg "RobotFileNext seq=$control($task,seq)"
114 if {$control($task,seq) < 0} {
117 set target $control($task,target)
118 if {$control($task,seq) == 0} {
119 if {[catch {set ns($task) [glob $target/$area/*]}]} {
120 puts "----------- DONE-------- target=$target"
124 # dbgmsg "ns=$ns($task)"
125 set off [string length $target/$area]
127 set n [lindex $ns($task) $control($task,seq)]
129 if {![string length $n]} {
130 set control($task,seq) -1
131 set statusfile [open $target/status w]
132 puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
136 incr control($task,seq)
137 if {[file isfile $n/robots.txt_.tkl]} {
138 # dbgmsg "ok returning http://[string range $n $off end]/robots.txt"
139 return [fnameDecode http://[string range $n $off end]/robots.txt]
140 } elseif {[file isdirectory $n]} {
141 set sb [RobotFileNext1 $n http://[string range $n $off end]]
142 if {[string length $sb]} {
143 return [fnameDecode $sb]
146 dbgmsg "no more work at end of RobotFileNext n=$n"
147 dbgmsg "ns=$ns($task)"
152 proc RobotFileExist {task area host path} {
153 global debuglevel control
155 if {$debuglevel > 3} {
156 dbgmsg "RobotFileExist begin area=$area host=$host path=$path"
158 set target $control($task,target)
159 return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
162 proc RobotFileUnlink {task area host path} {
163 global status control
165 set target $control($task,target)
166 # dbgmsg "RobotFileUnlink begin"
167 # dbgmsg "area=$area host=$host path=$path"
168 set npath [fnameEncode $target/$area/$host${path}_.tkl]
169 # dbgmsg "npath=$npath"
170 set comp [split $npath /]
171 if {[catch {exec rm $npath}]} return
173 set l [llength $comp]
175 incr status($task,$area) -1
176 for {set i $l} {$i > 0} {incr i -1} {
177 set path [join [lrange $comp 0 $i] /]
178 if {![catch {glob $path/*}]} return
181 # dbgmsg "RobotFileUnlink end"
184 proc RobotFileClose {out} {
185 if [string compare $out stdout] {
190 proc RobotFileOpen {task area host path {mode w}} {
192 global workdir status debuglevel control
194 # dbgmsg "RobotFileOpen task=$task path=$path"
196 set target $control($task,target)
197 set path [fnameEncode $path]
199 if {![info exists workdir]} {
202 if {$debuglevel > 3} {
203 dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
205 if {[string compare $orgPwd $workdir]} {
206 dbgmsg "ooops. RobotFileOpen failed"
207 dbgmsg "workdir = $workdir"
208 dbgmsg "pwd = $orgPwd"
212 set comp [split $target/$area/$host /]
213 set len [llength $comp]
216 # dbgmsg "1 comp=$comp"
218 for {set i 0} {$i <= $len} {incr i} {
219 set d [lindex $comp $i]
220 if {[string length $d] == 0} {
222 } elseif {[catch {cd $d}]} {
225 if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
226 if {[string compare $path /robots.txt]} {
227 set out [open robots.txt_.tkl w]
228 dbgmsg "creating robots.txt in $d"
230 incr status($task,unvisited)
236 set comp [split $path /]
237 set len [llength $comp]
240 # dbgmsg "2 path=$path comp=$comp"
242 for {set i 0} {$i < $len} {incr i} {
243 set d [lindex $comp $i]
244 if {[string length $d] > 0} {
245 if {[catch {cd $d}]} {
251 set d [lindex $comp $len]
252 set out [open ${d}_.tkl $mode]
254 incr status($task,$area)
260 proc RobotStartJob {root task} {
263 set fname "$root$task"
264 set f [open $fname r]
266 dbgmsg "Reading $fname"
268 # task type must be 2
269 if {![regexp {<tasktype>([^<]*)</tasktype>} $xml x tasktype]} {
272 set tasktype [string trim $tasktype]
273 if {![string match 2 $tasktype]} {
276 # status must not be finished or error
277 if {![regexp {<status>([^<]*)</status>} $xml x status]} {
280 if {$status == "finished"} {
281 dbgmsg "already finished"
284 if {$status == "error"} {
285 dbgmsg "already finished due to error"
288 # ignore if task has already been processed
289 dbgmsg "status = $status"
290 if {![CreateTask $task]} {
293 set control($task,taskfname) $fname
294 dbgmsg "Reading $fname stage 2"
297 lappend starturls $body
300 set action $parm(action)
301 if {$type == "domain"} {
302 $action url http://$body/*
304 if {$type == "url"} {
307 if {$type == "mime"} {
311 set ex [file rootname [file tail $task]]
312 #set control($task,target) "$root$body/$ex"
313 set control($task,target) "$control(tmpdir)/$ex"
314 set control($task,output) "$root$body"
316 set control($task,distance) $body
318 set control($task,filestatus) $body
320 set control($task,tasktype) $body
323 if {[info exists starturls]} {
324 foreach url $starturls {
325 puts "marking start urls $url"
330 if {$status == "pending"} {
331 regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
332 set f [open $fname w]
333 puts -nonewline $f $xml2
338 proc RobotDoneJob {task} {
339 global daemon_dir control
341 if {![info exists daemon_dir]} {
344 set fname $control($task,taskfname)
345 set f [open $fname r]
347 dbgmsg "Reading $fname"
348 regexp {<status>([^<]*)</status>} $xml x status
350 dbgmsg "status = $status"
353 regsub {<status>[^<]*</status>} $xml {<status>finished</status>} xml2
354 set f [open $fname w]
355 puts -nonewline $f $xml2
359 proc RobotScanDir {} {
362 if {![info exists daemon_dir]} {
365 foreach d $daemon_dir {
366 if {[catch {set files [glob $d/*.spl]}]} {
369 foreach fname $files {
370 if {[file isfile $fname] && [file readable $fname]} {
371 set jobfile [open $fname]
372 gets $jobfile portalroot
373 gets $jobfile portaltask
376 RobotStartJob $portalroot $portaltask
382 proc RobotRR {task} {
383 global control robotsRunning tasks robotsMax status
385 dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
386 incr robotsRunning -1
388 # only one task gets through...
389 if {[string compare [lindex $tasks 0] $task]} {
392 dbgmsg "RobotRR. task = $task"
393 while {$robotsRunning} {
397 if {[catch {RobotScanDir} msg]} {
398 logmsg "RobotScanDir failed"
402 set target $control($t,target)
403 set statusfile [open $target/status w]
404 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
406 set control($t,seq) 0
411 proc RobotDaemonSig {} {
417 proc RobotDaemonLoop {} {
418 global daemon_cnt tasks robotsRunning status
425 if {[info exists tasks]} {
426 logmsg "daemon loop tasks $tasks"
428 set control($t,seq) 0
431 while {$robotsRunning} {
435 after 30000 RobotDaemonSig
440 proc RobotRestart {task url sock} {
441 global URL robotsRunning
444 after cancel $URL($sock,cancel)
446 foreach v [array names URL $task,$url,*] {
450 incr robotsRunning -1
454 proc RobotStart {task} {
456 global robotsRunning robotsMax idletime status tasks
458 # dbgmsg "RobotStart $task running=$robotsRunning"
460 set url [RobotFileNext $task unvisited]
461 if {[string compare $url done] == 0} {
462 dbgmsg "In RobotStart task $task done"
466 if {[string compare $t $task]} {
469 dbgmsg "task $t done"
472 if {![info exists ntasks]} {
481 if {![string length $url]} {
485 if {[string compare $url wait] == 0} {
486 after $idletime [list RobotRR $task]
489 set r [RobotGetUrl $task $url {}]
491 if {$robotsRunning >= $robotsMax} return
493 incr robotsRunning -1
494 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
495 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
498 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
503 proc headSave {task url out} {
506 if {[info exists URL($task,$url,head,last-modified)]} {
507 puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
510 if {[info exists URL($task,$url,head,date)]} {
511 puts $out " <date>$URL($task,$url,head,date)</date>"
513 if {[info exists URL($task,$url,head,content-length)]} {
514 puts $out " <by>$URL($task,$url,head,content-length)</by>"
516 if {[info exists URL($task,$url,head,server)]} {
517 puts $out " <format>$URL($task,$url,head,server)</format>"
520 puts $out {<publisher>}
521 puts $out " <identifier>$url</identifier>"
522 if {[info exists URL($task,$url,head,content-type)]} {
523 puts $out " <type>$URL($task,$url,head,content-type)</type>"
525 puts $out {</publisher>}
528 proc RobotHref {task url hrefx hostx pathx} {
529 global URL control debuglevel
534 if {$debuglevel > 1} {
535 dbgmsg "Ref input url = $url href=$href"
538 if {[string first { } $href] >= 0} {
541 if {[string length $href] > 256} {
545 # Skip pages that have ? in them
546 # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
549 # get method (if any)
550 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
554 if {[string compare $method http]} {
559 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
560 if {![string length $surl]} {
563 if {[info exist control($task,domains)]} {
565 foreach domain $control($task,domains) {
566 if {[string match $domain $host]} {
576 regexp {^([^\#]*)} $hpath x surl
577 set host $URL($task,$url,hostport)
579 if {![string length $surl]} {
582 if {[string first / $surl]} {
584 set curpath $URL($task,$url,path)
585 if {[info exists URL($task,$url,bpath)]} {
586 set curpath $URL($task,$url,bpath)
588 regexp {^([^\#?]*)} $curpath x dpart
589 set l [string last / $dpart]
590 if {[expr $l >= 0]} {
591 set surl [string range $dpart 0 $l]$surl
593 set surl $dpart/$surl
596 set surllist [split $surl /]
599 foreach c $surllist {
604 set path [lrange $path 0 $pathl]
617 if {$debuglevel > 4} {
618 dbgmsg "pathl=$pathl output path=$path"
620 set path [join $path /]
621 if {![string length $path]} {
624 regsub -all {~} $path {%7E} path
625 set href "$method://$host$path"
627 if {$debuglevel > 1} {
628 dbgmsg "Ref result = $href"
630 return [checkrule $task url $href]
633 proc RobotError {task url code} {
636 dbgmsg "Bad URL $url (code $code)"
639 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
640 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
641 RobotReadRecord $inf fromurl distance
644 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
645 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
646 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
647 RobotWriteRecord $outf $fromurl $distance
652 proc RobotRedirect {task url tourl code} {
655 dbgmsg "Redirecting from $url to $tourl"
659 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
660 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
661 RobotReadRecord $inf fromurl distance
664 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
665 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
666 RobotWriteRecord $outf $fromurl $distance
669 if {[RobotHref $task $url tourl host path]} {
670 if {![RobotFileExist $task visited $host $path]} {
671 if {![RobotFileExist $task unvisited $host $path]} {
672 set outf [RobotFileOpen $task unvisited $host $path]
673 RobotWriteRecord $outf $fromurl $distance
678 set inf [RobotFileOpen $task visited $host $path r]
679 RobotReadRecord $inf oldurl olddistance
681 if {[string length $olddistance] == 0} {
684 if {[string length $distance] == 0} {
687 dbgmsg "distance=$distance olddistance=$olddistance"
688 if {[expr $distance < $olddistance]} {
689 set outf [RobotFileOpen $task unvisited $host $path]
690 RobotWriteRecord $outf $tourl $distance
695 if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
696 dbgmsg "unlink failed"
701 proc wellform {body} {
702 regsub -all {<!--[^-]*-->} $body { } abody
703 regsub -all -nocase {<script[^<]*</script>} $abody {} body
704 regsub -all {<[^\>]+>} $body {} abody
705 regsub -all { } $abody { } body
706 regsub -all {&} $body {&} abody
710 proc link {task url out href body distance} {
712 if {[expr $distance > $control($task,distance)]} return
714 if {![RobotHref $task $url href host path]} return
716 if ($control($task,cr)) {
718 puts $out "<identifier>$href</identifier>"
719 set abody [wellform $body]
720 puts $out "<description>$abody</description>"
724 if {![RobotFileExist $task visited $host $path]} {
726 if {![RobotFileExist $task bad $host $path]} {
727 if {[RobotFileExist $task unvisited $host $path]} {
728 set inf [RobotFileOpen $task unvisited $host $path r]
729 RobotReadRecord $inf oldurl olddistance
735 if {[string length $olddistance] == 0} {
738 if {[expr $distance < $olddistance]} {
739 set outf [RobotFileOpen $task unvisited $host $path]
740 RobotWriteRecord $outf $url $distance
743 } elseif {[string compare $href $url]} {
744 set inf [RobotFileOpen $task visited $host $path r]
745 RobotReadRecord $inf xurl olddistance
747 if {[string length $olddistance] == 0} {
750 if {[expr $distance < $olddistance]} {
751 dbgmsg "OK remarking url=$url href=$href"
752 dbgmsg "olddistance = $olddistance"
753 dbgmsg "newdistance = $distance"
754 set outf [RobotFileOpen $task unvisited $host $path]
755 RobotWriteRecord $outf $url $distance
761 proc RobotTextTkl {task url out} {
764 # set title so we can emit it for the body
766 # if true, nothing will be indexed
768 # if true, nothing will be followed
771 puts $control($task,output)
774 set distance distance
776 htmlSwitch $URL($task,$url,buf) \
778 # når title tag er hittet, er body set til indholdet af tagget
781 #puts -nonewline $out "<meta"
782 # al er list med attribut navne som fandtes ind i parm hash
783 set al [array names parm]
784 # løkke igennem attributer
786 set al [string tolower $a]
787 #puts -nonewline $out " $al"
788 #puts -nonewline $out {="}
789 #puts -nonewline $out $parm($a)
790 #puts -nonewline $out {"}
793 #puts $out "></meta>"
795 # don't print title of document content if noindex is used
797 #puts $out "<title>$title</title>"
798 # xml compilancy added
799 set bbody [wellform $body]
800 #puts $out "<documentcontent>"
802 #puts $out "</documentcontent>"
806 if {![info exists parm(href)]} {
809 set href [string trim $parm(href)]
811 # <a href="...."> .. </a>
812 # we're not using nonest - otherwise body isn't set
813 if {$nofollow} continue
814 if {![info exists parm(href)]} {
817 #puts "link $task $url $out [string trim $parm(href)] $body $distance"
819 if {$nofollow} continue
820 if {![info exists parm(href)]} {
823 #puts "link $task $url $out [string trim $parm(href)] $body $distance"
825 if {![info exists parm(src)]} {
828 #puts "link $task $url $out [string trim $parm(src)] $body $fdistance"
832 proc RobotTextHtml {task url out} {
835 # set title so we can emit it for the body
837 # if true, nothing will be indexed
839 # if true, nothing will be followed
844 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
845 set fdistance $URL($task,$url,dist)
846 set distance [expr $fdistance + 1]
848 htmlSwitch $URL($task,$url,buf) \
852 # collect metadata and save NAME= CONTENT=..
855 puts -nonewline $out "<meta"
856 set al [array names parm]
858 set al [string tolower $a]
859 puts -nonewline $out " $al"
860 puts -nonewline $out {="}
861 puts -nonewline $out $parm($a)
862 puts -nonewline $out {"}
865 set metaname [string tolower $parm($a)]
868 set metacontent $parm($a)
874 # go through robots directives (af any)
875 if {![string compare $metaname robots]} {
876 set direcs [split [string tolower $metacontent] ,]
877 if {[lsearch $direcs noindex] >= 0} {
880 if {[lsearch $direcs nofollow] >= 0} {
885 # don't print title of document content if noindex is used
887 puts $out "<title>$title</title>"
888 set bbody [wellform $body]
889 puts $out "<documentcontent>"
891 puts $out "</documentcontent>"
895 if {![info exists parm(href)]} {
898 set href [string trim $parm(href)]
899 if {![RobotHref $task $url href host path]} continue
900 set URL($task,$url,bpath) $path
902 # <a href="...."> .. </a>
903 # we're not using nonest - otherwise body isn't set
904 if {$nofollow} continue
905 if {![info exists parm(href)]} {
908 link $task $url $out [string trim $parm(href)] $body $distance
910 if {$nofollow} continue
911 if {![info exists parm(href)]} {
914 link $task $url $out [string trim $parm(href)] $body $distance
916 if {![info exists parm(src)]} {
919 link $task $url $out [string trim $parm(src)] $body $fdistance
923 proc RobotsTxt {task url} {
926 RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
929 proc RobotsTxt0 {task v buf} {
932 foreach l [split $buf \n] {
933 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
934 set arg [string trim $arg]
935 dbgmsg "cmd=$cmd arg=$arg"
936 switch -- [string tolower $cmd] {
939 set pat [string tolower $arg]*
940 set section [string match $pat $agent]
944 dbgmsg "rule [list 0 $arg]"
945 lappend $v [list 0 $arg]
950 dbgmsg "rule [list 1 $arg]"
951 lappend $v [list 1 $arg]
959 proc RobotTextPlain {task url out} {
962 puts $out "<documentcontent>"
963 regsub -all {<} $URL($task,$url,buf) {\<} content
965 puts $out "</documentcontent>"
967 if {![string compare $URL($task,$url,path) /robots.txt]} {
972 proc RobotWriteMetadata {task url out} {
975 set charset $URL($task,$url,charset)
976 puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
980 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
981 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
982 RobotReadRecord $inf fromurl distance
985 set URL($task,$url,dist) $distance
986 puts $out "<distance>"
987 puts $out " $distance"
988 puts $out "</distance>"
989 headSave $task $url $out
990 logmsg "Parsing $url distance=$distance"
991 switch $URL($task,$url,head,content-type) {
993 if {[string length $distance]} {
994 RobotTextHtml $task $url $out
995 RobotTextTkl $task $url $out
999 RobotTextPlain $task $url $out
1002 puts $out "</zmbot>"
1005 proc Robot200 {task url} {
1008 set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
1009 puts -nonewline $out $URL($task,$url,buf)
1012 set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
1013 RobotWriteMetadata $task $url $out
1016 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
1019 proc RobotReadContent {task url sock binary} {
1022 set buffer [read $sock 16384]
1023 set readCount [string length $buffer]
1025 if {$readCount <= 0} {
1027 RobotRestart $task $url $sock
1028 } elseif {!$binary && [string first \0 $buffer] >= 0} {
1030 RobotRestart $task $url $sock
1032 # dbgmsg "Got $readCount bytes"
1033 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
1037 proc RobotReadHeader {task url sock} {
1038 global URL debuglevel
1040 if {$debuglevel > 1} {
1041 dbgmsg "HTTP head $url"
1043 if {[catch {set buffer [read $sock 2148]}]} {
1044 RobotError $task $url 404
1045 RobotRestart $task $url $sock
1048 set readCount [string length $buffer]
1050 if {$readCount <= 0} {
1051 RobotError $task $url 404
1052 RobotRestart $task $url $sock
1054 # dbgmsg "Got $readCount bytes"
1055 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
1057 set n [string first \r\n\r\n $URL($task,$url,buf)]
1061 set headbuf [string range $URL($task,$url,buf) 0 $n]
1063 set URL($task,$url,charset) ISO-8859-1
1064 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
1066 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
1067 set lines [split $headbuf \n]
1068 foreach line $lines {
1069 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
1070 set URL($task,$url,head,[string tolower $name]) [string trim $value]
1072 regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
1074 dbgmsg "HTTP CODE $code"
1075 set URL($task,$url,state) skip
1078 RobotRedirect $task $url $URL($task,$url,head,location) 301
1079 RobotRestart $task $url $sock
1082 RobotRedirect $task $url $URL($task,$url,head,location) 302
1083 RobotRestart $task $url $sock
1086 if {![info exists URL($task,$url,head,content-type)]} {
1087 set URL($task,$url,head,content-type) {}
1090 switch -glob -- $URL($task,$url,head,content-type) {
1095 if {![regexp {/robots.txt$} $url]} {
1096 if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
1097 RobotError $task $url mimedeny
1098 RobotRestart $task $url $sock
1102 fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
1105 RobotError $task $url $code
1106 RobotRestart $task $url $sock
1113 proc RobotSockCancel {task url sock} {
1115 logmsg "RobotSockCancel sock=$sock url=$url"
1116 RobotError $task $url 401
1117 RobotRestart $task $url $sock
1120 proc RobotConnect {task url sock} {
1121 global URL agent acceptLanguage
1123 fconfigure $sock -translation {lf crlf} -blocking 0
1124 fileevent $sock readable [list RobotReadHeader $task $url $sock]
1125 puts $sock "GET $URL($task,$url,path) HTTP/1.0"
1126 puts $sock "Host: $URL($task,$url,host)"
1127 puts $sock "User-Agent: $agent"
1128 if {[string length $acceptLanguage]} {
1129 puts $sock "Accept-Language: $acceptLanguage"
1132 set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
1133 if {[catch {flush $sock}]} {
1134 RobotError $task $url 404
1135 RobotRestart $task $url $sock
1143 proc RobotGetUrl {task url phost} {
1144 global URL robotsRunning
1146 dbgmsg "Retrieve running=$robotsRunning url=$url task=$task"
1147 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
1150 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
1154 set URL($task,$url,method) $method
1155 set URL($task,$url,host) $host
1156 set URL($task,$url,hostport) $hostport
1157 set URL($task,$url,path) $path
1158 set URL($task,$url,state) head
1159 set URL($task,$url,buf) {}
1161 if {[string compare $path /robots.txt]} {
1163 if {![info exists URL($hostport,robots)]} {
1164 dbgmsg "READING robots.txt for host $hostport"
1165 if {[RobotFileExist $task visited $hostport /robots.txt]} {
1166 set inf [RobotFileOpen $task visited $hostport /robots.txt r]
1167 set buf [read $inf 32768]
1170 set buf "User-agent: *\nAllow: /\n"
1172 RobotsTxt0 $task URL($hostport,robots) $buf
1174 if {[info exists URL($hostport,robots)]} {
1175 foreach l $URL($hostport,robots) {
1176 if {[string first [lindex $l 1] $path] == 0} {
1177 set ok [lindex $l 0]
1183 dbgmsg "skipped due to robots.txt"
1187 if [catch {set sock [socket -async $host $port]}] {
1190 RobotConnect $task $url $sock
1198 if {![llength [info commands htmlSwitch]]} {
1199 if {[info exists env(tclrobot_lib)]} {
1200 set d $env(tclrobot_lib)
1202 if { $libdir > "" } {
1208 set e [info sharedlibextension]
1209 dbgmsg "About to load $d/tclrobot$e"
1210 if {[catch {load $d/tclrobot$e}]} {
1211 dbgmsg "Didn't get at $d, trying directly"
1214 dbgmsg "Loaded tclrobot$e all right"
1218 set agent "zmbot/0.2"
1219 if {![catch {set os [exec uname -s -r]}]} {
1220 set agent "$agent ($os)"
1223 dbgmsg "agent: $agent"
1231 # Rules: allow, deny, url
1233 proc checkrule {task type this} {
1239 if {$debuglevel > 3} {
1240 dbgmsg "CHECKRULE $type $this"
1242 if {[info exist control($task,alrules)]} {
1243 foreach l $control($task,alrules) {
1244 if {$debuglevel > 3} {
1245 dbgmsg "consider $l"
1248 if {[lindex $l 1] != $type} continue
1249 # consider mask (! negates)
1250 set masks [lindex $l 2]
1253 foreach mask $masks {
1254 if {$debuglevel > 4} {
1255 dbgmsg "consider single mask $mask"
1257 if {[string index $mask 0] == "!"} {
1258 set mask [string range $mask 1 end]
1259 if {[string match $mask $this]} continue
1261 if {![string match $mask $this]} continue
1265 if {$debuglevel > 4} {
1269 # OK, we have a match
1270 if {[lindex $l 0] == "allow"} {
1271 if {$debuglevel > 3} {
1272 dbgmsg "CHECKRULE MATCH OK"
1276 if {$debuglevel > 3} {
1277 dbgmsg "CHECKFULE MATCH FAIL"
1283 if {$debuglevel > 3} {
1284 dbgmsg "CHECKRULE MATCH DEFAULT $default_ret"
1291 global debuglevel task
1293 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1294 if {![RobotFileExist $task visited $host $path]} {
1295 set outf [RobotFileOpen $task unvisited $host $path]
1296 RobotWriteRecord $outf href 0
1297 RobotFileClose $outf
1302 proc deny {type stuff} {
1305 lappend control($task,alrules) [list deny $type $stuff]
1308 proc allow {type stuff} {
1311 lappend control($task,alrules) [list allow $type $stuff]
1314 proc debug {level} {
1317 set debuglevel $level
1320 proc CreateTask {t} {
1321 global tasks task status control
1325 if {[info exists tasks]} {
1326 if {[lsearch -exact $tasks $t] >= 0} {
1332 set status($t,unvisited) 0
1333 set status($t,visited) 0
1334 set status($t,bad) 0
1335 set status($t,raw) 0
1336 set status($t,active) 1
1337 set control($t,seq) 0
1338 set control($t,distance) 10
1339 set control($t,target) tmp
1340 set control($t,output) output
1341 set control($t,cr) 0
1345 # Little utility that ensures that at least one task is present (main).
1346 proc CreateMainTask {} {
1348 if {![info exist tasks]} {
1356 set l [llength $argv]
1359 puts {tclrobot: usage:}
1360 puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c
1361 count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]}
1362 logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1369 set arg [lindex $argv $i]
1370 switch -glob -- $arg {
1372 set fname [string range $arg 2 end]
1373 if {![string length $fname]} {
1374 set fname [lindex $argv [incr i]]
1376 set loghandle [open $fname a]
1377 #dbgmsg "agent: $agent"
1381 set pidfname [string range $arg 2 end]
1382 if {![string length $pidfname]} {
1383 set pidfname [lindex $argv [incr i]]
1385 #dbgmsg "-p $pidfname"
1386 if {[file exists $pidfname]} {
1387 set pf [open $pidfname]
1390 logmsg "File $pidfname already exist. pid=$oldpid"
1391 if {[file isdirectory /proc/$oldpid]} {
1392 logmsg "And it's apparently running. Exiting."
1396 set pf [open $pidfname w]
1401 set tmpdir [string range $arg 2 end]
1402 if {![string length $tmpdir]} {
1403 set tmpdir [lindex $argv [incr i]]
1405 set control(tmpdir) $tmpdir
1408 set libdir [string range $arg 2 end]
1409 if {![string length $libdir]} {
1410 set libdir [lindex $argv [incr i]]
1414 set t [string range $arg 2 end]
1415 if {![string length $t]} {
1416 set t [lindex $argv [incr i]]
1421 set dir [string range $arg 2 end]
1422 if {![string length $dir]} {
1423 set dir [lindex $argv [incr i]]
1425 lappend daemon_dir $dir
1428 set robotsMax [string range $arg 2 end]
1429 if {![string length $robotsMax]} {
1430 set robotsMax [lindex $argv [incr i]]
1435 set control($task,distance) [string range $arg 2 end]
1436 if {![string length $control($task,distance)]} {
1437 set control($task,distance) [lindex $argv [incr i]]
1442 set dom [string range $arg 2 end]
1443 if {![string length $dom]} {
1444 set dom [lindex $argv [incr i]]
1446 lappend control($task,domains) $dom
1449 set idletime [string range $arg 2 end]
1450 if {![string length $idletime]} {
1451 set idletime [lindex $argv [incr i]]
1456 set acceptLanguage [string range $arg 2 end]
1457 if {![string length $acceptLanguage]} {
1458 set acceptLanguage [lindex $argv [incr i]]
1463 set rfile [string range $arg 2 end]
1464 if {![string length $rfile]} {
1465 set rfile [lindex $argv [incr i]]
1467 catch {unset maxdistance}
1469 if {[info exists maxdistance]} {
1470 set control($task,distance) $maxdistance
1476 #dbgmsg "in default: arg= $arg !!!"
1478 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1479 if {![RobotFileExist $task visited $host $path]} {
1480 set outf [RobotFileOpen $task unvisited $host $path]
1481 RobotWriteRecord $outf href 0
1482 RobotFileClose $outf
1491 dbgmsg "Parsed args, now loading"
1494 if {![info exist robotsMax]} {
1498 if {[info exist daemon_dir]} {
1499 logmsg "Daemon mode"
1504 logmsg "max distance=$control($t,distance)"
1505 if {[info exists control($t,domains)]} {
1506 logmsg "domains=$control($t,domains)"
1509 logmsg "max jobs=$robotsMax"
1515 while {$robotsRunning} {
1519 if {[info exists tasks]} {
1521 set statusfile [open $t/status w]
1522 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"