#!/usr/bin/tclsh # $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $ # set loghandle stdout set robotsRunning 0 set workdir [pwd] set idletime 15000 set acceptLanguage {} set debuglevel 1 set libdir "" proc logmsg {msg} { global loghandle puts $loghandle $msg flush $loghandle } #proc dbgmsg {level msg} { # global debuglevel # if {[expr $debuglevel >= $level]} { # logmsg $msg # } #} proc dbgmsg {msg} { global debuglevel if {[expr $debuglevel >= 0]} { logmsg $msg } } # dbgmsg is always called with just one string! proc fnameEncode {fname} { regsub -all {&} $fname {%38} fname regsub -all {<} $fname {%3C} fname regsub -all {>} $fname {%3E} fname regsub -all {\?} $fname {%3F} fname regsub -all {\*} $fname {%2A} fname return $fname } proc fnameDecode {fname} { regsub -all {%38} $fname {&} fname regsub -all {%3C} $fname {<} fname regsub -all {%3E} $fname {>} fname regsub -all {%3F} $fname {?} fname regsub -all {%2A} $fname {*} fname return $fname } proc RobotFileNext1 {area lead} { # dbgmsg "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } foreach n $ns { if {[file isfile $n]} { set off [string last / $n] # skip / incr off set end [string length $n] # skip _.tkl incr end -6 return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] # skip / incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb } } } return {} } proc RobotWriteRecord {outf fromurl distance} { puts $outf {} puts $outf "" puts $outf "" puts $outf $distance puts $outf "" puts $outf "" puts $outf $fromurl puts $outf "" puts $outf "" } proc RobotReadRecord {inf fromurlx distancex} { upvar $fromurlx fromurl upvar $distancex distance gets $inf gets $inf gets $inf set distance [string trim [gets $inf]] # dbgmsg "got distance = $distance" gets $inf gets $inf set fromurl [string trim [gets $inf]] } proc RobotFileNext {task area} { global control global idletime ns global status # dbgmsg "RobotFileNext seq=$control($task,seq)" if {$control($task,seq) < 0} { return {} } set target $control($task,target) if {$control($task,seq) == 0} { if {[catch {set ns($task) [glob $target/$area/*]}]} { puts "----------- DONE-------- target=$target" return done } } # dbgmsg "ns=$ns($task)" set off [string length $target/$area] incr off set n [lindex $ns($task) $control($task,seq)] # dbgmsg "n=$n" if {![string length $n]} { set control($task,seq) -1 set statusfile [open $target/status w] puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } incr control($task,seq) if {[file isfile $n/robots.txt_.tkl]} { # dbgmsg "ok returning http://[string range $n $off end]/robots.txt" return [fnameDecode http://[string range $n $off end]/robots.txt] } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] if {[string length $sb]} { return [fnameDecode $sb] } } dbgmsg "no more work at end of RobotFileNext n=$n" dbgmsg "ns=$ns($task)" return {} } proc RobotFileExist {task area host path} { global debuglevel control if {$debuglevel > 3} { dbgmsg "RobotFileExist begin area=$area host=$host path=$path" } set target $control($task,target) return [file exists [fnameEncode $target/$area/$host${path}_.tkl]] } proc RobotFileUnlink {task area host path} { global status control set target $control($task,target) # dbgmsg "RobotFileUnlink begin" # dbgmsg "area=$area host=$host path=$path" set npath [fnameEncode $target/$area/$host${path}_.tkl] # dbgmsg "npath=$npath" set comp [split $npath /] if {[catch {exec rm $npath}]} return set l [llength $comp] incr l -2 incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return exec rmdir $path } # dbgmsg "RobotFileUnlink end" } proc RobotFileClose {out} { if [string compare $out stdout] { close $out } } proc RobotFileOpen {task area host path {mode w}} { set orgPwd [pwd] global workdir status debuglevel control # dbgmsg "RobotFileOpen task=$task path=$path" set target $control($task,target) set path [fnameEncode $path] if {![info exists workdir]} { return stdout } if {$debuglevel > 3} { dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" } if {[string compare $orgPwd $workdir]} { dbgmsg "ooops. RobotFileOpen failed" dbgmsg "workdir = $workdir" dbgmsg "pwd = $orgPwd" exit 1 } set comp [split $target/$area/$host /] set len [llength $comp] incr len -1 # dbgmsg "1 comp=$comp" for {set i 0} {$i <= $len} {incr i} { set d [lindex $comp $i] if {[string length $d] == 0} { cd / } elseif {[catch {cd $d}]} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == $len && $mode == "w"} { if {[string compare $path /robots.txt]} { set out [open robots.txt_.tkl w] dbgmsg "creating robots.txt in $d" close $out incr status($task,unvisited) } } } } set comp [split $path /] set len [llength $comp] incr len -1 # dbgmsg "2 path=$path comp=$comp" for {set i 0} {$i < $len} {incr i} { set d [lindex $comp $i] if {[string length $d] > 0} { if {[catch {cd $d}]} { exec mkdir $d cd ./$d } } } set d [lindex $comp $len] set out [open ${d}_.tkl $mode] if {$mode == "w"} { incr status($task,$area) } cd $orgPwd return $out } proc RobotStartJob {root task} { global control set fname "$root$task" set f [open $fname r] set xml [read $f] dbgmsg "Reading $fname" close $f # task type must be 2 if {![regexp {([^<]*)} $xml x tasktype]} { return } set tasktype [string trim $tasktype] if {![string match 2 $tasktype]} { return } # status must not be finished or error if {![regexp {([^<]*)} $xml x status]} { return } if {$status == "finished"} { dbgmsg "already finished" return } if {$status == "error"} { dbgmsg "already finished due to error" return } # ignore if task has already been processed dbgmsg "status = $status" if {![CreateTask $task]} { return } set control($task,taskfname) $fname dbgmsg "Reading $fname stage 2" htmlSwitch $xml \ url { lappend starturls $body } filter { set type $parm(type) set action $parm(action) if {$type == "domain"} { $action url http://$body/* } if {$type == "url"} { $action url $body } if {$type == "mime"} { $action mime $body } } target { set ex [file rootname [file tail $task]] #set control($task,target) "$root$body/$ex" set control($task,target) "$control(tmpdir)/$ex" set control($task,output) "$root$body" } distance { set control($task,distance) $body } status { set control($task,filestatus) $body } tasktype { set control($task,tasktype) $body } if {[info exists starturls]} { foreach url $starturls { puts "marking start urls $url" url $url } } if {$status == "pending"} { regsub {[^<]*} $xml {running} xml2 set f [open $fname w] puts -nonewline $f $xml2 close $f } } proc RobotDoneJob {task} { global daemon_dir control if {![info exists daemon_dir]} { return } set fname $control($task,taskfname) set f [open $fname r] set xml [read $f] dbgmsg "Reading $fname" regexp {([^<]*)} $xml x status dbgmsg "------" dbgmsg "status = $status" close $f regsub {[^<]*} $xml {finished} xml2 set f [open $fname w] puts -nonewline $f $xml2 close $f } proc RobotScanDir {} { global daemon_dir if {![info exists daemon_dir]} { return } foreach d $daemon_dir { if {[catch {set files [glob $d/*.spl]}]} { return } foreach fname $files { if {[file isfile $fname] && [file readable $fname]} { set jobfile [open $fname] gets $jobfile portalroot gets $jobfile portaltask close $jobfile RobotStartJob $portalroot $portaltask } } } } proc RobotRR {task} { global control robotsRunning tasks robotsMax status dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 # only one task gets through... if {[string compare [lindex $tasks 0] $task]} { return } dbgmsg "RobotRR. task = $task" while {$robotsRunning} { vwait robotsRunning } dbgmsg "Scan" if {[catch {RobotScanDir} msg]} { logmsg "RobotScanDir failed" logmsg $msg } foreach t $tasks { set target $control($t,target) set statusfile [open $target/status w] puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" close $statusfile set control($t,seq) 0 RobotStart $t } } proc RobotDaemonSig {} { global daemon_cnt incr daemon_cnt } proc RobotDaemonLoop {} { global daemon_cnt tasks robotsRunning status set daemon_cnt 0 while 1 { logmsg $daemon_cnt RobotScanDir if {[info exists tasks]} { logmsg "daemon loop tasks $tasks" foreach t $tasks { set control($t,seq) 0 RobotStart $t } while {$robotsRunning} { vwait robotsRunning } } after 30000 RobotDaemonSig vwait daemon_cnt } } proc RobotRestart {task url sock} { global URL robotsRunning close $sock after cancel $URL($sock,cancel) foreach v [array names URL $task,$url,*] { unset URL($v) } incr robotsRunning -1 RobotStart $task } proc RobotStart {task} { global URL global robotsRunning robotsMax idletime status tasks # dbgmsg "RobotStart $task running=$robotsRunning" while {1} { set url [RobotFileNext $task unvisited] if {[string compare $url done] == 0} { dbgmsg "In RobotStart task $task done" catch {unset ntasks} foreach t $tasks { if {[string compare $t $task]} { lappend ntasks $t } else { dbgmsg "task $t done" } } if {![info exists ntasks]} { unset tasks dbgmsg "all done" } else { set tasks $ntasks } RobotDoneJob $task return } if {![string length $url]} { return } incr robotsRunning if {[string compare $url wait] == 0} { after $idletime [list RobotRR $task] return } set r [RobotGetUrl $task $url {}] if {!$r} { if {$robotsRunning >= $robotsMax} return } else { incr robotsRunning -1 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotFileClose $outf } RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) } } } proc headSave {task url out} { global URL if {[info exists URL($task,$url,head,last-modified)]} { puts $out "$URL($task,$url,head,last-modified)" } puts $out {} if {[info exists URL($task,$url,head,date)]} { puts $out " $URL($task,$url,head,date)" } if {[info exists URL($task,$url,head,content-length)]} { puts $out " $URL($task,$url,head,content-length)" } if {[info exists URL($task,$url,head,server)]} { puts $out " $URL($task,$url,head,server)" } puts $out {} puts $out {} puts $out " $url" if {[info exists URL($task,$url,head,content-type)]} { puts $out " $URL($task,$url,head,content-type)" } puts $out {} } proc RobotHref {task url hrefx hostx pathx} { global URL control debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path if {$debuglevel > 1} { dbgmsg "Ref input url = $url href=$href" } if {[string first { } $href] >= 0} { return 0 } if {[string length $href] > 256} { return 0 } # Skip pages that have ? in them # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { # return 0 # } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href set method http } else { if {[string compare $method http]} { return 0 } } # get host (if any) if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} { if {![string length $surl]} { set surl / } if {[info exist control($task,domains)]} { set ok 0 foreach domain $control($task,domains) { if {[string match $domain $host]} { set ok 1 break } } if {!$ok} { return 0 } } } else { regexp {^([^\#]*)} $hpath x surl set host $URL($task,$url,hostport) } if {![string length $surl]} { return 0 } if {[string first / $surl]} { # relative path set curpath $URL($task,$url,path) if {[info exists URL($task,$url,bpath)]} { set curpath $URL($task,$url,bpath) } regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] if {[expr $l >= 0]} { set surl [string range $dpart 0 $l]$surl } else { set surl $dpart/$surl } } set surllist [split $surl /] catch {unset path} set pathl 0 foreach c $surllist { switch -- $c { .. { if {$pathl > 1} { incr pathl -2 set path [lrange $path 0 $pathl] incr pathl } } . { } default { incr pathl lappend path $c } } } if {$debuglevel > 4} { dbgmsg "pathl=$pathl output path=$path" } set path [join $path /] if {![string length $path]} { set path / } regsub -all {~} $path {%7E} path set href "$method://$host$path" if {$debuglevel > 1} { dbgmsg "Ref result = $href" } return [checkrule $task url $href] } proc RobotError {task url code} { global URL dbgmsg "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } proc RobotRedirect {task url tourl code} { global URL dbgmsg "Redirecting from $url to $tourl" set distance {} set fromurl {} if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } if {[RobotHref $task $url tourl host path]} { if {![RobotFileExist $task visited $host $path]} { if {![RobotFileExist $task unvisited $host $path]} { set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } else { set olddistance {} set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf if {[string length $olddistance] == 0} { set olddistance 1000 } if {[string length $distance] == 0} { set distance 1000 } dbgmsg "distance=$distance olddistance=$olddistance" if {[expr $distance < $olddistance]} { set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $tourl $distance RobotFileClose $outf } } } if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} { dbgmsg "unlink failed" exit 1 } } proc wellform {body} { regsub -all {} $body { } abody regsub -all -nocase {} $abody {} body regsub -all {<[^\>]+>} $body {} abody regsub -all { } $abody { } body regsub -all {&} $body {&} abody return $abody } proc link {task url out href body distance} { global URL control if {[expr $distance > $control($task,distance)]} return if {![RobotHref $task $url href host path]} return if ($control($task,cr)) { puts $out "" puts $out "$href" set abody [wellform $body] puts $out "$abody" puts $out "" } if {![RobotFileExist $task visited $host $path]} { set olddistance 1000 if {![RobotFileExist $task bad $host $path]} { if {[RobotFileExist $task unvisited $host $path]} { set inf [RobotFileOpen $task unvisited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf } } else { set olddistance 0 } if {[string length $olddistance] == 0} { set olddistance 1000 } if {[expr $distance < $olddistance]} { set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } elseif {[string compare $href $url]} { set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf xurl olddistance close $inf if {[string length $olddistance] == 0} { set olddistance 1000 } if {[expr $distance < $olddistance]} { dbgmsg "OK remarking url=$url href=$href" dbgmsg "olddistance = $olddistance" dbgmsg "newdistance = $distance" set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } } proc RobotTextTkl {task url out} { global URL control # set title so we can emit it for the body set title {} # if true, nothing will be indexed set noindex 0 # if true, nothing will be followed set nofollow 0 puts $control($task,output) set out stdout set distance distance htmlSwitch $URL($task,$url,buf) \ title { # når title tag er hittet, er body set til indholdet af tagget set title $body } -nonest meta { #puts -nonewline $out "" } body { # don't print title of document content if noindex is used if {!$noindex} { #puts $out "$title" # xml compilancy added set bbody [wellform $body] #puts $out "" #puts $out $bbody #puts $out "" } } -nonest base { # if {![info exists parm(href)]} { continue } set href [string trim $parm(href)] } a { # .. # we're not using nonest - otherwise body isn't set if {$nofollow} continue if {![info exists parm(href)]} { continue } #puts "link $task $url $out [string trim $parm(href)] $body $distance" } -nonest area { if {$nofollow} continue if {![info exists parm(href)]} { continue } #puts "link $task $url $out [string trim $parm(href)] $body $distance" } -nonest frame { if {![info exists parm(src)]} { continue } #puts "link $task $url $out [string trim $parm(src)] $body $fdistance" } } proc RobotTextHtml {task url out} { global URL control # set title so we can emit it for the body set title {} # if true, nothing will be indexed set noindex 0 # if true, nothing will be followed set nofollow 0 set distance 0 set fdistance 0 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} { set fdistance $URL($task,$url,dist) set distance [expr $fdistance + 1] } htmlSwitch $URL($task,$url,buf) \ title { set title $body } -nonest meta { # collect metadata and save NAME= CONTENT=.. set metaname {} set metacontent {} puts -nonewline $out "" # go through robots directives (af any) if {![string compare $metaname robots]} { set direcs [split [string tolower $metacontent] ,] if {[lsearch $direcs noindex] >= 0} { set noindex 1 } if {[lsearch $direcs nofollow] >= 0} { set nofollow 1 } } } body { # don't print title of document content if noindex is used if {!$noindex} { puts $out "$title" set bbody [wellform $body] puts $out "" puts $out $bbody puts $out "" } } -nonest base { # if {![info exists parm(href)]} { continue } set href [string trim $parm(href)] if {![RobotHref $task $url href host path]} continue set URL($task,$url,bpath) $path } a { # .. # we're not using nonest - otherwise body isn't set if {$nofollow} continue if {![info exists parm(href)]} { continue } link $task $url $out [string trim $parm(href)] $body $distance } -nonest area { if {$nofollow} continue if {![info exists parm(href)]} { continue } link $task $url $out [string trim $parm(href)] $body $distance } -nonest frame { if {![info exists parm(src)]} { continue } link $task $url $out [string trim $parm(src)] $body $fdistance } } proc RobotsTxt {task url} { global agent URL RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf) } proc RobotsTxt0 {task v buf} { global URL agent set section 0 foreach l [split $buf \n] { if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { set arg [string trim $arg] dbgmsg "cmd=$cmd arg=$arg" switch -- [string tolower $cmd] { user-agent { if {$section} break set pat [string tolower $arg]* set section [string match $pat $agent] } disallow { if {$section} { dbgmsg "rule [list 0 $arg]" lappend $v [list 0 $arg] } } allow { if {$section} { dbgmsg "rule [list 1 $arg]" lappend $v [list 1 $arg] } } } } } } proc RobotTextPlain {task url out} { global URL puts $out "" regsub -all {<} $URL($task,$url,buf) {\<} content puts $out $content puts $out "" if {![string compare $URL($task,$url,path) /robots.txt]} { RobotsTxt $task $url } } proc RobotWriteMetadata {task url out} { global URL set charset $URL($task,$url,charset) puts $out "" puts $out "" set distance 1000 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } set URL($task,$url,dist) $distance puts $out "" puts $out " $distance" puts $out "" headSave $task $url $out logmsg "Parsing $url distance=$distance" switch $URL($task,$url,head,content-type) { text/html { if {[string length $distance]} { RobotTextHtml $task $url $out RobotTextTkl $task $url $out } } text/plain { RobotTextPlain $task $url $out } } puts $out "" } proc Robot200 {task url} { global URL set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)] puts -nonewline $out $URL($task,$url,buf) RobotFileClose $out set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteMetadata $task $url $out RobotFileClose $out RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) } proc RobotReadContent {task url sock binary} { global URL set buffer [read $sock 16384] set readCount [string length $buffer] if {$readCount <= 0} { Robot200 $task $url RobotRestart $task $url $sock } elseif {!$binary && [string first \0 $buffer] >= 0} { Robot200 $task $url RobotRestart $task $url $sock } else { # dbgmsg "Got $readCount bytes" set URL($task,$url,buf) $URL($task,$url,buf)$buffer } } proc RobotReadHeader {task url sock} { global URL debuglevel if {$debuglevel > 1} { dbgmsg "HTTP head $url" } if {[catch {set buffer [read $sock 2148]}]} { RobotError $task $url 404 RobotRestart $task $url $sock return } set readCount [string length $buffer] if {$readCount <= 0} { RobotError $task $url 404 RobotRestart $task $url $sock } else { # dbgmsg "Got $readCount bytes" set URL($task,$url,buf) $URL($task,$url,buf)$buffer set n [string first \r\n\r\n $URL($task,$url,buf)] if {$n > 1} { set code 0 set version {} set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 set URL($task,$url,charset) ISO-8859-1 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code set lines [split $headbuf \n] foreach line $lines { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($task,$url,head,[string tolower $name]) [string trim $value] } regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) } dbgmsg "HTTP CODE $code" set URL($task,$url,state) skip switch $code { 301 { RobotRedirect $task $url $URL($task,$url,head,location) 301 RobotRestart $task $url $sock } 302 { RobotRedirect $task $url $URL($task,$url,head,location) 302 RobotRestart $task $url $sock } 200 { if {![info exists URL($task,$url,head,content-type)]} { set URL($task,$url,head,content-type) {} } set binary 1 switch -glob -- $URL($task,$url,head,content-type) { text/* { set binary 0 } } if {![regexp {/robots.txt$} $url]} { if {![checkrule $task mime $URL($task,$url,head,content-type)]} { RobotError $task $url mimedeny RobotRestart $task $url $sock return } } fileevent $sock readable [list RobotReadContent $task $url $sock $binary] } default { RobotError $task $url $code RobotRestart $task $url $sock } } } } } proc RobotSockCancel {task url sock} { logmsg "RobotSockCancel sock=$sock url=$url" RobotError $task $url 401 RobotRestart $task $url $sock } proc RobotConnect {task url sock} { global URL agent acceptLanguage fconfigure $sock -translation {lf crlf} -blocking 0 fileevent $sock readable [list RobotReadHeader $task $url $sock] puts $sock "GET $URL($task,$url,path) HTTP/1.0" puts $sock "Host: $URL($task,$url,host)" puts $sock "User-Agent: $agent" if {[string length $acceptLanguage]} { puts $sock "Accept-Language: $acceptLanguage" } puts $sock "" set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]] if {[catch {flush $sock}]} { RobotError $task $url 404 RobotRestart $task $url $sock } } proc RobotNop {} { } proc RobotGetUrl {task url phost} { global URL robotsRunning flush stdout dbgmsg "Retrieve running=$robotsRunning url=$url task=$task" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { set port 80 set host $hostport } set URL($task,$url,method) $method set URL($task,$url,host) $host set URL($task,$url,hostport) $hostport set URL($task,$url,path) $path set URL($task,$url,state) head set URL($task,$url,buf) {} if {[string compare $path /robots.txt]} { set ok 1 if {![info exists URL($hostport,robots)]} { dbgmsg "READING robots.txt for host $hostport" if {[RobotFileExist $task visited $hostport /robots.txt]} { set inf [RobotFileOpen $task visited $hostport /robots.txt r] set buf [read $inf 32768] close $inf } else { set buf "User-agent: *\nAllow: /\n" } RobotsTxt0 $task URL($hostport,robots) $buf } if {[info exists URL($hostport,robots)]} { foreach l $URL($hostport,robots) { if {[string first [lindex $l 1] $path] == 0} { set ok [lindex $l 0] break } } } if {!$ok} { dbgmsg "skipped due to robots.txt" return -1 } } if [catch {set sock [socket -async $host $port]}] { return -1 } RobotConnect $task $url $sock return 0 } proc loadlib {} { global libdir if {![llength [info commands htmlSwitch]]} { if {[info exists env(tclrobot_lib)]} { set d $env(tclrobot_lib) } else { if { $libdir > "" } { set d $libdir } else { set d . } } set e [info sharedlibextension] dbgmsg "About to load $d/tclrobot$e" if {[catch {load $d/tclrobot$e}]} { dbgmsg "Didn't get at $d, trying directly" load tclrobot$e } dbgmsg "Loaded tclrobot$e all right" } } set agent "zmbot/0.2" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } dbgmsg "agent: $agent" proc bgerror {m} { global errorInfo dbgmsg "BGERROR $m" dbgmsg $errorInfo } # Rules: allow, deny, url proc checkrule {task type this} { global control global debuglevel set default_ret 1 if {$debuglevel > 3} { dbgmsg "CHECKRULE $type $this" } if {[info exist control($task,alrules)]} { foreach l $control($task,alrules) { if {$debuglevel > 3} { dbgmsg "consider $l" } # consider type if {[lindex $l 1] != $type} continue # consider mask (! negates) set masks [lindex $l 2] set ok 0 set default_ret 0 foreach mask $masks { if {$debuglevel > 4} { dbgmsg "consider single mask $mask" } if {[string index $mask 0] == "!"} { set mask [string range $mask 1 end] if {[string match $mask $this]} continue } else { if {![string match $mask $this]} continue } set ok 1 } if {$debuglevel > 4} { dbgmsg "ok = $ok" } if {!$ok} continue # OK, we have a match if {[lindex $l 0] == "allow"} { if {$debuglevel > 3} { dbgmsg "CHECKRULE MATCH OK" } return 1 } else { if {$debuglevel > 3} { dbgmsg "CHECKFULE MATCH FAIL" } return 0 } } } if {$debuglevel > 3} { dbgmsg "CHECKRULE MATCH DEFAULT $default_ret" } return $default_ret } proc url {href} { global debuglevel task if {[RobotHref $task http://www.indexdata.dk/ href host path]} { if {![RobotFileExist $task visited $host $path]} { set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf href 0 RobotFileClose $outf } } } proc deny {type stuff} { global control task lappend control($task,alrules) [list deny $type $stuff] } proc allow {type stuff} { global control task lappend control($task,alrules) [list allow $type $stuff] } proc debug {level} { global debuglevel set debuglevel $level } proc CreateTask {t} { global tasks task status control set task $t if {[info exists tasks]} { if {[lsearch -exact $tasks $t] >= 0} { return 0 } } lappend tasks $t set status($t,unvisited) 0 set status($t,visited) 0 set status($t,bad) 0 set status($t,raw) 0 set status($t,active) 1 set control($t,seq) 0 set control($t,distance) 10 set control($t,target) tmp set control($t,output) output set control($t,cr) 0 return 1 } # Little utility that ensures that at least one task is present (main). proc CreateMainTask {} { global tasks if {![info exist tasks]} { CreateTask main } } # Parse options set i 0 set l [llength $argv] if {$l < 1} { puts {tclrobot: usage:} puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]} logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1 } while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { -o* { set fname [string range $arg 2 end] if {![string length $fname]} { set fname [lindex $argv [incr i]] } set loghandle [open $fname a] #dbgmsg "agent: $agent" #dbgmsg "-o $fname" } -p* { set pidfname [string range $arg 2 end] if {![string length $pidfname]} { set pidfname [lindex $argv [incr i]] } #dbgmsg "-p $pidfname" if {[file exists $pidfname]} { set pf [open $pidfname] gets $pf oldpid close $pf logmsg "File $pidfname already exist. pid=$oldpid" if {[file isdirectory /proc/$oldpid]} { logmsg "And it's apparently running. Exiting." exit 1 } } set pf [open $pidfname w] puts $pf [pid] close $pf } -T* { set tmpdir [string range $arg 2 end] if {![string length $tmpdir]} { set tmpdir [lindex $argv [incr i]] } set control(tmpdir) $tmpdir } -L* { set libdir [string range $arg 2 end] if {![string length $libdir]} { set libdir [lindex $argv [incr i]] } } -t* { set t [string range $arg 2 end] if {![string length $t]} { set t [lindex $argv [incr i]] } CreateTask $t } -D* { set dir [string range $arg 2 end] if {![string length $dir]} { set dir [lindex $argv [incr i]] } lappend daemon_dir $dir } -j* { set robotsMax [string range $arg 2 end] if {![string length $robotsMax]} { set robotsMax [lindex $argv [incr i]] } } -c* { CreateMainTask set control($task,distance) [string range $arg 2 end] if {![string length $control($task,distance)]} { set control($task,distance) [lindex $argv [incr i]] } } -d* { CreateMainTask set dom [string range $arg 2 end] if {![string length $dom]} { set dom [lindex $argv [incr i]] } lappend control($task,domains) $dom } -i* { set idletime [string range $arg 2 end] if {![string length $idletime]} { set idletime [lindex $argv [incr i]] } } -l* { CreateMainTask set acceptLanguage [string range $arg 2 end] if {![string length $acceptLanguage]} { set acceptLanguage [lindex $argv [incr i]] } } -r* { CreateMainTask set rfile [string range $arg 2 end] if {![string length $rfile]} { set rfile [lindex $argv [incr i]] } catch {unset maxdistance} source $rfile if {[info exists maxdistance]} { set control($task,distance) $maxdistance } } default { CreateMainTask set href $arg #dbgmsg "in default: arg= $arg !!!" loadlib if {[RobotHref $task http://www.indexdata.dk/ href host path]} { if {![RobotFileExist $task visited $host $path]} { set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf href 0 RobotFileClose $outf } } } } incr i } dbgmsg "Parsed args, now loading" loadlib if {![info exist robotsMax]} { set robotsMax 5 } if {[info exist daemon_dir]} { logmsg "Daemon mode" RobotDaemonLoop } else { foreach t $tasks { logmsg "task $t" logmsg "max distance=$control($t,distance)" if {[info exists control($t,domains)]} { logmsg "domains=$control($t,domains)" } } logmsg "max jobs=$robotsMax" foreach t $tasks { RobotStart $t } while {$robotsRunning} { vwait robotsRunning } if {[info exists tasks]} { foreach t $tasks { set statusfile [open $t/status w] puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" close $statusfile } } }