From 99f83d18ecb04380da6f07297f7b74d4e8fe7d9f Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Thu, 14 Aug 2003 08:02:10 +0000 Subject: [PATCH] tcl web harvesting script for tkl project added --- tkl-web-harvester | 1527 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1527 insertions(+) create mode 100755 tkl-web-harvester diff --git a/tkl-web-harvester b/tkl-web-harvester new file mode 100755 index 0000000..040da57 --- /dev/null +++ b/tkl-web-harvester @@ -0,0 +1,1527 @@ +#!/usr/bin/tclsh +# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $ +# +set loghandle stdout +set robotsRunning 0 +set workdir [pwd] +set idletime 15000 +set acceptLanguage {} +set debuglevel 1 +set libdir "" + + +proc logmsg {msg} { + global loghandle + + puts $loghandle $msg + flush $loghandle +} + +#proc dbgmsg {level msg} { +# global debuglevel +# if {[expr $debuglevel >= $level]} { +# logmsg $msg +# } +#} +proc dbgmsg {msg} { + global debuglevel + if {[expr $debuglevel >= 0]} { + logmsg $msg + } +} +# dbgmsg is always called with just one string! + + +proc fnameEncode {fname} { + regsub -all {&} $fname {%38} fname + regsub -all {<} $fname {%3C} fname + regsub -all {>} $fname {%3E} fname + regsub -all {\?} $fname {%3F} fname + regsub -all {\*} $fname {%2A} fname + return $fname +} + +proc fnameDecode {fname} { + regsub -all {%38} $fname {&} fname + regsub -all {%3C} $fname {<} fname + regsub -all {%3E} $fname {>} fname + regsub -all {%3F} $fname {?} fname + regsub -all {%2A} $fname {*} fname + return $fname +} + +proc RobotFileNext1 {area lead} { + # dbgmsg "RobotFileNext1 area=$area lead=$lead" + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } + foreach n $ns { + if {[file isfile $n]} { + set off [string last / $n] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] + } + } + foreach n $ns { + if {[file isdirectory $n]} { + set off [string last / $n] + # skip / + incr off + set sb [RobotFileNext1 $n $lead/[string range $n $off end]] + if {[string length $sb]} { + return $sb + } + } + } + return {} +} + +proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + gets $inf + set distance [string trim [gets $inf]] + # dbgmsg "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] +} + +proc RobotFileNext {task area} { + global control + global idletime ns + global status + + # dbgmsg "RobotFileNext seq=$control($task,seq)" + if {$control($task,seq) < 0} { + return {} + } + set target $control($task,target) + if {$control($task,seq) == 0} { + if {[catch {set ns($task) [glob $target/$area/*]}]} { + puts "----------- DONE-------- target=$target" + return done + } + } + # dbgmsg "ns=$ns($task)" + set off [string length $target/$area] + incr off + set n [lindex $ns($task) $control($task,seq)] + # dbgmsg "n=$n" + if {![string length $n]} { + set control($task,seq) -1 + set statusfile [open $target/status w] + puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" + close $statusfile + return wait + } + incr control($task,seq) + if {[file isfile $n/robots.txt_.tkl]} { + # dbgmsg "ok returning http://[string range $n $off end]/robots.txt" + return [fnameDecode http://[string range $n $off end]/robots.txt] + } elseif {[file isdirectory $n]} { + set sb [RobotFileNext1 $n http://[string range $n $off end]] + if {[string length $sb]} { + return [fnameDecode $sb] + } + } + dbgmsg "no more work at end of RobotFileNext n=$n" + dbgmsg "ns=$ns($task)" + return {} +} + + +proc RobotFileExist {task area host path} { + global debuglevel control + + if {$debuglevel > 3} { + dbgmsg "RobotFileExist begin area=$area host=$host path=$path" + } + set target $control($task,target) + return [file exists [fnameEncode $target/$area/$host${path}_.tkl]] +} + +proc RobotFileUnlink {task area host path} { + global status control + + set target $control($task,target) + # dbgmsg "RobotFileUnlink begin" + # dbgmsg "area=$area host=$host path=$path" + set npath [fnameEncode $target/$area/$host${path}_.tkl] + # dbgmsg "npath=$npath" + set comp [split $npath /] + if {[catch {exec rm $npath}]} return + + set l [llength $comp] + incr l -2 + incr status($task,$area) -1 + for {set i $l} {$i > 0} {incr i -1} { + set path [join [lrange $comp 0 $i] /] + if {![catch {glob $path/*}]} return + exec rmdir $path + } + # dbgmsg "RobotFileUnlink end" +} + +proc RobotFileClose {out} { + if [string compare $out stdout] { + close $out + } +} + +proc RobotFileOpen {task area host path {mode w}} { + set orgPwd [pwd] + global workdir status debuglevel control + + # dbgmsg "RobotFileOpen task=$task path=$path" + + set target $control($task,target) + set path [fnameEncode $path] + + if {![info exists workdir]} { + return stdout + } + if {$debuglevel > 3} { + dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } + if {[string compare $orgPwd $workdir]} { + dbgmsg "ooops. RobotFileOpen failed" + dbgmsg "workdir = $workdir" + dbgmsg "pwd = $orgPwd" + exit 1 + } + + set comp [split $target/$area/$host /] + set len [llength $comp] + incr len -1 + + # dbgmsg "1 comp=$comp" + + for {set i 0} {$i <= $len} {incr i} { + set d [lindex $comp $i] + if {[string length $d] == 0} { + cd / + } elseif {[catch {cd $d}]} { + exec mkdir $d + cd ./$d + if {![string compare $area unvisited] && $i == $len && $mode == "w"} { + if {[string compare $path /robots.txt]} { + set out [open robots.txt_.tkl w] + dbgmsg "creating robots.txt in $d" + close $out + incr status($task,unvisited) + } + } + } + } + + set comp [split $path /] + set len [llength $comp] + incr len -1 + + # dbgmsg "2 path=$path comp=$comp" + + for {set i 0} {$i < $len} {incr i} { + set d [lindex $comp $i] + if {[string length $d] > 0} { + if {[catch {cd $d}]} { + exec mkdir $d + cd ./$d + } + } + } + set d [lindex $comp $len] + set out [open ${d}_.tkl $mode] + if {$mode == "w"} { + incr status($task,$area) + } + cd $orgPwd + return $out +} + +proc RobotStartJob {root task} { + global control + + set fname "$root$task" + set f [open $fname r] + set xml [read $f] + dbgmsg "Reading $fname" + close $f + # task type must be 2 + if {![regexp {([^<]*)} $xml x tasktype]} { + return + } + set tasktype [string trim $tasktype] + if {![string match 2 $tasktype]} { + return + } + # status must not be finished or error + if {![regexp {([^<]*)} $xml x status]} { + return + } + if {$status == "finished"} { + dbgmsg "already finished" + return + } + if {$status == "error"} { + dbgmsg "already finished due to error" + return + } + # ignore if task has already been processed + dbgmsg "status = $status" + if {![CreateTask $task]} { + return + } + set control($task,taskfname) $fname + dbgmsg "Reading $fname stage 2" + htmlSwitch $xml \ + url { + lappend starturls $body + } filter { + set type $parm(type) + set action $parm(action) + if {$type == "domain"} { + $action url http://$body/* + } + if {$type == "url"} { + $action url $body + } + if {$type == "mime"} { + $action mime $body + } + } target { + set ex [file rootname [file tail $task]] + #set control($task,target) "$root$body/$ex" + set control($task,target) "$control(tmpdir)/$ex" + set control($task,output) "$root$body" + } distance { + set control($task,distance) $body + } status { + set control($task,filestatus) $body + } tasktype { + set control($task,tasktype) $body + } + + if {[info exists starturls]} { + foreach url $starturls { + puts "marking start urls $url" + url $url + } + } + + if {$status == "pending"} { + regsub {[^<]*} $xml {running} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f + } +} + +proc RobotDoneJob {task} { + global daemon_dir control + + if {![info exists daemon_dir]} { + return + } + set fname $control($task,taskfname) + set f [open $fname r] + set xml [read $f] + dbgmsg "Reading $fname" + regexp {([^<]*)} $xml x status + dbgmsg "------" + dbgmsg "status = $status" + close $f + + regsub {[^<]*} $xml {finished} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f +} + +proc RobotScanDir {} { + global daemon_dir + + if {![info exists daemon_dir]} { + return + } + foreach d $daemon_dir { + if {[catch {set files [glob $d/*.spl]}]} { + return + } + foreach fname $files { + if {[file isfile $fname] && [file readable $fname]} { + set jobfile [open $fname] + gets $jobfile portalroot + gets $jobfile portaltask + close $jobfile + + RobotStartJob $portalroot $portaltask + } + } + } +} + +proc RobotRR {task} { + global control robotsRunning tasks robotsMax status + + dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------" + incr robotsRunning -1 + + # only one task gets through... + if {[string compare [lindex $tasks 0] $task]} { + return + } + dbgmsg "RobotRR. task = $task" + while {$robotsRunning} { + vwait robotsRunning + } + dbgmsg "Scan" + if {[catch {RobotScanDir} msg]} { + logmsg "RobotScanDir failed" + logmsg $msg + } + foreach t $tasks { + set target $control($t,target) + set statusfile [open $target/status w] + puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" + close $statusfile + set control($t,seq) 0 + RobotStart $t + } +} + +proc RobotDaemonSig {} { + global daemon_cnt + + incr daemon_cnt +} + +proc RobotDaemonLoop {} { + global daemon_cnt tasks robotsRunning status + + set daemon_cnt 0 + while 1 { + logmsg $daemon_cnt + RobotScanDir + + if {[info exists tasks]} { + logmsg "daemon loop tasks $tasks" + foreach t $tasks { + set control($t,seq) 0 + RobotStart $t + } + while {$robotsRunning} { + vwait robotsRunning + } + } + after 30000 RobotDaemonSig + vwait daemon_cnt + } +} + +proc RobotRestart {task url sock} { + global URL robotsRunning + + close $sock + after cancel $URL($sock,cancel) + + foreach v [array names URL $task,$url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart $task +} + +proc RobotStart {task} { + global URL + global robotsRunning robotsMax idletime status tasks + + # dbgmsg "RobotStart $task running=$robotsRunning" + while {1} { + set url [RobotFileNext $task unvisited] + if {[string compare $url done] == 0} { + dbgmsg "In RobotStart task $task done" + + catch {unset ntasks} + foreach t $tasks { + if {[string compare $t $task]} { + lappend ntasks $t + } else { + dbgmsg "task $t done" + } + } + if {![info exists ntasks]} { + unset tasks + dbgmsg "all done" + } else { + set tasks $ntasks + } + RobotDoneJob $task + return + } + if {![string length $url]} { + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idletime [list RobotRR $task] + return + } + set r [RobotGetUrl $task $url {}] + if {!$r} { + if {$robotsRunning >= $robotsMax} return + } else { + incr robotsRunning -1 + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] + RobotFileClose $outf + } + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) + } + } +} + +proc headSave {task url out} { + global URL + + if {[info exists URL($task,$url,head,last-modified)]} { + puts $out "$URL($task,$url,head,last-modified)" + } + puts $out {} + if {[info exists URL($task,$url,head,date)]} { + puts $out " $URL($task,$url,head,date)" + } + if {[info exists URL($task,$url,head,content-length)]} { + puts $out " $URL($task,$url,head,content-length)" + } + if {[info exists URL($task,$url,head,server)]} { + puts $out " $URL($task,$url,head,server)" + } + puts $out {} + puts $out {} + puts $out " $url" + if {[info exists URL($task,$url,head,content-type)]} { + puts $out " $URL($task,$url,head,content-type)" + } + puts $out {} +} + +proc RobotHref {task url hrefx hostx pathx} { + global URL control debuglevel + upvar $hrefx href + upvar $hostx host + upvar $pathx path + + if {$debuglevel > 1} { + dbgmsg "Ref input url = $url href=$href" + } + + if {[string first { } $href] >= 0} { + return 0 + } + if {[string length $href] > 256} { + return 0 + } + +# Skip pages that have ? in them +# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { +# return 0 +# } + # get method (if any) + if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { + set hpath $href + set method http + } else { + if {[string compare $method http]} { + return 0 + } + } + # get host (if any) + if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} { + if {![string length $surl]} { + set surl / + } + if {[info exist control($task,domains)]} { + set ok 0 + foreach domain $control($task,domains) { + if {[string match $domain $host]} { + set ok 1 + break + } + } + if {!$ok} { + return 0 + } + } + } else { + regexp {^([^\#]*)} $hpath x surl + set host $URL($task,$url,hostport) + } + if {![string length $surl]} { + return 0 + } + if {[string first / $surl]} { + # relative path + set curpath $URL($task,$url,path) + if {[info exists URL($task,$url,bpath)]} { + set curpath $URL($task,$url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart + set l [string last / $dpart] + if {[expr $l >= 0]} { + set surl [string range $dpart 0 $l]$surl + } else { + set surl $dpart/$surl + } + } + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { + .. { + if {$pathl > 1} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl + } + } + . { + + } + default { + incr pathl + lappend path $c + } + } + } + if {$debuglevel > 4} { + dbgmsg "pathl=$pathl output path=$path" + } + set path [join $path /] + if {![string length $path]} { + set path / + } + regsub -all {~} $path {%7E} path + set href "$method://$host$path" + + if {$debuglevel > 1} { + dbgmsg "Ref result = $href" + } + return [checkrule $task url $href] +} + +proc RobotError {task url code} { + global URL + + dbgmsg "Bad URL $url (code $code)" + set fromurl {} + set distance -1 + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } +} + +proc RobotRedirect {task url tourl code} { + global URL + + dbgmsg "Redirecting from $url to $tourl" + + set distance {} + set fromurl {} + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + if {[RobotHref $task $url tourl host path]} { + if {![RobotFileExist $task visited $host $path]} { + if {![RobotFileExist $task unvisited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen $task visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + dbgmsg "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } + } + } + if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} { + dbgmsg "unlink failed" + exit 1 + } +} + +proc wellform {body} { + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} body + regsub -all {<[^\>]+>} $body {} abody + regsub -all { } $abody { } body + regsub -all {&} $body {&} abody + return $abody +} + +proc link {task url out href body distance} { + global URL control + if {[expr $distance > $control($task,distance)]} return + + if {![RobotHref $task $url href host path]} return + + if ($control($task,cr)) { + puts $out "" + puts $out "$href" + set abody [wellform $body] + puts $out "$abody" + puts $out "" + } + + if {![RobotFileExist $task visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist $task bad $host $path]} { + if {[RobotFileExist $task unvisited $host $path]} { + set inf [RobotFileOpen $task unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen $task visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + dbgmsg "OK remarking url=$url href=$href" + dbgmsg "olddistance = $olddistance" + dbgmsg "newdistance = $distance" + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + +proc RobotTextTkl {task url out} { + global URL control + + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 + + puts $control($task,output) + + set out stdout + set distance distance + + htmlSwitch $URL($task,$url,buf) \ + title { + # når title tag er hittet, er body set til indholdet af tagget + set title $body + } -nonest meta { + #puts -nonewline $out "" + } body { + # don't print title of document content if noindex is used + if {!$noindex} { + #puts $out "$title" + # xml compilancy added + set bbody [wellform $body] + #puts $out "" + #puts $out $bbody + #puts $out "" + } + } -nonest base { + # + if {![info exists parm(href)]} { + continue + } + set href [string trim $parm(href)] + } a { + # .. + # we're not using nonest - otherwise body isn't set + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + #puts "link $task $url $out [string trim $parm(href)] $body $distance" + } -nonest area { + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + #puts "link $task $url $out [string trim $parm(href)] $body $distance" + } -nonest frame { + if {![info exists parm(src)]} { + continue + } + #puts "link $task $url $out [string trim $parm(src)] $body $fdistance" + } +} + +proc RobotTextHtml {task url out} { + global URL control + + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 + + set distance 0 + set fdistance 0 + if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} { + set fdistance $URL($task,$url,dist) + set distance [expr $fdistance + 1] + } + htmlSwitch $URL($task,$url,buf) \ + title { + set title $body + } -nonest meta { + # collect metadata and save NAME= CONTENT=.. + set metaname {} + set metacontent {} + puts -nonewline $out "" + # go through robots directives (af any) + if {![string compare $metaname robots]} { + set direcs [split [string tolower $metacontent] ,] + if {[lsearch $direcs noindex] >= 0} { + set noindex 1 + } + if {[lsearch $direcs nofollow] >= 0} { + set nofollow 1 + } + } + } body { + # don't print title of document content if noindex is used + if {!$noindex} { + puts $out "$title" + set bbody [wellform $body] + puts $out "" + puts $out $bbody + puts $out "" + } + } -nonest base { + # + if {![info exists parm(href)]} { + continue + } + set href [string trim $parm(href)] + if {![RobotHref $task $url href host path]} continue + set URL($task,$url,bpath) $path + } a { + # .. + # we're not using nonest - otherwise body isn't set + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $task $url $out [string trim $parm(href)] $body $distance + } -nonest area { + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $task $url $out [string trim $parm(href)] $body $distance + } -nonest frame { + if {![info exists parm(src)]} { + continue + } + link $task $url $out [string trim $parm(src)] $body $fdistance + } +} + +proc RobotsTxt {task url} { + global agent URL + + RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf) +} + +proc RobotsTxt0 {task v buf} { + global URL agent + set section 0 + foreach l [split $buf \n] { + if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { + set arg [string trim $arg] + dbgmsg "cmd=$cmd arg=$arg" + switch -- [string tolower $cmd] { + user-agent { + if {$section} break + set pat [string tolower $arg]* + set section [string match $pat $agent] + } + disallow { + if {$section} { + dbgmsg "rule [list 0 $arg]" + lappend $v [list 0 $arg] + } + } + allow { + if {$section} { + dbgmsg "rule [list 1 $arg]" + lappend $v [list 1 $arg] + } + } + } + } + } +} + +proc RobotTextPlain {task url out} { + global URL + + puts $out "" + regsub -all {<} $URL($task,$url,buf) {\<} content + puts $out $content + puts $out "" + + if {![string compare $URL($task,$url,path) /robots.txt]} { + RobotsTxt $task $url + } +} + +proc RobotWriteMetadata {task url out} { + global URL + + set charset $URL($task,$url,charset) + puts $out "" + puts $out "" + + set distance 1000 + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + set URL($task,$url,dist) $distance + puts $out "" + puts $out " $distance" + puts $out "" + headSave $task $url $out + logmsg "Parsing $url distance=$distance" + switch $URL($task,$url,head,content-type) { + text/html { + if {[string length $distance]} { + RobotTextHtml $task $url $out + RobotTextTkl $task $url $out + } + } + text/plain { + RobotTextPlain $task $url $out + } + } + puts $out "" +} + +proc Robot200 {task url} { + global URL + + set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)] + puts -nonewline $out $URL($task,$url,buf) + RobotFileClose $out + + set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)] + RobotWriteMetadata $task $url $out + RobotFileClose $out + + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) +} + +proc RobotReadContent {task url sock binary} { + global URL + + set buffer [read $sock 16384] + set readCount [string length $buffer] + + if {$readCount <= 0} { + Robot200 $task $url + RobotRestart $task $url $sock + } elseif {!$binary && [string first \0 $buffer] >= 0} { + Robot200 $task $url + RobotRestart $task $url $sock + } else { + # dbgmsg "Got $readCount bytes" + set URL($task,$url,buf) $URL($task,$url,buf)$buffer + } +} + +proc RobotReadHeader {task url sock} { + global URL debuglevel + + if {$debuglevel > 1} { + dbgmsg "HTTP head $url" + } + if {[catch {set buffer [read $sock 2148]}]} { + RobotError $task $url 404 + RobotRestart $task $url $sock + return + } + set readCount [string length $buffer] + + if {$readCount <= 0} { + RobotError $task $url 404 + RobotRestart $task $url $sock + } else { + # dbgmsg "Got $readCount bytes" + set URL($task,$url,buf) $URL($task,$url,buf)$buffer + + set n [string first \r\n\r\n $URL($task,$url,buf)] + if {$n > 1} { + set code 0 + set version {} + set headbuf [string range $URL($task,$url,buf) 0 $n] + incr n 4 + set URL($task,$url,charset) ISO-8859-1 + set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] + + regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code + set lines [split $headbuf \n] + foreach line $lines { + if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { + set URL($task,$url,head,[string tolower $name]) [string trim $value] + } + regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) + } + dbgmsg "HTTP CODE $code" + set URL($task,$url,state) skip + switch $code { + 301 { + RobotRedirect $task $url $URL($task,$url,head,location) 301 + RobotRestart $task $url $sock + } + 302 { + RobotRedirect $task $url $URL($task,$url,head,location) 302 + RobotRestart $task $url $sock + } + 200 { + if {![info exists URL($task,$url,head,content-type)]} { + set URL($task,$url,head,content-type) {} + } + set binary 1 + switch -glob -- $URL($task,$url,head,content-type) { + text/* { + set binary 0 + } + } + if {![regexp {/robots.txt$} $url]} { + if {![checkrule $task mime $URL($task,$url,head,content-type)]} { + RobotError $task $url mimedeny + RobotRestart $task $url $sock + return + } + } + fileevent $sock readable [list RobotReadContent $task $url $sock $binary] + } + default { + RobotError $task $url $code + RobotRestart $task $url $sock + } + } + } + } +} + +proc RobotSockCancel {task url sock} { + + logmsg "RobotSockCancel sock=$sock url=$url" + RobotError $task $url 401 + RobotRestart $task $url $sock +} + +proc RobotConnect {task url sock} { + global URL agent acceptLanguage + + fconfigure $sock -translation {lf crlf} -blocking 0 + fileevent $sock readable [list RobotReadHeader $task $url $sock] + puts $sock "GET $URL($task,$url,path) HTTP/1.0" + puts $sock "Host: $URL($task,$url,host)" + puts $sock "User-Agent: $agent" + if {[string length $acceptLanguage]} { + puts $sock "Accept-Language: $acceptLanguage" + } + puts $sock "" + set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]] + if {[catch {flush $sock}]} { + RobotError $task $url 404 + RobotRestart $task $url $sock + } +} + +proc RobotNop {} { + +} + +proc RobotGetUrl {task url phost} { + global URL robotsRunning + flush stdout + dbgmsg "Retrieve running=$robotsRunning url=$url task=$task" + if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { + return -1 + } + if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { + set port 80 + set host $hostport + } + set URL($task,$url,method) $method + set URL($task,$url,host) $host + set URL($task,$url,hostport) $hostport + set URL($task,$url,path) $path + set URL($task,$url,state) head + set URL($task,$url,buf) {} + + if {[string compare $path /robots.txt]} { + set ok 1 + if {![info exists URL($hostport,robots)]} { + dbgmsg "READING robots.txt for host $hostport" + if {[RobotFileExist $task visited $hostport /robots.txt]} { + set inf [RobotFileOpen $task visited $hostport /robots.txt r] + set buf [read $inf 32768] + close $inf + } else { + set buf "User-agent: *\nAllow: /\n" + } + RobotsTxt0 $task URL($hostport,robots) $buf + } + if {[info exists URL($hostport,robots)]} { + foreach l $URL($hostport,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } + if {!$ok} { + dbgmsg "skipped due to robots.txt" + return -1 + } + } + if [catch {set sock [socket -async $host $port]}] { + return -1 + } + RobotConnect $task $url $sock + + return 0 +} + +proc loadlib {} { + global libdir + + if {![llength [info commands htmlSwitch]]} { + if {[info exists env(tclrobot_lib)]} { + set d $env(tclrobot_lib) + } else { + if { $libdir > "" } { + set d $libdir + } else { + set d . + } + } + set e [info sharedlibextension] + dbgmsg "About to load $d/tclrobot$e" + if {[catch {load $d/tclrobot$e}]} { + dbgmsg "Didn't get at $d, trying directly" + load tclrobot$e + } + dbgmsg "Loaded tclrobot$e all right" + } +} + +set agent "zmbot/0.2" +if {![catch {set os [exec uname -s -r]}]} { + set agent "$agent ($os)" +} + +dbgmsg "agent: $agent" + +proc bgerror {m} { + global errorInfo + dbgmsg "BGERROR $m" + dbgmsg $errorInfo +} + +# Rules: allow, deny, url + +proc checkrule {task type this} { + global control + global debuglevel + + set default_ret 1 + + if {$debuglevel > 3} { + dbgmsg "CHECKRULE $type $this" + } + if {[info exist control($task,alrules)]} { + foreach l $control($task,alrules) { + if {$debuglevel > 3} { + dbgmsg "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask (! negates) + set masks [lindex $l 2] + set ok 0 + set default_ret 0 + foreach mask $masks { + if {$debuglevel > 4} { + dbgmsg "consider single mask $mask" + } + if {[string index $mask 0] == "!"} { + set mask [string range $mask 1 end] + if {[string match $mask $this]} continue + } else { + if {![string match $mask $this]} continue + } + set ok 1 + } + if {$debuglevel > 4} { + dbgmsg "ok = $ok" + } + if {!$ok} continue + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + dbgmsg "CHECKRULE MATCH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + dbgmsg "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + dbgmsg "CHECKRULE MATCH DEFAULT $default_ret" + } + return $default_ret +} + + +proc url {href} { + global debuglevel task + + if {[RobotHref $task http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist $task visited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global control task + + lappend control($task,alrules) [list deny $type $stuff] +} + +proc allow {type stuff} { + global control task + + lappend control($task,alrules) [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level +} + +proc CreateTask {t} { + global tasks task status control + + set task $t + + if {[info exists tasks]} { + if {[lsearch -exact $tasks $t] >= 0} { + return 0 + } + } + + lappend tasks $t + set status($t,unvisited) 0 + set status($t,visited) 0 + set status($t,bad) 0 + set status($t,raw) 0 + set status($t,active) 1 + set control($t,seq) 0 + set control($t,distance) 10 + set control($t,target) tmp + set control($t,output) output + set control($t,cr) 0 + return 1 +} + +# Little utility that ensures that at least one task is present (main). +proc CreateMainTask {} { + global tasks + if {![info exist tasks]} { + CreateTask main + } +} + +# Parse options + +set i 0 +set l [llength $argv] + +if {$l < 1} { + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c + count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]} + logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" + + exit 1 +} + + +while {$i < $l} { + set arg [lindex $argv $i] + switch -glob -- $arg { + -o* { + set fname [string range $arg 2 end] + if {![string length $fname]} { + set fname [lindex $argv [incr i]] + } + set loghandle [open $fname a] + #dbgmsg "agent: $agent" + #dbgmsg "-o $fname" + } + -p* { + set pidfname [string range $arg 2 end] + if {![string length $pidfname]} { + set pidfname [lindex $argv [incr i]] + } + #dbgmsg "-p $pidfname" + if {[file exists $pidfname]} { + set pf [open $pidfname] + gets $pf oldpid + close $pf + logmsg "File $pidfname already exist. pid=$oldpid" + if {[file isdirectory /proc/$oldpid]} { + logmsg "And it's apparently running. Exiting." + exit 1 + } + } + set pf [open $pidfname w] + puts $pf [pid] + close $pf + } + -T* { + set tmpdir [string range $arg 2 end] + if {![string length $tmpdir]} { + set tmpdir [lindex $argv [incr i]] + } + set control(tmpdir) $tmpdir + } + -L* { + set libdir [string range $arg 2 end] + if {![string length $libdir]} { + set libdir [lindex $argv [incr i]] + } + } + -t* { + set t [string range $arg 2 end] + if {![string length $t]} { + set t [lindex $argv [incr i]] + } + CreateTask $t + } + -D* { + set dir [string range $arg 2 end] + if {![string length $dir]} { + set dir [lindex $argv [incr i]] + } + lappend daemon_dir $dir + } + -j* { + set robotsMax [string range $arg 2 end] + if {![string length $robotsMax]} { + set robotsMax [lindex $argv [incr i]] + } + } + -c* { + CreateMainTask + set control($task,distance) [string range $arg 2 end] + if {![string length $control($task,distance)]} { + set control($task,distance) [lindex $argv [incr i]] + } + } + -d* { + CreateMainTask + set dom [string range $arg 2 end] + if {![string length $dom]} { + set dom [lindex $argv [incr i]] + } + lappend control($task,domains) $dom + } + -i* { + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] + } + } + -l* { + CreateMainTask + set acceptLanguage [string range $arg 2 end] + if {![string length $acceptLanguage]} { + set acceptLanguage [lindex $argv [incr i]] + } + } + -r* { + CreateMainTask + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + catch {unset maxdistance} + source $rfile + if {[info exists maxdistance]} { + set control($task,distance) $maxdistance + } + } + default { + CreateMainTask + set href $arg + #dbgmsg "in default: arg= $arg !!!" + loadlib + if {[RobotHref $task http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist $task visited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } + } + } + incr i +} + + +dbgmsg "Parsed args, now loading" +loadlib + +if {![info exist robotsMax]} { + set robotsMax 5 +} + +if {[info exist daemon_dir]} { + logmsg "Daemon mode" + RobotDaemonLoop +} else { + foreach t $tasks { + logmsg "task $t" + logmsg "max distance=$control($t,distance)" + if {[info exists control($t,domains)]} { + logmsg "domains=$control($t,domains)" + } + } + logmsg "max jobs=$robotsMax" + + foreach t $tasks { + RobotStart $t + } + + while {$robotsRunning} { + vwait robotsRunning + } + + if {[info exists tasks]} { + foreach t $tasks { + set statusfile [open $t/status w] + puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" + close $statusfile + } + } +} + -- 1.7.10.4