X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=73d558aad3893ad0b0b823f929247a5cb5d02dac;hb=4355628830cd0f9e27c059d20254d8e1c30896eb;hp=b6466c347186d9954fbdda74c78ed7054fab76c5;hpb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index b6466c3..73d558a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $ +# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -9,14 +9,19 @@ proc RobotFileNext1 {area lead} { foreach n $ns { if {[file isfile $n]} { set off [string last / $n] - incr off 2 - return $lead/[string range $n $off end] + # skip / + incr off + set end [string length $n] + # skip _.tkl + incr end -6 + return $lead/[string range $n $off $end] } } foreach n $ns { if {[file isdirectory $n]} { set off [string last / $n] - incr off 2 + # skip / + incr off set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb @@ -27,6 +32,7 @@ proc RobotFileNext1 {area lead} { } proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} puts $outf "" puts $outf "" puts $outf $distance @@ -42,6 +48,7 @@ proc RobotReadRecord {inf fromurlx distancex} { upvar $distancex distance gets $inf gets $inf + gets $inf set distance [string trim [gets $inf]] # puts "got distance = $distance" gets $inf @@ -49,34 +56,36 @@ proc RobotReadRecord {inf fromurlx distancex} { set fromurl [string trim [gets $inf]] } -proc RobotFileNext {area} { - global robotSeq +proc RobotFileNext {task area} { + global control global idletime ns global status - # puts "RobotFileNext robotSeq=$robotSeq" - if {$robotSeq < 0} { + # puts "RobotFileNext seq=$control($task,seq)" + if {$control($task,seq) < 0} { return {} } - if {$robotSeq == 0} { - if {[catch {set ns [glob ${area}/*]}]} { - return {} + if {$control($task,seq) == 0} { + if {[catch {set ns($task) [glob $task/$area/*]}]} { + return done } } - set off [string length $area] + # puts "ns=$ns($task)" + set off [string length $task/$area] incr off - set n [lindex $ns $robotSeq] + set n [lindex $ns($task) $control($task,seq)] + # puts "n=$n" if {![string length $n]} { - set robotSeq -1 + set control($task,seq) -1 flush stdout - set statusfile [open status w] - puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + set statusfile [open $task/status w] + puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" close $statusfile return wait } - incr robotSeq - if {[file isfile $n/frobots.txt]} { - puts "ok returning http://[string range $n $off end]/robots.txt" + incr control($task,seq) + if {[file isfile $n/robots.txt_.tkl]} { + # puts "ok returning http://[string range $n $off end]/robots.txt" return http://[string range $n $off end]/robots.txt } elseif {[file isdirectory $n]} { set sb [RobotFileNext1 $n http://[string range $n $off end]] @@ -85,51 +94,36 @@ proc RobotFileNext {area} { } } puts "no more work at end of RobotFileNext n=$n" - puts "ns=$ns" + puts "ns=$ns($task)" return {} } -proc RobotFileExist {area host path} { +proc RobotFileExist {task area host path} { global debuglevel if {$debuglevel > 3} { puts "RobotFileExist begin area=$area host=$host path=$path" } - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - if {$debuglevel > 3} { - puts "RobotFileExist end npath=$npath" - } - return [file exists $npath] + return [file exists $task/$area/$host${path}_.tkl] } -proc RobotFileUnlink {area host path} { +proc RobotFileUnlink {task area host path} { global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" - set lpath [split $path /] - set l [llength $lpath] - incr l -1 - set t [lindex $lpath $l] - incr l -1 - set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + set npath $task/$area/$host${path}_.tkl # puts "npath=$npath" set comp [split $npath /] - if {[catch {exec rm [join $comp /]}]} return + if {[catch {exec rm $npath}]} return set l [llength $comp] - incr l -1 - incr l -1 - incr status($area) -1 + incr l -2 + incr status($task,$area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return - exec rmdir ./$path + exec rmdir $path } # puts "RobotFileUnlink end" } @@ -140,12 +134,14 @@ proc RobotFileClose {out} { } } -proc RobotFileOpen {area host path {mode w}} { +proc RobotFileOpen {task area host path {mode w}} { set orgPwd [pwd] global workdir global status global debuglevel + # puts "RobotFileOpen task=$task path=$path" + if {![info exists workdir]} { return stdout } @@ -158,127 +154,291 @@ proc RobotFileOpen {area host path {mode w}} { puts "pwd = $orgPwd" exit 1 } - set comp [split $area/$host$path /] + + set comp [split $task/$area/$host /] set len [llength $comp] incr len -1 - for {set i 0} {$i < $len} {incr i} { - if {$i > 1} { - set d "d[lindex $comp $i]" - } else { - set d [lindex $comp $i] - } - if {[catch {cd ./$d}]} { + + # puts "1 comp=$comp" + + for {set i 0} {$i <= $len} {incr i} { + set d [lindex $comp $i] + if {[string length $d] == 0} { + cd / + } elseif {[catch {cd $d}]} { exec mkdir $d cd ./$d - if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { - set out [open frobots.txt w] - puts "creating robots.txt in $d" - close $out - incr status(unvisited) + if {![string compare $area unvisited] && $i == $len && $mode == "w"} { + if {[string compare $path /robots.txt]} { + set out [open robots.txt_.tkl w] + puts "creating robots.txt in $d" + close $out + incr status($task,unvisited) + } } } } - set d [lindex $comp $len] - if {[string length $d]} { - set out [open f$d $mode] - if {0} { - if {[file isfile $d/f]} { - set out [open $d/f $mode] - } else { - set out [open f$d $mode] + + set comp [split $path /] + set len [llength $comp] + incr len -1 + + # puts "2 path=$path comp=$comp" + + for {set i 0} {$i < $len} {incr i} { + set d [lindex $comp $i] + if {[string length $d] > 0} { + if {[catch {cd $d}]} { + exec mkdir $d + cd ./$d } } - } else { - set out [open f $mode] } + set d [lindex $comp $len] + set out [open ${d}_.tkl $mode] if {$mode == "w"} { - incr status($area) + incr status($task,$area) } cd $orgPwd return $out } -proc RobotRR {} { - global robotSeq robotsRunning +proc RobotStartJob {fname t} { + global control + + set f [open $fname r] + set xml [read $f] + puts "Reading $fname" + close $f + if {![regexp {([^<]*)} $xml x status]} { + return + } + if {$status == "done"} { + puts "already done" + return + } + puts "status = $status" + if {![task $t]} { + return + } + htmlSwitch $xml \ + url { + url $body + } filter { + set type $parm(type) + set action $parm(action) + if {$type == "domain"} { + $action url http://$body/* + } + if {$type == "url"} { + $action url $body + } + if {$type == "mime"} { + $action mime $body + } + } distance { + set control($t,distance) $body + } status { + set control($t,filestatus) $body + } + if {$status == "pending"} { + regsub {[^<]*} $xml {running} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f + } +} + +proc RobotDoneJob {t} { + global daemon_dir + if {![info exists daemon_dir]} { + return + } + + set fname $t.tkl + + set f [open $fname r] + set xml [read $f] + puts "Reading $fname" + regexp {([^<]*)} $xml x status + puts "------" + puts "status = $status" + close $f + + regsub {[^<]*} $xml {done} xml2 + set f [open $fname w] + puts -nonewline $f $xml2 + close $f +} + +proc RobotScanDir {} { + global daemon_dir + + if {![info exists daemon_dir]} { + return + } + foreach d $daemon_dir { + if {[catch {set files [glob $d/*.tkl]}]} { + return + } + foreach fname $files { + if {[file isfile $fname] && [file readable $fname]} { + set t [file rootname $fname] + RobotStartJob $fname $t + } + } + } +} + +proc RobotRR {task} { + global control robotsRunning tasks robotsMax status + + puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------" incr robotsRunning -1 + + # only one task gets through... + if {[string compare [lindex $tasks 0] $task]} { + return + } + puts "RobotRR. task = $task" while {$robotsRunning} { vwait robotsRunning } - set robotSeq 0 - RobotStart + puts "Scan" + if {[catch {RobotScanDir} msg]} { + puts "RobotScanDir failed" + puts $msg + } + foreach t $tasks { + set statusfile [open $t/status w] + puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" + close $statusfile + set control($t,seq) 0 + RobotStart $t + } +} + +proc RobotDaemonSig {} { + global daemon_cnt + + incr daemon_cnt +} + +proc RobotDaemonLoop {} { + global daemon_cnt tasks robotsRunning status + + set daemon_cnt 0 + while 1 { + puts $daemon_cnt + + RobotScanDir + + if {[info exists tasks]} { + puts "daemon loop tasks $tasks" + foreach t $tasks { + set control($t,seq) 0 + RobotStart $t + } + while {$robotsRunning} { + vwait robotsRunning + } + } + after 30000 RobotDaemonSig + vwait daemon_cnt + } } -proc RobotRestart {url sock} { +proc RobotRestart {task url sock} { global URL robotsRunning close $sock after cancel $URL($sock,cancel) - foreach v [array names URL $url,*] { + foreach v [array names URL $task,$url,*] { unset URL($v) } incr robotsRunning -1 - RobotStart + RobotStart $task } -proc RobotStart {} { +proc RobotStart {task} { global URL - global robotsRunning robotsMax idletime + global robotsRunning robotsMax idletime status tasks - # puts "RobotStart" + # puts "RobotStart $task running=$robotsRunning" while {1} { - set url [RobotFileNext unvisited] + set url [RobotFileNext $task unvisited] + if {[string compare $url done] == 0} { + puts "In RobotStart task $task done" + + catch {unset ntasks} + foreach t $tasks { + if {[string compare $t $task]} { + lappend ntasks $t + } else { + puts "task $t done" + } + } + if {![info exists ntasks]} { + unset tasks + puts "all done" + } else { + set tasks $ntasks + } + RobotDoneJob $task + return + } if {![string length $url]} { return } - incr robotsRunning + incr robotsRunning if {[string compare $url wait] == 0} { - after $idletime RobotRR - return + after $idletime [list RobotRR $task] + return } - set r [RobotGetUrl $url {}] + set r [RobotGetUrl $task $url {}] if {!$r} { if {$robotsRunning >= $robotsMax} return } else { incr robotsRunning -1 - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotFileClose $outf } - RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) } } } -proc headSave {url out} { +proc headSave {task url out} { global URL - if {[info exists URL($url,head,last-modified)]} { - puts $out "$URL($url,head,last-modified)" + if {[info exists URL($task,$url,head,last-modified)]} { + puts $out "$URL($task,$url,head,last-modified)" } puts $out {} - if {[info exists URL($url,head,date)]} { - puts $out " $URL($url,head,date)" + if {[info exists URL($task,$url,head,date)]} { + puts $out " $URL($task,$url,head,date)" } - if {[info exists URL($url,head,content-length)]} { - puts $out " $URL($url,head,content-length)" + if {[info exists URL($task,$url,head,content-length)]} { + puts $out " $URL($task,$url,head,content-length)" } - if {[info exists URL($url,head,server)]} { - puts $out " $URL($url,head,server)" + if {[info exists URL($task,$url,head,server)]} { + puts $out " $URL($task,$url,head,server)" } puts $out {} puts $out {} puts $out " $url" - if {[info exists URL($url,head,content-type)]} { - puts $out " $URL($url,head,content-type)" + if {[info exists URL($task,$url,head,content-type)]} { + puts $out " $URL($task,$url,head,content-type)" } puts $out {} } -proc RobotHref {url hrefx hostx pathx} { - global URL domains debuglevel +proc RobotHref {task url hrefx hostx pathx} { + global URL control debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path @@ -293,12 +453,11 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } - if {[string first {?} $href] >= 0} { - return 0 - } - if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { - return 0 - } + +# Skip pages that have ? in them +# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { +# return 0 +# } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -313,13 +472,13 @@ proc RobotHref {url hrefx hostx pathx} { if {![string length $surl]} { set surl / } - if {[info exist domains]} { + if {[info exist control($task,domains)]} { set ok 0 - foreach domain $domains { + foreach domain $control($task,domains) { if {[string match $domain $host]} { set ok 1 break - } + } } if {!$ok} { return 0 @@ -327,16 +486,16 @@ proc RobotHref {url hrefx hostx pathx} { } } else { regexp {^([^\#]*)} $hpath x surl - set host $URL($url,hostport) + set host $URL($task,$url,hostport) } if {![string length $surl]} { return 0 } if {[string first / $surl]} { # relative path - set curpath $URL($url,path) - if {[info exists URL($url,bpath)]} { - set curpath $URL($url,bpath) + set curpath $URL($task,$url,path) + if {[info exists URL($task,$url,bpath)]} { + set curpath $URL($task,$url,bpath) } regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] @@ -380,55 +539,55 @@ proc RobotHref {url hrefx hostx pathx} { if {$debuglevel > 1} { puts "Ref result = $href" } - return [checkrule url $href] + return [checkrule $task url $href] } -proc RobotError {url code} { +proc RobotError {task url code} { global URL puts "Bad URL $url (code $code)" set fromurl {} set distance -1 - if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } - RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } -proc RobotRedirect {url tourl code} { +proc RobotRedirect {task url tourl code} { global URL puts "Redirecting from $url to $tourl" set distance {} set fromurl {} - if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { - set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { + set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] RobotReadRecord $inf fromurl distance RobotFileClose $inf } - if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { + set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } - if {[RobotHref $url tourl host path]} { - if {![RobotFileExist visited $host $path]} { - if {![RobotFileExist unvisited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] + if {[RobotHref $task $url tourl host path]} { + if {![RobotFileExist $task visited $host $path]} { + if {![RobotFileExist $task unvisited $host $path]} { + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $fromurl $distance RobotFileClose $outf } } else { set olddistance {} - set inf [RobotFileOpen visited $host $path r] + set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf if {[string length $olddistance] == 0} { @@ -439,34 +598,44 @@ proc RobotRedirect {url tourl code} { } puts "distance=$distance olddistance=$olddistance" if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $tourl $distance RobotFileClose $outf } } } - if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} { puts "unlink failed" exit 1 } } -proc link {url out href body distance} { - global URL maxdistance - if {[expr $distance > $maxdistance]} return +proc wellform {body} { + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} body + regsub -all {<[^\>]+>} $body {} abody + regsub -all { } $abody { } body + regsub -all {&} $body {&} abody + return $abody +} + +proc link {task url out href body distance} { + global URL control + if {[expr $distance > $control($task,distance)]} return - if {![RobotHref $url href host path]} return + if {![RobotHref $task $url href host path]} return puts $out "" puts $out "$href" - puts $out "$body" + set abody [wellform $body] + puts $out "$abody" puts $out "" - if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist $task visited $host $path]} { set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] + if {![RobotFileExist $task bad $host $path]} { + if {[RobotFileExist $task unvisited $host $path]} { + set inf [RobotFileOpen $task unvisited $host $path r] RobotReadRecord $inf oldurl olddistance RobotFileClose $inf } @@ -477,12 +646,12 @@ proc link {url out href body distance} { set olddistance 1000 } if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] + set inf [RobotFileOpen $task visited $host $path r] RobotReadRecord $inf xurl olddistance close $inf if {[string length $olddistance] == 0} { @@ -492,77 +661,116 @@ proc link {url out href body distance} { puts "OK remarking url=$url href=$href" puts "olddistance = $olddistance" puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] + set outf [RobotFileOpen $task unvisited $host $path] RobotWriteRecord $outf $url $distance RobotFileClose $outf } } } -proc RobotTextHtml {url out} { - global URL maxdistance +proc RobotTextHtml {task url out} { + global URL control + + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 set distance 0 set fdistance 0 - if {$maxdistance < 1000 && [info exists URL($url,dist)]} { - set fdistance $URL($url,dist) + if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} { + set fdistance $URL($task,$url,dist) set distance [expr $fdistance + 1] } - htmlSwitch $URL($url,buf) \ + htmlSwitch $URL($task,$url,buf) \ title { - puts $out "$body" + set title $body } -nonest meta { + # collect metadata and save NAME= CONTENT=.. + set metaname {} + set metacontent {} puts -nonewline $out "" + # go through robots directives (af any) + if {![string compare $metaname robots]} { + set direcs [split [string tolower $metacontent] ,] + if {[lsearch $direcs noindex] >= 0} { + set noindex 1 + } + if {[lsearch $direcs nofollow] >= 0} { + set nofollow 1 + } } - puts $out {>} } body { - regsub -all {