X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=bfe875f339e32169de55f564fdd4aec58d213643;hb=bd463f7d1f1610a3c7a3d9e678f5c4ff27f9d546;hp=b2a7224d00d706046fb15d19dc868e1022123b8b;hpb=2c4a844e7d87397d31d29bd3bfc56c97a5f1618b;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index b2a7224..bfe875f 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,21 +1,23 @@ +#!/usr/bin/tclsh +# $Id: robot.tcl,v 1.32 2002/03/25 16:11:08 adam Exp $ # -# $Id: robot.tcl,v 1.1 1996/08/06 14:04:22 adam Exp $ -# -proc RobotFileNext {area} { - if {[catch {set ns [glob $area/*]}]} { +proc RobotFileNext1 {area lead} { + # puts "RobotFileNext1 area=$area lead=$lead" + if {[catch {set ns [glob ${area}/*]}]} { return {} } - set off [string first / $area] - incr off foreach n $ns { if {[file isfile $n]} { - if {[string first :.html $n] > 0} { - return http://[string range $area/ $off end] - } - return http://[string range $n $off end] + set off [string last / $n] + incr off 2 + return $lead/[string range $n $off end] } - if {[file isdirectory $n]} { - set sb [RobotFileNext $n] + } + foreach n $ns { + if {[file isdirectory $n]} { + set off [string last / $n] + incr off 2 + set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb } @@ -24,241 +26,770 @@ proc RobotFileNext {area} { return {} } +proc RobotWriteRecord {outf fromurl distance} { + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + set distance [string trim [gets $inf]] + # puts "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] +} + +proc RobotFileNext {area} { + global robotSeq + global idletime ns + global status + + # puts "RobotFileNext robotSeq=$robotSeq" + if {$robotSeq < 0} { + return {} + } + if {$robotSeq == 0} { + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } + } + set off [string length $area] + incr off + set n [lindex $ns $robotSeq] + if {![string length $n]} { + set robotSeq -1 + flush stdout + set statusfile [open status w] + puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + close $statusfile + return wait + } + incr robotSeq + if {[file isfile $n/frobots.txt]} { + puts "ok returning http://[string range $n $off end]/robots.txt" + return http://[string range $n $off end]/robots.txt + } elseif {[file isdirectory $n]} { + set sb [RobotFileNext1 $n http://[string range $n $off end]] + if {[string length $sb]} { + return $sb + } + } + puts "no more work at end of RobotFileNext n=$n" + puts "ns=$ns" + return {} +} + + proc RobotFileExist {area host path} { - set comp [split $area/$host$path /] - set l [llength $comp] + global debuglevel + + if {$debuglevel > 3} { + puts "RobotFileExist begin area=$area host=$host path=$path" + } + set lpath [split $path /] + set l [llength $lpath] + incr l -1 + set t [lindex $lpath $l] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + if {$debuglevel > 3} { + puts "RobotFileExist end npath=$npath" } - return [file exists [join $comp /]] + return [file exists $npath] } proc RobotFileUnlink {area host path} { - set comp [split $area/$host$path /] - set l [llength $comp] + global status + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" + set lpath [split $path /] + set l [llength $lpath] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] - } + set t [lindex $lpath $l] + incr l -1 + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + # puts "npath=$npath" + set comp [split $npath /] if {[catch {exec rm [join $comp /]}]} return + + set l [llength $comp] incr l -1 + incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return exec rmdir ./$path } + # puts "RobotFileUnlink end" +} + +proc RobotFileClose {out} { + if [string compare $out stdout] { + close $out + } } -proc RobotFileOpen {area host path} { +proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] + global workdir + global status + global debuglevel + if {![info exists workdir]} { + return stdout + } + if {$debuglevel > 3} { + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } + if {[string compare $orgPwd $workdir]} { + puts "ooops. RobotFileOpen failed" + puts "workdir = $workdir" + puts "pwd = $orgPwd" + exit 1 + } set comp [split $area/$host$path /] set len [llength $comp] incr len -1 for {set i 0} {$i < $len} {incr i} { - set d [lindex $comp $i] + if {$i > 1} { + set d "d[lindex $comp $i]" + } else { + set d [lindex $comp $i] + } if {[catch {cd ./$d}]} { exec mkdir $d cd ./$d + if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { + if {[string compare $path /robots.txt]} { + set out [open frobots.txt w] + puts "creating robots.txt in $d" + close $out + incr status(unvisited) + } + } } } set d [lindex $comp $len] if {[string length $d]} { - set out [open $d w] + set out [open f$d $mode] } else { - set out [open :.html w] + set out [open f $mode] + } + if {$mode == "w"} { + incr status($area) } cd $orgPwd return $out } -proc RobotRestart {} { - global URL +proc RobotRR {} { + global robotSeq robotsRunning + + incr robotsRunning -1 + while {$robotsRunning} { + vwait robotsRunning + } + set robotSeq 0 + RobotStart +} + +proc RobotRestart {url sock} { + global URL robotsRunning + + close $sock + after cancel $URL($sock,cancel) + + foreach v [array names URL $url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart +} - while {1} { +proc RobotStart {} { + global URL + global robotsRunning robotsMax idletime + + # puts "RobotStart" + while {1} { set url [RobotFileNext unvisited] - if {![string length $url]} break + if {![string length $url]} { + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idletime RobotRR + return + } set r [RobotGetUrl $url {}] if {!$r} { - return + if {$robotsRunning >= $robotsMax} return } else { - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - } + incr robotsRunning -1 + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileClose $outf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + } } - exit 0 } -proc headSave {url out title} { +proc headSave {url out} { global URL - - puts $out {} - puts $out " $title" - if {[info exists URL($url,head,Last-modified)]} { - puts $out " $URL($url,head,Last-modified)" + + if {[info exists URL($url,head,last-modified)]} { + puts $out "$URL($url,head,last-modified)" } puts $out {} - if {[info exists URL($url,head,Date)]} { - puts $out " $URL($url,head,Date)" + if {[info exists URL($url,head,date)]} { + puts $out " $URL($url,head,date)" } - if {[info exists URL($url,head,Content-length)]} { - puts $out " $URL($url,head,Content-length)" + if {[info exists URL($url,head,content-length)]} { + puts $out " $URL($url,head,content-length)" } - if {[info exists URL($url,head,Server)]} { - puts $out " $URL($url,head,Server)" + if {[info exists URL($url,head,server)]} { + puts $out " $URL($url,head,server)" } puts $out {} - puts $out {} - puts $out " $url" - if {[info exists URL($url,head,Content-type)]} { - puts $out " $URL($url,head,Content-type)" + puts $out {} + puts $out " $url" + if {[info exists URL($url,head,content-type)]} { + puts $out " $URL($url,head,content-type)" } - puts $out {} + puts $out {} } -proc RobotSave {url} { +proc RobotHref {url hrefx hostx pathx} { + global URL domains debuglevel + upvar $hrefx href + upvar $hostx host + upvar $pathx path + + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } + + if {[string first { } $href] >= 0} { + return 0 + } + if {[string length $href] > 256} { + return 0 + } +# if {[string first {?} $href] >= 0} { +# return 0 +# } +# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { +# return 0 +# } + # get method (if any) + if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { + set hpath $href + set method http + } else { + if {[string compare $method http]} { + return 0 + } + } + # get host (if any) + if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} { + if {![string length $surl]} { + set surl / + } + if {[info exist domains]} { + set ok 0 + foreach domain $domains { + if {[string match $domain $host]} { + set ok 1 + break + } + } + if {!$ok} { + return 0 + } + } + } else { + regexp {^([^\#]*)} $hpath x surl + set host $URL($url,hostport) + } + if {![string length $surl]} { + return 0 + } + if {[string first / $surl]} { + # relative path + set curpath $URL($url,path) + if {[info exists URL($url,bpath)]} { + set curpath $URL($url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart + set l [string last / $dpart] + if {[expr $l >= 0]} { + set surl [string range $dpart 0 $l]$surl + } else { + set surl $dpart/$surl + } + } + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { + .. { + if {$pathl > 1} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl + } + } + . { + + } + default { + incr pathl + lappend path $c + } + } + } + if {$debuglevel > 4} { + puts "pathl=$pathl output path=$path" + } + set path [join $path /] + if {![string length $path]} { + set path / + } + regsub -all {~} $path {%7E} path + set href "$method://$host$path" + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] +} + +proc RobotError {url code} { global URL + + puts "Bad URL $url (code $code)" + set fromurl {} + set distance -1 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } +} + +proc RobotRedirect {url tourl code} { + global URL + + puts "Redirecting from $url to $tourl" + + set distance {} + set fromurl {} + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + if {[RobotHref $url tourl host path]} { + if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + puts "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } + } + } + if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + puts "unlink failed" + exit 1 + } +} + +proc link {url out href body distance} { + global URL maxdistance + if {[expr $distance > $maxdistance]} return - set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] - set ti 0 - if {[info exists URL($url,line)]} { - set htmlContent [join $URL($url,line)] - - htmlSwitch $htmlContent \ - title { - if {!$ti} { - headSave $url $out $body - set ti 1 + if {![RobotHref $url href host path]} return + + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf } - } a { - if {![info exists parm(href)]} continue - if {!$ti} { - headSave $url $out "untitled" - set ti 1 + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + +proc RobotTextHtml {url out} { + global URL maxdistance + + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 + + set distance 0 + set fdistance 0 + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { + set fdistance $URL($url,dist) + set distance [expr $fdistance + 1] + } + htmlSwitch $URL($url,buf) \ + title { + set title $body + } -nonest meta { + # collect metadata and save NAME= CONTENT=.. + set metaname {} + set metacontent {} + puts -nonewline $out "" + # go through robots directives (af any) + if {![string compare $metaname robots]} { + set direcs [split [string tolower $metacontent] ,] + if {[lsearch $direcs noindex] >= 0} { + set noindex 1 } - } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} { - set host $URL($url,host) - set method http - } else { - puts " href=$parm(href)" - set ext [file extension $URL($url,path)] - if {[string compare $ext {}]} { - set dpart [file dirname $URL($url,path)] - } else { - set dpart $URL($url,path) + if {[lsearch $direcs nofollow] >= 0} { + set nofollow 1 } - regexp {^([^#]+)} $parm(href) x path - set host $URL($url,host) - set path [string trimright $dpart /]/$path - set method http } - set ext [file extension $path] - if {![string length $ext]} { - set path [string trimright $path /]/ - } else { - set path [string trimright $path /] + } body { + # don't print title of document content if noindex is used + if {!$noindex} { + puts $out "$title" + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} bbody + regsub -all {<[^\>]+>} $bbody {} nbody + puts $out "" + puts $out $nbody + puts $out "" } - set c [split $path /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { - .. { - incr i -2 - } - . { - incr i -1 + } -nonest base { + # + if {![info exists parm(href)]} { + continue + } + set href [string trim $parm(href)] + if {![RobotHref $url href host path]} continue + set URL($url,bpath) $path + } a { + # .. + # we're not using nonest - otherwise body isn't set + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $url $out [string trim $parm(href)] $body $distance + } -nonest area { + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $url $out [string trim $parm(href)] $body $distance + } -nonest frame { + if {![info exists parm(src)]} { + continue + } + link $url $out [string trim $parm(src)] $body $fdistance + } +} + +proc RobotsTxt {url} { + global agent URL + + RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf) +} + +proc RobotsTxt0 {v buf} { + global URL agent + set section 0 + foreach l [split $buf \n] { + if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { + puts "cmd=$cmd arg=$arg" + switch -- [string tolower $cmd] { + user-agent { + if {$section} break + set pat [string tolower $arg]* + set section [string match $pat $agent] + } + disallow { + if {$section} { + puts "rule [list 0 $arg]" + lappend $v [list 0 $arg] } - default { - set path [lindex $c $i]/$path - incr i -1 + } + allow { + if {$section} { + puts "rule [list 1 $arg]" + lappend $v [list 1 $arg] } } } - set href "$method://$host$path" - - puts $out "" - puts $out "
  • $href" - puts $out " $body" - puts $out "" - - if {![regexp {/.*bin/} $href)]} { - if {![RobotFileExist visited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] - close $outf - } - } - } - } - if {!$ti} { - headSave $url $out "untitled" - set ti 1 + } } - puts $out "" - close $out - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) } -proc RobotRead {url sock} { +proc RobotTextPlain {url out} { global URL - set readCount [gets $sock line] - if {$readCount < 0} { - if [eof $sock] { - close $sock - RobotSave $url - RobotRestart - } - } elseif {$readCount > 0} { - switch $URL($url,state) { - head { - puts "head: $line" - if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} { - set URL($url,head,$name) $value - } - } - html { - lappend URL($url,line) $line -# puts "body: $line" - } - skip { - close $sock - RobotSave $url - RobotRestart + puts $out "" + regsub -all {<} $URL($url,buf) {\<} content + puts $out $content + puts $out "" + + if {![string compare $URL($url,path) /robots.txt]} { + RobotsTxt $url + } +} + +proc RobotWriteMetadata {url out} { + global URL domains + + puts $out "" + + set distance 1000 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + set URL($url,dist) $distance + puts $out "" + puts $out " $distance" + puts $out "" + headSave $url $out + puts "Parsing $url distance=$distance" + switch $URL($url,head,content-type) { + text/html { + if {[string length $distance]} { + RobotTextHtml $url $out } } - } else { - set URL($url,state) skip - if {[info exists URL($url,head,Content-type)]} { - if {![string compare $URL($url,head,Content-type) text/html]} { - set URL($url,state) html - } + text/plain { + RobotTextPlain $url $out } } + puts $out "" } -proc RobotConnect {url sock} { +proc Robot200 {url} { + global URL domains + + set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] + puts -nonewline $out $URL($url,buf) + RobotFileClose $out + + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] + RobotWriteMetadata $url $out + RobotFileClose $out + + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) +} + +proc RobotReadContent {url sock binary} { global URL - fileevent $sock readable [list RobotRead $url $sock] + set buffer [read $sock 16384] + set readCount [string length $buffer] + + if {$readCount <= 0} { + Robot200 $url + RobotRestart $url $sock + } elseif {!$binary && [string first \0 $buffer] >= 0} { + Robot200 $url + RobotRestart $url $sock + } else { + # puts "Got $readCount bytes" + set URL($url,buf) $URL($url,buf)$buffer + } +} + +proc RobotReadHeader {url sock} { + global URL debuglevel + + if {$debuglevel > 1} { + puts "HTTP head $url" + } + if {[catch {set buffer [read $sock 2148]}]} { + RobotError $url 404 + RobotRestart $url $sock + return + } + set readCount [string length $buffer] + + if {$readCount <= 0} { + RobotError $url 404 + RobotRestart $url $sock + } else { + # puts "Got $readCount bytes" + set URL($url,buf) $URL($url,buf)$buffer + + set n [string first \r\n\r\n $URL($url,buf)] + if {$n > 1} { + set code 0 + set version {} + set headbuf [string range $URL($url,buf) 0 $n] + incr n 4 + set URL($url,buf) [string range $URL($url,buf) $n end] + + regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code + set lines [split $headbuf \n] + foreach line $lines { + if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { + set URL($url,head,[string tolower $name]) [string trim $value] + } + } + puts "HTTP CODE $code" + set URL($url,state) skip + switch $code { + 301 { + RobotRedirect $url $URL($url,head,location) 301 + RobotRestart $url $sock + } + 302 { + RobotRedirect $url $URL($url,head,location) 302 + RobotRestart $url $sock + } + 200 { + if {![info exists URL($url,head,content-type)]} { + set URL($url,head,content-type) {} + } + set binary 1 + switch -glob -- $URL($url,head,content-type) { + text/* { + set binary 0 + } + } + if {![regexp {/robots.txt$} $url]} { + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + RobotRestart $url $sock + return + } + } + fileevent $sock readable [list RobotReadContent $url $sock $binary] + } + default { + RobotError $url $code + RobotRestart $url $sock + } + } + } + } +} + +proc RobotSockCancel {url sock} { + + puts "RobotSockCancel sock=$sock url=$url" + RobotError $url 401 + RobotRestart $url $sock +} + +proc RobotConnect {url sock} { + global URL agent acceptLanguage + + fconfigure $sock -translation {lf crlf} -blocking 0 + fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" + puts $sock "Host: $URL($url,host)" + puts $sock "User-Agent: $agent" + if {[string length $acceptLanguage]} { + puts $sock "Accept-Language: $acceptLanguage" + } puts $sock "" flush $sock + set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] } proc RobotNop {} { @@ -266,30 +797,265 @@ proc RobotNop {} { } proc RobotGetUrl {url phost} { - global URL - set port 80 - puts "---------" - puts $url - if {[regexp {([^:]+)://([^/]+)([^ ?]*)} $url x method host path]} { - puts "method=$method host=$host path=$path" - } else { + global URL robotsRunning + flush stdout + puts "Retrieve $robotsRunning url=$url" + if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } + if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { + set port 80 + set host $hostport + } set URL($url,method) $method set URL($url,host) $host - set URL($url,port) $port + set URL($url,hostport) $hostport set URL($url,path) $path set URL($url,state) head + set URL($url,buf) {} + + if {[string compare $path /robots.txt]} { + set ok 1 + if {![info exists URL($hostport,robots)]} { + puts "READING robots.txt for host $hostport" + if {[RobotFileExist visited $hostport /robots.txt]} { + set inf [RobotFileOpen visited $hostport /robots.txt r] + set buf [read $inf 32768] + close $inf + } else { + set buf "User-agent: *\nAllow: /\n" + } + RobotsTxt0 URL($hostport,robots) $buf + } + if {[info exists URL($hostport,robots)]} { + foreach l $URL($hostport,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } + if {!$ok} { + puts "skipped due to robots.txt" + return -1 + } + } if [catch {set sock [socket -async $host $port]}] { return -1 } - fconfigure $sock -translation {auto crlf} RobotConnect $url $sock return 0 } -#RobotGetUrl http://www.dtv.dk/ {} -RobotRestart -vwait forever +if {![llength [info commands htmlSwitch]]} { + set e [info sharedlibextension] + if {[catch {load ./tclrobot$e}]} { + load tclrobot$e + } +} + +set agent "zmbot/0.1" +if {![catch {set os [exec uname -s -r]}]} { + set agent "$agent ($os)" +} + +puts "agent: $agent" + +proc bgerror {m} { + global errorInfo + puts "BGERROR $m" + puts $errorInfo +} + +set robotsRunning 0 +set robotSeq 0 +set workdir [pwd] +set idletime 60000 +set acceptLanguage {} +set debuglevel 0 +set status(unvisited) 0 +set status(visited) 0 +set status(bad) 0 +set status(raw) 0 + + +# Rules: allow, deny, url + +proc checkrule {type this} { + global alrules + global debuglevel + + if {$debuglevel > 3} { + puts "CHECKRULE $type $this" + } + if {[info exist alrules]} { + foreach l $alrules { + if {$debuglevel > 3} { + puts "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask (! negates) + set masks [lindex $l 2] + set ok 0 + foreach mask $masks { + if {$debuglevel > 4} { + puts "consider single mask $mask" + } + if {[string index $mask 0] == "!"} { + set mask [string range $mask 1 end] + if {[string match $mask $this]} continue + } else { + if {![string match $mask $this]} continue + } + set ok 1 + } + if {$debuglevel > 4} { + puts "ok = $ok" + } + if {!$ok} continue + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + puts "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 +} + + +proc url {href} { + global debuglevel + + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global alrules + + lappend alrules [list deny $type $stuff] +} + +proc allow {type stuff} { + global alrules + + lappend alrules [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level +} + +# Parse options + +set i 0 +set l [llength $argv] + +if {$l < 2} { + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} + puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" + + exit 1 +} +while {$i < $l} { + set arg [lindex $argv $i] + switch -glob -- $arg { + -j* { + set robotsMax [string range $arg 2 end] + if {![string length $robotsMax]} { + set robotsMax [lindex $argv [incr i]] + } + } + -c* { + set maxdistance [string range $arg 2 end] + if {![string length $maxdistance]} { + set maxdistance [lindex $argv [incr i]] + } + } + -d* { + set dom [string range $arg 2 end] + if {![string length $dom]} { + set dom [lindex $argv [incr i]] + } + lappend domains $dom + } + -i* { + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] + } + } + -l* { + set acceptLanguage [string range $arg 2 end] + if {![string length $acceptLanguage]} { + set acceptLanguage [lindex $argv [incr i]] + } + } + -r* { + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + source $rfile + } + default { + set href $arg + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } + } + } + incr i +} + +if {![info exist domains]} { + set domains {*} +} +if {![info exist maxdistance]} { + set maxdistance 50 +} +if {![info exist robotsMax]} { + set robotsMax 5 +} + +puts "domains=$domains" +puts "max distance=$maxdistance" +puts "max jobs=$robotsMax" + + +RobotStart + + +while {$robotsRunning} { + vwait robotsRunning +} + +set statusfile [open status w] +puts $statusfile "$status(unvisited) $status(bad) $status(visited)" +close $statusfile