X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=b49ca0b618e3d2dcf2a1c0439e3519e6756f6128;hb=93e08ecd3b8121ef98503e321a5f3039ce330dd2;hp=5c2b51893b9f4a53b1fef607c5eed61454ef4468;hpb=9d508bb1bb6e7479fb9e6753797fc78151cfc0e4;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 5c2b518..b49ca0b 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.16 2001/06/06 07:10:31 adam Exp $ +# $Id: robot.tcl,v 1.24 2001/11/07 11:30:52 adam Exp $ # proc RobotFileNext1 {area lead} { - puts "RobotFileNext1 area=$area lead=$lead" + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -43,16 +43,16 @@ proc RobotReadRecord {inf fromurlx distancex} { gets $inf gets $inf set distance [string trim [gets $inf]] - puts "got distance = $distance" + # puts "got distance = $distance" gets $inf gets $inf set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq global idletime ns - puts "RobotFileNext robotSeq=$robotSeq" + # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { return {} } @@ -67,7 +67,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin" return wait } incr robotSeq @@ -87,27 +87,27 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin area=$area host=$host path=$path" + # puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "RobotFileExist end npath=$npath" + # puts "RobotFileExist end npath=$npath" return [file exists $npath] } proc RobotFileUnlink {area host path} { - puts "RobotFileUnlink begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" + # puts "npath=$npath" set comp [split $npath /] set l [llength $comp] incr l -1 @@ -118,7 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } - puts "RobotFileUnlink end" + # puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -134,7 +134,7 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -201,9 +201,9 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime - puts "RobotStart" + # puts "RobotStart" while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -211,7 +211,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +254,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -267,6 +269,9 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } + if {[string first {?} $href] >= 0} { + return 0 + } if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { return 0 } @@ -313,38 +318,45 @@ proc RobotHref {url hrefx hostx pathx} { set surl $dpart/$surl } } - set c [split $surl /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - if {$i < 0} { - set i 0 + if {$pathl > 0} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl } } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } } + if {$pathl} { + set path [join $path /] + } else { + set path "" + } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -410,10 +422,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -429,7 +441,7 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {} $body {} abody + regsub -all -nocase {))*} $body {} abody regsub -all {<[^\>]+>} $abody {} nbody puts $out "" puts $out $nbody @@ -439,7 +451,7 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {[expr $distance <= $maxDistance]} { + if {[expr $distance <= $maxdistance]} { set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue @@ -489,7 +501,7 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {[expr $distance <= $maxDistance]} { + if {[expr $distance <= $maxdistance]} { set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue @@ -547,7 +559,7 @@ proc RobotsTxt0 {v buf} { global URL agent set section 0 foreach l [split $buf \n] { - if {[regexp {([-A-Za-z]+):[ \t]*([^\#\t ]+)} $l match cmd arg]} { + if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { puts "cmd=$cmd arg=$arg" switch -- [string tolower $cmd] { user-agent { @@ -576,7 +588,8 @@ proc RobotTextPlain {url out} { global URL puts $out "" - puts $out $URL($url,buf) + regsub -all {<} $URL($url,buf) {\<} content + puts $out $content puts $out "" if {![string compare $URL($url,path) /robots.txt]} { @@ -584,10 +597,9 @@ proc RobotTextPlain {url out} { } } -proc Robot200 {url} { +proc RobotWriteMetadata {url out} { global URL domains - - set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] + puts $out "" set distance 1000 @@ -603,30 +615,44 @@ proc Robot200 {url} { headSave $url $out puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { - text/html { - if {[string length $distance]} { - RobotTextHtml $url $out - } - } - text/plain { - RobotTextPlain $url $out - } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } + text/html { + if {[string length $distance]} { + RobotTextHtml $url $out + } + } + text/plain { + RobotTextPlain $url $out + } + application/pdf { + set pdff [open test.pdf w] + puts -nonewline $pdff $URL($url,buf) + close $pdff + } } puts $out "" +} + +proc Robot200 {url} { + global URL domains + + set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] + puts -nonewline $out $URL($url,buf) + RobotFileClose $out + + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + return + } + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] + RobotWriteMetadata $url $out RobotFileClose $out - # puts "Parsing done" + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) } proc RobotReadContent {url sock binary} { global URL - puts "RobotReadContent $url" set buffer [read $sock 16384] set readCount [string length $buffer] @@ -643,9 +669,11 @@ proc RobotReadContent {url sock binary} { } proc RobotReadHeader {url sock} { - global URL + global URL debuglevel - puts "RobotReadHeader $url" + if {$debuglevel > 1} { + puts "HTTP head $url" + } if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock @@ -674,7 +702,7 @@ proc RobotReadHeader {url sock} { set URL($url,head,[string tolower $name]) [string trim $value] } } - puts "code = $code" + puts "HTTP CODE $code" set URL($url,state) skip switch $code { 301 { @@ -714,13 +742,16 @@ proc RobotSockCancel {url sock} { } proc RobotConnect {url sock} { - global URL agent + global URL agent acceptLanguage fconfigure $sock -translation {lf crlf} -blocking 0 fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" puts $sock "User-Agent: $agent" + if {[string length $acceptLanguage]} { + puts $sock "Accept-Language: $acceptLanguage" + } puts $sock "" flush $sock set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] @@ -733,7 +764,7 @@ proc RobotNop {} { proc RobotGetUrl {url phost} { global URL robotsRunning flush stdout - puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + puts "Retrieve $robotsRunning url=$url" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } @@ -789,7 +820,7 @@ if {![llength [info commands htmlSwitch]]} { } } -set agent "zmbot/0.0" +set agent "zmbot/0.1" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -805,17 +836,108 @@ proc bgerror {m} { set robotsRunning 0 set robotSeq 0 set workdir [pwd] -set idleTime 60000 +set idletime 60000 +set acceptLanguage {} set i 0 set l [llength $argv] if {$l < 2} { - puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1 } +# Rules: allow, deny, url +set debuglevel 0 + +proc checkrule {type this} { + global alrules + global debuglevel + + if {$debuglevel > 3} { + puts "CHECKRULE $type $this" + } + if {[info exist alrules]} { + foreach l $alrules { + if {$debuglevel > 3} { + puts "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask (! negates) + set masks [lindex $l 2] + set ok 0 + foreach mask $masks { + if {$debuglevel > 4} { + puts "consider single mask $mask" + } + if {[string index $mask 0] == "!"} { + set mask [string range $mask 1 end] + if {[string match $mask $this]} continue + } else { + if {![string match $mask $this]} continue + } + set ok 1 + } + if {$debuglevel > 4} { + puts "ok = $ok" + } + if {!$ok} continue + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + puts "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 +} + + +proc url {href} { + global debuglevel + + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global alrules + + lappend alrules [list deny $type $stuff] +} + +proc allow {type stuff} { + global alrules + + lappend alrules [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level +} + +# Parse options + while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { @@ -826,9 +948,9 @@ while {$i < $l} { } } -c* { - set maxDistance [string range $arg 2 end] - if {![string length $maxDistance]} { - set maxDistance [lindex $argv [incr i]] + set maxdistance [string range $arg 2 end] + if {![string length $maxdistance]} { + set maxdistance [lindex $argv [incr i]] } } -d* { @@ -839,11 +961,24 @@ while {$i < $l} { lappend domains $dom } -i* { - set idleTime [string range $arg 2 end] - if {![string length $idleTime]} { - set idleTime [lindex $argv [incr i]] + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] } } + -l* { + set acceptLanguage [string range $arg 2 end] + if {![string length $acceptLanguage]} { + set acceptLanguage [lindex $argv [incr i]] + } + } + -r* { + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + source $rfile + } default { set href $arg if {[RobotHref http://www.indexdata.dk/ href host path]} { @@ -861,15 +996,15 @@ while {$i < $l} { if {![info exist domains]} { set domains {*} } -if {![info exist maxDistance]} { - set maxDistance 50 +if {![info exist maxdistance]} { + set maxdistance 50 } if {![info exist robotsMax]} { set robotsMax 5 } puts "domains=$domains" -puts "max distance=$maxDistance" +puts "max distance=$maxdistance" puts "max jobs=$robotsMax" RobotStart