X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=bfe875f339e32169de55f564fdd4aec58d213643;hb=bd463f7d1f1610a3c7a3d9e678f5c4ff27f9d546;hp=6323bc3421614b9ce1eb447fbcb73fdd53445959;hpb=4d94083b545d3665a3ceca7962ebb6788bc62dd3;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 6323bc3..bfe875f 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.19 2001/06/29 21:47:31 adam Exp $ +# $Id: robot.tcl,v 1.32 2002/03/25 16:11:08 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} { } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq + global idletime ns + global status # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { @@ -67,7 +69,9 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + set statusfile [open status w] + puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + close $statusfile return wait } incr robotSeq @@ -87,18 +91,25 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - # puts "RobotFileExist begin area=$area host=$host path=$path" + global debuglevel + + if {$debuglevel > 3} { + puts "RobotFileExist begin area=$area host=$host path=$path" + } set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - # puts "RobotFileExist end npath=$npath" + if {$debuglevel > 3} { + puts "RobotFileExist end npath=$npath" + } return [file exists $npath] } proc RobotFileUnlink {area host path} { + global status # puts "RobotFileUnlink begin" # puts "area=$area host=$host path=$path" set lpath [split $path /] @@ -109,10 +120,12 @@ proc RobotFileUnlink {area host path} { set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t # puts "npath=$npath" set comp [split $npath /] + if {[catch {exec rm [join $comp /]}]} return + set l [llength $comp] incr l -1 - if {[catch {exec rm [join $comp /]}]} return incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return @@ -130,11 +143,15 @@ proc RobotFileClose {out} { proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global workdir + global status + global debuglevel if {![info exists workdir]} { return stdout } - #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + if {$debuglevel > 3} { + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -154,22 +171,24 @@ proc RobotFileOpen {area host path {mode w}} { exec mkdir $d cd ./$d if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { - set out [open frobots.txt w] - puts "creating robots.txt in $d" - close $out + if {[string compare $path /robots.txt]} { + set out [open frobots.txt w] + puts "creating robots.txt in $d" + close $out + incr status(unvisited) + } } } } set d [lindex $comp $len] if {[string length $d]} { - if {[file isdirectory $d]} { - set out [open $d/f $mode] - } else { - set out [open f$d $mode] - } + set out [open f$d $mode] } else { set out [open f $mode] } + if {$mode == "w"} { + incr status($area) + } cd $orgPwd return $out } @@ -201,7 +220,7 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime # puts "RobotStart" while {1} { @@ -211,7 +230,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +273,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -267,12 +288,12 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } - if {[string first {?} $href] >= 0} { - return 0 - } - if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { - return 0 - } +# if {[string first {?} $href] >= 0} { +# return 0 +# } +# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { +# return 0 +# } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -308,7 +329,11 @@ proc RobotHref {url hrefx hostx pathx} { } if {[string first / $surl]} { # relative path - regexp {^([^\#?]*)} $URL($url,path) x dpart + set curpath $URL($url,path) + if {[info exists URL($url,bpath)]} { + set curpath $URL($url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart set l [string last / $dpart] if {[expr $l >= 0]} { set surl [string range $dpart 0 $l]$surl @@ -322,9 +347,10 @@ proc RobotHref {url hrefx hostx pathx} { foreach c $surllist { switch -- $c { .. { - if {$pathl > 0} { - incr pathl -1 + if {$pathl > 1} { + incr pathl -2 set path [lrange $path 0 $pathl] + incr pathl } } . { @@ -336,21 +362,26 @@ proc RobotHref {url hrefx hostx pathx} { } } } - if {$pathl} { - set path [join $path /] - } else { - set path "" + if {$debuglevel > 4} { + puts "pathl=$pathl output path=$path" + } + set path [join $path /] + if {![string length $path]} { + set path / } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -415,131 +446,142 @@ proc RobotRedirect {url tourl code} { } } +proc link {url out href body distance} { + global URL maxdistance + if {[expr $distance > $maxdistance]} return + + if {![RobotHref $url href host path]} return + + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance + + # set title so we can emit it for the body + set title {} + # if true, nothing will be indexed + set noindex 0 + # if true, nothing will be followed + set nofollow 0 set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { - set distance [expr $URL($url,dist) + 1] + set fdistance 0 + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { + set fdistance $URL($url,dist) + set distance [expr $fdistance + 1] } htmlSwitch $URL($url,buf) \ title { - puts $out "$body" + set title $body } -nonest meta { + # collect metadata and save NAME= CONTENT=.. + set metaname {} + set metacontent {} puts -nonewline $out "" + # go through robots directives (af any) + if {![string compare $metaname robots]} { + set direcs [split [string tolower $metacontent] ,] + if {[lsearch $direcs noindex] >= 0} { + set noindex 1 + } + if {[lsearch $direcs nofollow] >= 0} { + set nofollow 1 + } } - puts $out {>} } body { - regsub -all -nocase {))*} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody - puts $out "" - puts $out $nbody - puts $out "" - } -nonest a { + # don't print title of document content if noindex is used + if {!$noindex} { + puts $out "$title" + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} bbody + regsub -all {<[^\>]+>} $bbody {} nbody + puts $out "" + puts $out $nbody + puts $out "" + } + } -nonest base { + # if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxDistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "$body" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + set href [string trim $parm(href)] + if {![RobotHref $url href host path]} continue + set URL($url,bpath) $path + } a { + # .. + # we're not using nonest - otherwise body isn't set + if {$nofollow} continue + if {![info exists parm(href)]} { + continue + } + link $url $out [string trim $parm(href)] $body $distance } -nonest area { + if {$nofollow} continue if {![info exists parm(href)]} { - puts "no href" continue } - if {[expr $distance <= $maxDistance]} { - set href [string trim $parm(href)] - if {![RobotHref $url href host path]} continue - - puts $out "" - puts $out "$href" - puts $out "" - puts $out "" - - if {![RobotFileExist visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist bad $host $path]} { - if {[RobotFileExist unvisited $host $path]} { - set inf [RobotFileOpen unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - puts "OK remarking url=$url href=$href" - puts "olddistance = $olddistance" - puts "newdistance = $distance" - set outf [RobotFileOpen unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } - } + link $url $out [string trim $parm(href)] $body $distance + } -nonest frame { + if {![info exists parm(src)]} { + continue + } + link $url $out [string trim $parm(src)] $body $fdistance } } @@ -591,14 +633,9 @@ proc RobotTextPlain {url out} { } } -proc Robot200 {url} { +proc RobotWriteMetadata {url out} { global URL domains - - set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] - puts -nonewline $out $URL($url,buf) - RobotFileClose $out - set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] puts $out "" set distance 1000 @@ -614,30 +651,35 @@ proc Robot200 {url} { headSave $url $out puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { - text/html { - if {[string length $distance]} { - RobotTextHtml $url $out - } - } - text/plain { - RobotTextPlain $url $out - } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } + text/html { + if {[string length $distance]} { + RobotTextHtml $url $out + } + } + text/plain { + RobotTextPlain $url $out + } } puts $out "" +} + +proc Robot200 {url} { + global URL domains + + set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] + puts -nonewline $out $URL($url,buf) + RobotFileClose $out + + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] + RobotWriteMetadata $url $out RobotFileClose $out - # puts "Parsing done" + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) } proc RobotReadContent {url sock binary} { global URL - puts "RobotReadContent $url" set buffer [read $sock 16384] set readCount [string length $buffer] @@ -654,12 +696,15 @@ proc RobotReadContent {url sock binary} { } proc RobotReadHeader {url sock} { - global URL + global URL debuglevel - puts "RobotReadHeader $url" + if {$debuglevel > 1} { + puts "HTTP head $url" + } if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock + return } set readCount [string length $buffer] @@ -685,7 +730,7 @@ proc RobotReadHeader {url sock} { set URL($url,head,[string tolower $name]) [string trim $value] } } - puts "code = $code" + puts "HTTP CODE $code" set URL($url,state) skip switch $code { 301 { @@ -700,12 +745,19 @@ proc RobotReadHeader {url sock} { if {![info exists URL($url,head,content-type)]} { set URL($url,head,content-type) {} } - set binary 0 - switch $URL($url,head,content-type) { - application/pdf { - set binary 1 + set binary 1 + switch -glob -- $URL($url,head,content-type) { + text/* { + set binary 0 } } + if {![regexp {/robots.txt$} $url]} { + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + RobotRestart $url $sock + return + } + } fileevent $sock readable [list RobotReadContent $url $sock $binary] } default { @@ -747,7 +799,7 @@ proc RobotNop {} { proc RobotGetUrl {url phost} { global URL robotsRunning flush stdout - puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + puts "Retrieve $robotsRunning url=$url" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } @@ -803,7 +855,7 @@ if {![llength [info commands htmlSwitch]]} { } } -set agent "zmbot/0.0" +set agent "zmbot/0.1" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -819,29 +871,113 @@ proc bgerror {m} { set robotsRunning 0 set robotSeq 0 set workdir [pwd] -set idleTime 60000 +set idletime 60000 set acceptLanguage {} +set debuglevel 0 +set status(unvisited) 0 +set status(visited) 0 +set status(bad) 0 +set status(raw) 0 -set i 0 -set l [llength $argv] -# For testing only -if {0} { - set url "http://www.sportsfiskeren.dk/sportsfiskeren/corner/index.htm" - set href "../../data/../../data2/newsovs.asp?Mode=5" +# Rules: allow, deny, url - set URL($url,path) /sportsfiskeren/corner/index.htm - set URL($url,hostport) www.sportsfiskeren.dk - RobotHref $url href host path - exit 0 +proc checkrule {type this} { + global alrules + global debuglevel + + if {$debuglevel > 3} { + puts "CHECKRULE $type $this" + } + if {[info exist alrules]} { + foreach l $alrules { + if {$debuglevel > 3} { + puts "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask (! negates) + set masks [lindex $l 2] + set ok 0 + foreach mask $masks { + if {$debuglevel > 4} { + puts "consider single mask $mask" + } + if {[string index $mask 0] == "!"} { + set mask [string range $mask 1 end] + if {[string match $mask $this]} continue + } else { + if {![string match $mask $this]} continue + } + set ok 1 + } + if {$debuglevel > 4} { + puts "ok = $ok" + } + if {!$ok} continue + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + puts "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + puts "CHECKRULE MATCH OK" + } + return 1 +} + + +proc url {href} { + global debuglevel + + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global alrules + + lappend alrules [list deny $type $stuff] +} + +proc allow {type stuff} { + global alrules + + lappend alrules [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level } +# Parse options + +set i 0 +set l [llength $argv] + if {$l < 2} { - puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" + exit 1 } - while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { @@ -852,9 +988,9 @@ while {$i < $l} { } } -c* { - set maxDistance [string range $arg 2 end] - if {![string length $maxDistance]} { - set maxDistance [lindex $argv [incr i]] + set maxdistance [string range $arg 2 end] + if {![string length $maxdistance]} { + set maxdistance [lindex $argv [incr i]] } } -d* { @@ -865,9 +1001,9 @@ while {$i < $l} { lappend domains $dom } -i* { - set idleTime [string range $arg 2 end] - if {![string length $idleTime]} { - set idleTime [lindex $argv [incr i]] + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] } } -l* { @@ -876,6 +1012,13 @@ while {$i < $l} { set acceptLanguage [lindex $argv [incr i]] } } + -r* { + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + source $rfile + } default { set href $arg if {[RobotHref http://www.indexdata.dk/ href host path]} { @@ -893,19 +1036,26 @@ while {$i < $l} { if {![info exist domains]} { set domains {*} } -if {![info exist maxDistance]} { - set maxDistance 50 +if {![info exist maxdistance]} { + set maxdistance 50 } if {![info exist robotsMax]} { set robotsMax 5 } puts "domains=$domains" -puts "max distance=$maxDistance" +puts "max distance=$maxdistance" puts "max jobs=$robotsMax" + RobotStart + while {$robotsRunning} { vwait robotsRunning } + +set statusfile [open status w] +puts $statusfile "$status(unvisited) $status(bad) $status(visited)" +close $statusfile +