X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=b6466c347186d9954fbdda74c78ed7054fab76c5;hb=9d3f82cd1140362487d8fa6372cac1b24a49d21e;hp=ab3cef4b8b50fe5773d31c671be66ad3cde707a5;hpb=87b050c8552f5b45c870b8c942ca67fe1da363a5;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index ab3cef4..b6466c3 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,22 +1,23 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.5 1999/12/27 11:49:31 adam Exp $ +# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $ # -proc RobotFileNext {area} { +proc RobotFileNext1 {area lead} { + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } - set off [string first / $area] - incr off - foreach n $ns { if {[file isfile $n]} { - if {[string first :.html $n] > 0} { - return http://[string range $area/ $off end] - } - return http://[string range $n $off end] + set off [string last / $n] + incr off 2 + return $lead/[string range $n $off end] } - if {[file isdirectory $n]} { - set sb [RobotFileNext $n] + } + foreach n $ns { + if {[file isdirectory $n]} { + set off [string last / $n] + incr off 2 + set sb [RobotFileNext1 $n $lead/[string range $n $off end]] if {[string length $sb]} { return $sb } @@ -25,42 +26,134 @@ proc RobotFileNext {area} { return {} } +proc RobotWriteRecord {outf fromurl distance} { + puts $outf "" + puts $outf "" + puts $outf $distance + puts $outf "" + puts $outf "" + puts $outf $fromurl + puts $outf "" + puts $outf "" +} + +proc RobotReadRecord {inf fromurlx distancex} { + upvar $fromurlx fromurl + upvar $distancex distance + gets $inf + gets $inf + set distance [string trim [gets $inf]] + # puts "got distance = $distance" + gets $inf + gets $inf + set fromurl [string trim [gets $inf]] +} + +proc RobotFileNext {area} { + global robotSeq + global idletime ns + global status + + # puts "RobotFileNext robotSeq=$robotSeq" + if {$robotSeq < 0} { + return {} + } + if {$robotSeq == 0} { + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } + } + set off [string length $area] + incr off + set n [lindex $ns $robotSeq] + if {![string length $n]} { + set robotSeq -1 + flush stdout + set statusfile [open status w] + puts $statusfile "$status(unvisited) $status(bad) $status(visited)" + close $statusfile + return wait + } + incr robotSeq + if {[file isfile $n/frobots.txt]} { + puts "ok returning http://[string range $n $off end]/robots.txt" + return http://[string range $n $off end]/robots.txt + } elseif {[file isdirectory $n]} { + set sb [RobotFileNext1 $n http://[string range $n $off end]] + if {[string length $sb]} { + return $sb + } + } + puts "no more work at end of RobotFileNext n=$n" + puts "ns=$ns" + return {} +} + + proc RobotFileExist {area host path} { - set comp [split $area/$host$path /] - set l [llength $comp] + global debuglevel + + if {$debuglevel > 3} { + puts "RobotFileExist begin area=$area host=$host path=$path" + } + set lpath [split $path /] + set l [llength $lpath] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] - } elseif {[file exists [join $comp /]]} { - return 1 - } else { - set comp [split $area/$host$path/:.html /] + set t [lindex $lpath $l] + incr l -1 + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + if {$debuglevel > 3} { + puts "RobotFileExist end npath=$npath" } - return [file exists [join $comp /]] + return [file exists $npath] } proc RobotFileUnlink {area host path} { - set comp [split $area/$host$path /] - set l [llength $comp] + global status + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" + set lpath [split $path /] + set l [llength $lpath] incr l -1 - if {![string length [lindex $comp $l]]} { - set comp [split $area/$host$path:.html /] - } + set t [lindex $lpath $l] + incr l -1 + set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t + # puts "npath=$npath" + set comp [split $npath /] if {[catch {exec rm [join $comp /]}]} return + + set l [llength $comp] + incr l -1 incr l -1 + incr status($area) -1 for {set i $l} {$i > 0} {incr i -1} { set path [join [lrange $comp 0 $i] /] if {![catch {glob $path/*}]} return exec rmdir ./$path } + # puts "RobotFileUnlink end" } -proc RobotFileOpen {area host path} { +proc RobotFileClose {out} { + if [string compare $out stdout] { + close $out + } +} + +proc RobotFileOpen {area host path {mode w}} { set orgPwd [pwd] global workdir + global status + global debuglevel - #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path" + if {![info exists workdir]} { + return stdout + } + if {$debuglevel > 3} { + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + } if {[string compare $orgPwd $workdir]} { + puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" puts "pwd = $orgPwd" exit 1 @@ -69,48 +162,99 @@ proc RobotFileOpen {area host path} { set len [llength $comp] incr len -1 for {set i 0} {$i < $len} {incr i} { - set d [lindex $comp $i] + if {$i > 1} { + set d "d[lindex $comp $i]" + } else { + set d [lindex $comp $i] + } if {[catch {cd ./$d}]} { exec mkdir $d cd ./$d + if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { + set out [open frobots.txt w] + puts "creating robots.txt in $d" + close $out + incr status(unvisited) + } } } set d [lindex $comp $len] if {[string length $d]} { - set out [open $d w] + set out [open f$d $mode] + if {0} { + if {[file isfile $d/f]} { + set out [open $d/f $mode] + } else { + set out [open f$d $mode] + } + } } else { - set out [open :.html w] + set out [open f $mode] + } + if {$mode == "w"} { + incr status($area) } cd $orgPwd - #puts "RobotFileStop" return $out } -proc RobotRestart {} { +proc RobotRR {} { + global robotSeq robotsRunning + + incr robotsRunning -1 + while {$robotsRunning} { + vwait robotsRunning + } + set robotSeq 0 + RobotStart +} + +proc RobotRestart {url sock} { + global URL robotsRunning + + close $sock + after cancel $URL($sock,cancel) + + foreach v [array names URL $url,*] { + unset URL($v) + } + + incr robotsRunning -1 + RobotStart +} + +proc RobotStart {} { global URL - - while {1} { + global robotsRunning robotsMax idletime + + # puts "RobotStart" + while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { - puts "No more unvisited" - break + return + } + incr robotsRunning + if {[string compare $url wait] == 0} { + after $idletime RobotRR + return } set r [RobotGetUrl $url {}] if {!$r} { - puts "RobotGetUrl returned 0 on url=$url" - return + if {$robotsRunning >= $robotsMax} return } else { - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - } + incr robotsRunning -1 + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotFileClose $outf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + } } - exit 0 } -proc headSave {url out title} { +proc headSave {url out} { global URL - puts $out {} - puts $out "$title" if {[info exists URL($url,head,last-modified)]} { puts $out "$URL($url,head,last-modified)" } @@ -134,12 +278,27 @@ proc headSave {url out title} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - # puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } + + if {[string first { } $href] >= 0} { + return 0 + } + if {[string length $href] > 256} { + return 0 + } + if {[string first {?} $href] >= 0} { + return 0 + } + if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { + return 0 + } # get method (if any) if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { set hpath $href @@ -150,259 +309,454 @@ proc RobotHref {url hrefx hostx pathx} { } } # get host (if any) - if {![regexp {^//([^/]+)(.*)} $hpath x host epath]} { - set epath $hpath - set host $URL($url,host) - } else { - if {![string length $epath]} { - set epath / + if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} { + if {![string length $surl]} { + set surl / } - set ok 0 - foreach domain $domains { - if {[string match $domain $host]} { - set ok 1 - break + if {[info exist domains]} { + set ok 0 + foreach domain $domains { + if {[string match $domain $host]} { + set ok 1 + break + } } - } - if {!$ok} { - return 0 - } + if {!$ok} { + return 0 + } + } + } else { + regexp {^([^\#]*)} $hpath x surl + set host $URL($url,hostport) } - if {[regexp {^(\#|\?)} $epath]} { - # within page + if {![string length $surl]} { return 0 - } elseif {![regexp {^([/][^\#?]*)} $epath x path]} { + } + if {[string first / $surl]} { # relative path - set ext [file extension $URL($url,path)] - if {[string compare $ext {}]} { - set dpart [file dirname $URL($url,path)] + set curpath $URL($url,path) + if {[info exists URL($url,bpath)]} { + set curpath $URL($url,bpath) + } + regexp {^([^\#?]*)} $curpath x dpart + set l [string last / $dpart] + if {[expr $l >= 0]} { + set surl [string range $dpart 0 $l]$surl } else { - set dpart $URL($url,path) + set surl $dpart/$surl } - regexp {^([^\#?]+)} $epath x path - set path [string trimright $dpart /]/$path - } - set c [split $path /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + } + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - } - . { - incr i -1 + if {$pathl > 1} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl + } } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } - } + } + if {$debuglevel > 4} { + puts "pathl=$pathl output path=$path" + } + set path [join $path /] + if {![string length $path]} { + set path / + } + regsub -all {~} $path {%7E} path set href "$method://$host$path" - # puts "Ref href = $href" - return 1 -} - -proc Robot401 {url} { - global URL - puts "Bad link $url" - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist forbidden $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen forbidden $URL($url,host) $URL($url,path)] - close $outf + if {$debuglevel > 1} { + puts "Ref result = $href" } + return [checkrule url $href] } -proc Robot404 {url} { +proc RobotError {url code} { global URL - puts "Bad link $url" - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - close $outf + puts "Bad URL $url (code $code)" + set fromurl {} + set distance -1 + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path) + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf } } -proc Robot301 {url tourl} { +proc RobotRedirect {url tourl code} { global URL puts "Redirecting from $url to $tourl" - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) + + set distance {} + set fromurl {} + if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { + set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r] + RobotReadRecord $inf fromurl distance + RobotFileClose $inf + } + if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} { + set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } if {[RobotHref $url tourl host path]} { - if {![RobotFileExist unvisited $host $path]} { - set outf [RobotFileOpen unvisited $host $path] - close $outf + if {![RobotFileExist visited $host $path]} { + if {![RobotFileExist unvisited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $fromurl $distance + RobotFileClose $outf + } + } else { + set olddistance {} + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[string length $distance] == 0} { + set distance 1000 + } + puts "distance=$distance olddistance=$olddistance" + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $tourl $distance + RobotFileClose $outf + } } } + if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} { + puts "unlink failed" + exit 1 + } } -proc Robot200 {url} { - global URL domains +proc link {url out href body distance} { + global URL maxdistance + if {[expr $distance > $maxdistance]} return + + if {![RobotHref $url href host path]} return - # puts "Parsing $url" - set out [RobotFileOpen visited $URL($url,host) $URL($url,path)] - set ti 0 - if {[info exists URL($url,buf)]} { - set htmlContent $URL($url,buf) - - htmlSwitch $htmlContent \ + puts $out "" + puts $out "$href" + puts $out "$body" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } +} + +proc RobotTextHtml {url out} { + global URL maxdistance + + set distance 0 + set fdistance 0 + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { + set fdistance $URL($url,dist) + set distance [expr $fdistance + 1] + } + htmlSwitch $URL($url,buf) \ title { - if {!$ti} { - headSave $url $out $body - set ti 1 + puts $out "$body" + } -nonest meta { + puts -nonewline $out "} } body { - regsub -all -nocase {} $body {} abody - regsub -all {<[^\>]+>} $abody {} nbody + regsub -all {