From: Adam Dickmeiss
Date: Mon, 11 Dec 2000 17:11:03 +0000 (+0000)
Subject: Fixed problem with links having .. for root directory of web server.
X-Git-Tag: ZMBOT.0.1~32
X-Git-Url: http://git.indexdata.com/?p=tclrobot.git;a=commitdiff_plain;h=de694c0a797c955ec564b91fee7d0293ce9bd03c

Fixed problem with links having .. for root directory of web server.
Thank you FrontPage.
---

diff --git a/dcdot.tcl b/dcdot.tcl
index a29e6f5..3b2edcb 100755
--- a/dcdot.tcl
+++ b/dcdot.tcl
@@ -1,5 +1,5 @@
 #!/usr/bin/tclsh
-# $Id: dcdot.tcl,v 1.3 2000/12/08 22:46:53 adam Exp $
+# $Id: dcdot.tcl,v 1.4 2000/12/11 17:11:03 adam Exp $
 #
 
 proc RobotRestart {} {
@@ -62,20 +62,20 @@ proc RobotReadHeader {url sock} {
 #        puts "Got $readCount bytes"
         set URL($url,buf) $URL($url,buf)$buffer
-        set n [string first \n\n $URL($url,buf)]
+        set n [string first \r\n\r\n $URL($url,buf)]
         if {$n > 1} {
+            puts "string first match n = $n"
             set code 0
             set version {}
             set headbuf [string range $URL($url,buf) 0 $n]
-            incr n
-            incr n
+            incr n 4
             set URL($url,buf) [string range $URL($url,buf) $n end]
 
             regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
             set lines [split $headbuf \n]
             foreach line $lines {
                 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
-                    set URL($url,head,[string tolower $name]) $value
+                    set URL($url,head,[string tolower $name]) [string trim $value]
                 }
             }
             set URL($url,state) skip
@@ -91,6 +91,10 @@ proc RobotReadHeader {url sock} {
                 text/plain {
                     fileevent $sock readable [list RobotReadContent $url $sock]
                 }
+                application/pdf {
+                    puts "ok preceeed with this thingy"
+                    fileevent $sock readable [list RobotReadContent $url $sock]
+                }
                 default {
                     close $sock
                     Robot200 $url
@@ -110,7 +114,7 @@ proc RobotConnect {url sock} {
     global URL agent
 
-    fconfigure $sock -translation {auto crlf} -blocking 0
+    fconfigure $sock -translation {lf crlf} -blocking 0
     fileevent $sock readable [list RobotReadHeader $url $sock]
     puts $sock "GET $URL($url,path) HTTP/1.0"
     puts $sock "Host: $URL($url,host)"
@@ -172,4 +176,8 @@ if {$argc == 1} {
     foreach v [array names URL $url,head,*] {
         puts "$v = $URL($v)"
     }
+    puts "Buffer length is [string length $URL($url,buf)]"
+    set f [open out.pdf w]
+    puts -nonewline $f $URL($url,buf)
+    close $f
 }
diff --git a/robot.tcl b/robot.tcl
index 4934b92..82e5c28 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,24 +1,23 @@
 #!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $
+# $Id: robot.tcl,v 1.9 2000/12/11 17:11:03 adam Exp $
 #
-proc RobotFileNext1 {area} {
+proc RobotFileNext1 {area lead} {
+    puts "RobotFileNext1 area=$area lead=$lead"
     if {[catch {set ns [glob ${area}/*]}]} {
         return {}
     }
-    set off [string first / $area]
-    incr off
-
     foreach n $ns {
         if {[file isfile $n]} {
-            if {[string first :.html $n] > 0} {
-                return http://[string range $area/ $off end]
-            }
-            return http://[string range $n $off end]
+            set off [string last / $n]
+            incr off 2
+            return $lead/[string range $n $off end]
         }
     }
     foreach n $ns {
         if {[file isdirectory $n]} {
-            set sb [RobotFileNext1 $n]
+            set off [string last / $n]
+            incr off 2
+            set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
             if {[string length $sb]} {
                 return $sb
             }
@@ -34,6 +33,7 @@ proc RobotFileWait {} {
 
 proc RobotFileNext {area} {
     global robotSeq
+    puts "RobotFileNext robotSeq=$robotSeq"
     if {[catch {set ns [glob ${area}/*]}]} {
         return {}
     }
@@ -42,51 +42,63 @@ proc RobotFileNext {area} {
     set n [lindex $ns $robotSeq]
     if {![string length $n]} {
+        flush stdout
         puts "------------ N E X T R O U N D --------"
         set robotSeq -1
-        after 30000 RobotFileWait
+        after 60000 RobotFileWait
         vwait robotSeq
         set n [lindex $ns $robotSeq]
         if {![string length $n]} {
+            puts "robotSeq = $robotSeq"
+            puts "ns=$ns"
+            puts "no more work at index"
             return {}
         }
     }
     incr robotSeq
-    if {[file isfile $n/robots.txt]} {
+    if {[file isfile $n/frobots.txt]} {
         puts "ok returning http://[string range $n $off end]/robots.txt"
         return http://[string range $n $off end]/robots.txt
     } elseif {[file isdirectory $n]} {
-        set sb [RobotFileNext1 $n]
+        set sb [RobotFileNext1 $n http://[string range $n $off end]]
         if {[string length $sb]} {
             return $sb
         }
     }
+    puts "no more work at end of RobotFileNext n=$n"
+    puts "ns=$ns"
     return {}
 }
 
 proc RobotFileExist {area host path} {
-    set comp [split $area/$host$path /]
-    set l [llength $comp]
+    puts "RobotFileExist begin"
+    puts "area=$area host=$host path=$path"
+    set lpath [split $path /]
+    set l [llength $lpath]
     incr l -1
-    if {![string length [lindex $comp $l]]} {
-        set comp [split $area/$host$path:.html /]
-    } elseif {[file exists [join $comp /]]} {
-        return 1
-    } else {
-        set comp [split $area/$host$path/:.html /]
-    }
-    return [file exists [join $comp /]]
+    set t [lindex $lpath $l]
+    incr l -1
+    set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+    puts "npath=$npath"
+    puts "RobotFileExist end"
+    return [file exists $npath]
 }
 
 proc RobotFileUnlink {area host path} {
-    set comp [split $area/$host$path /]
+    puts "RobotFileUnlink begin"
+    puts "area=$area host=$host path=$path"
+    set lpath [split $path /]
+    set l [llength $lpath]
+    incr l -1
+    set t [lindex $lpath $l]
+    incr l -1
+    set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+    puts "npath=$npath"
+    set comp [split $npath /]
     set l [llength $comp]
     incr l -1
-    if {![string length [lindex $comp $l]]} {
-        set comp [split $area/$host$path:.html /]
-    }
     if {[catch {exec rm [join $comp /]}]} return
     incr l -1
     for {set i $l} {$i > 0} {incr i -1} {
@@ -94,6 +106,7 @@
         if {![catch {glob $path/*}]} return
         exec rmdir ./$path
     }
+    puts "RobotFileUnlink end"
 }
 
 proc RobotFileClose {out} {
@@ -120,12 +133,17 @@ proc RobotFileOpen {area host path {mode w}} {
     set len [llength $comp]
     incr len -1
     for {set i 0} {$i < $len} {incr i} {
-        set d [lindex $comp $i]
+        if {$i > 1} {
+            set d "d[lindex $comp $i]"
+        } else {
+            set d [lindex $comp $i]
+        }
         if {[catch {cd ./$d}]} {
             exec mkdir $d
+            puts "creating $d"
             cd ./$d
             if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
-                set out [open robots.txt w]
+                set out [open frobots.txt w]
                 puts "creating robots.txt in $d"
                 close $out
             }
@@ -134,12 +152,15 @@
     set d [lindex $comp $len]
     if {[string length $d]} {
         if {[file isdirectory $d]} {
-            set out [open $d/:.html $mode]
+            set out [open $d/f $mode]
+            puts "1"
         } else {
-            set out [open $d $mode]
+            set out [open f$d $mode]
+            puts "2"
        }
     } else {
-        set out [open :.html $mode]
+        set out [open f $mode]
+        puts "3"
     }
     cd $orgPwd
     #puts "RobotFileStop"
@@ -170,7 +191,6 @@ proc RobotRestart {sock} {
 
 proc headSave {url out} {
     global URL
-    puts $out {}
     if {[info exists URL($url,head,last-modified)]} {
         puts $out "$URL($url,head,last-modified)"
     }
@@ -250,6 +270,9 @@ proc RobotHref {url hrefx hostx pathx} {
             switch -- [lindex $c $i] {
                 .. {
                     incr i -2
+                    if {$i < 0} {
+                        set i 0
+                    }
                 }
                 . {
                     incr i -1
@@ -300,12 +323,15 @@ proc RobotRedirect {url tourl code} {
 
     puts "Redirecting from $url to $tourl"
     set fromurl {}
-    catch {
+    if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
         set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
         set fromurl [gets $inf]
         RobotFileClose $inf
     }
-    RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+    if {[catch {RobotFileUnlink unvisited $URL($url,host) $URL($url,path)}]} {
+        puts "unlink failed"
+        exit 1
+    }
     if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
         set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
         puts $outf "URL=$url to $tourl $code"
@@ -325,19 +351,10 @@ proc RobotRedirect {url tourl code} {
 
 proc RobotTextHtml {url out} {
     global URL
-    set head 0
     htmlSwitch $URL($url,buf) \
         title {
-            if {!$head} {
-                headSave $url $out
-                set head 1
-            }
             puts $out "$body"
         } -nonest meta {
-            if {!$head} {
-                headSave $url $out
-                set head 1
-            }
            puts -nonewline $out ""
        }
 proc RobotsTxt {url} {
@@ -424,11 +432,9 @@ proc RobotsTxt {url} {
 
 proc RobotTextPlain {url out} {
     global URL
-    headSave $url $out
     puts $out ""
     puts $out $URL($url,buf)
     puts $out ""
-    puts $out ""
 
     if {![string compare $URL($url,path) /robots.txt]} {
         RobotsTxt $url
@@ -440,6 +446,8 @@ proc Robot200 {url} {
 
     puts "Parsing $url"
     set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+    puts $out ""
+    headSave $url $out
     switch $URL($url,head,content-type) {
         text/html {
             RobotTextHtml $url $out
@@ -447,17 +455,19 @@
         text/plain {
             RobotTextPlain $url $out
         }
-        default {
-            headSave $url $out
-            puts $out ""
+        application/pdf {
+            set pdff [open test.pdf w]
+            puts -nonewline $pdff $URL($url,buf)
+            close $pdff
         }
     }
+    puts $out ""
     RobotFileClose $out
 #    puts "Parsing done"
     RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
 }
 
-proc RobotReadContent {url sock} {
+proc RobotReadContent {url sock binary} {
     global URL
 
     set buffer [read $sock 16384]
@@ -466,7 +476,7 @@
     if {$readCount <= 0} {
         Robot200 $url
         RobotRestart $sock
-    } elseif {[string first \0 $buffer] >= 0} {
+    } elseif {!$binary && [string first \0 $buffer] >= 0} {
         Robot200 $url
         RobotRestart $sock
     } else {
@@ -478,6 +488,7 @@ proc RobotReadHeader {url sock} {
     global URL
 
+    puts "RobotReadHeader $url"
     if {[catch {set buffer [read $sock 2148]}]} {
         RobotError $url 404
         RobotRestart $sock
@@ -491,20 +502,19 @@
 #        puts "Got $readCount bytes"
         set URL($url,buf) $URL($url,buf)$buffer
-        set n [string first \n\n $URL($url,buf)]
+        set n [string first \r\n\r\n $URL($url,buf)]
         if {$n > 1} {
             set code 0
             set version {}
             set headbuf [string range $URL($url,buf) 0 $n]
-            incr n
-            incr n
+            incr n 4
             set URL($url,buf) [string range $URL($url,buf) $n end]
 
             regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
             set lines [split $headbuf \n]
             foreach line $lines {
                 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
-                    set URL($url,head,[string tolower $name]) $value
+                    set URL($url,head,[string tolower $name]) [string trim $value]
                 }
             }
             puts "code = $code"
@@ -518,33 +528,20 @@ proc RobotReadHeader {url sock} {
                     RobotRedirect $url $URL($url,head,location) 302
                     RobotRestart $sock
                 }
-                404 {
-                    RobotError $url 404
-                    RobotRestart $sock
-                }
-                401 {
-                    RobotError $url 401
-                    RobotRestart $sock
-                }
                 200 {
                     if {![info exists URL($url,head,content-type)]} {
                         set URL($url,head,content-type) {}
                     }
+                    set binary 0
                     switch $URL($url,head,content-type) {
-                        text/html {
-                            fileevent $sock readable [list RobotReadContent $url $sock]
-                        }
-                        text/plain {
-                            fileevent $sock readable [list RobotReadContent $url $sock]
-                        }
-                        default {
-                            Robot200 $url
-                            RobotRestart $sock
+                        application/pdf {
+                            set binary 1
                         }
                     }
+                    fileevent $sock readable [list RobotReadContent $url $sock $binary]
                 }
                 default {
-                    RobotError $url 404
+                    RobotError $url $code
                     RobotRestart $sock
                 }
             }
@@ -562,7 +559,7 @@ proc RobotSockCancel {sock url} {
 proc RobotConnect {url sock} {
     global URL agent
 
-    fconfigure $sock -translation {auto crlf} -blocking 0
+    fconfigure $sock -translation {lf crlf} -blocking 0
     fileevent $sock readable [list RobotReadHeader $url $sock]
     puts $sock "GET $URL($url,path) HTTP/1.0"
     puts $sock "Host: $URL($url,host)"
@@ -578,6 +575,7 @@ proc RobotNop {} {
 
 proc RobotGetUrl {url phost} {
     global URL
+    flush stdout
     puts "---------"
     puts $url
     if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
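
The RobotHref hunk above is the fix named in the subject line: a link whose path uses ".." at or above the document root (the kind of link the commit message credits to FrontPage) could drive the segment index in RobotHref below zero, and the added check clamps it at the root. A minimal standalone sketch of the same idea in Tcl follows; the proc name NormalizePath and the sample paths are illustrative only and are not part of robot.tcl:

    # Collapse "." and ".." segments of an absolute URL path without ever
    # stepping above "/", which is the case the RobotHref clamp guards against.
    proc NormalizePath {path} {
        set out {}
        foreach seg [split $path /] {
            switch -- $seg {
                {} -
                . {
                    # drop empty segments (doubled slashes) and "."
                }
                .. {
                    # clamp at the web server root instead of going negative
                    if {[llength $out] > 0} {
                        set out [lreplace $out end end]
                    }
                }
                default {
                    lappend out $seg
                }
            }
        }
        return /[join $out /]
    }

    # Example: a link that tries to step above the root stays inside it.
    puts [NormalizePath /../images/logo.gif]   ;# -> /images/logo.gif
    puts [NormalizePath /a/b/../c/./d.html]    ;# -> /a/c/d.html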