X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=dcdot.tcl;h=8e15fea593a473224bac1ef1eebcf84f6d74af26;hb=3201adca0560cf447024e23b0e572c9b5137111c;hp=aeb4f91484a9e635a060ee5d3ff9f6e66bc8c51a;hpb=d8234df96ab8fb03ed71f6358f7211ebe725b495;p=tclrobot.git diff --git a/dcdot.tcl b/dcdot.tcl index aeb4f91..8e15fea 100755 --- a/dcdot.tcl +++ b/dcdot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: dcdot.tcl,v 1.1 2000/12/07 20:16:11 adam Exp $ +# $Id: dcdot.tcl,v 1.5 2003/01/13 13:59:07 adam Exp $ # proc RobotRestart {} { @@ -10,29 +10,23 @@ proc RobotRestart {} { proc RobotTextHtml {url} { global URL - - set head 0 - htmlSwitch $URL($url,buf) \ - title { - set URL($url,title) $body - } -nonest meta { - set scheme {} - if {[info exist parm(scheme)]} { - set scheme $parm(scheme) - unset parm(scheme) - } - if {[info exist parm(name)]} { - if {[info exist parm(content)]} { - set URL($url,meta,$parm(name),$scheme) $parm(content) - unset parm(content) - } - unset parm(name) - } - } a { - if {[info exists parm(href)]} { - lappend URL($url,links) $parm(href) - } - } + + set b $URL($url,buf) + set e {]*>} + catch {unset $URL($url,meta)} + while {[regexp -nocase -indices $e $b i]} { + set meta [string range $b [lindex $i 0] [lindex $i 1]] + lappend URL($url,meta) $meta + set b [string range $b [lindex $i 1] end] + } + set b $URL($url,buf) + set e {[^>]*>} + catch {unset $URL($url,meta)} + while {[regexp -nocase -indices $e $b i]} { + set title [string range $b [lindex $i 0] [lindex $i 1]] + lappend URL($url,title) $title + set b [string range $b [lindex $i 1] end] + } } proc Robot200 {url} { @@ -76,23 +70,24 @@ proc RobotReadHeader {url sock} { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer - set n [string first \n\n $URL($url,buf)] + set n [string first \r\n\r\n $URL($url,buf)] if {$n > 1} { + puts "string first match n = $n" set code 0 set version {} set headbuf [string range $URL($url,buf) 0 $n] - incr n - incr n + incr n 4 set URL($url,buf) [string range $URL($url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code set lines [split $headbuf \n] foreach line $lines { - if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} { - set URL($url,head,[string tolower $name]) $value + if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { + set URL($url,head,[string tolower $name]) [string trim $value] } } set URL($url,state) skip + puts "code=$code" switch $code { 200 { if {![info exists URL($url,head,content-type)]} { @@ -105,6 +100,10 @@ proc RobotReadHeader {url sock} { text/plain { fileevent $sock readable [list RobotReadContent $url $sock] } + application/pdf { + puts "ok preceeed with this thingy" + fileevent $sock readable [list RobotReadContent $url $sock] + } default { close $sock Robot200 $url @@ -113,7 +112,6 @@ proc RobotReadHeader {url sock} { } } default { - Robot404 $url close $sock RobotRestart } @@ -125,7 +123,7 @@ proc RobotReadHeader {url sock} { proc RobotConnect {url sock} { global URL agent - fconfigure $sock -translation {auto crlf} -blocking 0 + fconfigure $sock -translation {lf crlf} -blocking 0 fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" @@ -157,14 +155,7 @@ proc RobotGetUrl {url phost} { return 0 } -if {![llength [info commands htmlSwitch]]} { - set e [info sharedlibextension] - if {[catch {load ./tclrobot$e}]} { - load tclrobot$e - } -} - -set agent "zmbot/0.0" +set agent "dcdot.tcl/0.0" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -185,8 +176,22 @@ proc RobotGetDCDOT {url} { if {$argc == 1} { set url [lindex $argv 0] RobotGetDCDOT $url - set mask {,meta,[Dd][Cc]\.*} - foreach a [array names URL $url$mask] { - puts "URL($a) = $URL($a)" + set mask {,meta} + if {[info exist URL($url,meta)]} { + foreach m $URL($url,meta) { + puts $m + } + } + if {[info exist URL($url,title)]} { + foreach m $URL($url,title) { + puts $m + } } -} \ No newline at end of file + foreach v [array names URL $url,head,*] { + puts "$v = $URL($v)" + } + puts "Buffer length is [string length $URL($url,buf)]" + set f [open out.pdf w] + puts -nonewline $f $URL($url,buf) + close $f +}