Added -nonest for htmlSwitch statement. Robot puts reference to
[tclrobot.git] / dcdot.tcl
diff --git a/dcdot.tcl b/dcdot.tcl
new file mode 100755 (executable)
index 0000000..aeb4f91
--- /dev/null
+++ b/dcdot.tcl
@@ -0,0 +1,192 @@
+#!/usr/bin/tclsh 
+# $Id: dcdot.tcl,v 1.1 2000/12/07 20:16:11 adam Exp $
+#
+
+proc RobotRestart {} {
+    global robotMoreWork
+
+    set robotMoreWork 0
+}
+
+proc RobotTextHtml {url} {
+    global URL
+    
+    set head 0
+    htmlSwitch $URL($url,buf) \
+        title {
+           set URL($url,title) $body
+        } -nonest meta {
+           set scheme {}
+           if {[info exist parm(scheme)]} {
+               set scheme $parm(scheme)
+               unset parm(scheme)
+           }
+           if {[info exist parm(name)]} {
+               if {[info exist parm(content)]} {
+                   set URL($url,meta,$parm(name),$scheme) $parm(content)
+                   unset parm(content)
+               }
+               unset parm(name)
+           }
+        } a {
+            if {[info exists parm(href)]} {
+               lappend URL($url,links) $parm(href)
+           }
+       }
+}
+
+proc Robot200 {url} {
+    global URL domains
+    
+    # puts "Parsing $url"
+    switch $URL($url,head,content-type) {
+       text/html {
+           RobotTextHtml $url
+       }
+    }
+    # puts "Parsing done"
+}
+
+proc RobotReadContent {url sock} {
+    global URL
+
+    set buffer [read $sock 16384]
+    set readCount [string length $buffer]
+    
+    if {$readCount <= 0} {
+       close $sock
+       Robot200 $url
+       RobotRestart
+    } else {
+       # puts "Got $readCount bytes"
+       set URL($url,buf) $URL($url,buf)$buffer
+    }
+}
+
+proc RobotReadHeader {url sock} {
+    global URL
+
+    set buffer [read $sock 2148]
+    set readCount [string length $buffer]
+    
+    if {$readCount <= 0} {
+       close $sock
+       RobotRestart
+    } else {
+       # puts "Got $readCount bytes"
+       set URL($url,buf) $URL($url,buf)$buffer
+       
+       set n [string first \n\n $URL($url,buf)]
+       if {$n > 1} {
+           set code 0
+           set version {}
+           set headbuf [string range $URL($url,buf) 0 $n]
+           incr n
+           incr n
+           set URL($url,buf) [string range $URL($url,buf) $n end]
+           
+           regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+           set lines [split $headbuf \n]
+           foreach line $lines {
+               if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+                   set URL($url,head,[string tolower $name]) $value
+               }
+           }
+           set URL($url,state) skip
+           switch $code {
+               200 {
+                   if {![info exists URL($url,head,content-type)]} {
+                       set URL($url,head,content-type) {}
+                   }
+                   switch $URL($url,head,content-type) {
+                       text/html {
+                           fileevent $sock readable [list RobotReadContent $url $sock]
+                       }
+                       text/plain {
+                           fileevent $sock readable [list RobotReadContent $url $sock]
+                       }
+                       default {
+                           close $sock
+                           Robot200 $url
+                           RobotRestart
+                       }
+                   }
+               }
+               default {
+                   Robot404 $url
+                   close $sock
+                   RobotRestart
+               }
+           }
+       }
+    }
+}
+
+proc RobotConnect {url sock} {
+    global URL agent
+
+    fconfigure $sock -translation {auto crlf} -blocking 0
+    fileevent $sock readable [list RobotReadHeader $url $sock]
+    puts $sock "GET $URL($url,path) HTTP/1.0"
+    puts $sock "Host: $URL($url,host)"
+    puts $sock "User-Agent: $agent"
+    puts $sock ""
+    flush $sock
+}
+
+proc RobotGetUrl {url phost} {
+    global URL
+    if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
+        return -1
+    }
+    if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+       set port 80
+       set host $hostport
+    }
+    set URL($url,method) $method
+    set URL($url,host) $host
+    set URL($url,port) $port
+    set URL($url,path) $path
+    set URL($url,state) head
+    set URL($url,buf) {}
+    if [catch {set sock [socket -async $host $port]}] {
+        return -1
+    }
+    RobotConnect $url $sock
+
+    return 0
+}
+
+if {![llength [info commands htmlSwitch]]} {
+    set e [info sharedlibextension]
+    if {[catch {load ./tclrobot$e}]} {
+       load tclrobot$e
+    }
+}
+
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+    set agent "$agent ($os)"
+}
+
+proc RobotGetDCDOT {url} {
+    global robotMoreWork 1
+
+    set robotMoreWork 1
+    if [RobotGetUrl $url {}] {
+       set robotMoreWork 0
+    }
+
+    while {$robotMoreWork} {
+       vwait robotMoreWork
+    }
+}
+
+if {$argc == 1} {
+    set url [lindex $argv 0]
+    RobotGetDCDOT $url
+    set mask {,meta,[Dd][Cc]\.*}
+    foreach a [array names URL $url$mask] {
+       puts "URL($a) = $URL($a)"
+    }
+}
\ No newline at end of file