X-Git-Url: http://git.indexdata.com/?a=blobdiff_plain;f=robot.tcl;h=73d558aad3893ad0b0b823f929247a5cb5d02dac;hb=4355628830cd0f9e27c059d20254d8e1c30896eb;hp=a90d6e8d91c801e0ed392f4b48797c9542c2be46;hpb=1eb62bcb2a33b6e5bf29ec82e2fed329953bbf9a;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index a90d6e8..73d558a 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,5 +1,5 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.43 2003/06/11 09:40:22 adam Exp $ +# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $ # proc RobotFileNext1 {area lead} { # puts "RobotFileNext1 area=$area lead=$lead" @@ -32,6 +32,7 @@ proc RobotFileNext1 {area lead} { } proc RobotWriteRecord {outf fromurl distance} { + puts $outf {} puts $outf "" puts $outf "" puts $outf $distance @@ -47,6 +48,7 @@ proc RobotReadRecord {inf fromurlx distancex} { upvar $distancex distance gets $inf gets $inf + gets $inf set distance [string trim [gets $inf]] # puts "got distance = $distance" gets $inf @@ -608,6 +610,15 @@ proc RobotRedirect {task url tourl code} { } } +proc wellform {body} { + regsub -all {} $body { } abody + regsub -all -nocase {} $abody {} body + regsub -all {<[^\>]+>} $body {} abody + regsub -all { } $abody { } body + regsub -all {&} $body {&} abody + return $abody +} + proc link {task url out href body distance} { global URL control if {[expr $distance > $control($task,distance)]} return @@ -616,7 +627,8 @@ proc link {task url out href body distance} { puts $out "" puts $out "$href" - puts $out "$body" + set abody [wellform $body] + puts $out "$abody" puts $out "" if {![RobotFileExist $task visited $host $path]} { @@ -712,11 +724,9 @@ proc RobotTextHtml {task url out} { # don't print title of document content if noindex is used if {!$noindex} { puts $out "$title" - regsub -all {} $body { } abody - regsub -all -nocase {} $abody {} bbody - regsub -all {<[^\>]+>} $bbody {} nbody + set bbody [wellform $body] puts $out "" - puts $out $nbody + puts $out $bbody puts $out "" } } -nonest base { @@ -801,6 +811,8 @@ proc RobotTextPlain {task url out} { proc RobotWriteMetadata {task url out} { global URL + set charset $URL($task,$url,charset) + puts $out "" puts $out "" set distance 1000 @@ -886,6 +898,7 @@ proc RobotReadHeader {task url sock} { set version {} set headbuf [string range $URL($task,$url,buf) 0 $n] incr n 4 + set URL($task,$url,charset) ISO-8859-1 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code @@ -894,6 +907,7 @@ proc RobotReadHeader {task url sock} { if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($task,$url,head,[string tolower $name]) [string trim $value] } + regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) } puts "HTTP CODE $code" set URL($task,$url,state) skip