changeset 1:cba4887feb2c

Check the content type of documents that are being downloaded Abort if the content type of the document being downloaded cannot be handled. This is to primarily to prevent accidental downloads of potentially large files.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun, 19 Oct 2014 20:56:27 +0200
parents 8c5330f6e9e4
children 0203fffb4d74
files relmon.tcl
diffstat 1 files changed, 67 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/relmon.tcl	Sun Oct 19 20:44:39 2014 +0200
+++ b/relmon.tcl	Sun Oct 19 20:56:27 2014 +0200
@@ -1,6 +1,6 @@
 #!/usr/bin/tclsh
 #
-# Copyright (C) 2011 Guido Berhoerster <guido+relmon@berhoerster.name>
+# Copyright (C) 2014 Guido Berhoerster <guido+relmon@berhoerster.name>
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -46,7 +46,8 @@
 
 
 namespace eval ::relmon::common {
-    namespace export cmpVersions isUrlValid urlGetHost parseStateFile
+    namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \
+            parseStateFile
 }
 
 # implementation of the Debian version comparison algorithm described at
@@ -140,6 +141,18 @@
                 [dict get $urlParts "host"] : ""}]
 }
 
+proc ::relmon::common::normalizeHttpHeaders {headers} {
+    set httpHeaders [dict create]
+    foreach {header value} $headers {
+        set words {}
+        foreach word [split $header "-"] {
+            lappend words [string totitle $word]
+        }
+        dict set httpHeaders [join $words "-"] $value
+    }
+    return $httpHeaders
+}
+
 proc ::relmon::common::parseStateFile {stateFile} {
     try {
         set f [open $stateFile "r"]
@@ -359,9 +372,8 @@
     upvar 1 $bodyDataName bodyData
     set extractedUrls {}
     set resultUrls [dict create]
-    set bareContentType [string trim [lindex [split $contentType ";"] 0]]
     # extract all URLs or URL fragments
-    switch -- $bareContentType {
+    switch -- $contentType {
         {text/html} -
         {application/xhtml+xml} {
             # HTML/XHTML
@@ -460,8 +472,7 @@
     return
 }
 
-proc ::relmon::update::HandleSuccessfulTransfer {item httpHeaders
-        httpBodyName} {
+proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} {
     upvar 1 $httpBodyName httpBody
     variable Log
     variable StateBuffer
@@ -470,11 +481,6 @@
 
     set name [dict get $item "name"]
     set url [dict get $item "url"]
-    if {[dict exists $httpHeaders "Content-Type"]} {
-        set contentType [dict get $httpHeaders "Content-Type"]
-    } else {
-        set contentType ""
-    }
     set patternIndex [dict get $item "pattern_index"]
     set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex]
 
@@ -482,7 +488,8 @@
 
     # parse data
     try {
-        set urls [ExtractUrls httpBody $contentType $url $pattern]
+        set urls [ExtractUrls httpBody [dict get $item "content_type"] $url \
+                $pattern]
     } trap {RELMON} {errorMsg} {
         # continue on tdom parsing errors or when receiving documents with an
         # unsupported content type
@@ -508,7 +515,7 @@
                 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \
                         [dict create "name" $name "url" $newUrl \
                         "pattern_index" [expr {$patternIndex + 1}] \
-                        "num_redirects" 0 "num_retries" 0]
+                        "content_type" "" "num_redirects" 0 "num_retries" 0]
             } else {
                 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
             }
@@ -578,7 +585,7 @@
                 redirect"
 
         dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \
-                [dict replace $item "url" $redirectUrl \
+                [dict replace $item "url" $redirectUrl "content_type" "" \
                 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
                 "num_retries" 0]
     } else {
@@ -635,10 +642,10 @@
     return
 }
 
-proc ::relmon::update::OnTransferFinishedWrapper {token} {
+proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} {
     # ensure that exceptions get raised, by default http catches all errors and
     # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
-    if {[catch {OnTransferFinished $token} -> errorOptions]} {
+    if {[catch {eval $callbackCmd $args} -> errorOptions]} {
         OnError [dict get $errorOptions "-errorinfo"] $errorOptions
     }
     return
@@ -706,7 +713,10 @@
         try {
             set token [http::geturl $url \
                     -timeout [dict get $Config "transfer_time_limit"] \
-                    -command [namespace code OnTransferFinishedWrapper]]
+                    -progress [namespace code {TransferCallbackWrapper \
+                    OnTransferProgress}] \
+                    -command [namespace code {TransferCallbackWrapper \
+                    OnTransferFinished}]]
         } on ok {} {
             dict set ActiveTransfers $token $item
 
@@ -755,6 +765,33 @@
     return
 }
 
+proc ::relmon::update::OnTransferProgress {token total current} {
+    upvar #0 $token httpState
+    variable ActiveTransfers
+    variable Log
+
+    # try to determine content type and abort transfer if content type is not
+    # one that can be parsed, this is primarily to prevent accidental downloads
+    if {[dict get $ActiveTransfers $token "content_type"] eq ""} {
+        set httpHeaders [relmon::common::normalizeHttpHeaders \
+                $httpState(meta)]
+
+        if {[dict exists $httpHeaders "Content-Type"]} {
+            set contentType [string trim [lindex [split \
+                    [dict get $httpHeaders "Content-Type"] ";"] 0]]
+            dict set ActiveTransfers $token "content_type" $contentType
+            if {$contentType ni {"text/html" "application/xhtml+xml"
+                    "application/atom+xml" "application/rss+xml"
+                    "text/plain"}} {
+                ${Log}::warn "\"[dict get $ActiveTransfers $token "name"]\":\
+                        \"[dict get $ActiveTransfers $token "url"]\": content\
+                        type \"$contentType\" is not acceptable"
+                http::reset $token
+            }
+        }
+    }
+}
+
 proc ::relmon::update::OnTransferFinished {token} {
     upvar #0 $token httpState
     variable Config
@@ -778,13 +815,14 @@
     switch -- $httpState(status) {
         {ok} {
             # normalize headers
-            set httpHeaders [dict create]
-            foreach {header value} $httpState(meta) {
-                set words {}
-                foreach word [split $header "-"] {
-                    lappend words [string totitle $word]
-                }
-                dict set httpHeaders [join $words "-"] $value
+            set httpHeaders [relmon::common::normalizeHttpHeaders \
+                    $httpState(meta)]
+
+            # try to determine content type
+            if {([dict get $item "content_type"] eq "") &&
+                    [dict exists $httpHeaders "Content-Type"]} {
+                dict set item "content_type" [string trim [lindex [split \
+                        [dict get $httpHeaders "Content-Type"] ";"] 0]]
             }
 
             # dispatch based on HTTP status code
@@ -794,13 +832,16 @@
                     HandleRedirect $item $httpCode $httpHeaders
                 }
                 {200} {
-                    HandleSuccessfulTransfer $item $httpHeaders httpState(body)
+                    HandleSuccessfulTransfer $item httpState(body)
                 }
                 default {
                     HandleProtocolError $item $httpState(http)
                 }
             }
         }
+        {reset} {
+            # aborted due to wrong content type
+        }
         {eof} -
         {timeout} {
             # timeout or connection reset
@@ -1122,6 +1163,7 @@
                 "name" $name \
                 "url" [dict get $watchlistItem "base_url"] \
                 "pattern_index" 0 \
+                "content_type" "" \
                 "num_redirects" 0 \
                 "num_retries" 0]
         dict incr Statistics "items"