Mercurial > projects > relmon
diff relmon.tcl @ 1:cba4887feb2c
Check the content type of documents that are being downloaded
Abort if the content type of the document being downloaded cannot be handled.
This is to primarily to prevent accidental downloads of potentially large files.
author | Guido Berhoerster <guido+relmon@berhoerster.name> |
---|---|
date | Sun, 19 Oct 2014 20:56:27 +0200 |
parents | 8c5330f6e9e4 |
children |
line wrap: on
line diff
--- a/relmon.tcl Sun Oct 19 20:44:39 2014 +0200 +++ b/relmon.tcl Sun Oct 19 20:56:27 2014 +0200 @@ -1,6 +1,6 @@ #!/usr/bin/tclsh # -# Copyright (C) 2011 Guido Berhoerster <guido+relmon@berhoerster.name> +# Copyright (C) 2014 Guido Berhoerster <guido+relmon@berhoerster.name> # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -46,7 +46,8 @@ namespace eval ::relmon::common { - namespace export cmpVersions isUrlValid urlGetHost parseStateFile + namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \ + parseStateFile } # implementation of the Debian version comparison algorithm described at @@ -140,6 +141,18 @@ [dict get $urlParts "host"] : ""}] } +proc ::relmon::common::normalizeHttpHeaders {headers} { + set httpHeaders [dict create] + foreach {header value} $headers { + set words {} + foreach word [split $header "-"] { + lappend words [string totitle $word] + } + dict set httpHeaders [join $words "-"] $value + } + return $httpHeaders +} + proc ::relmon::common::parseStateFile {stateFile} { try { set f [open $stateFile "r"] @@ -359,9 +372,8 @@ upvar 1 $bodyDataName bodyData set extractedUrls {} set resultUrls [dict create] - set bareContentType [string trim [lindex [split $contentType ";"] 0]] # extract all URLs or URL fragments - switch -- $bareContentType { + switch -- $contentType { {text/html} - {application/xhtml+xml} { # HTML/XHTML @@ -460,8 +472,7 @@ return } -proc ::relmon::update::HandleSuccessfulTransfer {item httpHeaders - httpBodyName} { +proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} { upvar 1 $httpBodyName httpBody variable Log variable StateBuffer @@ -470,11 +481,6 @@ set name [dict get $item "name"] set url [dict get $item "url"] - if {[dict exists $httpHeaders "Content-Type"]} { - set contentType [dict get $httpHeaders "Content-Type"] - } else { - set contentType "" - } set patternIndex [dict get $item "pattern_index"] set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex] @@ -482,7 +488,8 @@ # parse data try { - set urls [ExtractUrls httpBody $contentType $url $pattern] + set urls [ExtractUrls httpBody [dict get $item "content_type"] $url \ + $pattern] } trap {RELMON} {errorMsg} { # continue on tdom parsing errors or when receiving documents with an # unsupported content type @@ -508,7 +515,7 @@ dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ [dict create "name" $name "url" $newUrl \ "pattern_index" [expr {$patternIndex + 1}] \ - "num_redirects" 0 "num_retries" 0] + "content_type" "" "num_redirects" 0 "num_retries" 0] } else { ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" } @@ -578,7 +585,7 @@ redirect" dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ - [dict replace $item "url" $redirectUrl \ + [dict replace $item "url" $redirectUrl "content_type" "" \ "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ "num_retries" 0] } else { @@ -635,10 +642,10 @@ return } -proc ::relmon::update::OnTransferFinishedWrapper {token} { +proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} { # ensure that exceptions get raised, by default http catches all errors and # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 - if {[catch {OnTransferFinished $token} -> errorOptions]} { + if {[catch {eval $callbackCmd $args} -> errorOptions]} { OnError [dict get $errorOptions "-errorinfo"] $errorOptions } return @@ -706,7 +713,10 @@ try { set token [http::geturl $url \ -timeout [dict get $Config "transfer_time_limit"] \ - -command [namespace code OnTransferFinishedWrapper]] + -progress [namespace code {TransferCallbackWrapper \ + OnTransferProgress}] \ + -command [namespace code {TransferCallbackWrapper \ + OnTransferFinished}]] } on ok {} { dict set ActiveTransfers $token $item @@ -755,6 +765,33 @@ return } +proc ::relmon::update::OnTransferProgress {token total current} { + upvar #0 $token httpState + variable ActiveTransfers + variable Log + + # try to determine content type and abort transfer if content type is not + # one that can be parsed, this is primarily to prevent accidental downloads + if {[dict get $ActiveTransfers $token "content_type"] eq ""} { + set httpHeaders [relmon::common::normalizeHttpHeaders \ + $httpState(meta)] + + if {[dict exists $httpHeaders "Content-Type"]} { + set contentType [string trim [lindex [split \ + [dict get $httpHeaders "Content-Type"] ";"] 0]] + dict set ActiveTransfers $token "content_type" $contentType + if {$contentType ni {"text/html" "application/xhtml+xml" + "application/atom+xml" "application/rss+xml" + "text/plain"}} { + ${Log}::warn "\"[dict get $ActiveTransfers $token "name"]\":\ + \"[dict get $ActiveTransfers $token "url"]\": content\ + type \"$contentType\" is not acceptable" + http::reset $token + } + } + } +} + proc ::relmon::update::OnTransferFinished {token} { upvar #0 $token httpState variable Config @@ -778,13 +815,14 @@ switch -- $httpState(status) { {ok} { # normalize headers - set httpHeaders [dict create] - foreach {header value} $httpState(meta) { - set words {} - foreach word [split $header "-"] { - lappend words [string totitle $word] - } - dict set httpHeaders [join $words "-"] $value + set httpHeaders [relmon::common::normalizeHttpHeaders \ + $httpState(meta)] + + # try to determine content type + if {([dict get $item "content_type"] eq "") && + [dict exists $httpHeaders "Content-Type"]} { + dict set item "content_type" [string trim [lindex [split \ + [dict get $httpHeaders "Content-Type"] ";"] 0]] } # dispatch based on HTTP status code @@ -794,13 +832,16 @@ HandleRedirect $item $httpCode $httpHeaders } {200} { - HandleSuccessfulTransfer $item $httpHeaders httpState(body) + HandleSuccessfulTransfer $item httpState(body) } default { HandleProtocolError $item $httpState(http) } } } + {reset} { + # aborted due to wrong content type + } {eof} - {timeout} { # timeout or connection reset @@ -1122,6 +1163,7 @@ "name" $name \ "url" [dict get $watchlistItem "base_url"] \ "pattern_index" 0 \ + "content_type" "" \ "num_redirects" 0 \ "num_retries" 0] dict incr Statistics "items"