projects/relmon

changeset 1:cba4887feb2c

Check the content type of documents that are being downloaded

Abort if the content type of the document being downloaded cannot be handled.
This is to primarily to prevent accidental downloads of potentially large files.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun Oct 19 20:56:27 2014 +0200 (2014-10-19)
parents 8c5330f6e9e4
children 0203fffb4d74
files relmon.tcl
line diff
     1.1 --- a/relmon.tcl	Sun Oct 19 20:44:39 2014 +0200
     1.2 +++ b/relmon.tcl	Sun Oct 19 20:56:27 2014 +0200
     1.3 @@ -1,6 +1,6 @@
     1.4  #!/usr/bin/tclsh
     1.5  #
     1.6 -# Copyright (C) 2011 Guido Berhoerster <guido+relmon@berhoerster.name>
     1.7 +# Copyright (C) 2014 Guido Berhoerster <guido+relmon@berhoerster.name>
     1.8  #
     1.9  # Permission is hereby granted, free of charge, to any person obtaining
    1.10  # a copy of this software and associated documentation files (the
    1.11 @@ -46,7 +46,8 @@
    1.12  
    1.13  
    1.14  namespace eval ::relmon::common {
    1.15 -    namespace export cmpVersions isUrlValid urlGetHost parseStateFile
    1.16 +    namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \
    1.17 +            parseStateFile
    1.18  }
    1.19  
    1.20  # implementation of the Debian version comparison algorithm described at
    1.21 @@ -140,6 +141,18 @@
    1.22                  [dict get $urlParts "host"] : ""}]
    1.23  }
    1.24  
    1.25 +proc ::relmon::common::normalizeHttpHeaders {headers} {
    1.26 +    set httpHeaders [dict create]
    1.27 +    foreach {header value} $headers {
    1.28 +        set words {}
    1.29 +        foreach word [split $header "-"] {
    1.30 +            lappend words [string totitle $word]
    1.31 +        }
    1.32 +        dict set httpHeaders [join $words "-"] $value
    1.33 +    }
    1.34 +    return $httpHeaders
    1.35 +}
    1.36 +
    1.37  proc ::relmon::common::parseStateFile {stateFile} {
    1.38      try {
    1.39          set f [open $stateFile "r"]
    1.40 @@ -359,9 +372,8 @@
    1.41      upvar 1 $bodyDataName bodyData
    1.42      set extractedUrls {}
    1.43      set resultUrls [dict create]
    1.44 -    set bareContentType [string trim [lindex [split $contentType ";"] 0]]
    1.45      # extract all URLs or URL fragments
    1.46 -    switch -- $bareContentType {
    1.47 +    switch -- $contentType {
    1.48          {text/html} -
    1.49          {application/xhtml+xml} {
    1.50              # HTML/XHTML
    1.51 @@ -460,8 +472,7 @@
    1.52      return
    1.53  }
    1.54  
    1.55 -proc ::relmon::update::HandleSuccessfulTransfer {item httpHeaders
    1.56 -        httpBodyName} {
    1.57 +proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} {
    1.58      upvar 1 $httpBodyName httpBody
    1.59      variable Log
    1.60      variable StateBuffer
    1.61 @@ -470,11 +481,6 @@
    1.62  
    1.63      set name [dict get $item "name"]
    1.64      set url [dict get $item "url"]
    1.65 -    if {[dict exists $httpHeaders "Content-Type"]} {
    1.66 -        set contentType [dict get $httpHeaders "Content-Type"]
    1.67 -    } else {
    1.68 -        set contentType ""
    1.69 -    }
    1.70      set patternIndex [dict get $item "pattern_index"]
    1.71      set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex]
    1.72  
    1.73 @@ -482,7 +488,8 @@
    1.74  
    1.75      # parse data
    1.76      try {
    1.77 -        set urls [ExtractUrls httpBody $contentType $url $pattern]
    1.78 +        set urls [ExtractUrls httpBody [dict get $item "content_type"] $url \
    1.79 +                $pattern]
    1.80      } trap {RELMON} {errorMsg} {
    1.81          # continue on tdom parsing errors or when receiving documents with an
    1.82          # unsupported content type
    1.83 @@ -508,7 +515,7 @@
    1.84                  dict lappend Queue [::relmon::common::urlGetHost $newUrl] \
    1.85                          [dict create "name" $name "url" $newUrl \
    1.86                          "pattern_index" [expr {$patternIndex + 1}] \
    1.87 -                        "num_redirects" 0 "num_retries" 0]
    1.88 +                        "content_type" "" "num_redirects" 0 "num_retries" 0]
    1.89              } else {
    1.90                  ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
    1.91              }
    1.92 @@ -578,7 +585,7 @@
    1.93                  redirect"
    1.94  
    1.95          dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \
    1.96 -                [dict replace $item "url" $redirectUrl \
    1.97 +                [dict replace $item "url" $redirectUrl "content_type" "" \
    1.98                  "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
    1.99                  "num_retries" 0]
   1.100      } else {
   1.101 @@ -635,10 +642,10 @@
   1.102      return
   1.103  }
   1.104  
   1.105 -proc ::relmon::update::OnTransferFinishedWrapper {token} {
   1.106 +proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} {
   1.107      # ensure that exceptions get raised, by default http catches all errors and
   1.108      # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
   1.109 -    if {[catch {OnTransferFinished $token} -> errorOptions]} {
   1.110 +    if {[catch {eval $callbackCmd $args} -> errorOptions]} {
   1.111          OnError [dict get $errorOptions "-errorinfo"] $errorOptions
   1.112      }
   1.113      return
   1.114 @@ -706,7 +713,10 @@
   1.115          try {
   1.116              set token [http::geturl $url \
   1.117                      -timeout [dict get $Config "transfer_time_limit"] \
   1.118 -                    -command [namespace code OnTransferFinishedWrapper]]
   1.119 +                    -progress [namespace code {TransferCallbackWrapper \
   1.120 +                    OnTransferProgress}] \
   1.121 +                    -command [namespace code {TransferCallbackWrapper \
   1.122 +                    OnTransferFinished}]]
   1.123          } on ok {} {
   1.124              dict set ActiveTransfers $token $item
   1.125  
   1.126 @@ -755,6 +765,33 @@
   1.127      return
   1.128  }
   1.129  
   1.130 +proc ::relmon::update::OnTransferProgress {token total current} {
   1.131 +    upvar #0 $token httpState
   1.132 +    variable ActiveTransfers
   1.133 +    variable Log
   1.134 +
   1.135 +    # try to determine content type and abort transfer if content type is not
   1.136 +    # one that can be parsed, this is primarily to prevent accidental downloads
   1.137 +    if {[dict get $ActiveTransfers $token "content_type"] eq ""} {
   1.138 +        set httpHeaders [relmon::common::normalizeHttpHeaders \
   1.139 +                $httpState(meta)]
   1.140 +
   1.141 +        if {[dict exists $httpHeaders "Content-Type"]} {
   1.142 +            set contentType [string trim [lindex [split \
   1.143 +                    [dict get $httpHeaders "Content-Type"] ";"] 0]]
   1.144 +            dict set ActiveTransfers $token "content_type" $contentType
   1.145 +            if {$contentType ni {"text/html" "application/xhtml+xml"
   1.146 +                    "application/atom+xml" "application/rss+xml"
   1.147 +                    "text/plain"}} {
   1.148 +                ${Log}::warn "\"[dict get $ActiveTransfers $token "name"]\":\
   1.149 +                        \"[dict get $ActiveTransfers $token "url"]\": content\
   1.150 +                        type \"$contentType\" is not acceptable"
   1.151 +                http::reset $token
   1.152 +            }
   1.153 +        }
   1.154 +    }
   1.155 +}
   1.156 +
   1.157  proc ::relmon::update::OnTransferFinished {token} {
   1.158      upvar #0 $token httpState
   1.159      variable Config
   1.160 @@ -778,13 +815,14 @@
   1.161      switch -- $httpState(status) {
   1.162          {ok} {
   1.163              # normalize headers
   1.164 -            set httpHeaders [dict create]
   1.165 -            foreach {header value} $httpState(meta) {
   1.166 -                set words {}
   1.167 -                foreach word [split $header "-"] {
   1.168 -                    lappend words [string totitle $word]
   1.169 -                }
   1.170 -                dict set httpHeaders [join $words "-"] $value
   1.171 +            set httpHeaders [relmon::common::normalizeHttpHeaders \
   1.172 +                    $httpState(meta)]
   1.173 +
   1.174 +            # try to determine content type
   1.175 +            if {([dict get $item "content_type"] eq "") &&
   1.176 +                    [dict exists $httpHeaders "Content-Type"]} {
   1.177 +                dict set item "content_type" [string trim [lindex [split \
   1.178 +                        [dict get $httpHeaders "Content-Type"] ";"] 0]]
   1.179              }
   1.180  
   1.181              # dispatch based on HTTP status code
   1.182 @@ -794,13 +832,16 @@
   1.183                      HandleRedirect $item $httpCode $httpHeaders
   1.184                  }
   1.185                  {200} {
   1.186 -                    HandleSuccessfulTransfer $item $httpHeaders httpState(body)
   1.187 +                    HandleSuccessfulTransfer $item httpState(body)
   1.188                  }
   1.189                  default {
   1.190                      HandleProtocolError $item $httpState(http)
   1.191                  }
   1.192              }
   1.193          }
   1.194 +        {reset} {
   1.195 +            # aborted due to wrong content type
   1.196 +        }
   1.197          {eof} -
   1.198          {timeout} {
   1.199              # timeout or connection reset
   1.200 @@ -1122,6 +1163,7 @@
   1.201                  "name" $name \
   1.202                  "url" [dict get $watchlistItem "base_url"] \
   1.203                  "pattern_index" 0 \
   1.204 +                "content_type" "" \
   1.205                  "num_redirects" 0 \
   1.206                  "num_retries" 0]
   1.207          dict incr Statistics "items"