comparison relmon.tcl @ 1:cba4887feb2c

Check the content type of documents that are being downloaded Abort if the content type of the document being downloaded cannot be handled. This is to primarily to prevent accidental downloads of potentially large files.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun, 19 Oct 2014 20:56:27 +0200
parents 8c5330f6e9e4
children
comparison
equal deleted inserted replaced
0:8c5330f6e9e4 1:cba4887feb2c
1 #!/usr/bin/tclsh 1 #!/usr/bin/tclsh
2 # 2 #
3 # Copyright (C) 2011 Guido Berhoerster <guido+relmon@berhoerster.name> 3 # Copyright (C) 2014 Guido Berhoerster <guido+relmon@berhoerster.name>
4 # 4 #
5 # Permission is hereby granted, free of charge, to any person obtaining 5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the 6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including 7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish, 8 # without limitation the rights to use, copy, modify, merge, publish,
44 variable VERSION 1 44 variable VERSION 1
45 } 45 }
46 46
47 47
48 namespace eval ::relmon::common { 48 namespace eval ::relmon::common {
49 namespace export cmpVersions isUrlValid urlGetHost parseStateFile 49 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \
50 parseStateFile
50 } 51 }
51 52
52 # implementation of the Debian version comparison algorithm described at 53 # implementation of the Debian version comparison algorithm described at
53 # http://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version 54 # http://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version
54 proc ::relmon::common::cmpVersion {v1 v2} { 55 proc ::relmon::common::cmpVersion {v1 v2} {
138 proc ::relmon::common::urlGetHost {url} { 139 proc ::relmon::common::urlGetHost {url} {
139 return [expr {(![catch {dict create {*}[uri::split $url]} urlParts]) ? 140 return [expr {(![catch {dict create {*}[uri::split $url]} urlParts]) ?
140 [dict get $urlParts "host"] : ""}] 141 [dict get $urlParts "host"] : ""}]
141 } 142 }
142 143
144 proc ::relmon::common::normalizeHttpHeaders {headers} {
145 set httpHeaders [dict create]
146 foreach {header value} $headers {
147 set words {}
148 foreach word [split $header "-"] {
149 lappend words [string totitle $word]
150 }
151 dict set httpHeaders [join $words "-"] $value
152 }
153 return $httpHeaders
154 }
155
143 proc ::relmon::common::parseStateFile {stateFile} { 156 proc ::relmon::common::parseStateFile {stateFile} {
144 try { 157 try {
145 set f [open $stateFile "r"] 158 set f [open $stateFile "r"]
146 } trap {POSIX} {errorMsg errorOptions} { 159 } trap {POSIX} {errorMsg errorOptions} {
147 return -options $errorOptions \ 160 return -options $errorOptions \
357 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl 370 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl
358 rePattern} { 371 rePattern} {
359 upvar 1 $bodyDataName bodyData 372 upvar 1 $bodyDataName bodyData
360 set extractedUrls {} 373 set extractedUrls {}
361 set resultUrls [dict create] 374 set resultUrls [dict create]
362 set bareContentType [string trim [lindex [split $contentType ";"] 0]]
363 # extract all URLs or URL fragments 375 # extract all URLs or URL fragments
364 switch -- $bareContentType { 376 switch -- $contentType {
365 {text/html} - 377 {text/html} -
366 {application/xhtml+xml} { 378 {application/xhtml+xml} {
367 # HTML/XHTML 379 # HTML/XHTML
368 # if tdom parsing has failed or not found any "<a>" element, 380 # if tdom parsing has failed or not found any "<a>" element,
369 # preprocess the document with htmlparse and try again 381 # preprocess the document with htmlparse and try again
458 } 470 }
459 471
460 return 472 return
461 } 473 }
462 474
463 proc ::relmon::update::HandleSuccessfulTransfer {item httpHeaders 475 proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} {
464 httpBodyName} {
465 upvar 1 $httpBodyName httpBody 476 upvar 1 $httpBodyName httpBody
466 variable Log 477 variable Log
467 variable StateBuffer 478 variable StateBuffer
468 variable Queue 479 variable Queue
469 variable Watchlist 480 variable Watchlist
470 481
471 set name [dict get $item "name"] 482 set name [dict get $item "name"]
472 set url [dict get $item "url"] 483 set url [dict get $item "url"]
473 if {[dict exists $httpHeaders "Content-Type"]} {
474 set contentType [dict get $httpHeaders "Content-Type"]
475 } else {
476 set contentType ""
477 }
478 set patternIndex [dict get $item "pattern_index"] 484 set patternIndex [dict get $item "pattern_index"]
479 set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex] 485 set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex]
480 486
481 ${Log}::info "\"$name\": \"$url\": transfer finished" 487 ${Log}::info "\"$name\": \"$url\": transfer finished"
482 488
483 # parse data 489 # parse data
484 try { 490 try {
485 set urls [ExtractUrls httpBody $contentType $url $pattern] 491 set urls [ExtractUrls httpBody [dict get $item "content_type"] $url \
492 $pattern]
486 } trap {RELMON} {errorMsg} { 493 } trap {RELMON} {errorMsg} {
487 # continue on tdom parsing errors or when receiving documents with an 494 # continue on tdom parsing errors or when receiving documents with an
488 # unsupported content type 495 # unsupported content type
489 set urls [dict create] 496 set urls [dict create]
490 set warningMsg "\"$name\": \"$url\": failed to parse data: $errorMsg" 497 set warningMsg "\"$name\": \"$url\": failed to parse data: $errorMsg"
506 ${Log}::debug "\"$name\": \"$url\": queuing \"$newUrl\"" 513 ${Log}::debug "\"$name\": \"$url\": queuing \"$newUrl\""
507 514
508 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ 515 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \
509 [dict create "name" $name "url" $newUrl \ 516 [dict create "name" $name "url" $newUrl \
510 "pattern_index" [expr {$patternIndex + 1}] \ 517 "pattern_index" [expr {$patternIndex + 1}] \
511 "num_redirects" 0 "num_retries" 0] 518 "content_type" "" "num_redirects" 0 "num_retries" 0]
512 } else { 519 } else {
513 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" 520 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
514 } 521 }
515 } 522 }
516 } else { 523 } else {
576 if {[dict get $item "num_redirects"] < 10} { 583 if {[dict get $item "num_redirects"] < 10} {
577 ${Log}::debug "\"$name\": \"$url\": queuing \"$redirectUrl\" due to\ 584 ${Log}::debug "\"$name\": \"$url\": queuing \"$redirectUrl\" due to\
578 redirect" 585 redirect"
579 586
580 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ 587 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \
581 [dict replace $item "url" $redirectUrl \ 588 [dict replace $item "url" $redirectUrl "content_type" "" \
582 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ 589 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
583 "num_retries" 0] 590 "num_retries" 0]
584 } else { 591 } else {
585 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ 592 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
586 redirects" 593 redirects"
633 ${Log}::warn $warningMsg 640 ${Log}::warn $warningMsg
634 StateItemAppendError $name $warningMsg 641 StateItemAppendError $name $warningMsg
635 return 642 return
636 } 643 }
637 644
638 proc ::relmon::update::OnTransferFinishedWrapper {token} { 645 proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} {
639 # ensure that exceptions get raised, by default http catches all errors and 646 # ensure that exceptions get raised, by default http catches all errors and
640 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 647 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
641 if {[catch {OnTransferFinished $token} -> errorOptions]} { 648 if {[catch {eval $callbackCmd $args} -> errorOptions]} {
642 OnError [dict get $errorOptions "-errorinfo"] $errorOptions 649 OnError [dict get $errorOptions "-errorinfo"] $errorOptions
643 } 650 }
644 return 651 return
645 } 652 }
646 653
704 set url [dict get $item "url"] 711 set url [dict get $item "url"]
705 set name [dict get $item "name"] 712 set name [dict get $item "name"]
706 try { 713 try {
707 set token [http::geturl $url \ 714 set token [http::geturl $url \
708 -timeout [dict get $Config "transfer_time_limit"] \ 715 -timeout [dict get $Config "transfer_time_limit"] \
709 -command [namespace code OnTransferFinishedWrapper]] 716 -progress [namespace code {TransferCallbackWrapper \
717 OnTransferProgress}] \
718 -command [namespace code {TransferCallbackWrapper \
719 OnTransferFinished}]]
710 } on ok {} { 720 } on ok {} {
711 dict set ActiveTransfers $token $item 721 dict set ActiveTransfers $token $item
712 722
713 ${Log}::info "\"$name\": \"$url\": starting transfer" 723 ${Log}::info "\"$name\": \"$url\": starting transfer"
714 } on error {errorMsg} { 724 } on error {errorMsg} {
753 } 763 }
754 764
755 return 765 return
756 } 766 }
757 767
768 proc ::relmon::update::OnTransferProgress {token total current} {
769 upvar #0 $token httpState
770 variable ActiveTransfers
771 variable Log
772
773 # try to determine content type and abort transfer if content type is not
774 # one that can be parsed, this is primarily to prevent accidental downloads
775 if {[dict get $ActiveTransfers $token "content_type"] eq ""} {
776 set httpHeaders [relmon::common::normalizeHttpHeaders \
777 $httpState(meta)]
778
779 if {[dict exists $httpHeaders "Content-Type"]} {
780 set contentType [string trim [lindex [split \
781 [dict get $httpHeaders "Content-Type"] ";"] 0]]
782 dict set ActiveTransfers $token "content_type" $contentType
783 if {$contentType ni {"text/html" "application/xhtml+xml"
784 "application/atom+xml" "application/rss+xml"
785 "text/plain"}} {
786 ${Log}::warn "\"[dict get $ActiveTransfers $token "name"]\":\
787 \"[dict get $ActiveTransfers $token "url"]\": content\
788 type \"$contentType\" is not acceptable"
789 http::reset $token
790 }
791 }
792 }
793 }
794
758 proc ::relmon::update::OnTransferFinished {token} { 795 proc ::relmon::update::OnTransferFinished {token} {
759 upvar #0 $token httpState 796 upvar #0 $token httpState
760 variable Config 797 variable Config
761 variable HostConnections 798 variable HostConnections
762 variable Queue 799 variable Queue
776 dict incr HostConnections $host -1 813 dict incr HostConnections $host -1
777 814
778 switch -- $httpState(status) { 815 switch -- $httpState(status) {
779 {ok} { 816 {ok} {
780 # normalize headers 817 # normalize headers
781 set httpHeaders [dict create] 818 set httpHeaders [relmon::common::normalizeHttpHeaders \
782 foreach {header value} $httpState(meta) { 819 $httpState(meta)]
783 set words {} 820
784 foreach word [split $header "-"] { 821 # try to determine content type
785 lappend words [string totitle $word] 822 if {([dict get $item "content_type"] eq "") &&
786 } 823 [dict exists $httpHeaders "Content-Type"]} {
787 dict set httpHeaders [join $words "-"] $value 824 dict set item "content_type" [string trim [lindex [split \
825 [dict get $httpHeaders "Content-Type"] ";"] 0]]
788 } 826 }
789 827
790 # dispatch based on HTTP status code 828 # dispatch based on HTTP status code
791 set httpCode [http::ncode $token] 829 set httpCode [http::ncode $token]
792 switch -glob -- $httpCode { 830 switch -glob -- $httpCode {
793 {30[12378]} { 831 {30[12378]} {
794 HandleRedirect $item $httpCode $httpHeaders 832 HandleRedirect $item $httpCode $httpHeaders
795 } 833 }
796 {200} { 834 {200} {
797 HandleSuccessfulTransfer $item $httpHeaders httpState(body) 835 HandleSuccessfulTransfer $item httpState(body)
798 } 836 }
799 default { 837 default {
800 HandleProtocolError $item $httpState(http) 838 HandleProtocolError $item $httpState(http)
801 } 839 }
802 } 840 }
841 }
842 {reset} {
843 # aborted due to wrong content type
803 } 844 }
804 {eof} - 845 {eof} -
805 {timeout} { 846 {timeout} {
806 # timeout or connection reset 847 # timeout or connection reset
807 HandleTimeoutReset $item 848 HandleTimeoutReset $item
1120 [dict get $watchlistItem "base_url"]] \ 1161 [dict get $watchlistItem "base_url"]] \
1121 [dict create \ 1162 [dict create \
1122 "name" $name \ 1163 "name" $name \
1123 "url" [dict get $watchlistItem "base_url"] \ 1164 "url" [dict get $watchlistItem "base_url"] \
1124 "pattern_index" 0 \ 1165 "pattern_index" 0 \
1166 "content_type" "" \
1125 "num_redirects" 0 \ 1167 "num_redirects" 0 \
1126 "num_retries" 0] 1168 "num_retries" 0]
1127 dict incr Statistics "items" 1169 dict incr Statistics "items"
1128 dict set StateBuffer $name [dict create "versions" [dict create] \ 1170 dict set StateBuffer $name [dict create "versions" [dict create] \
1129 "errors" [list]] 1171 "errors" [list]]