Mercurial > projects > relmon
comparison relmon.tcl @ 1:cba4887feb2c
Check the content type of documents that are being downloaded
Abort if the content type of the document being downloaded cannot be handled.
This is to primarily to prevent accidental downloads of potentially large files.
author | Guido Berhoerster <guido+relmon@berhoerster.name> |
---|---|
date | Sun, 19 Oct 2014 20:56:27 +0200 |
parents | 8c5330f6e9e4 |
children |
comparison
equal
deleted
inserted
replaced
0:8c5330f6e9e4 | 1:cba4887feb2c |
---|---|
1 #!/usr/bin/tclsh | 1 #!/usr/bin/tclsh |
2 # | 2 # |
3 # Copyright (C) 2011 Guido Berhoerster <guido+relmon@berhoerster.name> | 3 # Copyright (C) 2014 Guido Berhoerster <guido+relmon@berhoerster.name> |
4 # | 4 # |
5 # Permission is hereby granted, free of charge, to any person obtaining | 5 # Permission is hereby granted, free of charge, to any person obtaining |
6 # a copy of this software and associated documentation files (the | 6 # a copy of this software and associated documentation files (the |
7 # "Software"), to deal in the Software without restriction, including | 7 # "Software"), to deal in the Software without restriction, including |
8 # without limitation the rights to use, copy, modify, merge, publish, | 8 # without limitation the rights to use, copy, modify, merge, publish, |
44 variable VERSION 1 | 44 variable VERSION 1 |
45 } | 45 } |
46 | 46 |
47 | 47 |
48 namespace eval ::relmon::common { | 48 namespace eval ::relmon::common { |
49 namespace export cmpVersions isUrlValid urlGetHost parseStateFile | 49 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \ |
50 parseStateFile | |
50 } | 51 } |
51 | 52 |
52 # implementation of the Debian version comparison algorithm described at | 53 # implementation of the Debian version comparison algorithm described at |
53 # http://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version | 54 # http://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version |
54 proc ::relmon::common::cmpVersion {v1 v2} { | 55 proc ::relmon::common::cmpVersion {v1 v2} { |
138 proc ::relmon::common::urlGetHost {url} { | 139 proc ::relmon::common::urlGetHost {url} { |
139 return [expr {(![catch {dict create {*}[uri::split $url]} urlParts]) ? | 140 return [expr {(![catch {dict create {*}[uri::split $url]} urlParts]) ? |
140 [dict get $urlParts "host"] : ""}] | 141 [dict get $urlParts "host"] : ""}] |
141 } | 142 } |
142 | 143 |
144 proc ::relmon::common::normalizeHttpHeaders {headers} { | |
145 set httpHeaders [dict create] | |
146 foreach {header value} $headers { | |
147 set words {} | |
148 foreach word [split $header "-"] { | |
149 lappend words [string totitle $word] | |
150 } | |
151 dict set httpHeaders [join $words "-"] $value | |
152 } | |
153 return $httpHeaders | |
154 } | |
155 | |
143 proc ::relmon::common::parseStateFile {stateFile} { | 156 proc ::relmon::common::parseStateFile {stateFile} { |
144 try { | 157 try { |
145 set f [open $stateFile "r"] | 158 set f [open $stateFile "r"] |
146 } trap {POSIX} {errorMsg errorOptions} { | 159 } trap {POSIX} {errorMsg errorOptions} { |
147 return -options $errorOptions \ | 160 return -options $errorOptions \ |
357 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl | 370 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl |
358 rePattern} { | 371 rePattern} { |
359 upvar 1 $bodyDataName bodyData | 372 upvar 1 $bodyDataName bodyData |
360 set extractedUrls {} | 373 set extractedUrls {} |
361 set resultUrls [dict create] | 374 set resultUrls [dict create] |
362 set bareContentType [string trim [lindex [split $contentType ";"] 0]] | |
363 # extract all URLs or URL fragments | 375 # extract all URLs or URL fragments |
364 switch -- $bareContentType { | 376 switch -- $contentType { |
365 {text/html} - | 377 {text/html} - |
366 {application/xhtml+xml} { | 378 {application/xhtml+xml} { |
367 # HTML/XHTML | 379 # HTML/XHTML |
368 # if tdom parsing has failed or not found any "<a>" element, | 380 # if tdom parsing has failed or not found any "<a>" element, |
369 # preprocess the document with htmlparse and try again | 381 # preprocess the document with htmlparse and try again |
458 } | 470 } |
459 | 471 |
460 return | 472 return |
461 } | 473 } |
462 | 474 |
463 proc ::relmon::update::HandleSuccessfulTransfer {item httpHeaders | 475 proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} { |
464 httpBodyName} { | |
465 upvar 1 $httpBodyName httpBody | 476 upvar 1 $httpBodyName httpBody |
466 variable Log | 477 variable Log |
467 variable StateBuffer | 478 variable StateBuffer |
468 variable Queue | 479 variable Queue |
469 variable Watchlist | 480 variable Watchlist |
470 | 481 |
471 set name [dict get $item "name"] | 482 set name [dict get $item "name"] |
472 set url [dict get $item "url"] | 483 set url [dict get $item "url"] |
473 if {[dict exists $httpHeaders "Content-Type"]} { | |
474 set contentType [dict get $httpHeaders "Content-Type"] | |
475 } else { | |
476 set contentType "" | |
477 } | |
478 set patternIndex [dict get $item "pattern_index"] | 484 set patternIndex [dict get $item "pattern_index"] |
479 set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex] | 485 set pattern [lindex [dict get $Watchlist $name "patterns"] $patternIndex] |
480 | 486 |
481 ${Log}::info "\"$name\": \"$url\": transfer finished" | 487 ${Log}::info "\"$name\": \"$url\": transfer finished" |
482 | 488 |
483 # parse data | 489 # parse data |
484 try { | 490 try { |
485 set urls [ExtractUrls httpBody $contentType $url $pattern] | 491 set urls [ExtractUrls httpBody [dict get $item "content_type"] $url \ |
492 $pattern] | |
486 } trap {RELMON} {errorMsg} { | 493 } trap {RELMON} {errorMsg} { |
487 # continue on tdom parsing errors or when receiving documents with an | 494 # continue on tdom parsing errors or when receiving documents with an |
488 # unsupported content type | 495 # unsupported content type |
489 set urls [dict create] | 496 set urls [dict create] |
490 set warningMsg "\"$name\": \"$url\": failed to parse data: $errorMsg" | 497 set warningMsg "\"$name\": \"$url\": failed to parse data: $errorMsg" |
506 ${Log}::debug "\"$name\": \"$url\": queuing \"$newUrl\"" | 513 ${Log}::debug "\"$name\": \"$url\": queuing \"$newUrl\"" |
507 | 514 |
508 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ | 515 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ |
509 [dict create "name" $name "url" $newUrl \ | 516 [dict create "name" $name "url" $newUrl \ |
510 "pattern_index" [expr {$patternIndex + 1}] \ | 517 "pattern_index" [expr {$patternIndex + 1}] \ |
511 "num_redirects" 0 "num_retries" 0] | 518 "content_type" "" "num_redirects" 0 "num_retries" 0] |
512 } else { | 519 } else { |
513 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" | 520 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" |
514 } | 521 } |
515 } | 522 } |
516 } else { | 523 } else { |
576 if {[dict get $item "num_redirects"] < 10} { | 583 if {[dict get $item "num_redirects"] < 10} { |
577 ${Log}::debug "\"$name\": \"$url\": queuing \"$redirectUrl\" due to\ | 584 ${Log}::debug "\"$name\": \"$url\": queuing \"$redirectUrl\" due to\ |
578 redirect" | 585 redirect" |
579 | 586 |
580 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ | 587 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ |
581 [dict replace $item "url" $redirectUrl \ | 588 [dict replace $item "url" $redirectUrl "content_type" "" \ |
582 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ | 589 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ |
583 "num_retries" 0] | 590 "num_retries" 0] |
584 } else { | 591 } else { |
585 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ | 592 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ |
586 redirects" | 593 redirects" |
633 ${Log}::warn $warningMsg | 640 ${Log}::warn $warningMsg |
634 StateItemAppendError $name $warningMsg | 641 StateItemAppendError $name $warningMsg |
635 return | 642 return |
636 } | 643 } |
637 | 644 |
638 proc ::relmon::update::OnTransferFinishedWrapper {token} { | 645 proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} { |
639 # ensure that exceptions get raised, by default http catches all errors and | 646 # ensure that exceptions get raised, by default http catches all errors and |
640 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 | 647 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 |
641 if {[catch {OnTransferFinished $token} -> errorOptions]} { | 648 if {[catch {eval $callbackCmd $args} -> errorOptions]} { |
642 OnError [dict get $errorOptions "-errorinfo"] $errorOptions | 649 OnError [dict get $errorOptions "-errorinfo"] $errorOptions |
643 } | 650 } |
644 return | 651 return |
645 } | 652 } |
646 | 653 |
704 set url [dict get $item "url"] | 711 set url [dict get $item "url"] |
705 set name [dict get $item "name"] | 712 set name [dict get $item "name"] |
706 try { | 713 try { |
707 set token [http::geturl $url \ | 714 set token [http::geturl $url \ |
708 -timeout [dict get $Config "transfer_time_limit"] \ | 715 -timeout [dict get $Config "transfer_time_limit"] \ |
709 -command [namespace code OnTransferFinishedWrapper]] | 716 -progress [namespace code {TransferCallbackWrapper \ |
717 OnTransferProgress}] \ | |
718 -command [namespace code {TransferCallbackWrapper \ | |
719 OnTransferFinished}]] | |
710 } on ok {} { | 720 } on ok {} { |
711 dict set ActiveTransfers $token $item | 721 dict set ActiveTransfers $token $item |
712 | 722 |
713 ${Log}::info "\"$name\": \"$url\": starting transfer" | 723 ${Log}::info "\"$name\": \"$url\": starting transfer" |
714 } on error {errorMsg} { | 724 } on error {errorMsg} { |
753 } | 763 } |
754 | 764 |
755 return | 765 return |
756 } | 766 } |
757 | 767 |
768 proc ::relmon::update::OnTransferProgress {token total current} { | |
769 upvar #0 $token httpState | |
770 variable ActiveTransfers | |
771 variable Log | |
772 | |
773 # try to determine content type and abort transfer if content type is not | |
774 # one that can be parsed, this is primarily to prevent accidental downloads | |
775 if {[dict get $ActiveTransfers $token "content_type"] eq ""} { | |
776 set httpHeaders [relmon::common::normalizeHttpHeaders \ | |
777 $httpState(meta)] | |
778 | |
779 if {[dict exists $httpHeaders "Content-Type"]} { | |
780 set contentType [string trim [lindex [split \ | |
781 [dict get $httpHeaders "Content-Type"] ";"] 0]] | |
782 dict set ActiveTransfers $token "content_type" $contentType | |
783 if {$contentType ni {"text/html" "application/xhtml+xml" | |
784 "application/atom+xml" "application/rss+xml" | |
785 "text/plain"}} { | |
786 ${Log}::warn "\"[dict get $ActiveTransfers $token "name"]\":\ | |
787 \"[dict get $ActiveTransfers $token "url"]\": content\ | |
788 type \"$contentType\" is not acceptable" | |
789 http::reset $token | |
790 } | |
791 } | |
792 } | |
793 } | |
794 | |
758 proc ::relmon::update::OnTransferFinished {token} { | 795 proc ::relmon::update::OnTransferFinished {token} { |
759 upvar #0 $token httpState | 796 upvar #0 $token httpState |
760 variable Config | 797 variable Config |
761 variable HostConnections | 798 variable HostConnections |
762 variable Queue | 799 variable Queue |
776 dict incr HostConnections $host -1 | 813 dict incr HostConnections $host -1 |
777 | 814 |
778 switch -- $httpState(status) { | 815 switch -- $httpState(status) { |
779 {ok} { | 816 {ok} { |
780 # normalize headers | 817 # normalize headers |
781 set httpHeaders [dict create] | 818 set httpHeaders [relmon::common::normalizeHttpHeaders \ |
782 foreach {header value} $httpState(meta) { | 819 $httpState(meta)] |
783 set words {} | 820 |
784 foreach word [split $header "-"] { | 821 # try to determine content type |
785 lappend words [string totitle $word] | 822 if {([dict get $item "content_type"] eq "") && |
786 } | 823 [dict exists $httpHeaders "Content-Type"]} { |
787 dict set httpHeaders [join $words "-"] $value | 824 dict set item "content_type" [string trim [lindex [split \ |
825 [dict get $httpHeaders "Content-Type"] ";"] 0]] | |
788 } | 826 } |
789 | 827 |
790 # dispatch based on HTTP status code | 828 # dispatch based on HTTP status code |
791 set httpCode [http::ncode $token] | 829 set httpCode [http::ncode $token] |
792 switch -glob -- $httpCode { | 830 switch -glob -- $httpCode { |
793 {30[12378]} { | 831 {30[12378]} { |
794 HandleRedirect $item $httpCode $httpHeaders | 832 HandleRedirect $item $httpCode $httpHeaders |
795 } | 833 } |
796 {200} { | 834 {200} { |
797 HandleSuccessfulTransfer $item $httpHeaders httpState(body) | 835 HandleSuccessfulTransfer $item httpState(body) |
798 } | 836 } |
799 default { | 837 default { |
800 HandleProtocolError $item $httpState(http) | 838 HandleProtocolError $item $httpState(http) |
801 } | 839 } |
802 } | 840 } |
841 } | |
842 {reset} { | |
843 # aborted due to wrong content type | |
803 } | 844 } |
804 {eof} - | 845 {eof} - |
805 {timeout} { | 846 {timeout} { |
806 # timeout or connection reset | 847 # timeout or connection reset |
807 HandleTimeoutReset $item | 848 HandleTimeoutReset $item |
1120 [dict get $watchlistItem "base_url"]] \ | 1161 [dict get $watchlistItem "base_url"]] \ |
1121 [dict create \ | 1162 [dict create \ |
1122 "name" $name \ | 1163 "name" $name \ |
1123 "url" [dict get $watchlistItem "base_url"] \ | 1164 "url" [dict get $watchlistItem "base_url"] \ |
1124 "pattern_index" 0 \ | 1165 "pattern_index" 0 \ |
1166 "content_type" "" \ | |
1125 "num_redirects" 0 \ | 1167 "num_redirects" 0 \ |
1126 "num_retries" 0] | 1168 "num_retries" 0] |
1127 dict incr Statistics "items" | 1169 dict incr Statistics "items" |
1128 dict set StateBuffer $name [dict create "versions" [dict create] \ | 1170 dict set StateBuffer $name [dict create "versions" [dict create] \ |
1129 "errors" [list]] | 1171 "errors" [list]] |