comparison relmon.tcl.in @ 5:86a0c5d11f05 version-1

Add discover subcommand The discover subcommand assists with the creation of watchlist entries and allows to reproduce each step of an update operation for a watchlist entry.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun, 26 Oct 2014 21:36:05 +0100
parents 6d87242c537e
children
comparison
equal deleted inserted replaced
4:f28486666a4f 5:86a0c5d11f05
40 40
41 41
42 namespace eval ::relmon { 42 namespace eval ::relmon {
43 # version 43 # version
44 variable VERSION @VERSION@ 44 variable VERSION @VERSION@
45
46 # default log format
47 variable LogFormatDefault {%d \[%p\] %m}
48
49 # debugging log format
50 variable LogFormatDebug {%d \[%p\] \[%M\] %m}
45 } 51 }
46 52
47 53
48 namespace eval ::relmon::common { 54 namespace eval ::relmon::common {
49 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \ 55 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \
175 } 181 }
176 182
177 return $state 183 return $state
178 } 184 }
179 185
180 186 proc ::relmon::common::parseWatchlist {watchlistFilename} {
181 namespace eval ::relmon::update { 187 set Watchlist [dict create]
182 # commandline option help text
183 variable usage "usage: relmon update \[-dev\] \[-c max_connections\] \[-C\
184 ca_dir\] \[-D delay\]\n\
185 \ \[-H max_host_connections\] \[-i\
186 item\[,...\]\] \[-l logfile\]\n\
187 \ \[-r retries\] \[-t min_time\] watchlist\
188 statefile"
189
190 # configuration options
191 variable Config [dict create \
192 "log_file" "" \
193 "log_level" "notice" \
194 "history_limit" 20 \
195 "connection_limit" 16 \
196 "host_connection_limit" 4 \
197 "transfer_time_limit" 60000 \
198 "retry_limit" 3 \
199 "host_delay" 0 \
200 "timestamp_filter" 0 \
201 "error_filter" 0 \
202 "item_filter" {} \
203 "ca_dir" "" \
204 "state_file" "" \
205 "watchlist_file" ""]
206
207 # exit status
208 variable ExitStatus
209
210 # transfer statistics
211 variable Statistics [dict create \
212 "start_time" 0 \
213 "end_time" 0 \
214 "requests" 0 \
215 "items" 0]
216
217 # watchlist
218 variable Watchlist
219
220 # ID of a delayed run of ManageTransfers
221 variable ManageTransfersId ""
222
223 # queue of pending transfers
224 variable Queue
225
226 # number of active connections per host
227 variable HostConnections
228
229 # delays before opening a new connection to a host
230 variable HostDelays
231
232 # active transfers
233 variable ActiveTransfers
234
235 # buffer for tracking the state of unfinished items
236 variable StateBuffer
237
238 # buffer needed by htmlparse::parse for constructing the preprocessed HTML
239 # document
240 variable PreprocessedHtmlBuffer
241
242 # logger handle
243 variable Log
244
245 # logfile handle
246 variable Lf
247 }
248
249 proc ::relmon::update::OnError {message returnOptions} {
250 # internal error, abort
251 puts stderr [dict get $returnOptions "-errorinfo"]
252
253 exit 1
254 }
255
256 proc ::relmon::update::CleanupBeforeExit {commandString operation} {
257 variable Lf
258
259 # close logfile
260 if {($Lf ne "") && ($Lf ni {stdin stderr})} {
261 close $Lf
262 set Lf ""
263 }
264
265 return
266 }
267
268 proc ::relmon::update::ParseWatchlist {watchlistFilename} {
269 variable Watchlist
270
271 set lineno 0 188 set lineno 0
272 set f [open $watchlistFilename "r"] 189 set f [open $watchlistFilename "r"]
273 try { 190 try {
274 while {[chan gets $f line] != -1} { 191 while {[chan gets $f line] != -1} {
275 set fields [textutil::split::splitx [string trim $line] {[\t ]+}] 192 set fields [textutil::split::splitx [string trim $line] {[\t ]+}]
336 } 253 }
337 } finally { 254 } finally {
338 close $f 255 close $f
339 } 256 }
340 257
341 return 258 return $Watchlist
342 } 259 }
343 260
344 proc ::relmon::update::ProcessHtmlElement {tag slash param textBehindTheTag} { 261
262 namespace eval ::relmon::crawler {
263 # status returned by crawl
264 variable Status
265
266 # configuration
267 variable Config
268
269 # transfer statistics
270 variable Statistics [dict create \
271 "start_time" 0 \
272 "end_time" 0 \
273 "requests" 0 \
274 "items" 0]
275
276 # ID of a delayed run of ManageTransfers
277 variable ManageTransfersId ""
278
279 # queue of pending transfers
280 variable Queue
281
282 # number of active connections per host
283 variable HostConnections [dict create]
284
285 # delays before opening a new connection to a host
286 variable HostDelays [dict create]
287
288 # active transfers
289 variable ActiveTransfers [dict create]
290
291 # number of running or queued transfers of watchlist items
292 variable RemainingTransfers [dict create]
293
294 # buffer for tracking the state of unfinished items
295 variable StateBuffer [dict create]
296
297 # buffer needed by htmlparse::parse for constructing the preprocessed HTML
298 # document
299 variable PreprocessedHtmlBuffer
300
301 # callback when all transfers for an item are finished
302 variable OnItemFinishedCmd
303
304 # logger handle
305 variable Log
306
307 namespace export crawl
308 }
309
310 proc ::relmon::crawler::OnError {message returnOptions} {
311 # internal error, abort
312 puts stderr [dict get $returnOptions "-errorinfo"]
313
314 exit 1
315 }
316
317 proc ::relmon::crawler::ProcessHtmlElement {tag slash param textBehindTheTag} {
345 variable PreprocessedHtmlBuffer 318 variable PreprocessedHtmlBuffer
346 319
347 # copy every "<a>" element into PreprocessedHtmlBuffer 320 # copy every "<a>" element into PreprocessedHtmlBuffer
348 if {($slash eq "") && ([string tolower $tag] eq "a")} { 321 if {($slash eq "") && ([string tolower $tag] eq "a")} {
349 append PreprocessedHtmlBuffer "<$tag $param></$tag>" 322 append PreprocessedHtmlBuffer "<$tag $param></$tag>"
350 } 323 }
351 324
352 return 325 return
353 } 326 }
354 327
355 proc ::relmon::update::PreprocessHtml {bodyDataName} { 328 proc ::relmon::crawler::PreprocessHtml {bodyDataName} {
356 upvar 1 $bodyDataName bodyData 329 upvar 1 $bodyDataName bodyData
357 variable PreprocessedHtmlBuffer 330 variable PreprocessedHtmlBuffer
358 331
359 # preprocess the document with htmlparse by constructing a new document 332 # preprocess the document with htmlparse by constructing a new document
360 # consisting only of found "<a>" elements which then can be fed into tdom 333 # consisting only of found "<a>" elements which then can be fed into tdom
365 set PreprocessedHtmlBuffer "<html><body>" 338 set PreprocessedHtmlBuffer "<html><body>"
366 htmlparse::parse -cmd [namespace code ProcessHtmlElement] $bodyData 339 htmlparse::parse -cmd [namespace code ProcessHtmlElement] $bodyData
367 append PreprocessedHtmlBuffer "</body></html>" 340 append PreprocessedHtmlBuffer "</body></html>"
368 } 341 }
369 342
370 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl 343 proc ::relmon::crawler::ExtractUrls {bodyDataName contentType baseUrl
371 rePattern} { 344 rePattern} {
372 upvar 1 $bodyDataName bodyData 345 upvar 1 $bodyDataName bodyData
373 set extractedUrls {} 346 set extractedUrls {}
374 set resultUrls [dict create] 347 set resultUrls [dict create]
375 # extract all URLs or URL fragments 348 # extract all URLs or URL fragments
460 } 433 }
461 434
462 return $resultUrls 435 return $resultUrls
463 } 436 }
464 437
465 proc ::relmon::update::StateItemAppendError {name logMsg} { 438 proc ::relmon::crawler::StateItemAppendError {name logMsg} {
466 variable StateBuffer 439 variable StateBuffer
467 440
468 dict update StateBuffer $name stateItem { 441 dict update StateBuffer $name stateItem {
469 dict lappend stateItem "errors" $logMsg 442 dict lappend stateItem "errors" $logMsg
470 } 443 }
471 444
472 return 445 return
473 } 446 }
474 447
475 proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} { 448 proc ::relmon::crawler::HandleSuccessfulTransfer {item httpBodyName} {
476 upvar 1 $httpBodyName httpBody 449 upvar 1 $httpBodyName httpBody
477 variable Log 450 variable Log
478 variable StateBuffer 451 variable StateBuffer
479 variable Queue 452 variable Queue
453 variable RemainingTransfers
480 variable Watchlist 454 variable Watchlist
481 455
482 set name [dict get $item "name"] 456 set name [dict get $item "name"]
483 set url [dict get $item "url"] 457 set url [dict get $item "url"]
484 set patternIndex [dict get $item "pattern_index"] 458 set patternIndex [dict get $item "pattern_index"]
514 488
515 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ 489 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \
516 [dict create "name" $name "url" $newUrl \ 490 [dict create "name" $name "url" $newUrl \
517 "pattern_index" [expr {$patternIndex + 1}] \ 491 "pattern_index" [expr {$patternIndex + 1}] \
518 "content_type" "" "num_redirects" 0 "num_retries" 0] 492 "content_type" "" "num_redirects" 0 "num_retries" 0]
493 dict incr RemainingTransfers $name
519 } else { 494 } else {
520 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" 495 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
521 } 496 }
522 } 497 }
523 } else { 498 } else {
524 # otherwise this branch has finished, try to extract the versions and 499 # otherwise this branch has finished, try to extract the versions and
525 # store them in the buffer 500 # store them in the buffer
526 dict for {finalUrl matched} $urls { 501 dict for {finalUrl matched} $urls {
502 dict set StateBuffer $name "urls" $url $urls
527 if {$matched} { 503 if {$matched} {
528 regexp -line -- $pattern $finalUrl -> version 504 regexp -line -- $pattern $finalUrl -> version
529 if {$version ne ""} { 505 if {$version ne ""} {
530 ${Log}::debug "\"$name\": \"$url\": extracted version\ 506 ${Log}::debug "\"$name\": \"$url\": extracted version\
531 \"$version\" from \"$finalUrl\" found on\ 507 \"$version\" from \"$finalUrl\" found on\
542 } 518 }
543 519
544 return 520 return
545 } 521 }
546 522
547 proc ::relmon::update::HandleRedirect {item httpCode httpHeaders} { 523 proc ::relmon::crawler::HandleRedirect {item httpCode httpHeaders} {
548 variable Log 524 variable Log
549 variable Queue 525 variable Queue
526 variable RemainingTransfers
550 527
551 set name [dict get $item "name"] 528 set name [dict get $item "name"]
552 set url [dict get $item "url"] 529 set url [dict get $item "url"]
553 530
554 if {![dict exists $httpHeaders "Location"]} { 531 if {![dict exists $httpHeaders "Location"]} {
586 563
587 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ 564 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \
588 [dict replace $item "url" $redirectUrl "content_type" "" \ 565 [dict replace $item "url" $redirectUrl "content_type" "" \
589 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ 566 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
590 "num_retries" 0] 567 "num_retries" 0]
568 dict incr RemainingTransfers $name
591 } else { 569 } else {
592 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ 570 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
593 redirects" 571 redirects"
594 ${Log}::warn $warningMsg 572 ${Log}::warn $warningMsg
595 StateItemAppendError $name $warningMsg 573 StateItemAppendError $name $warningMsg
596 } 574 }
597 575
598 return 576 return
599 } 577 }
600 578
601 proc ::relmon::update::HandleProtocolError {item httpCode} { 579 proc ::relmon::crawler::HandleProtocolError {item httpCode} {
602 variable Log 580 variable Log
603 set name [dict get $item "name"] 581 set name [dict get $item "name"]
604 set url [dict get $item "url"] 582 set url [dict get $item "url"]
605 set warningMsg "\"$name\": \"$url\": transfer failed: $httpCode" 583 set warningMsg "\"$name\": \"$url\": transfer failed: $httpCode"
606 ${Log}::warn $warningMsg 584 ${Log}::warn $warningMsg
607 StateItemAppendError $name $warningMsg 585 StateItemAppendError $name $warningMsg
608 return 586 return
609 } 587 }
610 588
611 proc ::relmon::update::HandleTimeoutReset {item} { 589 proc ::relmon::crawler::HandleTimeoutReset {item} {
612 variable Log 590 variable Log
613 variable Config 591 variable Config
614 variable Queue 592 variable Queue
593 variable RemainingTransfers
615 set name [dict get $item "name"] 594 set name [dict get $item "name"]
616 set url [dict get $item "url"] 595 set url [dict get $item "url"]
617 596
618 # retry by re-queuing the target URL until reaching the limit 597 # retry by re-queuing the target URL until reaching the limit
619 if {[dict get $item "num_retries"] < [dict get $Config "retry_limit"]} { 598 if {[dict get $item "num_retries"] < [dict get $Config "retry_limit"]} {
620 ${Log}::warn "\"$name\": \"$url\": connection timed out or was reset,\ 599 ${Log}::warn "\"$name\": \"$url\": connection timed out or was reset,\
621 retrying" 600 retrying"
622 dict lappend Queue [::relmon::common::urlGetHost $url] \ 601 dict lappend Queue [::relmon::common::urlGetHost $url] \
623 [dict replace $item \ 602 [dict replace $item \
624 "num_retries" [expr {[dict get $item "num_retries"] + 1}]] 603 "num_retries" [expr {[dict get $item "num_retries"] + 1}]]
604 dict incr RemainingTransfers $name
625 } else { 605 } else {
626 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ 606 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
627 retries" 607 retries"
628 ${Log}::warn $warningMsg 608 ${Log}::warn $warningMsg
629 StateItemAppendError $name $warningMsg 609 StateItemAppendError $name $warningMsg
630 } 610 }
631 611
632 return 612 return
633 } 613 }
634 614
635 proc ::relmon::update::HandleConnectionError {item errorMsg} { 615 proc ::relmon::crawler::HandleConnectionError {item errorMsg} {
636 variable Log 616 variable Log
637 set name [dict get $item "name"] 617 set name [dict get $item "name"]
638 set url [dict get $item "url"] 618 set url [dict get $item "url"]
639 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" 619 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg"
640 ${Log}::warn $warningMsg 620 ${Log}::warn $warningMsg
641 StateItemAppendError $name $warningMsg 621 StateItemAppendError $name $warningMsg
642 return 622 return
643 } 623 }
644 624
645 proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} { 625 proc ::relmon::crawler::TransferCallbackWrapper {callbackCmd args} {
646 # ensure that exceptions get raised, by default http catches all errors and 626 # ensure that exceptions get raised, by default http catches all errors and
647 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 627 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
648 if {[catch {eval $callbackCmd $args} -> errorOptions]} { 628 if {[catch {eval $callbackCmd $args} -> errorOptions]} {
649 OnError [dict get $errorOptions "-errorinfo"] $errorOptions 629 OnError [dict get $errorOptions "-errorinfo"] $errorOptions
650 } 630 }
651 return 631 return
652 } 632 }
653 633
654 proc ::relmon::update::ManageTransfers {} { 634 proc ::relmon::crawler::ManageTransfers {} {
655 variable Config 635 variable Config
656 variable ManageTransfersId 636 variable ManageTransfersId
657 variable Queue 637 variable Queue
658 variable HostConnections 638 variable HostConnections
659 variable HostDelays 639 variable HostDelays
660 variable ActiveTransfers 640 variable ActiveTransfers
661 variable ExitStatus 641 variable RemainingTransfers
642 variable Status
662 variable Log 643 variable Log
663 644
664 after cancel $ManageTransfersId 645 after cancel $ManageTransfersId
665 646
666 # try to initiate new transfers 647 # try to initiate new transfers
724 } on error {errorMsg} { 705 } on error {errorMsg} {
725 # an error occured during socket setup, e.g. a DNS lookup failure 706 # an error occured during socket setup, e.g. a DNS lookup failure
726 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" 707 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg"
727 ${Log}::warn $warningMsg 708 ${Log}::warn $warningMsg
728 StateItemAppendError $name $warningMsg 709 StateItemAppendError $name $warningMsg
710 dict incr RemainingTransfers $name -1
729 } 711 }
730 } 712 }
731 713
732 # terminate the event loop if there are no remaining transfers 714 # terminate the event loop if there are no remaining transfers
733 if {([dict size $ActiveTransfers]) == 0 && ([dict size $Queue] == 0)} { 715 if {[::tcl::mathop::+ {*}[dict values $RemainingTransfers]] == 0} {
734 set ExitStatus 0 716 if {([dict size $ActiveTransfers]) > 0 || ([dict size $Queue] > 0)} {
717 ${Log}::error "inconsistent internal state"
718 set Status 1
719 }
720 set Status 0
735 return 721 return
736 } 722 }
737 723
738 # due to per-host connection limits and per-host delays the maximum number 724 # due to per-host connection limits and per-host delays the maximum number
739 # of connections may not be reached although there are still items in the 725 # of connections may not be reached although there are still items in the
763 } 749 }
764 750
765 return 751 return
766 } 752 }
767 753
768 proc ::relmon::update::OnTransferProgress {token total current} { 754 proc ::relmon::crawler::OnTransferProgress {token total current} {
769 upvar #0 $token httpState 755 upvar #0 $token httpState
770 variable ActiveTransfers 756 variable ActiveTransfers
771 variable Log 757 variable Log
772 758
773 # try to determine content type and abort transfer if content type is not 759 # try to determine content type and abort transfer if content type is not
790 } 776 }
791 } 777 }
792 } 778 }
793 } 779 }
794 780
795 proc ::relmon::update::OnTransferFinished {token} { 781 proc ::relmon::crawler::OnTransferFinished {token} {
796 upvar #0 $token httpState 782 upvar #0 $token httpState
797 variable Config 783 variable Config
798 variable HostConnections 784 variable HostConnections
799 variable Queue
800 variable ActiveTransfers 785 variable ActiveTransfers
786 variable RemainingTransfers
801 variable Statistics 787 variable Statistics
802 variable StateBuffer 788 variable StateBuffer
803 variable State 789 variable State
804 variable Log 790 variable Log
791 variable OnItemFinishedCmd
805 792
806 set item [dict get $ActiveTransfers $token] 793 set item [dict get $ActiveTransfers $token]
807 set name [dict get $item "name"] 794 set name [dict get $item "name"]
808 set host [relmon::common::urlGetHost [dict get $item "url"]] 795 set host [relmon::common::urlGetHost [dict get $item "url"]]
809 796
810 # update list of per-host connections, and number of remaining transfers 797 # update list of per-host connections, and number of remaining transfers
811 # for this item 798 # for this item
812 dict unset ActiveTransfers $token 799 dict unset ActiveTransfers $token
813 dict incr HostConnections $host -1 800 dict incr HostConnections $host -1
801 dict incr RemainingTransfers $name -1
814 802
815 switch -- $httpState(status) { 803 switch -- $httpState(status) {
816 {ok} { 804 {ok} {
817 # normalize headers 805 # normalize headers
818 set httpHeaders [relmon::common::normalizeHttpHeaders \ 806 set httpHeaders [relmon::common::normalizeHttpHeaders \
852 HandleConnectionError $item [lindex $httpState(error) 0] 840 HandleConnectionError $item [lindex $httpState(error) 0]
853 } 841 }
854 } 842 }
855 843
856 # check if all transfers of this item are finished 844 # check if all transfers of this item are finished
857 set itemFinished 1 845 if {[dict get $RemainingTransfers $name] == 0} {
858 dict for {queueHost queueItems} $Queue { 846 eval $OnItemFinishedCmd [list $name [dict get $StateBuffer $name]]
859 foreach queueItem $queueItems { 847
860 if {[dict get $queueItem "name"] eq $name} {
861 set itemFinished 0
862 }
863 }
864 }
865 dict for {activeToken activeItem} $ActiveTransfers {
866 if {[dict get $activeItem "name"] eq $name} {
867 set itemFinished 0
868 }
869 }
870 if {$itemFinished} {
871 set timestamp [clock milliseconds]
872
873 # create httpState item if it does not exist yet
874 if {![dict exists $State $name]} {
875 dict set State $name [dict create "versions" [dict create] \
876 "history" [list] "timestamp" 0 "errors" [list]]
877 }
878
879 # if there are no versions, log an error message since something must
880 # be wrong
881 if {[llength [dict get $StateBuffer $name "versions"]] == 0} {
882 set warningMsg "\"$name\": no versions found"
883 ${Log}::warn $warningMsg
884 StateItemAppendError $name $warningMsg
885 }
886
887 # update httpState item
888 dict set State $name "errors" [dict get $StateBuffer $name "errors"]
889 dict set State $name "timestamp" $timestamp
890 if {[llength [dict get $StateBuffer $name "errors"]] == 0} {
891 # expire old history entries
892 set history [lrange [dict get $State $name "history"] \
893 [expr {[llength [dict get $State $name "history"]] -
894 [dict get $Config "history_limit"] + 1}] end]
895
896 # add currently latest available version to history if it is either
897 # newer than the previous one or if the previous one is no longer
898 # available (e.g. if it has been removed or the watchlist pattern
899 # has been changed)
900 set prevLatestVersion [lindex $history end 0]
901 set curLatestVersion [lindex \
902 [lsort -command ::relmon::common::cmpVersion \
903 [dict keys [dict get $StateBuffer $name "versions"]]] end]
904 if {([::relmon::common::cmpVersion $curLatestVersion \
905 $prevLatestVersion] > 0) ||
906 ![dict exists $StateBuffer $name "versions" \
907 $prevLatestVersion]} {
908 lappend history [list $curLatestVersion $timestamp]
909 dict set State $name "history" $history
910 }
911 dict set State $name "versions" [dict get $StateBuffer $name \
912 "versions"]
913 }
914 dict unset StateBuffer $name 848 dict unset StateBuffer $name
915 849
916 ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \ 850 ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \
917 $Statistics "items"] items left" 851 $Statistics "items"] items left"
918 } 852 }
923 857
924 return 858 return
925 } 859 }
926 860
927 # control certificate verification and log errors during TLS handshake 861 # control certificate verification and log errors during TLS handshake
928 proc ::relmon::update::OnTlsHandshake {type args} { 862 proc ::relmon::crawler::OnTlsHandshake {type args} {
929 variable Config 863 variable Config
930 variable Log 864 variable Log
931 865
932 switch -- ${type} { 866 switch -- ${type} {
933 {error} { 867 {error} {
955 } 889 }
956 } 890 }
957 } 891 }
958 } 892 }
959 893
894 proc ::relmon::crawler::crawl {config watchlist log onItemFinishedCmd} {
895 variable Config $config
896 variable Watchlist $watchlist
897 variable Queue
898 variable Statistics
899 variable Log $log
900 variable OnItemFinishedCmd $onItemFinishedCmd
901 variable Status
902 variable RemainingTransfers
903 variable StateBuffer
904
905 # initialize queue and state buffer from the watchlist
906 set Queue [dict create]
907 dict for {name watchlistItem} $Watchlist {
908 dict lappend Queue [::relmon::common::urlGetHost \
909 [dict get $watchlistItem "base_url"]] \
910 [dict create \
911 "name" $name \
912 "url" [dict get $watchlistItem "base_url"] \
913 "pattern_index" 0 \
914 "content_type" "" \
915 "num_redirects" 0 \
916 "num_retries" 0]
917 dict set RemainingTransfers $name 1
918 dict incr Statistics "items"
919 dict set StateBuffer $name [dict create "versions" [dict create] \
920 "errors" [list] "urls" [dict create]]
921 }
922
923 # configure http and tls
924 http::register https 443 [list tls::socket \
925 -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
926 -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
927 http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
928 Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
929
930 # handle errors while in the event loop
931 interp bgerror {} [namespace code OnError]
932
933 dict set Statistics "start_time" [clock milliseconds]
934
935 # enter the main loop
936 after idle [namespace code ManageTransfers]
937 vwait [namespace which -variable Status]
938
939 # display statistics
940 dict set Statistics "end_time" [clock milliseconds]
941 ${Log}::notice "items checked: [dict get $Statistics "items"]"
942 ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
943 [dict get $Statistics "start_time"]) / 1000}]s"
944
945 return $Status
946 }
947
948
949 namespace eval ::relmon::update {
950 # commandline option help text
951 variable usage "usage: relmon update \[-dev\] \[-c max_connections\]\
952 \[-C ca_dir\] \[-D host_delay\]\n\
953 \ \[-H max_connections_per_host\]\
954 \[-i item_list\]\n\
955 \ \[-l logfile\] \[-r retries\]\
956 \[-t before_seconds\]\n\
957 \ \[-T timeout_seconds\] watchlist statefile"
958
959 # configuration options
960 variable Config [dict create \
961 "log_file" "" \
962 "log_level" "notice" \
963 "history_limit" 20 \
964 "connection_limit" 16 \
965 "host_connection_limit" 4 \
966 "transfer_time_limit" 60000 \
967 "retry_limit" 3 \
968 "host_delay" 500 \
969 "host_delays" [dict create] \
970 "timestamp_filter" 0 \
971 "error_filter" 0 \
972 "item_filter" {} \
973 "ca_dir" "" \
974 "state_file" "" \
975 "watchlist_file" ""]
976
977 # watchlist
978 variable Watchlist
979
980 # logger handle
981 variable Log
982
983 # logfile handle
984 variable Lf
985 }
986
987 proc ::relmon::update::OnItemFinished {name stateItem} {
988 variable State
989 variable Log
990 variable Config
991
992 set timestamp [clock milliseconds]
993
994 # create httpState item if it does not exist yet
995 if {![dict exists $State $name]} {
996 dict set State $name [dict create "versions" [dict create] \
997 "history" [list] "timestamp" 0 "errors" [list]]
998 }
999
1000 # if there are no versions, log an error message since something must
1001 # be wrong
1002 if {[llength [dict get $stateItem "versions"]] == 0} {
1003 set warningMsg "\"$name\": no versions found"
1004 ${Log}::warn $warningMsg
1005 dict lappend stateItem "errors" $warningMsg
1006 }
1007
1008 # update httpState item
1009 dict set State $name "errors" [dict get $stateItem "errors"]
1010 dict set State $name "timestamp" $timestamp
1011 if {[llength [dict get $stateItem "errors"]] == 0} {
1012 # expire old history entries
1013 set history [lrange [dict get $State $name "history"] \
1014 [expr {[llength [dict get $State $name "history"]] -
1015 [dict get $Config "history_limit"] + 1}] end]
1016
1017 # add currently latest available version to history if it is either
1018 # newer than the previous one or if the previous one is no longer
1019 # available (e.g. if it has been removed or the watchlist pattern
1020 # has been changed)
1021 set prevLatestVersion [lindex $history end 0]
1022 set curLatestVersion [lindex \
1023 [lsort -command ::relmon::common::cmpVersion \
1024 [dict keys [dict get $stateItem "versions"]]] end]
1025 if {([::relmon::common::cmpVersion $curLatestVersion \
1026 $prevLatestVersion] > 0) ||
1027 ![dict exists $stateItem "versions" $prevLatestVersion]} {
1028 lappend history [list $curLatestVersion $timestamp]
1029 dict set State $name "history" $history
1030 }
1031 dict set State $name "versions" [dict get $stateItem "versions"]
1032 }
1033 }
1034
1035 proc ::relmon::update::CleanupBeforeExit {commandString op} {
1036 variable Lf
1037
1038 # close logfile
1039 if {($Lf ne "") && ($Lf ni {stdin stderr})} {
1040 close $Lf
1041 set Lf ""
1042 }
1043
1044 return
1045 }
1046
960 proc ::relmon::update::main {args} { 1047 proc ::relmon::update::main {args} {
961 variable Config 1048 variable Config
962 variable usage 1049 variable usage
963 variable Statistics
964 variable Watchlist [dict create]
965 variable Queue [dict create]
966 variable HostConnections [dict create]
967 variable HostDelays [dict create]
968 variable ActiveTransfers [dict create]
969 variable State 1050 variable State
970 variable StateBuffer [dict create]
971 variable PreprocessedHtmlBuffer
972 variable Log 1051 variable Log
973 variable Lf "" 1052 variable Lf ""
974 variable ExitStatus
975 1053
976 # parse commandline 1054 # parse commandline
977 while {[set GetoptRet [cmdline::getopt args \ 1055 while {[set GetoptRet [cmdline::getopt args \
978 {c.arg C.arg d D.arg e H.arg i.arg l.arg r.arg t.arg T.arg v} \ 1056 {c.arg C.arg d D.arg e H.arg i.arg l.arg r.arg t.arg T.arg v} \
979 OptArg OptVal]] == 1} { 1057 OptArg OptVal]] == 1} {
1099 } else { 1177 } else {
1100 set Lf stderr 1178 set Lf stderr
1101 } 1179 }
1102 set Log [logger::init global] 1180 set Log [logger::init global]
1103 if {[dict get $Config "log_level"] eq "debug"} { 1181 if {[dict get $Config "log_level"] eq "debug"} {
1104 set logFormat {%d \[%p\] \[%M\] %m} 1182 set logFormat $relmon::LogFormatDebug
1105 } else { 1183 } else {
1106 set logFormat {%d \[%p\] %m} 1184 set logFormat $relmon::LogFormatDefault
1107 } 1185 }
1108 logger::utils::applyAppender -appender fileAppend -appenderArgs \ 1186 logger::utils::applyAppender -appender fileAppend -appenderArgs \
1109 [list -outputChannel $Lf -conversionPattern $logFormat] \ 1187 [list -outputChannel $Lf -conversionPattern $logFormat] \
1110 -serviceCmd $Log 1188 -serviceCmd $Log
1111 1189
1114 1192
1115 ${Log}::notice "relmon.tcl starting up" 1193 ${Log}::notice "relmon.tcl starting up"
1116 1194
1117 # parse the watchlist 1195 # parse the watchlist
1118 try { 1196 try {
1119 ParseWatchlist [dict get $Config "watchlist_file"] 1197 set Watchlist [relmon::common::parseWatchlist \
1198 [dict get $Config "watchlist_file"]]
1120 } trap {POSIX} {errorMsg errorOptions} - \ 1199 } trap {POSIX} {errorMsg errorOptions} - \
1121 trap {RELMON} {errorMsg errorOptions} { 1200 trap {RELMON} {errorMsg errorOptions} {
1122 ${Log}::error $errorMsg 1201 ${Log}::error $errorMsg
1123 exit 1 1202 exit 1
1124 } 1203 }
1133 trap {RELMON} {errorMsg} { 1212 trap {RELMON} {errorMsg} {
1134 ${Log}::error $errorMsg 1213 ${Log}::error $errorMsg
1135 exit 1 1214 exit 1
1136 } 1215 }
1137 1216
1138 # initialize queue and state buffer from the watchlist 1217 # apply filters specified on the command line to watchlist items
1139 dict set Statistics "start_time" [clock milliseconds] 1218 set currentTime [clock milliseconds]
1140 dict for {name watchlistItem} $Watchlist { 1219 dict for {name watchlistItem} $Watchlist {
1141 # apply filters specified on the command line to watchlist items 1220 if {(([llength [dict get $Config "item_filter"]] > 0) &&
1142 if {([llength [dict get $Config "item_filter"]] > 0) && 1221 ($name ni [dict get $Config "item_filter"])) ||
1143 ($name ni [dict get $Config "item_filter"])} { 1222 ([dict get $Config "error_filter"] &&
1144 continue
1145 }
1146
1147 if {[dict get $Config "error_filter"] &&
1148 [dict exists $State $name "errors"] && 1223 [dict exists $State $name "errors"] &&
1149 ([llength [dict get $State $name "errors"]] == 0)} { 1224 ([llength [dict get $State $name "errors"]] == 0)) ||
1150 continue 1225 ([dict exists $State $name "timestamp"] &&
1151 } 1226 ([dict get $State $name "timestamp"] > $currentTime -
1152 1227 [dict get $Config "timestamp_filter"]))} {
1153 if {[dict exists $State $name "timestamp"] && 1228 dict unset Watchlist $name
1154 ([dict get $State $name "timestamp"] > 1229 }
1155 [dict get $Statistics "start_time"] - 1230 }
1156 [dict get $Config "timestamp_filter"])} { 1231
1157 continue 1232 set ExitStatus [relmon::crawler::crawl $Config $Watchlist $Log \
1158 } 1233 [namespace code OnItemFinished]]
1159
1160 dict lappend Queue [::relmon::common::urlGetHost \
1161 [dict get $watchlistItem "base_url"]] \
1162 [dict create \
1163 "name" $name \
1164 "url" [dict get $watchlistItem "base_url"] \
1165 "pattern_index" 0 \
1166 "content_type" "" \
1167 "num_redirects" 0 \
1168 "num_retries" 0]
1169 dict incr Statistics "items"
1170 dict set StateBuffer $name [dict create "versions" [dict create] \
1171 "errors" [list]]
1172 }
1173
1174 # configure http and tls
1175 http::register https 443 [list tls::socket \
1176 -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
1177 -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
1178 http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
1179 Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
1180
1181 # handle errors while in the event loop
1182 interp bgerror {} [namespace code OnError]
1183
1184 # enter the main loop
1185 after idle [namespace code ManageTransfers]
1186 vwait [namespace which -variable ExitStatus]
1187
1188 dict set Statistics "end_time" [clock milliseconds]
1189
1190 # display statistics
1191 ${Log}::notice "items checked: [dict get $Statistics "items"]"
1192 ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
1193 [dict get $Statistics "start_time"]) / 1000}]s"
1194 1234
1195 # serialize the new state 1235 # serialize the new state
1196 set JsonStateItems {} 1236 set JsonStateItems {}
1197 dict for {item data} $State { 1237 dict for {item data} $State {
1198 set versions {} 1238 set versions {}
1675 } 1715 }
1676 exit 0 1716 exit 0
1677 } 1717 }
1678 1718
1679 1719
1720 namespace eval ::relmon::discover {
1721 # commandline option help text
1722 variable usage "usage: relmon discover \[-d\] \[-c max_connections\]\
1723 \[-C ca_dir\] \[-D host_delay\]\n\
1724 \ \[-H max_connections_per_host\]\
1725 \[-r retries\]\n\
1726 \ \[-t before_seconds\]\
1727 \[-T timeout_seconds\] base_url\n\
1728 \ \[pattern...\]"
1729
1730 # configuration options
1731 variable Config [dict create \
1732 "log_file" "" \
1733 "log_level" "info" \
1734 "connection_limit" 16 \
1735 "host_connection_limit" 4 \
1736 "transfer_time_limit" 60000 \
1737 "retry_limit" 3 \
1738 "host_delay" 500 \
1739 "host_delays" [dict create] \
1740 "ca_dir" ""]
1741
1742 # transfer statistics
1743 variable Statistics [dict create \
1744 "start_time" [clock milliseconds] \
1745 "end_time" 0 \
1746 "requests" 0 \
1747 "items" 0]
1748
1749 # watchlist
1750 variable Queue [dict create]
1751
1752 # logger handle
1753 variable Log
1754 }
1755
1756 proc ::relmon::discover::OnItemFinished {name stateItem} {
1757 variable Log
1758
1759 dict for {url urls} [dict get $stateItem "urls"] {
1760 dict for {linkUrl matched} $urls {
1761 if {$matched} {
1762 puts "\"$url\": found matching URL \"$linkUrl\""
1763 } else {
1764 puts "\"$url\": found non-matching URL \"$linkUrl\""
1765 }
1766 }
1767 }
1768 }
1769
1770 proc ::relmon::discover::main {args} {
1771 variable Config
1772 variable usage
1773 variable Log
1774 variable Queue
1775
1776 # parse commandline
1777 while {[set GetoptRet [cmdline::getopt args \
1778 {c.arg C.arg d D.arg h H.arg r.arg t.arg T.arg ?} \
1779 OptArg OptVal]] == 1} {
1780 switch -glob -- $OptArg {
1781 {c} {
1782 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
1783 puts stderr "invalid value passed to \"-$OptArg\""
1784 exit 1
1785 }
1786 dict set Config "host_connection_limit" $OptVal
1787 }
1788 {C} {
1789 if {![file isdirectory $OptVal]} {
1790 puts stderr "directory \"$OptVal\" is not a directory"
1791 exit 1
1792 } elseif {![file readable $OptVal] ||
1793 ![file executable $OptVal]} {
1794 puts stderr "directory \"$OptVal\" is not readable"
1795 exit 1
1796 }
1797 dict set Config "ca_dir" $OptVal
1798 }
1799 {d} {
1800 dict set Config "log_level" "debug"
1801 }
1802 {D} {
1803 if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
1804 puts stderr "invalid value passed to \"-$OptArg\""
1805 exit 1
1806 }
1807 dict set Config "host_delay" $OptVal
1808 }
1809 {[h?]} {
1810 puts stderr $usage
1811 exit 0
1812 }
1813 {H} {
1814 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
1815 puts stderr "invalid value passed to \"-$OptArg\""
1816 exit 1
1817 }
1818 dict set Config "connection_limit" $OptVal
1819 }
1820 {r} {
1821 if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
1822 puts stderr "invalid value passed to \"-$OptArg\""
1823 exit 1
1824 }
1825 dict set Config "retry_limit" $OptVal
1826 }
1827 {t} {
1828 if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
1829 puts stderr "invalid value passed to \"-$OptArg\""
1830 exit 1
1831 }
1832 dict set Config "timestamp_filter" [expr {$OptVal * 1000}]
1833 }
1834 {T} {
1835 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
1836 puts stderr "invalid value passed to \"-$OptArg\""
1837 exit 1
1838 }
1839 dict set Config "transfer_time_limit" [expr {$OptVal * 1000}]
1840 }
1841 }
1842 }
1843 set argc [llength $args]
1844 if {$GetoptRet == -1} {
1845 puts stderr "unknown command line option \"-$OptArg\""
1846 puts stderr $usage
1847 exit 1
1848 }
1849 if {$argc < 1} {
1850 puts stderr $usage
1851 exit 1
1852 }
1853
1854 set patterns [lassign $args baseUrl]
1855
1856 # validate URL
1857 if {![::relmon::common::isUrlValid $baseUrl]} {
1858 puts stderr "invalid base URL"
1859 exit 1
1860 }
1861
1862 # process patterns
1863 set processedPatterns {}
1864 set reInfo {}
1865 set patternIndex 0
1866 foreach pattern $patterns {
1867 incr patternIndex
1868
1869 # make trailing slashes optional
1870 if {($patternIndex != [llength $patterns]) &&
1871 ([string index $pattern end] eq "/")} {
1872 append pattern {?}
1873 }
1874
1875 # ensure patterns are anchored to the end of the line
1876 if {[string index $pattern end] ne "$"} {
1877 append pattern {$}
1878 }
1879
1880 # actually validate the regular expression
1881 try {
1882 set reInfo [regexp -about -- $pattern ""]
1883 } on error {errorMsg} {
1884 puts stderr $errorMsg
1885 exit 1
1886 }
1887 lappend processedPatterns $pattern
1888 }
1889 # add a dummy pattern "()" if the last pattern is does not contain a
1890 # capturing group, this will match every link and capture an empty string
1891 if {[lindex $reInfo 0] < 1} {
1892 lappend processedPatterns {()}
1893 }
1894
1895 # construct a watchlist from the command line arguments
1896 set watchlist [dict create "new project" [dict create "base_url" $baseUrl \
1897 "patterns" $processedPatterns]]
1898
1899 set Log [logger::init global]
1900 if {[dict get $Config "log_level"] eq "debug"} {
1901 set logFormat $relmon::LogFormatDebug
1902 } else {
1903 set logFormat $relmon::LogFormatDefault
1904 }
1905 logger::utils::applyAppender -appender fileAppend -appenderArgs \
1906 [list -outputChannel stderr -conversionPattern $logFormat] \
1907 -serviceCmd $Log
1908
1909 # set default logging level
1910 ${Log}::setlevel [dict get $Config "log_level"]
1911
1912 ${Log}::notice "relmon.tcl starting up"
1913
1914 set ExitStatus [relmon::crawler::crawl $Config $watchlist $Log \
1915 [namespace code OnItemFinished]]
1916
1917 exit $ExitStatus
1918 }
1919
1920
1680 proc ::relmon::main {args} { 1921 proc ::relmon::main {args} {
1681 variable usage 1922 variable usage
1682 set subArgs [lassign $args subCommand] 1923 set subArgs [lassign $args subCommand]
1683 1924
1684 # generate list of subcommands 1925 # generate list of subcommands