Mercurial > projects > relmon
comparison relmon.tcl.in @ 5:86a0c5d11f05 version-1
Add discover subcommand
The discover subcommand assists with the creation of watchlist entries and
allows to reproduce each step of an update operation for a watchlist entry.
author | Guido Berhoerster <guido+relmon@berhoerster.name> |
---|---|
date | Sun, 26 Oct 2014 21:36:05 +0100 |
parents | 6d87242c537e |
children |
comparison
equal
deleted
inserted
replaced
4:f28486666a4f | 5:86a0c5d11f05 |
---|---|
40 | 40 |
41 | 41 |
42 namespace eval ::relmon { | 42 namespace eval ::relmon { |
43 # version | 43 # version |
44 variable VERSION @VERSION@ | 44 variable VERSION @VERSION@ |
45 | |
46 # default log format | |
47 variable LogFormatDefault {%d \[%p\] %m} | |
48 | |
49 # debugging log format | |
50 variable LogFormatDebug {%d \[%p\] \[%M\] %m} | |
45 } | 51 } |
46 | 52 |
47 | 53 |
48 namespace eval ::relmon::common { | 54 namespace eval ::relmon::common { |
49 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \ | 55 namespace export cmpVersions isUrlValid urlGetHost normalizeHttpHeaders \ |
175 } | 181 } |
176 | 182 |
177 return $state | 183 return $state |
178 } | 184 } |
179 | 185 |
180 | 186 proc ::relmon::common::parseWatchlist {watchlistFilename} { |
181 namespace eval ::relmon::update { | 187 set Watchlist [dict create] |
182 # commandline option help text | |
183 variable usage "usage: relmon update \[-dev\] \[-c max_connections\] \[-C\ | |
184 ca_dir\] \[-D delay\]\n\ | |
185 \ \[-H max_host_connections\] \[-i\ | |
186 item\[,...\]\] \[-l logfile\]\n\ | |
187 \ \[-r retries\] \[-t min_time\] watchlist\ | |
188 statefile" | |
189 | |
190 # configuration options | |
191 variable Config [dict create \ | |
192 "log_file" "" \ | |
193 "log_level" "notice" \ | |
194 "history_limit" 20 \ | |
195 "connection_limit" 16 \ | |
196 "host_connection_limit" 4 \ | |
197 "transfer_time_limit" 60000 \ | |
198 "retry_limit" 3 \ | |
199 "host_delay" 0 \ | |
200 "timestamp_filter" 0 \ | |
201 "error_filter" 0 \ | |
202 "item_filter" {} \ | |
203 "ca_dir" "" \ | |
204 "state_file" "" \ | |
205 "watchlist_file" ""] | |
206 | |
207 # exit status | |
208 variable ExitStatus | |
209 | |
210 # transfer statistics | |
211 variable Statistics [dict create \ | |
212 "start_time" 0 \ | |
213 "end_time" 0 \ | |
214 "requests" 0 \ | |
215 "items" 0] | |
216 | |
217 # watchlist | |
218 variable Watchlist | |
219 | |
220 # ID of a delayed run of ManageTransfers | |
221 variable ManageTransfersId "" | |
222 | |
223 # queue of pending transfers | |
224 variable Queue | |
225 | |
226 # number of active connections per host | |
227 variable HostConnections | |
228 | |
229 # delays before opening a new connection to a host | |
230 variable HostDelays | |
231 | |
232 # active transfers | |
233 variable ActiveTransfers | |
234 | |
235 # buffer for tracking the state of unfinished items | |
236 variable StateBuffer | |
237 | |
238 # buffer needed by htmlparse::parse for constructing the preprocessed HTML | |
239 # document | |
240 variable PreprocessedHtmlBuffer | |
241 | |
242 # logger handle | |
243 variable Log | |
244 | |
245 # logfile handle | |
246 variable Lf | |
247 } | |
248 | |
249 proc ::relmon::update::OnError {message returnOptions} { | |
250 # internal error, abort | |
251 puts stderr [dict get $returnOptions "-errorinfo"] | |
252 | |
253 exit 1 | |
254 } | |
255 | |
256 proc ::relmon::update::CleanupBeforeExit {commandString operation} { | |
257 variable Lf | |
258 | |
259 # close logfile | |
260 if {($Lf ne "") && ($Lf ni {stdin stderr})} { | |
261 close $Lf | |
262 set Lf "" | |
263 } | |
264 | |
265 return | |
266 } | |
267 | |
268 proc ::relmon::update::ParseWatchlist {watchlistFilename} { | |
269 variable Watchlist | |
270 | |
271 set lineno 0 | 188 set lineno 0 |
272 set f [open $watchlistFilename "r"] | 189 set f [open $watchlistFilename "r"] |
273 try { | 190 try { |
274 while {[chan gets $f line] != -1} { | 191 while {[chan gets $f line] != -1} { |
275 set fields [textutil::split::splitx [string trim $line] {[\t ]+}] | 192 set fields [textutil::split::splitx [string trim $line] {[\t ]+}] |
336 } | 253 } |
337 } finally { | 254 } finally { |
338 close $f | 255 close $f |
339 } | 256 } |
340 | 257 |
341 return | 258 return $Watchlist |
342 } | 259 } |
343 | 260 |
344 proc ::relmon::update::ProcessHtmlElement {tag slash param textBehindTheTag} { | 261 |
262 namespace eval ::relmon::crawler { | |
263 # status returned by crawl | |
264 variable Status | |
265 | |
266 # configuration | |
267 variable Config | |
268 | |
269 # transfer statistics | |
270 variable Statistics [dict create \ | |
271 "start_time" 0 \ | |
272 "end_time" 0 \ | |
273 "requests" 0 \ | |
274 "items" 0] | |
275 | |
276 # ID of a delayed run of ManageTransfers | |
277 variable ManageTransfersId "" | |
278 | |
279 # queue of pending transfers | |
280 variable Queue | |
281 | |
282 # number of active connections per host | |
283 variable HostConnections [dict create] | |
284 | |
285 # delays before opening a new connection to a host | |
286 variable HostDelays [dict create] | |
287 | |
288 # active transfers | |
289 variable ActiveTransfers [dict create] | |
290 | |
291 # number of running or queued transfers of watchlist items | |
292 variable RemainingTransfers [dict create] | |
293 | |
294 # buffer for tracking the state of unfinished items | |
295 variable StateBuffer [dict create] | |
296 | |
297 # buffer needed by htmlparse::parse for constructing the preprocessed HTML | |
298 # document | |
299 variable PreprocessedHtmlBuffer | |
300 | |
301 # callback when all transfers for an item are finished | |
302 variable OnItemFinishedCmd | |
303 | |
304 # logger handle | |
305 variable Log | |
306 | |
307 namespace export crawl | |
308 } | |
309 | |
310 proc ::relmon::crawler::OnError {message returnOptions} { | |
311 # internal error, abort | |
312 puts stderr [dict get $returnOptions "-errorinfo"] | |
313 | |
314 exit 1 | |
315 } | |
316 | |
317 proc ::relmon::crawler::ProcessHtmlElement {tag slash param textBehindTheTag} { | |
345 variable PreprocessedHtmlBuffer | 318 variable PreprocessedHtmlBuffer |
346 | 319 |
347 # copy every "<a>" element into PreprocessedHtmlBuffer | 320 # copy every "<a>" element into PreprocessedHtmlBuffer |
348 if {($slash eq "") && ([string tolower $tag] eq "a")} { | 321 if {($slash eq "") && ([string tolower $tag] eq "a")} { |
349 append PreprocessedHtmlBuffer "<$tag $param></$tag>" | 322 append PreprocessedHtmlBuffer "<$tag $param></$tag>" |
350 } | 323 } |
351 | 324 |
352 return | 325 return |
353 } | 326 } |
354 | 327 |
355 proc ::relmon::update::PreprocessHtml {bodyDataName} { | 328 proc ::relmon::crawler::PreprocessHtml {bodyDataName} { |
356 upvar 1 $bodyDataName bodyData | 329 upvar 1 $bodyDataName bodyData |
357 variable PreprocessedHtmlBuffer | 330 variable PreprocessedHtmlBuffer |
358 | 331 |
359 # preprocess the document with htmlparse by constructing a new document | 332 # preprocess the document with htmlparse by constructing a new document |
360 # consisting only of found "<a>" elements which then can be fed into tdom | 333 # consisting only of found "<a>" elements which then can be fed into tdom |
365 set PreprocessedHtmlBuffer "<html><body>" | 338 set PreprocessedHtmlBuffer "<html><body>" |
366 htmlparse::parse -cmd [namespace code ProcessHtmlElement] $bodyData | 339 htmlparse::parse -cmd [namespace code ProcessHtmlElement] $bodyData |
367 append PreprocessedHtmlBuffer "</body></html>" | 340 append PreprocessedHtmlBuffer "</body></html>" |
368 } | 341 } |
369 | 342 |
370 proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl | 343 proc ::relmon::crawler::ExtractUrls {bodyDataName contentType baseUrl |
371 rePattern} { | 344 rePattern} { |
372 upvar 1 $bodyDataName bodyData | 345 upvar 1 $bodyDataName bodyData |
373 set extractedUrls {} | 346 set extractedUrls {} |
374 set resultUrls [dict create] | 347 set resultUrls [dict create] |
375 # extract all URLs or URL fragments | 348 # extract all URLs or URL fragments |
460 } | 433 } |
461 | 434 |
462 return $resultUrls | 435 return $resultUrls |
463 } | 436 } |
464 | 437 |
465 proc ::relmon::update::StateItemAppendError {name logMsg} { | 438 proc ::relmon::crawler::StateItemAppendError {name logMsg} { |
466 variable StateBuffer | 439 variable StateBuffer |
467 | 440 |
468 dict update StateBuffer $name stateItem { | 441 dict update StateBuffer $name stateItem { |
469 dict lappend stateItem "errors" $logMsg | 442 dict lappend stateItem "errors" $logMsg |
470 } | 443 } |
471 | 444 |
472 return | 445 return |
473 } | 446 } |
474 | 447 |
475 proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} { | 448 proc ::relmon::crawler::HandleSuccessfulTransfer {item httpBodyName} { |
476 upvar 1 $httpBodyName httpBody | 449 upvar 1 $httpBodyName httpBody |
477 variable Log | 450 variable Log |
478 variable StateBuffer | 451 variable StateBuffer |
479 variable Queue | 452 variable Queue |
453 variable RemainingTransfers | |
480 variable Watchlist | 454 variable Watchlist |
481 | 455 |
482 set name [dict get $item "name"] | 456 set name [dict get $item "name"] |
483 set url [dict get $item "url"] | 457 set url [dict get $item "url"] |
484 set patternIndex [dict get $item "pattern_index"] | 458 set patternIndex [dict get $item "pattern_index"] |
514 | 488 |
515 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ | 489 dict lappend Queue [::relmon::common::urlGetHost $newUrl] \ |
516 [dict create "name" $name "url" $newUrl \ | 490 [dict create "name" $name "url" $newUrl \ |
517 "pattern_index" [expr {$patternIndex + 1}] \ | 491 "pattern_index" [expr {$patternIndex + 1}] \ |
518 "content_type" "" "num_redirects" 0 "num_retries" 0] | 492 "content_type" "" "num_redirects" 0 "num_retries" 0] |
493 dict incr RemainingTransfers $name | |
519 } else { | 494 } else { |
520 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" | 495 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" |
521 } | 496 } |
522 } | 497 } |
523 } else { | 498 } else { |
524 # otherwise this branch has finished, try to extract the versions and | 499 # otherwise this branch has finished, try to extract the versions and |
525 # store them in the buffer | 500 # store them in the buffer |
526 dict for {finalUrl matched} $urls { | 501 dict for {finalUrl matched} $urls { |
502 dict set StateBuffer $name "urls" $url $urls | |
527 if {$matched} { | 503 if {$matched} { |
528 regexp -line -- $pattern $finalUrl -> version | 504 regexp -line -- $pattern $finalUrl -> version |
529 if {$version ne ""} { | 505 if {$version ne ""} { |
530 ${Log}::debug "\"$name\": \"$url\": extracted version\ | 506 ${Log}::debug "\"$name\": \"$url\": extracted version\ |
531 \"$version\" from \"$finalUrl\" found on\ | 507 \"$version\" from \"$finalUrl\" found on\ |
542 } | 518 } |
543 | 519 |
544 return | 520 return |
545 } | 521 } |
546 | 522 |
547 proc ::relmon::update::HandleRedirect {item httpCode httpHeaders} { | 523 proc ::relmon::crawler::HandleRedirect {item httpCode httpHeaders} { |
548 variable Log | 524 variable Log |
549 variable Queue | 525 variable Queue |
526 variable RemainingTransfers | |
550 | 527 |
551 set name [dict get $item "name"] | 528 set name [dict get $item "name"] |
552 set url [dict get $item "url"] | 529 set url [dict get $item "url"] |
553 | 530 |
554 if {![dict exists $httpHeaders "Location"]} { | 531 if {![dict exists $httpHeaders "Location"]} { |
586 | 563 |
587 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ | 564 dict lappend Queue [::relmon::common::urlGetHost $redirectUrl] \ |
588 [dict replace $item "url" $redirectUrl "content_type" "" \ | 565 [dict replace $item "url" $redirectUrl "content_type" "" \ |
589 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ | 566 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ |
590 "num_retries" 0] | 567 "num_retries" 0] |
568 dict incr RemainingTransfers $name | |
591 } else { | 569 } else { |
592 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ | 570 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ |
593 redirects" | 571 redirects" |
594 ${Log}::warn $warningMsg | 572 ${Log}::warn $warningMsg |
595 StateItemAppendError $name $warningMsg | 573 StateItemAppendError $name $warningMsg |
596 } | 574 } |
597 | 575 |
598 return | 576 return |
599 } | 577 } |
600 | 578 |
601 proc ::relmon::update::HandleProtocolError {item httpCode} { | 579 proc ::relmon::crawler::HandleProtocolError {item httpCode} { |
602 variable Log | 580 variable Log |
603 set name [dict get $item "name"] | 581 set name [dict get $item "name"] |
604 set url [dict get $item "url"] | 582 set url [dict get $item "url"] |
605 set warningMsg "\"$name\": \"$url\": transfer failed: $httpCode" | 583 set warningMsg "\"$name\": \"$url\": transfer failed: $httpCode" |
606 ${Log}::warn $warningMsg | 584 ${Log}::warn $warningMsg |
607 StateItemAppendError $name $warningMsg | 585 StateItemAppendError $name $warningMsg |
608 return | 586 return |
609 } | 587 } |
610 | 588 |
611 proc ::relmon::update::HandleTimeoutReset {item} { | 589 proc ::relmon::crawler::HandleTimeoutReset {item} { |
612 variable Log | 590 variable Log |
613 variable Config | 591 variable Config |
614 variable Queue | 592 variable Queue |
593 variable RemainingTransfers | |
615 set name [dict get $item "name"] | 594 set name [dict get $item "name"] |
616 set url [dict get $item "url"] | 595 set url [dict get $item "url"] |
617 | 596 |
618 # retry by re-queuing the target URL until reaching the limit | 597 # retry by re-queuing the target URL until reaching the limit |
619 if {[dict get $item "num_retries"] < [dict get $Config "retry_limit"]} { | 598 if {[dict get $item "num_retries"] < [dict get $Config "retry_limit"]} { |
620 ${Log}::warn "\"$name\": \"$url\": connection timed out or was reset,\ | 599 ${Log}::warn "\"$name\": \"$url\": connection timed out or was reset,\ |
621 retrying" | 600 retrying" |
622 dict lappend Queue [::relmon::common::urlGetHost $url] \ | 601 dict lappend Queue [::relmon::common::urlGetHost $url] \ |
623 [dict replace $item \ | 602 [dict replace $item \ |
624 "num_retries" [expr {[dict get $item "num_retries"] + 1}]] | 603 "num_retries" [expr {[dict get $item "num_retries"] + 1}]] |
604 dict incr RemainingTransfers $name | |
625 } else { | 605 } else { |
626 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ | 606 set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ |
627 retries" | 607 retries" |
628 ${Log}::warn $warningMsg | 608 ${Log}::warn $warningMsg |
629 StateItemAppendError $name $warningMsg | 609 StateItemAppendError $name $warningMsg |
630 } | 610 } |
631 | 611 |
632 return | 612 return |
633 } | 613 } |
634 | 614 |
635 proc ::relmon::update::HandleConnectionError {item errorMsg} { | 615 proc ::relmon::crawler::HandleConnectionError {item errorMsg} { |
636 variable Log | 616 variable Log |
637 set name [dict get $item "name"] | 617 set name [dict get $item "name"] |
638 set url [dict get $item "url"] | 618 set url [dict get $item "url"] |
639 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" | 619 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" |
640 ${Log}::warn $warningMsg | 620 ${Log}::warn $warningMsg |
641 StateItemAppendError $name $warningMsg | 621 StateItemAppendError $name $warningMsg |
642 return | 622 return |
643 } | 623 } |
644 | 624 |
645 proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} { | 625 proc ::relmon::crawler::TransferCallbackWrapper {callbackCmd args} { |
646 # ensure that exceptions get raised, by default http catches all errors and | 626 # ensure that exceptions get raised, by default http catches all errors and |
647 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 | 627 # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 |
648 if {[catch {eval $callbackCmd $args} -> errorOptions]} { | 628 if {[catch {eval $callbackCmd $args} -> errorOptions]} { |
649 OnError [dict get $errorOptions "-errorinfo"] $errorOptions | 629 OnError [dict get $errorOptions "-errorinfo"] $errorOptions |
650 } | 630 } |
651 return | 631 return |
652 } | 632 } |
653 | 633 |
654 proc ::relmon::update::ManageTransfers {} { | 634 proc ::relmon::crawler::ManageTransfers {} { |
655 variable Config | 635 variable Config |
656 variable ManageTransfersId | 636 variable ManageTransfersId |
657 variable Queue | 637 variable Queue |
658 variable HostConnections | 638 variable HostConnections |
659 variable HostDelays | 639 variable HostDelays |
660 variable ActiveTransfers | 640 variable ActiveTransfers |
661 variable ExitStatus | 641 variable RemainingTransfers |
642 variable Status | |
662 variable Log | 643 variable Log |
663 | 644 |
664 after cancel $ManageTransfersId | 645 after cancel $ManageTransfersId |
665 | 646 |
666 # try to initiate new transfers | 647 # try to initiate new transfers |
724 } on error {errorMsg} { | 705 } on error {errorMsg} { |
725 # an error occured during socket setup, e.g. a DNS lookup failure | 706 # an error occured during socket setup, e.g. a DNS lookup failure |
726 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" | 707 set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" |
727 ${Log}::warn $warningMsg | 708 ${Log}::warn $warningMsg |
728 StateItemAppendError $name $warningMsg | 709 StateItemAppendError $name $warningMsg |
710 dict incr RemainingTransfers $name -1 | |
729 } | 711 } |
730 } | 712 } |
731 | 713 |
732 # terminate the event loop if there are no remaining transfers | 714 # terminate the event loop if there are no remaining transfers |
733 if {([dict size $ActiveTransfers]) == 0 && ([dict size $Queue] == 0)} { | 715 if {[::tcl::mathop::+ {*}[dict values $RemainingTransfers]] == 0} { |
734 set ExitStatus 0 | 716 if {([dict size $ActiveTransfers]) > 0 || ([dict size $Queue] > 0)} { |
717 ${Log}::error "inconsistent internal state" | |
718 set Status 1 | |
719 } | |
720 set Status 0 | |
735 return | 721 return |
736 } | 722 } |
737 | 723 |
738 # due to per-host connection limits and per-host delays the maximum number | 724 # due to per-host connection limits and per-host delays the maximum number |
739 # of connections may not be reached although there are still items in the | 725 # of connections may not be reached although there are still items in the |
763 } | 749 } |
764 | 750 |
765 return | 751 return |
766 } | 752 } |
767 | 753 |
768 proc ::relmon::update::OnTransferProgress {token total current} { | 754 proc ::relmon::crawler::OnTransferProgress {token total current} { |
769 upvar #0 $token httpState | 755 upvar #0 $token httpState |
770 variable ActiveTransfers | 756 variable ActiveTransfers |
771 variable Log | 757 variable Log |
772 | 758 |
773 # try to determine content type and abort transfer if content type is not | 759 # try to determine content type and abort transfer if content type is not |
790 } | 776 } |
791 } | 777 } |
792 } | 778 } |
793 } | 779 } |
794 | 780 |
795 proc ::relmon::update::OnTransferFinished {token} { | 781 proc ::relmon::crawler::OnTransferFinished {token} { |
796 upvar #0 $token httpState | 782 upvar #0 $token httpState |
797 variable Config | 783 variable Config |
798 variable HostConnections | 784 variable HostConnections |
799 variable Queue | |
800 variable ActiveTransfers | 785 variable ActiveTransfers |
786 variable RemainingTransfers | |
801 variable Statistics | 787 variable Statistics |
802 variable StateBuffer | 788 variable StateBuffer |
803 variable State | 789 variable State |
804 variable Log | 790 variable Log |
791 variable OnItemFinishedCmd | |
805 | 792 |
806 set item [dict get $ActiveTransfers $token] | 793 set item [dict get $ActiveTransfers $token] |
807 set name [dict get $item "name"] | 794 set name [dict get $item "name"] |
808 set host [relmon::common::urlGetHost [dict get $item "url"]] | 795 set host [relmon::common::urlGetHost [dict get $item "url"]] |
809 | 796 |
810 # update list of per-host connections, and number of remaining transfers | 797 # update list of per-host connections, and number of remaining transfers |
811 # for this item | 798 # for this item |
812 dict unset ActiveTransfers $token | 799 dict unset ActiveTransfers $token |
813 dict incr HostConnections $host -1 | 800 dict incr HostConnections $host -1 |
801 dict incr RemainingTransfers $name -1 | |
814 | 802 |
815 switch -- $httpState(status) { | 803 switch -- $httpState(status) { |
816 {ok} { | 804 {ok} { |
817 # normalize headers | 805 # normalize headers |
818 set httpHeaders [relmon::common::normalizeHttpHeaders \ | 806 set httpHeaders [relmon::common::normalizeHttpHeaders \ |
852 HandleConnectionError $item [lindex $httpState(error) 0] | 840 HandleConnectionError $item [lindex $httpState(error) 0] |
853 } | 841 } |
854 } | 842 } |
855 | 843 |
856 # check if all transfers of this item are finished | 844 # check if all transfers of this item are finished |
857 set itemFinished 1 | 845 if {[dict get $RemainingTransfers $name] == 0} { |
858 dict for {queueHost queueItems} $Queue { | 846 eval $OnItemFinishedCmd [list $name [dict get $StateBuffer $name]] |
859 foreach queueItem $queueItems { | 847 |
860 if {[dict get $queueItem "name"] eq $name} { | |
861 set itemFinished 0 | |
862 } | |
863 } | |
864 } | |
865 dict for {activeToken activeItem} $ActiveTransfers { | |
866 if {[dict get $activeItem "name"] eq $name} { | |
867 set itemFinished 0 | |
868 } | |
869 } | |
870 if {$itemFinished} { | |
871 set timestamp [clock milliseconds] | |
872 | |
873 # create httpState item if it does not exist yet | |
874 if {![dict exists $State $name]} { | |
875 dict set State $name [dict create "versions" [dict create] \ | |
876 "history" [list] "timestamp" 0 "errors" [list]] | |
877 } | |
878 | |
879 # if there are no versions, log an error message since something must | |
880 # be wrong | |
881 if {[llength [dict get $StateBuffer $name "versions"]] == 0} { | |
882 set warningMsg "\"$name\": no versions found" | |
883 ${Log}::warn $warningMsg | |
884 StateItemAppendError $name $warningMsg | |
885 } | |
886 | |
887 # update httpState item | |
888 dict set State $name "errors" [dict get $StateBuffer $name "errors"] | |
889 dict set State $name "timestamp" $timestamp | |
890 if {[llength [dict get $StateBuffer $name "errors"]] == 0} { | |
891 # expire old history entries | |
892 set history [lrange [dict get $State $name "history"] \ | |
893 [expr {[llength [dict get $State $name "history"]] - | |
894 [dict get $Config "history_limit"] + 1}] end] | |
895 | |
896 # add currently latest available version to history if it is either | |
897 # newer than the previous one or if the previous one is no longer | |
898 # available (e.g. if it has been removed or the watchlist pattern | |
899 # has been changed) | |
900 set prevLatestVersion [lindex $history end 0] | |
901 set curLatestVersion [lindex \ | |
902 [lsort -command ::relmon::common::cmpVersion \ | |
903 [dict keys [dict get $StateBuffer $name "versions"]]] end] | |
904 if {([::relmon::common::cmpVersion $curLatestVersion \ | |
905 $prevLatestVersion] > 0) || | |
906 ![dict exists $StateBuffer $name "versions" \ | |
907 $prevLatestVersion]} { | |
908 lappend history [list $curLatestVersion $timestamp] | |
909 dict set State $name "history" $history | |
910 } | |
911 dict set State $name "versions" [dict get $StateBuffer $name \ | |
912 "versions"] | |
913 } | |
914 dict unset StateBuffer $name | 848 dict unset StateBuffer $name |
915 | 849 |
916 ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \ | 850 ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \ |
917 $Statistics "items"] items left" | 851 $Statistics "items"] items left" |
918 } | 852 } |
923 | 857 |
924 return | 858 return |
925 } | 859 } |
926 | 860 |
927 # control certificate verification and log errors during TLS handshake | 861 # control certificate verification and log errors during TLS handshake |
928 proc ::relmon::update::OnTlsHandshake {type args} { | 862 proc ::relmon::crawler::OnTlsHandshake {type args} { |
929 variable Config | 863 variable Config |
930 variable Log | 864 variable Log |
931 | 865 |
932 switch -- ${type} { | 866 switch -- ${type} { |
933 {error} { | 867 {error} { |
955 } | 889 } |
956 } | 890 } |
957 } | 891 } |
958 } | 892 } |
959 | 893 |
894 proc ::relmon::crawler::crawl {config watchlist log onItemFinishedCmd} { | |
895 variable Config $config | |
896 variable Watchlist $watchlist | |
897 variable Queue | |
898 variable Statistics | |
899 variable Log $log | |
900 variable OnItemFinishedCmd $onItemFinishedCmd | |
901 variable Status | |
902 variable RemainingTransfers | |
903 variable StateBuffer | |
904 | |
905 # initialize queue and state buffer from the watchlist | |
906 set Queue [dict create] | |
907 dict for {name watchlistItem} $Watchlist { | |
908 dict lappend Queue [::relmon::common::urlGetHost \ | |
909 [dict get $watchlistItem "base_url"]] \ | |
910 [dict create \ | |
911 "name" $name \ | |
912 "url" [dict get $watchlistItem "base_url"] \ | |
913 "pattern_index" 0 \ | |
914 "content_type" "" \ | |
915 "num_redirects" 0 \ | |
916 "num_retries" 0] | |
917 dict set RemainingTransfers $name 1 | |
918 dict incr Statistics "items" | |
919 dict set StateBuffer $name [dict create "versions" [dict create] \ | |
920 "errors" [list] "urls" [dict create]] | |
921 } | |
922 | |
923 # configure http and tls | |
924 http::register https 443 [list tls::socket \ | |
925 -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \ | |
926 -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1] | |
927 http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\ | |
928 Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION" | |
929 | |
930 # handle errors while in the event loop | |
931 interp bgerror {} [namespace code OnError] | |
932 | |
933 dict set Statistics "start_time" [clock milliseconds] | |
934 | |
935 # enter the main loop | |
936 after idle [namespace code ManageTransfers] | |
937 vwait [namespace which -variable Status] | |
938 | |
939 # display statistics | |
940 dict set Statistics "end_time" [clock milliseconds] | |
941 ${Log}::notice "items checked: [dict get $Statistics "items"]" | |
942 ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] - | |
943 [dict get $Statistics "start_time"]) / 1000}]s" | |
944 | |
945 return $Status | |
946 } | |
947 | |
948 | |
949 namespace eval ::relmon::update { | |
950 # commandline option help text | |
951 variable usage "usage: relmon update \[-dev\] \[-c max_connections\]\ | |
952 \[-C ca_dir\] \[-D host_delay\]\n\ | |
953 \ \[-H max_connections_per_host\]\ | |
954 \[-i item_list\]\n\ | |
955 \ \[-l logfile\] \[-r retries\]\ | |
956 \[-t before_seconds\]\n\ | |
957 \ \[-T timeout_seconds\] watchlist statefile" | |
958 | |
959 # configuration options | |
960 variable Config [dict create \ | |
961 "log_file" "" \ | |
962 "log_level" "notice" \ | |
963 "history_limit" 20 \ | |
964 "connection_limit" 16 \ | |
965 "host_connection_limit" 4 \ | |
966 "transfer_time_limit" 60000 \ | |
967 "retry_limit" 3 \ | |
968 "host_delay" 500 \ | |
969 "host_delays" [dict create] \ | |
970 "timestamp_filter" 0 \ | |
971 "error_filter" 0 \ | |
972 "item_filter" {} \ | |
973 "ca_dir" "" \ | |
974 "state_file" "" \ | |
975 "watchlist_file" ""] | |
976 | |
977 # watchlist | |
978 variable Watchlist | |
979 | |
980 # logger handle | |
981 variable Log | |
982 | |
983 # logfile handle | |
984 variable Lf | |
985 } | |
986 | |
987 proc ::relmon::update::OnItemFinished {name stateItem} { | |
988 variable State | |
989 variable Log | |
990 variable Config | |
991 | |
992 set timestamp [clock milliseconds] | |
993 | |
994 # create httpState item if it does not exist yet | |
995 if {![dict exists $State $name]} { | |
996 dict set State $name [dict create "versions" [dict create] \ | |
997 "history" [list] "timestamp" 0 "errors" [list]] | |
998 } | |
999 | |
1000 # if there are no versions, log an error message since something must | |
1001 # be wrong | |
1002 if {[llength [dict get $stateItem "versions"]] == 0} { | |
1003 set warningMsg "\"$name\": no versions found" | |
1004 ${Log}::warn $warningMsg | |
1005 dict lappend stateItem "errors" $warningMsg | |
1006 } | |
1007 | |
1008 # update httpState item | |
1009 dict set State $name "errors" [dict get $stateItem "errors"] | |
1010 dict set State $name "timestamp" $timestamp | |
1011 if {[llength [dict get $stateItem "errors"]] == 0} { | |
1012 # expire old history entries | |
1013 set history [lrange [dict get $State $name "history"] \ | |
1014 [expr {[llength [dict get $State $name "history"]] - | |
1015 [dict get $Config "history_limit"] + 1}] end] | |
1016 | |
1017 # add currently latest available version to history if it is either | |
1018 # newer than the previous one or if the previous one is no longer | |
1019 # available (e.g. if it has been removed or the watchlist pattern | |
1020 # has been changed) | |
1021 set prevLatestVersion [lindex $history end 0] | |
1022 set curLatestVersion [lindex \ | |
1023 [lsort -command ::relmon::common::cmpVersion \ | |
1024 [dict keys [dict get $stateItem "versions"]]] end] | |
1025 if {([::relmon::common::cmpVersion $curLatestVersion \ | |
1026 $prevLatestVersion] > 0) || | |
1027 ![dict exists $stateItem "versions" $prevLatestVersion]} { | |
1028 lappend history [list $curLatestVersion $timestamp] | |
1029 dict set State $name "history" $history | |
1030 } | |
1031 dict set State $name "versions" [dict get $stateItem "versions"] | |
1032 } | |
1033 } | |
1034 | |
1035 proc ::relmon::update::CleanupBeforeExit {commandString op} { | |
1036 variable Lf | |
1037 | |
1038 # close logfile | |
1039 if {($Lf ne "") && ($Lf ni {stdin stderr})} { | |
1040 close $Lf | |
1041 set Lf "" | |
1042 } | |
1043 | |
1044 return | |
1045 } | |
1046 | |
960 proc ::relmon::update::main {args} { | 1047 proc ::relmon::update::main {args} { |
961 variable Config | 1048 variable Config |
962 variable usage | 1049 variable usage |
963 variable Statistics | |
964 variable Watchlist [dict create] | |
965 variable Queue [dict create] | |
966 variable HostConnections [dict create] | |
967 variable HostDelays [dict create] | |
968 variable ActiveTransfers [dict create] | |
969 variable State | 1050 variable State |
970 variable StateBuffer [dict create] | |
971 variable PreprocessedHtmlBuffer | |
972 variable Log | 1051 variable Log |
973 variable Lf "" | 1052 variable Lf "" |
974 variable ExitStatus | |
975 | 1053 |
976 # parse commandline | 1054 # parse commandline |
977 while {[set GetoptRet [cmdline::getopt args \ | 1055 while {[set GetoptRet [cmdline::getopt args \ |
978 {c.arg C.arg d D.arg e H.arg i.arg l.arg r.arg t.arg T.arg v} \ | 1056 {c.arg C.arg d D.arg e H.arg i.arg l.arg r.arg t.arg T.arg v} \ |
979 OptArg OptVal]] == 1} { | 1057 OptArg OptVal]] == 1} { |
1099 } else { | 1177 } else { |
1100 set Lf stderr | 1178 set Lf stderr |
1101 } | 1179 } |
1102 set Log [logger::init global] | 1180 set Log [logger::init global] |
1103 if {[dict get $Config "log_level"] eq "debug"} { | 1181 if {[dict get $Config "log_level"] eq "debug"} { |
1104 set logFormat {%d \[%p\] \[%M\] %m} | 1182 set logFormat $relmon::LogFormatDebug |
1105 } else { | 1183 } else { |
1106 set logFormat {%d \[%p\] %m} | 1184 set logFormat $relmon::LogFormatDefault |
1107 } | 1185 } |
1108 logger::utils::applyAppender -appender fileAppend -appenderArgs \ | 1186 logger::utils::applyAppender -appender fileAppend -appenderArgs \ |
1109 [list -outputChannel $Lf -conversionPattern $logFormat] \ | 1187 [list -outputChannel $Lf -conversionPattern $logFormat] \ |
1110 -serviceCmd $Log | 1188 -serviceCmd $Log |
1111 | 1189 |
1114 | 1192 |
1115 ${Log}::notice "relmon.tcl starting up" | 1193 ${Log}::notice "relmon.tcl starting up" |
1116 | 1194 |
1117 # parse the watchlist | 1195 # parse the watchlist |
1118 try { | 1196 try { |
1119 ParseWatchlist [dict get $Config "watchlist_file"] | 1197 set Watchlist [relmon::common::parseWatchlist \ |
1198 [dict get $Config "watchlist_file"]] | |
1120 } trap {POSIX} {errorMsg errorOptions} - \ | 1199 } trap {POSIX} {errorMsg errorOptions} - \ |
1121 trap {RELMON} {errorMsg errorOptions} { | 1200 trap {RELMON} {errorMsg errorOptions} { |
1122 ${Log}::error $errorMsg | 1201 ${Log}::error $errorMsg |
1123 exit 1 | 1202 exit 1 |
1124 } | 1203 } |
1133 trap {RELMON} {errorMsg} { | 1212 trap {RELMON} {errorMsg} { |
1134 ${Log}::error $errorMsg | 1213 ${Log}::error $errorMsg |
1135 exit 1 | 1214 exit 1 |
1136 } | 1215 } |
1137 | 1216 |
1138 # initialize queue and state buffer from the watchlist | 1217 # apply filters specified on the command line to watchlist items |
1139 dict set Statistics "start_time" [clock milliseconds] | 1218 set currentTime [clock milliseconds] |
1140 dict for {name watchlistItem} $Watchlist { | 1219 dict for {name watchlistItem} $Watchlist { |
1141 # apply filters specified on the command line to watchlist items | 1220 if {(([llength [dict get $Config "item_filter"]] > 0) && |
1142 if {([llength [dict get $Config "item_filter"]] > 0) && | 1221 ($name ni [dict get $Config "item_filter"])) || |
1143 ($name ni [dict get $Config "item_filter"])} { | 1222 ([dict get $Config "error_filter"] && |
1144 continue | |
1145 } | |
1146 | |
1147 if {[dict get $Config "error_filter"] && | |
1148 [dict exists $State $name "errors"] && | 1223 [dict exists $State $name "errors"] && |
1149 ([llength [dict get $State $name "errors"]] == 0)} { | 1224 ([llength [dict get $State $name "errors"]] == 0)) || |
1150 continue | 1225 ([dict exists $State $name "timestamp"] && |
1151 } | 1226 ([dict get $State $name "timestamp"] > $currentTime - |
1152 | 1227 [dict get $Config "timestamp_filter"]))} { |
1153 if {[dict exists $State $name "timestamp"] && | 1228 dict unset Watchlist $name |
1154 ([dict get $State $name "timestamp"] > | 1229 } |
1155 [dict get $Statistics "start_time"] - | 1230 } |
1156 [dict get $Config "timestamp_filter"])} { | 1231 |
1157 continue | 1232 set ExitStatus [relmon::crawler::crawl $Config $Watchlist $Log \ |
1158 } | 1233 [namespace code OnItemFinished]] |
1159 | |
1160 dict lappend Queue [::relmon::common::urlGetHost \ | |
1161 [dict get $watchlistItem "base_url"]] \ | |
1162 [dict create \ | |
1163 "name" $name \ | |
1164 "url" [dict get $watchlistItem "base_url"] \ | |
1165 "pattern_index" 0 \ | |
1166 "content_type" "" \ | |
1167 "num_redirects" 0 \ | |
1168 "num_retries" 0] | |
1169 dict incr Statistics "items" | |
1170 dict set StateBuffer $name [dict create "versions" [dict create] \ | |
1171 "errors" [list]] | |
1172 } | |
1173 | |
1174 # configure http and tls | |
1175 http::register https 443 [list tls::socket \ | |
1176 -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \ | |
1177 -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1] | |
1178 http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\ | |
1179 Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION" | |
1180 | |
1181 # handle errors while in the event loop | |
1182 interp bgerror {} [namespace code OnError] | |
1183 | |
1184 # enter the main loop | |
1185 after idle [namespace code ManageTransfers] | |
1186 vwait [namespace which -variable ExitStatus] | |
1187 | |
1188 dict set Statistics "end_time" [clock milliseconds] | |
1189 | |
1190 # display statistics | |
1191 ${Log}::notice "items checked: [dict get $Statistics "items"]" | |
1192 ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] - | |
1193 [dict get $Statistics "start_time"]) / 1000}]s" | |
1194 | 1234 |
1195 # serialize the new state | 1235 # serialize the new state |
1196 set JsonStateItems {} | 1236 set JsonStateItems {} |
1197 dict for {item data} $State { | 1237 dict for {item data} $State { |
1198 set versions {} | 1238 set versions {} |
1675 } | 1715 } |
1676 exit 0 | 1716 exit 0 |
1677 } | 1717 } |
1678 | 1718 |
1679 | 1719 |
1720 namespace eval ::relmon::discover { | |
1721 # commandline option help text | |
1722 variable usage "usage: relmon discover \[-d\] \[-c max_connections\]\ | |
1723 \[-C ca_dir\] \[-D host_delay\]\n\ | |
1724 \ \[-H max_connections_per_host\]\ | |
1725 \[-r retries\]\n\ | |
1726 \ \[-t before_seconds\]\ | |
1727 \[-T timeout_seconds\] base_url\n\ | |
1728 \ \[pattern...\]" | |
1729 | |
1730 # configuration options | |
1731 variable Config [dict create \ | |
1732 "log_file" "" \ | |
1733 "log_level" "info" \ | |
1734 "connection_limit" 16 \ | |
1735 "host_connection_limit" 4 \ | |
1736 "transfer_time_limit" 60000 \ | |
1737 "retry_limit" 3 \ | |
1738 "host_delay" 500 \ | |
1739 "host_delays" [dict create] \ | |
1740 "ca_dir" ""] | |
1741 | |
1742 # transfer statistics | |
1743 variable Statistics [dict create \ | |
1744 "start_time" [clock milliseconds] \ | |
1745 "end_time" 0 \ | |
1746 "requests" 0 \ | |
1747 "items" 0] | |
1748 | |
1749 # watchlist | |
1750 variable Queue [dict create] | |
1751 | |
1752 # logger handle | |
1753 variable Log | |
1754 } | |
1755 | |
1756 proc ::relmon::discover::OnItemFinished {name stateItem} { | |
1757 variable Log | |
1758 | |
1759 dict for {url urls} [dict get $stateItem "urls"] { | |
1760 dict for {linkUrl matched} $urls { | |
1761 if {$matched} { | |
1762 puts "\"$url\": found matching URL \"$linkUrl\"" | |
1763 } else { | |
1764 puts "\"$url\": found non-matching URL \"$linkUrl\"" | |
1765 } | |
1766 } | |
1767 } | |
1768 } | |
1769 | |
1770 proc ::relmon::discover::main {args} { | |
1771 variable Config | |
1772 variable usage | |
1773 variable Log | |
1774 variable Queue | |
1775 | |
1776 # parse commandline | |
1777 while {[set GetoptRet [cmdline::getopt args \ | |
1778 {c.arg C.arg d D.arg h H.arg r.arg t.arg T.arg ?} \ | |
1779 OptArg OptVal]] == 1} { | |
1780 switch -glob -- $OptArg { | |
1781 {c} { | |
1782 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { | |
1783 puts stderr "invalid value passed to \"-$OptArg\"" | |
1784 exit 1 | |
1785 } | |
1786 dict set Config "host_connection_limit" $OptVal | |
1787 } | |
1788 {C} { | |
1789 if {![file isdirectory $OptVal]} { | |
1790 puts stderr "directory \"$OptVal\" is not a directory" | |
1791 exit 1 | |
1792 } elseif {![file readable $OptVal] || | |
1793 ![file executable $OptVal]} { | |
1794 puts stderr "directory \"$OptVal\" is not readable" | |
1795 exit 1 | |
1796 } | |
1797 dict set Config "ca_dir" $OptVal | |
1798 } | |
1799 {d} { | |
1800 dict set Config "log_level" "debug" | |
1801 } | |
1802 {D} { | |
1803 if {![string is digit -strict $OptVal] || ($OptVal < 0)} { | |
1804 puts stderr "invalid value passed to \"-$OptArg\"" | |
1805 exit 1 | |
1806 } | |
1807 dict set Config "host_delay" $OptVal | |
1808 } | |
1809 {[h?]} { | |
1810 puts stderr $usage | |
1811 exit 0 | |
1812 } | |
1813 {H} { | |
1814 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { | |
1815 puts stderr "invalid value passed to \"-$OptArg\"" | |
1816 exit 1 | |
1817 } | |
1818 dict set Config "connection_limit" $OptVal | |
1819 } | |
1820 {r} { | |
1821 if {![string is digit -strict $OptVal] || ($OptVal < 0)} { | |
1822 puts stderr "invalid value passed to \"-$OptArg\"" | |
1823 exit 1 | |
1824 } | |
1825 dict set Config "retry_limit" $OptVal | |
1826 } | |
1827 {t} { | |
1828 if {![string is digit -strict $OptVal] || ($OptVal < 0)} { | |
1829 puts stderr "invalid value passed to \"-$OptArg\"" | |
1830 exit 1 | |
1831 } | |
1832 dict set Config "timestamp_filter" [expr {$OptVal * 1000}] | |
1833 } | |
1834 {T} { | |
1835 if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { | |
1836 puts stderr "invalid value passed to \"-$OptArg\"" | |
1837 exit 1 | |
1838 } | |
1839 dict set Config "transfer_time_limit" [expr {$OptVal * 1000}] | |
1840 } | |
1841 } | |
1842 } | |
1843 set argc [llength $args] | |
1844 if {$GetoptRet == -1} { | |
1845 puts stderr "unknown command line option \"-$OptArg\"" | |
1846 puts stderr $usage | |
1847 exit 1 | |
1848 } | |
1849 if {$argc < 1} { | |
1850 puts stderr $usage | |
1851 exit 1 | |
1852 } | |
1853 | |
1854 set patterns [lassign $args baseUrl] | |
1855 | |
1856 # validate URL | |
1857 if {![::relmon::common::isUrlValid $baseUrl]} { | |
1858 puts stderr "invalid base URL" | |
1859 exit 1 | |
1860 } | |
1861 | |
1862 # process patterns | |
1863 set processedPatterns {} | |
1864 set reInfo {} | |
1865 set patternIndex 0 | |
1866 foreach pattern $patterns { | |
1867 incr patternIndex | |
1868 | |
1869 # make trailing slashes optional | |
1870 if {($patternIndex != [llength $patterns]) && | |
1871 ([string index $pattern end] eq "/")} { | |
1872 append pattern {?} | |
1873 } | |
1874 | |
1875 # ensure patterns are anchored to the end of the line | |
1876 if {[string index $pattern end] ne "$"} { | |
1877 append pattern {$} | |
1878 } | |
1879 | |
1880 # actually validate the regular expression | |
1881 try { | |
1882 set reInfo [regexp -about -- $pattern ""] | |
1883 } on error {errorMsg} { | |
1884 puts stderr $errorMsg | |
1885 exit 1 | |
1886 } | |
1887 lappend processedPatterns $pattern | |
1888 } | |
1889 # add a dummy pattern "()" if the last pattern is does not contain a | |
1890 # capturing group, this will match every link and capture an empty string | |
1891 if {[lindex $reInfo 0] < 1} { | |
1892 lappend processedPatterns {()} | |
1893 } | |
1894 | |
1895 # construct a watchlist from the command line arguments | |
1896 set watchlist [dict create "new project" [dict create "base_url" $baseUrl \ | |
1897 "patterns" $processedPatterns]] | |
1898 | |
1899 set Log [logger::init global] | |
1900 if {[dict get $Config "log_level"] eq "debug"} { | |
1901 set logFormat $relmon::LogFormatDebug | |
1902 } else { | |
1903 set logFormat $relmon::LogFormatDefault | |
1904 } | |
1905 logger::utils::applyAppender -appender fileAppend -appenderArgs \ | |
1906 [list -outputChannel stderr -conversionPattern $logFormat] \ | |
1907 -serviceCmd $Log | |
1908 | |
1909 # set default logging level | |
1910 ${Log}::setlevel [dict get $Config "log_level"] | |
1911 | |
1912 ${Log}::notice "relmon.tcl starting up" | |
1913 | |
1914 set ExitStatus [relmon::crawler::crawl $Config $watchlist $Log \ | |
1915 [namespace code OnItemFinished]] | |
1916 | |
1917 exit $ExitStatus | |
1918 } | |
1919 | |
1920 | |
1680 proc ::relmon::main {args} { | 1921 proc ::relmon::main {args} { |
1681 variable usage | 1922 variable usage |
1682 set subArgs [lassign $args subCommand] | 1923 set subArgs [lassign $args subCommand] |
1683 | 1924 |
1684 # generate list of subcommands | 1925 # generate list of subcommands |