Mercurial > projects > relmon
changeset 5:86a0c5d11f05 version-1
Add discover subcommand
The discover subcommand assists with the creation of watchlist entries and
allows to reproduce each step of an update operation for a watchlist entry.
author | Guido Berhoerster <guido+relmon@berhoerster.name> |
---|---|
date | Sun, 26 Oct 2014 21:36:05 +0100 |
parents | f28486666a4f |
children | 45fe94dbc236 |
files | relmon.1.xml relmon.tcl.in |
diffstat | 2 files changed, 611 insertions(+), 229 deletions(-) [+] |
line wrap: on
line diff
--- a/relmon.1.xml Fri Oct 24 22:44:39 2014 +0200 +++ b/relmon.1.xml Sun Oct 26 21:36:05 2014 +0100 @@ -150,6 +150,42 @@ </arg> </cmdsynopsis> <cmdsynopsis> + <command>relmon discover</command> + <arg choice="opt"> + <option>-d</option> + </arg> + <arg choice="opt"> + <option>-c</option> + <replaceable>max_connections</replaceable> + </arg> + <arg choice="opt"> + <option>-C</option> + <replaceable>ca_dir</replaceable> + </arg> + <arg choice="opt"> + <option>-D</option> + <replaceable>delay</replaceable> + </arg> + <arg choice="opt"> + <option>-H</option> + <replaceable>max_host_connections</replaceable> + </arg> + <arg choice="opt"> + <option>-r</option> + <replaceable>retries</replaceable> + </arg> + <arg choice="opt"> + <option>-t</option> + <replaceable>min_time</replaceable> + </arg> + <arg choice="plain"> + <replaceable>base_url</replaceable> + </arg> + <arg choice="opt"> + <replaceable>pattern</replaceable> + </arg> + </cmdsynopsis> + <cmdsynopsis> <command>relmon help</command> <arg choice="opt"> <replaceable>subcommand</replaceable> @@ -396,6 +432,97 @@ </varlistentry> <varlistentry> <term> + <command>discover</command> + </term> + <listitem> + <para>The <command>discover</command> subcommand assists with the + creation of watchlist entries. The arguments to the + <command>discover</command> subcommand correspond to the fields + of a watchlist entry without the name field, see + <citerefentry><refentrytitle>relmon_format</refentrytitle> + <manvolnum>4</manvolnum></citerefentry> for details on the + format. Only the <replaceable>base_url</replaceable> is mandatory + and <command>relmon</command> will log all matching and + non-matching links of each retrieved document or feed so that + each step of an update operation for a watchlist entry can be + reproduced.</para> + <para>The following options are supported:</para> + <variablelist> + <varlistentry> + <term> + <option>-d</option> + </term> + <listitem> + <para>Trace and log all transfers and parsing. This option is + used for debugging purposes.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-c</option> + <replaceable>max_connections</replaceable> + </term> + <listitem> + <para>Limit the number of simultaneous connections to the + specified number.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-C</option> + <replaceable>ca_dir</replaceable> + </term> + <listitem> + <para>Verify the validity of TLS certificates using the CA + certificates in the specified directory.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-D</option> + <replaceable>delay</replaceable> + </term> + <listitem> + <para>Wait at least the specified number of seconds before + making subsequent connections to the same host.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-H</option> + <replaceable>max_host_connections</replaceable> + </term> + <listitem> + <para>Limit the number of simultaneous connections to a single + host to the specified number.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-r</option> + <replaceable>retries</replaceable> + </term> + <listitem> + <para>Limit the number of retries in case of connection + failures.</para> + </listitem> + </varlistentry> + <varlistentry> + <term> + <option>-t</option> + <replaceable>min_time</replaceable> + </term> + <listitem> + <para>Only update version information for projects from the + watchlist which have not been updated for the specified + number of seconds.</para> + </listitem> + </varlistentry> + </variablelist> + </listitem> + </varlistentry> + <varlistentry> + <term> <command>help</command> </term> <listitem> @@ -409,6 +536,20 @@ <refsect1> <title>Examples</title> <example> + <title>Creating a new watchlist entry</title> + <para>The following command displays all links found in the HTML document + at <uri>http://example.net/foo/</uri>:</para> + <screen> +$ relmon discover http://example.net/foo/ + </screen> + <para>The following command tests whether the specified version-matching + regular expression matches the distribution file linked from + <uri>http://example.net/foo/</uri>:</para> + <screen> +$ relmon discover http://example.net/foo/ '/foo-([[:digit:].]+)\.tar\.gz' + </screen> + </example> + <example> <title>Updating version information</title> <para>The following command retrieves the version information for the entries in the watchlist <filename>foo.watchlist</filename> and stores
--- a/relmon.tcl.in Fri Oct 24 22:44:39 2014 +0200 +++ b/relmon.tcl.in Sun Oct 26 21:36:05 2014 +0100 @@ -42,6 +42,12 @@ namespace eval ::relmon { # version variable VERSION @VERSION@ + + # default log format + variable LogFormatDefault {%d \[%p\] %m} + + # debugging log format + variable LogFormatDebug {%d \[%p\] \[%M\] %m} } @@ -177,97 +183,8 @@ return $state } - -namespace eval ::relmon::update { - # commandline option help text - variable usage "usage: relmon update \[-dev\] \[-c max_connections\] \[-C\ - ca_dir\] \[-D delay\]\n\ - \ \[-H max_host_connections\] \[-i\ - item\[,...\]\] \[-l logfile\]\n\ - \ \[-r retries\] \[-t min_time\] watchlist\ - statefile" - - # configuration options - variable Config [dict create \ - "log_file" "" \ - "log_level" "notice" \ - "history_limit" 20 \ - "connection_limit" 16 \ - "host_connection_limit" 4 \ - "transfer_time_limit" 60000 \ - "retry_limit" 3 \ - "host_delay" 0 \ - "timestamp_filter" 0 \ - "error_filter" 0 \ - "item_filter" {} \ - "ca_dir" "" \ - "state_file" "" \ - "watchlist_file" ""] - - # exit status - variable ExitStatus - - # transfer statistics - variable Statistics [dict create \ - "start_time" 0 \ - "end_time" 0 \ - "requests" 0 \ - "items" 0] - - # watchlist - variable Watchlist - - # ID of a delayed run of ManageTransfers - variable ManageTransfersId "" - - # queue of pending transfers - variable Queue - - # number of active connections per host - variable HostConnections - - # delays before opening a new connection to a host - variable HostDelays - - # active transfers - variable ActiveTransfers - - # buffer for tracking the state of unfinished items - variable StateBuffer - - # buffer needed by htmlparse::parse for constructing the preprocessed HTML - # document - variable PreprocessedHtmlBuffer - - # logger handle - variable Log - - # logfile handle - variable Lf -} - -proc ::relmon::update::OnError {message returnOptions} { - # internal error, abort - puts stderr [dict get $returnOptions "-errorinfo"] - - exit 1 -} - -proc ::relmon::update::CleanupBeforeExit {commandString operation} { - variable Lf - - # close logfile - if {($Lf ne "") && ($Lf ni {stdin stderr})} { - close $Lf - set Lf "" - } - - return -} - -proc ::relmon::update::ParseWatchlist {watchlistFilename} { - variable Watchlist - +proc ::relmon::common::parseWatchlist {watchlistFilename} { + set Watchlist [dict create] set lineno 0 set f [open $watchlistFilename "r"] try { @@ -338,10 +255,66 @@ close $f } - return + return $Watchlist } -proc ::relmon::update::ProcessHtmlElement {tag slash param textBehindTheTag} { + +namespace eval ::relmon::crawler { + # status returned by crawl + variable Status + + # configuration + variable Config + + # transfer statistics + variable Statistics [dict create \ + "start_time" 0 \ + "end_time" 0 \ + "requests" 0 \ + "items" 0] + + # ID of a delayed run of ManageTransfers + variable ManageTransfersId "" + + # queue of pending transfers + variable Queue + + # number of active connections per host + variable HostConnections [dict create] + + # delays before opening a new connection to a host + variable HostDelays [dict create] + + # active transfers + variable ActiveTransfers [dict create] + + # number of running or queued transfers of watchlist items + variable RemainingTransfers [dict create] + + # buffer for tracking the state of unfinished items + variable StateBuffer [dict create] + + # buffer needed by htmlparse::parse for constructing the preprocessed HTML + # document + variable PreprocessedHtmlBuffer + + # callback when all transfers for an item are finished + variable OnItemFinishedCmd + + # logger handle + variable Log + + namespace export crawl +} + +proc ::relmon::crawler::OnError {message returnOptions} { + # internal error, abort + puts stderr [dict get $returnOptions "-errorinfo"] + + exit 1 +} + +proc ::relmon::crawler::ProcessHtmlElement {tag slash param textBehindTheTag} { variable PreprocessedHtmlBuffer # copy every "<a>" element into PreprocessedHtmlBuffer @@ -352,7 +325,7 @@ return } -proc ::relmon::update::PreprocessHtml {bodyDataName} { +proc ::relmon::crawler::PreprocessHtml {bodyDataName} { upvar 1 $bodyDataName bodyData variable PreprocessedHtmlBuffer @@ -367,7 +340,7 @@ append PreprocessedHtmlBuffer "</body></html>" } -proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl +proc ::relmon::crawler::ExtractUrls {bodyDataName contentType baseUrl rePattern} { upvar 1 $bodyDataName bodyData set extractedUrls {} @@ -462,7 +435,7 @@ return $resultUrls } -proc ::relmon::update::StateItemAppendError {name logMsg} { +proc ::relmon::crawler::StateItemAppendError {name logMsg} { variable StateBuffer dict update StateBuffer $name stateItem { @@ -472,11 +445,12 @@ return } -proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} { +proc ::relmon::crawler::HandleSuccessfulTransfer {item httpBodyName} { upvar 1 $httpBodyName httpBody variable Log variable StateBuffer variable Queue + variable RemainingTransfers variable Watchlist set name [dict get $item "name"] @@ -516,6 +490,7 @@ [dict create "name" $name "url" $newUrl \ "pattern_index" [expr {$patternIndex + 1}] \ "content_type" "" "num_redirects" 0 "num_retries" 0] + dict incr RemainingTransfers $name } else { ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\"" } @@ -524,6 +499,7 @@ # otherwise this branch has finished, try to extract the versions and # store them in the buffer dict for {finalUrl matched} $urls { + dict set StateBuffer $name "urls" $url $urls if {$matched} { regexp -line -- $pattern $finalUrl -> version if {$version ne ""} { @@ -544,9 +520,10 @@ return } -proc ::relmon::update::HandleRedirect {item httpCode httpHeaders} { +proc ::relmon::crawler::HandleRedirect {item httpCode httpHeaders} { variable Log variable Queue + variable RemainingTransfers set name [dict get $item "name"] set url [dict get $item "url"] @@ -588,6 +565,7 @@ [dict replace $item "url" $redirectUrl "content_type" "" \ "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \ "num_retries" 0] + dict incr RemainingTransfers $name } else { set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ redirects" @@ -598,7 +576,7 @@ return } -proc ::relmon::update::HandleProtocolError {item httpCode} { +proc ::relmon::crawler::HandleProtocolError {item httpCode} { variable Log set name [dict get $item "name"] set url [dict get $item "url"] @@ -608,10 +586,11 @@ return } -proc ::relmon::update::HandleTimeoutReset {item} { +proc ::relmon::crawler::HandleTimeoutReset {item} { variable Log variable Config variable Queue + variable RemainingTransfers set name [dict get $item "name"] set url [dict get $item "url"] @@ -622,6 +601,7 @@ dict lappend Queue [::relmon::common::urlGetHost $url] \ [dict replace $item \ "num_retries" [expr {[dict get $item "num_retries"] + 1}]] + dict incr RemainingTransfers $name } else { set warningMsg "\"$name\": \"$url\": exceeded maximum number of\ retries" @@ -632,7 +612,7 @@ return } -proc ::relmon::update::HandleConnectionError {item errorMsg} { +proc ::relmon::crawler::HandleConnectionError {item errorMsg} { variable Log set name [dict get $item "name"] set url [dict get $item "url"] @@ -642,7 +622,7 @@ return } -proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} { +proc ::relmon::crawler::TransferCallbackWrapper {callbackCmd args} { # ensure that exceptions get raised, by default http catches all errors and # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262 if {[catch {eval $callbackCmd $args} -> errorOptions]} { @@ -651,14 +631,15 @@ return } -proc ::relmon::update::ManageTransfers {} { +proc ::relmon::crawler::ManageTransfers {} { variable Config variable ManageTransfersId variable Queue variable HostConnections variable HostDelays variable ActiveTransfers - variable ExitStatus + variable RemainingTransfers + variable Status variable Log after cancel $ManageTransfersId @@ -726,12 +707,17 @@ set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg" ${Log}::warn $warningMsg StateItemAppendError $name $warningMsg + dict incr RemainingTransfers $name -1 } } # terminate the event loop if there are no remaining transfers - if {([dict size $ActiveTransfers]) == 0 && ([dict size $Queue] == 0)} { - set ExitStatus 0 + if {[::tcl::mathop::+ {*}[dict values $RemainingTransfers]] == 0} { + if {([dict size $ActiveTransfers]) > 0 || ([dict size $Queue] > 0)} { + ${Log}::error "inconsistent internal state" + set Status 1 + } + set Status 0 return } @@ -765,7 +751,7 @@ return } -proc ::relmon::update::OnTransferProgress {token total current} { +proc ::relmon::crawler::OnTransferProgress {token total current} { upvar #0 $token httpState variable ActiveTransfers variable Log @@ -792,16 +778,17 @@ } } -proc ::relmon::update::OnTransferFinished {token} { +proc ::relmon::crawler::OnTransferFinished {token} { upvar #0 $token httpState variable Config variable HostConnections - variable Queue variable ActiveTransfers + variable RemainingTransfers variable Statistics variable StateBuffer variable State variable Log + variable OnItemFinishedCmd set item [dict get $ActiveTransfers $token] set name [dict get $item "name"] @@ -811,6 +798,7 @@ # for this item dict unset ActiveTransfers $token dict incr HostConnections $host -1 + dict incr RemainingTransfers $name -1 switch -- $httpState(status) { {ok} { @@ -854,63 +842,9 @@ } # check if all transfers of this item are finished - set itemFinished 1 - dict for {queueHost queueItems} $Queue { - foreach queueItem $queueItems { - if {[dict get $queueItem "name"] eq $name} { - set itemFinished 0 - } - } - } - dict for {activeToken activeItem} $ActiveTransfers { - if {[dict get $activeItem "name"] eq $name} { - set itemFinished 0 - } - } - if {$itemFinished} { - set timestamp [clock milliseconds] - - # create httpState item if it does not exist yet - if {![dict exists $State $name]} { - dict set State $name [dict create "versions" [dict create] \ - "history" [list] "timestamp" 0 "errors" [list]] - } + if {[dict get $RemainingTransfers $name] == 0} { + eval $OnItemFinishedCmd [list $name [dict get $StateBuffer $name]] - # if there are no versions, log an error message since something must - # be wrong - if {[llength [dict get $StateBuffer $name "versions"]] == 0} { - set warningMsg "\"$name\": no versions found" - ${Log}::warn $warningMsg - StateItemAppendError $name $warningMsg - } - - # update httpState item - dict set State $name "errors" [dict get $StateBuffer $name "errors"] - dict set State $name "timestamp" $timestamp - if {[llength [dict get $StateBuffer $name "errors"]] == 0} { - # expire old history entries - set history [lrange [dict get $State $name "history"] \ - [expr {[llength [dict get $State $name "history"]] - - [dict get $Config "history_limit"] + 1}] end] - - # add currently latest available version to history if it is either - # newer than the previous one or if the previous one is no longer - # available (e.g. if it has been removed or the watchlist pattern - # has been changed) - set prevLatestVersion [lindex $history end 0] - set curLatestVersion [lindex \ - [lsort -command ::relmon::common::cmpVersion \ - [dict keys [dict get $StateBuffer $name "versions"]]] end] - if {([::relmon::common::cmpVersion $curLatestVersion \ - $prevLatestVersion] > 0) || - ![dict exists $StateBuffer $name "versions" \ - $prevLatestVersion]} { - lappend history [list $curLatestVersion $timestamp] - dict set State $name "history" $history - } - dict set State $name "versions" [dict get $StateBuffer $name \ - "versions"] - } dict unset StateBuffer $name ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \ @@ -925,7 +859,7 @@ } # control certificate verification and log errors during TLS handshake -proc ::relmon::update::OnTlsHandshake {type args} { +proc ::relmon::crawler::OnTlsHandshake {type args} { variable Config variable Log @@ -957,21 +891,165 @@ } } +proc ::relmon::crawler::crawl {config watchlist log onItemFinishedCmd} { + variable Config $config + variable Watchlist $watchlist + variable Queue + variable Statistics + variable Log $log + variable OnItemFinishedCmd $onItemFinishedCmd + variable Status + variable RemainingTransfers + variable StateBuffer + + # initialize queue and state buffer from the watchlist + set Queue [dict create] + dict for {name watchlistItem} $Watchlist { + dict lappend Queue [::relmon::common::urlGetHost \ + [dict get $watchlistItem "base_url"]] \ + [dict create \ + "name" $name \ + "url" [dict get $watchlistItem "base_url"] \ + "pattern_index" 0 \ + "content_type" "" \ + "num_redirects" 0 \ + "num_retries" 0] + dict set RemainingTransfers $name 1 + dict incr Statistics "items" + dict set StateBuffer $name [dict create "versions" [dict create] \ + "errors" [list] "urls" [dict create]] + } + + # configure http and tls + http::register https 443 [list tls::socket \ + -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \ + -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1] + http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\ + Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION" + + # handle errors while in the event loop + interp bgerror {} [namespace code OnError] + + dict set Statistics "start_time" [clock milliseconds] + + # enter the main loop + after idle [namespace code ManageTransfers] + vwait [namespace which -variable Status] + + # display statistics + dict set Statistics "end_time" [clock milliseconds] + ${Log}::notice "items checked: [dict get $Statistics "items"]" + ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] - + [dict get $Statistics "start_time"]) / 1000}]s" + + return $Status +} + + +namespace eval ::relmon::update { + # commandline option help text + variable usage "usage: relmon update \[-dev\] \[-c max_connections\]\ + \[-C ca_dir\] \[-D host_delay\]\n\ + \ \[-H max_connections_per_host\]\ + \[-i item_list\]\n\ + \ \[-l logfile\] \[-r retries\]\ + \[-t before_seconds\]\n\ + \ \[-T timeout_seconds\] watchlist statefile" + + # configuration options + variable Config [dict create \ + "log_file" "" \ + "log_level" "notice" \ + "history_limit" 20 \ + "connection_limit" 16 \ + "host_connection_limit" 4 \ + "transfer_time_limit" 60000 \ + "retry_limit" 3 \ + "host_delay" 500 \ + "host_delays" [dict create] \ + "timestamp_filter" 0 \ + "error_filter" 0 \ + "item_filter" {} \ + "ca_dir" "" \ + "state_file" "" \ + "watchlist_file" ""] + + # watchlist + variable Watchlist + + # logger handle + variable Log + + # logfile handle + variable Lf +} + +proc ::relmon::update::OnItemFinished {name stateItem} { + variable State + variable Log + variable Config + + set timestamp [clock milliseconds] + + # create httpState item if it does not exist yet + if {![dict exists $State $name]} { + dict set State $name [dict create "versions" [dict create] \ + "history" [list] "timestamp" 0 "errors" [list]] + } + + # if there are no versions, log an error message since something must + # be wrong + if {[llength [dict get $stateItem "versions"]] == 0} { + set warningMsg "\"$name\": no versions found" + ${Log}::warn $warningMsg + dict lappend stateItem "errors" $warningMsg + } + + # update httpState item + dict set State $name "errors" [dict get $stateItem "errors"] + dict set State $name "timestamp" $timestamp + if {[llength [dict get $stateItem "errors"]] == 0} { + # expire old history entries + set history [lrange [dict get $State $name "history"] \ + [expr {[llength [dict get $State $name "history"]] - + [dict get $Config "history_limit"] + 1}] end] + + # add currently latest available version to history if it is either + # newer than the previous one or if the previous one is no longer + # available (e.g. if it has been removed or the watchlist pattern + # has been changed) + set prevLatestVersion [lindex $history end 0] + set curLatestVersion [lindex \ + [lsort -command ::relmon::common::cmpVersion \ + [dict keys [dict get $stateItem "versions"]]] end] + if {([::relmon::common::cmpVersion $curLatestVersion \ + $prevLatestVersion] > 0) || + ![dict exists $stateItem "versions" $prevLatestVersion]} { + lappend history [list $curLatestVersion $timestamp] + dict set State $name "history" $history + } + dict set State $name "versions" [dict get $stateItem "versions"] + } +} + +proc ::relmon::update::CleanupBeforeExit {commandString op} { + variable Lf + + # close logfile + if {($Lf ne "") && ($Lf ni {stdin stderr})} { + close $Lf + set Lf "" + } + + return +} + proc ::relmon::update::main {args} { variable Config variable usage - variable Statistics - variable Watchlist [dict create] - variable Queue [dict create] - variable HostConnections [dict create] - variable HostDelays [dict create] - variable ActiveTransfers [dict create] variable State - variable StateBuffer [dict create] - variable PreprocessedHtmlBuffer variable Log variable Lf "" - variable ExitStatus # parse commandline while {[set GetoptRet [cmdline::getopt args \ @@ -1101,9 +1179,9 @@ } set Log [logger::init global] if {[dict get $Config "log_level"] eq "debug"} { - set logFormat {%d \[%p\] \[%M\] %m} + set logFormat $relmon::LogFormatDebug } else { - set logFormat {%d \[%p\] %m} + set logFormat $relmon::LogFormatDefault } logger::utils::applyAppender -appender fileAppend -appenderArgs \ [list -outputChannel $Lf -conversionPattern $logFormat] \ @@ -1116,7 +1194,8 @@ # parse the watchlist try { - ParseWatchlist [dict get $Config "watchlist_file"] + set Watchlist [relmon::common::parseWatchlist \ + [dict get $Config "watchlist_file"]] } trap {POSIX} {errorMsg errorOptions} - \ trap {RELMON} {errorMsg errorOptions} { ${Log}::error $errorMsg @@ -1135,62 +1214,23 @@ exit 1 } - # initialize queue and state buffer from the watchlist - dict set Statistics "start_time" [clock milliseconds] + # apply filters specified on the command line to watchlist items + set currentTime [clock milliseconds] dict for {name watchlistItem} $Watchlist { - # apply filters specified on the command line to watchlist items - if {([llength [dict get $Config "item_filter"]] > 0) && - ($name ni [dict get $Config "item_filter"])} { - continue - } - - if {[dict get $Config "error_filter"] && + if {(([llength [dict get $Config "item_filter"]] > 0) && + ($name ni [dict get $Config "item_filter"])) || + ([dict get $Config "error_filter"] && [dict exists $State $name "errors"] && - ([llength [dict get $State $name "errors"]] == 0)} { - continue + ([llength [dict get $State $name "errors"]] == 0)) || + ([dict exists $State $name "timestamp"] && + ([dict get $State $name "timestamp"] > $currentTime - + [dict get $Config "timestamp_filter"]))} { + dict unset Watchlist $name } - - if {[dict exists $State $name "timestamp"] && - ([dict get $State $name "timestamp"] > - [dict get $Statistics "start_time"] - - [dict get $Config "timestamp_filter"])} { - continue - } - - dict lappend Queue [::relmon::common::urlGetHost \ - [dict get $watchlistItem "base_url"]] \ - [dict create \ - "name" $name \ - "url" [dict get $watchlistItem "base_url"] \ - "pattern_index" 0 \ - "content_type" "" \ - "num_redirects" 0 \ - "num_retries" 0] - dict incr Statistics "items" - dict set StateBuffer $name [dict create "versions" [dict create] \ - "errors" [list]] } - # configure http and tls - http::register https 443 [list tls::socket \ - -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \ - -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1] - http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\ - Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION" - - # handle errors while in the event loop - interp bgerror {} [namespace code OnError] - - # enter the main loop - after idle [namespace code ManageTransfers] - vwait [namespace which -variable ExitStatus] - - dict set Statistics "end_time" [clock milliseconds] - - # display statistics - ${Log}::notice "items checked: [dict get $Statistics "items"]" - ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] - - [dict get $Statistics "start_time"]) / 1000}]s" + set ExitStatus [relmon::crawler::crawl $Config $Watchlist $Log \ + [namespace code OnItemFinished]] # serialize the new state set JsonStateItems {} @@ -1677,6 +1717,207 @@ } +namespace eval ::relmon::discover { + # commandline option help text + variable usage "usage: relmon discover \[-d\] \[-c max_connections\]\ + \[-C ca_dir\] \[-D host_delay\]\n\ + \ \[-H max_connections_per_host\]\ + \[-r retries\]\n\ + \ \[-t before_seconds\]\ + \[-T timeout_seconds\] base_url\n\ + \ \[pattern...\]" + + # configuration options + variable Config [dict create \ + "log_file" "" \ + "log_level" "info" \ + "connection_limit" 16 \ + "host_connection_limit" 4 \ + "transfer_time_limit" 60000 \ + "retry_limit" 3 \ + "host_delay" 500 \ + "host_delays" [dict create] \ + "ca_dir" ""] + + # transfer statistics + variable Statistics [dict create \ + "start_time" [clock milliseconds] \ + "end_time" 0 \ + "requests" 0 \ + "items" 0] + + # watchlist + variable Queue [dict create] + + # logger handle + variable Log +} + +proc ::relmon::discover::OnItemFinished {name stateItem} { + variable Log + + dict for {url urls} [dict get $stateItem "urls"] { + dict for {linkUrl matched} $urls { + if {$matched} { + puts "\"$url\": found matching URL \"$linkUrl\"" + } else { + puts "\"$url\": found non-matching URL \"$linkUrl\"" + } + } + } +} + +proc ::relmon::discover::main {args} { + variable Config + variable usage + variable Log + variable Queue + + # parse commandline + while {[set GetoptRet [cmdline::getopt args \ + {c.arg C.arg d D.arg h H.arg r.arg t.arg T.arg ?} \ + OptArg OptVal]] == 1} { + switch -glob -- $OptArg { + {c} { + if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "host_connection_limit" $OptVal + } + {C} { + if {![file isdirectory $OptVal]} { + puts stderr "directory \"$OptVal\" is not a directory" + exit 1 + } elseif {![file readable $OptVal] || + ![file executable $OptVal]} { + puts stderr "directory \"$OptVal\" is not readable" + exit 1 + } + dict set Config "ca_dir" $OptVal + } + {d} { + dict set Config "log_level" "debug" + } + {D} { + if {![string is digit -strict $OptVal] || ($OptVal < 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "host_delay" $OptVal + } + {[h?]} { + puts stderr $usage + exit 0 + } + {H} { + if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "connection_limit" $OptVal + } + {r} { + if {![string is digit -strict $OptVal] || ($OptVal < 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "retry_limit" $OptVal + } + {t} { + if {![string is digit -strict $OptVal] || ($OptVal < 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "timestamp_filter" [expr {$OptVal * 1000}] + } + {T} { + if {![string is digit -strict $OptVal] || ($OptVal <= 0)} { + puts stderr "invalid value passed to \"-$OptArg\"" + exit 1 + } + dict set Config "transfer_time_limit" [expr {$OptVal * 1000}] + } + } + } + set argc [llength $args] + if {$GetoptRet == -1} { + puts stderr "unknown command line option \"-$OptArg\"" + puts stderr $usage + exit 1 + } + if {$argc < 1} { + puts stderr $usage + exit 1 + } + + set patterns [lassign $args baseUrl] + + # validate URL + if {![::relmon::common::isUrlValid $baseUrl]} { + puts stderr "invalid base URL" + exit 1 + } + + # process patterns + set processedPatterns {} + set reInfo {} + set patternIndex 0 + foreach pattern $patterns { + incr patternIndex + + # make trailing slashes optional + if {($patternIndex != [llength $patterns]) && + ([string index $pattern end] eq "/")} { + append pattern {?} + } + + # ensure patterns are anchored to the end of the line + if {[string index $pattern end] ne "$"} { + append pattern {$} + } + + # actually validate the regular expression + try { + set reInfo [regexp -about -- $pattern ""] + } on error {errorMsg} { + puts stderr $errorMsg + exit 1 + } + lappend processedPatterns $pattern + } + # add a dummy pattern "()" if the last pattern is does not contain a + # capturing group, this will match every link and capture an empty string + if {[lindex $reInfo 0] < 1} { + lappend processedPatterns {()} + } + + # construct a watchlist from the command line arguments + set watchlist [dict create "new project" [dict create "base_url" $baseUrl \ + "patterns" $processedPatterns]] + + set Log [logger::init global] + if {[dict get $Config "log_level"] eq "debug"} { + set logFormat $relmon::LogFormatDebug + } else { + set logFormat $relmon::LogFormatDefault + } + logger::utils::applyAppender -appender fileAppend -appenderArgs \ + [list -outputChannel stderr -conversionPattern $logFormat] \ + -serviceCmd $Log + + # set default logging level + ${Log}::setlevel [dict get $Config "log_level"] + + ${Log}::notice "relmon.tcl starting up" + + set ExitStatus [relmon::crawler::crawl $Config $watchlist $Log \ + [namespace code OnItemFinished]] + + exit $ExitStatus +} + + proc ::relmon::main {args} { variable usage set subArgs [lassign $args subCommand]