projects/relmon

changeset 5:86a0c5d11f05 version-1

Add discover subcommand

The discover subcommand assists with the creation of watchlist entries and
allows to reproduce each step of an update operation for a watchlist entry.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun Oct 26 21:36:05 2014 +0100 (2014-10-26)
parents f28486666a4f
children 45fe94dbc236
files relmon.1.xml relmon.tcl.in
line diff
     1.1 --- a/relmon.1.xml	Fri Oct 24 22:44:39 2014 +0200
     1.2 +++ b/relmon.1.xml	Sun Oct 26 21:36:05 2014 +0100
     1.3 @@ -150,6 +150,42 @@
     1.4        </arg>
     1.5      </cmdsynopsis>
     1.6      <cmdsynopsis>
     1.7 +      <command>relmon discover</command>
     1.8 +      <arg choice="opt">
     1.9 +        <option>-d</option>
    1.10 +      </arg>
    1.11 +      <arg choice="opt">
    1.12 +        <option>-c</option>
    1.13 +        <replaceable>max_connections</replaceable>
    1.14 +      </arg>
    1.15 +      <arg choice="opt">
    1.16 +        <option>-C</option>
    1.17 +        <replaceable>ca_dir</replaceable>
    1.18 +      </arg>
    1.19 +      <arg choice="opt">
    1.20 +        <option>-D</option>
    1.21 +        <replaceable>delay</replaceable>
    1.22 +      </arg>
    1.23 +      <arg choice="opt">
    1.24 +        <option>-H</option>
    1.25 +        <replaceable>max_host_connections</replaceable>
    1.26 +      </arg>
    1.27 +      <arg choice="opt">
    1.28 +        <option>-r</option>
    1.29 +        <replaceable>retries</replaceable>
    1.30 +      </arg>
    1.31 +      <arg choice="opt">
    1.32 +        <option>-t</option>
    1.33 +        <replaceable>min_time</replaceable>
    1.34 +      </arg>
    1.35 +      <arg choice="plain">
    1.36 +        <replaceable>base_url</replaceable>
    1.37 +      </arg>
    1.38 +      <arg choice="opt">
    1.39 +        <replaceable>pattern</replaceable>
    1.40 +      </arg>
    1.41 +    </cmdsynopsis>
    1.42 +    <cmdsynopsis>
    1.43        <command>relmon help</command>
    1.44        <arg choice="opt">
    1.45          <replaceable>subcommand</replaceable>
    1.46 @@ -396,6 +432,97 @@
    1.47        </varlistentry>
    1.48        <varlistentry>
    1.49          <term>
    1.50 +          <command>discover</command>
    1.51 +        </term>
    1.52 +        <listitem>
    1.53 +            <para>The <command>discover</command> subcommand assists with the
    1.54 +              creation of watchlist entries. The arguments to the
    1.55 +              <command>discover</command> subcommand correspond to the fields
    1.56 +              of a watchlist entry without the name field, see
    1.57 +              <citerefentry><refentrytitle>relmon_format</refentrytitle>
    1.58 +              <manvolnum>4</manvolnum></citerefentry> for details on the
    1.59 +              format. Only the <replaceable>base_url</replaceable> is mandatory
    1.60 +              and <command>relmon</command> will log all matching and
    1.61 +              non-matching links of each retrieved document or feed so that
    1.62 +              each step of an update operation for a watchlist entry can be
    1.63 +              reproduced.</para>
    1.64 +          <para>The following options are supported:</para>
    1.65 +          <variablelist>
    1.66 +            <varlistentry>
    1.67 +              <term>
    1.68 +                <option>-d</option>
    1.69 +              </term>
    1.70 +              <listitem>
    1.71 +                <para>Trace and log all transfers and parsing. This option is
    1.72 +                  used for debugging purposes.</para>
    1.73 +              </listitem>
    1.74 +            </varlistentry>
    1.75 +            <varlistentry>
    1.76 +              <term>
    1.77 +                <option>-c</option>
    1.78 +                <replaceable>max_connections</replaceable>
    1.79 +              </term>
    1.80 +              <listitem>
    1.81 +                <para>Limit the number of simultaneous connections to the
    1.82 +                  specified number.</para>
    1.83 +              </listitem>
    1.84 +            </varlistentry>
    1.85 +            <varlistentry>
    1.86 +              <term>
    1.87 +                <option>-C</option>
    1.88 +                <replaceable>ca_dir</replaceable>
    1.89 +              </term>
    1.90 +              <listitem>
    1.91 +                <para>Verify the validity of TLS certificates using the CA
    1.92 +                  certificates in the specified directory.</para>
    1.93 +              </listitem>
    1.94 +            </varlistentry>
    1.95 +            <varlistentry>
    1.96 +              <term>
    1.97 +                <option>-D</option>
    1.98 +                <replaceable>delay</replaceable>
    1.99 +              </term>
   1.100 +              <listitem>
   1.101 +                <para>Wait at least the specified number of seconds before
   1.102 +                  making subsequent connections to the same host.</para>
   1.103 +              </listitem>
   1.104 +            </varlistentry>
   1.105 +            <varlistentry>
   1.106 +              <term>
   1.107 +                <option>-H</option>
   1.108 +                <replaceable>max_host_connections</replaceable>
   1.109 +              </term>
   1.110 +              <listitem>
   1.111 +                <para>Limit the number of simultaneous connections to a single
   1.112 +                  host to the specified number.</para>
   1.113 +              </listitem>
   1.114 +            </varlistentry>
   1.115 +            <varlistentry>
   1.116 +              <term>
   1.117 +                <option>-r</option>
   1.118 +                <replaceable>retries</replaceable>
   1.119 +              </term>
   1.120 +              <listitem>
   1.121 +                <para>Limit the number of retries in case of connection
   1.122 +                  failures.</para>
   1.123 +              </listitem>
   1.124 +            </varlistentry>
   1.125 +            <varlistentry>
   1.126 +              <term>
   1.127 +                <option>-t</option>
   1.128 +                <replaceable>min_time</replaceable>
   1.129 +              </term>
   1.130 +              <listitem>
   1.131 +                <para>Only update version information for projects from the
   1.132 +                  watchlist which have not been updated for the specified
   1.133 +                  number of seconds.</para>
   1.134 +              </listitem>
   1.135 +            </varlistentry>
   1.136 +          </variablelist>
   1.137 +        </listitem>
   1.138 +      </varlistentry>
   1.139 +      <varlistentry>
   1.140 +        <term>
   1.141            <command>help</command>
   1.142          </term>
   1.143          <listitem>
   1.144 @@ -409,6 +536,20 @@
   1.145    <refsect1>
   1.146      <title>Examples</title>
   1.147      <example>
   1.148 +      <title>Creating a new watchlist entry</title>
   1.149 +      <para>The following command displays all links found in the HTML document
   1.150 +        at <uri>http://example.net/foo/</uri>:</para>
   1.151 +      <screen>
   1.152 +$ relmon discover http://example.net/foo/
   1.153 +      </screen>
   1.154 +      <para>The following command tests whether the specified version-matching
   1.155 +        regular expression matches the distribution file linked from
   1.156 +        <uri>http://example.net/foo/</uri>:</para>
   1.157 +      <screen>
   1.158 +$ relmon discover http://example.net/foo/ '/foo-([[:digit:].]+)\.tar\.gz'
   1.159 +      </screen>
   1.160 +    </example>
   1.161 +    <example>
   1.162        <title>Updating version information</title>
   1.163        <para>The following command retrieves the version information for the
   1.164          entries in the watchlist <filename>foo.watchlist</filename> and stores
     2.1 --- a/relmon.tcl.in	Fri Oct 24 22:44:39 2014 +0200
     2.2 +++ b/relmon.tcl.in	Sun Oct 26 21:36:05 2014 +0100
     2.3 @@ -42,6 +42,12 @@
     2.4  namespace eval ::relmon {
     2.5      # version
     2.6      variable VERSION @VERSION@
     2.7 +
     2.8 +    # default log format
     2.9 +    variable LogFormatDefault {%d \[%p\] %m}
    2.10 +
    2.11 +    # debugging log format
    2.12 +    variable LogFormatDebug {%d \[%p\] \[%M\] %m}
    2.13  }
    2.14  
    2.15  
    2.16 @@ -177,97 +183,8 @@
    2.17      return $state
    2.18  }
    2.19  
    2.20 -
    2.21 -namespace eval ::relmon::update {
    2.22 -    # commandline option help text
    2.23 -    variable usage "usage: relmon update \[-dev\] \[-c max_connections\] \[-C\
    2.24 -            ca_dir\] \[-D delay\]\n\
    2.25 -            \                    \[-H max_host_connections\] \[-i\
    2.26 -            item\[,...\]\] \[-l logfile\]\n\
    2.27 -            \                    \[-r retries\] \[-t min_time\] watchlist\
    2.28 -            statefile"
    2.29 -
    2.30 -    # configuration options
    2.31 -    variable Config [dict create \
    2.32 -            "log_file" "" \
    2.33 -            "log_level" "notice" \
    2.34 -            "history_limit" 20 \
    2.35 -            "connection_limit" 16 \
    2.36 -            "host_connection_limit" 4 \
    2.37 -            "transfer_time_limit" 60000 \
    2.38 -            "retry_limit" 3 \
    2.39 -            "host_delay" 0 \
    2.40 -            "timestamp_filter" 0 \
    2.41 -            "error_filter" 0 \
    2.42 -            "item_filter" {} \
    2.43 -            "ca_dir" "" \
    2.44 -            "state_file" "" \
    2.45 -            "watchlist_file" ""]
    2.46 -
    2.47 -    # exit status
    2.48 -    variable ExitStatus
    2.49 -
    2.50 -    # transfer statistics
    2.51 -    variable Statistics [dict create \
    2.52 -            "start_time" 0 \
    2.53 -            "end_time" 0 \
    2.54 -            "requests" 0 \
    2.55 -            "items" 0]
    2.56 -
    2.57 -    # watchlist
    2.58 -    variable Watchlist
    2.59 -
    2.60 -    # ID of a delayed run of ManageTransfers
    2.61 -    variable ManageTransfersId ""
    2.62 -
    2.63 -    # queue of pending transfers
    2.64 -    variable Queue
    2.65 -
    2.66 -    # number of active connections per host
    2.67 -    variable HostConnections
    2.68 -
    2.69 -    # delays before opening a new connection to a host
    2.70 -    variable HostDelays
    2.71 -
    2.72 -    # active transfers
    2.73 -    variable ActiveTransfers
    2.74 -
    2.75 -    # buffer for tracking the state of unfinished items
    2.76 -    variable StateBuffer
    2.77 -
    2.78 -    # buffer needed by htmlparse::parse for constructing the preprocessed HTML
    2.79 -    # document
    2.80 -    variable PreprocessedHtmlBuffer
    2.81 -
    2.82 -    # logger handle
    2.83 -    variable Log
    2.84 -
    2.85 -    # logfile handle
    2.86 -    variable Lf
    2.87 -}
    2.88 -
    2.89 -proc ::relmon::update::OnError {message returnOptions} {
    2.90 -    # internal error, abort
    2.91 -    puts stderr [dict get $returnOptions "-errorinfo"]
    2.92 -
    2.93 -    exit 1
    2.94 -}
    2.95 -
    2.96 -proc ::relmon::update::CleanupBeforeExit {commandString operation} {
    2.97 -    variable Lf
    2.98 -
    2.99 -    # close logfile
   2.100 -    if {($Lf ne "") && ($Lf ni {stdin stderr})} {
   2.101 -        close $Lf
   2.102 -        set Lf ""
   2.103 -    }
   2.104 -
   2.105 -    return
   2.106 -}
   2.107 -
   2.108 -proc ::relmon::update::ParseWatchlist {watchlistFilename} {
   2.109 -    variable Watchlist
   2.110 -
   2.111 +proc ::relmon::common::parseWatchlist {watchlistFilename} {
   2.112 +    set Watchlist [dict create]
   2.113      set lineno 0
   2.114      set f [open $watchlistFilename "r"]
   2.115      try {
   2.116 @@ -338,10 +255,66 @@
   2.117          close $f
   2.118      }
   2.119  
   2.120 -    return
   2.121 +    return $Watchlist
   2.122  }
   2.123  
   2.124 -proc ::relmon::update::ProcessHtmlElement {tag slash param textBehindTheTag} {
   2.125 +
   2.126 +namespace eval ::relmon::crawler {
   2.127 +    # status returned by crawl
   2.128 +    variable Status
   2.129 +
   2.130 +    # configuration
   2.131 +    variable Config
   2.132 +
   2.133 +    # transfer statistics
   2.134 +    variable Statistics [dict create \
   2.135 +            "start_time" 0 \
   2.136 +            "end_time" 0 \
   2.137 +            "requests" 0 \
   2.138 +            "items" 0]
   2.139 +
   2.140 +    # ID of a delayed run of ManageTransfers
   2.141 +    variable ManageTransfersId ""
   2.142 +
   2.143 +    # queue of pending transfers
   2.144 +    variable Queue
   2.145 +
   2.146 +    # number of active connections per host
   2.147 +    variable HostConnections [dict create]
   2.148 +
   2.149 +    # delays before opening a new connection to a host
   2.150 +    variable HostDelays [dict create]
   2.151 +
   2.152 +    # active transfers
   2.153 +    variable ActiveTransfers [dict create]
   2.154 +
   2.155 +    # number of running or queued transfers of watchlist items
   2.156 +    variable RemainingTransfers [dict create]
   2.157 +
   2.158 +    # buffer for tracking the state of unfinished items
   2.159 +    variable StateBuffer [dict create]
   2.160 +
   2.161 +    # buffer needed by htmlparse::parse for constructing the preprocessed HTML
   2.162 +    # document
   2.163 +    variable PreprocessedHtmlBuffer
   2.164 +
   2.165 +    # callback when all transfers for an item are finished
   2.166 +    variable OnItemFinishedCmd
   2.167 +
   2.168 +    # logger handle
   2.169 +    variable Log
   2.170 +
   2.171 +    namespace export crawl
   2.172 +}
   2.173 +
   2.174 +proc ::relmon::crawler::OnError {message returnOptions} {
   2.175 +    # internal error, abort
   2.176 +    puts stderr [dict get $returnOptions "-errorinfo"]
   2.177 +
   2.178 +    exit 1
   2.179 +}
   2.180 +
   2.181 +proc ::relmon::crawler::ProcessHtmlElement {tag slash param textBehindTheTag} {
   2.182      variable PreprocessedHtmlBuffer
   2.183  
   2.184      # copy every "<a>" element into PreprocessedHtmlBuffer
   2.185 @@ -352,7 +325,7 @@
   2.186      return
   2.187  }
   2.188  
   2.189 -proc ::relmon::update::PreprocessHtml {bodyDataName} {
   2.190 +proc ::relmon::crawler::PreprocessHtml {bodyDataName} {
   2.191      upvar 1 $bodyDataName bodyData
   2.192      variable PreprocessedHtmlBuffer
   2.193  
   2.194 @@ -367,7 +340,7 @@
   2.195      append PreprocessedHtmlBuffer "</body></html>"
   2.196  }
   2.197  
   2.198 -proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl
   2.199 +proc ::relmon::crawler::ExtractUrls {bodyDataName contentType baseUrl
   2.200          rePattern} {
   2.201      upvar 1 $bodyDataName bodyData
   2.202      set extractedUrls {}
   2.203 @@ -462,7 +435,7 @@
   2.204      return $resultUrls
   2.205  }
   2.206  
   2.207 -proc ::relmon::update::StateItemAppendError {name logMsg} {
   2.208 +proc ::relmon::crawler::StateItemAppendError {name logMsg} {
   2.209      variable StateBuffer
   2.210  
   2.211      dict update StateBuffer $name stateItem {
   2.212 @@ -472,11 +445,12 @@
   2.213      return
   2.214  }
   2.215  
   2.216 -proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} {
   2.217 +proc ::relmon::crawler::HandleSuccessfulTransfer {item httpBodyName} {
   2.218      upvar 1 $httpBodyName httpBody
   2.219      variable Log
   2.220      variable StateBuffer
   2.221      variable Queue
   2.222 +    variable RemainingTransfers
   2.223      variable Watchlist
   2.224  
   2.225      set name [dict get $item "name"]
   2.226 @@ -516,6 +490,7 @@
   2.227                          [dict create "name" $name "url" $newUrl \
   2.228                          "pattern_index" [expr {$patternIndex + 1}] \
   2.229                          "content_type" "" "num_redirects" 0 "num_retries" 0]
   2.230 +                dict incr RemainingTransfers $name
   2.231              } else {
   2.232                  ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
   2.233              }
   2.234 @@ -524,6 +499,7 @@
   2.235          # otherwise this branch has finished, try to extract the versions and
   2.236          # store them in the buffer
   2.237          dict for {finalUrl matched} $urls {
   2.238 +            dict set StateBuffer $name "urls" $url $urls
   2.239              if {$matched} {
   2.240                  regexp -line -- $pattern $finalUrl -> version
   2.241                  if {$version ne ""} {
   2.242 @@ -544,9 +520,10 @@
   2.243      return
   2.244  }
   2.245  
   2.246 -proc ::relmon::update::HandleRedirect {item httpCode httpHeaders} {
   2.247 +proc ::relmon::crawler::HandleRedirect {item httpCode httpHeaders} {
   2.248      variable Log
   2.249      variable Queue
   2.250 +    variable RemainingTransfers
   2.251  
   2.252      set name [dict get $item "name"]
   2.253      set url [dict get $item "url"]
   2.254 @@ -588,6 +565,7 @@
   2.255                  [dict replace $item "url" $redirectUrl "content_type" "" \
   2.256                  "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
   2.257                  "num_retries" 0]
   2.258 +        dict incr RemainingTransfers $name
   2.259      } else {
   2.260          set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
   2.261                  redirects"
   2.262 @@ -598,7 +576,7 @@
   2.263      return
   2.264  }
   2.265  
   2.266 -proc ::relmon::update::HandleProtocolError {item httpCode} {
   2.267 +proc ::relmon::crawler::HandleProtocolError {item httpCode} {
   2.268      variable Log
   2.269      set name [dict get $item "name"]
   2.270      set url [dict get $item "url"]
   2.271 @@ -608,10 +586,11 @@
   2.272      return
   2.273  }
   2.274  
   2.275 -proc ::relmon::update::HandleTimeoutReset {item} {
   2.276 +proc ::relmon::crawler::HandleTimeoutReset {item} {
   2.277      variable Log
   2.278      variable Config
   2.279      variable Queue
   2.280 +    variable RemainingTransfers
   2.281      set name [dict get $item "name"]
   2.282      set url [dict get $item "url"]
   2.283  
   2.284 @@ -622,6 +601,7 @@
   2.285          dict lappend Queue [::relmon::common::urlGetHost $url] \
   2.286                  [dict replace $item \
   2.287                  "num_retries" [expr {[dict get $item "num_retries"] + 1}]]
   2.288 +        dict incr RemainingTransfers $name
   2.289      } else {
   2.290          set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
   2.291                  retries"
   2.292 @@ -632,7 +612,7 @@
   2.293      return
   2.294  }
   2.295  
   2.296 -proc ::relmon::update::HandleConnectionError {item errorMsg} {
   2.297 +proc ::relmon::crawler::HandleConnectionError {item errorMsg} {
   2.298      variable Log
   2.299      set name [dict get $item "name"]
   2.300      set url [dict get $item "url"]
   2.301 @@ -642,7 +622,7 @@
   2.302      return
   2.303  }
   2.304  
   2.305 -proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} {
   2.306 +proc ::relmon::crawler::TransferCallbackWrapper {callbackCmd args} {
   2.307      # ensure that exceptions get raised, by default http catches all errors and
   2.308      # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
   2.309      if {[catch {eval $callbackCmd $args} -> errorOptions]} {
   2.310 @@ -651,14 +631,15 @@
   2.311      return
   2.312  }
   2.313  
   2.314 -proc ::relmon::update::ManageTransfers {} {
   2.315 +proc ::relmon::crawler::ManageTransfers {} {
   2.316      variable Config
   2.317      variable ManageTransfersId
   2.318      variable Queue
   2.319      variable HostConnections
   2.320      variable HostDelays
   2.321      variable ActiveTransfers
   2.322 -    variable ExitStatus
   2.323 +    variable RemainingTransfers
   2.324 +    variable Status
   2.325      variable Log
   2.326  
   2.327      after cancel $ManageTransfersId
   2.328 @@ -726,12 +707,17 @@
   2.329              set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg"
   2.330              ${Log}::warn $warningMsg
   2.331              StateItemAppendError $name $warningMsg
   2.332 +            dict incr RemainingTransfers $name -1
   2.333          }
   2.334      }
   2.335  
   2.336      # terminate the event loop if there are no remaining transfers
   2.337 -    if {([dict size $ActiveTransfers]) == 0 && ([dict size $Queue] == 0)} {
   2.338 -        set ExitStatus 0
   2.339 +    if {[::tcl::mathop::+ {*}[dict values $RemainingTransfers]] == 0} {
   2.340 +        if {([dict size $ActiveTransfers]) > 0 || ([dict size $Queue] > 0)} {
   2.341 +            ${Log}::error "inconsistent internal state"
   2.342 +            set Status 1
   2.343 +        }
   2.344 +        set Status 0
   2.345          return
   2.346      }
   2.347  
   2.348 @@ -765,7 +751,7 @@
   2.349      return
   2.350  }
   2.351  
   2.352 -proc ::relmon::update::OnTransferProgress {token total current} {
   2.353 +proc ::relmon::crawler::OnTransferProgress {token total current} {
   2.354      upvar #0 $token httpState
   2.355      variable ActiveTransfers
   2.356      variable Log
   2.357 @@ -792,16 +778,17 @@
   2.358      }
   2.359  }
   2.360  
   2.361 -proc ::relmon::update::OnTransferFinished {token} {
   2.362 +proc ::relmon::crawler::OnTransferFinished {token} {
   2.363      upvar #0 $token httpState
   2.364      variable Config
   2.365      variable HostConnections
   2.366 -    variable Queue
   2.367      variable ActiveTransfers
   2.368 +    variable RemainingTransfers
   2.369      variable Statistics
   2.370      variable StateBuffer
   2.371      variable State
   2.372      variable Log
   2.373 +    variable OnItemFinishedCmd
   2.374  
   2.375      set item [dict get $ActiveTransfers $token]
   2.376      set name [dict get $item "name"]
   2.377 @@ -811,6 +798,7 @@
   2.378      # for this item
   2.379      dict unset ActiveTransfers $token
   2.380      dict incr HostConnections $host -1
   2.381 +    dict incr RemainingTransfers $name -1
   2.382  
   2.383      switch -- $httpState(status) {
   2.384          {ok} {
   2.385 @@ -854,63 +842,9 @@
   2.386      }
   2.387  
   2.388      # check if all transfers of this item are finished
   2.389 -    set itemFinished 1
   2.390 -    dict for {queueHost queueItems} $Queue {
   2.391 -        foreach queueItem $queueItems {
   2.392 -            if {[dict get $queueItem "name"] eq $name} {
   2.393 -                set itemFinished 0
   2.394 -            }
   2.395 -        }
   2.396 -    }
   2.397 -    dict for {activeToken activeItem} $ActiveTransfers {
   2.398 -        if {[dict get $activeItem "name"] eq $name} {
   2.399 -            set itemFinished 0
   2.400 -        }
   2.401 -    }
   2.402 -    if {$itemFinished} {
   2.403 -        set timestamp [clock milliseconds]
   2.404 +    if {[dict get $RemainingTransfers $name] == 0} {
   2.405 +        eval $OnItemFinishedCmd [list $name [dict get $StateBuffer $name]]
   2.406  
   2.407 -        # create httpState item if it does not exist yet
   2.408 -        if {![dict exists $State $name]} {
   2.409 -            dict set State $name [dict create "versions" [dict create] \
   2.410 -                    "history" [list] "timestamp" 0 "errors" [list]]
   2.411 -        }
   2.412 -
   2.413 -        # if there are no versions, log an error message since something must
   2.414 -        # be wrong
   2.415 -        if {[llength [dict get $StateBuffer $name "versions"]] == 0} {
   2.416 -            set warningMsg "\"$name\": no versions found"
   2.417 -            ${Log}::warn $warningMsg
   2.418 -            StateItemAppendError $name $warningMsg
   2.419 -        }
   2.420 -
   2.421 -        # update httpState item
   2.422 -        dict set State $name "errors" [dict get $StateBuffer $name "errors"]
   2.423 -        dict set State $name "timestamp" $timestamp
   2.424 -        if {[llength [dict get $StateBuffer $name "errors"]] == 0} {
   2.425 -            # expire old history entries
   2.426 -            set history [lrange [dict get $State $name "history"] \
   2.427 -                    [expr {[llength [dict get $State $name "history"]] -
   2.428 -                    [dict get $Config "history_limit"] + 1}] end]
   2.429 -
   2.430 -            # add currently latest available version to history if it is either
   2.431 -            # newer than the previous one or if the previous one is no longer
   2.432 -            # available (e.g. if it has been removed or the watchlist pattern
   2.433 -            # has been changed)
   2.434 -            set prevLatestVersion [lindex $history end 0]
   2.435 -            set curLatestVersion [lindex \
   2.436 -                    [lsort -command ::relmon::common::cmpVersion \
   2.437 -                    [dict keys [dict get $StateBuffer $name "versions"]]] end]
   2.438 -            if {([::relmon::common::cmpVersion $curLatestVersion \
   2.439 -                    $prevLatestVersion] > 0) ||
   2.440 -                    ![dict exists $StateBuffer $name "versions" \
   2.441 -                    $prevLatestVersion]} {
   2.442 -                lappend history [list $curLatestVersion $timestamp]
   2.443 -                dict set State $name "history" $history
   2.444 -            }
   2.445 -            dict set State $name "versions" [dict get $StateBuffer $name \
   2.446 -                    "versions"]
   2.447 -        }
   2.448          dict unset StateBuffer $name
   2.449  
   2.450          ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \
   2.451 @@ -925,7 +859,7 @@
   2.452  }
   2.453  
   2.454  # control certificate verification and log errors during TLS handshake
   2.455 -proc ::relmon::update::OnTlsHandshake {type args} {
   2.456 +proc ::relmon::crawler::OnTlsHandshake {type args} {
   2.457      variable Config
   2.458      variable Log
   2.459  
   2.460 @@ -957,21 +891,165 @@
   2.461      }
   2.462  }
   2.463  
   2.464 +proc ::relmon::crawler::crawl {config watchlist log onItemFinishedCmd} {
   2.465 +    variable Config $config
   2.466 +    variable Watchlist $watchlist
   2.467 +    variable Queue
   2.468 +    variable Statistics
   2.469 +    variable Log $log
   2.470 +    variable OnItemFinishedCmd $onItemFinishedCmd
   2.471 +    variable Status
   2.472 +    variable RemainingTransfers
   2.473 +    variable StateBuffer
   2.474 +
   2.475 +    # initialize queue and state buffer from the watchlist
   2.476 +    set Queue [dict create]
   2.477 +    dict for {name watchlistItem} $Watchlist {
   2.478 +        dict lappend Queue [::relmon::common::urlGetHost \
   2.479 +                [dict get $watchlistItem "base_url"]] \
   2.480 +                [dict create \
   2.481 +                "name" $name \
   2.482 +                "url" [dict get $watchlistItem "base_url"] \
   2.483 +                "pattern_index" 0 \
   2.484 +                "content_type" "" \
   2.485 +                "num_redirects" 0 \
   2.486 +                "num_retries" 0]
   2.487 +        dict set RemainingTransfers $name 1
   2.488 +        dict incr Statistics "items"
   2.489 +        dict set StateBuffer $name [dict create "versions" [dict create] \
   2.490 +                "errors" [list] "urls" [dict create]]
   2.491 +    }
   2.492 +
   2.493 +    # configure http and tls
   2.494 +    http::register https 443 [list tls::socket \
   2.495 +            -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
   2.496 +            -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
   2.497 +    http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
   2.498 +            Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
   2.499 +
   2.500 +    # handle errors while in the event loop
   2.501 +    interp bgerror {} [namespace code OnError]
   2.502 +
   2.503 +    dict set Statistics "start_time" [clock milliseconds]
   2.504 +
   2.505 +    # enter the main loop
   2.506 +    after idle [namespace code ManageTransfers]
   2.507 +    vwait [namespace which -variable Status]
   2.508 +
   2.509 +    # display statistics
   2.510 +    dict set Statistics "end_time" [clock milliseconds]
   2.511 +    ${Log}::notice "items checked: [dict get $Statistics "items"]"
   2.512 +    ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
   2.513 +        [dict get $Statistics "start_time"]) / 1000}]s"
   2.514 +
   2.515 +    return $Status
   2.516 +}
   2.517 +
   2.518 +
   2.519 +namespace eval ::relmon::update {
   2.520 +    # commandline option help text
   2.521 +    variable usage "usage: relmon update \[-dev\] \[-c max_connections\]\
   2.522 +            \[-C ca_dir\] \[-D host_delay\]\n\
   2.523 +            \                    \[-H max_connections_per_host\]\
   2.524 +            \[-i item_list\]\n\
   2.525 +            \                    \[-l logfile\] \[-r retries\]\
   2.526 +            \[-t before_seconds\]\n\
   2.527 +            \                    \[-T timeout_seconds\] watchlist statefile"
   2.528 +
   2.529 +    # configuration options
   2.530 +    variable Config [dict create \
   2.531 +            "log_file" "" \
   2.532 +            "log_level" "notice" \
   2.533 +            "history_limit" 20 \
   2.534 +            "connection_limit" 16 \
   2.535 +            "host_connection_limit" 4 \
   2.536 +            "transfer_time_limit" 60000 \
   2.537 +            "retry_limit" 3 \
   2.538 +            "host_delay" 500 \
   2.539 +            "host_delays" [dict create] \
   2.540 +            "timestamp_filter" 0 \
   2.541 +            "error_filter" 0 \
   2.542 +            "item_filter" {} \
   2.543 +            "ca_dir" "" \
   2.544 +            "state_file" "" \
   2.545 +            "watchlist_file" ""]
   2.546 +
   2.547 +    # watchlist
   2.548 +    variable Watchlist
   2.549 +
   2.550 +    # logger handle
   2.551 +    variable Log
   2.552 +
   2.553 +    # logfile handle
   2.554 +    variable Lf
   2.555 +}
   2.556 +
   2.557 +proc ::relmon::update::OnItemFinished {name stateItem} {
   2.558 +    variable State
   2.559 +    variable Log
   2.560 +    variable Config
   2.561 +
   2.562 +    set timestamp [clock milliseconds]
   2.563 +
   2.564 +    # create httpState item if it does not exist yet
   2.565 +    if {![dict exists $State $name]} {
   2.566 +        dict set State $name [dict create "versions" [dict create] \
   2.567 +                "history" [list] "timestamp" 0 "errors" [list]]
   2.568 +    }
   2.569 +
   2.570 +    # if there are no versions, log an error message since something must
   2.571 +    # be wrong
   2.572 +    if {[llength [dict get $stateItem "versions"]] == 0} {
   2.573 +        set warningMsg "\"$name\": no versions found"
   2.574 +        ${Log}::warn $warningMsg
   2.575 +        dict lappend stateItem "errors" $warningMsg
   2.576 +    }
   2.577 +
   2.578 +    # update httpState item
   2.579 +    dict set State $name "errors" [dict get $stateItem "errors"]
   2.580 +    dict set State $name "timestamp" $timestamp
   2.581 +    if {[llength [dict get $stateItem "errors"]] == 0} {
   2.582 +        # expire old history entries
   2.583 +        set history [lrange [dict get $State $name "history"] \
   2.584 +                [expr {[llength [dict get $State $name "history"]] -
   2.585 +                [dict get $Config "history_limit"] + 1}] end]
   2.586 +
   2.587 +        # add currently latest available version to history if it is either
   2.588 +        # newer than the previous one or if the previous one is no longer
   2.589 +        # available (e.g. if it has been removed or the watchlist pattern
   2.590 +        # has been changed)
   2.591 +        set prevLatestVersion [lindex $history end 0]
   2.592 +        set curLatestVersion [lindex \
   2.593 +                [lsort -command ::relmon::common::cmpVersion \
   2.594 +                [dict keys [dict get $stateItem "versions"]]] end]
   2.595 +        if {([::relmon::common::cmpVersion $curLatestVersion \
   2.596 +                $prevLatestVersion] > 0) ||
   2.597 +                ![dict exists $stateItem "versions" $prevLatestVersion]} {
   2.598 +            lappend history [list $curLatestVersion $timestamp]
   2.599 +            dict set State $name "history" $history
   2.600 +        }
   2.601 +        dict set State $name "versions" [dict get $stateItem "versions"]
   2.602 +    }
   2.603 +}
   2.604 +
   2.605 +proc ::relmon::update::CleanupBeforeExit {commandString op} {
   2.606 +    variable Lf
   2.607 +
   2.608 +    # close logfile
   2.609 +    if {($Lf ne "") && ($Lf ni {stdin stderr})} {
   2.610 +        close $Lf
   2.611 +        set Lf ""
   2.612 +    }
   2.613 +
   2.614 +    return
   2.615 +}
   2.616 +
   2.617  proc ::relmon::update::main {args} {
   2.618      variable Config
   2.619      variable usage
   2.620 -    variable Statistics
   2.621 -    variable Watchlist [dict create]
   2.622 -    variable Queue [dict create]
   2.623 -    variable HostConnections [dict create]
   2.624 -    variable HostDelays [dict create]
   2.625 -    variable ActiveTransfers [dict create]
   2.626      variable State
   2.627 -    variable StateBuffer [dict create]
   2.628 -    variable PreprocessedHtmlBuffer
   2.629      variable Log
   2.630      variable Lf ""
   2.631 -    variable ExitStatus
   2.632  
   2.633      # parse commandline
   2.634      while {[set GetoptRet [cmdline::getopt args \
   2.635 @@ -1101,9 +1179,9 @@
   2.636      }
   2.637      set Log [logger::init global]
   2.638      if {[dict get $Config "log_level"] eq "debug"} {
   2.639 -        set logFormat {%d \[%p\] \[%M\] %m}
   2.640 +        set logFormat $relmon::LogFormatDebug
   2.641      } else {
   2.642 -        set logFormat {%d \[%p\] %m}
   2.643 +        set logFormat $relmon::LogFormatDefault
   2.644      }
   2.645      logger::utils::applyAppender -appender fileAppend -appenderArgs \
   2.646              [list -outputChannel $Lf -conversionPattern $logFormat] \
   2.647 @@ -1116,7 +1194,8 @@
   2.648  
   2.649      # parse the watchlist
   2.650      try {
   2.651 -        ParseWatchlist [dict get $Config "watchlist_file"]
   2.652 +        set Watchlist [relmon::common::parseWatchlist \
   2.653 +                [dict get $Config "watchlist_file"]]
   2.654      } trap {POSIX} {errorMsg errorOptions} - \
   2.655      trap {RELMON} {errorMsg errorOptions} {
   2.656          ${Log}::error $errorMsg
   2.657 @@ -1135,62 +1214,23 @@
   2.658          exit 1
   2.659      }
   2.660  
   2.661 -    # initialize queue and state buffer from the watchlist
   2.662 -    dict set Statistics "start_time" [clock milliseconds]
   2.663 +    # apply filters specified on the command line to watchlist items
   2.664 +    set currentTime [clock milliseconds]
   2.665      dict for {name watchlistItem} $Watchlist {
   2.666 -        # apply filters specified on the command line to watchlist items
   2.667 -        if {([llength [dict get $Config "item_filter"]] > 0) &&
   2.668 -                ($name ni [dict get $Config "item_filter"])} {
   2.669 -            continue
   2.670 +        if {(([llength [dict get $Config "item_filter"]] > 0) &&
   2.671 +                ($name ni [dict get $Config "item_filter"])) ||
   2.672 +                ([dict get $Config "error_filter"] &&
   2.673 +                [dict exists $State $name "errors"] &&
   2.674 +                ([llength [dict get $State $name "errors"]] == 0)) ||
   2.675 +                ([dict exists $State $name "timestamp"] &&
   2.676 +                ([dict get $State $name "timestamp"] > $currentTime -
   2.677 +                [dict get $Config "timestamp_filter"]))} {
   2.678 +            dict unset Watchlist $name
   2.679          }
   2.680 -
   2.681 -        if {[dict get $Config "error_filter"] &&
   2.682 -                [dict exists $State $name "errors"] &&
   2.683 -                ([llength [dict get $State $name "errors"]] == 0)} {
   2.684 -            continue
   2.685 -        }
   2.686 -
   2.687 -        if {[dict exists $State $name "timestamp"] &&
   2.688 -                ([dict get $State $name "timestamp"] >
   2.689 -                [dict get $Statistics "start_time"] -
   2.690 -                [dict get $Config "timestamp_filter"])} {
   2.691 -            continue
   2.692 -        }
   2.693 -
   2.694 -        dict lappend Queue [::relmon::common::urlGetHost \
   2.695 -                [dict get $watchlistItem "base_url"]] \
   2.696 -                [dict create \
   2.697 -                "name" $name \
   2.698 -                "url" [dict get $watchlistItem "base_url"] \
   2.699 -                "pattern_index" 0 \
   2.700 -                "content_type" "" \
   2.701 -                "num_redirects" 0 \
   2.702 -                "num_retries" 0]
   2.703 -        dict incr Statistics "items"
   2.704 -        dict set StateBuffer $name [dict create "versions" [dict create] \
   2.705 -                "errors" [list]]
   2.706      }
   2.707  
   2.708 -    # configure http and tls
   2.709 -    http::register https 443 [list tls::socket \
   2.710 -            -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
   2.711 -            -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
   2.712 -    http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
   2.713 -            Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
   2.714 -
   2.715 -    # handle errors while in the event loop
   2.716 -    interp bgerror {} [namespace code OnError]
   2.717 -
   2.718 -    # enter the main loop
   2.719 -    after idle [namespace code ManageTransfers]
   2.720 -    vwait [namespace which -variable ExitStatus]
   2.721 -
   2.722 -    dict set Statistics "end_time" [clock milliseconds]
   2.723 -
   2.724 -    # display statistics
   2.725 -    ${Log}::notice "items checked: [dict get $Statistics "items"]"
   2.726 -    ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
   2.727 -        [dict get $Statistics "start_time"]) / 1000}]s"
   2.728 +    set ExitStatus [relmon::crawler::crawl $Config $Watchlist $Log \
   2.729 +            [namespace code OnItemFinished]]
   2.730  
   2.731      # serialize the new state
   2.732      set JsonStateItems {}
   2.733 @@ -1677,6 +1717,207 @@
   2.734  }
   2.735  
   2.736  
   2.737 +namespace eval ::relmon::discover {
   2.738 +    # commandline option help text
   2.739 +    variable usage "usage: relmon discover \[-d\] \[-c max_connections\]\
   2.740 +            \[-C ca_dir\] \[-D host_delay\]\n\
   2.741 +            \                      \[-H max_connections_per_host\]\
   2.742 +            \[-r retries\]\n\
   2.743 +            \                      \[-t before_seconds\]\
   2.744 +            \[-T timeout_seconds\] base_url\n\
   2.745 +            \                      \[pattern...\]"
   2.746 +
   2.747 +    # configuration options
   2.748 +    variable Config [dict create \
   2.749 +            "log_file" "" \
   2.750 +            "log_level" "info" \
   2.751 +            "connection_limit" 16 \
   2.752 +            "host_connection_limit" 4 \
   2.753 +            "transfer_time_limit" 60000 \
   2.754 +            "retry_limit" 3 \
   2.755 +            "host_delay" 500 \
   2.756 +            "host_delays" [dict create] \
   2.757 +            "ca_dir" ""]
   2.758 +
   2.759 +    # transfer statistics
   2.760 +    variable Statistics [dict create \
   2.761 +            "start_time" [clock milliseconds] \
   2.762 +            "end_time" 0 \
   2.763 +            "requests" 0 \
   2.764 +            "items" 0]
   2.765 +
   2.766 +    # watchlist
   2.767 +    variable Queue [dict create]
   2.768 +
   2.769 +    # logger handle
   2.770 +    variable Log
   2.771 +}
   2.772 +
   2.773 +proc ::relmon::discover::OnItemFinished {name stateItem} {
   2.774 +    variable Log
   2.775 +
   2.776 +    dict for {url urls} [dict get $stateItem "urls"] {
   2.777 +        dict for {linkUrl matched} $urls {
   2.778 +            if {$matched} {
   2.779 +                puts "\"$url\": found matching URL \"$linkUrl\""
   2.780 +            } else {
   2.781 +                puts "\"$url\": found non-matching URL \"$linkUrl\""
   2.782 +            }
   2.783 +        }
   2.784 +    }
   2.785 +}
   2.786 +
   2.787 +proc ::relmon::discover::main {args} {
   2.788 +    variable Config
   2.789 +    variable usage
   2.790 +    variable Log
   2.791 +    variable Queue
   2.792 +
   2.793 +    # parse commandline
   2.794 +    while {[set GetoptRet [cmdline::getopt args \
   2.795 +            {c.arg C.arg d D.arg h H.arg r.arg t.arg T.arg ?} \
   2.796 +            OptArg OptVal]] == 1} {
   2.797 +        switch -glob -- $OptArg {
   2.798 +            {c} {
   2.799 +                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
   2.800 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.801 +                    exit 1
   2.802 +                }
   2.803 +                dict set Config "host_connection_limit" $OptVal
   2.804 +            }
   2.805 +            {C} {
   2.806 +                if {![file isdirectory $OptVal]} {
   2.807 +                    puts stderr "directory \"$OptVal\" is not a directory"
   2.808 +                    exit 1
   2.809 +                } elseif {![file readable $OptVal] ||
   2.810 +                        ![file executable $OptVal]} {
   2.811 +                    puts stderr "directory \"$OptVal\" is not readable"
   2.812 +                    exit 1
   2.813 +                }
   2.814 +                dict set Config "ca_dir" $OptVal
   2.815 +            }
   2.816 +            {d} {
   2.817 +                dict set Config "log_level" "debug"
   2.818 +            }
   2.819 +            {D} {
   2.820 +                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
   2.821 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.822 +                    exit 1
   2.823 +                }
   2.824 +                dict set Config "host_delay" $OptVal
   2.825 +            }
   2.826 +            {[h?]} {
   2.827 +                puts stderr $usage
   2.828 +                exit 0
   2.829 +            }
   2.830 +            {H} {
   2.831 +                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
   2.832 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.833 +                    exit 1
   2.834 +                }
   2.835 +                dict set Config "connection_limit" $OptVal
   2.836 +            }
   2.837 +            {r} {
   2.838 +                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
   2.839 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.840 +                    exit 1
   2.841 +                }
   2.842 +                dict set Config "retry_limit" $OptVal
   2.843 +            }
   2.844 +            {t} {
   2.845 +                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
   2.846 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.847 +                    exit 1
   2.848 +                }
   2.849 +                dict set Config "timestamp_filter" [expr {$OptVal * 1000}]
   2.850 +            }
   2.851 +            {T} {
   2.852 +                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
   2.853 +                    puts stderr "invalid value passed to \"-$OptArg\""
   2.854 +                    exit 1
   2.855 +                }
   2.856 +                dict set Config "transfer_time_limit" [expr {$OptVal * 1000}]
   2.857 +            }
   2.858 +        }
   2.859 +    }
   2.860 +    set argc [llength $args]
   2.861 +    if {$GetoptRet == -1} {
   2.862 +        puts stderr "unknown command line option \"-$OptArg\""
   2.863 +        puts stderr $usage
   2.864 +        exit 1
   2.865 +    }
   2.866 +    if {$argc < 1} {
   2.867 +        puts stderr $usage
   2.868 +        exit 1
   2.869 +    }
   2.870 +
   2.871 +    set patterns [lassign $args baseUrl]
   2.872 +
   2.873 +    # validate URL
   2.874 +    if {![::relmon::common::isUrlValid $baseUrl]} {
   2.875 +        puts stderr "invalid base URL"
   2.876 +        exit 1
   2.877 +    }
   2.878 +
   2.879 +    # process patterns
   2.880 +    set processedPatterns {}
   2.881 +    set reInfo {}
   2.882 +    set patternIndex 0
   2.883 +    foreach pattern $patterns {
   2.884 +        incr patternIndex
   2.885 +
   2.886 +        # make trailing slashes optional
   2.887 +        if {($patternIndex != [llength $patterns]) &&
   2.888 +                ([string index $pattern end] eq "/")} {
   2.889 +            append pattern {?}
   2.890 +        }
   2.891 +
   2.892 +        # ensure patterns are anchored to the end of the line
   2.893 +        if {[string index $pattern end] ne "$"} {
   2.894 +            append pattern {$}
   2.895 +        }
   2.896 +
   2.897 +        # actually validate the regular expression
   2.898 +        try {
   2.899 +            set reInfo [regexp -about -- $pattern ""]
   2.900 +        } on error {errorMsg} {
   2.901 +            puts stderr $errorMsg
   2.902 +            exit 1
   2.903 +        }
   2.904 +        lappend processedPatterns $pattern
   2.905 +    }
   2.906 +    # add a dummy pattern "()" if the last pattern is does not contain a
   2.907 +    # capturing group, this will match every link and capture an empty string
   2.908 +    if {[lindex $reInfo 0] < 1} {
   2.909 +        lappend processedPatterns {()}
   2.910 +    }
   2.911 +
   2.912 +    # construct a watchlist from the command line arguments
   2.913 +    set watchlist [dict create "new project" [dict create "base_url" $baseUrl \
   2.914 +            "patterns" $processedPatterns]]
   2.915 +
   2.916 +    set Log [logger::init global]
   2.917 +    if {[dict get $Config "log_level"] eq "debug"} {
   2.918 +        set logFormat $relmon::LogFormatDebug
   2.919 +    } else {
   2.920 +        set logFormat $relmon::LogFormatDefault
   2.921 +    }
   2.922 +    logger::utils::applyAppender -appender fileAppend -appenderArgs \
   2.923 +            [list -outputChannel stderr -conversionPattern $logFormat] \
   2.924 +            -serviceCmd $Log
   2.925 +
   2.926 +    # set default logging level
   2.927 +    ${Log}::setlevel [dict get $Config "log_level"]
   2.928 +
   2.929 +    ${Log}::notice "relmon.tcl starting up"
   2.930 +
   2.931 +    set ExitStatus [relmon::crawler::crawl $Config $watchlist $Log \
   2.932 +            [namespace code OnItemFinished]]
   2.933 +
   2.934 +    exit $ExitStatus
   2.935 +}
   2.936 +
   2.937 +
   2.938  proc ::relmon::main {args} {
   2.939      variable usage
   2.940      set subArgs [lassign $args subCommand]