changeset 5:86a0c5d11f05 version-1

Add discover subcommand The discover subcommand assists with the creation of watchlist entries and allows to reproduce each step of an update operation for a watchlist entry.
author Guido Berhoerster <guido+relmon@berhoerster.name>
date Sun, 26 Oct 2014 21:36:05 +0100
parents f28486666a4f
children 45fe94dbc236
files relmon.1.xml relmon.tcl.in
diffstat 2 files changed, 611 insertions(+), 229 deletions(-) [+]
line wrap: on
line diff
--- a/relmon.1.xml	Fri Oct 24 22:44:39 2014 +0200
+++ b/relmon.1.xml	Sun Oct 26 21:36:05 2014 +0100
@@ -150,6 +150,42 @@
       </arg>
     </cmdsynopsis>
     <cmdsynopsis>
+      <command>relmon discover</command>
+      <arg choice="opt">
+        <option>-d</option>
+      </arg>
+      <arg choice="opt">
+        <option>-c</option>
+        <replaceable>max_connections</replaceable>
+      </arg>
+      <arg choice="opt">
+        <option>-C</option>
+        <replaceable>ca_dir</replaceable>
+      </arg>
+      <arg choice="opt">
+        <option>-D</option>
+        <replaceable>delay</replaceable>
+      </arg>
+      <arg choice="opt">
+        <option>-H</option>
+        <replaceable>max_host_connections</replaceable>
+      </arg>
+      <arg choice="opt">
+        <option>-r</option>
+        <replaceable>retries</replaceable>
+      </arg>
+      <arg choice="opt">
+        <option>-t</option>
+        <replaceable>min_time</replaceable>
+      </arg>
+      <arg choice="plain">
+        <replaceable>base_url</replaceable>
+      </arg>
+      <arg choice="opt">
+        <replaceable>pattern</replaceable>
+      </arg>
+    </cmdsynopsis>
+    <cmdsynopsis>
       <command>relmon help</command>
       <arg choice="opt">
         <replaceable>subcommand</replaceable>
@@ -396,6 +432,97 @@
       </varlistentry>
       <varlistentry>
         <term>
+          <command>discover</command>
+        </term>
+        <listitem>
+            <para>The <command>discover</command> subcommand assists with the
+              creation of watchlist entries. The arguments to the
+              <command>discover</command> subcommand correspond to the fields
+              of a watchlist entry without the name field, see
+              <citerefentry><refentrytitle>relmon_format</refentrytitle>
+              <manvolnum>4</manvolnum></citerefentry> for details on the
+              format. Only the <replaceable>base_url</replaceable> is mandatory
+              and <command>relmon</command> will log all matching and
+              non-matching links of each retrieved document or feed so that
+              each step of an update operation for a watchlist entry can be
+              reproduced.</para>
+          <para>The following options are supported:</para>
+          <variablelist>
+            <varlistentry>
+              <term>
+                <option>-d</option>
+              </term>
+              <listitem>
+                <para>Trace and log all transfers and parsing. This option is
+                  used for debugging purposes.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-c</option>
+                <replaceable>max_connections</replaceable>
+              </term>
+              <listitem>
+                <para>Limit the number of simultaneous connections to the
+                  specified number.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-C</option>
+                <replaceable>ca_dir</replaceable>
+              </term>
+              <listitem>
+                <para>Verify the validity of TLS certificates using the CA
+                  certificates in the specified directory.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-D</option>
+                <replaceable>delay</replaceable>
+              </term>
+              <listitem>
+                <para>Wait at least the specified number of seconds before
+                  making subsequent connections to the same host.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-H</option>
+                <replaceable>max_host_connections</replaceable>
+              </term>
+              <listitem>
+                <para>Limit the number of simultaneous connections to a single
+                  host to the specified number.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-r</option>
+                <replaceable>retries</replaceable>
+              </term>
+              <listitem>
+                <para>Limit the number of retries in case of connection
+                  failures.</para>
+              </listitem>
+            </varlistentry>
+            <varlistentry>
+              <term>
+                <option>-t</option>
+                <replaceable>min_time</replaceable>
+              </term>
+              <listitem>
+                <para>Only update version information for projects from the
+                  watchlist which have not been updated for the specified
+                  number of seconds.</para>
+              </listitem>
+            </varlistentry>
+          </variablelist>
+        </listitem>
+      </varlistentry>
+      <varlistentry>
+        <term>
           <command>help</command>
         </term>
         <listitem>
@@ -409,6 +536,20 @@
   <refsect1>
     <title>Examples</title>
     <example>
+      <title>Creating a new watchlist entry</title>
+      <para>The following command displays all links found in the HTML document
+        at <uri>http://example.net/foo/</uri>:</para>
+      <screen>
+$ relmon discover http://example.net/foo/
+      </screen>
+      <para>The following command tests whether the specified version-matching
+        regular expression matches the distribution file linked from
+        <uri>http://example.net/foo/</uri>:</para>
+      <screen>
+$ relmon discover http://example.net/foo/ '/foo-([[:digit:].]+)\.tar\.gz'
+      </screen>
+    </example>
+    <example>
       <title>Updating version information</title>
       <para>The following command retrieves the version information for the
         entries in the watchlist <filename>foo.watchlist</filename> and stores
--- a/relmon.tcl.in	Fri Oct 24 22:44:39 2014 +0200
+++ b/relmon.tcl.in	Sun Oct 26 21:36:05 2014 +0100
@@ -42,6 +42,12 @@
 namespace eval ::relmon {
     # version
     variable VERSION @VERSION@
+
+    # default log format
+    variable LogFormatDefault {%d \[%p\] %m}
+
+    # debugging log format
+    variable LogFormatDebug {%d \[%p\] \[%M\] %m}
 }
 
 
@@ -177,97 +183,8 @@
     return $state
 }
 
-
-namespace eval ::relmon::update {
-    # commandline option help text
-    variable usage "usage: relmon update \[-dev\] \[-c max_connections\] \[-C\
-            ca_dir\] \[-D delay\]\n\
-            \                    \[-H max_host_connections\] \[-i\
-            item\[,...\]\] \[-l logfile\]\n\
-            \                    \[-r retries\] \[-t min_time\] watchlist\
-            statefile"
-
-    # configuration options
-    variable Config [dict create \
-            "log_file" "" \
-            "log_level" "notice" \
-            "history_limit" 20 \
-            "connection_limit" 16 \
-            "host_connection_limit" 4 \
-            "transfer_time_limit" 60000 \
-            "retry_limit" 3 \
-            "host_delay" 0 \
-            "timestamp_filter" 0 \
-            "error_filter" 0 \
-            "item_filter" {} \
-            "ca_dir" "" \
-            "state_file" "" \
-            "watchlist_file" ""]
-
-    # exit status
-    variable ExitStatus
-
-    # transfer statistics
-    variable Statistics [dict create \
-            "start_time" 0 \
-            "end_time" 0 \
-            "requests" 0 \
-            "items" 0]
-
-    # watchlist
-    variable Watchlist
-
-    # ID of a delayed run of ManageTransfers
-    variable ManageTransfersId ""
-
-    # queue of pending transfers
-    variable Queue
-
-    # number of active connections per host
-    variable HostConnections
-
-    # delays before opening a new connection to a host
-    variable HostDelays
-
-    # active transfers
-    variable ActiveTransfers
-
-    # buffer for tracking the state of unfinished items
-    variable StateBuffer
-
-    # buffer needed by htmlparse::parse for constructing the preprocessed HTML
-    # document
-    variable PreprocessedHtmlBuffer
-
-    # logger handle
-    variable Log
-
-    # logfile handle
-    variable Lf
-}
-
-proc ::relmon::update::OnError {message returnOptions} {
-    # internal error, abort
-    puts stderr [dict get $returnOptions "-errorinfo"]
-
-    exit 1
-}
-
-proc ::relmon::update::CleanupBeforeExit {commandString operation} {
-    variable Lf
-
-    # close logfile
-    if {($Lf ne "") && ($Lf ni {stdin stderr})} {
-        close $Lf
-        set Lf ""
-    }
-
-    return
-}
-
-proc ::relmon::update::ParseWatchlist {watchlistFilename} {
-    variable Watchlist
-
+proc ::relmon::common::parseWatchlist {watchlistFilename} {
+    set Watchlist [dict create]
     set lineno 0
     set f [open $watchlistFilename "r"]
     try {
@@ -338,10 +255,66 @@
         close $f
     }
 
-    return
+    return $Watchlist
 }
 
-proc ::relmon::update::ProcessHtmlElement {tag slash param textBehindTheTag} {
+
+namespace eval ::relmon::crawler {
+    # status returned by crawl
+    variable Status
+
+    # configuration
+    variable Config
+
+    # transfer statistics
+    variable Statistics [dict create \
+            "start_time" 0 \
+            "end_time" 0 \
+            "requests" 0 \
+            "items" 0]
+
+    # ID of a delayed run of ManageTransfers
+    variable ManageTransfersId ""
+
+    # queue of pending transfers
+    variable Queue
+
+    # number of active connections per host
+    variable HostConnections [dict create]
+
+    # delays before opening a new connection to a host
+    variable HostDelays [dict create]
+
+    # active transfers
+    variable ActiveTransfers [dict create]
+
+    # number of running or queued transfers of watchlist items
+    variable RemainingTransfers [dict create]
+
+    # buffer for tracking the state of unfinished items
+    variable StateBuffer [dict create]
+
+    # buffer needed by htmlparse::parse for constructing the preprocessed HTML
+    # document
+    variable PreprocessedHtmlBuffer
+
+    # callback when all transfers for an item are finished
+    variable OnItemFinishedCmd
+
+    # logger handle
+    variable Log
+
+    namespace export crawl
+}
+
+proc ::relmon::crawler::OnError {message returnOptions} {
+    # internal error, abort
+    puts stderr [dict get $returnOptions "-errorinfo"]
+
+    exit 1
+}
+
+proc ::relmon::crawler::ProcessHtmlElement {tag slash param textBehindTheTag} {
     variable PreprocessedHtmlBuffer
 
     # copy every "<a>" element into PreprocessedHtmlBuffer
@@ -352,7 +325,7 @@
     return
 }
 
-proc ::relmon::update::PreprocessHtml {bodyDataName} {
+proc ::relmon::crawler::PreprocessHtml {bodyDataName} {
     upvar 1 $bodyDataName bodyData
     variable PreprocessedHtmlBuffer
 
@@ -367,7 +340,7 @@
     append PreprocessedHtmlBuffer "</body></html>"
 }
 
-proc ::relmon::update::ExtractUrls {bodyDataName contentType baseUrl
+proc ::relmon::crawler::ExtractUrls {bodyDataName contentType baseUrl
         rePattern} {
     upvar 1 $bodyDataName bodyData
     set extractedUrls {}
@@ -462,7 +435,7 @@
     return $resultUrls
 }
 
-proc ::relmon::update::StateItemAppendError {name logMsg} {
+proc ::relmon::crawler::StateItemAppendError {name logMsg} {
     variable StateBuffer
 
     dict update StateBuffer $name stateItem {
@@ -472,11 +445,12 @@
     return
 }
 
-proc ::relmon::update::HandleSuccessfulTransfer {item httpBodyName} {
+proc ::relmon::crawler::HandleSuccessfulTransfer {item httpBodyName} {
     upvar 1 $httpBodyName httpBody
     variable Log
     variable StateBuffer
     variable Queue
+    variable RemainingTransfers
     variable Watchlist
 
     set name [dict get $item "name"]
@@ -516,6 +490,7 @@
                         [dict create "name" $name "url" $newUrl \
                         "pattern_index" [expr {$patternIndex + 1}] \
                         "content_type" "" "num_redirects" 0 "num_retries" 0]
+                dict incr RemainingTransfers $name
             } else {
                 ${Log}::debug "\"$name\": \"$url\": ignoring \"$newUrl\""
             }
@@ -524,6 +499,7 @@
         # otherwise this branch has finished, try to extract the versions and
         # store them in the buffer
         dict for {finalUrl matched} $urls {
+            dict set StateBuffer $name "urls" $url $urls
             if {$matched} {
                 regexp -line -- $pattern $finalUrl -> version
                 if {$version ne ""} {
@@ -544,9 +520,10 @@
     return
 }
 
-proc ::relmon::update::HandleRedirect {item httpCode httpHeaders} {
+proc ::relmon::crawler::HandleRedirect {item httpCode httpHeaders} {
     variable Log
     variable Queue
+    variable RemainingTransfers
 
     set name [dict get $item "name"]
     set url [dict get $item "url"]
@@ -588,6 +565,7 @@
                 [dict replace $item "url" $redirectUrl "content_type" "" \
                 "num_redirects" [expr {[dict get $item "num_redirects"] + 1}] \
                 "num_retries" 0]
+        dict incr RemainingTransfers $name
     } else {
         set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
                 redirects"
@@ -598,7 +576,7 @@
     return
 }
 
-proc ::relmon::update::HandleProtocolError {item httpCode} {
+proc ::relmon::crawler::HandleProtocolError {item httpCode} {
     variable Log
     set name [dict get $item "name"]
     set url [dict get $item "url"]
@@ -608,10 +586,11 @@
     return
 }
 
-proc ::relmon::update::HandleTimeoutReset {item} {
+proc ::relmon::crawler::HandleTimeoutReset {item} {
     variable Log
     variable Config
     variable Queue
+    variable RemainingTransfers
     set name [dict get $item "name"]
     set url [dict get $item "url"]
 
@@ -622,6 +601,7 @@
         dict lappend Queue [::relmon::common::urlGetHost $url] \
                 [dict replace $item \
                 "num_retries" [expr {[dict get $item "num_retries"] + 1}]]
+        dict incr RemainingTransfers $name
     } else {
         set warningMsg "\"$name\": \"$url\": exceeded maximum number of\
                 retries"
@@ -632,7 +612,7 @@
     return
 }
 
-proc ::relmon::update::HandleConnectionError {item errorMsg} {
+proc ::relmon::crawler::HandleConnectionError {item errorMsg} {
     variable Log
     set name [dict get $item "name"]
     set url [dict get $item "url"]
@@ -642,7 +622,7 @@
     return
 }
 
-proc ::relmon::update::TransferCallbackWrapper {callbackCmd args} {
+proc ::relmon::crawler::TransferCallbackWrapper {callbackCmd args} {
     # ensure that exceptions get raised, by default http catches all errors and
     # silently ignores them, see https://core.tcl.tk/tcl/tktview?name=1414262
     if {[catch {eval $callbackCmd $args} -> errorOptions]} {
@@ -651,14 +631,15 @@
     return
 }
 
-proc ::relmon::update::ManageTransfers {} {
+proc ::relmon::crawler::ManageTransfers {} {
     variable Config
     variable ManageTransfersId
     variable Queue
     variable HostConnections
     variable HostDelays
     variable ActiveTransfers
-    variable ExitStatus
+    variable RemainingTransfers
+    variable Status
     variable Log
 
     after cancel $ManageTransfersId
@@ -726,12 +707,17 @@
             set warningMsg "\"$name\": \"$url\": transfer failed: $errorMsg"
             ${Log}::warn $warningMsg
             StateItemAppendError $name $warningMsg
+            dict incr RemainingTransfers $name -1
         }
     }
 
     # terminate the event loop if there are no remaining transfers
-    if {([dict size $ActiveTransfers]) == 0 && ([dict size $Queue] == 0)} {
-        set ExitStatus 0
+    if {[::tcl::mathop::+ {*}[dict values $RemainingTransfers]] == 0} {
+        if {([dict size $ActiveTransfers]) > 0 || ([dict size $Queue] > 0)} {
+            ${Log}::error "inconsistent internal state"
+            set Status 1
+        }
+        set Status 0
         return
     }
 
@@ -765,7 +751,7 @@
     return
 }
 
-proc ::relmon::update::OnTransferProgress {token total current} {
+proc ::relmon::crawler::OnTransferProgress {token total current} {
     upvar #0 $token httpState
     variable ActiveTransfers
     variable Log
@@ -792,16 +778,17 @@
     }
 }
 
-proc ::relmon::update::OnTransferFinished {token} {
+proc ::relmon::crawler::OnTransferFinished {token} {
     upvar #0 $token httpState
     variable Config
     variable HostConnections
-    variable Queue
     variable ActiveTransfers
+    variable RemainingTransfers
     variable Statistics
     variable StateBuffer
     variable State
     variable Log
+    variable OnItemFinishedCmd
 
     set item [dict get $ActiveTransfers $token]
     set name [dict get $item "name"]
@@ -811,6 +798,7 @@
     # for this item
     dict unset ActiveTransfers $token
     dict incr HostConnections $host -1
+    dict incr RemainingTransfers $name -1
 
     switch -- $httpState(status) {
         {ok} {
@@ -854,63 +842,9 @@
     }
 
     # check if all transfers of this item are finished
-    set itemFinished 1
-    dict for {queueHost queueItems} $Queue {
-        foreach queueItem $queueItems {
-            if {[dict get $queueItem "name"] eq $name} {
-                set itemFinished 0
-            }
-        }
-    }
-    dict for {activeToken activeItem} $ActiveTransfers {
-        if {[dict get $activeItem "name"] eq $name} {
-            set itemFinished 0
-        }
-    }
-    if {$itemFinished} {
-        set timestamp [clock milliseconds]
-
-        # create httpState item if it does not exist yet
-        if {![dict exists $State $name]} {
-            dict set State $name [dict create "versions" [dict create] \
-                    "history" [list] "timestamp" 0 "errors" [list]]
-        }
+    if {[dict get $RemainingTransfers $name] == 0} {
+        eval $OnItemFinishedCmd [list $name [dict get $StateBuffer $name]]
 
-        # if there are no versions, log an error message since something must
-        # be wrong
-        if {[llength [dict get $StateBuffer $name "versions"]] == 0} {
-            set warningMsg "\"$name\": no versions found"
-            ${Log}::warn $warningMsg
-            StateItemAppendError $name $warningMsg
-        }
-
-        # update httpState item
-        dict set State $name "errors" [dict get $StateBuffer $name "errors"]
-        dict set State $name "timestamp" $timestamp
-        if {[llength [dict get $StateBuffer $name "errors"]] == 0} {
-            # expire old history entries
-            set history [lrange [dict get $State $name "history"] \
-                    [expr {[llength [dict get $State $name "history"]] -
-                    [dict get $Config "history_limit"] + 1}] end]
-
-            # add currently latest available version to history if it is either
-            # newer than the previous one or if the previous one is no longer
-            # available (e.g. if it has been removed or the watchlist pattern
-            # has been changed)
-            set prevLatestVersion [lindex $history end 0]
-            set curLatestVersion [lindex \
-                    [lsort -command ::relmon::common::cmpVersion \
-                    [dict keys [dict get $StateBuffer $name "versions"]]] end]
-            if {([::relmon::common::cmpVersion $curLatestVersion \
-                    $prevLatestVersion] > 0) ||
-                    ![dict exists $StateBuffer $name "versions" \
-                    $prevLatestVersion]} {
-                lappend history [list $curLatestVersion $timestamp]
-                dict set State $name "history" $history
-            }
-            dict set State $name "versions" [dict get $StateBuffer $name \
-                    "versions"]
-        }
         dict unset StateBuffer $name
 
         ${Log}::notice "progress: [dict size $StateBuffer]/[dict get \
@@ -925,7 +859,7 @@
 }
 
 # control certificate verification and log errors during TLS handshake
-proc ::relmon::update::OnTlsHandshake {type args} {
+proc ::relmon::crawler::OnTlsHandshake {type args} {
     variable Config
     variable Log
 
@@ -957,21 +891,165 @@
     }
 }
 
+proc ::relmon::crawler::crawl {config watchlist log onItemFinishedCmd} {
+    variable Config $config
+    variable Watchlist $watchlist
+    variable Queue
+    variable Statistics
+    variable Log $log
+    variable OnItemFinishedCmd $onItemFinishedCmd
+    variable Status
+    variable RemainingTransfers
+    variable StateBuffer
+
+    # initialize queue and state buffer from the watchlist
+    set Queue [dict create]
+    dict for {name watchlistItem} $Watchlist {
+        dict lappend Queue [::relmon::common::urlGetHost \
+                [dict get $watchlistItem "base_url"]] \
+                [dict create \
+                "name" $name \
+                "url" [dict get $watchlistItem "base_url"] \
+                "pattern_index" 0 \
+                "content_type" "" \
+                "num_redirects" 0 \
+                "num_retries" 0]
+        dict set RemainingTransfers $name 1
+        dict incr Statistics "items"
+        dict set StateBuffer $name [dict create "versions" [dict create] \
+                "errors" [list] "urls" [dict create]]
+    }
+
+    # configure http and tls
+    http::register https 443 [list tls::socket \
+            -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
+            -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
+    http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
+            Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
+
+    # handle errors while in the event loop
+    interp bgerror {} [namespace code OnError]
+
+    dict set Statistics "start_time" [clock milliseconds]
+
+    # enter the main loop
+    after idle [namespace code ManageTransfers]
+    vwait [namespace which -variable Status]
+
+    # display statistics
+    dict set Statistics "end_time" [clock milliseconds]
+    ${Log}::notice "items checked: [dict get $Statistics "items"]"
+    ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
+        [dict get $Statistics "start_time"]) / 1000}]s"
+
+    return $Status
+}
+
+
+namespace eval ::relmon::update {
+    # commandline option help text
+    variable usage "usage: relmon update \[-dev\] \[-c max_connections\]\
+            \[-C ca_dir\] \[-D host_delay\]\n\
+            \                    \[-H max_connections_per_host\]\
+            \[-i item_list\]\n\
+            \                    \[-l logfile\] \[-r retries\]\
+            \[-t before_seconds\]\n\
+            \                    \[-T timeout_seconds\] watchlist statefile"
+
+    # configuration options
+    variable Config [dict create \
+            "log_file" "" \
+            "log_level" "notice" \
+            "history_limit" 20 \
+            "connection_limit" 16 \
+            "host_connection_limit" 4 \
+            "transfer_time_limit" 60000 \
+            "retry_limit" 3 \
+            "host_delay" 500 \
+            "host_delays" [dict create] \
+            "timestamp_filter" 0 \
+            "error_filter" 0 \
+            "item_filter" {} \
+            "ca_dir" "" \
+            "state_file" "" \
+            "watchlist_file" ""]
+
+    # watchlist
+    variable Watchlist
+
+    # logger handle
+    variable Log
+
+    # logfile handle
+    variable Lf
+}
+
+proc ::relmon::update::OnItemFinished {name stateItem} {
+    variable State
+    variable Log
+    variable Config
+
+    set timestamp [clock milliseconds]
+
+    # create httpState item if it does not exist yet
+    if {![dict exists $State $name]} {
+        dict set State $name [dict create "versions" [dict create] \
+                "history" [list] "timestamp" 0 "errors" [list]]
+    }
+
+    # if there are no versions, log an error message since something must
+    # be wrong
+    if {[llength [dict get $stateItem "versions"]] == 0} {
+        set warningMsg "\"$name\": no versions found"
+        ${Log}::warn $warningMsg
+        dict lappend stateItem "errors" $warningMsg
+    }
+
+    # update httpState item
+    dict set State $name "errors" [dict get $stateItem "errors"]
+    dict set State $name "timestamp" $timestamp
+    if {[llength [dict get $stateItem "errors"]] == 0} {
+        # expire old history entries
+        set history [lrange [dict get $State $name "history"] \
+                [expr {[llength [dict get $State $name "history"]] -
+                [dict get $Config "history_limit"] + 1}] end]
+
+        # add currently latest available version to history if it is either
+        # newer than the previous one or if the previous one is no longer
+        # available (e.g. if it has been removed or the watchlist pattern
+        # has been changed)
+        set prevLatestVersion [lindex $history end 0]
+        set curLatestVersion [lindex \
+                [lsort -command ::relmon::common::cmpVersion \
+                [dict keys [dict get $stateItem "versions"]]] end]
+        if {([::relmon::common::cmpVersion $curLatestVersion \
+                $prevLatestVersion] > 0) ||
+                ![dict exists $stateItem "versions" $prevLatestVersion]} {
+            lappend history [list $curLatestVersion $timestamp]
+            dict set State $name "history" $history
+        }
+        dict set State $name "versions" [dict get $stateItem "versions"]
+    }
+}
+
+proc ::relmon::update::CleanupBeforeExit {commandString op} {
+    variable Lf
+
+    # close logfile
+    if {($Lf ne "") && ($Lf ni {stdin stderr})} {
+        close $Lf
+        set Lf ""
+    }
+
+    return
+}
+
 proc ::relmon::update::main {args} {
     variable Config
     variable usage
-    variable Statistics
-    variable Watchlist [dict create]
-    variable Queue [dict create]
-    variable HostConnections [dict create]
-    variable HostDelays [dict create]
-    variable ActiveTransfers [dict create]
     variable State
-    variable StateBuffer [dict create]
-    variable PreprocessedHtmlBuffer
     variable Log
     variable Lf ""
-    variable ExitStatus
 
     # parse commandline
     while {[set GetoptRet [cmdline::getopt args \
@@ -1101,9 +1179,9 @@
     }
     set Log [logger::init global]
     if {[dict get $Config "log_level"] eq "debug"} {
-        set logFormat {%d \[%p\] \[%M\] %m}
+        set logFormat $relmon::LogFormatDebug
     } else {
-        set logFormat {%d \[%p\] %m}
+        set logFormat $relmon::LogFormatDefault
     }
     logger::utils::applyAppender -appender fileAppend -appenderArgs \
             [list -outputChannel $Lf -conversionPattern $logFormat] \
@@ -1116,7 +1194,8 @@
 
     # parse the watchlist
     try {
-        ParseWatchlist [dict get $Config "watchlist_file"]
+        set Watchlist [relmon::common::parseWatchlist \
+                [dict get $Config "watchlist_file"]]
     } trap {POSIX} {errorMsg errorOptions} - \
     trap {RELMON} {errorMsg errorOptions} {
         ${Log}::error $errorMsg
@@ -1135,62 +1214,23 @@
         exit 1
     }
 
-    # initialize queue and state buffer from the watchlist
-    dict set Statistics "start_time" [clock milliseconds]
+    # apply filters specified on the command line to watchlist items
+    set currentTime [clock milliseconds]
     dict for {name watchlistItem} $Watchlist {
-        # apply filters specified on the command line to watchlist items
-        if {([llength [dict get $Config "item_filter"]] > 0) &&
-                ($name ni [dict get $Config "item_filter"])} {
-            continue
-        }
-
-        if {[dict get $Config "error_filter"] &&
+        if {(([llength [dict get $Config "item_filter"]] > 0) &&
+                ($name ni [dict get $Config "item_filter"])) ||
+                ([dict get $Config "error_filter"] &&
                 [dict exists $State $name "errors"] &&
-                ([llength [dict get $State $name "errors"]] == 0)} {
-            continue
+                ([llength [dict get $State $name "errors"]] == 0)) ||
+                ([dict exists $State $name "timestamp"] &&
+                ([dict get $State $name "timestamp"] > $currentTime -
+                [dict get $Config "timestamp_filter"]))} {
+            dict unset Watchlist $name
         }
-
-        if {[dict exists $State $name "timestamp"] &&
-                ([dict get $State $name "timestamp"] >
-                [dict get $Statistics "start_time"] -
-                [dict get $Config "timestamp_filter"])} {
-            continue
-        }
-
-        dict lappend Queue [::relmon::common::urlGetHost \
-                [dict get $watchlistItem "base_url"]] \
-                [dict create \
-                "name" $name \
-                "url" [dict get $watchlistItem "base_url"] \
-                "pattern_index" 0 \
-                "content_type" "" \
-                "num_redirects" 0 \
-                "num_retries" 0]
-        dict incr Statistics "items"
-        dict set StateBuffer $name [dict create "versions" [dict create] \
-                "errors" [list]]
     }
 
-    # configure http and tls
-    http::register https 443 [list tls::socket \
-            -cadir [dict get $Config "ca_dir"] -request 1 -require 1 \
-            -command [namespace code OnTlsHandshake] -ssl2 0 -ssl3 1 -tls1 1]
-    http::config -useragent "Mozilla/5.0 (Windows NT 6.1; rv:16.0)\
-            Gecko/20100101 Firefox/16.0 relmon/$relmon::VERSION"
-
-    # handle errors while in the event loop
-    interp bgerror {} [namespace code OnError]
-
-    # enter the main loop
-    after idle [namespace code ManageTransfers]
-    vwait [namespace which -variable ExitStatus]
-
-    dict set Statistics "end_time" [clock milliseconds]
-
-    # display statistics
-    ${Log}::notice "items checked: [dict get $Statistics "items"]"
-    ${Log}::notice "time elapsed: [expr {([dict get $Statistics "end_time"] -
-        [dict get $Statistics "start_time"]) / 1000}]s"
+    set ExitStatus [relmon::crawler::crawl $Config $Watchlist $Log \
+            [namespace code OnItemFinished]]
 
     # serialize the new state
     set JsonStateItems {}
@@ -1677,6 +1717,207 @@
 }
 
 
+namespace eval ::relmon::discover {
+    # commandline option help text
+    variable usage "usage: relmon discover \[-d\] \[-c max_connections\]\
+            \[-C ca_dir\] \[-D host_delay\]\n\
+            \                      \[-H max_connections_per_host\]\
+            \[-r retries\]\n\
+            \                      \[-t before_seconds\]\
+            \[-T timeout_seconds\] base_url\n\
+            \                      \[pattern...\]"
+
+    # configuration options
+    variable Config [dict create \
+            "log_file" "" \
+            "log_level" "info" \
+            "connection_limit" 16 \
+            "host_connection_limit" 4 \
+            "transfer_time_limit" 60000 \
+            "retry_limit" 3 \
+            "host_delay" 500 \
+            "host_delays" [dict create] \
+            "ca_dir" ""]
+
+    # transfer statistics
+    variable Statistics [dict create \
+            "start_time" [clock milliseconds] \
+            "end_time" 0 \
+            "requests" 0 \
+            "items" 0]
+
+    # watchlist
+    variable Queue [dict create]
+
+    # logger handle
+    variable Log
+}
+
+proc ::relmon::discover::OnItemFinished {name stateItem} {
+    variable Log
+
+    dict for {url urls} [dict get $stateItem "urls"] {
+        dict for {linkUrl matched} $urls {
+            if {$matched} {
+                puts "\"$url\": found matching URL \"$linkUrl\""
+            } else {
+                puts "\"$url\": found non-matching URL \"$linkUrl\""
+            }
+        }
+    }
+}
+
+proc ::relmon::discover::main {args} {
+    variable Config
+    variable usage
+    variable Log
+    variable Queue
+
+    # parse commandline
+    while {[set GetoptRet [cmdline::getopt args \
+            {c.arg C.arg d D.arg h H.arg r.arg t.arg T.arg ?} \
+            OptArg OptVal]] == 1} {
+        switch -glob -- $OptArg {
+            {c} {
+                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "host_connection_limit" $OptVal
+            }
+            {C} {
+                if {![file isdirectory $OptVal]} {
+                    puts stderr "directory \"$OptVal\" is not a directory"
+                    exit 1
+                } elseif {![file readable $OptVal] ||
+                        ![file executable $OptVal]} {
+                    puts stderr "directory \"$OptVal\" is not readable"
+                    exit 1
+                }
+                dict set Config "ca_dir" $OptVal
+            }
+            {d} {
+                dict set Config "log_level" "debug"
+            }
+            {D} {
+                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "host_delay" $OptVal
+            }
+            {[h?]} {
+                puts stderr $usage
+                exit 0
+            }
+            {H} {
+                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "connection_limit" $OptVal
+            }
+            {r} {
+                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "retry_limit" $OptVal
+            }
+            {t} {
+                if {![string is digit -strict $OptVal] || ($OptVal < 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "timestamp_filter" [expr {$OptVal * 1000}]
+            }
+            {T} {
+                if {![string is digit -strict $OptVal] || ($OptVal <= 0)} {
+                    puts stderr "invalid value passed to \"-$OptArg\""
+                    exit 1
+                }
+                dict set Config "transfer_time_limit" [expr {$OptVal * 1000}]
+            }
+        }
+    }
+    set argc [llength $args]
+    if {$GetoptRet == -1} {
+        puts stderr "unknown command line option \"-$OptArg\""
+        puts stderr $usage
+        exit 1
+    }
+    if {$argc < 1} {
+        puts stderr $usage
+        exit 1
+    }
+
+    set patterns [lassign $args baseUrl]
+
+    # validate URL
+    if {![::relmon::common::isUrlValid $baseUrl]} {
+        puts stderr "invalid base URL"
+        exit 1
+    }
+
+    # process patterns
+    set processedPatterns {}
+    set reInfo {}
+    set patternIndex 0
+    foreach pattern $patterns {
+        incr patternIndex
+
+        # make trailing slashes optional
+        if {($patternIndex != [llength $patterns]) &&
+                ([string index $pattern end] eq "/")} {
+            append pattern {?}
+        }
+
+        # ensure patterns are anchored to the end of the line
+        if {[string index $pattern end] ne "$"} {
+            append pattern {$}
+        }
+
+        # actually validate the regular expression
+        try {
+            set reInfo [regexp -about -- $pattern ""]
+        } on error {errorMsg} {
+            puts stderr $errorMsg
+            exit 1
+        }
+        lappend processedPatterns $pattern
+    }
+    # add a dummy pattern "()" if the last pattern is does not contain a
+    # capturing group, this will match every link and capture an empty string
+    if {[lindex $reInfo 0] < 1} {
+        lappend processedPatterns {()}
+    }
+
+    # construct a watchlist from the command line arguments
+    set watchlist [dict create "new project" [dict create "base_url" $baseUrl \
+            "patterns" $processedPatterns]]
+
+    set Log [logger::init global]
+    if {[dict get $Config "log_level"] eq "debug"} {
+        set logFormat $relmon::LogFormatDebug
+    } else {
+        set logFormat $relmon::LogFormatDefault
+    }
+    logger::utils::applyAppender -appender fileAppend -appenderArgs \
+            [list -outputChannel stderr -conversionPattern $logFormat] \
+            -serviceCmd $Log
+
+    # set default logging level
+    ${Log}::setlevel [dict get $Config "log_level"]
+
+    ${Log}::notice "relmon.tcl starting up"
+
+    set ExitStatus [relmon::crawler::crawl $Config $watchlist $Log \
+            [namespace code OnItemFinished]]
+
+    exit $ExitStatus
+}
+
+
 proc ::relmon::main {args} {
     variable usage
     set subArgs [lassign $args subCommand]