-- |
-- How to use:
-- 
-- > import Control.Watchdog
-- > import Data.Time
-- >
-- > errorProneTask :: IO (Either String ())
-- > errorProneTask = do
-- >     getCurrentTime >>= print
-- >     return $ Left "some error"
-- >
-- > main = watchdog $ watch errorProneTask
--
-- Result:
--
-- @
-- 2012-07-09 21:48:19.592252 UTC
-- Watchdog: Error executing task (some error) - waiting 1s before trying again.
-- 2012-07-09 21:48:20.594381 UTC
-- Watchdog: Error executing task (some error) - waiting 2s before trying again.
-- 2012-07-09 21:48:22.597069 UTC
-- Watchdog: Error executing task (some error) - waiting 4s before trying again.
-- ...
-- @
--
-- Alternatively the watchdog can stop after a certain number of attempts:
--
-- > import Control.Watchdog
-- > import Data.Time
-- >
-- > errorProneTask :: IO (Either String ())
-- > errorProneTask = do
-- >     getCurrentTime >>= print
-- >     return $ Left "some error"
-- >
-- > main = do
-- >     result <- watchdog $ do
-- >         setMaximumRetries 2
-- >         watchImpatiently errorProneTask
-- >     print result
--
-- Result:
--
-- @
-- 2012-07-09 21:55:41.046432 UTC
-- Watchdog: Error executing task (some error) - waiting 1s before trying again.
-- 2012-07-09 21:55:42.047246 UTC
-- Watchdog: Error executing task (some error) - waiting 2s before trying again.
-- 2012-07-09 21:55:44.049993 UTC
-- Left \"some error\"
-- @
--
-- The watchdog will execute the task and check the return value, which should
-- be an 'Either' value where 'Left' signals an error and 'Right' signals success.
--
-- The watchdog will backoff exponentially (up to a maximum delay) in case of
-- persisting errors, but will reset after the task has been running for a while
-- without problems (see 'setResetDuration') and start a new cycle of
-- exponential backoff should new errors arise.
--
-- The module is intended to be used in different watchdog settings. For example
-- to keep an eye on a server process (use 'watch' and only return a succesful
-- result when the server is doing a clean shutdown) or to retry an action
-- multiple times, if necessary, before giving up (use 'watchImpatiently').  A
-- monadic approach is used to modify the various settings. Below is a code
-- sample with all possible configuration options and their default values:
--
-- > import Control.Watchdog
-- > import Data.Time
-- >
-- > errorProneTask :: IO (Either String ())
-- > errorProneTask = do
-- >     getCurrentTime >>= print
-- >     return $ Left "some error"
-- >
-- > main = watchdog $ do
-- >         setInitialDelay $ 1 * 10^6      -- 1 second
-- >         setMaximumDelay $ 300 * 10^6    -- 300 seconds
-- >         setMaximumRetries 10            -- has no effect when using 'watch'
-- >         setResetDuration $ 30 * 10^6    -- 30 seconds
-- >         setLoggingAction defaultLogger
-- >         watch errorProneTask
--

{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module Control.Watchdog
    ( watchdog
    , watch
    , watchImpatiently
    , setInitialDelay
    , setMaximumDelay
    , setMaximumRetries
    , setResetDuration
    , setLoggingAction
    , defaultLogger
    , silentLogger
    , formatWatchdogError
    , WatchdogLogger
    , WatchdogAction
    ) where

import Control.Concurrent
import Control.Monad.State.Strict
import Data.Time

data WatchdogState = WatchdogState { wcInitialDelay :: Int
                                   , wcMaximumDelay :: Int
                                   , wcResetDuration :: Int
                                   , wcMaximumRetries :: Integer
                                   , wcLoggingAction :: WatchdogLogger
                                   }

data WatchdogTaskStatus a = FailedImmediately String
                          | FailedAfterAWhile String
                          | CompletedSuccessfully a

newtype WatchdogAction a = WA { runWA :: StateT WatchdogState IO a }
                           deriving (Monad, MonadIO, MonadState WatchdogState)

-- | Type synonym for a watchdog logger.
type WatchdogLogger = String     -- ^ Error message returned by the task.
                    -> Maybe Int -- ^ Waiting time - if any - before trying again.
                    -> IO ()

defaultConf :: WatchdogState
defaultConf = WatchdogState { wcInitialDelay = 1 * 10 ^ (6::Integer)
                            , wcMaximumDelay = 300 * 10 ^ (6::Integer)
                            , wcMaximumRetries = 10
                            , wcResetDuration = 30 * 10 ^ (6::Integer)
                            , wcLoggingAction = defaultLogger
                            }

-- | The Watchdog monad. Used to configure and eventually run a watchdog.
watchdog :: WatchdogAction a -> IO a
watchdog action = evalStateT (runWA action) defaultConf

-- | Set the initial delay in microseconds. The first time the watchdog pauses
-- will be for this amount of time. The default is 1 second.
setInitialDelay :: Int -> WatchdogAction ()
setInitialDelay delay = do
    conf <- get
    put conf { wcInitialDelay = delay }

-- | Set the maximum delay in microseconds. When a task fails to execute
-- properly multiple times in quick succession, the delay is doubled each time
-- until it stays constant at the maximum delay. The default is 300 seconds.
setMaximumDelay :: Int -> WatchdogAction ()
setMaximumDelay delay = do
    conf <- get
    put conf { wcMaximumDelay = delay }

-- | If a task has been running for some time, the watchdog will consider
-- the next failure to be something unrelated and reset the waiting time
-- back to the initial delay. This function sets the amount of time in
-- microseconds that needs to pass before the watchdog will consider a task to
-- be successfully running. The default is 30 seconds.
setResetDuration :: Int -> WatchdogAction ()
setResetDuration duration = do
    conf <- get
    put conf { wcResetDuration = duration }

-- | Set the number of retries after which the watchdog will give up and
-- return with a permanent error. This setting is only used in combination with
-- 'watchImpatiently'. The default is 10.
setMaximumRetries :: Integer -> WatchdogAction ()
setMaximumRetries retries = do
    conf <- get
    put conf { wcMaximumRetries = retries }

-- | Set the logging action that will be called by the watchdog. The supplied
-- function of type 'WatchdogLogger' will be provided with the error message of
-- the task and either 'Nothing' if the watchdog will retry immediately or 'Just
-- delay' if the watchdog will now pause for the specified amount of time before
-- trying again.  The default is 'defaultLogger'.
setLoggingAction :: WatchdogLogger -> WatchdogAction ()
setLoggingAction f = do
    conf <- get
    put conf { wcLoggingAction = f }

-- | Watch a task, restarting it potentially forever or until it returns with a
-- result. The task should return an 'Either', where 'Left' in combination with
-- an error message signals an error and 'Right' with an arbitrary result
-- signals success.
watch :: IO (Either String a) -> WatchdogAction a
watch task = do
    conf <- get
    liftIO $ go conf (wcInitialDelay conf)
  where
    go conf errorDelay = do
        status <- timeTask (wcResetDuration conf) task
        case status of
            CompletedSuccessfully result -> return result
            FailedAfterAWhile err -> do
                let errorDelay' = wcInitialDelay conf
                    loggingAction = wcLoggingAction conf
                loggingAction err Nothing -- log that we will retry immediately
                go conf errorDelay'
            FailedImmediately err -> do
                let errorDelay' =
                        min (errorDelay * 2) (wcMaximumDelay conf)
                    loggingAction = wcLoggingAction conf
                loggingAction err (Just errorDelay) -- log that we will
                                                    -- retry after a delay
                threadDelay errorDelay
                go conf errorDelay'

-- | Watch a task, but only restart it a limited number of times (see
-- 'setMaximumRetries'). If the failure persists, it will be returned as a 'Left',
-- otherwise it will be 'Right' with the result of the task.
watchImpatiently :: IO (Either String b) -> WatchdogAction (Either String b)
watchImpatiently task = do
    conf <- get
    liftIO $ go conf 0 "" (wcInitialDelay conf)
  where
    go conf retries lastError errorDelay = do
        status <- timeTask (wcResetDuration conf) task
        case status of
            CompletedSuccessfully result -> return $ Right result
            FailedAfterAWhile err ->
                if retries >= wcMaximumRetries conf
                    then return $ Left lastError
                    else do
                        let errorDelay' = wcInitialDelay conf
                            loggingAction = wcLoggingAction conf
                        loggingAction err Nothing
                        go conf (retries + 1) err errorDelay'
            FailedImmediately err ->
                if retries >= wcMaximumRetries conf
                    then return $ Left lastError
                    else do
                        let errorDelay' =
                                min (errorDelay * 2) (wcMaximumDelay conf)
                            loggingAction = wcLoggingAction conf
                        loggingAction err (Just errorDelay)
                        threadDelay errorDelay
                        go conf (retries + 1) err errorDelay'

-- | The default logging action. It will call 'formatWatchdogError' and display
-- the result on STDOUT.
defaultLogger :: WatchdogLogger
defaultLogger taskErr delay = putStrLn $ formatWatchdogError taskErr delay

-- | Disable logging by passing this function to 'setLoggingAction'.
silentLogger :: WatchdogLogger
silentLogger _ _ = return ()

-- | Format the watchdog status report. Will produce output like this:
--
-- @
-- Watchdog: Error executing task (some error) - trying again immediately.
-- Watchdog: Error executing task (some error) - waiting 1s before trying again.
-- @
formatWatchdogError :: String -- ^ Error message returned by the task.
                    -> Maybe Int -- ^ Waiting time - if any - before trying again.
                    -> String
formatWatchdogError taskErr Nothing =
    "Watchdog: Error executing task (" ++ taskErr ++ ") - trying again immediately."
formatWatchdogError taskErr (Just delay) =
    let asNominalDiffTime :: NominalDiffTime    -- just to display it properly
        asNominalDiffTime = fromIntegral delay / 10 ^ (6 :: Integer)
    in "Watchdog: Error executing task (" ++ taskErr ++ ") - waiting"
        ++ " " ++ show asNominalDiffTime ++ " before trying again."

-- | Helper class which can be wrapped around a task.
-- The task should return an 'Either', where Left in combination
-- with an error message signals an error and Right with an arbitrary
-- result signals success. In case of failure, the wrapper will check whether
-- the function ran less than 30 seconds and then return 'FailedImmediately'
-- or 'FailedAfterAWhile' accordingly.
timeTask :: Int -> IO (Either String a) -> IO (WatchdogTaskStatus a)
timeTask resetDuration task = do
    start <- getCurrentTime
    status <- task
    stop <- getCurrentTime
    case status of
        Right result -> return $ CompletedSuccessfully result
        Left err -> 
            let cutOff = fromIntegral resetDuration / 10 ^ (6 :: Integer)
            in if diffUTCTime stop start < cutOff
                        then return $ FailedImmediately err
                        else return $ FailedAfterAWhile err