{-# LANGUAGE GADTs #-}

module Reporter.Honeycomb.Internal where

import qualified Control.Exception.Safe as Exception
import qualified Data.Aeson as Aeson
import qualified Data.ByteString as ByteString
import qualified Data.List
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Text.Encoding as Encoding
import qualified Data.Text.IO
import qualified Data.UUID
import qualified Data.UUID.V4
import qualified Data.Word
import qualified Dict
import qualified Environment
import qualified GHC.Stack as Stack
import qualified List
import qualified Log
import qualified Log.HttpRequest as HttpRequest
import qualified Log.Kafka as Kafka
import qualified Log.RedisCommands as RedisCommands
import qualified Log.SqlQuery as SqlQuery
import qualified Maybe
import qualified Network.HTTP.Client as HTTP
import qualified Network.HTTP.Client.TLS as HTTP.TLS
import qualified Network.HostName
import qualified Platform
import qualified Platform.AesonHelpers as AesonHelpers
import qualified Platform.Timer as Timer
import qualified System.Random as Random
import qualified Text
import qualified Text as NriText
import qualified Prelude

batchApiEndpoint :: Text -> Text
batchApiEndpoint datasetName = "https://api.honeycomb.io/1/batch/" ++ datasetName

-- | Report a tracing span to Honeycomb.
report :: Handler -> Text -> Platform.TracingSpan -> Prelude.IO ()
report handler' _requestId span = do
  sendOrSample <- makeSharedTraceData handler' span
  case sendOrSample of
    SampledOut -> Prelude.pure ()
    SendToHoneycomb sharedTraceData -> sendToHoneycomb handler' sharedTraceData span

sendToHoneycomb :: Handler -> SharedTraceData -> Platform.TracingSpan -> Prelude.IO ()
sendToHoneycomb handler' sharedTraceData span = do
  let events = toBatchEvents sharedTraceData span
  baseRequest <-
    settings handler'
      |> datasetName
      |> batchApiEndpoint
      |> Text.toList
      |> HTTP.parseRequest
  let req =
        baseRequest
          { HTTP.method = "POST",
            HTTP.requestHeaders =
              [ ( "X-Honeycomb-Team",
                  settings handler'
                    |> apiKey
                    |> Log.unSecret
                )
              ],
            HTTP.requestBody = HTTP.RequestBodyLBS (Aeson.encode events)
          }
  _ <- HTTP.httpNoBody req (http handler')
  Prelude.pure ()

getRootSpanRequestPath :: Platform.TracingSpan -> Maybe Text
getRootSpanRequestPath rootSpan =
  Platform.details rootSpan
    |> Maybe.andThen
      ( Platform.renderTracingSpanDetails
          [ Platform.Renderer (\(HttpRequest.Incoming details) -> HttpRequest.endpoint details)
          ]
      )
    |> Maybe.andThen identity

getSpanEndpoint :: Platform.TracingSpan -> Maybe Text
getSpanEndpoint span =
  span
    |> Platform.details
    |> Maybe.andThen
      ( Platform.renderTracingSpanDetails
          [ Platform.Renderer (\(HttpRequest.Incoming details) -> HttpRequest.endpoint details),
            Platform.Renderer Kafka.topic
          ]
      )
    |> Maybe.andThen identity

deriveSampleRate :: Platform.TracingSpan -> Settings -> Float
deriveSampleRate rootSpan settings =
  let isNonAppRequestPath =
        case getRootSpanRequestPath rootSpan of
          Nothing -> False
          -- You might be tempted to use `endpoint` instead of `path`, but
          -- healthcheck endpoints don't populate `HttpRequest.endpoint`.
          -- Fix that first before trying this.
          Just requestPath -> List.any (requestPath ==) ["/health/readiness", "/metrics", "/health/liveness"]
      baseRate = modifyFractionOfSuccessRequestsLogged settings (fractionOfSuccessRequestsLogged settings) rootSpan
      requestDurationMs =
        Timer.difference (Platform.started rootSpan) (Platform.finished rootSpan)
          |> Platform.inMicroseconds
          |> Prelude.fromIntegral
          |> (*) 1e-3
      apdexTMs =
        modifyApdexTimeMs settings (apdexTimeMs settings) rootSpan
          |> Prelude.fromIntegral
   in if isNonAppRequestPath
        then --
        -- We have 2678400 seconds in a month
        -- We health-check once per second per Pod in Haskell
        -- We have 2-3 pods at idle per service
        -- We have some 5 services
        -- We have up to 4 environments (staging, prod, demo, backyard)
        --
        -- Healthchecks would be 107,136,000 / sampleRate traces per month
        --
        -- But we also don't wanna never log them, who knows, they might cause
        -- problems
        --
        -- High sample rates might make honeycomb make ridiculous assumptions
        -- about the actual request rate tho. Adjust if that's the case.
          baseRate / 500
        else sampleRateForDuration baseRate requestDurationMs apdexTMs

-- For every increase of apdexTU in the request duration we double the chance of
-- a request getting logged, up to a maximum of 1.
--
-- An example plot of this function, for:
--  * apdex 30ms
--  * baseRate 1/1000
--  * from 1ms to 300ms

-- https://www.wolframalpha.com/input/?i=plot+1%2Fmax%281%2F1000%2C+min%281%2C+%281%2F1000%29+*+%281.5+%5E+%28x+%2F+30%29%29%29%29+from+x%3D1+to+x%3D300
sampleRateForDuration :: Float -> Float -> Float -> Float
sampleRateForDuration baseRate requestDurationMs apdexTMs =
  baseRate * (1.5 ^ (requestDurationMs / apdexTMs))
    |> clamp baseRate 1

calculateApdex :: Settings -> Platform.TracingSpan -> Float
calculateApdex settings span =
  case Platform.succeeded span of
    Platform.Failed -> 0
    Platform.FailedWith _ -> 0
    Platform.Succeeded ->
      let duration =
            spanDuration span
              |> Platform.inMicroseconds
              |> Prelude.fromIntegral
          apdexTUs = 1000 * apdexTimeMs settings
       in if duration < apdexTUs
            then 1
            else
              if duration < (4 * apdexTUs)
                then 0.5
                else 0

toBatchEvents :: SharedTraceData -> Platform.TracingSpan -> List BatchEvent
toBatchEvents sharedTraceData span =
  let (_, events) =
        batchEventsHelper
          sharedTraceData
          Nothing
          (emptyStatsByName, 0)
          span
   in events

batchEventsHelper ::
  SharedTraceData ->
  Maybe SpanId ->
  (StatsByName, Int) ->
  Platform.TracingSpan ->
  ((StatsByName, Int), [BatchEvent])
batchEventsHelper sharedTraceData parentSpanId (statsByName, spanIndex) span = do
  let isRootSpan = parentSpanId == Nothing
  let thisSpansId = SpanId (requestId sharedTraceData ++ "-" ++ NriText.fromInt spanIndex)
  let ((lastStatsByName, lastSpanIndex), nestedChildren) =
        Data.List.mapAccumL
          (batchEventsHelper sharedTraceData (Just thisSpansId))
          ( -- Don't record the root span. We have only one of those per trace,
            -- so there's no statistics we can do with it.
            if isRootSpan
              then statsByName
              else recordStats span statsByName,
            spanIndex + 1
          )
          (Platform.children span)
  let children = List.concat nestedChildren
  let duration =
        Timer.difference (Platform.started span) (Platform.finished span)
          |> Platform.inMicroseconds
  let timestamp = Timer.toISO8601 (timer sharedTraceData) (Platform.started span)
  let sourceLocation =
        Platform.frame span
          |> Maybe.map
            ( \(_, frame) ->
                Text.fromList (Stack.srcLocFile frame)
                  ++ ":"
                  ++ Text.fromInt (Prelude.fromIntegral (Stack.srcLocStartLine frame))
            )
  let isError = case Platform.succeeded span of
        Platform.Succeeded -> False
        Platform.Failed -> True
        Platform.FailedWith _ -> True
  let addStats span' =
        if isRootSpan
          then perSpanNameStats lastStatsByName span'
          else span'
  let addEndpoint span' =
        case endpoint sharedTraceData of
          Nothing -> span'
          Just endpoint -> addField "endpoint" endpoint span'
  let hcSpan =
        initSpan sharedTraceData
          |> addField "name" (Platform.name span)
          |> addField "trace.span_id" thisSpansId
          |> addField "trace.parent_id" parentSpanId
          |> addField "duration_ms" (Prelude.fromIntegral duration / 1000)
          |> addField "allocated_bytes" (Platform.allocated span)
          |> addField "failed" isError
          |> addField "source_location" sourceLocation
          |> addDetails span
          |> addEndpoint
          |> addStats
  ( (lastStatsByName, lastSpanIndex),
    BatchEvent
      { batchevent_time = timestamp,
        batchevent_data = hcSpan,
        batchevent_samplerate = sampleRate sharedTraceData
      } :
    children
    )

data Stats = Stats
  { count :: Int,
    totalTimeMicroseconds :: Data.Word.Word64
  }

newtype StatsByName = StatsByName (Dict.Dict Text Stats)

emptyStatsByName :: StatsByName
emptyStatsByName = StatsByName Dict.empty

recordStats :: Platform.TracingSpan -> StatsByName -> StatsByName
recordStats span (StatsByName statsByName) =
  let name = Platform.name span
      duration = Platform.inMicroseconds (spanDuration span)
      newStats =
        case Dict.get name statsByName of
          Nothing ->
            Stats
              { count = 1,
                totalTimeMicroseconds = duration
              }
          Just stats ->
            Stats
              { count = 1 + count stats,
                totalTimeMicroseconds = duration + totalTimeMicroseconds stats
              }
   in StatsByName (Dict.insert name newStats statsByName)

spanDuration :: Platform.TracingSpan -> Platform.MonotonicTime
spanDuration span =
  Timer.difference (Platform.started span) (Platform.finished span)

perSpanNameStats :: StatsByName -> Span -> Span
perSpanNameStats (StatsByName statsByName) span =
  let -- chose foldr to preserve order, not super important tho
      statsForCategory (name, stats) acc =
        let calls = count stats
            total = Prelude.fromIntegral (totalTimeMicroseconds stats) / 1000
            average = total / Prelude.fromIntegral calls
            saneName = useAsKey name
         in acc
              |> addField ("stats.total_time_ms." ++ saneName) total
              |> addField ("stats.average_time_ms." ++ saneName) average
              |> addField ("stats.count." ++ saneName) calls
   in List.foldl statsForCategory span (Dict.toList statsByName)

useAsKey :: Text -> Text
useAsKey str =
  str
    |> NriText.toLower
    |> NriText.replace " " "_"

-- Customize how we render span details for different kinds of spans to
-- Honeycomb.
--
-- In the past we used the toHashMap helper to generate the JSON representations
-- we send to honeycomb (see its documentation in the Helpers module to learn
-- more about it). This turned out to be a poor fit because it creates a ton of
-- top-level JSON keys, each of which Honeycomb will create a unique column for.
--
-- If we ever hit 10k unique column names (and we were past the thousands when
-- this code was introduced) Honeycomb will stop accepting traces from us.
--
-- "Unique column names" means different column names that Honeycomb has seen us
-- report on a span.
addDetails :: Platform.TracingSpan -> Span -> Span
addDetails tracingSpan honeycombSpan =
  case Platform.details tracingSpan of
    Just details ->
      details
        |> Platform.renderTracingSpanDetails
          [ Platform.Renderer (renderDetailsLog honeycombSpan),
            Platform.Renderer (renderDetailsRedis honeycombSpan),
            Platform.Renderer
              ( \(_ :: HttpRequest.Incoming) ->
                  renderDetailsGeneric "http" details honeycombSpan
              ),
            Platform.Renderer
              ( \(_ :: SqlQuery.Details) ->
                  renderDetailsGeneric "sql" details honeycombSpan
              ),
            Platform.Renderer
              ( \(_ :: Kafka.Details) ->
                  renderDetailsGeneric "kafka" details honeycombSpan
              )
          ]
        -- `renderTracingSpanDetails` returns Nothing when type of details
        -- doesn't match any in our list of functions above.
        --
        -- We'll default to using the default JSON encoding of the honeycombSpan.
        -- Assuming it encodes into a JSON object with multiple keys (every
        -- known details object we have does this) we'll use that object
        -- directly.
        |> Maybe.withDefault
          (renderDetailsGeneric (Platform.name tracingSpan) details honeycombSpan)
    Nothing -> honeycombSpan

renderDetailsGeneric :: Text -> Platform.SomeTracingSpanDetails -> Span -> Span
renderDetailsGeneric keyPrefix details honeycombSpan =
  case Aeson.toJSON details of
    Aeson.Object object ->
      AesonHelpers.foldObject
        (\key value -> addField (useAsKey (keyPrefix ++ "." ++ key)) value)
        honeycombSpan
        object
    jsonVal -> addField keyPrefix jsonVal honeycombSpan

-- LogContext is an unbounded list of key value pairs with possibly nested
-- stuff in them. Aeson flatens the nesting, so:
--
-- {error: [{quiz: [{"some-quiz-id": "some context"}]}]}
--
-- becomes
--
-- {"error.0.quiz.0.some-quiz-id": "some context"}
--
-- - With "some-quiz-id" in the example above, we have an unbounded number of
--   unique columns.
-- - With long lists, the `.0` parts helps boost our unique column name growth.
--
-- We don't need Honeycomb to collect rich error information.
-- That's what we pay Bugsnag for.
renderDetailsLog :: Span -> Log.LogContexts -> Span
renderDetailsLog span context@(Log.LogContexts contexts) =
  if List.length contexts > 5
    then addField "log.context" context span
    else
      List.foldl
        (\(Log.Context key val) -> addField ("log." ++ key) val)
        span
        contexts

-- Redis creates one column per command for batches
-- Let's trace what matters:
-- - How many of each command
-- - The full blob in a single column
-- - The rest of our Info record
renderDetailsRedis :: Span -> RedisCommands.Details -> Span
renderDetailsRedis span redisInfo =
  let addCommandCounts span' =
        redisInfo
          |> RedisCommands.commands
          |> List.filterMap (NriText.words >> List.head)
          |> NonEmpty.groupWith identity
          |> List.foldr
            ( \xs ->
                addField
                  ("redis." ++ NonEmpty.head xs ++ ".count")
                  (NonEmpty.length xs)
            )
            span'
      fullBlob =
        redisInfo
          |> RedisCommands.commands
          |> Aeson.toJSON
   in span
        |> addField "redis.commands" fullBlob
        |> addField "redis.host" (RedisCommands.host redisInfo)
        |> addField "redis.port" (RedisCommands.port redisInfo)
        |> addCommandCounts

data BatchEvent = BatchEvent
  { batchevent_time :: Text,
    batchevent_data :: Span,
    batchevent_samplerate :: Int
  }
  deriving (Generic, Show)

options :: Aeson.Options
options =
  Aeson.defaultOptions
    { -- Drop the batchevent_ prefix
      Aeson.fieldLabelModifier = List.drop 1 << Prelude.dropWhile ('_' /=)
    }

instance Aeson.ToJSON BatchEvent where
  toJSON = Aeson.genericToJSON options

data SharedTraceData = SharedTraceData
  { -- | We use this to turn GHC.Clock-produced timestamps into regular times.
    timer :: Timer.Timer,
    -- | Each request has a unique id, for correlating spans for the same request.
    requestId :: Text,
    -- | A honeycomb span with the common fields for this request pre-applied.
    initSpan :: Span,
    -- | The 'endpoint' of the request this trace describes. Honeycomb uses
    -- this for a breakdown-by-endpoint on the dataset home.
    endpoint :: Maybe Text,
    -- | The amount of similar traces this one trace represents. For example,
    -- if we send this trace but sampled out 9 similar ones, sample rate will be
    -- 10. This will let honeycomb know it should count this trace as 10.
    sampleRate :: Int
  }

-- | Honeycomb defines a span to be a list of key-value pairs, which we model
-- using a dictionary. Honeycomb expects as values anything that's valid JSON.
--
-- We could use the `Aeson.Value` type to model values, but that mean we'd be
-- encoding spans for honeycomb in two steps: first from our original types to
-- `Aeson.Value`, then to the `ByteString` we send in the network request.
-- `Aeson` has a more efficient encoding strategy that is able to encode types
-- into JSON in one go. To use that we accept as keys any value we know we'll be
-- able to encode into JSON later, once we got the whole payload we want to send
-- to honeycomb together.
newtype Span = Span (Dict.Dict Text JsonEncodable)
  deriving (Aeson.ToJSON, Show)

data JsonEncodable where
  JsonEncodable :: Aeson.ToJSON a => a -> JsonEncodable

instance Aeson.ToJSON JsonEncodable where
  toEncoding (JsonEncodable x) = Aeson.toEncoding x
  toJSON (JsonEncodable x) = Aeson.toJSON x

instance Show JsonEncodable where
  show (JsonEncodable x) = Prelude.show (Aeson.toJSON x)

emptySpan :: Span
emptySpan = Span Dict.empty

addField :: Aeson.ToJSON a => Text -> a -> Span -> Span
addField key val (Span span) = Span (Dict.insert key (JsonEncodable val) span)

newtype SpanId = SpanId Text
  deriving (Aeson.ToJSON, Eq, Show)

-- | Contextual information this reporter needs to do its work. You can create
-- one using 'handler'.
data Handler = Handler
  { http :: HTTP.Manager,
    settings :: Settings,
    makeSharedTraceData :: Platform.TracingSpan -> Prelude.IO SendOrSample
  }

data SendOrSample
  = SendToHoneycomb SharedTraceData
  | SampledOut

-- | Create a 'Handler' for a specified set of 'Settings'. Do this once when
-- your application starts and reuse the 'Handler' you get.
handler :: Settings -> Prelude.IO Handler
handler settings = do
  timer <- Timer.mkTimer
  http <- HTTP.TLS.getGlobalManager
  revision <- getRevision
  hostname' <- Network.HostName.getHostName
  let baseSpan =
        emptySpan
          |> addField "service_name" (serviceName settings)
          |> addField "hostname" (Text.fromList hostname')
          |> addField "revision" revision
  Prelude.pure
    Handler
      { http = http,
        settings = settings,
        makeSharedTraceData = \span -> do
          -- This is an initial implementation of sampling, based on
          -- https://docs.honeycomb.io/working-with-your-data/best-practices/sampling/
          -- using Dynamic Sampling based on whether the request was successful or not.
          --
          -- We can go further and:
          --
          --  * Not sample requests above a certain configurable threshold, to replicate
          --    NewRelic's slow request tracing.
          --  * Apply some sampling rate to errors
          --  * Apply different sample rates depending on traffic (easiest approximation
          --    is basing it off of time of day) so we sample less at low traffic

          (skipLogging, sampleRate) <-
            case Platform.succeeded span of
              Platform.Succeeded -> do
                let probability = deriveSampleRate span settings
                roll <- Random.randomRIO (0.0, 1.0)
                Prelude.pure (roll > probability, round (1 / probability))
              Platform.Failed -> Prelude.pure (False, 1)
              Platform.FailedWith _ -> Prelude.pure (False, 1)
          uuid <- Data.UUID.V4.nextRandom
          if skipLogging
            then Prelude.pure SampledOut
            else
              Prelude.pure
                <| SendToHoneycomb
                  SharedTraceData
                    { timer,
                      sampleRate,
                      requestId = Data.UUID.toText uuid,
                      endpoint = getSpanEndpoint span,
                      initSpan =
                        baseSpan
                          |> addField "apdex" (calculateApdex settings span)
                          -- Don't use requestId if we don't do Distributed Tracing
                          -- Else, it will create traces with no parent sharing the same TraceId
                          -- Which makes Honeycomb's UI confused
                          |> addField "trace.trace_id" (Data.UUID.toText uuid)
                    }
      }

newtype Revision = Revision Text
  deriving (Show, Aeson.ToJSON)

-- | Get the GIT revision of the current code. We do this by reading a file that
-- our K8S setup is supposed to provide.
getRevision :: Prelude.IO Revision
getRevision = do
  eitherRevision <- Exception.tryAny <| Data.Text.IO.readFile "revision"
  case eitherRevision of
    Prelude.Left _err -> Prelude.pure (Revision "no revision file found")
    Prelude.Right version -> Prelude.pure (Revision version)

-- | Configuration settings for ths reporter. A value of this type can be read
-- from the environment using the 'decoder' function.
data Settings = Settings
  { -- | The Honeycomb API key to use.
    apiKey :: Log.Secret ByteString.ByteString,
    -- | The name of the honeycomb dataset to report to. If the dataset does not
    -- exist yet, Honeycomb will create it when you first send a request for it.
    --
    -- [@environment variable@] HONEYCOMB_API_KEY
    -- [@default value@] *****
    datasetName :: Text,
    -- | The name of the service we're reporting for.
    --
    -- [@environment variable@] HONEYCOMB_SERVICE_NAME
    -- [@default value@] service
    serviceName :: Text,
    -- | The fraction of successfull requests that will be reported. If your
    -- service receives a lot of requests you might want reduce this to safe
    -- cost.
    --
    -- [@environment variable@] HONEYCOMB_FRACTION_OF_SUCCESS_REQUESTS_LOGGED
    -- [@default value@] 1
    fractionOfSuccessRequestsLogged :: Float,
    -- | The apdex time for this service in ms. Requests handled faster than
    -- this time will be sampled according to the
    -- @HONEYCOMB_FRACTION_OF_SUCCESS_REQUESTS_LOGGED@ variable. Slower request
    -- will have a larger chance to be reported.
    --
    -- [@environment variable@] HONEYCOMB_APDEX_TIME_IN_MILLISECONDS
    -- [@default value@] 100
    apdexTimeMs :: Int,
    -- | Allows overriding the default sample rates for given spans.
    -- This allows us to change the sample rate for certain endpoints within an
    -- application, for example if a path is critical but low volume we may choose
    -- to increase the rate.
    -- [@default value@] the input float
    modifyFractionOfSuccessRequestsLogged :: Float -> Platform.TracingSpan -> Float,
    -- | Allows overriding the default apdex rates for given spans.
    -- This allows us to change the apdex for certain endpoints within an
    -- application, for example if a path is significantly lower volume than
    -- another the apdex may require tuning.
    -- [@default value@] the input int
    modifyApdexTimeMs :: Int -> Platform.TracingSpan -> Int
  }

-- | Read 'Settings' from environment variables. Default variables will be used
-- in case no environment variable is set for an option.
decoder :: Environment.Decoder Settings
decoder =
  Prelude.pure Settings
    |> andMap honeycombApiKeyDecoder
    |> andMap datasetNameDecoder
    |> andMap honeycombAppNameDecoder
    |> andMap fractionOfSuccessRequestsLoggedDecoder
    |> andMap apdexTimeMsDecoder
    |> andMap (Prelude.pure always)
    |> andMap (Prelude.pure always)

honeycombApiKeyDecoder :: Environment.Decoder (Log.Secret ByteString.ByteString)
honeycombApiKeyDecoder =
  Environment.variable
    Environment.Variable
      { Environment.name = "HONEYCOMB_API_KEY",
        Environment.description = "The API key for Honeycomb",
        Environment.defaultValue = "*****"
      }
    (Environment.text |> map Encoding.encodeUtf8 |> Environment.secret)

datasetNameDecoder :: Environment.Decoder Text
datasetNameDecoder =
  Environment.variable
    Environment.Variable
      { Environment.name = "HONEYCOMB_DATASET",
        Environment.description = "Name of the dataset honeycomb should log to.",
        Environment.defaultValue = "dataset"
      }
    Environment.text

honeycombAppNameDecoder :: Environment.Decoder Text
honeycombAppNameDecoder =
  Environment.variable
    Environment.Variable
      { Environment.name = "HONEYCOMB_SERVICE_NAME",
        Environment.description = "Variable that sets the honeycomb service name.",
        Environment.defaultValue = "service"
      }
    Environment.text

fractionOfSuccessRequestsLoggedDecoder :: Environment.Decoder Float
fractionOfSuccessRequestsLoggedDecoder =
  Environment.variable
    Environment.Variable
      { Environment.name = "HONEYCOMB_FRACTION_OF_SUCCESS_REQUESTS_LOGGED",
        Environment.description = "The fraction of successful requests logged. Defaults to logging all successful requests.",
        Environment.defaultValue = "1"
      }
    Environment.float

apdexTimeMsDecoder :: Environment.Decoder Int
apdexTimeMsDecoder =
  Environment.variable
    Environment.Variable
      { Environment.name = "HONEYCOMB_APDEX_TIME_IN_MILLISECONDS",
        Environment.description = "The T value in the apdex, the time in milliseconds in which a healthy request should complete.",
        Environment.defaultValue = "100"
      }
    Environment.int