-- |
-- Module      :  Mcmc.Proposal.Hamiltonian.Masses
-- Description :  Mass matrices
-- Copyright   :  2022 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Creation date: Tue Jun 14 10:09:24 2022.
module Mcmc.Proposal.Hamiltonian.Masses
  ( Mu,
    MassesI,
    toGMatrix,
    cleanMatrix,
    getMassesI,
    getMus,
    Dimension,
    vectorToMasses,
    massesToVector,

    -- * Tuning
    SmoothingParameter (..),
    tuneDiagonalMassesOnly,
    tuneAllMasses,
  )
where

import Data.Maybe
import qualified Data.Vector as VB
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Unboxed as VU
import Mcmc.Proposal.Hamiltonian.Common
import qualified Numeric.LinearAlgebra as L
import Numeric.Natural
import qualified Statistics.Covariance as S
import qualified Statistics.Function as S
import qualified Statistics.Sample as S

-- Mean vector containing zeroes. We save this vector because it is required
-- when sampling from the multivariate normal distribution.
type Mu = L.Vector Double

-- General, symmetric, inverted mass matrix.
type MassesI = L.GMatrix

-- Purge masses and inverted masses (excluding the diagonal) strictly smaller
-- than the precision.
--
-- If changed, also change help text of 'HTuneMasses', which is indirectly
-- affected via 'massMin', and 'massMax'.
precision :: Double
precision = 1e-8

isDiag :: L.Matrix Double -> Bool
isDiag xs = abs (sumDiag - sumFull) < precision
  where
    xsAbs = L.cmap abs xs
    sumDiag = L.sumElements (L.takeDiag xsAbs)
    sumFull = L.sumElements xsAbs

-- Consider a matrix sparse if less than (5 * number of rows) elements are
-- non-zero.
isSparse :: L.Matrix Double -> Bool
isSparse xs = nNonZero < fromIntegral nMax
  where
    n = L.rows xs
    m = min 5 n
    nMax = n * m
    f x = if abs x >= precision then 1 else 0 :: Double
    xsInd = L.cmap f xs
    nNonZero = L.sumElements xsInd

toAssocMatrix :: L.Matrix Double -> L.AssocMatrix
toAssocMatrix xs
  | n /= m = error "toAssocMatrix: Matrix not square."
  | otherwise =
      [ ((i, j), e)
        | i <- [0 .. (n - 1)],
          j <- [0 .. (n - 1)],
          let e = xs `L.atIndex` (i, j),
          abs e >= precision
      ]
  where
    n = L.rows xs
    m = L.cols xs

toGMatrix :: L.Matrix Double -> L.GMatrix
toGMatrix xs
  | n == 0 || m == 0 = error "toGMatrix: Matrix empty."
  | n /= m = error "toGMatrix: Matrix not square."
  | isDiag xs = L.mkDiagR n m $ L.takeDiag xs
  | isSparse xs = L.mkSparse $ toAssocMatrix xs
  | otherwise = L.mkDense xs
  where
    n = L.rows xs
    m = L.cols xs

-- Diagonal:
-- - NaN values are set to 1.
-- - Negative values are set to 1.
-- - Small elements are set to 'precision'.
--
-- Off-diagonal:
-- - NaN values are set to 0.
-- - Elements with absolute values strictly smaller than 'precision' are purged.
--
-- We are permissive with negative and NaN values because adequate masses are
-- crucial. The Hamiltonian algorithms also work when the masses are off.
cleanMatrix :: L.Matrix Double -> L.Matrix Double
cleanMatrix xs =
  L.diag (L.cmap cleanDiag xsDiag) + L.cmap cleanOffDiag xsOffDiag
  where
    xsDiag = L.takeDiag xs
    cleanDiag x
      | isNaN x = 1
      | x < 0 = 1
      -- The strict comparison is important.
      | x < precision = precision
      | otherwise = x
    xsOffDiag = xs - L.diag xsDiag
    cleanOffDiag x
      | isNaN x = 0
      -- The strict comparison is important.
      | abs x < precision = 0
      | otherwise = x

getMassesI :: L.Herm Double -> L.GMatrix
getMassesI xs
  | n == 0 || m == 0 = error "getMassesI: Matrix empty."
  | n /= m = error "getMassesI: Matrix not square."
  | sign /= 1.0 = error "getMassesI: Determinant of matrix is negative."
  | otherwise = toGMatrix $ cleanMatrix xsI
  where
    xs' = L.unSym xs
    n = L.rows xs'
    m = L.cols xs'
    (xsI, (_, sign)) = L.invlndet xs'

getMus :: Masses -> L.Vector Double
getMus xs
  | n == 0 || m == 0 = error "getMu: Matrix empty."
  | n /= m = error "getMu: Matrix not square."
  | otherwise = L.fromList $ replicate n 0.0
  where
    xs' = L.unSym xs
    n = L.rows xs'
    m = L.cols xs'

-- Dimension of the proposal.
type Dimension = Int

massesToVector :: Masses -> VU.Vector Double
massesToVector = VU.convert . L.flatten . L.unSym

vectorToMasses :: Dimension -> VU.Vector Double -> Masses
vectorToMasses d = L.trustSym . L.reshape d . VU.convert

-- If changed, also change help text of 'HTuneMasses'.
massMin :: Double
massMin = precision

-- If changed, also change help text of 'HTuneMasses'.
massMax :: Double
massMax = recip precision

-- Minimal number of unique samples required for tuning the diagonal entries of
-- the mass matrix.
--
-- NOTE: If changed, also change help text of 'HTuneMasses'.
samplesDiagonalMin :: Int
samplesDiagonalMin = 61

-- Minimal number of samples required for tuning all entries of the mass matrix.
--
-- NOTE: If changed, also change help text of 'HTuneMasses'.
samplesAllMinWith :: Dimension -> Int
samplesAllMinWith d = samplesDiagonalMin + max samplesDiagonalMin d

getSampleSize :: VS.Vector Double -> Int
getSampleSize = VS.length . VS.uniq . S.gsort

rescueAfter :: Double -> Double
rescueAfter y
  | massMin > abs y = signum y * massMin
  | abs y > massMax = signum y * massMax
  | otherwise = y

rescueBefore :: (Double -> Double -> Double) -> Double -> Double -> Double
rescueBefore f old new
  -- Be permissive with NaN and infinite values.
  | isNaN new = old
  | isInfinite new = old
  | otherwise = f old new

rescue :: (Double -> Double -> Double) -> Double -> Double -> Double
rescue f old new = rescueAfter $ rescueBefore f old new

-- -- I do not know where I got this function from, but it works pretty well!
-- interpolate :: Double -> Double -> Double
-- interpolate old new = interSqrt ** 2
--   where
--     interSqrt = recip 3 * (sqrt old + 2 * sqrt new)

-- | This parameter plays the same role as @m@ in the dual averaging algorithm.
-- In the beginning, when @m@ is zero, the newly estimated masses will take full
-- precedence over the current masses. After some auto tuning steps, when @m@ is
-- larger, the newly estimated masses influence the current masses only
-- slightly.
newtype SmoothingParameter = SmoothingParameter Natural

-- Another interpolation function I came up with. It is pretty cool, because
-- (similar to above) it interpolates the square roots, which is what we want.
interpolate' :: SmoothingParameter -> Double -> Double -> Double
interpolate' (SmoothingParameter m) oldSquared newSquared = finSign * (fin ** 2)
  where
    oldSign = signum oldSquared
    newSign = signum newSquared
    sqrt' = sqrt . abs
    old = oldSign * sqrt' oldSquared
    new = newSign * sqrt' newSquared
    -- The new mass will be the second last boundary. That is, the larger the
    -- number of bins is, the more informative the new mass is compared to the
    -- old mass.
    nbins = max (100 - fromIntegral m) 2
    fin = recip nbins * (old + (nbins - 1) * new)
    finSign = signum fin

getNewMassDiagonalWithSampleSize :: SmoothingParameter -> Int -> Double -> Double -> Double
getNewMassDiagonalWithSampleSize m sampleSize massOld massEstimate
  | sampleSize < samplesDiagonalMin = massOld
  -- Diagonal masses are variances which are strictly positive.
  | massEstimate <= 0 = massOld
  | otherwise = rescue (interpolate' m) massOld massEstimate

getNewMassDiagonal :: SmoothingParameter -> Double -> Double -> Double
getNewMassDiagonal m massOld massEstimate
  -- Diagonal masses are variances which are strictly positive.
  | massEstimate <= 0 = massOld
  | otherwise = rescue (interpolate' m) massOld massEstimate

getNewMassOffDiagonal :: SmoothingParameter -> Double -> Double -> Double
getNewMassOffDiagonal m = rescue (interpolate' m)

-- The Cholesky decomposition, which is performed when sampling new momenta with
-- 'generateMomenta', requires a positive definite covariance matrix. The
-- Graphical Lasso algorithm finds positive definite covariance matrices, but
-- sometimes positive definiteness is violated because of numerical errors.
-- Further, when non-diagonal masses are already non-zero, the tuning of
-- diagonal masses only may violate positive definiteness.
--
-- Find the closest positive definite matrix of a given matrix.
--
-- See https://gist.github.com/fasiha/fdb5cec2054e6f1c6ae35476045a0bbd.
findClosestPositiveDefiniteMatrix :: L.Matrix Double -> L.Matrix Double
findClosestPositiveDefiniteMatrix a
  | n == 0 || m == 0 = error "findClosestPositiveDefiniteMatrix: Matrix empty."
  | n /= m = error "findClosestPositiveDefiniteMatrix: Matrix not square."
  | isPositiveDefinite a = a
  | otherwise = go a3 1
  where
    n = L.rows a
    m = L.cols a
    b = L.unSym $ L.sym a
    (_, s, v) = L.svd b
    h = L.tr v L.<> (L.diag s L.<> v)
    a2 = L.scale 0.5 (b + h)
    a3 = L.unSym $ L.sym a2
    isPositiveDefinite = isJust . L.mbChol . L.trustSym
    --
    i = L.ident n
    -- See https://hackage.haskell.org/package/ieee754-0.8.0/docs/src/Numeric-IEEE.html#line-177.
    eps = 2.2204460492503131e-16
    go x k
      | isPositiveDefinite x = x
      | otherwise =
          let minEig = L.minElement $ L.cmap L.realPart $ L.eigenvalues x
              nu = negate minEig * (k ** 2) + eps
              x' = x + L.scale nu i
           in go x' (k + 1)

tuneDiagonalMassesOnly ::
  SmoothingParameter ->
  -- Conversion from value to vector.
  (a -> Positions) ->
  -- Value vector.
  VB.Vector a ->
  -- Old mass matrix, and inverted mass matrix.
  (Masses, MassesI) ->
  -- new mass matrix, and inverted mass matrix.
  (Masses, MassesI)
-- NOTE: Here, we lose time because we convert the states to vectors again,
-- something that has already been done. But then, auto tuning is not a runtime
-- determining factor.
tuneDiagonalMassesOnly m toVec xs (ms, msI)
  -- If not enough data is available, do not tune.
  | VB.length xs < samplesDiagonalMin = (ms, msI)
  | dimState /= dimMs = error "tuneDiagonalMassesOnly: Dimension mismatch."
  -- Replace the diagonal.
  | otherwise =
      let msDirty = msOld - L.diag msDiagonalOld + L.diag msDiagonalNew
          -- Positive definite matrices are symmetric.
          ms' = L.trustSym $ findClosestPositiveDefiniteMatrix $ cleanMatrix msDirty
          msI' = getMassesI ms'
       in (ms', msI')
  where
    -- xs: Each element contains all parameters of one iteration.
    -- xs': Each element is a vector containing all parameters changed by the
    -- proposal of one iteration.
    xs' = VB.map toVec xs
    -- xs'': Matrix with each row containing all parameter values changed by the
    -- proposal of one iteration.
    xs'' = L.fromRows $ VB.toList xs'
    -- We can safely use 'VB.head' here since the length of 'xs' must be larger
    -- than 'samplesDiagonalMin'.
    dimState = VS.length $ VB.head xs'
    sampleSizes = VS.fromList $ map getSampleSize $ L.toColumns xs''
    msOld = L.unSym ms
    dimMs = L.rows msOld
    msDiagonalOld = L.takeDiag msOld
    msDiagonalEstimate = VS.fromList $ map (recip . S.variance) $ L.toColumns xs''
    msDiagonalNew =
      VS.zipWith3
        (getNewMassDiagonalWithSampleSize m)
        sampleSizes
        msDiagonalOld
        msDiagonalEstimate

-- This value was carefully tuned using the example "hamiltonian".
defaultGraphicalLassoPenalty :: Double
defaultGraphicalLassoPenalty = 0.3

interpolateElementWise :: SmoothingParameter -> L.Matrix Double -> L.Matrix Double -> L.Matrix Double
interpolateElementWise m old new
  | mO /= mN = err "different number of rows"
  | nO /= nN = err "different number of columns"
  | mO /= nO = err "not square"
  | mO < 1 = err "empty matrix"
  | otherwise = L.build (mO, nO) f
  where
    mO = L.rows old
    nO = L.cols old
    mN = L.rows new
    nN = L.cols new
    err msg = error $ "interpolateElementWise: " <> msg
    f iD jD =
      let -- This sucks a bit, because we need a function (e -> e -> e), and
          -- since the return type is Double, the indices are also Doubble.
          i = round iD
          j = round jD
          g = if i == j then getNewMassDiagonal m else getNewMassOffDiagonal m
          eO = old `L.atIndex` (i, j)
          eN = new `L.atIndex` (i, j)
       in g eO eN

tuneAllMasses ::
  SmoothingParameter ->
  -- Conversion from value to vector.
  (a -> Positions) ->
  -- Value vector.
  VB.Vector a ->
  -- Old mass matrix, and inverted mass matrix.
  (Masses, MassesI) ->
  -- New mass matrix, and inverted mass matrix.
  (Masses, MassesI)
-- NOTE: Here, we lose time because we convert the states to vectors again,
-- something that has already been done. But then, auto tuning is not a runtime
-- determining factor.
tuneAllMasses m toVec xs (ms, msI)
  -- If not enough data is available, do not tune.
  | VB.length xs < samplesDiagonalMin = (ms, msI)
  -- If not enough data is available, only the diagonal masses are tuned.
  | VB.length xs < samplesAllMinWith dimMs = fallbackDiagonal
  | L.rank xs'' /= dimState = fallbackDiagonal
  | dimState /= dimMs = error "tuneAllMasses: Dimension mismatch."
  | otherwise = (msNew, msINew)
  where
    fallbackDiagonal = tuneDiagonalMassesOnly m toVec xs (ms, msI)
    -- xs: Each element contains all parameters of one iteration.
    -- xs': Each element is a vector containing all parameters changed by the
    -- proposal of one iteration.
    xs' = VB.map toVec xs
    -- xs'': Matrix with each row containing all parameter values changed by the
    -- proposal of one iteration.
    xs'' = L.fromRows $ VB.toList xs'
    -- We can safely use 'VB.head' here since the length of 'xs' must be larger
    -- than 'samplesDiagonalMin'.
    dimState = VS.length $ VB.head xs'
    dimMs = L.rows $ L.unSym ms
    (_, ss, xsNormalized) = S.scale xs''
    -- The first value is the covariance matrix sigma, which the inverted mass
    -- matrix (precision matrix). However, we interpolate the new mass matrix
    -- using the old one and the new estimate, so we have to recalculate the
    -- covariance matrix anyways.
    (_, precNormalized) =
      either error id $
        S.graphicalLasso defaultGraphicalLassoPenalty xsNormalized
    ms' = S.rescalePWith ss (L.unSym precNormalized)
    -- Clean NaNs, infinities; ensure positive definiteness. The masses should
    -- be positive definite, but sometimes they happen to be not because of
    -- numerical errors.
    msNewDirty = interpolateElementWise m (L.unSym ms) ms'
    -- Positive definite matrices are symmetric.
    msNew = L.trustSym $ findClosestPositiveDefiniteMatrix $ cleanMatrix msNewDirty
    msINew = getMassesI msNew