{-# LANGUAGE FlexibleContexts #-}

-- |
-- Module      :  ELynx.Alphabet.DistributionDiversity
-- Description :  Summarize statistics for alphabets
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Mon Feb 25 13:32:56 2019.
module ELynx.Alphabet.DistributionDiversity
  ( -- * Entropy
    entropy,
    kEffEntropy,

    -- * Homoplasy
    homoplasy,
    kEffHomoplasy,

    -- * Count characters
    frequencyCharacters,
  )
where

import qualified Data.Set as S
import Data.Vector.Generic
  ( Vector,
    toList,
  )
import qualified Data.Vector.Generic as V
import ELynx.Alphabet.Alphabet
import ELynx.Alphabet.Character

eps :: Double
eps :: Double
eps = Double
1e-12

-- Calculate x*log(x) but set to 0.0 when x is smaller than 'eps'.
xLogX :: Double -> Double
xLogX :: Double -> Double
xLogX Double
x
  | Double
x forall a. Ord a => a -> a -> Bool
< Double
0.0 = forall a. HasCallStack => [Char] -> a
error [Char]
"Argument lower than zero."
  | Double
eps forall a. Ord a => a -> a -> Bool
> Double
x = Double
0.0
  | Bool
otherwise = Double
x forall a. Num a => a -> a -> a
* forall a. Floating a => a -> a
log Double
x

-- | Entropy of vector.
entropy :: (Vector v Double) => v Double -> Double
entropy :: forall (v :: * -> *). Vector v Double => v Double -> Double
entropy v Double
v =
  if forall a. RealFloat a => a -> Bool
isNaN Double
res
    then
      forall a. HasCallStack => [Char] -> a
error
        ([Char]
"entropy: Sesult of following vector is NaN: " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall (v :: * -> *) a. Vector v a => v a -> [a]
toList v Double
v) forall a. [a] -> [a] -> [a]
++ [Char]
".")
    else Double
res
  where
    res :: Double
res = forall a. Num a => a -> a
negate forall a b. (a -> b) -> a -> b
$ forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum forall a b. (a -> b) -> a -> b
$ forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map Double -> Double
xLogX v Double
v

-- | Effective number of used characters measured using 'entropy'. The result
-- only makes sense when the sum of the array is 1.0.
kEffEntropy :: Vector v Double => v Double -> Double
kEffEntropy :: forall (v :: * -> *). Vector v Double => v Double -> Double
kEffEntropy v Double
v = if Double
e forall a. Ord a => a -> a -> Bool
< Double
eps then Double
1.0 else forall a. Floating a => a -> a
exp Double
e where e :: Double
e = forall (v :: * -> *). Vector v Double => v Double -> Double
entropy v Double
v

-- | Probability of homoplasy of vector. The result is the probability of
-- binomially sampling the same character twice and only makes sense when the
-- sum of the array is 1.0.
homoplasy :: Vector v Double => v Double -> Double
homoplasy :: forall (v :: * -> *). Vector v Double => v Double -> Double
homoplasy v Double
v = forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum forall a b. (a -> b) -> a -> b
$ forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (\Double
x -> Double
x forall a. Num a => a -> a -> a
* Double
x) v Double
v

-- | Effective number of used characters measured using 'homoplasy'. The result
-- only makes sense when the sum of the array is 1.0.
kEffHomoplasy :: Vector v Double => v Double -> Double
kEffHomoplasy :: forall (v :: * -> *). Vector v Double => v Double -> Double
kEffHomoplasy v Double
v = Double
1.0 forall a. Fractional a => a -> a -> a
/ forall (v :: * -> *). Vector v Double => v Double -> Double
homoplasy v Double
v

-- XXX: Use mutable vector; then V.// is much faster.
-- Increment element at index in vector by one.
incrementElemIndexByOne :: Vector v Int => [Int] -> v Int -> v Int
incrementElemIndexByOne :: forall (v :: * -> *). Vector v Int => [Int] -> v Int -> v Int
incrementElemIndexByOne [Int]
is v Int
v = v Int
v forall (v :: * -> *) a. Vector v a => v a -> [(Int, a)] -> v a
V.// forall a b. [a] -> [b] -> [(a, b)]
zip [Int]
is [Int]
es'
  where
    es' :: [Int]
es' = [v Int
v forall (v :: * -> *) a.
(HasCallStack, Vector v a) =>
v a -> Int -> a
V.! Int
i forall a. Num a => a -> a -> a
+ Int
1 | Int
i <- [Int]
is]

-- For a given code and counts vector, increment the count of the given character.
acc :: Vector v Int => AlphabetSpec -> v Int -> Character -> v Int
acc :: forall (v :: * -> *).
Vector v Int =>
AlphabetSpec -> v Int -> Character -> v Int
acc AlphabetSpec
alph v Int
vec Character
char = forall (v :: * -> *). Vector v Int => [Int] -> v Int -> v Int
incrementElemIndexByOne [Int]
is v Int
vec
  where
    is :: [Int]
is = [forall a. Ord a => a -> Set a -> Int
S.findIndex Character
c (AlphabetSpec -> Set Character
std AlphabetSpec
alph) | Character
c <- AlphabetSpec -> Character -> [Character]
toStd AlphabetSpec
alph Character
char]

countCharacters ::
  (Vector v Character, Vector v Int) => AlphabetSpec -> v Character -> v Int
countCharacters :: forall (v :: * -> *).
(Vector v Character, Vector v Int) =>
AlphabetSpec -> v Character -> v Int
countCharacters AlphabetSpec
alph = forall (v :: * -> *) b a.
Vector v b =>
(a -> b -> a) -> a -> v b -> a
V.foldl' (forall (v :: * -> *).
Vector v Int =>
AlphabetSpec -> v Int -> Character -> v Int
acc AlphabetSpec
alph) v Int
zeroCounts
  where
    nChars :: Int
nChars = forall (t :: * -> *) a. Foldable t => t a -> Int
length (AlphabetSpec -> Set Character
std AlphabetSpec
alph)
    zeroCounts :: v Int
zeroCounts = forall (v :: * -> *) a. Vector v a => Int -> a -> v a
V.replicate Int
nChars (Int
0 :: Int)

saveDivision :: Int -> Int -> Double
saveDivision :: Int -> Int -> Double
saveDivision Int
value Int
divisor =
  if Int
divisor forall a. Eq a => a -> a -> Bool
== Int
0 then Double
0.0 else forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
value forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
divisor

-- | For a given code vector of characters, calculate frequency of characters.
-- The input vector has arbitrary length (most often the number of sequences in
-- an alignment), the length of the output vector is the number of characters in
-- the alphabet.
frequencyCharacters ::
  (Vector v Character, Vector v Int, Vector v Double) =>
  AlphabetSpec ->
  v Character ->
  v Double
frequencyCharacters :: forall (v :: * -> *).
(Vector v Character, Vector v Int, Vector v Double) =>
AlphabetSpec -> v Character -> v Double
frequencyCharacters AlphabetSpec
alph v Character
d = forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Int -> Int -> Double
`saveDivision` Int
s) v Int
counts
  where
    counts :: v Int
counts = forall (v :: * -> *).
(Vector v Character, Vector v Int) =>
AlphabetSpec -> v Character -> v Int
countCharacters AlphabetSpec
alph v Character
d
    s :: Int
s = forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a
V.sum v Int
counts