{-# LANGUAGE FlexibleContexts #-} -- | -- Module : ELynx.Data.Alphabet.DistributionDiversity -- Description : Summarize statistics for alphabets -- Copyright : (c) Dominik Schrempf 2021 -- License : GPL-3.0-or-later -- -- Maintainer : dominik.schrempf@gmail.com -- Stability : unstable -- Portability : portable -- -- Creation date: Mon Feb 25 13:32:56 2019. module ELynx.Data.Alphabet.DistributionDiversity ( -- * Entropy entropy, kEffEntropy, -- * Homoplasy homoplasy, kEffHomoplasy, -- * Count characters frequencyCharacters, ) where import qualified Data.Set as S import Data.Vector.Generic ( Vector, toList, ) import qualified Data.Vector.Generic as V import ELynx.Data.Alphabet.Alphabet import ELynx.Data.Alphabet.Character eps :: Double eps = 1e-12 -- Calculate x*log(x) but set to 0.0 when x is smaller than 'eps'. xLogX :: Double -> Double xLogX x | x < 0.0 = error "Argument lower than zero." | eps > x = 0.0 | otherwise = x * log x -- | Entropy of vector. entropy :: (Vector v Double) => v Double -> Double entropy v = if isNaN res then error ("entropy: Sesult of following vector is NaN: " ++ show (toList v) ++ ".") else res where res = negate $ V.sum $ V.map xLogX v -- | Effective number of used characters measured using 'entropy'. The result -- only makes sense when the sum of the array is 1.0. kEffEntropy :: Vector v Double => v Double -> Double kEffEntropy v = if e < eps then 1.0 else exp e where e = entropy v -- | Probability of homoplasy of vector. The result is the probability of -- binomially sampling the same character twice and only makes sense when the -- sum of the array is 1.0. homoplasy :: Vector v Double => v Double -> Double homoplasy v = V.sum $ V.map (\x -> x * x) v -- | Effective number of used characters measured using 'homoplasy'. The result -- only makes sense when the sum of the array is 1.0. kEffHomoplasy :: Vector v Double => v Double -> Double kEffHomoplasy v = 1.0 / homoplasy v -- XXX: Use mutable vector; then V.// is much faster. -- Increment element at index in vector by one. incrementElemIndexByOne :: Vector v Int => [Int] -> v Int -> v Int incrementElemIndexByOne is v = v V.// zip is es' where es' = [v V.! i + 1 | i <- is] -- For a given code and counts vector, increment the count of the given character. acc :: Vector v Int => AlphabetSpec -> v Int -> Character -> v Int acc alph vec char = incrementElemIndexByOne is vec where is = [S.findIndex c (std alph) | c <- toStd alph char] countCharacters :: (Vector v Character, Vector v Int) => AlphabetSpec -> v Character -> v Int countCharacters alph = V.foldl' (acc alph) zeroCounts where nChars = length (std alph) zeroCounts = V.replicate nChars (0 :: Int) saveDivision :: Int -> Int -> Double saveDivision value divisor = if divisor == 0 then 0.0 else fromIntegral value / fromIntegral divisor -- | For a given code vector of characters, calculate frequency of characters. -- The input vector has arbitrary length (most often the number of sequences in -- an alignment), the length of the output vector is the number of characters in -- the alphabet. frequencyCharacters :: (Vector v Character, Vector v Int, Vector v Double) => AlphabetSpec -> v Character -> v Double frequencyCharacters alph d = V.map (`saveDivision` s) counts where counts = countCharacters alph d s = V.sum counts