module ELynx.Data.Sequence.Alignment
( Alignment (..),
length,
nSequences,
fromSequences,
toSequences,
summarize,
join,
concat,
concatAlignments,
filterColsConstant,
filterColsConstantSoft,
filterColsOnlyStd,
filterColsStd,
filterColsNoGaps,
FrequencyData,
distribution,
toFrequencyData,
kEffEntropy,
kEffHomoplasy,
countIUPACChars,
countGaps,
countUnknowns,
subSample,
randomSubSample,
)
where
import Control.Monad hiding (join)
import Control.Monad.Primitive
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List hiding
( concat,
length,
)
import qualified Data.Matrix.Unboxed as M
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import qualified ELynx.Data.Alphabet.DistributionDiversity as D
import ELynx.Data.Sequence.Defaults
import qualified ELynx.Data.Sequence.Sequence as S
import System.Random.MWC
import Prelude hiding
( concat,
length,
)
data Alignment = Alignment
{ names :: [S.Name],
descriptions :: [S.Description],
alphabet :: A.Alphabet,
matrix :: M.Matrix Character
}
deriving (Show, Eq)
length :: Alignment -> Int
length = M.cols . matrix
nSequences :: Alignment -> Int
nSequences = M.rows . matrix
fromSequences :: [S.Sequence] -> Either String Alignment
fromSequences ss
| S.equalLength ss && allEqual (map S.alphabet ss) =
Right $
Alignment ns ds a d
| S.equalLength ss = Left "Sequences do not have equal codes."
| otherwise = Left "Sequences do not have equal lengths."
where
ns = map S.name ss
ds = map S.description ss
a = S.alphabet $ head ss
bss = map S.characters ss
d = M.fromRows bss
allEqual [] = True
allEqual xs = all (== head xs) $ tail xs
toSequences :: Alignment -> [S.Sequence]
toSequences (Alignment ns ds a da) =
zipWith3
(\n d r -> S.Sequence n d a r)
ns
ds
rows
where
rows = M.toRows da
header :: Alignment -> BL.ByteString
header a =
BL.unlines $
[ BL.pack "Multi sequence alignment.",
BL.pack $ "Code: " ++ A.alphabetDescription (alphabet a) ++ ".",
BL.pack $ "Length: " ++ show (length a) ++ "."
]
++ reportLengthSummary
++ reportNumberSummary
where
reportLengthSummary =
[ BL.pack $
"For each sequence, the "
++ show summaryLength
++ " first bases are shown."
| length a > summaryLength
]
reportNumberSummary =
[ BL.pack $
show summaryNSequences
++ " out of "
++ show (nSequences a)
++ " sequences are shown."
| nSequences a > summaryNSequences
]
summarize :: Alignment -> BL.ByteString
summarize a = header a <> S.body (toSequences a)
(===) :: V.Unbox a => M.Matrix a -> M.Matrix a -> M.Matrix a
(===) l r = M.fromRows $ lRs ++ rRs
where
lRs = M.toRows l
rRs = M.toRows r
(|||) :: V.Unbox a => M.Matrix a -> M.Matrix a -> M.Matrix a
(|||) l r = M.fromColumns $ lCs ++ rCs
where
lCs = M.toColumns l
rCs = M.toColumns r
join :: Alignment -> Alignment -> Alignment
join t b
| length t /= length b =
error
"join: Multi sequence alignments do not have equal lengths."
| alphabet t /= alphabet b =
error
"join: Multi sequence alignments do not have equal alphabets."
| otherwise = Alignment ns ds al (tD === bD)
where
ns = names t ++ names b
ds = descriptions t ++ descriptions b
tD = matrix t
bD = matrix b
al = alphabet t
concat :: Alignment -> Alignment -> Alignment
concat l r
| nSequences l /= nSequences r =
error
"concat: Multi sequence alignments do not have an equal number of sequences."
| alphabet l /= alphabet r =
error "concat: Multi sequence alignments do not have an equal alphabets."
| names l /= names r =
error "concat: Multi sequence alignments do not have an equal names."
| descriptions l /= descriptions r =
error "concat: Multi sequence alignments do not have an equal descriptions."
| otherwise =
Alignment (names l) (descriptions l) (alphabet l) (lD ||| rD)
where
lD = matrix l
rD = matrix r
concatAlignments :: [Alignment] -> Alignment
concatAlignments [] = error "concatAlignments: Nothing to concatenate."
concatAlignments [a] = a
concatAlignments as = foldl' concat (head as) (tail as)
filterColsWith :: (V.Vector Character -> Bool) -> Alignment -> Alignment
filterColsWith p a = a {matrix = m'}
where
m' = M.fromColumns . filter p . M.toColumns $ matrix a
filterColsConstant :: Alignment -> Alignment
filterColsConstant = filterColsWith (\v -> V.all (== V.head v) v)
filterColsConstantSoft :: Alignment -> Alignment
filterColsConstantSoft a = filterColsWith f a
where
al = alphabet a
f v = case V.find (A.isStd al) v of
Nothing -> False
Just c -> V.all (\x -> x == c || (A.isGap al) x || (A.isUnknown al) x) v
filterColsOnlyStd :: Alignment -> Alignment
filterColsOnlyStd a = filterColsWith (V.all $ A.isStd (alphabet a)) a
filterColsStd :: Double -> Alignment -> Alignment
filterColsStd prop a =
filterColsWith
(\col -> prop * n <= fromIntegral (V.length (V.filter (A.isStd al) col)))
a
where
al = alphabet a
n = fromIntegral $ nSequences a
filterColsNoGaps :: Alignment -> Alignment
filterColsNoGaps a = filterColsWith (V.all $ not . A.isGap (alphabet a)) a
type FrequencyData = M.Matrix Double
fMapColParChunk ::
(V.Unbox a, V.Unbox b) =>
Int ->
(V.Vector a -> V.Vector b) ->
M.Matrix a ->
M.Matrix b
fMapColParChunk n f m =
M.fromColumns (map f (M.toColumns m) `using` parListChunk n rseq)
toFrequencyData :: Alignment -> FrequencyData
toFrequencyData a = fMapColParChunk 100 (D.frequencyCharacters spec) (matrix a)
where
spec = A.alphabetSpec (alphabet a)
distribution :: FrequencyData -> [Double]
distribution fd =
map (/ fromIntegral nSites) $
V.toList $
foldl1
(V.zipWith (+))
(M.toColumns fd)
where
nSites = M.cols fd
parMapChunk :: Int -> (a -> b) -> [a] -> [b]
parMapChunk n f as = map f as `using` parListChunk n rseq
chunksize :: Int
chunksize = 500
kEffEntropy :: FrequencyData -> [Double]
kEffEntropy fd = parMapChunk chunksize D.kEffEntropy (M.toColumns fd)
kEffHomoplasy :: FrequencyData -> [Double]
kEffHomoplasy fd = parMapChunk chunksize D.kEffHomoplasy (M.toColumns fd)
countIUPACChars :: Alignment -> Int
countIUPACChars a = V.length . V.filter (A.isIUPAC (alphabet a)) $ allChars
where
allChars = M.flatten $ matrix a
countGaps :: Alignment -> Int
countGaps a = V.length . V.filter (A.isGap (alphabet a)) $ allChars
where
allChars = M.flatten $ matrix a
countUnknowns :: Alignment -> Int
countUnknowns a = V.length . V.filter (A.isUnknown (alphabet a)) $ allChars
where
allChars = M.flatten $ matrix a
subSampleMatrix :: V.Unbox a => [Int] -> M.Matrix a -> M.Matrix a
subSampleMatrix is m =
M.fromColumns $ foldl' (\a i -> M.takeColumn m i : a) [] is
subSample :: [Int] -> Alignment -> Alignment
subSample is a = a {matrix = m'} where m' = subSampleMatrix is $ matrix a
randomSubSample ::
PrimMonad m => Int -> Alignment -> Gen (PrimState m) -> m Alignment
randomSubSample n a g = do
let l = length a
is <- replicateM n $ uniformR (0, l - 1) g
return $ subSample is a