{-# LANGUAGE OverloadedStrings #-}
module ELynx.Data.Sequence.Sequence
(
Name,
Description,
Characters,
Sequence (..),
fromByteString,
toByteString,
header,
summarize,
summarizeSequences,
body,
length,
equalLength,
longest,
trim,
concat,
concatSequences,
filterShorterThan,
filterLongerThan,
filterStandard,
)
where
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Data.Alphabet.Alphabet as A
import ELynx.Data.Alphabet.Character
import ELynx.Data.Sequence.Defaults
import qualified Text.Printf as P
import Prelude hiding
( concat,
length,
)
import qualified Prelude as Pr
( length,
)
type Name = BL.ByteString
type Description = BL.ByteString
type Characters = V.Vector Character
fromByteString :: BL.ByteString -> Characters
fromByteString = V.fromList . map fromChar . BL.unpack
toByteString :: Characters -> BL.ByteString
toByteString = BL.pack . map toChar . V.toList
data Sequence = Sequence
{ name :: Name,
description :: Description,
alphabet :: A.Alphabet,
characters :: Characters
}
deriving (Show, Eq)
alignRight :: Int -> BL.ByteString -> BL.ByteString
alignRight n s =
BL.replicate (fromIntegral n - l) ' ' <> BL.take (fromIntegral n) s
where
l = BL.length s
alignLeft :: Int -> BL.ByteString -> BL.ByteString
alignLeft n s =
BL.take (fromIntegral n) s <> BL.replicate (fromIntegral n - l) ' '
where
l = BL.length s
getInfo :: Sequence -> BL.ByteString
getInfo s =
BL.unwords
[ alignLeft nameWidth (name s),
alignRight fieldWidth (BL.pack $ show $ alphabet s),
alignRight fieldWidth (BL.pack . show $ len),
alignRight fieldWidth (BL.pack $ P.printf "%2.2f" pGaps)
]
where
len = length s
nGaps = countGaps s
pGaps = 100 * fromIntegral nGaps / fromIntegral len :: Double
summarizeByteString :: Int -> BL.ByteString -> BL.ByteString
summarizeByteString l s
| BL.length s >= fromIntegral l = BL.take (fromIntegral l) s <> BL.pack "..."
| otherwise = s
summarize :: Sequence -> BL.ByteString
summarize s =
BL.unwords
[getInfo s, summarizeByteString summaryLength $ toByteString (characters s)]
summarizeSequences :: [Sequence] -> BL.ByteString
summarizeSequences ss = header ss <> body (take summaryNSequences ss)
tableHeader :: BL.ByteString
tableHeader =
BL.unwords
[ alignLeft nameWidth "Name",
alignRight fieldWidth "Code",
alignRight fieldWidth "Length",
alignRight fieldWidth "Gaps [%]",
"Sequence"
]
header :: [Sequence] -> BL.ByteString
header ss =
BL.unlines $
reportIfSubsetIsShown
++ [ BL.pack $
"For each sequence, the "
++ show summaryLength
++ " first bases are shown.",
BL.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences.",
"",
tableHeader
]
where
l = Pr.length ss
s =
show summaryNSequences
++ " out of "
++ show (Pr.length ss)
++ " sequences are shown."
reportIfSubsetIsShown
| l > summaryNSequences = [BL.pack s]
| otherwise = []
body :: [Sequence] -> BL.ByteString
body ss = BL.unlines (map summarize ss `using` parListChunk 5 rdeepseq)
length :: Sequence -> Int
length = fromIntegral . V.length . characters
equalLength :: [Sequence] -> Bool
equalLength = allEqual . map length
where
allEqual [] = True
allEqual xs = all (== head xs) $ tail xs
longest :: [Sequence] -> Sequence
longest = maximumBy (comparing length)
countGaps :: Sequence -> Int
countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s
trim :: Int -> Sequence -> Sequence
trim n (Sequence nm d a cs) = Sequence nm d a (V.take (fromIntegral n) cs)
concat :: Sequence -> Sequence -> Sequence
concat (Sequence i d c cs) (Sequence j f k ks)
| i /= j =
error $
"concatenate: Sequences do not have equal names: "
++ BL.unpack i
++ ", "
++ BL.unpack j
++ "."
| d /= f =
error $
"concatenate: Sequences do not have equal descriptions: "
++ BL.unpack d
++ ", "
++ BL.unpack f
++ "."
| c /= k =
error $
"concatenate: Sequences do not have equal alphabets: "
++ show c
++ ", "
++ show k
++ "."
| otherwise =
Sequence i d c (cs <> ks)
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = error "concatenateSequences: Nothing to concatenate."
concatSequences [ss] = ss
concatSequences sss = foldl1 (zipWith concat) sss
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan n = filter (\x -> length x < n)
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan n = filter (\x -> length x > n)
filterStandard :: [Sequence] -> [Sequence]
filterStandard = filter (\s -> anyStandard (alphabet s) s)
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard a s = V.any (A.isStd a) cs where cs = characters s