{-# LANGUAGE OverloadedStrings #-} {- | Module : ELynx.Data.Sequence Description : Hereditary sequences Copyright : (c) Dominik Schrempf 2018 License : GPL-3.0-or-later Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : portable Creation date: Thu Oct 4 18:54:51 2018. This module is to be imported qualified. -} module ELynx.Data.Sequence.Sequence ( -- * Types Name , Description , Characters , Sequence(..) -- * Input , fromByteString -- * Output , toByteString , header , summarize , summarizeSequences , body -- * Analysis , length , equalLength , longest -- * Manipulation , trim , concat , concatSequences -- * Filtering , filterShorterThan , filterLongerThan , filterStandard ) where import Control.Parallel.Strategies import qualified Data.ByteString.Lazy.Char8 as L import Data.List ( maximumBy ) import Data.Ord ( comparing ) import qualified Data.Vector.Unboxed as V import Prelude hiding ( concat , length ) import qualified Prelude as Pr ( length ) import qualified Text.Printf as P import qualified ELynx.Data.Alphabet.Alphabet as A import ELynx.Data.Alphabet.Character import ELynx.Data.Sequence.Defaults import ELynx.Tools -- | For now, 'Name's are just 'L.ByteString's. type Name = L.ByteString -- | The description of a sequence. type Description = L.ByteString -- | The vector of characters of a sequence. type Characters = V.Vector Character -- | Convert byte string to sequence characters. fromByteString :: L.ByteString -> Characters fromByteString = V.fromList . map fromChar . L.unpack -- | Convert sequence characters to byte string. toByteString :: Characters -> L.ByteString toByteString = L.pack . map toChar . V.toList -- | Sequences have a name, a possibly empty description, a code and hopefully a -- lot of data. data Sequence = Sequence { name :: Name , description :: Description , alphabet :: A.Alphabet , characters :: Characters } deriving (Show, Eq) getInfo :: Sequence -> L.ByteString getInfo s = L.unwords [ alignLeft nameWidth (name s) , alignRight fieldWidth (L.pack $ show $ alphabet s) , alignRight fieldWidth (L.pack . show $ len) , alignRight fieldWidth (L.pack $ P.printf "%2.2f" pGaps) ] where len = length s nGaps = countGaps s pGaps = 100 * fromIntegral nGaps / fromIntegral len :: Double -- | Trim and show a 'Sequence'. summarize :: Sequence -> L.ByteString summarize s = L.unwords [getInfo s, summarizeByteString summaryLength $ toByteString (characters s)] -- | Trim and show a list of 'Sequence's. summarizeSequences :: [Sequence] -> L.ByteString summarizeSequences ss = header ss <> body (take summaryNSequences ss) -- | Header printed before 'Sequence' list. tableHeader :: L.ByteString tableHeader = L.unwords [ alignLeft nameWidth "Name" , alignRight fieldWidth "Code" , alignRight fieldWidth "Length" , alignRight fieldWidth "Gaps [%]" , "Sequence" ] -- | A short description of the sequence. header :: [Sequence] -> L.ByteString header ss = L.unlines $ reportIfSubsetIsShown ++ [ L.pack $ "For each sequence, the " ++ show summaryLength ++ " first bases are shown." , L.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences." , "" , tableHeader ] where l = Pr.length ss s = show summaryNSequences ++ " out of " ++ show (Pr.length ss) ++ " sequences are shown." reportIfSubsetIsShown | l > summaryNSequences = [L.pack s] | otherwise = [] -- | Trim and show a list of 'Sequence's. body :: [Sequence] -> L.ByteString body ss = L.unlines (map summarize ss `using` parListChunk 5 rdeepseq) -- | Calculate length of 'Sequence'. length :: Sequence -> Int length = fromIntegral . V.length . characters -- | Check if all 'Sequence's have equal length. equalLength :: [Sequence] -> Bool equalLength = allEqual . map length -- | Find the longest 'Sequence' in a list. longest :: [Sequence] -> Sequence longest = maximumBy (comparing length) -- | Count number of gaps or unknown characters in sequence. countGaps :: Sequence -> Int countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s -- | Trim to given length. trim :: Int -> Sequence -> Sequence trim n (Sequence nm d a cs) = Sequence nm d a (V.take (fromIntegral n) cs) -- | Concatenate two sequences. 'Name's have to match. concat :: Sequence -> Sequence -> Sequence concat (Sequence i d c cs) (Sequence j f k ks) | i /= j = error $ "concatenate: Sequences do not have equal names: " ++ L.unpack i ++ ", " ++ L.unpack j ++ "." | d /= f = error $ "concatenate: Sequences do not have equal descriptions: " ++ L.unpack d ++ ", " ++ L.unpack f ++ "." | c /= k = error $ "concatenate: Sequences do not have equal alphabets: " ++ show c ++ ", " ++ show k ++ "." | otherwise = Sequence i d c (cs <> ks) -- | Concatenate a list of sequences, see 'concat'. concatSequences :: [[Sequence]] -> [Sequence] concatSequences [] = error "concatenateSequences: Nothing to concatenate." concatSequences [ss] = ss concatSequences sss = foldl1 (zipWith concat) sss -- | Only take 'Sequence's that are shorter than a given number. filterShorterThan :: Int -> [Sequence] -> [Sequence] filterShorterThan n = filter (\x -> length x < n) -- | Only take 'Sequence's that are longer than a given number. filterLongerThan :: Int -> [Sequence] -> [Sequence] filterLongerThan n = filter (\x -> length x > n) -- | Only take 'Sequence's that contain at least on non-IUPAC character. filterStandard :: [Sequence] -> [Sequence] filterStandard = filter (\s -> anyStandard (alphabet s) s) -- Are all characters IUPAC characters? anyStandard :: A.Alphabet -> Sequence -> Bool anyStandard a s = V.any (A.isStd a) cs where cs = characters s