{-# LANGUAGE OverloadedStrings #-} {- | Module : ELynx.Data.Sequence Description : Hereditary sequences Copyright : (c) Dominik Schrempf 2018 License : GPL-3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : portable Creation date: Thu Oct 4 18:54:51 2018. This module is to be imported qualified. -} module ELynx.Data.Sequence.Sequence ( -- * Types Name , Characters , Sequence (..) -- * Input , fromByteString -- * Output , toByteString , header , summarize , summarizeSequences , body -- * Analysis , length , equalLength , longest -- * Manipulation , trim , concat , concatSequences -- * Filtering , filterShorterThan , filterLongerThan , filterStandard ) where import Control.Parallel.Strategies import qualified Data.ByteString.Lazy.Char8 as L import Data.List (maximumBy) import Data.Ord (comparing) import qualified Data.Vector.Unboxed as V import Prelude hiding (concat, length) import qualified Prelude as Pr (length) import qualified Text.Printf as P import qualified ELynx.Data.Alphabet.Alphabet as A import ELynx.Data.Alphabet.Character import ELynx.Data.Sequence.Defaults import ELynx.Tools.ByteString import ELynx.Tools.Equality -- | For now, 'Name's are just 'L.ByteString's. type Name = L.ByteString -- | The vector of characters of a sequence. type Characters = V.Vector Character -- | Convert byte string to sequence characters. fromByteString :: L.ByteString -> Characters fromByteString = V.fromList . map fromChar . L.unpack -- | Convert sequence characters to byte string. toByteString :: Characters -> L.ByteString toByteString = L.pack . map toChar . V.toList -- | Sequences have a name, a code and hopefully a lot of data. data Sequence = Sequence { name :: Name , alphabet :: A.Alphabet , characters :: Characters } deriving (Show, Eq) getInfo :: Sequence -> L.ByteString getInfo s = L.unwords [ alignLeft nameWidth (name s) , alignRight fieldWidth (L.pack $ show $ alphabet s) , alignRight fieldWidth (L.pack . show $ len) , alignRight fieldWidth (L.pack $ P.printf "%.3f" pGaps) ] where len = length s nGaps = countGaps s pGaps = fromIntegral nGaps / fromIntegral len :: Double -- | Trim and show a 'Sequence'. summarize :: Sequence -> L.ByteString summarize s = L.unwords [ getInfo s , summarizeByteString summaryLength $ toByteString (characters s) ] -- | Trim and show a list of 'Sequence's. summarizeSequences :: [Sequence] -> L.ByteString summarizeSequences ss = header ss <> body (take summaryNSequences ss) -- | Header printed before 'Sequence' list. tableHeader :: L.ByteString tableHeader = L.unwords [ alignLeft nameWidth "Name" , alignRight fieldWidth "Code" , alignRight fieldWidth "Length" , alignRight fieldWidth "Gaps [%]" , "Sequence" ] -- | A short description of the sequence. header :: [Sequence] -> L.ByteString header ss = L.unlines $ reportIfSubsetIsShown ++ [ L.pack $ "For each sequence, the " ++ show summaryLength ++ " first bases are shown." , L.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences." , "" , tableHeader ] where l = Pr.length ss s = show summaryNSequences ++ " out of " ++ show (Pr.length ss) ++ " sequences are shown." reportIfSubsetIsShown | l > summaryNSequences = [L.pack s] | otherwise = [] -- | Trim and show a list of 'Sequence's. body :: [Sequence] -> L.ByteString body ss = L.unlines (map summarize ss `using` parListChunk 5 rdeepseq) -- | Calculate length of 'Sequence'. length :: Sequence -> Int length = fromIntegral . V.length . characters -- | Check if all 'Sequence's have equal length. equalLength :: [Sequence] -> Bool equalLength = allEqual . map length -- | Find the longest 'Sequence' in a list. longest :: [Sequence] -> Sequence longest = maximumBy (comparing length) -- | Count number of gaps or unknown characters in sequence. countGaps :: Sequence -> Int countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s -- | Trim to given length. trim :: Int -> Sequence -> Sequence trim n (Sequence nm a cs) = Sequence nm a (V.take (fromIntegral n) cs) -- | Concatenate two sequences. 'Name's have to match. concat :: Sequence -> Sequence -> Sequence concat (Sequence i c cs) (Sequence j k ks) | i == j && c == k = Sequence i c (cs <> ks) | otherwise = error $ "concatenate: Sequences do not have equal names: " ++ L.unpack i ++ ", " ++ L.unpack j ++ "." -- | Concatenate a list of sequences, see 'concat'. concatSequences :: [[Sequence]] -> [Sequence] concatSequences [] = error "concatenateSequences: Nothing to concatenate." concatSequences [ss] = ss concatSequences sss = foldl1 (zipWith concat) sss -- | Only take 'Sequence's that are shorter than a given number. filterShorterThan :: Int -> [Sequence] -> [Sequence] filterShorterThan n = filter (\x -> length x < n) -- | Only take 'Sequence's that are longer than a given number. filterLongerThan :: Int -> [Sequence] -> [Sequence] filterLongerThan n = filter (\x -> length x > n) -- | Only take 'Sequence's that contain at least on non-IUPAC character. filterStandard :: [Sequence] -> [Sequence] filterStandard = filter (\s -> anyStandard (alphabet s) s) -- Are all characters IUPAC characters? anyStandard :: A.Alphabet -> Sequence -> Bool anyStandard a s = V.any (A.isStd a) cs where cs = characters s