{-# LANGUAGE OverloadedStrings #-} -- | -- Module : ELynx.Data.Sequence -- Description : Hereditary sequences -- Copyright : (c) Dominik Schrempf 2018 -- License : GPL-3.0-or-later -- -- Maintainer : dominik.schrempf@gmail.com -- Stability : unstable -- Portability : portable -- -- Creation date: Thu Oct 4 18:54:51 2018. -- -- This module is to be imported qualified. module ELynx.Data.Sequence.Sequence ( -- * Types Name, Description, Characters, Sequence (..), -- * Input fromByteString, -- * Output toByteString, header, summarize, summarizeSequences, body, -- * Analysis length, equalLength, longest, -- * Manipulation trim, concat, concatSequences, -- * Filtering filterShorterThan, filterLongerThan, filterStandard, ) where import Control.Parallel.Strategies import qualified Data.ByteString.Lazy.Char8 as BL import Data.List (maximumBy) import Data.Ord (comparing) import qualified Data.Vector.Unboxed as V import qualified ELynx.Data.Alphabet.Alphabet as A import ELynx.Data.Alphabet.Character import ELynx.Data.Sequence.Defaults import qualified Text.Printf as P import Prelude hiding ( concat, length, ) import qualified Prelude as Pr ( length, ) -- | For now, 'Name's are just 'BL.ByteString's. type Name = BL.ByteString -- | The description of a sequence. type Description = BL.ByteString -- | The vector of characters of a sequence. type Characters = V.Vector Character -- | Convert byte string to sequence characters. fromByteString :: BL.ByteString -> Characters fromByteString = V.fromList . map fromChar . BL.unpack -- | Convert sequence characters to byte string. toByteString :: Characters -> BL.ByteString toByteString = BL.pack . map toChar . V.toList -- | Sequences have a name, a possibly empty description, a code and hopefully a -- lot of data. data Sequence = Sequence { name :: Name, description :: Description, alphabet :: A.Alphabet, characters :: Characters } deriving (Show, Eq) alignRight :: Int -> BL.ByteString -> BL.ByteString alignRight n s = BL.replicate (fromIntegral n - l) ' ' <> BL.take (fromIntegral n) s where l = BL.length s alignLeft :: Int -> BL.ByteString -> BL.ByteString alignLeft n s = BL.take (fromIntegral n) s <> BL.replicate (fromIntegral n - l) ' ' where l = BL.length s getInfo :: Sequence -> BL.ByteString getInfo s = BL.unwords [ alignLeft nameWidth (name s), alignRight fieldWidth (BL.pack $ show $ alphabet s), alignRight fieldWidth (BL.pack . show $ len), alignRight fieldWidth (BL.pack $ P.printf "%2.2f" pGaps) ] where len = length s nGaps = countGaps s pGaps = 100 * fromIntegral nGaps / fromIntegral len :: Double -- If a string is longer than a given value, trim it and add some dots. summarizeByteString :: Int -> BL.ByteString -> BL.ByteString summarizeByteString l s | BL.length s >= fromIntegral l = BL.take (fromIntegral l) s <> BL.pack "..." | otherwise = s -- | Trim and show a 'Sequence'. summarize :: Sequence -> BL.ByteString summarize s = BL.unwords [getInfo s, summarizeByteString summaryLength $ toByteString (characters s)] -- | Trim and show a list of 'Sequence's. summarizeSequences :: [Sequence] -> BL.ByteString summarizeSequences ss = header ss <> body (take summaryNSequences ss) -- | Header printed before 'Sequence' list. tableHeader :: BL.ByteString tableHeader = BL.unwords [ alignLeft nameWidth "Name", alignRight fieldWidth "Code", alignRight fieldWidth "Length", alignRight fieldWidth "Gaps [%]", "Sequence" ] -- | A short description of the sequence. header :: [Sequence] -> BL.ByteString header ss = BL.unlines $ reportIfSubsetIsShown ++ [ BL.pack $ "For each sequence, the " ++ show summaryLength ++ " first bases are shown.", BL.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences.", "", tableHeader ] where l = Pr.length ss s = show summaryNSequences ++ " out of " ++ show (Pr.length ss) ++ " sequences are shown." reportIfSubsetIsShown | l > summaryNSequences = [BL.pack s] | otherwise = [] -- | Trim and show a list of 'Sequence's. body :: [Sequence] -> BL.ByteString body ss = BL.unlines (map summarize ss `using` parListChunk 5 rdeepseq) -- | Calculate length of 'Sequence'. length :: Sequence -> Int length = fromIntegral . V.length . characters -- | Check if all 'Sequence's have equal length. equalLength :: [Sequence] -> Bool equalLength = allEqual . map length where allEqual [] = True allEqual xs = all (== head xs) $ tail xs -- | Find the longest 'Sequence' in a list. longest :: [Sequence] -> Sequence longest = maximumBy (comparing length) -- | Count number of gaps or unknown characters in sequence. countGaps :: Sequence -> Int countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s -- | Trim to given length. trim :: Int -> Sequence -> Sequence trim n (Sequence nm d a cs) = Sequence nm d a (V.take (fromIntegral n) cs) -- | Concatenate two sequences. 'Name's have to match. concat :: Sequence -> Sequence -> Sequence concat (Sequence i d c cs) (Sequence j f k ks) | i /= j = error $ "concatenate: Sequences do not have equal names: " ++ BL.unpack i ++ ", " ++ BL.unpack j ++ "." | d /= f = error $ "concatenate: Sequences do not have equal descriptions: " ++ BL.unpack d ++ ", " ++ BL.unpack f ++ "." | c /= k = error $ "concatenate: Sequences do not have equal alphabets: " ++ show c ++ ", " ++ show k ++ "." | otherwise = Sequence i d c (cs <> ks) -- | Concatenate a list of sequences, see 'concat'. concatSequences :: [[Sequence]] -> [Sequence] concatSequences [] = error "concatenateSequences: Nothing to concatenate." concatSequences [ss] = ss concatSequences sss = foldl1 (zipWith concat) sss -- | Only take 'Sequence's that are shorter than a given number. filterShorterThan :: Int -> [Sequence] -> [Sequence] filterShorterThan n = filter (\x -> length x < n) -- | Only take 'Sequence's that are longer than a given number. filterLongerThan :: Int -> [Sequence] -> [Sequence] filterLongerThan n = filter (\x -> length x > n) -- | Only take 'Sequence's that contain at least on non-IUPAC character. filterStandard :: [Sequence] -> [Sequence] filterStandard = filter (\s -> anyStandard (alphabet s) s) -- Are all characters IUPAC characters? anyStandard :: A.Alphabet -> Sequence -> Bool anyStandard a s = V.any (A.isStd a) cs where cs = characters s