{-# LANGUAGE OverloadedStrings #-}
module ELynx.Sequence.Sequence
(
Name,
Description,
Characters,
Sequence (..),
fromByteString,
toByteString,
header,
summarize,
summarizeSequences,
body,
length,
equalLength,
longest,
trim,
concat,
concatSequences,
filterShorterThan,
filterLongerThan,
filterStandard,
)
where
import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Alphabet.Alphabet as A
import ELynx.Alphabet.Character
import ELynx.Sequence.Defaults
import qualified Text.Printf as P
import Prelude hiding
( concat,
length,
)
import qualified Prelude as Pr
( length,
)
type Name = BL.ByteString
type Description = BL.ByteString
type Characters = V.Vector Character
fromByteString :: BL.ByteString -> Characters
fromByteString :: Name -> Characters
fromByteString = forall a. Unbox a => [a] -> Vector a
V.fromList forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Char -> Character
fromChar forall b c a. (b -> c) -> (a -> b) -> a -> c
. Name -> [Char]
BL.unpack
toByteString :: Characters -> BL.ByteString
toByteString :: Characters -> Name
toByteString = [Char] -> Name
BL.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Character -> Char
toChar forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => Vector a -> [a]
V.toList
data Sequence = Sequence
{ Sequence -> Name
name :: Name,
Sequence -> Name
description :: Description,
Sequence -> Alphabet
alphabet :: A.Alphabet,
Sequence -> Characters
characters :: Characters
}
deriving (Int -> Sequence -> ShowS
[Sequence] -> ShowS
Sequence -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Sequence] -> ShowS
$cshowList :: [Sequence] -> ShowS
show :: Sequence -> [Char]
$cshow :: Sequence -> [Char]
showsPrec :: Int -> Sequence -> ShowS
$cshowsPrec :: Int -> Sequence -> ShowS
Show, Sequence -> Sequence -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Sequence -> Sequence -> Bool
$c/= :: Sequence -> Sequence -> Bool
== :: Sequence -> Sequence -> Bool
$c== :: Sequence -> Sequence -> Bool
Eq)
alignRight :: Int -> BL.ByteString -> BL.ByteString
alignRight :: Int -> Name -> Name
alignRight Int
n Name
s =
Int64 -> Char -> Name
BL.replicate (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n forall a. Num a => a -> a -> a
- Int64
l) Char
' ' forall a. Semigroup a => a -> a -> a
<> Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Name
s
where
l :: Int64
l = Name -> Int64
BL.length Name
s
alignLeft :: Int -> BL.ByteString -> BL.ByteString
alignLeft :: Int -> Name -> Name
alignLeft Int
n Name
s =
Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Name
s forall a. Semigroup a => a -> a -> a
<> Int64 -> Char -> Name
BL.replicate (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n forall a. Num a => a -> a -> a
- Int64
l) Char
' '
where
l :: Int64
l = Name -> Int64
BL.length Name
s
getInfo :: Sequence -> BL.ByteString
getInfo :: Sequence -> Name
getInfo Sequence
s =
[Name] -> Name
BL.unwords
[ Int -> Name -> Name
alignLeft Int
nameWidth (Sequence -> Name
name Sequence
s),
Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s),
Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Int
len),
Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => [Char] -> r
P.printf [Char]
"%2.2f" Double
pGaps)
]
where
len :: Int
len = Sequence -> Int
length Sequence
s
nGaps :: Int
nGaps = Sequence -> Int
countGaps Sequence
s
pGaps :: Double
pGaps = Double
100 forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len :: Double
summarizeByteString :: Int -> BL.ByteString -> BL.ByteString
summarizeByteString :: Int -> Name -> Name
summarizeByteString Int
l Name
s
| Name -> Int64
BL.length Name
s forall a. Ord a => a -> a -> Bool
>= forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l = Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l) Name
s forall a. Semigroup a => a -> a -> a
<> [Char] -> Name
BL.pack [Char]
"..."
| Bool
otherwise = Name
s
summarize :: Sequence -> BL.ByteString
summarize :: Sequence -> Name
summarize Sequence
s =
[Name] -> Name
BL.unwords
[Sequence -> Name
getInfo Sequence
s, Int -> Name -> Name
summarizeByteString Int
summaryLength forall a b. (a -> b) -> a -> b
$ Characters -> Name
toByteString (Sequence -> Characters
characters Sequence
s)]
summarizeSequences :: [Sequence] -> BL.ByteString
summarizeSequences :: [Sequence] -> Name
summarizeSequences [Sequence]
ss = [Sequence] -> Name
header [Sequence]
ss forall a. Semigroup a => a -> a -> a
<> [Sequence] -> Name
body (forall a. Int -> [a] -> [a]
take Int
summaryNSequences [Sequence]
ss)
tableHeader :: BL.ByteString
=
[Name] -> Name
BL.unwords
[ Int -> Name -> Name
alignLeft Int
nameWidth Name
"Name",
Int -> Name -> Name
alignRight Int
fieldWidth Name
"Code",
Int -> Name -> Name
alignRight Int
fieldWidth Name
"Length",
Int -> Name -> Name
alignRight Int
fieldWidth Name
"Gaps [%]",
Name
"Sequence"
]
header :: [Sequence] -> BL.ByteString
[Sequence]
ss =
[Name] -> Name
BL.unlines forall a b. (a -> b) -> a -> b
$
[Name]
reportIfSubsetIsShown
forall a. [a] -> [a] -> [a]
++ [ [Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$
[Char]
"For each sequence, the "
forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
summaryLength
forall a. [a] -> [a] -> [a]
++ [Char]
" first bases are shown.",
[Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ [Char]
"List contains " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss) forall a. [a] -> [a] -> [a]
++ [Char]
" sequences.",
Name
"",
Name
tableHeader
]
where
l :: Int
l = forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss
s :: [Char]
s =
forall a. Show a => a -> [Char]
show Int
summaryNSequences
forall a. [a] -> [a] -> [a]
++ [Char]
" out of "
forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss)
forall a. [a] -> [a] -> [a]
++ [Char]
" sequences are shown."
reportIfSubsetIsShown :: [Name]
reportIfSubsetIsShown
| Int
l forall a. Ord a => a -> a -> Bool
> Int
summaryNSequences = [[Char] -> Name
BL.pack [Char]
s]
| Bool
otherwise = []
body :: [Sequence] -> BL.ByteString
body :: [Sequence] -> Name
body [Sequence]
ss = [Name] -> Name
BL.unlines (forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Name
summarize [Sequence]
ss forall a. a -> Strategy a -> a
`using` forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
5 forall a. NFData a => Strategy a
rdeepseq)
length :: Sequence -> Int
length :: Sequence -> Int
length = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => Vector a -> Int
V.length forall b c a. (b -> c) -> (a -> b) -> a -> c
. Sequence -> Characters
characters
equalLength :: [Sequence] -> Bool
equalLength :: [Sequence] -> Bool
equalLength = forall {a}. Eq a => [a] -> Bool
allEqual forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Int
length
where
allEqual :: [a] -> Bool
allEqual [] = Bool
True
allEqual [a]
xs = forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (forall a. Eq a => a -> a -> Bool
== forall a. [a] -> a
head [a]
xs) forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
tail [a]
xs
longest :: [Sequence] -> Sequence
longest :: [Sequence] -> Sequence
longest = forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing Sequence -> Int
length)
countGaps :: Sequence -> Int
countGaps :: Sequence -> Int
countGaps Sequence
s = forall a. Unbox a => Vector a -> Int
V.length forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isGap forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s) forall a b. (a -> b) -> a -> b
$ Sequence -> Characters
characters Sequence
s
trim :: Int -> Sequence -> Sequence
trim :: Int -> Sequence -> Sequence
trim Int
n (Sequence Name
nm Name
d Alphabet
a Characters
cs) = Name -> Name -> Alphabet -> Characters -> Sequence
Sequence Name
nm Name
d Alphabet
a (forall a. Unbox a => Int -> Vector a -> Vector a
V.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Characters
cs)
concat :: Sequence -> Sequence -> Sequence
concat :: Sequence -> Sequence -> Sequence
concat (Sequence Name
i Name
d Alphabet
c Characters
cs) (Sequence Name
j Name
f Alphabet
k Characters
ks)
| Name
i forall a. Eq a => a -> a -> Bool
/= Name
j =
forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal names: "
forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
i
forall a. [a] -> [a] -> [a]
++ [Char]
", "
forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
j
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| Name
d forall a. Eq a => a -> a -> Bool
/= Name
f =
forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal descriptions: "
forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
d
forall a. [a] -> [a] -> [a]
++ [Char]
", "
forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
f
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| Alphabet
c forall a. Eq a => a -> a -> Bool
/= Alphabet
k =
forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
[Char]
"concatenate: Sequences do not have equal alphabets: "
forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Alphabet
c
forall a. [a] -> [a] -> [a]
++ [Char]
", "
forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Alphabet
k
forall a. [a] -> [a] -> [a]
++ [Char]
"."
| Bool
otherwise =
Name -> Name -> Alphabet -> Characters -> Sequence
Sequence Name
i Name
d Alphabet
c (Characters
cs forall a. Semigroup a => a -> a -> a
<> Characters
ks)
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = forall a. HasCallStack => [Char] -> a
error [Char]
"concatenateSequences: Nothing to concatenate."
concatSequences [[Sequence]
ss] = [Sequence]
ss
concatSequences [[Sequence]]
sss = forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 (forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Sequence -> Sequence -> Sequence
concat) [[Sequence]]
sss
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan Int
n = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x forall a. Ord a => a -> a -> Bool
< Int
n)
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan Int
n = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x forall a. Ord a => a -> a -> Bool
> Int
n)
filterStandard :: [Sequence] -> [Sequence]
filterStandard :: [Sequence] -> [Sequence]
filterStandard = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
s -> Alphabet -> Sequence -> Bool
anyStandard (Sequence -> Alphabet
alphabet Sequence
s) Sequence
s)
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard :: Alphabet -> Sequence -> Bool
anyStandard Alphabet
a Sequence
s = forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.any (Alphabet -> Character -> Bool
A.isStd Alphabet
a) Characters
cs where cs :: Characters
cs = Sequence -> Characters
characters Sequence
s