{-# LANGUAGE OverloadedStrings #-}

-- |
-- Module      :  ELynx.Sequence
-- Description :  Hereditary sequences
-- Copyright   :  2021 Dominik Schrempf
-- License     :  GPL-3.0-or-later
--
-- Maintainer  :  dominik.schrempf@gmail.com
-- Stability   :  unstable
-- Portability :  portable
--
-- Creation date: Thu Oct  4 18:54:51 2018.
--
-- This module is to be imported qualified.
module ELynx.Sequence.Sequence
  ( -- * Types
    Name,
    Description,
    Characters,
    Sequence (..),

    -- * Input
    fromByteString,

    -- * Output
    toByteString,
    header,
    summarize,
    summarizeSequences,
    body,

    -- * Analysis
    length,
    equalLength,
    longest,

    -- * Manipulation
    trim,
    concat,
    concatSequences,

    -- * Filtering
    filterShorterThan,
    filterLongerThan,
    filterStandard,
  )
where

import Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (maximumBy)
import Data.Ord (comparing)
import qualified Data.Vector.Unboxed as V
import qualified ELynx.Alphabet.Alphabet as A
import ELynx.Alphabet.Character
import ELynx.Sequence.Defaults
import qualified Text.Printf as P
import Prelude hiding
  ( concat,
    length,
  )
import qualified Prelude as Pr
  ( length,
  )

-- | 'Name's are just 'BL.ByteString's.
type Name = BL.ByteString

-- | The description of a sequence.
type Description = BL.ByteString

-- | The vector of characters of a sequence.
type Characters = V.Vector Character

-- | Convert byte string to sequence characters.
fromByteString :: BL.ByteString -> Characters
fromByteString :: Name -> Characters
fromByteString = forall a. Unbox a => [a] -> Vector a
V.fromList forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Char -> Character
fromChar forall b c a. (b -> c) -> (a -> b) -> a -> c
. Name -> [Char]
BL.unpack

-- | Convert sequence characters to byte string.
toByteString :: Characters -> BL.ByteString
toByteString :: Characters -> Name
toByteString = [Char] -> Name
BL.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Character -> Char
toChar forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => Vector a -> [a]
V.toList

-- | Sequences have a name, a possibly empty description, a code and hopefully a
-- lot of data.
data Sequence = Sequence
  { Sequence -> Name
name :: Name,
    Sequence -> Name
description :: Description,
    Sequence -> Alphabet
alphabet :: A.Alphabet,
    Sequence -> Characters
characters :: Characters
  }
  deriving (Int -> Sequence -> ShowS
[Sequence] -> ShowS
Sequence -> [Char]
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
showList :: [Sequence] -> ShowS
$cshowList :: [Sequence] -> ShowS
show :: Sequence -> [Char]
$cshow :: Sequence -> [Char]
showsPrec :: Int -> Sequence -> ShowS
$cshowsPrec :: Int -> Sequence -> ShowS
Show, Sequence -> Sequence -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Sequence -> Sequence -> Bool
$c/= :: Sequence -> Sequence -> Bool
== :: Sequence -> Sequence -> Bool
$c== :: Sequence -> Sequence -> Bool
Eq)

alignRight :: Int -> BL.ByteString -> BL.ByteString
alignRight :: Int -> Name -> Name
alignRight Int
n Name
s =
  Int64 -> Char -> Name
BL.replicate (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n forall a. Num a => a -> a -> a
- Int64
l) Char
' ' forall a. Semigroup a => a -> a -> a
<> Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Name
s
  where
    l :: Int64
l = Name -> Int64
BL.length Name
s

alignLeft :: Int -> BL.ByteString -> BL.ByteString
alignLeft :: Int -> Name -> Name
alignLeft Int
n Name
s =
  Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Name
s forall a. Semigroup a => a -> a -> a
<> Int64 -> Char -> Name
BL.replicate (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n forall a. Num a => a -> a -> a
- Int64
l) Char
' '
  where
    l :: Int64
l = Name -> Int64
BL.length Name
s

getInfo :: Sequence -> BL.ByteString
getInfo :: Sequence -> Name
getInfo Sequence
s =
  [Name] -> Name
BL.unwords
    [ Int -> Name -> Name
alignLeft Int
nameWidth (Sequence -> Name
name Sequence
s),
      Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s),
      Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => a -> [Char]
show forall a b. (a -> b) -> a -> b
$ Int
len),
      Int -> Name -> Name
alignRight Int
fieldWidth ([Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ forall r. PrintfType r => [Char] -> r
P.printf [Char]
"%2.2f" Double
pGaps)
    ]
  where
    len :: Int
len = Sequence -> Int
length Sequence
s
    nGaps :: Int
nGaps = Sequence -> Int
countGaps Sequence
s
    pGaps :: Double
pGaps = Double
100 forall a. Num a => a -> a -> a
* forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
nGaps forall a. Fractional a => a -> a -> a
/ forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len :: Double

-- If a string is longer than a given value, trim it and add some dots.
summarizeByteString :: Int -> BL.ByteString -> BL.ByteString
summarizeByteString :: Int -> Name -> Name
summarizeByteString Int
l Name
s
  | Name -> Int64
BL.length Name
s forall a. Ord a => a -> a -> Bool
>= forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l = Int64 -> Name -> Name
BL.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l) Name
s forall a. Semigroup a => a -> a -> a
<> [Char] -> Name
BL.pack [Char]
"..."
  | Bool
otherwise = Name
s

-- | Trim and show a 'Sequence'.
summarize :: Sequence -> BL.ByteString
summarize :: Sequence -> Name
summarize Sequence
s =
  [Name] -> Name
BL.unwords
    [Sequence -> Name
getInfo Sequence
s, Int -> Name -> Name
summarizeByteString Int
summaryLength forall a b. (a -> b) -> a -> b
$ Characters -> Name
toByteString (Sequence -> Characters
characters Sequence
s)]

-- | Trim and show a list of 'Sequence's.
summarizeSequences :: [Sequence] -> BL.ByteString
summarizeSequences :: [Sequence] -> Name
summarizeSequences [Sequence]
ss = [Sequence] -> Name
header [Sequence]
ss forall a. Semigroup a => a -> a -> a
<> [Sequence] -> Name
body (forall a. Int -> [a] -> [a]
take Int
summaryNSequences [Sequence]
ss)

-- | Header printed before 'Sequence' list.
tableHeader :: BL.ByteString
tableHeader :: Name
tableHeader =
  [Name] -> Name
BL.unwords
    [ Int -> Name -> Name
alignLeft Int
nameWidth Name
"Name",
      Int -> Name -> Name
alignRight Int
fieldWidth Name
"Code",
      Int -> Name -> Name
alignRight Int
fieldWidth Name
"Length",
      Int -> Name -> Name
alignRight Int
fieldWidth Name
"Gaps [%]",
      Name
"Sequence"
    ]

-- | A short description of the sequence.
header :: [Sequence] -> BL.ByteString
header :: [Sequence] -> Name
header [Sequence]
ss =
  [Name] -> Name
BL.unlines forall a b. (a -> b) -> a -> b
$
    [Name]
reportIfSubsetIsShown
      forall a. [a] -> [a] -> [a]
++ [ [Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$
             [Char]
"For each sequence, the "
               forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Int
summaryLength
               forall a. [a] -> [a] -> [a]
++ [Char]
" first bases are shown.",
           [Char] -> Name
BL.pack forall a b. (a -> b) -> a -> b
$ [Char]
"List contains " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss) forall a. [a] -> [a] -> [a]
++ [Char]
" sequences.",
           Name
"",
           Name
tableHeader
         ]
  where
    l :: Int
l = forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss
    s :: [Char]
s =
      forall a. Show a => a -> [Char]
show Int
summaryNSequences
        forall a. [a] -> [a] -> [a]
++ [Char]
" out of "
        forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show (forall (t :: * -> *) a. Foldable t => t a -> Int
Pr.length [Sequence]
ss)
        forall a. [a] -> [a] -> [a]
++ [Char]
" sequences are shown."
    reportIfSubsetIsShown :: [Name]
reportIfSubsetIsShown
      | Int
l forall a. Ord a => a -> a -> Bool
> Int
summaryNSequences = [[Char] -> Name
BL.pack [Char]
s]
      | Bool
otherwise = []

-- | Trim and show a list of 'Sequence's.
body :: [Sequence] -> BL.ByteString
body :: [Sequence] -> Name
body [Sequence]
ss = [Name] -> Name
BL.unlines (forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Name
summarize [Sequence]
ss forall a. a -> Strategy a -> a
`using` forall a. Int -> Strategy a -> Strategy [a]
parListChunk Int
5 forall a. NFData a => Strategy a
rdeepseq)

-- | Calculate length of 'Sequence'.
length :: Sequence -> Int
length :: Sequence -> Int
length = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => Vector a -> Int
V.length forall b c a. (b -> c) -> (a -> b) -> a -> c
. Sequence -> Characters
characters

-- | Check if all 'Sequence's have equal length.
equalLength :: [Sequence] -> Bool
equalLength :: [Sequence] -> Bool
equalLength = forall {a}. Eq a => [a] -> Bool
allEqual forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Sequence -> Int
length
  where
    allEqual :: [a] -> Bool
allEqual [] = Bool
True
    allEqual [a]
xs = forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (forall a. Eq a => a -> a -> Bool
== forall a. [a] -> a
head [a]
xs) forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
tail [a]
xs

-- | Find the longest 'Sequence' in a list.
longest :: [Sequence] -> Sequence
longest :: [Sequence] -> Sequence
longest = forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing Sequence -> Int
length)

-- | Count number of gaps or unknown characters in sequence.
countGaps :: Sequence -> Int
countGaps :: Sequence -> Int
countGaps Sequence
s = forall a. Unbox a => Vector a -> Int
V.length forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Unbox a => (a -> Bool) -> Vector a -> Vector a
V.filter (Alphabet -> Character -> Bool
A.isGap forall a b. (a -> b) -> a -> b
$ Sequence -> Alphabet
alphabet Sequence
s) forall a b. (a -> b) -> a -> b
$ Sequence -> Characters
characters Sequence
s

-- | Trim to given length.
trim :: Int -> Sequence -> Sequence
trim :: Int -> Sequence -> Sequence
trim Int
n (Sequence Name
nm Name
d Alphabet
a Characters
cs) = Name -> Name -> Alphabet -> Characters -> Sequence
Sequence Name
nm Name
d Alphabet
a (forall a. Unbox a => Int -> Vector a -> Vector a
V.take (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n) Characters
cs)

-- | Concatenate two sequences. 'Name's have to match.
concat :: Sequence -> Sequence -> Sequence
concat :: Sequence -> Sequence -> Sequence
concat (Sequence Name
i Name
d Alphabet
c Characters
cs) (Sequence Name
j Name
f Alphabet
k Characters
ks)
  | Name
i forall a. Eq a => a -> a -> Bool
/= Name
j =
      forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
        [Char]
"concatenate: Sequences do not have equal names: "
          forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
i
          forall a. [a] -> [a] -> [a]
++ [Char]
", "
          forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
j
          forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | Name
d forall a. Eq a => a -> a -> Bool
/= Name
f =
      forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
        [Char]
"concatenate: Sequences do not have equal descriptions: "
          forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
d
          forall a. [a] -> [a] -> [a]
++ [Char]
", "
          forall a. [a] -> [a] -> [a]
++ Name -> [Char]
BL.unpack Name
f
          forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | Alphabet
c forall a. Eq a => a -> a -> Bool
/= Alphabet
k =
      forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$
        [Char]
"concatenate: Sequences do not have equal alphabets: "
          forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Alphabet
c
          forall a. [a] -> [a] -> [a]
++ [Char]
", "
          forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show Alphabet
k
          forall a. [a] -> [a] -> [a]
++ [Char]
"."
  | Bool
otherwise =
      Name -> Name -> Alphabet -> Characters -> Sequence
Sequence Name
i Name
d Alphabet
c (Characters
cs forall a. Semigroup a => a -> a -> a
<> Characters
ks)

-- | Concatenate a list of sequences, see 'concat'.
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences [] = forall a. HasCallStack => [Char] -> a
error [Char]
"concatenateSequences: Nothing to concatenate."
concatSequences [[Sequence]
ss] = [Sequence]
ss
concatSequences [[Sequence]]
sss = forall (t :: * -> *) a. Foldable t => (a -> a -> a) -> t a -> a
foldl1 (forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Sequence -> Sequence -> Sequence
concat) [[Sequence]]
sss

-- | Only take 'Sequence's that are shorter than a given number.
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan Int
n = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x forall a. Ord a => a -> a -> Bool
< Int
n)

-- | Only take 'Sequence's that are longer than a given number.
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan Int
n = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
x -> Sequence -> Int
length Sequence
x forall a. Ord a => a -> a -> Bool
> Int
n)

-- | Only take 'Sequence's that contain at least on non-IUPAC character.
filterStandard :: [Sequence] -> [Sequence]
filterStandard :: [Sequence] -> [Sequence]
filterStandard = forall a. (a -> Bool) -> [a] -> [a]
filter (\Sequence
s -> Alphabet -> Sequence -> Bool
anyStandard (Sequence -> Alphabet
alphabet Sequence
s) Sequence
s)

-- Are all characters IUPAC characters?
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard :: Alphabet -> Sequence -> Bool
anyStandard Alphabet
a Sequence
s = forall a. Unbox a => (a -> Bool) -> Vector a -> Bool
V.any (Alphabet -> Character -> Bool
A.isStd Alphabet
a) Characters
cs where cs :: Characters
cs = Sequence -> Characters
characters Sequence
s