{-# LANGUAGE OverloadedStrings #-}

{- |
Module      :  ELynx.Data.Sequence
Description :  Hereditary sequences
Copyright   :  (c) Dominik Schrempf 2018
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Thu Oct  4 18:54:51 2018.

This module is to be imported qualified.

-}

module ELynx.Data.Sequence.Sequence
  ( -- * Types
    Name
  , Characters
  , Sequence (..)
  -- * Input
  , fromByteString
  -- * Output
  , toByteString
  , header
  , summarize
  , summarizeSequences
  , body
  -- * Analysis
  , length
  , equalLength
  , longest
  -- * Manipulation
  , trim
  , concat
  , concatSequences
  -- * Filtering
  , filterShorterThan
  , filterLongerThan
  , filterStandard
  ) where

import           Control.Parallel.Strategies
import qualified Data.ByteString.Lazy.Char8    as L
import           Data.List                     (maximumBy)
import           Data.Ord                      (comparing)
import qualified Data.Vector.Unboxed           as V
import           Prelude                       hiding (concat, length)
import qualified Prelude                       as Pr (length)
import qualified Text.Printf                   as P

import qualified ELynx.Data.Alphabet.Alphabet  as A
import           ELynx.Data.Alphabet.Character
import           ELynx.Data.Sequence.Defaults
import           ELynx.Tools.ByteString
import           ELynx.Tools.Equality

-- | For now, 'Name's are just 'L.ByteString's.
type Name = L.ByteString

-- | The vector of characters of a sequence.
type Characters = V.Vector Character

-- | Convert byte string to sequence characters.
fromByteString :: L.ByteString -> Characters
fromByteString = V.fromList . map fromChar . L.unpack

-- | Convert sequence characters to byte string.
toByteString :: Characters -> L.ByteString
toByteString = L.pack . map toChar . V.toList

-- | Sequences have a name, a code and hopefully a lot of data.
data Sequence = Sequence { name       :: Name
                         , alphabet   :: A.Alphabet
                         , characters :: Characters }
  deriving (Show, Eq)

getInfo :: Sequence -> L.ByteString
getInfo s = L.unwords [ alignLeft nameWidth (name s)
                      , alignRight fieldWidth (L.pack $ show $ alphabet s)
                      , alignRight fieldWidth (L.pack . show $ len)
                      , alignRight fieldWidth (L.pack $ P.printf "%.3f" pGaps) ]
  where len = length s
        nGaps = countGaps s
        pGaps = fromIntegral nGaps / fromIntegral len :: Double

-- | Trim and show a 'Sequence'.
summarize :: Sequence -> L.ByteString
summarize s = L.unwords [ getInfo s , summarizeByteString summaryLength $ toByteString (characters s) ]

-- | Trim and show a list of 'Sequence's.
summarizeSequences :: [Sequence] -> L.ByteString
summarizeSequences ss = header ss <> body (take summaryNSequences ss)

-- | Header printed before 'Sequence' list.
tableHeader :: L.ByteString
tableHeader = L.unwords [ alignLeft  nameWidth "Name"
                        , alignRight fieldWidth        "Code"
                        , alignRight fieldWidth        "Length"
                        , alignRight fieldWidth        "Gaps [%]"
                        , "Sequence" ]

-- | A short description of the sequence.
header :: [Sequence] -> L.ByteString
header ss = L.unlines $
  reportIfSubsetIsShown ++
  [ L.pack $ "For each sequence, the " ++ show summaryLength ++ " first bases are shown."
  , L.pack $ "List contains " ++ show (Pr.length ss) ++ " sequences."
  , ""
  , tableHeader ]
  where l = Pr.length ss
        s = show summaryNSequences ++ " out of " ++
            show (Pr.length ss) ++ " sequences are shown."
        reportIfSubsetIsShown
          | l > summaryNSequences = [L.pack s]
          | otherwise = []

-- | Trim and show a list of 'Sequence's.
body :: [Sequence] -> L.ByteString
body ss = L.unlines (map summarize ss `using` parListChunk 5 rdeepseq)

-- | Calculate length of 'Sequence'.
length :: Sequence -> Int
length = fromIntegral . V.length . characters

-- | Check if all 'Sequence's have equal length.
equalLength :: [Sequence] -> Bool
equalLength = allEqual . map length

-- | Find the longest 'Sequence' in a list.
longest :: [Sequence] -> Sequence
longest = maximumBy (comparing length)

-- | Count number of gaps or unknown characters in sequence.
countGaps :: Sequence -> Int
countGaps s = V.length . V.filter (A.isGap $ alphabet s) $ characters s

-- | Trim to given length.
trim :: Int -> Sequence -> Sequence
trim n (Sequence nm a cs) = Sequence nm a (V.take (fromIntegral n) cs)

-- | Concatenate two sequences. 'Name's have to match.
concat :: Sequence -> Sequence -> Sequence
concat (Sequence i c cs) (Sequence j k ks)
  | i == j && c == k = Sequence i c (cs <> ks)
  | otherwise        = error $ "concatenate: Sequences do not have equal names: "
                       ++ L.unpack i ++ ", " ++ L.unpack j ++ "."

-- | Concatenate a list of sequences, see 'concat'.
concatSequences :: [[Sequence]] -> [Sequence]
concatSequences []   = error "concatenateSequences: Nothing to concatenate."
concatSequences [ss] = ss
concatSequences sss  = foldl1 (zipWith concat) sss

-- | Only take 'Sequence's that are shorter than a given number.
filterShorterThan :: Int -> [Sequence] -> [Sequence]
filterShorterThan n = filter (\x -> length x < n)

-- | Only take 'Sequence's that are longer than a given number.
filterLongerThan :: Int -> [Sequence] -> [Sequence]
filterLongerThan n = filter (\x -> length x > n)

-- | Only take 'Sequence's that contain at least on non-IUPAC character.
filterStandard :: [Sequence] -> [Sequence]
filterStandard = filter (\s -> anyStandard (alphabet s) s)

-- Are all characters IUPAC characters?
anyStandard :: A.Alphabet -> Sequence -> Bool
anyStandard a s = V.any (A.isStd a) cs
  where cs = characters s