{-# LANGUAGE BangPatterns          #-}
{-# LANGUAGE FlexibleContexts      #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ScopedTypeVariables   #-}
module Bio.Seq
    (
    -- * Alphabet
      Basic
    , IUPAC
    , Ext
    -- * Sequence types
    , DNA
    , RNA
    , Peptide
    , BioSeq'(..)
    , BioSeq(..)
    -- * DNA related functions
    , rc
    , gcContent
    , nucleotideFreq
    ) where

import qualified Data.ByteString.Char8 as B
import           Data.Char8            (toUpper)
import qualified Data.HashMap.Strict   as M
import qualified Data.HashSet          as S
import           Data.Proxy            (Proxy (..))
import           Prelude               hiding (length)

-- | Alphabet defined by http://www.chem.qmul.ac.uk/iupac/
-- | Standard unambiguous alphabet
data Basic

-- | full IUPAC alphabet, including ambiguous letters
data IUPAC

-- | extended alphabet
data Ext

-- | DNA sequence
newtype DNA alphabet = DNA B.ByteString

-- | RNA sequence
newtype RNA alphabet = RNA B.ByteString

-- | Peptide sequence
newtype Peptide alphabet = Peptide B.ByteString

instance Show (DNA a) where
    show :: DNA a -> String
show (DNA ByteString
s) = ByteString -> String
B.unpack ByteString
s

instance Semigroup (DNA a) where
    <> :: DNA a -> DNA a -> DNA a
(<>) (DNA ByteString
x) (DNA ByteString
y) = ByteString -> DNA a
forall alphabet. ByteString -> DNA alphabet
DNA (ByteString
x ByteString -> ByteString -> ByteString
forall a. Semigroup a => a -> a -> a
<> ByteString
y)

instance Monoid (DNA a) where
    mempty :: DNA a
mempty = ByteString -> DNA a
forall alphabet. ByteString -> DNA alphabet
DNA ByteString
B.empty
    mconcat :: [DNA a] -> DNA a
mconcat [DNA a]
dnas = ByteString -> DNA a
forall alphabet. ByteString -> DNA alphabet
DNA (ByteString -> DNA a)
-> ([DNA a] -> ByteString) -> [DNA a] -> DNA a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [ByteString] -> ByteString
B.concat ([ByteString] -> ByteString)
-> ([DNA a] -> [ByteString]) -> [DNA a] -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (DNA a -> ByteString) -> [DNA a] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map DNA a -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS ([DNA a] -> DNA a) -> [DNA a] -> DNA a
forall a b. (a -> b) -> a -> b
$ [DNA a]
dnas

class BioSeq' s where
    toBS :: s a -> B.ByteString
    unsafeFromBS :: B.ByteString -> s a

    slice :: Int -> Int -> s a -> s a

    length :: s a -> Int
    length = ByteString -> Int
B.length (ByteString -> Int) -> (s a -> ByteString) -> s a -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. s a -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS
    {-# MINIMAL toBS, slice, unsafeFromBS #-}

instance BioSeq' DNA where
    toBS :: DNA a -> ByteString
toBS (DNA ByteString
s) = ByteString
s
    unsafeFromBS :: ByteString -> DNA a
unsafeFromBS = ByteString -> DNA a
forall alphabet. ByteString -> DNA alphabet
DNA
    slice :: Int -> Int -> DNA a -> DNA a
slice Int
i Int
l (DNA ByteString
s) = ByteString -> DNA a
forall alphabet. ByteString -> DNA alphabet
DNA (ByteString -> DNA a)
-> (ByteString -> ByteString) -> ByteString -> DNA a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.take Int
l (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.drop Int
i (ByteString -> DNA a) -> ByteString -> DNA a
forall a b. (a -> b) -> a -> b
$ ByteString
s

instance BioSeq' RNA where
    toBS :: RNA a -> ByteString
toBS (RNA ByteString
s) = ByteString
s
    unsafeFromBS :: ByteString -> RNA a
unsafeFromBS = ByteString -> RNA a
forall a. ByteString -> RNA a
RNA
    slice :: Int -> Int -> RNA a -> RNA a
slice Int
i Int
l (RNA ByteString
s) = ByteString -> RNA a
forall a. ByteString -> RNA a
RNA (ByteString -> RNA a)
-> (ByteString -> ByteString) -> ByteString -> RNA a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.take Int
l (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.drop Int
i (ByteString -> RNA a) -> ByteString -> RNA a
forall a b. (a -> b) -> a -> b
$ ByteString
s

instance BioSeq' Peptide where
    toBS :: Peptide a -> ByteString
toBS (Peptide ByteString
s) = ByteString
s
    unsafeFromBS :: ByteString -> Peptide a
unsafeFromBS = ByteString -> Peptide a
forall a. ByteString -> Peptide a
Peptide
    slice :: Int -> Int -> Peptide a -> Peptide a
slice Int
i Int
l (Peptide ByteString
s) = ByteString -> Peptide a
forall a. ByteString -> Peptide a
Peptide (ByteString -> Peptide a)
-> (ByteString -> ByteString) -> ByteString -> Peptide a
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.take Int
l (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> ByteString -> ByteString
B.drop Int
i (ByteString -> Peptide a) -> ByteString -> Peptide a
forall a b. (a -> b) -> a -> b
$ ByteString
s

class BioSeq' seq => BioSeq seq alphabet where
    alphabet :: Proxy (seq alphabet) -> S.HashSet Char
    fromBS :: B.ByteString -> Either String (seq alphabet)
    fromBS ByteString
input = case (Maybe Char -> Char -> (Maybe Char, Char))
-> Maybe Char -> ByteString -> (Maybe Char, ByteString)
forall acc.
(acc -> Char -> (acc, Char))
-> acc -> ByteString -> (acc, ByteString)
B.mapAccumL Maybe Char -> Char -> (Maybe Char, Char)
fun Maybe Char
forall a. Maybe a
Nothing ByteString
input of
        (Maybe Char
Nothing, ByteString
r) -> seq alphabet -> Either String (seq alphabet)
forall a b. b -> Either a b
Right (seq alphabet -> Either String (seq alphabet))
-> seq alphabet -> Either String (seq alphabet)
forall a b. (a -> b) -> a -> b
$ ByteString -> seq alphabet
forall (s :: * -> *) a. BioSeq' s => ByteString -> s a
unsafeFromBS ByteString
r
        (Just Char
e, ByteString
_)  -> String -> Either String (seq alphabet)
forall a b. a -> Either a b
Left (String -> Either String (seq alphabet))
-> String -> Either String (seq alphabet)
forall a b. (a -> b) -> a -> b
$ String
"Bio.Seq.fromBS: unknown character: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Char
e]
      where
        fun :: Maybe Char -> Char -> (Maybe Char, Char)
fun (Just Char
e) Char
x = (Char -> Maybe Char
forall a. a -> Maybe a
Just Char
e, Char
x)
        fun Maybe Char
Nothing Char
x = let x' :: Char
x' = Char -> Char
toUpper Char
x
                        in if Char
x' Char -> HashSet Char -> Bool
forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` Proxy (seq alphabet) -> HashSet Char
forall (seq :: * -> *) alphabet.
BioSeq seq alphabet =>
Proxy (seq alphabet) -> HashSet Char
alphabet (Proxy (seq alphabet)
forall k (t :: k). Proxy t
Proxy :: Proxy (seq alphabet))
                            then (Maybe Char
forall a. Maybe a
Nothing, Char
x')
                            else (Char -> Maybe Char
forall a. a -> Maybe a
Just Char
x', Char
x')
    {-# MINIMAL alphabet #-}

instance BioSeq DNA Basic where
    alphabet :: Proxy (DNA Basic) -> HashSet Char
alphabet Proxy (DNA Basic)
_ = String -> HashSet Char
forall a. (Eq a, Hashable a) => [a] -> HashSet a
S.fromList String
"ACGT"

instance BioSeq DNA IUPAC where
    alphabet :: Proxy (DNA IUPAC) -> HashSet Char
alphabet Proxy (DNA IUPAC)
_ = String -> HashSet Char
forall a. (Eq a, Hashable a) => [a] -> HashSet a
S.fromList String
"ACGTNVHDBMKWSYR"

instance BioSeq DNA Ext where
    alphabet :: Proxy (DNA Ext) -> HashSet Char
alphabet Proxy (DNA Ext)
_ = HashSet Char
forall a. HasCallStack => a
undefined

instance BioSeq RNA Basic where
    alphabet :: Proxy (RNA Basic) -> HashSet Char
alphabet Proxy (RNA Basic)
_ = String -> HashSet Char
forall a. (Eq a, Hashable a) => [a] -> HashSet a
S.fromList String
"ACGU"

-- | O(n) Reverse complementary of DNA sequence.
rc :: DNA alphabet -> DNA alphabet
rc :: DNA alphabet -> DNA alphabet
rc (DNA ByteString
s) = ByteString -> DNA alphabet
forall alphabet. ByteString -> DNA alphabet
DNA (ByteString -> DNA alphabet)
-> (ByteString -> ByteString) -> ByteString -> DNA alphabet
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> ByteString -> ByteString
B.map Char -> Char
f (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
B.reverse (ByteString -> DNA alphabet) -> ByteString -> DNA alphabet
forall a b. (a -> b) -> a -> b
$ ByteString
s
  where
    f :: Char -> Char
f Char
x = case Char
x of
        Char
'A' -> Char
'T'
        Char
'C' -> Char
'G'
        Char
'G' -> Char
'C'
        Char
'T' -> Char
'A'
        Char
_   -> Char
x

-- | O(n) Compute GC content.
gcContent :: DNA alphabet -> Double
gcContent :: DNA alphabet -> Double
gcContent = (\(Double
a,Int
b) -> Double
a Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
b) ((Double, Int) -> Double)
-> (DNA alphabet -> (Double, Int)) -> DNA alphabet -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ((Double, Int) -> Char -> (Double, Int))
-> (Double, Int) -> ByteString -> (Double, Int)
forall a. (a -> Char -> a) -> a -> ByteString -> a
B.foldl' (Double, Int) -> Char -> (Double, Int)
forall b a. (Num b, Fractional a) => (a, b) -> Char -> (a, b)
f (Double
0.0,Int
0::Int) (ByteString -> (Double, Int))
-> (DNA alphabet -> ByteString) -> DNA alphabet -> (Double, Int)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DNA alphabet -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS
  where
    f :: (a, b) -> Char -> (a, b)
f (!a
x,!b
n) Char
c =
        let x' :: a
x' = case Char
c of
                Char
'A' -> a
x
                Char
'C' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
1
                Char
'G' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
1
                Char
'T' -> a
x
                Char
'H' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
0.25
                Char
'D' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
0.25
                Char
'V' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
0.75
                Char
'B' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
0.75
                Char
'S' -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
1
                Char
'W' -> a
x
                Char
_   -> a
x a -> a -> a
forall a. Num a => a -> a -> a
+ a
0.5     -- "NMKYR"
        in (a
x', b
nb -> b -> b
forall a. Num a => a -> a -> a
+b
1)

-- | O(n) Compute single nucleotide frequency.
nucleotideFreq :: forall a . BioSeq DNA a => DNA a -> M.HashMap Char Int
nucleotideFreq :: DNA a -> HashMap Char Int
nucleotideFreq DNA a
dna = (HashMap Char Int -> Char -> HashMap Char Int)
-> HashMap Char Int -> ByteString -> HashMap Char Int
forall a. (a -> Char -> a) -> a -> ByteString -> a
B.foldl' HashMap Char Int -> Char -> HashMap Char Int
forall k v.
(Eq k, Hashable k, Num v) =>
HashMap k v -> k -> HashMap k v
f HashMap Char Int
m0 (ByteString -> HashMap Char Int)
-> (DNA a -> ByteString) -> DNA a -> HashMap Char Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DNA a -> ByteString
forall (s :: * -> *) a. BioSeq' s => s a -> ByteString
toBS (DNA a -> HashMap Char Int) -> DNA a -> HashMap Char Int
forall a b. (a -> b) -> a -> b
$ DNA a
dna
  where
    m0 :: HashMap Char Int
m0 = [(Char, Int)] -> HashMap Char Int
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(Char, Int)] -> HashMap Char Int)
-> (Int -> [(Char, Int)]) -> Int -> HashMap Char Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> [Int] -> [(Char, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip (HashSet Char -> String
forall a. HashSet a -> [a]
S.toList (HashSet Char -> String) -> HashSet Char -> String
forall a b. (a -> b) -> a -> b
$ Proxy (DNA a) -> HashSet Char
forall (seq :: * -> *) alphabet.
BioSeq seq alphabet =>
Proxy (seq alphabet) -> HashSet Char
alphabet (Proxy (DNA a)
forall k (t :: k). Proxy t
Proxy :: Proxy (DNA a))) ([Int] -> [(Char, Int)]) -> (Int -> [Int]) -> Int -> [(Char, Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Int]
forall a. a -> [a]
repeat (Int -> HashMap Char Int) -> Int -> HashMap Char Int
forall a b. (a -> b) -> a -> b
$ Int
0
    f :: HashMap k v -> k -> HashMap k v
f HashMap k v
m k
x = (v -> v) -> k -> HashMap k v -> HashMap k v
forall k v.
(Eq k, Hashable k) =>
(v -> v) -> k -> HashMap k v -> HashMap k v
M.adjust (v -> v -> v
forall a. Num a => a -> a -> a
+v
1) k
x HashMap k v
m
{-# INLINE nucleotideFreq #-}