module Bio.Seq
(
Basic
, IUPAC
, Ext
, DNA
, RNA
, Peptide
, BioSeq'(..)
, BioSeq(..)
, rc
, gcContent
, nucleotideFreq
) where
import qualified Data.ByteString.Char8 as B
import Data.Char8 (toUpper)
import qualified Data.HashMap.Strict as M
import qualified Data.HashSet as S
import Prelude hiding (length)
data Basic
data IUPAC
data Ext
newtype DNA alphabet = DNA B.ByteString
newtype RNA alphabet = RNA B.ByteString
newtype Peptide alphabet = Peptide B.ByteString
instance Show (DNA a) where
show (DNA s) = B.unpack s
instance Monoid (DNA a) where
mempty = DNA B.empty
mappend (DNA x) (DNA y) = DNA (x `mappend` y)
mconcat dnas = DNA . B.concat . map toBS $ dnas
class BioSeq' s where
toBS :: s a -> B.ByteString
slice :: Int -> Int -> s a -> s a
length :: s a -> Int
length = B.length . toBS
instance BioSeq' DNA where
toBS (DNA s) = s
slice i l (DNA s) = DNA . B.take l . B.drop i $ s
instance BioSeq' RNA where
toBS (RNA s) = s
slice i l (RNA s) = RNA . B.take l . B.drop i $ s
instance BioSeq' Peptide where
toBS (Peptide s) = s
slice i l (Peptide s) = Peptide . B.take l . B.drop i $ s
class BioSeq' s => BioSeq s a where
alphabet :: s a -> S.HashSet Char
fromBS :: B.ByteString -> s a
instance BioSeq DNA Basic where
alphabet _ = S.fromList "ACGT"
fromBS = DNA . B.map (f . toUpper)
where
f x | x `S.member` alphabet (undefined :: DNA Basic) = x
| otherwise = error $ "Bio.Seq.fromBS: unknown character: " ++ [x]
instance BioSeq DNA IUPAC where
alphabet _ = S.fromList "ACGTNVHDBMKWSYR"
fromBS = DNA . B.map (f . toUpper)
where
f x | x `S.member` alphabet (undefined :: DNA IUPAC) = x
| otherwise = error $ "Bio.Seq.fromBS: unknown character: " ++ [x]
instance BioSeq DNA Ext where
alphabet = undefined
fromBS = undefined
instance BioSeq RNA Basic where
alphabet _ = S.fromList "ACGU"
fromBS = RNA . B.map (f . toUpper)
where
f x | x `S.member` alphabet (undefined :: RNA Basic) = x
| otherwise = error $ "Bio.Seq.fromBS: unknown character: " ++ [x]
rc :: DNA alphabet -> DNA alphabet
rc (DNA s) = DNA . B.map f . B.reverse $ s
where
f x = case x of
'A' -> 'T'
'C' -> 'G'
'G' -> 'C'
'T' -> 'A'
_ -> x
gcContent :: DNA alphabet -> Double
gcContent = (\(a,b) -> a / fromIntegral b) . B.foldl' f (0.0,0::Int) . toBS
where
f (!x,!n) c =
let x' = case c of
'A' -> x
'C' -> x + 1
'G' -> x + 1
'T' -> x
'H' -> x + 0.25
'D' -> x + 0.25
'V' -> x + 0.75
'B' -> x + 0.75
'S' -> x + 1
'W' -> x
_ -> x + 0.5
in (x', n+1)
nucleotideFreq :: BioSeq DNA a => DNA a -> M.HashMap Char Int
nucleotideFreq dna = B.foldl' f m0 . toBS $ dna
where
m0 = M.fromList . zip (S.toList $ alphabet dna) . repeat $ 0
f m x = M.adjust (+1) x m