{-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE ScopedTypeVariables #-} module Bio.Seq ( -- * Alphabet Basic , IUPAC , Ext -- * Sequence types , DNA , RNA , Peptide , BioSeq'(..) , BioSeq(..) -- * DNA related functions , rc , gcContent , nucleotideFreq ) where import qualified Data.ByteString.Char8 as B import Data.Char8 (toUpper) import qualified Data.HashMap.Strict as M import qualified Data.HashSet as S import Data.Proxy (Proxy (..)) import Prelude hiding (length) -- | Alphabet defined by http://www.chem.qmul.ac.uk/iupac/ -- | Standard unambiguous alphabet data Basic -- | full IUPAC alphabet, including ambiguous letters data IUPAC -- | extended alphabet data Ext -- | DNA sequence newtype DNA alphabet = DNA B.ByteString -- | RNA sequence newtype RNA alphabet = RNA B.ByteString -- | Peptide sequence newtype Peptide alphabet = Peptide B.ByteString instance Show (DNA a) where show (DNA s) = B.unpack s instance Semigroup (DNA a) where (<>) (DNA x) (DNA y) = DNA (x <> y) instance Monoid (DNA a) where mempty = DNA B.empty mconcat dnas = DNA . B.concat . map toBS $ dnas class BioSeq' s where toBS :: s a -> B.ByteString unsafeFromBS :: B.ByteString -> s a slice :: Int -> Int -> s a -> s a length :: s a -> Int length = B.length . toBS {-# MINIMAL toBS, slice, unsafeFromBS #-} instance BioSeq' DNA where toBS (DNA s) = s unsafeFromBS = DNA slice i l (DNA s) = DNA . B.take l . B.drop i $ s instance BioSeq' RNA where toBS (RNA s) = s unsafeFromBS = RNA slice i l (RNA s) = RNA . B.take l . B.drop i $ s instance BioSeq' Peptide where toBS (Peptide s) = s unsafeFromBS = Peptide slice i l (Peptide s) = Peptide . B.take l . B.drop i $ s class BioSeq' seq => BioSeq seq alphabet where alphabet :: Proxy (seq alphabet) -> S.HashSet Char fromBS :: B.ByteString -> Either String (seq alphabet) fromBS input = case B.mapAccumL fun Nothing input of (Nothing, r) -> Right $ unsafeFromBS r (Just e, _) -> Left $ "Bio.Seq.fromBS: unknown character: " ++ [e] where fun (Just e) x = (Just e, x) fun Nothing x = let x' = toUpper x in if x' `S.member` alphabet (Proxy :: Proxy (seq alphabet)) then (Nothing, x') else (Just x', x') {-# MINIMAL alphabet #-} instance BioSeq DNA Basic where alphabet _ = S.fromList "ACGT" instance BioSeq DNA IUPAC where alphabet _ = S.fromList "ACGTNVHDBMKWSYR" instance BioSeq DNA Ext where alphabet _ = undefined instance BioSeq RNA Basic where alphabet _ = S.fromList "ACGU" -- | O(n) Reverse complementary of DNA sequence. rc :: DNA alphabet -> DNA alphabet rc (DNA s) = DNA . B.map f . B.reverse $ s where f x = case x of 'A' -> 'T' 'C' -> 'G' 'G' -> 'C' 'T' -> 'A' _ -> x -- | O(n) Compute GC content. gcContent :: DNA alphabet -> Double gcContent = (\(a,b) -> a / fromIntegral b) . B.foldl' f (0.0,0::Int) . toBS where f (!x,!n) c = let x' = case c of 'A' -> x 'C' -> x + 1 'G' -> x + 1 'T' -> x 'H' -> x + 0.25 'D' -> x + 0.25 'V' -> x + 0.75 'B' -> x + 0.75 'S' -> x + 1 'W' -> x _ -> x + 0.5 -- "NMKYR" in (x', n+1) -- | O(n) Compute single nucleotide frequency. nucleotideFreq :: forall a . BioSeq DNA a => DNA a -> M.HashMap Char Int nucleotideFreq dna = B.foldl' f m0 . toBS $ dna where m0 = M.fromList . zip (S.toList $ alphabet (Proxy :: Proxy (DNA a))) . repeat $ 0 f m x = M.adjust (+1) x m {-# INLINE nucleotideFreq #-}