{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TypeSynonymInstances #-} -- | Encodings for nucleotides and pairs. Currently very ViennaRNA-centric. -- This might change over time. -- -- TODO do not export Nucleotide ctor? -- -- TODO With GHC 7.2 we should be able to activate Read,Show for the overlapped -- VU.Vector Nucleotide and get better string representations for 'Primary'. -- -- TODO nucE -> nucN ? -- -- TODO add all possible characters? module Biobase.RNA where import Control.DeepSeq import Data.Char (toUpper) import Data.Ix import Data.Primitive.Types import qualified Data.ByteString.Char8 as BSS import qualified Data.ByteString.Lazy.Char8 as BSL import qualified Data.Vector.Generic as VG import qualified Data.Vector.Generic.Mutable as VGM import qualified Data.Vector.Unboxed as VU import Biobase.RNA.NucBounds -- * single nucleotide encoding -- | Represent nucleotides as machine 'Int's newtype Nucleotide = Nucleotide Int deriving (Eq,Ord,Ix,NFData) -- | We have the canonical nucleotides A,C,G,U, unknown nucleotides as E and a -- chain symbol used to separate different nucleotide chains. (nucE: nucA:nucC:nucG:nucU: nucChain:_) = map Nucleotide [0..] -- | Conversion to and from nucleotides. class MkNucleotide a where mkNuc :: a -> Nucleotide fromNuc :: Nucleotide -> a -- | instance for characters instance MkNucleotide Char where mkNuc c' | Just n <- c `lookup` c2n = n | c=='&' = nucChain | otherwise = nucE where c2n = zip "ACGU" [nucA..nucU] c = toUpper c' fromNuc n | Just c <- n `lookup` n2c = c | n == nucChain = '&' | otherwise = 'E' where n2c = zip [nucA..nucU] "ACGU" -- | The enum instance. instance Enum Nucleotide where toEnum x | x>=0 && x<=5 = Nucleotide x | otherwise = error $ "can't make to enum" ++ show x fromEnum (Nucleotide x) = x {-# INLINE toEnum #-} {-# INLINE fromEnum #-} deriving instance Prim Nucleotide deriving instance VGM.MVector VU.MVector Nucleotide deriving instance VG.Vector VU.Vector Nucleotide deriving instance VU.Unbox Nucleotide -- | The Bounded instance uses bounds that are explicitly "too small" as other -- symbols have special meaning and should not be used with normal bounds. -- -- TODO think about better ways to handle this! instance Bounded Nucleotide where minBound = nucE maxBound = nucChain -- | The typical lookup table should use 'NucBounds' instead of 'Bounded' and -- here 'minExtended' and 'maxExtended' as this covers all characters one would -- typically expect. instance NucBounds Nucleotide where minNormal = nucA maxNormal = nucU minExtended = nucE maxExtended = nucU instance Show Nucleotide where show n = [fromNuc n] instance Read Nucleotide where readsPrec p [] = [] readsPrec p (x:xs) | x ==' ' = readsPrec p xs | Just n <- x `lookup` c2n = [(n,xs)] | otherwise = [] where c2n = zip "EACGU&" [nucE .. nucChain] -- * Representing nucleotide chains, aka primary sequence -- | We use plain unboxed vectors for the primary sequence type Primary = VU.Vector Nucleotide -- | Conversion from/to primary sequence class MkPrimary a where mkPrimary :: a -> Primary unPrimary :: Primary -> a instance MkPrimary [Nucleotide] where mkPrimary = VU.fromList unPrimary = VU.toList instance MkPrimary String where mkPrimary = VU.fromList . map mkNuc unPrimary = map fromNuc . VU.toList instance MkPrimary BSL.ByteString where mkPrimary = VU.fromList . map mkNuc . BSL.unpack unPrimary = BSL.pack . map fromNuc . VU.toList instance MkPrimary BSS.ByteString where mkPrimary = VU.fromList . map mkNuc . BSS.unpack unPrimary = BSS.pack . map fromNuc . VU.toList -- * Some convenience structures eacgu = [nucE..nucU] acgu = [nucA..nucU] acguPairs = [(x,y)|x<-acgu,y<-acgu] -- * Currently impossible instances (changing with ghc 7.2) {- TODO reactivate these instances with a later version of GHC instance Show Primary where show p = "mkPrimary " ++ unPrimary p instance Read Primary where readsPrec p xs | "mkPrimary " == chk = [(mkPrimary ps,ys)] | otherwise = error $ show (chk,others,ps,ys) where (chk,others) = splitAt 10 xs (ps,ys) = span (`elem` "ACGUE") others -}