-- | Secondary structure: define basepairs as Int-tuples, the three edges, a -- nucleotide can use for pairing and the cis/trans isomerism. Both edges and -- cis/trans come with a tag for "unknown". -- -- Since we often want to make "pairedness" explicit, we have a newtype for -- this as well. -- -- TODO set ext-annotations to be (isomerism,edge,edge) and have a asString -- instance to read "cWW" "tSH" and other notation. module Biobase.Secondary.Basepair where import Data.Aeson import Data.Binary import Data.Char (toLower, toUpper) import Data.Ix (Ix(..)) import Data.List as L import Data.Primitive.Types import Data.Serialize (Serialize) import Data.Tuple (swap) import Data.Vector.Fusion.Stream.Monadic (map,Step(..),flatten) import Data.Vector.Unboxed.Deriving import GHC.Base (remInt,quotInt) import GHC.Generics import qualified Data.Vector.Generic as VG import qualified Data.Vector.Generic.Mutable as VGM import qualified Data.Vector.Unboxed as VU import Text.Read import Biobase.Types.BioSequence import Data.PrimitiveArray hiding (Complement(..),map) import Biobase.Primary import Biobase.Primary.Nuc.RNA import Biobase.Primary.Nuc -- * Newtype for efficient basepair encoding. -- | Encode a base pair as a single @Int@. newtype Basepair = BP { getBP :: Int } deriving (Eq,Ord,Ix,Generic) derivingUnbox "Basepair" [t| Basepair -> Int |] [| getBP |] [| BP |] instance Binary Basepair instance Serialize Basepair instance FromJSON Basepair instance ToJSON Basepair instance Index Basepair where newtype LimitType Basepair = LtBP Basepair instance IndexStream z => IndexStream (z:.Basepair) where streamUp (ls:..LtBP (BP l)) (hs:..LtBP (BP h)) = flatten mk step $ streamUp ls hs where mk z = return (z,l) step (z,k) | k > h = return $ Done | otherwise = return $ Yield (z:.BP k) (z,k+1) {-# Inline [0] mk #-} {-# Inline [0] step #-} {-# Inline streamUp #-} streamDown (ls:..LtBP (BP l)) (hs:..LtBP (BP h)) = flatten mk step $ streamDown ls hs where mk z = return (z,h) step (z,k) | k < l = return $ Done | otherwise = return $ Yield (z:.BP k) (z,k-1) {-# Inline [0] mk #-} {-# Inline [0] step #-} {-# Inline streamDown #-} instance IndexStream Basepair pattern AA = BP 0 pattern AC = BP 1 pattern AG = BP 2 pattern AU = BP 3 pattern CA = BP 4 pattern CC = BP 5 pattern CG = BP 6 pattern CU = BP 7 pattern GA = BP 8 pattern GC = BP 9 pattern GG = BP 10 pattern GU = BP 11 pattern UA = BP 12 pattern UC = BP 13 pattern UG = BP 14 pattern UU = BP 15 pattern NS = BP 16 pattern NoBP = BP 17 {- class MkBasepair a where mkBasepair :: a -> Basepair fromBasepair :: Basepair -> a -- | If we get a "legal" base pair, we just create it, all other -- combinations yield 'NoBP'. Non-standard base pairs have to be created -- explicitly using @NS@. When going back to @a@, non-standard and no pair -- yield @(N,N)@. instance MkBasepair (Letter RNA,Letter RNA) where mkBasepair (l,r) | l >= A && l <= U && r >= A && r <= U = BP $ 4 * getLetter l + getLetter r | otherwise = NoBP fromBasepair k | k == NoBP || k == NS = (N,N) | otherwise = let (l,r) = getBP k `divMod` 4 in (Letter l, Letter r) {-# Inline mkBasepair #-} {-# Inline fromBasepair #-} -} -- * Newtypes for extended secondary structures -- ** Encode which of three edges is engaged in base pairing -- | Each nucleotide in a pair may be paired using one of three edges: -- watson-crick, sugar, or hoogsteen. newtype Edge = Edge {unEdge :: Int} deriving (Eq,Ord,Ix,Generic) pattern W = Edge 0 pattern S = Edge 1 pattern H = Edge 2 instance Binary Edge instance Serialize Edge instance FromJSON Edge instance ToJSON Edge -- | Human-readable Show instance. instance Show Edge where show H = "H" show S = "S" show W = "W" -- | Human-readable Read instance. instance Read Edge where readPrec = parens $ do Ident s <- lexP return $ case s of "H" -> H "S" -> S "W" -> W _ -> error $ "read Edge: " ++ s instance Bounded Edge where minBound = W maxBound = H instance Enum Edge where toEnum = Edge fromEnum = unEdge derivingUnbox "Edge" [t| Edge -> Int |] [| unEdge |] [| Edge |] -- ** Is the base pair in cis or trans configuration -- | Nucleotides in a pairing may be in the cis(==?) or trans(==?) state. newtype CTisomerism = CT {unCT :: Int} deriving (Eq,Ord,Ix,Generic) pattern Cis = CT 0 pattern Trn = CT 1 instance Binary CTisomerism instance Serialize CTisomerism instance FromJSON CTisomerism instance ToJSON CTisomerism -- | Human-readable Show instance. instance Show CTisomerism where show Cis = "C" show Trn = "T" -- | Human-readable Read instance. instance Read CTisomerism where readPrec = parens $ do Ident s <- lexP return $ case s of "C" -> Cis "T" -> Trn _ -> error $ "read CTisomerism: " ++ s instance Bounded CTisomerism where minBound = Cis maxBound = Trn instance Enum CTisomerism where toEnum = CT fromEnum = unCT derivingUnbox "CTisomerism" [t| CTisomerism -> Int |] [| unCT |] [| CT |] -- * Types -- | A basepair is simply a pair of Ints which are 0-indexing a sequence. type PairIdx = (Int,Int) -- | A pair as a tuple containing 'Nuc's. type Pair n = (Letter RNA n, Letter RNA n) -- | Annotation for a basepair. type ExtPairAnnotation = (CTisomerism,Edge,Edge) -- | An extended basepair is a basepair, annotated with edge and CTisomerism. type ExtPairIdx = (PairIdx,ExtPairAnnotation) -- | An extended basepair, with nucleotides an annotation. type ExtPair n = (Pair n, ExtPairAnnotation) -- * little helpers pattern CHH = (Cis,H,H) pattern CHS = (Cis,H,S) pattern CHW = (Cis,H,W) pattern CSH = (Cis,S,H) pattern CSS = (Cis,S,S) pattern CSW = (Cis,S,W) pattern CWH = (Cis,W,H) pattern CWS = (Cis,W,S) pattern CWW = (Cis,W,W) pattern THH = (Trn,H,H) pattern THS = (Trn,H,S) pattern THW = (Trn,H,W) pattern TSH = (Trn,S,H) pattern TSS = (Trn,S,S) pattern TSW = (Trn,S,W) pattern TWH = (Trn,W,H) pattern TWS = (Trn,W,S) pattern TWW = (Trn,W,W)