-- | Wrappers for structural data. Encoded as bytestrings. This differs from -- @BiobaseXNA@, where specialized encodings are used. These structures are -- supposedly "short", they need to fit into a strict bytestring. -- -- TODO Consider where to move each type. There are merge possibilities between -- BiobaseXNA and BiobaseTypes. -- -- TODO QuickCheck @Arbitrary@ for @RNAss@. module Biobase.Types.Structure where import Control.Applicative import Control.DeepSeq import Control.Lens import Control.Monad.Error.Class import Control.Monad (foldM,unless) import Data.Attoparsec.ByteString.Char8 import Data.Attoparsec.Combinator import Data.Bifunctor (second) import Data.ByteString (ByteString) import Data.Data import Data.List (foldl1',foldl') import Data.Monoid ((<>)) import Data.Set (Set) import GHC.Generics (Generic) import qualified Data.ByteString.Char8 as BS8 import qualified Data.List as L import qualified Data.Set as S import qualified Data.Set as Set import qualified Data.Vector.Unboxed as VU import qualified Test.QuickCheck as Q import Data.Forest.StructuredPaired -- | Secondary structure using @()@ for paired elements, and @.@ for unpaired -- ones. It is assumed that the @()@ match up. These structures from a Monoid. newtype RNAss = RNAss { _rnass ∷ ByteString } deriving (Eq,Ord,Show,Read,Data,Typeable,Generic,Semigroup,Monoid) makeLenses ''RNAss instance NFData RNAss -- | Ensemble structure encoding. *Very* different type ctor name chosen! The -- structure of this string makes verification much more complicated. -- -- TODO describe encoding used by RNAfold for the ensemble string. newtype RNAensembleStructure = RNAes { _rnaes ∷ ByteString } deriving (Eq,Ord,Show,Read,Data,Typeable,Generic) makeLenses ''RNAensembleStructure instance NFData RNAensembleStructure -- | Cofolded structure. data RNAds = RNAds { _rnadsL ∷ !ByteString , _rnadsR ∷ !ByteString } deriving (Eq,Ord,Show,Read,Data,Typeable,Generic) makeLenses ''RNAds instance NFData RNAds -- | A Prism that turns ByteStrings with a single @&@ into @RNAds@. rnads ∷ Prism' ByteString RNAds rnads = prism (\(RNAds l r) → BS8.concat [l, "&", r]) (\s → case BS8.split '&' s of [l,r] → Right (RNAds l r) ; _ → Left s) {-# Inline rnads #-} -- | Isomorphism from @RNAds@ to @(RNAss,RNAss)@. The @RNAss@ are only -- legal if taken both: @rnassFromDimer . both@. rnads2rnassPair ∷ Iso' RNAds (RNAss, RNAss) rnads2rnassPair = iso (\(RNAds l r) → (RNAss l, RNAss r)) (\(RNAss l, RNAss r) → RNAds l r) {-# Inline rnads2rnassPair #-} -- | Try to create a dimeric structure. mkRNAds ∷ (Monad m, MonadError RNAStructureError m) ⇒ ByteString → m RNAds mkRNAds q = BS8.split '&' q & \case [l,r] → do -- TODO can still fail with unmatched brackets. return $ RNAds { _rnadsL = l , _rnadsR = r } _ → throwError $ RNAStructureError "mkRNAds: not a dimer" q {-# Inline mkRNAds #-} -- | Capture what might be wrong with the RNAss. data RNAStructureError = RNAStructureError { _rnaStructureError ∷ String , _rnaOffender ∷ ByteString } deriving (Show,Generic) instance NFData RNAStructureError -- | Verifies that the given RNAss is properly formatted. Otherwise, error out. -- -- TODO Implement! Check with BiobaseXNA and the stack effort in there. This -- might influence if the verification goes into BiobaseXNA and happens via an -- @Iso'@. verifyRNAss ∷ (Monad m, MonadError RNAStructureError m) ⇒ RNAss → m RNAss verifyRNAss ss = do return ss -- | The set of nucleotide pairs, together with the sequence length. data RNApset = RNApset { _rnapset ∷ !(Set (Int,Int)) -- ^ the set of nucleotide pairs. , _rnapsetSLen ∷ !Int -- ^ length of the underlying nucleotide sequence. } deriving (Read,Show,Eq,Ord,Generic) makeLenses ''RNApset instance NFData RNApset -- | Transform an 'RNAss' into a set of base pairs @(i,j)@. The pairs are -- 0-based. rnassPairSet ∷ (MonadError String m) ⇒ RNAss → m RNApset rnassPairSet (RNAss s2) = do let go (set,ks ) (i,'(') = return (set,i:ks) go (set,i:is) (j,')') = return (Set.insert (i,j) set, is) go (set,[] ) (j,')') = throwError $ "unequal brackets in \"" ++ BS8.unpack s2 ++ "\" at position: " ++ show j go (set,ks ) (_,'.') = return (set,ks) (set,ss) ← foldM go (Set.empty,[]) . L.zip [0..] $ BS8.unpack s2 unless (null ss) $ throwError $ "unequal brackets in \"" ++ BS8.unpack s2 ++ "\" with opening bracket(s): " ++ show ss return $ RNApset set (BS8.length s2) {-# Inlinable rnassPairSet #-} -- | Genereate a simple structured/paired forest from a secondary structure string. rnassSPForest ∷ (MonadError String m) ⇒ RNAss → m (SPForest ByteString Char) rnassSPForest (RNAss s2) = either throwError return $ parseOnly (manyElems <* endOfInput) s2 where tree = SPT <$> char '(' <*> someElems <*> char ')' "SPT" unpaired = SPR <$> takeWhile1 (=='.') "SPR" someElems = SPJ <$> many1 (tree <|> unpaired) "many1 SPT / SPR" manyElems = (\case {[] → SPE; xs → SPJ xs}) <$> many (tree <|> unpaired) "many0 SPT / SPR" {-# Inlinable rnassSPForest #-} -- | Compactify such an SPForest. This means that all stems are now represented -- by a single 'SPT' data constructor. compactifySPForest ∷ SPForest ByteString Char → SPForest ByteString ByteString compactifySPForest = go . second BS8.singleton where go SPE = SPE go (SPR x) = SPR x go (SPJ xs) = SPJ (map go xs) go (SPT l (SPJ [x]) r) = go $ SPT l x r go (SPT l (SPT l' t r') r) = go $ SPT (l <> l') t (r' <> r) go (SPT l t r) = SPT l (go t) r -- | RNA pair set, but a transformation error calls @error@. rnassPairSet' ∷ RNAss → RNApset rnassPairSet' = either error id . rnassPairSet rnapsetRNAss ∷ RNApset → RNAss rnapsetRNAss (RNApset ps l) = RNAss $ BS8.pack $ VU.toList xs where xs = VU.replicate l '.' VU.// ls VU.// rs ls = L.map ((,'(') . fst) $ S.toList ps rs = L.map ((,')') . snd) $ S.toList ps -- | Calculates the number of different base pairs between two structures. This -- ignores the length of the underlying sequences. pairDist ∷ RNApset → RNApset → Int pairDist (RNApset p1 _) (RNApset p2 _) = Set.size z1 + Set.size z2 where i = Set.intersection p1 p2 z1 = p1 `Set.difference` i z2 = p2 `Set.difference` i -- * Arbitrary instances. This only creates legal instances, but does *not* -- take into account ViennaRNA rules like three unpaired nucleotides in the -- hairpin. -- -- TODO @shrink@ is a bit more complicated, but can be done via a set of pairs. instance Q.Arbitrary RNApset where arbitrary = do -- Given left and right bounds, create pairs. let go ∷ Int → Int → Q.Gen (Set (Int,Int)) go l r | l >= r = return S.empty | otherwise = do -- right border of stack c ∷ Int ← Q.oneof [ Q.choose (l+1,r) -- wide jump , Q.choose (l+1, min r $ l+20) -- short jump ] -- with @1..10@ stack length z ∷ Int ← Q.choose (0,5) let stack = S.fromList [(l+k,c-k) | k ← [0..z-1], l+k+1 < c-k] right ← go (c+1) r return $ S.union stack right -- generate RNA structures between 0 and 100 nucleotides. l ∷ Int ← Q.choose (0,199) s ← go 0 l return $ RNApset s (l+1) instance Q.Arbitrary RNAss where arbitrary = rnapsetRNAss <$> Q.arbitrary