-- | Everything needed to wrap @RNAfold@ results, parse lines with @RNAfold@ -- input, fold a sequence with @rnafold@, and more. module BioInf.ViennaRNA.RNAfold ( module BioInf.ViennaRNA.RNAfold , module BioInf.ViennaRNA.Types ) where import Control.Arrow import Control.DeepSeq import Control.Lens import Control.Monad (guard) import Data.Attoparsec.ByteString as A import Data.Attoparsec.ByteString.Char8 as A8 import Data.ByteString.Builder import Data.ByteString (ByteString) import Data.ByteString.Char8 as BS import Data.Char (toUpper) import Data.Maybe.Strict import Data.Monoid import Data.Tuple (swap) import Debug.Trace import GHC.Generics (Generic) import Prelude hiding(Maybe(..)) import Test.QuickCheck as QC import Text.Printf import Biobase.Types.Energy import Biobase.Types.Sequence (mkRNAseq, RNAseq(..), rnaseq) import Biobase.Types.Structure import qualified BioInf.ViennaRNA.Bindings as Bindings import BioInf.ViennaRNA.Internal import BioInf.ViennaRNA.Types -- | Space-efficient RNAfold structure. RNAfold allows a DNA input, but this is -- not allowed in this data structure. If you want that, keep the original -- input string around. -- -- This structure provides only @Getter@'s. Only the @sequenceID@ can be -- updated via @sequenceIDlens@. This is to prevent accidental updates of -- fields that are actually interdependent. -- -- Missing parts are sloppily encoded by bogus values and empty strings. All -- @Folded@ structures and derived values are lazy. In case @rnafold@ was used -- to construct the structure, calculations are deferred until needed. -- -- TODO newtype the sequence id. -- -- TODO temperature. How to encode? Kelvin? In BiobaseTypes! Could use (Nat)SciTypes! -- -- TODO complete BP probability array -- -- TODO lazy fields for computation on demand! -- -- TODO Wrapped via @Maybe@? data RNAfold = RNAfold { _sequenceID ∷ !ByteString -- ^ Set to @not . null@ if the sequence was given a name. This is (likely) a -- fasta-style identifier. , _input ∷ !RNAseq -- ^ The input sequence, converting into an RNA string. , _mfe ∷ !Folded -- ^ Minimum-free energy and corresponding structure. , _mfeFrequency ∷ !Double -- ^ The mfe frequency can be calculated as follows: @exp ((ensemble energy - -- mfe energy) / kT)@. -- -- ^ TODO newtype wrapper? , _ensemble ∷ !Folded -- ^ Uses special syntax with unpaired, weakly paired, somewhat paired, -- somewhat paired up or down, strongly paired up or down for the ensemble. -- The energy is the *ensemble free energy*. , _centroid ∷ !Folded -- ^ Centroid energy and structure. , _centroidDistance ∷ !Double -- ^ Centroid distance to ensemble. , _diversity ∷ !Double -- ^ Average basepair distance between all structures in the Boltzmann -- ensemble -- -- TODO Needs own newtype? , _temperature ∷ !(Maybe Double) -- ^ Temperature in Celsius -- -- TODO own newtype Celsius } deriving (Read,Show,Eq,Ord,Generic) makeLensesWith (lensRules & generateUpdateableOptics .~ False) ''RNAfold makeLensesFor [("_sequenceID", "sequenceIDlens")] ''RNAfold instance NFData RNAfold -- | Fold a sequence. -- -- This function is threadsafe via the viennaRNA-mutex. -- -- TODO add temperature parameter -- -- TODO consider creating a "super-lens" that updates whenever @_input@ or -- @_temperature@ change. rnafold ∷ RNAseq → RNAfold rnafold _input = unsafePerformIO . withMutex $! do let _temperature = Just 37 let _sequenceID = "" _mfe ← uncurry Folded . swap <$> (DG *** RNAss) <$> Bindings.mfe (_input^.rnaseq) (_centroid, _centroidDistance) ← (\(e,s,d) → (Folded (RNAss s) (DG e), d)) <$> Bindings.centroidTemp 37 (_input^.rnaseq) -- fucked up from here let k0 = 273.15 let gasconst = 1.98717 -- in kcal * (K^(-1)) * (mol^(-1)) let kT = (k0 + 37) * gasconst * 1000 let _ensemble = Folded (RNAss "DO NOT USE ME") (DG 999999) let _diversity = 999999 -- the energy of the mfe structure calculated with @dangles=1@ model, -- otherwise we get different mfe frequency values compared to rnafold. let d1mfeenergy = 0 let _mfeFrequency = exp $ (_centroid^.foldedEnergy.to dG - d1mfeenergy) / kT return $! RNAfold {..} {-# NoInline rnafold #-} data RNArewrite = NoRewrite | ForceRNA deriving (Read,Show,Eq,Ord,Generic) -- | Parsing for 'RNAfold'. This should parse all variants that @RNAfold@ -- produces. -- -- The parser for a 'Folded' structure has to deal with different "energy" -- types. The different energies are bracketed by different types of brackets. -- -- @ -- mfe (((...))) ( -1.20) -- ensemble (((...))) [ -1.41] -- centroid (((...))) { -1.20 d=1.06} -- @ -- -- TODO Move into submodule. -- -- TODO pipes-based streaming parser -- -- TODO how to handle parsing the BP probability array, if known? -- -- TODO I think it is possible to figure the line type based on the energy and -- the brackets around the energy. pRNAfold ∷ RNArewrite → Double → Parser RNAfold pRNAfold r celsius = do let _temperature = Just celsius let endedLine = A.takeTill isEndOfLine <* endOfLine let s2line = A8.takeWhile1 (`BS.elem` "(.)") <* skipSpace let ensline = A8.takeWhile1 (`BS.elem` "(){}.,|") <* skipSpace let nope = Folded (RNAss "") (DG 0) let rewrite = case r of NoRewrite → id ForceRNA → BS.map (go . toUpper) where go x | x `BS.elem` "ACGUacgu" = x go x = 'N' _sequenceID ← option "" $ char '>' *> endedLine _input ← RNAseq <$> rewrite <$> endedLine let l = _input^.rnaseq.to BS.length let lenGuard s2 = guard (BS.length s2 == l) ("s2 line length /= _input length: " ++ show (s2,_input) ++ "") -- mfe is always present ?! _mfe ← do s2 ← s2line lenGuard s2 skipSpace e ← char '(' *> skipSpace *> signed double <* char ')' let _foldedStructure = RNAss s2 let _foldedEnergy = DG e return Folded{..} "mfe" -- from here on, things are optional, including the newline after the mfe -- energy! _ensemble ← option nope $ (do endOfLine s2 ← ensline lenGuard s2 skipSpace e ← char '[' *> skipSpace *> signed double <* char ']' let _foldedStructure = RNAss s2 let _foldedEnergy = DG e return Folded{..} "ensemble") (_centroid, _centroidDistance) ← option (nope, 0) $ (do endOfLine s2 ← s2line lenGuard s2 e ← char '{' *> skipSpace *> signed double -- TODO handle @d@ values d ← skipSpace *> "d=" *> skipSpace *> double skipSpace *> char '}' let _foldedStructure = RNAss s2 let _foldedEnergy = DG e return (Folded{..}, d) "centroid") (_mfeFrequency, _diversity) ← option (0, 0) $ (do endOfLine -- previous line ending skipSpace "frequency of mfe structure in ensemble" skipSpace f ← double "; ensemble diversity" skipSpace ed ← double skipSpace return (f, ed) "mfe frequency / diversity") return RNAfold{..} builderRNAfold ∷ RNAfold → Builder builderRNAfold RNAfold{..} = mconcat [hdr, sqnce, emfe, nsmbl, cntrd] where -- SLOW! dbl d = byteString . BS.pack $ printf "%.2f" (d∷Double) addFolded l r Folded{..} = if BS.null (_rnass _foldedStructure) then mempty else nl <> byteString (_rnass _foldedStructure) <> char7 ' ' <> char7 l <> dbl (dG _foldedEnergy) <> char7 r nl = char7 '\n' hdr = if BS.null _sequenceID then mempty else char7 '>' <> byteString _sequenceID <> nl sqnce = byteString (_input^.rnaseq) emfe = addFolded '(' ')' _mfe nsmbl = addFolded '[' ']' _ensemble cntrd = let s = _centroid^.foldedStructure.rnass in if BS.null (_centroid^.foldedStructure.rnass) then mempty else nl <> byteString s <> " {" <> dbl (_centroid^.foldedEnergy.to dG) <> " d=" <> dbl _diversity <> "}" -- ** QuickCheck generator for 'RNAfold'. This is slightly unusual but allows -- us to generate random sequence/structure pairs easily. -- -- This should only be used for quickcheck testing, not anything involving -- properties of random RNAs. -- -- Generation is simple: A sequence with @[0,100]@ characters from @ACGU@ is -- generated and the 'RNAfold' structure is filled by 'rnafold'. We test for -- size @0@ explicitly, too! instance Arbitrary RNAfold where -- Generate a sequence and calculate the structure for it. arbitrary = choose (0,100) >>= \k → rnafold <$> mkRNAseq <$> BS.pack <$> vectorOf k (QC.elements "ACGU") -- Try with all sequences missing one character. shrink rna = [ rnafold $ mkRNAseq $ BS.pack s | s ← shrink $ rna^.input.rnaseq.to unpack ]