-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | RNA folding with non-canonical basepairs and base-triplets. -- -- The algorithm implemented here-in provides extended RNA secondary -- structure prediction. Each predicted nucleotide pairing is extended -- with an annotation describing which of three nucleotide edges is -- engaged in the pairing. In addition, each nucleotide may be engaged in -- more than one pairing. -- -- The algorithm is described in -- -- Hoener zu Siederdissen C, Bernhart SH, Stadler PF, Hofacker IL, -- -- A Folding Algorithm for Extended RNA Secondary Structures, -- -- Bioinformatics (2011) 27 (13), i129-136 -- -- http://www.tbi.univie.ac.at/software/rnawolf/ -- -- Please note that experimental does mean experimental. We are -- mostly concerned with determining a good set of (heuristic) rules for -- run-time reduction currently. This version does include stacking and -- is able to fold sequences of a few hundred nucleotides in seconds. -- -- Triplet calculations will come back with the next version (in a few -- days). The recursions require a number of changes to keep the runtimes -- down (as has been done for the extended loops without triplets). -- -- We have recently split the Biohaskell libraries into smaller -- individual libraries. In addition, stacking, intermediate arrays, -- fusion and newtype-wrapping did require a number of changes. Please -- send a mail, if you encounter strange behaviour or bugs. -- -- Last Changes: -- -- @package RNAwolf @version 0.3.1.0 -- | RNA-folding parameter space. -- -- TODO find better names for types, functions, and minima/maxima. module BioInf.Params -- | A (very) rich set of paramters. -- -- TODO 1xn interior loops should be tested (how often do they occur?) -- -- TODO external loop data Params Params :: PaLength -> PaExtPairNN -> Pa2ExtPairs -> Pa2ExtPairs -> PaLength -> PaLength -> PaExtPairNN -> PaLength -> Pa2ExtPairs -> PaExtPair -> PaExtPairNN -> Double -> Double -> Double -> PaDistance -> Double -> Params hairpinLength :: Params -> PaLength hairpinClose :: Params -> PaExtPairNN stem :: Params -> Pa2ExtPairs stemTriplet :: Params -> Pa2ExtPairs interiorLength :: Params -> PaLength interiorAsym :: Params -> PaLength interiorClose :: Params -> PaExtPairNN bulgeLength :: Params -> PaLength bulgeTriplet :: Params -> Pa2ExtPairs bulgeClose :: Params -> PaExtPair mbClose :: Params -> PaExtPairNN multiBranched :: Params -> Double multiHelix :: Params -> Double multiUnpaired :: Params -> Double pairDistance :: Params -> PaDistance interMolInit :: Params -> Double -- | An array which encodes length information type PaLength = PrimArray Int Double -- | This is an experimental annotation for long-distance interactions type PaDistance = PrimArray Int Double -- | An array holding information for one extended pair. type PaExtPair = PrimArray ExtPair Double -- | An array holding information for two extended pairs, e.g. stems. type Pa2ExtPairs = PrimArray (ExtPair, ExtPair) Double -- | An array holding information for one extended pair and two unpaired -- nucleotides, closes a loop. type PaExtPairNN = PrimArray (ExtPair, Nuc, Nuc) Double instance Read Params instance Show Params -- | Given a list of doubles with the exact required length import -- into a Params structure. module BioInf.Params.Import -- | Cast a list of values to parameters. -- -- NOTE This operation is rather fragile if there are layout changes. -- Consider Repr for this. -- -- NOTE BIG FAT WARNING: BE ABSOLUTELY SURE THAT ALL IMPORTS AND EXPORTS -- FOLLOW THIS ORDERING EXACTLY, OTHERWISE KEYS WILL BE MAPPED TO WRONG -- POSITIONS DURING LOOKUP AND VALUES END UP SOMEWHERE ELSE. fromList :: [Double] -> Params -- | split up a list accordings to given lengths splitXs :: [Int] -> [Double] -> [[Double]] -- | Exporting parameters is a bit more involved as we need the ability to -- export into a database format as well as linearize to list form. module BioInf.Params.Export -- | Just a long list of doubles. toList :: Params -> [Double] module BioInf.RNAwolf.Types -- | Should really go into BiobaseXNA -- -- Should really go into BiobaseXNA type ExtBT = Int -> Int -> CTisomerism -> Edge -> Edge -> Double -> BTAnswer type NBT = Int -> Int -> Double -> BTAnswer type BTAnswer = [([ExtPairIdx], Double)] type Table = PrimArray PairIdx Double type ExtTable = PrimArray ExtPairIdx Double type BaseF a = Params -> Primary -> a type ExtFeatures a = Int -> Int -> CTisomerism -> Edge -> Edge -> a type Features a = Int -> Int -> a type Tables = (EStem, NStem, NInte, NInteLoop, NBulg, NBulgLoop, NMult, NMbr, NMbr1, NMultLoop, NExtn) newtype EStem EStem :: ExtTable -> EStem unEStem :: EStem -> ExtTable newtype NStem NStem :: Table -> NStem unNStem :: NStem -> Table newtype NInte NInte :: Table -> NInte unNInte :: NInte -> Table newtype NInteLoop NInteLoop :: Table -> NInteLoop unNInteLoop :: NInteLoop -> Table newtype NMult NMult :: Table -> NMult unNMult :: NMult -> Table newtype NBulg NBulg :: Table -> NBulg unNBulg :: NBulg -> Table newtype NBulgLoop NBulgLoop :: Table -> NBulgLoop unBulgLoop :: NBulgLoop -> Table newtype NMbr NMbr :: Table -> NMbr unNMbr :: NMbr -> Table newtype NMbr1 NMbr1 :: Table -> NMbr1 unNMbr1 :: NMbr1 -> Table newtype NExtn NExtn :: Table -> NExtn unNExtn :: NExtn -> Table newtype NMultLoop NMultLoop :: Table -> NMultLoop unMultLoop :: NMultLoop -> Table module BioInf.RNAwolf.Bulge -- | The outer closing pair of a bulge loop (one unpaired region). fBulgeOuter :: BaseF (NBulgLoop -> ExtFeatures (Vector (PairIdx, Double))) -- | Outer part of a normal bulge btBulgeOuter :: Params -> Primary -> EStem -> NBulgLoop -> NBT -> ExtBT -- | The loop-part of bulges. Increases speed by 2x fBulgeLoop :: BaseF (NBulg -> Features (Vector (PairIdx, Double))) -- | Index generator for bulged loops -- -- Backtrack the bulge loop part. btBulgeLoop :: Params -> Primary -> NBulgLoop -> NBulg -> NBT -> NBT -- | Inner part of a bulge to speed up bulge calculations fBulgeInner :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double))) -- | Backtrack the inner part of a bulge btBulgeInner :: Params -> Primary -> NBulg -> EStem -> ExtBT -> NBT -- | External loops are complete substructures, of which zero to many sit -- on the chain of nucleotides. module BioInf.RNAwolf.Extern -- | An external loop with an unpaired nucleotide to the left fLeftUnpaired :: BaseF (NExtn -> Features (Vector (PairIdx, Double))) -- | Backtracking a structure with an unpaired nucleotide to the left. -- -- FIXME In btLeftUnpaired, allow only non-empty structures on the right. -- It would be nice to make the recursion scheme take care of that. btLeftUnpaired :: Params -> Primary -> NExtn -> NBT -> NBT -- | Energy for exactly one stem at (i,k) fStem :: BaseF (NStem -> Features (Vector (PairIdx, Double))) -- | Backtrack one stem with right index k. btStem :: Params -> Primary -> NExtn -> NStem -> NBT -> NBT -- | This one is important as otherwise, some stretches of nucleotides will -- always have to be paired. (Obviously, I forgot to add this one for a -- time...) fOne :: BaseF (Features (Vector (PairIdx, Double))) btOne :: Params -> Primary -> NExtn -> NBT -- | External structures with more than one stem have a NStem on the left -- and an external NExtn structure on the right. fStems :: BaseF (NStem -> NExtn -> Features (Vector (Int, Double))) -- | Backtracking of an external structure with more than one stem btStems :: Params -> Primary -> NStem -> NExtn -> NBT -> NBT -> NBT module BioInf.RNAwolf.Hairpin -- | A hairpin is a number of 0 or more unpaired nucleotides, enclosed by -- the nucleotides (i,j) which pair. -- -- TODO should we allow hairpins with no unpaired nucleotides in the pin? -- They do occur, but only under special circumstances which we should -- model differently... -- -- TODO re-allow IMI fHairpin :: [Int] -> BaseF (ExtFeatures (Vector (ExtPairIdx, Double))) -- | Backtracking hairpins. btHairpin :: Params -> Primary -> EStem -> ExtBT module BioInf.RNAwolf.Interior -- | The outer part of an interior loop. Given a certain basepair type, add -- the cost from the unpaired part. fInteriorOuter :: BaseF (NInteLoop -> ExtFeatures (Vector (PairIdx, Double))) btInteriorOuter :: Params -> Primary -> EStem -> NInteLoop -> NBT -> ExtBT -- | Performs the interior loop calculations between (i,j) outer and -- (k,l) inner part. The score based on the unpaired nucleotides -- is independent of both, the outer and the inner basepair type. -- -- NOTE / TODO -- fusion enabled for this function (due to it taking 50% -- of the time), full fusion is still dependent on other factors and -- needs to be checked (in particular, we still have allocation events) fInteriorLoop :: BaseF (NInte -> Features (Vector (PairIdx, Double))) -- | Backtrack the unpaired loop region btInteriorLoop :: Params -> Primary -> NInteLoop -> NInte -> NBT -> NBT -- | This opens up an interior loop. For each index (i,j) we minimize over -- all possible basepair types. fInteriorInner :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double))) -- | Backtrack from an NInte result to the corresponding EStem parts btInteriorInner :: Params -> Primary -> NInte -> EStem -> ExtBT -> NBT -- | Since backtracking interior loops is mostly selfcontained, we -- encapsulate the above three functions -- which we can't do easily with -- the forward calculations as they actually have to save on runtime. -- -- Given the outer indices (i,j), produces delta_i and delta_j so that -- i+delta_i and j-delta_j are the inner indices. fInteriorKLs -- should fuse and should make sure that l-k>=4 is always true (maxd). -- Furthermore the maximal unpaired length of both sides combined is -- determined by maxLength. -- -- TODO better name than maxLength fInteriorKLs :: Int -> Int -> Vector (Int, Int) -- | Functions for handling non-triplet multibranched loops. -- -- TODO We can do the loop-splitting thing again to speed up -- multibranched closing by x10. module BioInf.RNAwolf.Multibranched -- | Energy for having the rightmost nucleotide (at j) unpaired in NMBr. fUnpairedRight :: BaseF (NMbr -> Features (Vector (PairIdx, Double))) -- | Backtrack in NMbr if the nucleotide at j is unpaired. btUnpairedRight :: Params -> Primary -> NMbr -> NBT -> NBT -- | Energy for having the rightmost nucleotide (at j) unpaired in NMBr1. fUnpairedRight1 :: BaseF (NMbr1 -> Features (Vector (PairIdx, Double))) -- | Backtrack NMbr1 if the nucleotide at j is unpaired. btUnpairedRight1 :: Params -> Primary -> NMbr1 -> NBT -> NBT -- | A multibranched helix (except the closing one). (i,j) are closed by a -- basepair. Backtracking into the EStem reveals the type of pairing. fMlHelix :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double))) -- | Backtracks from (i,j) in NMult into the extended-pairing EStem. btMlHelix :: Params -> Primary -> NMult -> EStem -> ExtBT -> NBT -- | Closes a multibranch loop. -- -- TODO make completely triplet compliant fMlClose :: BaseF (NMultLoop -> ExtFeatures (Vector (PairIdx, Double))) -- | Backtrack from and extended annotation (ij,ext) into the helper table -- NMultLoop. btMlClose :: Params -> Primary -> EStem -> NMultLoop -> NBT -> ExtBT -- | Multibranched loop helper function that combines at least one -- stem with exactly one stem but does not add the closing -- energy from (i,j). fMlLoop :: BaseF (NMbr -> NMbr1 -> Features (Vector (Int, Double))) -- | Backtracking the multibranched loop. btMlLoop :: Params -> Primary -> NMultLoop -> NMbr -> NMbr1 -> NBT -> NBT -> NBT -- | Backtrack a single stem in NMbr, where the stem has zero or more -- unpaired nucleotides to the left. fMlStem :: BaseF (NMult -> Features (Vector (Int, Double))) -- | Backtrack by trying to find a multilooped helix. btMlStem :: Params -> Primary -> NMbr -> NMult -> NBT -> NBT -- | Add a stem to a multibranch table containing already at least one -- stem. fMlStems :: BaseF (NMbr -> NMult -> Features (Vector (Int, Double))) -- | Backtrack by finding the splitting index between an NMbr composite -- structure and a single multibranched stem NMult (which can contain -- unpaired nucleotides to the left). btMlStems :: Params -> Primary -> NMbr -> NMult -> NBT -> NBT -> NBT -- | Add a single stem to a multibranch table containing zero stems -- already. -- -- TODO this would be equal to mlHelix, unify! fMl1Stem :: BaseF (NMult -> Features (Vector ((Int, Int), Double))) -- | Backtrack a single stem closed at (i,j) for NMbr1. Takes the route -- through NMult which solves for the exact pairtype. btMl1Stem :: Params -> Primary -> NMbr1 -> NMult -> NBT -> NBT module BioInf.RNAwolf.Stem -- | A normal stem is created by taking the minimum over all possible -- basepairs of the extended stem. fNstem :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double))) -- | Backtrack from a normal stem back into the extended stem. btNstem :: Params -> Primary -> NStem -> EStem -> ExtBT -> NBT -- | A stem is extended by another pair. The score contribution is -- dependent on the previous pair. Note that for score lookup purposes, -- the inner pair is switched. fStem :: BaseF (EStem -> ExtFeatures (Vector (ExtPairIdx, Double))) -- | Stem backtracking. btStem :: Params -> Primary -> EStem -> ExtBT -> ExtBT module BioInf.RNAwolf.TripletBulge module BioInf.RNAwolf.TripletStem -- | The RNAwolf folding algorithm, version 1.9. We now have full stacking -- and rich parameters everywhere. In general, most parameters closely -- follow what we have for ViennaRNA 1.8 but with extended RNA secondary -- structures, instead of canonicals only. Further refinements of the -- parameter system will follow. -- -- TODO right now, 1-diagrams only, 2-diagrams come back in a few days. I -- want to be sure that the full stacking approach does not introduce -- subtle bugs. -- -- TODO recast all fZZZ functions for folding to actually fuse on -- minimum/fZZZ. -- -- TODO VU.! -> VU.unsafeIndex -- -- TODO possibly very big TODO: is this being optimized? : fold $ g z -- where g z = if z==True then [1..10] else []. If this is not optimized, -- we should change all functions below in a way that allows -- optimization. (I dont think fusion can fire on these objects...) -- -- TODO rewrite minimumVU to accept Either ctors and specialize on -- them. Left to be used for strange errors, Right for -- correct streams module BioInf.RNAwolf -- | Wrapper around the state monad. rnaWolf :: Params -> Primary -> Tables -- | Given parameters, input, score band, and filled tables we can -- backtrack. -- -- NOTE the order in which backtracking for individual functions is -- performed, is important. In case of ties in energy, the first result -- is taken. This should be considered! -- -- -- -- TODO all the crap in comments are bug-fix backtracking options. rnaWolfBacktrack :: Params -> Primary -> Double -> Tables -> [([ExtPairIdx], Double)] -- | Return the optimal energy. rnaWolfOptimal :: Tables -> Double -- | Transformation of predictions and known structures into keys. Keys are -- used for linearization. -- -- NOTE READ THE BIG FAT KEYS WARNING -- -- TODO Generalize and move into its own library module BioInf.Keys -- | A list of named parameters. -- -- Uniquely tag each key -- -- NOTE BIG FAT WARNING: BE ABSOLUTELY SURE THAT ALL IMPORTS AND EXPORTS -- FOLLOW THIS ORDERING EXACTLY, OTHERWISE KEYS WILL BE MAPPED TO WRONG -- POSITIONS DURING LOOKUP AND VALUES END UP SOMEWHERE ELSE. data Keys HairpinLength :: Int -> Keys HairpinClose :: (ExtPair, Nuc, Nuc) -> Keys Stem :: (ExtPair, ExtPair) -> Keys StemTriplet :: (ExtPair, ExtPair) -> Keys InteriorLength :: Int -> Keys InteriorAsym :: Int -> Keys InteriorClose :: (ExtPair, Nuc, Nuc) -> Keys BulgeLength :: Int -> Keys BulgeTriplet :: (ExtPair, ExtPair) -> Keys BulgeClose :: ExtPair -> Keys MbClose :: (ExtPair, Nuc, Nuc) -> Keys MultiBranched :: Keys MultiHelix :: Keys MultiUnpaired :: Keys PairDistance :: Int -> Keys InterMolInit :: Keys -- | Training data to feature vector featureVector :: String -> [ExtPairIdx] -> [Int] -- | transform feature to 0-based index lookupFeatureIndex :: Keys -> Int -- | Map param keys to thei Int-indices. -- -- And back from Int-indices to the keys. -- -- Takes a primary structure and secondary structure tree and produces a -- list of keys. -- -- TODO Data.Traversable ?! -- -- TODO better handling of unknown features: we can have genuine errors -- (pseudoknots) and uncoded features (e.g. hairpins of size > 30) treeToFeatures :: (MkPrimary a, Show a) => a -> SSTree ExtPairIdx t -> [Keys] -- | Create the secondary structure tree -- -- FIXME okPairs is ad-hoc, we should allow for other kinds of pairs! ssTree :: Int -> [ExtPairIdx] -> SSTree ExtPairIdx () instance Read Keys instance Show Keys instance Eq Keys instance Ord Keys -- | Passive-aggressive optimization. Mainly based on: -- -- Zakov, Shay and Goldberg, Yoav and Elhaded, Michael and Ziv-Ukelson, -- Michal Rich Parameterization Improves RNA Structure Prediction -- RECOMB 2011 -- -- and -- -- Crammer, Koby and (et al) Online Passive-Aggressive Algorithms -- Journal of Machine Learning Research (2006) -- -- TODO as always: move out of here and put in its own library module BioInf.PassiveAggressive -- | Default implementation of P/A. defaultPA :: Double -> Params -> TrainingData -> (Params, Double, Double, [(Int, Double)]) instance MkConfusionMatrix TrainingData