-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | RNA folding with non-canonical basepairs and base-triplets.
--
-- The algorithm implemented here-in provides extended RNA secondary
-- structure prediction. Each predicted nucleotide pairing is extended
-- with an annotation describing which of three nucleotide edges is
-- engaged in the pairing. In addition, each nucleotide may be engaged in
-- more than one pairing.
--
-- The algorithm is described in
--
-- Hoener zu Siederdissen C, Bernhart SH, Stadler PF, Hofacker IL,
--
-- A Folding Algorithm for Extended RNA Secondary Structures,
--
-- Bioinformatics (2011) 27 (13), i129-136
--
-- http://www.tbi.univie.ac.at/software/rnawolf/
--
-- Please note that experimental does mean experimental. We are
-- mostly concerned with determining a good set of (heuristic) rules for
-- run-time reduction currently. This version does include stacking and
-- is able to fold sequences of a few hundred nucleotides in seconds.
--
-- Triplet calculations will come back with the next version (in a few
-- days). The recursions require a number of changes to keep the runtimes
-- down (as has been done for the extended loops without triplets).
--
-- We have recently split the Biohaskell libraries into smaller
-- individual libraries. In addition, stacking, intermediate arrays,
-- fusion and newtype-wrapping did require a number of changes. Please
-- send a mail, if you encounter strange behaviour or bugs.
--
-- Changes in 0.3.2.0
--
--
-- - simpler training wrapper
-- - added parallelism option for multi-core systems (reduce iteration
-- time for the cost of a possible reduction in training efficiency; but
-- should be worth it)
--
--
-- Changes in 0.3.1.0
--
--
-- - fixed bugs introduced by bulgeinteriormulti-loops
--
@package RNAwolf
@version 0.3.2.0
-- | RNA-folding parameter space.
--
-- TODO find better names for types, functions, and minima/maxima.
module BioInf.Params
-- | A (very) rich set of paramters.
--
-- TODO 1xn interior loops should be tested (how often do they occur?)
--
-- TODO external loop
data Params
Params :: PaLength -> PaExtPairNN -> Pa2ExtPairs -> Pa2ExtPairs -> PaLength -> PaLength -> PaExtPairNN -> PaLength -> Pa2ExtPairs -> PaExtPair -> PaExtPairNN -> Double -> Double -> Double -> PaDistance -> Double -> Params
hairpinLength :: Params -> PaLength
hairpinClose :: Params -> PaExtPairNN
stem :: Params -> Pa2ExtPairs
stemTriplet :: Params -> Pa2ExtPairs
interiorLength :: Params -> PaLength
interiorAsym :: Params -> PaLength
interiorClose :: Params -> PaExtPairNN
bulgeLength :: Params -> PaLength
bulgeTriplet :: Params -> Pa2ExtPairs
bulgeClose :: Params -> PaExtPair
mbClose :: Params -> PaExtPairNN
multiBranched :: Params -> Double
multiHelix :: Params -> Double
multiUnpaired :: Params -> Double
pairDistance :: Params -> PaDistance
interMolInit :: Params -> Double
-- | An array which encodes length information
type PaLength = PrimArray Int Double
-- | This is an experimental annotation for long-distance interactions
type PaDistance = PrimArray Int Double
-- | An array holding information for one extended pair.
type PaExtPair = PrimArray ExtPair Double
-- | An array holding information for two extended pairs, e.g. stems.
type Pa2ExtPairs = PrimArray (ExtPair, ExtPair) Double
-- | An array holding information for one extended pair and two unpaired
-- nucleotides, closes a loop.
type PaExtPairNN = PrimArray (ExtPair, Nuc, Nuc) Double
instance Read Params
instance Show Params
-- | Given a list of doubles with the exact required length import
-- into a Params structure.
module BioInf.Params.Import
-- | Cast a list of values to parameters.
--
-- NOTE This operation is rather fragile if there are layout changes.
-- Consider Repr for this.
--
-- NOTE BIG FAT WARNING: BE ABSOLUTELY SURE THAT ALL IMPORTS AND EXPORTS
-- FOLLOW THIS ORDERING EXACTLY, OTHERWISE KEYS WILL BE MAPPED TO WRONG
-- POSITIONS DURING LOOKUP AND VALUES END UP SOMEWHERE ELSE.
fromList :: [Double] -> Params
-- | split up a list accordings to given lengths
splitXs :: [Int] -> [Double] -> [[Double]]
-- | Exporting parameters is a bit more involved as we need the ability to
-- export into a database format as well as linearize to list form.
module BioInf.Params.Export
-- | Just a long list of doubles.
toList :: Params -> [Double]
module BioInf.RNAwolf.Types
-- | Should really go into BiobaseXNA
--
-- Should really go into BiobaseXNA
type ExtBT = Int -> Int -> CTisomerism -> Edge -> Edge -> Double -> BTAnswer
type NBT = Int -> Int -> Double -> BTAnswer
type BTAnswer = [([ExtPairIdx], Double)]
type Table = PrimArray PairIdx Double
type ExtTable = PrimArray ExtPairIdx Double
type BaseF a = Params -> Primary -> a
type ExtFeatures a = Int -> Int -> CTisomerism -> Edge -> Edge -> a
type Features a = Int -> Int -> a
type Tables = (EStem, NStem, NInte, NInteLoop, NBulg, NBulgLoop, NMult, NMbr, NMbr1, NMultLoop, NExtn)
newtype EStem
EStem :: ExtTable -> EStem
unEStem :: EStem -> ExtTable
newtype NStem
NStem :: Table -> NStem
unNStem :: NStem -> Table
newtype NInte
NInte :: Table -> NInte
unNInte :: NInte -> Table
newtype NInteLoop
NInteLoop :: Table -> NInteLoop
unNInteLoop :: NInteLoop -> Table
newtype NMult
NMult :: Table -> NMult
unNMult :: NMult -> Table
newtype NBulg
NBulg :: Table -> NBulg
unNBulg :: NBulg -> Table
newtype NBulgLoop
NBulgLoop :: Table -> NBulgLoop
unBulgLoop :: NBulgLoop -> Table
newtype NMbr
NMbr :: Table -> NMbr
unNMbr :: NMbr -> Table
newtype NMbr1
NMbr1 :: Table -> NMbr1
unNMbr1 :: NMbr1 -> Table
newtype NExtn
NExtn :: Table -> NExtn
unNExtn :: NExtn -> Table
newtype NMultLoop
NMultLoop :: Table -> NMultLoop
unMultLoop :: NMultLoop -> Table
module BioInf.RNAwolf.Bulge
-- | The outer closing pair of a bulge loop (one unpaired region).
fBulgeOuter :: BaseF (NBulgLoop -> ExtFeatures (Vector (PairIdx, Double)))
-- | Outer part of a normal bulge
btBulgeOuter :: Params -> Primary -> EStem -> NBulgLoop -> NBT -> ExtBT
-- | The loop-part of bulges. Increases speed by 2x
fBulgeLoop :: BaseF (NBulg -> Features (Vector (PairIdx, Double)))
-- | Index generator for bulged loops
--
-- Backtrack the bulge loop part.
btBulgeLoop :: Params -> Primary -> NBulgLoop -> NBulg -> NBT -> NBT
-- | Inner part of a bulge to speed up bulge calculations
fBulgeInner :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double)))
-- | Backtrack the inner part of a bulge
btBulgeInner :: Params -> Primary -> NBulg -> EStem -> ExtBT -> NBT
-- | External loops are complete substructures, of which zero to many sit
-- on the chain of nucleotides.
module BioInf.RNAwolf.Extern
-- | An external loop with an unpaired nucleotide to the left
fLeftUnpaired :: BaseF (NExtn -> Features (Vector (PairIdx, Double)))
-- | Backtracking a structure with an unpaired nucleotide to the left.
--
-- FIXME In btLeftUnpaired, allow only non-empty structures on the right.
-- It would be nice to make the recursion scheme take care of that.
btLeftUnpaired :: Params -> Primary -> NExtn -> NBT -> NBT
-- | Energy for exactly one stem at (i,k)
fStem :: BaseF (NStem -> Features (Vector (PairIdx, Double)))
-- | Backtrack one stem with right index k.
btStem :: Params -> Primary -> NExtn -> NStem -> NBT -> NBT
-- | This one is important as otherwise, some stretches of nucleotides will
-- always have to be paired. (Obviously, I forgot to add this one for a
-- time...)
fOne :: BaseF (Features (Vector (PairIdx, Double)))
btOne :: Params -> Primary -> NExtn -> NBT
-- | External structures with more than one stem have a NStem on the left
-- and an external NExtn structure on the right.
fStems :: BaseF (NStem -> NExtn -> Features (Vector (Int, Double)))
-- | Backtracking of an external structure with more than one stem
btStems :: Params -> Primary -> NStem -> NExtn -> NBT -> NBT -> NBT
module BioInf.RNAwolf.Hairpin
-- | A hairpin is a number of 0 or more unpaired nucleotides, enclosed by
-- the nucleotides (i,j) which pair.
--
-- TODO should we allow hairpins with no unpaired nucleotides in the pin?
-- They do occur, but only under special circumstances which we should
-- model differently...
--
-- TODO re-allow IMI
fHairpin :: [Int] -> BaseF (ExtFeatures (Vector (ExtPairIdx, Double)))
-- | Backtracking hairpins.
btHairpin :: Params -> Primary -> EStem -> ExtBT
module BioInf.RNAwolf.Interior
-- | The outer part of an interior loop. Given a certain basepair type, add
-- the cost from the unpaired part.
fInteriorOuter :: BaseF (NInteLoop -> ExtFeatures (Vector (PairIdx, Double)))
btInteriorOuter :: Params -> Primary -> EStem -> NInteLoop -> NBT -> ExtBT
-- | Performs the interior loop calculations between (i,j) outer and
-- (k,l) inner part. The score based on the unpaired nucleotides
-- is independent of both, the outer and the inner basepair type.
--
-- NOTE / TODO -- fusion enabled for this function (due to it taking 50%
-- of the time), full fusion is still dependent on other factors and
-- needs to be checked (in particular, we still have allocation events)
fInteriorLoop :: BaseF (NInte -> Features (Vector (PairIdx, Double)))
-- | Backtrack the unpaired loop region
btInteriorLoop :: Params -> Primary -> NInteLoop -> NInte -> NBT -> NBT
-- | This opens up an interior loop. For each index (i,j) we minimize over
-- all possible basepair types.
fInteriorInner :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double)))
-- | Backtrack from an NInte result to the corresponding EStem parts
btInteriorInner :: Params -> Primary -> NInte -> EStem -> ExtBT -> NBT
-- | Since backtracking interior loops is mostly selfcontained, we
-- encapsulate the above three functions -- which we can't do easily with
-- the forward calculations as they actually have to save on runtime.
--
-- Given the outer indices (i,j), produces delta_i and delta_j so that
-- i+delta_i and j-delta_j are the inner indices. fInteriorKLs
-- should fuse and should make sure that l-k>=4 is always true (maxd).
-- Furthermore the maximal unpaired length of both sides combined is
-- determined by maxLength.
--
-- TODO better name than maxLength
fInteriorKLs :: Int -> Int -> Vector (Int, Int)
-- | Functions for handling non-triplet multibranched loops.
--
-- TODO We can do the loop-splitting thing again to speed up
-- multibranched closing by x10.
module BioInf.RNAwolf.Multibranched
-- | Energy for having the rightmost nucleotide (at j) unpaired in NMBr.
fUnpairedRight :: BaseF (NMbr -> Features (Vector (PairIdx, Double)))
-- | Backtrack in NMbr if the nucleotide at j is unpaired.
btUnpairedRight :: Params -> Primary -> NMbr -> NBT -> NBT
-- | Energy for having the rightmost nucleotide (at j) unpaired in NMBr1.
fUnpairedRight1 :: BaseF (NMbr1 -> Features (Vector (PairIdx, Double)))
-- | Backtrack NMbr1 if the nucleotide at j is unpaired.
btUnpairedRight1 :: Params -> Primary -> NMbr1 -> NBT -> NBT
-- | A multibranched helix (except the closing one). (i,j) are closed by a
-- basepair. Backtracking into the EStem reveals the type of pairing.
fMlHelix :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double)))
-- | Backtracks from (i,j) in NMult into the extended-pairing EStem.
btMlHelix :: Params -> Primary -> NMult -> EStem -> ExtBT -> NBT
-- | Closes a multibranch loop.
--
-- TODO make completely triplet compliant
fMlClose :: BaseF (NMultLoop -> ExtFeatures (Vector (PairIdx, Double)))
-- | Backtrack from and extended annotation (ij,ext) into the helper table
-- NMultLoop.
btMlClose :: Params -> Primary -> EStem -> NMultLoop -> NBT -> ExtBT
-- | Multibranched loop helper function that combines at least one
-- stem with exactly one stem but does not add the closing
-- energy from (i,j).
fMlLoop :: BaseF (NMbr -> NMbr1 -> Features (Vector (Int, Double)))
-- | Backtracking the multibranched loop.
btMlLoop :: Params -> Primary -> NMultLoop -> NMbr -> NMbr1 -> NBT -> NBT -> NBT
-- | Backtrack a single stem in NMbr, where the stem has zero or more
-- unpaired nucleotides to the left.
fMlStem :: BaseF (NMult -> Features (Vector (Int, Double)))
-- | Backtrack by trying to find a multilooped helix.
btMlStem :: Params -> Primary -> NMbr -> NMult -> NBT -> NBT
-- | Add a stem to a multibranch table containing already at least one
-- stem.
fMlStems :: BaseF (NMbr -> NMult -> Features (Vector (Int, Double)))
-- | Backtrack by finding the splitting index between an NMbr composite
-- structure and a single multibranched stem NMult (which can contain
-- unpaired nucleotides to the left).
btMlStems :: Params -> Primary -> NMbr -> NMult -> NBT -> NBT -> NBT
-- | Add a single stem to a multibranch table containing zero stems
-- already.
--
-- TODO this would be equal to mlHelix, unify!
fMl1Stem :: BaseF (NMult -> Features (Vector ((Int, Int), Double)))
-- | Backtrack a single stem closed at (i,j) for NMbr1. Takes the route
-- through NMult which solves for the exact pairtype.
btMl1Stem :: Params -> Primary -> NMbr1 -> NMult -> NBT -> NBT
module BioInf.RNAwolf.Stem
-- | A normal stem is created by taking the minimum over all possible
-- basepairs of the extended stem.
fNstem :: BaseF (EStem -> Features (Vector (ExtPairIdx, Double)))
-- | Backtrack from a normal stem back into the extended stem.
btNstem :: Params -> Primary -> NStem -> EStem -> ExtBT -> NBT
-- | A stem is extended by another pair. The score contribution is
-- dependent on the previous pair. Note that for score lookup purposes,
-- the inner pair is switched.
fStem :: BaseF (EStem -> ExtFeatures (Vector (ExtPairIdx, Double)))
-- | Stem backtracking.
btStem :: Params -> Primary -> EStem -> ExtBT -> ExtBT
module BioInf.RNAwolf.TripletBulge
module BioInf.RNAwolf.TripletStem
-- | The RNAwolf folding algorithm, version 1.9. We now have full stacking
-- and rich parameters everywhere. In general, most parameters closely
-- follow what we have for ViennaRNA 1.8 but with extended RNA secondary
-- structures, instead of canonicals only. Further refinements of the
-- parameter system will follow.
--
-- TODO right now, 1-diagrams only, 2-diagrams come back in a few days. I
-- want to be sure that the full stacking approach does not introduce
-- subtle bugs.
--
-- TODO recast all fZZZ functions for folding to actually fuse on
-- minimum/fZZZ.
--
-- TODO VU.! -> VU.unsafeIndex
--
-- TODO possibly very big TODO: is this being optimized? : fold $ g z
-- where g z = if z==True then [1..10] else []. If this is not optimized,
-- we should change all functions below in a way that allows
-- optimization. (I dont think fusion can fire on these objects...)
--
-- TODO rewrite minimumVU to accept Either ctors and specialize on
-- them. Left to be used for strange errors, Right for
-- correct streams
module BioInf.RNAwolf
-- | Wrapper around the state monad.
rnaWolf :: Params -> Primary -> Tables
-- | Given parameters, input, score band, and filled tables we can
-- backtrack.
--
-- NOTE the order in which backtracking for individual functions is
-- performed, is important. In case of ties in energy, the first result
-- is taken. This should be considered!
--
--
-- - 1 We consider unpaired stretches always first. This is kind
-- of arbitrary.
-- - 2 extended stems always come last. This is because they can
-- potentially introduce many co-optimal structures before they are all
-- discarded.
--
--
-- TODO all the crap in comments are bug-fix backtracking options.
rnaWolfBacktrack :: Params -> Primary -> Double -> Tables -> [([ExtPairIdx], Double)]
-- | Return the optimal energy.
rnaWolfOptimal :: Tables -> Double
-- | Transformation of predictions and known structures into keys. Keys are
-- used for linearization.
--
-- NOTE READ THE BIG FAT KEYS WARNING
--
-- TODO Generalize and move into its own library
module BioInf.Keys
-- | A list of named parameters.
--
-- Uniquely tag each key
--
-- NOTE BIG FAT WARNING: BE ABSOLUTELY SURE THAT ALL IMPORTS AND EXPORTS
-- FOLLOW THIS ORDERING EXACTLY, OTHERWISE KEYS WILL BE MAPPED TO WRONG
-- POSITIONS DURING LOOKUP AND VALUES END UP SOMEWHERE ELSE.
data Keys
HairpinLength :: Int -> Keys
HairpinClose :: (ExtPair, Nuc, Nuc) -> Keys
Stem :: (ExtPair, ExtPair) -> Keys
StemTriplet :: (ExtPair, ExtPair) -> Keys
InteriorLength :: Int -> Keys
InteriorAsym :: Int -> Keys
InteriorClose :: (ExtPair, Nuc, Nuc) -> Keys
BulgeLength :: Int -> Keys
BulgeTriplet :: (ExtPair, ExtPair) -> Keys
BulgeClose :: ExtPair -> Keys
MbClose :: (ExtPair, Nuc, Nuc) -> Keys
MultiBranched :: Keys
MultiHelix :: Keys
MultiUnpaired :: Keys
PairDistance :: Int -> Keys
InterMolInit :: Keys
-- | Training data to feature vector
featureVector :: String -> [ExtPairIdx] -> [Int]
-- | transform feature to 0-based index
lookupFeatureIndex :: Keys -> Int
-- | Map param keys to thei Int-indices.
--
-- And back from Int-indices to the keys.
--
-- Takes a primary structure and secondary structure tree and produces a
-- list of keys.
--
-- TODO Data.Traversable ?!
--
-- TODO better handling of unknown features: we can have genuine errors
-- (pseudoknots) and uncoded features (e.g. hairpins of size > 30)
treeToFeatures :: (MkPrimary a, Show a) => a -> SSTree ExtPairIdx t -> [Keys]
-- | Create the secondary structure tree
--
-- FIXME okPairs is ad-hoc, we should allow for other kinds of pairs!
ssTree :: Int -> [ExtPairIdx] -> SSTree ExtPairIdx ()
instance Read Keys
instance Show Keys
instance Eq Keys
instance Ord Keys
-- | Passive-aggressive optimization. Mainly based on:
--
-- Zakov, Shay and Goldberg, Yoav and Elhaded, Michael and Ziv-Ukelson,
-- Michal Rich Parameterization Improves RNA Structure Prediction
-- RECOMB 2011
--
-- and
--
-- Crammer, Koby and (et al) Online Passive-Aggressive Algorithms
-- Journal of Machine Learning Research (2006)
--
-- TODO as always: move out of here and put in its own library
module BioInf.PassiveAggressive
-- | Default implementation of P/A. We return a data structure that
-- contains all changes required from this run, the enerDif
-- or energy difference between the known and the predicted structure,
-- and a structural difference score. Furthermore, some errors are being
-- reported in errors.
--
-- The energy difference can be (i) in that case, a wrongly predicted
-- structure has better (lower) energy than the known one. (ii) It can be
-- zero, then we have either found a co-optimal structural or the correct
-- structure. (iii) In some cases, it can be positive, this is a formal
-- error, but will not abort the program. (The calling program may opt to
-- abort on (not . null $ errors).
--
-- The structural difference is [0..1] with 0 for structurally
-- identical predictions and known structures and otherwise growing
-- toward 1 for bad predictions where nothing is correct.
defaultPA :: Double -> Params -> TrainingData -> PA
-- | In case that the known structure has a score epsilon better
-- than the predicted, we have an error condition, as this should never
-- be the case.
--
-- Return a lot of information from each P/A call. We do not return the
-- new Params anymore, only a list of changes. This allows us to
-- do some things. If the implementation of Params is switched,
-- we can update in place; or we can perform calculations in parallel.
data PA
PA :: [(Int, Double)] -> Double -> Double -> [String] -> PA
changes :: PA -> [(Int, Double)]
enerDif :: PA -> Double
accMeas :: PA -> Double
errors :: PA -> [String]
instance Show PA
instance MkConfusionMatrix TrainingData
instance NFData PA