-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Most likely order of mutation events in RNA -- -- Determine the most likely order in which single nucleotide mutations -- happened between two RNA sequences. -- -- Developed to analyse the HAR 1 region. -- -- As long as the two input RNAs are small enough (a couple hundred -- nucleotides) and the number of mutations is small enough (around -- 20-26, since the algorithm is exponential in this number) the -- algorithm should work for similar problems without changes. @package MutationOrder @version 0.0.0.2 -- | Here we collect the necessary data structures for the RNAs to be -- compared. This data is serialized to disk once calculated, since it is -- most likely the part that takes longest. -- -- TODO if the vienna wrapper allows, we should parallelize the -- calculations. -- -- TODO nice interface counting up? module BioInf.MutationOrder.RNA -- | A single RNA with pre-calculated elements. -- -- All calculations are done at 37 C. -- -- TODO include the basepair probability matrix? Can we "compress" that -- one? -- -- We do not encode D1S into the json data RNA RNA :: !(Vector (Int, Char)) -> !ByteString -> !ByteString -> !D1Secondary -> !Double -> !ByteString -> !D1Secondary -> !Double -> RNA -- | we store just the mutation set, since this is more sparse and gives -- access to the mutational events. 
[mutationSet] :: RNA -> !(Vector (Int, Char)) -- | store RNA sequence too, for now [primarySequence] :: RNA -> !ByteString -- | the mfe structure we get [mfeStructure] :: RNA -> !ByteString -- | efficient structure encoding [mfeD1S] :: RNA -> !D1Secondary -- | mfe energy of the structure [mfeEnergy] :: RNA -> !Double -- | the centroid structure [centroidStructure] :: RNA -> !ByteString -- | efficient centroid structure encoding [centroidD1S] :: RNA -> !D1Secondary [centroidEnergy] :: RNA -> !Double bldD1S :: ByteString -> D1Secondary -- | Given the primary sequence and the mutation set, fill the RNA -- structure. -- -- NOTE This wraps some ViennaRNA-bindings calls that are in -- IO. -- -- TODO check if these calls are *really* thread-safe! mkRNA :: Maybe (HashMap ByteString QLine) -> ByteString -> Vector (Int, Char) -> RNA -- | Insert a set of mutations in a ByteString. insertMutations :: Vector (Int, Char) -> ByteString -> ByteString data Landscape Landscape :: HashMap (BitSet I) RNA -> !Int -> !ByteString -> !ByteString -> !(BimapHashMap Int Int) -> Landscape -- | the individual RNA mutations. The index should be calculated from -- linearIndex 0 high mutationSet [rnas] :: Landscape -> HashMap (BitSet I) RNA -- | how many nucleotides are mutated in total [mutationCount] :: Landscape -> !Int -- | the ancestral sequence [landscapeOrigin] :: Landscape -> !ByteString -- | the final sequence [landscapeDestination] :: Landscape -> !ByteString [mutationPositions] :: Landscape -> !(BimapHashMap Int Int) -- | TODO prime candidate for parallelization. ViennaRNA-bindings currently -- does not allow parallel runs! It would be possible to consider -- externalizing this, but for now we just run single-threaded. createRNAlandscape :: Maybe (HashMap ByteString QLine) -> Bool -> ByteString -> ByteString -> (Landscape, [(Int, ByteString)]) -- | Write a generated landscape to disk. 
toFile :: FilePath -> Landscape -> IO () toFileJSON :: FilePath -> Landscape -> IO () fromFile :: FilePath -> IO Landscape fromFileJSON :: FilePath -> IO Landscape data QLine QLine :: ByteString -> (ByteString, Double) -> (ByteString, Double) -> (ByteString, Double) -> QLine [qlSequence] :: QLine -> ByteString [qlmfe] :: QLine -> (ByteString, Double) [qlensemble] :: QLine -> (ByteString, Double) [qlcentroid] :: QLine -> (ByteString, Double) qlines :: FilePath -> IO (HashMap ByteString QLine) instance GHC.Show.Show BioInf.MutationOrder.RNA.QLine instance GHC.Generics.Generic BioInf.MutationOrder.RNA.Landscape instance GHC.Classes.Eq BioInf.MutationOrder.RNA.Landscape instance GHC.Show.Show BioInf.MutationOrder.RNA.Landscape instance GHC.Generics.Generic BioInf.MutationOrder.RNA.RNA instance GHC.Classes.Eq BioInf.MutationOrder.RNA.RNA instance GHC.Show.Show BioInf.MutationOrder.RNA.RNA instance Control.DeepSeq.NFData BioInf.MutationOrder.RNA.RNA instance Data.Serialize.Serialize BioInf.MutationOrder.RNA.RNA instance Data.Aeson.Types.ToJSON.ToJSON BioInf.MutationOrder.RNA.RNA instance Data.Aeson.Types.FromJSON.FromJSON BioInf.MutationOrder.RNA.RNA instance Control.DeepSeq.NFData BioInf.MutationOrder.RNA.Landscape instance Data.Serialize.Serialize BioInf.MutationOrder.RNA.Landscape instance Data.Aeson.Types.ToJSON.ToJSON BioInf.MutationOrder.RNA.Landscape instance Data.Aeson.Types.FromJSON.FromJSON BioInf.MutationOrder.RNA.Landscape -- | Calculate minimum-distance Hamiltonian Shortest Paths and -- probabilities for starting nodes. -- -- NOTE: We explicitly model starting nodes. For symmetrical distance -- matrices, this reports begin/end probabilities. For asymmetrical -- distance matrices, a second instance with Last instead of -- First boundary should be created to calculate begin/end -- probabilities separately. module BioInf.MutationOrder.MinDist -- | Given the RNA we come from and the RNA we mutate into, -- derive the gain or loss by a scaling function. 
type ScaleFunction = RNA -> RNA -> Double -- | Minimal distance algebra -- -- TODO The two Ints are the indices of the nodes and could be replaced? aMinDist :: Monad m => ScaleFunction -> Landscape -> SigMinDist m Double Double ((Int :. From) :. To) (Int :. To) -- | Fused co-optimal counter! -- -- TODO for now, Int is assumed to be big enough... aMinDistCount :: Monad m => ScaleFunction -> Landscape -> SigMinDist m (Double, Int) (Double, Int) ((Int :. From) :. To) (Int :. To) -- | Sum over all states and collapse into boundary unscaled weights. aInside :: Monad m => Maybe Int -> ScaleFunction -> Landscape -> SigMinDist m (Log Double) (Log Double) ((Int :. From) :. To) (Int :. To) -- | This should give the correct order of nodes independent of the -- underlying Set1 First or Set1 Last because the -- (From:.To) system is agnostic over these. -- -- TODO Use text builder aPretty :: Monad m => ScaleFunction -> Landscape -> SigMinDist m Text [Text] ((Int :. From) :. To) (Int :. To) -- | Count co-optimals aCount :: Monad m => Landscape -> SigMinDist m Integer [Integer] ((Int :. From) :. To) (Int :. To) type TS1 x = TwITbl Id Unboxed EmptyOk (BS1 First I) x type U x = TwITbl Id Unboxed EmptyOk (Unit I) x type PF x = TwITbl Id Unboxed EmptyOk (Boundary First I) x type TS1L x = TwITbl Id Unboxed EmptyOk (BS1 Last I) x type PFL x = TwITbl Id Unboxed EmptyOk (Boundary Last I) x type BT1 x b = TwITblBt Unboxed EmptyOk (BS1 First I) x Id Id b type BTU x b = TwITblBt Unboxed EmptyOk (Unit I) x Id Id b type BT1L x b = TwITblBt Unboxed EmptyOk (BS1 Last I) x Id Id b -- | Run the minimal distance algebra. -- -- This produces one-boundary sets. Meaning that for each boundary we get -- the total distance within the set. forwardMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double backtrackMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double -> [Text] -- | Count the number of co-optimals minDistCount :: ScaleFunction -> Landscape -> (Z :. 
TS1L (Double, Int)) :. U (Double, Int) countBackMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double -> [Integer] -- | Given the Set1 produced in forwardMinDist1 we can -- now extract the co-optimal paths using the Set1 -> () -- index change. -- -- TODO do we want this one explicitly or make life easy and just extract -- from all forwardMinDist1 paths? runCoOptDist :: ScaleFunction -> Landscape -> (Double, [Text]) runCount :: ScaleFunction -> Landscape -> (Double, Int) -- | Extract the individual partition scores. boundaryPartFunFirst :: Maybe Int -> ScaleFunction -> Landscape -> [(Boundary First I, Log Double)] boundaryPartFunLast :: Maybe Int -> ScaleFunction -> Landscape -> BoundaryPart data BoundaryPart BoundaryPart :: [(Boundary Last I, Log Double)] -> [(Boundary Last I, Log Double)] -> Log Double -> BoundaryPart [bpNormalized] :: BoundaryPart -> [(Boundary Last I, Log Double)] [bpUnnormalized] :: BoundaryPart -> [(Boundary Last I, Log Double)] [bpTotal] :: BoundaryPart -> Log Double boundaryPart :: [(Boundary Last I, Log Double)] -> BoundaryPart instance GHC.Classes.Eq BioInf.MutationOrder.MinDist.BoundaryPart instance GHC.Show.Show BioInf.MutationOrder.MinDist.BoundaryPart module BioInf.MutationOrder.EdgeProb -- | Before using aInside the ScoreMat needs to be scaled -- -- TODO the Edge needs to be an EdgeWithActive to get -- the active bits on the left in the set. aInside :: Monad m => ScaleFunction -> Landscape -> SigEdgeProb m (Log Double) (Log Double) ((Int :. From) :. To) (Int :. To) type TF1 x = TwITbl Id Unboxed EmptyOk (BS1 Last I) x type TL1 x = TwITbl Id Unboxed EmptyOk (BS1 Last O) x type EB x = TwITbl Id Unboxed EmptyOk (EdgeBoundary C) x -- | Extract the individual partition scores. edgeProbPartFun :: ScaleFunction -> Landscape -> ([(Boundary Last I, Log Double)], [(EdgeBoundary C, Log Double)]) -- | Turn the edge probabilities into a score matrix. 
edgeProbScoreMatrix :: Landscape -> [Log Double] -> [(EdgeBoundary C, Log Double)] -> ScoreMatrix (Log Double) -- | Run all steps of the HoxCluster algorithms in order. -- -- This will produce the following: -- --
-- 1. run the minimal distance algorithm, give the minimal distance score and return all co-optimal paths
-- 2. run the end-probability algorithm and return the probability that each node is the begin/end of a chain
-- 3. run the edge probability algorithm and give the probability for each from :-> to edge
-- 4. with the edge probabilities, run the maximal probability path algorithm, return that probability and all co-optimal paths
-- -- TODO -Pretty should yield a structure to be given to the eps or svg -- generator. This allows more flexibility. Does diagrams offer -- serialization? -- -- TODO All this should be wrapped and available as a function, not just -- providing output files. module BioInf.MutationOrder runMutationOrder :: Bool -> FillWeight -> FillStyle -> ScaleFunction -> ScaleFunction -> Int -> Int -> Maybe FilePath -> [Char] -> FilePath -> t -> Bool -> [FilePath] -> IO () posScaled :: Double -> Double -> ScaleFunction -> ScaleFunction -- | Basepair distance basepairDistanceMFE :: ScaleFunction basepairDistanceCentroid :: ScaleFunction -- | Scale function for normal mfe delta energies mfeDelta :: ScaleFunction -- | Scale function for normal centroid delta energies centroidDelta :: ScaleFunction -- | Square positive "contributions", making bad moves more unlikely squaredPositive :: ScaleFunction -> ScaleFunction -- | Scale by temperature (for probability stuff) scaleTemperature :: Double -> ScaleFunction -> ScaleFunction scaleByFunction :: (t3 -> t2) -> (t1 -> t -> t3) -> t1 -> t -> t2 -- | Basepair distance -- -- Stupid fasta reader stupidReader :: FilePath -> IO ByteString -- | withDumpFile is like idIO :: a -> IO a in that it -- returns the data we give to the function. However, in case the dump -- file exists, we read it and return its contents, instead of -- recalculating. If it does not exist, we dump the data in addition to -- returning it. This forces the Landscape. withDumpFile :: Handle -> FilePath -> ByteString -> ByteString -> Landscape -> IO Landscape -- | Fill weight for our grid. If the fill weight is logarithmic, -- then the line length is 1 / (1 + log value) otherwise it is -- value. 
data FillWeight :: * FWlog :: FillWeight FWlinear :: FillWeight FWfill :: FillWeight data FillStyle :: * FSopacityLog :: FillStyle FSopacityLinear :: FillStyle FSfull :: FillStyle -- | Given the RNA we come from and the RNA we mutate into, -- derive the gain or loss by a scaling function. type ScaleFunction = RNA -> RNA -> Double