-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Most likely order of mutation events in RNA
--   
--   Determine the most likely order in which single nucleotide mutations
--   happened between two RNA sequences.
--   
--   Developed to analyse the <tt>HAR 1</tt> region.
--   
--   As long as the two input RNAs are small enough (a couple hundred
--   nucleotides) and the number of mutations is small enough (around
--   20-26, since the algorithm is exponential in this number) the
--   algorithm should work for similar problems without changes.
@package MutationOrder
@version 0.0.0.2


-- | Here we collect the necessary data structures for the RNAs to be
--   compared. This data is serialized to disk once calculated, since it is
--   most likely the part that takes longest.
--   
--   TODO if the vienna wrapper allows, we should parallelize the
--   calculations.
--   
--   TODO nice interface counting up?
module BioInf.MutationOrder.RNA

-- | A single RNA with pre-calculated elements.
--   
--   All calculations are done at 37 C.
--   
--   TODO include the basepair probability matrix? Can we "compress" that
--   one?
--   
--   We do not encode D1S into the json
data RNA
RNA :: !(Vector (Int, Char)) -> !ByteString -> !ByteString -> !D1Secondary -> !Double -> !ByteString -> !D1Secondary -> !Double -> RNA

-- | we store just the mutation set, since this is more sparse and gives
--   access to the mutational events.
[mutationSet] :: RNA -> !(Vector (Int, Char))

-- | store RNA sequence too, for now
[primarySequence] :: RNA -> !ByteString

-- | the mfe structure we get
[mfeStructure] :: RNA -> !ByteString

-- | efficient structure encoding
[mfeD1S] :: RNA -> !D1Secondary

-- | mfe energy of the structure
[mfeEnergy] :: RNA -> !Double

-- | the centroid structure
[centroidStructure] :: RNA -> !ByteString

-- | efficient centroid structure encoding
[centroidD1S] :: RNA -> !D1Secondary
[centroidEnergy] :: RNA -> !Double
bldD1S :: ByteString -> D1Secondary

-- | Given the primary sequence and the mutation set, fill the <a>RNA</a>
--   structure.
--   
--   NOTE This wraps some <tt>ViennaRNA-bindings</tt> calls that are in
--   <tt>IO</tt>.
--   
--   TODO check if these calls are *really* thread-safe!
mkRNA :: Maybe (HashMap ByteString QLine) -> ByteString -> Vector (Int, Char) -> RNA

-- | Insert a set of mutations in a <tt>ByteString</tt>.
insertMutations :: Vector (Int, Char) -> ByteString -> ByteString
data Landscape
Landscape :: HashMap (BitSet I) RNA -> !Int -> !ByteString -> !ByteString -> !(BimapHashMap Int Int) -> Landscape

-- | the individual RNA mutations. The index should be calculated from
--   <tt>linearIndex 0 high mutationSet</tt>
[rnas] :: Landscape -> HashMap (BitSet I) RNA

-- | how many nucleotides are mutated in total
[mutationCount] :: Landscape -> !Int

-- | the ancestral sequence
[landscapeOrigin] :: Landscape -> !ByteString

-- | the final sequence
[landscapeDestination] :: Landscape -> !ByteString
[mutationPositions] :: Landscape -> !(BimapHashMap Int Int)

-- | TODO prime candidate for parallelization. ViennaRNA-bindings currently
--   does not allow parallel runs! It would be possible to consider
--   externalizing this, but for now we just run single-threaded.
createRNAlandscape :: Maybe (HashMap ByteString QLine) -> Bool -> ByteString -> ByteString -> (Landscape, [(Int, ByteString)])

-- | Write a generated landscape to disk.
toFile :: FilePath -> Landscape -> IO ()
toFileJSON :: FilePath -> Landscape -> IO ()
fromFile :: FilePath -> IO Landscape
fromFileJSON :: FilePath -> IO Landscape
data QLine
QLine :: ByteString -> (ByteString, Double) -> (ByteString, Double) -> (ByteString, Double) -> QLine
[qlSequence] :: QLine -> ByteString
[qlmfe] :: QLine -> (ByteString, Double)
[qlensemble] :: QLine -> (ByteString, Double)
[qlcentroid] :: QLine -> (ByteString, Double)
qlines :: FilePath -> IO (HashMap ByteString QLine)
instance GHC.Show.Show BioInf.MutationOrder.RNA.QLine
instance GHC.Generics.Generic BioInf.MutationOrder.RNA.Landscape
instance GHC.Classes.Eq BioInf.MutationOrder.RNA.Landscape
instance GHC.Show.Show BioInf.MutationOrder.RNA.Landscape
instance GHC.Generics.Generic BioInf.MutationOrder.RNA.RNA
instance GHC.Classes.Eq BioInf.MutationOrder.RNA.RNA
instance GHC.Show.Show BioInf.MutationOrder.RNA.RNA
instance Control.DeepSeq.NFData BioInf.MutationOrder.RNA.RNA
instance Data.Serialize.Serialize BioInf.MutationOrder.RNA.RNA
instance Data.Aeson.Types.ToJSON.ToJSON BioInf.MutationOrder.RNA.RNA
instance Data.Aeson.Types.FromJSON.FromJSON BioInf.MutationOrder.RNA.RNA
instance Control.DeepSeq.NFData BioInf.MutationOrder.RNA.Landscape
instance Data.Serialize.Serialize BioInf.MutationOrder.RNA.Landscape
instance Data.Aeson.Types.ToJSON.ToJSON BioInf.MutationOrder.RNA.Landscape
instance Data.Aeson.Types.FromJSON.FromJSON BioInf.MutationOrder.RNA.Landscape


-- | Calculate minimum-distance Hamiltonian Shortest Paths and
--   probabilities for starting nodes.
--   
--   NOTE: We explicitly model starting nodes. For symmetrical distance
--   matrices, this reports begin/end probabilities. For asymmetrical
--   distance matrices, a second instance with <tt>Last</tt> instead of
--   <tt>First</tt> boundary should be created to calculate begin/end
--   probabilities separately.
module BioInf.MutationOrder.MinDist

-- | Given the <a>RNA</a> we come from and the <a>RNA</a> we mutate into,
--   derive the gain or loss by a scaling function.
type ScaleFunction = RNA -> RNA -> Double

-- | Minimal distance algebra
--   
--   TODO The two Ints are the indices of the nodes and could be replaced?
aMinDist :: Monad m => ScaleFunction -> Landscape -> SigMinDist m Double Double ((Int :. From) :. To) (Int :. To)

-- | Fused co-optimal counter!
--   
--   TODO for now, <tt>Int</tt> is assumed to be big enough...
aMinDistCount :: Monad m => ScaleFunction -> Landscape -> SigMinDist m (Double, Int) (Double, Int) ((Int :. From) :. To) (Int :. To)

-- | Sum over all states and collapse into boundary unscaled weights.
aInside :: Monad m => Maybe Int -> ScaleFunction -> Landscape -> SigMinDist m (Log Double) (Log Double) ((Int :. From) :. To) (Int :. To)

-- | This should give the correct order of nodes independent of the
--   underlying <tt>Set1 First</tt> or <tt>Set1 Last</tt> because the
--   <tt>(From:.To)</tt> system is agnostic over these.
--   
--   TODO Use text builder
aPretty :: Monad m => ScaleFunction -> Landscape -> SigMinDist m Text [Text] ((Int :. From) :. To) (Int :. To)

-- | Count co-optimals
aCount :: Monad m => Landscape -> SigMinDist m Integer [Integer] ((Int :. From) :. To) (Int :. To)
type TS1 x = TwITbl Id Unboxed EmptyOk (BS1 First I) x
type U x = TwITbl Id Unboxed EmptyOk (Unit I) x
type PF x = TwITbl Id Unboxed EmptyOk (Boundary First I) x
type TS1L x = TwITbl Id Unboxed EmptyOk (BS1 Last I) x
type PFL x = TwITbl Id Unboxed EmptyOk (Boundary Last I) x
type BT1 x b = TwITblBt Unboxed EmptyOk (BS1 First I) x Id Id b
type BTU x b = TwITblBt Unboxed EmptyOk (Unit I) x Id Id b
type BT1L x b = TwITblBt Unboxed EmptyOk (BS1 Last I) x Id Id b

-- | Run the minimal distance algebra.
--   
--   This produces one-boundary sets. Meaning that for each boundary we get
--   the total distance within the set.
forwardMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double
backtrackMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double -> [Text]

-- | Count the number of co-optimals
minDistCount :: ScaleFunction -> Landscape -> (Z :. TS1L (Double, Int)) :. U (Double, Int)
countBackMinDist1 :: ScaleFunction -> Landscape -> (Z :. TS1L Double) :. U Double -> [Integer]

-- | Given the <tt>Set1</tt> produced in <tt>forwardMinDist1</tt> we can
--   now extract the co-optimal paths using the <tt>Set1 -&gt; ()</tt>
--   index change.
--   
--   TODO do we want this one explicitly or make life easy and just extract
--   from all <tt>forwardMinDist1</tt> paths?
runCoOptDist :: ScaleFunction -> Landscape -> (Double, [Text])
runCount :: ScaleFunction -> Landscape -> (Double, Int)

-- | Extract the individual partition scores.
boundaryPartFunFirst :: Maybe Int -> ScaleFunction -> Landscape -> [(Boundary First I, Log Double)]
boundaryPartFunLast :: Maybe Int -> ScaleFunction -> Landscape -> BoundaryPart
data BoundaryPart
BoundaryPart :: [(Boundary Last I, Log Double)] -> [(Boundary Last I, Log Double)] -> Log Double -> BoundaryPart
[bpNormalized] :: BoundaryPart -> [(Boundary Last I, Log Double)]
[bpUnnormalized] :: BoundaryPart -> [(Boundary Last I, Log Double)]
[bpTotal] :: BoundaryPart -> Log Double
boundaryPart :: [(Boundary Last I, Log Double)] -> BoundaryPart
instance GHC.Classes.Eq BioInf.MutationOrder.MinDist.BoundaryPart
instance GHC.Show.Show BioInf.MutationOrder.MinDist.BoundaryPart

module BioInf.MutationOrder.EdgeProb

-- | Before using <tt>aInside</tt> the <tt>ScoreMat</tt> needs to be scaled
--   
--   TODO the <tt>Edge</tt> needs to be an <tt>EdgeWithActive</tt> to get
--   the active bits on the left in the set.
aInside :: Monad m => ScaleFunction -> Landscape -> SigEdgeProb m (Log Double) (Log Double) ((Int :. From) :. To) (Int :. To)
type TF1 x = TwITbl Id Unboxed EmptyOk (BS1 Last I) x
type TL1 x = TwITbl Id Unboxed EmptyOk (BS1 Last O) x
type EB x = TwITbl Id Unboxed EmptyOk (EdgeBoundary C) x

-- | Extract the individual partition scores.
edgeProbPartFun :: ScaleFunction -> Landscape -> ([(Boundary Last I, Log Double)], [(EdgeBoundary C, Log Double)])

-- | Turn the edge probabilities into a score matrix.
edgeProbScoreMatrix :: Landscape -> [Log Double] -> [(EdgeBoundary C, Log Double)] -> ScoreMatrix (Log Double)


-- | Run all steps of the HoxCluster algorithms in order.
--   
--   This will produce the following:
--   
--   <ol>
--   <li>run the minimal distance algorithm, give the minimal distance
--   score and return all co-optimal paths</li>
--   <li>run the end-probability algorithm and return the probability that
--   each node is the begin/end of a chain</li>
--   <li>run the edge probability algorithm and give the probability for
--   each <tt>from :-&gt; to</tt> edge</li>
--   <li>with the edge probabilities, run the maximal probability path
--   algorithm, return that probability and all co-optimal paths</li>
--   </ol>
--   
--   TODO -Pretty should yield a structure to be given to the eps or svg
--   generator. This allows more flexibility. Does diagrams offer
--   serialization?
--   
--   TODO All this should be wrapped and available as a function, not just
--   providing output files.
module BioInf.MutationOrder
runMutationOrder :: Bool -> FillWeight -> FillStyle -> ScaleFunction -> ScaleFunction -> Int -> Int -> Maybe FilePath -> [Char] -> FilePath -> t -> Bool -> [FilePath] -> IO ()
posScaled :: Double -> Double -> ScaleFunction -> ScaleFunction

-- | Basepair distance
basepairDistanceMFE :: ScaleFunction
basepairDistanceCentroid :: ScaleFunction

-- | Scale function for normal mfe delta energies
mfeDelta :: ScaleFunction

-- | Scale function for normal centroid delta energies
centroidDelta :: ScaleFunction

-- | Square positive "contributions", making bad moves more unlikely
squaredPositive :: ScaleFunction -> ScaleFunction

-- | Scale by temperature (for probability stuff)
scaleTemperature :: Double -> ScaleFunction -> ScaleFunction
scaleByFunction :: (t3 -> t2) -> (t1 -> t -> t3) -> t1 -> t -> t2

-- | Stupid fasta reader
stupidReader :: FilePath -> IO ByteString

-- | <tt>withDumpFile</tt> is like <tt>idIO :: a -&gt; IO a</tt> in that it
--   returns the data we give to the function. However, in case the dump
--   file exists, we read it and return its contents, instead of
--   recalculating. If it does not exist, we dump the data in addition to
--   returning it. This forces the <tt>Landscape</tt>.
withDumpFile :: Handle -> FilePath -> ByteString -> ByteString -> Landscape -> IO Landscape

-- | Fill weight for our grid. If the fill weight is <tt>logarithmic</tt>,
--   then the line length is <tt>1 / (1 + log value)</tt> otherwise it is
--   <tt>value</tt>.
data FillWeight :: *
FWlog :: FillWeight
FWlinear :: FillWeight
FWfill :: FillWeight
data FillStyle :: *
FSopacityLog :: FillStyle
FSopacityLinear :: FillStyle
FSfull :: FillStyle

-- | Given the <a>RNA</a> we come from and the <a>RNA</a> we mutate into,
--   derive the gain or loss by a scaling function.
type ScaleFunction = RNA -> RNA -> Double