-- | Run all steps of the HoxCluster algorithms in order. -- -- This will produce the following: -- -- 1. run the minimal distance algorithm, give the minimal distance score -- and return all co-optimal paths -- -- 2. run the end-probability algorithm and return the probability that -- each node is the begin/end of a chain -- -- 3. run the edge probability algorithm and give the probability for each -- @from :-> to@ edge -- -- 4. with the edge probabilities, run the maximal probability path -- algorithm, return that probability and all co-optimal paths -- -- TODO -Pretty should yield a structure to be given to the eps or svg -- generator. This allows more flexibility. Does diagrams offer -- serialization? -- -- TODO All this should be wrapped and available as a function. not just -- providing output files. module BioInf.MutationOrder ( module BioInf.MutationOrder , FillWeight (..) , FillStyle (..) , ScaleFunction (..) ) where import Control.Arrow (first,second) import Control.Error import Control.Monad.IO.Class (liftIO) import Control.Monad (unless,forM_,when,forM) import Control.Lens import Data.ByteString.Strict.Lens import Data.Bits import Data.ByteString (ByteString) import Data.Char (toUpper) import Data.Data import Data.Function (on) import Data.List (groupBy,sortBy,foldl',(\\),sort) import Data.List.Split (chunksOf) import Data.Ord (comparing) import Data.Tuple (swap) import Debug.Trace import Numeric.Log import qualified Control.Parallel.Strategies as Par import qualified Data.ByteString.Char8 as BS import qualified Data.HashMap.Strict as HM import qualified Data.Map.Strict as M import qualified Data.Text as T import qualified Data.Text.IO as T import qualified Data.Trie as Trie import qualified Data.Vector.Unboxed as VU import System.Directory (doesFileExist) import System.Exit (exitFailure) import System.Exit (exitSuccess) import System.IO (withFile,IOMode(WriteMode),hPutStrLn,Handle,stderr) import Text.Printf import ADP.Fusion.Term.Edge.Type (From(..),To(..)) import Biobase.Secondary.Diagrams (d1Distance) import Data.PrimitiveArray (fromEdgeBoundaryFst, EdgeBoundary(..), (:.)(..), getBoundary) import Data.PrimitiveArray.ScoreMatrix import Diagrams.TwoD.ProbabilityGrid import qualified Data.Bijection.HashMap as B import qualified Data.PrimitiveArray as PA import qualified ShortestPath.SHP.Edge.MinDist as SHP import Data.Bits.Ordered import BioInf.MutationOrder.EdgeProb import BioInf.MutationOrder.MinDist import BioInf.MutationOrder.RNA import BioInf.MutationOrder.SequenceDB import qualified BioInf.MutationOrder.BackMutations as BM data ScoreType = Mfe | Centroid | PairDistMfe | PairDistCen | PairDistMfeTgt | PairDistCenTgt deriving (Show,Data,Typeable) runMutationOrder verbose fw fs scoretype positivesquared posscaled onlypositive cooptCount cooptPrint lkupFile outprefix workdb temperature equalStart [ancestralFP,currentFP] everyKth = do -- only run if out file(s) do not exist dfe <- doesFileExist (outprefix ++ ".run") when dfe $ do hPrintf stderr "%s.run exists, ending now!\n" outprefix exitSuccess withFile (outprefix ++ ".run") WriteMode $ \oH -> do hPrintf stderr "%s.run job started!\n" outprefix -- -- Initial stuff and debug information -- ancestral <- stupidReader ancestralFP current <- stupidReader currentFP lkup <- case lkupFile of {Nothing -> return Nothing; Just f -> Just <$> qlines f} hPrintf stderr "prepare to load dump file\n" !ls <- withDumpFile oH workdb ancestral current . fst $ createRNAlandscape lkup verbose ancestral current hPrintf stderr "dump file loaded\n" -- final state in the rna landscape let !tgtRNA = rnas ls HM.! (2^ mutationCount ls - 1) hPrintf stderr "rna db loaded, tgt rna set\n" let fwdScaleFunction = (if positivesquared then squaredPositive else id) . (maybe id (uncurry posScaled) posscaled) . (if onlypositive then (scaleByFunction (max 0)) else id) $ (case scoretype of Mfe -> mfeDelta Centroid -> centroidDelta PairDistMfe -> basepairDistanceMFE PairDistCen -> basepairDistanceCentroid PairDistMfeTgt → bpMFEDistToExtant tgtRNA PairDistCenTgt → bpCentroidDistToExtant tgtRNA ) let probScaleFunction = scaleTemperature temperature . (if positivesquared then squaredPositive else id) . (maybe id (uncurry posScaled) posscaled) . (if onlypositive then (scaleByFunction (max 0)) else id) $ (case scoretype of Mfe -> mfeDelta Centroid -> centroidDelta PairDistMfe -> basepairDistanceMFE PairDistCen -> basepairDistanceCentroid PairDistMfeTgt → bpMFEDistToExtant tgtRNA PairDistCenTgt → bpCentroidDistToExtant tgtRNA ) let mpks = sortBy (comparing snd) . B.toList $ mutationPositions ls let bitToNuc = M.fromList $ map (swap . first (+1)) mpks let nn = length mpks hPrintf oH "number of mutations: %d\n" $ mutationCount ls hPrintf oH "\n%s\n\n" $ replicate 80 '=' -- -- Run co-optimal lowest energy changes -- hPrintf stderr "starting forward phase\n" let (!e,bs) = runCoOptDist fwdScaleFunction ls hPrintf stderr "forward phase done with %f\n" e -- forM_ bs $ \b → hPrintf stderr "%s\n" (show b) let (ecount,countcount) = runCount fwdScaleFunction ls -- split co-optimals into "want to print" and "want to count"; -- @countbs@ should be possible to stream let (printbs,countbs) = splitAt cooptPrint bs hPrintf stderr "starting coopthisto\n" let takeKth k [] = [] takeKth k (x:xs) = x : takeKth k xs let ch = coopthisto $ map snd $ takeKth everyKth $ take cooptCount $ printbs ++ countbs hPrintf stderr (show ch ++ "\n") -- TODO here we can now do a histogram with printbs and countbs hPrintf oH "Best energy gain: %10.4f\n" e hPrintf oH "Number of co-optimal paths: %10d\n" countcount -- ((length printbs) + (length $ take (cooptCount-cooptPrint) bs)) forM_ printbs (T.hPutStrLn oH . fst) hPrintf oH "%s\n\n" $ replicate 80 '=' -- -- Run @First@ probability algorithm to determine the probability for -- each mutation to be the initial one -- -- TODO this is completely wrong, because it still starts at the -- ancestral sequence. We would have to start at the extant sequence. -- Need to later think about this. But do not use any @First@ functions -- now! {- hPrintf oH "Chain begin probabilities:\n" let fps = boundaryPartFunFirst Nothing probScaleFunction ls forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" (mp+1) hPrintf oH "\n" forM_ fps $ \(_, Exp p) -> hPrintf oH " %6.4f" (exp p) hPrintf oH "\n\n" printf "\n" -} -- -- Run @Last@ probability algorithm to determine the probability for -- each mutation to be the last one -- hPrintf oH "Chain end probabilities:\n" let fps = boundaryPartFunLast Nothing probScaleFunction ls forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" (mp+1) hPrintf oH "\n" forM_ (bpNormalized fps) $ \(_, Exp p) -> hPrintf oH " %6.4f" (exp p) hPrintf oH "\n\n%s\n\n" $ replicate 80 '=' --printf "\n" -- -- Run specialized versions of the above, restricting the first mutation -- to the given one. Marginalized over the last probability, and rescaled -- we get the first probability. Completely printed out, we get the joint -- probability for each @i,j@ to be @first,last@ in the chain. -- hPrintf oH "Restricted chain end probabilities\n" let rbps = map (\(mp,k) -> (mp,k,boundaryPartFunLast (Just k) probScaleFunction ls)) mpks {- forM_ rbps $ \(mp,k,bp) -> do hPrintf oH "%5d %5d\n" (mp+1) k forM_ (bpUnnormalized bp) $ \(l,Exp p) -> hPrintf oH "%7d " (bitToNuc M.! getBoundary l) hPrintf oH "\n" forM_ (bpUnnormalized bp) $ \(l,p) -> hPrintf oH "%7.2f " (exp . ln $ p / bpTotal bp) hPrintf oH "\n" hPrintf oH "\n" -} -- collect all restricted partition function scores and prepare for -- normalization let firstlastUn = M.fromList [ ((mp+1,bitToNuc M.! getBoundary l), logp) | (mp,k,bp) <- rbps, (l,logp) <- bpUnnormalized bp ] let firstlastZ = Numeric.Log.sum [ bpTotal bp | (_,_,bp) <- rbps ] let firstlastLogP = M.map (/firstlastZ) firstlastUn let firstlastP = M.map (exp . ln) firstlastLogP -- rowMarginals gives the total probability that the mutation order -- begins with this mutation. let rowMarginals = M.mapKeysWith (+) fst firstlastP -- colMarginals gives the total probability that the mutation order -- ends with this mutation. let colMarginals = M.mapKeysWith (+) snd firstlastP hPrintf oH "lnZ: %8.5f\n" $ ln firstlastZ hPrintf oH " " forM_ (M.elems bitToNuc) $ \mut -> hPrintf oH "%6d " mut hPrintf oH " Σ\n" forM_ (M.elems bitToNuc) $ \frst -> do hPrintf oH "%4d " frst forM_ (M.elems bitToNuc) $ \lst -> hPrintf oH "%6.4f " (firstlastP M.! (frst,lst)) hPrintf oH " %6.4f\n" $ rowMarginals M.! frst hPrintf oH "Σ " forM_ (M.elems colMarginals) $ hPrintf oH "%6.4f " hPrintf oH "\n\n" hPrintf oH "divergence from proper normalization: %10.8f\n" (1 - Prelude.sum firstlastP) hPrintf oH "row marginal sum %10.8f\n" (Prelude.sum rowMarginals) hPrintf oH "col marginal sum %10.8f\n" (Prelude.sum colMarginals) hPrintf oH "\n%s\n\n" $ replicate 80 '=' -- debug on {- hPrintf oH "%f\n" $ ln firstlastZ hPrintf oH "%s " $ replicate 10 ' ' forM_ (M.elems bitToNuc) $ \mut -> hPrintf oH "%10d " mut hPrintf oH "\n" forM_ (M.elems bitToNuc) $ \frst -> do hPrintf oH "%8d " frst forM_ (M.elems bitToNuc) $ \lst -> hPrintf oH "%10.4f " (ln $ firstlastUn M.! (frst,lst)) hPrintf oH "\n" hPrintf oH "\n" hPrintf oH "%f\n" $ ln firstlastZ hPrintf oH "%s " $ replicate 10 ' ' forM_ (M.elems bitToNuc) $ \mut -> hPrintf oH "%10d " mut hPrintf oH "\n" forM_ (M.elems bitToNuc) $ \frst -> do hPrintf oH "%8d " frst forM_ (M.elems bitToNuc) $ \lst -> hPrintf oH "%10.4f " ((ln $ firstlastUn M.! (frst,lst)) - ln firstlastZ) hPrintf oH "\n" hPrintf oH "\n" -} -- debug off -- debug on -- calculate first weight, unnormalized -- let firstUn = M.fromList [ ] -- debug off -- -- -- Run edge probability Inside/Outside calculations. These take quite -- a while longer. -- let (ibs,eps) = edgeProbPartFun probScaleFunction ls hPrintf oH "pairwise next mutation probabilities:\n\n" hPrintf oH " " forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" k hPutStrLn oH "" hPrintf oH " " forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" (mp+1) hPutStrLn oH "" forM_ (zip (groupBy ((==) `on` (fromEdgeBoundaryFst . fst)) eps) mpks) $ \(rps,(mp,k)) -> do let (eb,_) = head rps hPrintf oH "%3d %3d" k (mp+1) forM_ rps $ \(eb,Exp p) -> hPrintf oH (" %6.4f") (exp p) hPrintf oH " %6.4f" (Prelude.sum $ map (exp . ln . snd) rps) hPrintf oH "\n" let colSums = M.fromListWith (+) [ (c,p) | ((_ :-> c),p) <- eps ] hPrintf oH " Σ " forM_ (M.toList colSums) $ \(c,Exp p) -> hPrintf oH (" %6.4f") (exp p) hPutStrLn oH "\n" gridFile [SVG,EPS] (outprefix ++ "-edge") fw fs nn nn (map (show . (+1) . fst) mpks) (map (show . (+1) . fst) mpks) (map snd eps) hPrintf oH "\n%s\n\n" $ replicate 80 '=' -- -- Generate the path with maximal edge probability -- {- let eprobsFirst = edgeProbScoreMatrix ls (Prelude.map (Exp . log) $ M.elems colMarginals) eps let (Exp maxprob,mpbt) = SHP.runMaxEdgeProbFirst eprobsFirst hPrintf oH "Maximal Edge Log-Probability Sum: %6.4f with at least %d co-optimal paths\n" maxprob (length $ take cooptCount mpbt) hPutStrLn oH "first mutation to extant species\n" forM_ (take cooptPrint mpbt) $ \bt -> do let extractMut (SHP.BTnode (_:.To n)) = n extractMut (SHP.BTedge (From ff:.To tt)) = ff let mutationOrder = tail $ scanl (\set mut -> set `setBit` extractMut mut) zeroBits (reverse bt) let prettyPrint mut k = do let rna = rnas ls HM.! mut hPrintf oH " %3s %s\n %s MFE %6.4f\n %s CNT %6.4f\n" (maybe "anc" (show . (+1) . fst . (!!) mpks) k) (BS.unpack $ primarySequence rna) (BS.unpack $ mfeStructure rna) (mfeEnergy rna) (BS.unpack $ centroidStructure rna) (centroidEnergy rna) hPutStrLn oH $ replicate 8 ' ' ++ (take (BS.length $ primarySequence rna) . concat $ zipWith (\xs x -> xs ++ show x) (repeat $ " . ") (drop 1 $ cycle [0..9])) prettyPrint zeroBits Nothing forM_ (zip (reverse bt) mutationOrder) $ \case (SHP.BTnode (_:.To n),mut) -> prettyPrint mut (Just n) (SHP.BTedge (From ff:.To tt),mut) -> prettyPrint mut (Just ff) hPutStrLn oH "" hPutStrLn oH "" -} -- the rowMarginals hold the probabily to begin with a mutation. Since -- @Last@ goes from first to last mutation, this is what we need. let eplStartWeight = if equalStart then Prelude.map (const 1) $ M.elems rowMarginals else Prelude.map (Exp . log) $ M.elems rowMarginals let eprobsLast = edgeProbScoreMatrix ls eplStartWeight eps --print eprobsLast --print $ PA.assocs $ scoreMatrix eprobsLast let (Exp maxprobLast,lastLogProbs,mpbtLast') = SHP.runMaxEdgeProbLast eprobsLast let mpbtLast = map reverse mpbtLast' --print maxprobLast --print lastLogProbs --mapM_ print $ mpbtLast hPrintf oH "Fraction of optimal choice for each final mutation:\n" forM_ lastLogProbs $ \(PA.Boundary b, _) -> hPrintf oH " %6d" $ bitToNuc M.! b hPrintf oH "\n" forM_ lastLogProbs $ \(_, p) -> hPrintf oH " %6.4f" $ exp $ ln (p / Exp maxprobLast) hPrintf oH "\n\n" hPrintf oH "Maximal edge log-probability sum: %6.4f (P = %10.8f) with at least %d co-optimal paths\n" maxprobLast (exp maxprobLast) (length $ take cooptCount mpbtLast) hPutStrLn oH "(first mutation to extant species)\n" forM_ (take cooptPrint mpbtLast) $ \bt -> do let extractMut (SHP.BTnode (_:.To n)) = n extractMut (SHP.BTedge (From ff:.To tt)) = tt let mutationOrder = tail $ scanl (\set mut -> set `setBit` extractMut mut) zeroBits bt let prettyPrint mut k = do let rna = rnas ls HM.! mut hPrintf oH " %3s %s\n %s MFE %6.4f\n %s CNT %6.4f\n" (maybe "anc" (show . (+1) . fst . (!!) mpks) k) (BS.unpack $ primarySequence rna) (BS.unpack $ mfeStructure rna) (mfeEnergy rna) (BS.unpack $ centroidStructure rna) (centroidEnergy rna) hPutStrLn oH $ replicate 8 ' ' ++ (take (BS.length $ primarySequence rna) . concat $ zipWith (\xs x -> xs ++ show x) (repeat $ " . ") (drop 1 $ cycle [0..9])) prettyPrint zeroBits Nothing forM_ (zip bt mutationOrder) $ \case (SHP.BTnode (_:.To n),mut) -> prettyPrint mut (Just n) (SHP.BTedge (From ff:.To tt),mut) -> prettyPrint mut (Just tt) hPutStrLn oH "" hPutStrLn oH "" -- NEW: coopt histograms hPutStrLn oH "Co-optimality histograms" hPrintf oH " " forM_ [1.. mutationCount ls] (hPrintf oH " %10d") hPrintf oH "\n" forM_ ch $ \(h,as,rs) → do hPrintf oH "%5d" $ h+1 -- absolutes VU.mapM_ (\c → hPrintf oH " %10d" c) as -- relatives hPrintf oH "\n r" VU.mapM_ (\c → hPrintf oH " %10.8f" c) rs -- -- log-scale (1 + relative count) -- hPrintf oH "\n lnR " -- VU.mapM_ (\c → hPrintf oH " %10.8f" (log $ 1+c)) rs hPrintf oH "\n" hPrintf oH "\n" -- -- MEA order let meaOrder = let go = \case SHP.BTnode (_:.To n) -> n SHP.BTedge (From ff:.To tt) -> tt in map go $ concat $ take 1 mpbtLast let meaAnno = map (\k -> map (show . (+1) . fst) mpks !! k) meaOrder let meaEps = [ (ee !! k) !! l | let ee = groupBy ((==) `on` (fromEdgeBoundaryFst . fst)) eps, k <- meaOrder, l <- meaOrder ] gridFile [SVG,EPS] (outprefix ++ "-edge-meaorder") fw fs nn nn meaAnno meaAnno (map snd meaEps) --print eps --print meaEps {- let eprobsLast = edgeProbScoreMatrix ls (Prelude.map (Exp . log) $ M.elems rowMarginals) eps let (Exp maxprobLast,lastLogProbs,mpbtLast) = SHP.runMaxEdgeProbLast eprobsLast print maxprobLast print $ map (\(k,Exp p) -> (k,exp $ p - maxprobLast)) lastLogProbs mapM_ print $ concat $ take 2 mpbtLast let eprobsLast = edgeProbScoreMatrix ls (Prelude.map (Exp . log) $ M.elems colMarginals) eps let (Exp maxprobLast,lastLogProbs,mpbtLast) = SHP.runMaxEdgeProbLast eprobsLast print maxprobLast print $ map (\(k,Exp p) -> (k,exp $ p - maxprobLast)) lastLogProbs mapM_ print $ concat $ take 2 mpbtLast -} {-# NoInline runMutationOrder #-} -- | Histogram that provides for each mutation position the relative frequency -- with which the mutation shows up as the k'th one in the order. coopthisto ∷ [[Int]] -- ^ list of permutation orders @[ [116, 10, 48, 30] , [10, 30, 48, 116], ... ]@ → [(Int,VU.Vector Int, VU.Vector Double)] -- ^ list of mutation, absolute frequency of position, relative frequency of position coopthisto as@(a':_) = es -- traceShow (keys,as,bs,cs,ds) $ es where -- just so that we know keys = sort a' -- add positional information: each permutation order has @(,index)@ added. -- then we can concatenate all lists and get @[ (116,1), (10,2), ... (116,4), ...]@ rks = reverse [1 .. length keys] bs = concatMap (\a → zip a rks) as -- into hashmap, (mutation, order position) sum up how often seen cs = HM.fromListWith (+) $ map (,1∷Int) bs -- absolute frequencies ds = [ (k, VU.fromList [ HM.lookupDefault 0 (k, i) cs | i ← [1 .. length keys] ]) | k ← keys ] -- attach relative frequencies es = [ (k, abs, VU.map (\q → fromIntegral q / s) abs) | (k,abs) ← ds, let (s∷Double) = fromIntegral $ VU.sum abs ] posScaled :: Double -> Double -> ScaleFunction -> ScaleFunction posScaled l s = scaleByFunction go where go d | d >= l = d ** s | otherwise = d {-# Inline go #-} {-# Inlinable posScaled #-} -- ** Basepair distance, between neighbors basepairDistanceMFE :: ScaleFunction basepairDistanceMFE frna trna = fromIntegral $ d1Distance (mfeD1S frna) (mfeD1S trna) basepairDistanceCentroid :: ScaleFunction basepairDistanceCentroid frna trna = fromIntegral $ d1Distance (centroidD1S frna) (centroidD1S trna) -- ** Basepair distance, from currently newest to extant bpMFEDistToExtant ∷ RNA → ScaleFunction bpMFEDistToExtant extant frna trna = fromIntegral $ d1Distance e (mfeD1S trna) - d1Distance e (mfeD1S frna) where e = mfeD1S extant bpCentroidDistToExtant ∷ RNA → ScaleFunction bpCentroidDistToExtant extant frna trna = fromIntegral $ d1Distance e (mfeD1S trna) - d1Distance e (mfeD1S frna) where e = mfeD1S extant -- | Scale function for normal mfe delta energies mfeDelta :: ScaleFunction mfeDelta frna trna = mfeEnergy trna - mfeEnergy frna {-# Inlinable mfeDelta #-} -- | Scale function for normal centroid delta energies centroidDelta :: ScaleFunction centroidDelta frna trna = centroidEnergy trna - centroidEnergy frna {-# Inlinable centroidDelta #-} -- | Square positive "contributions", making bad moves more unlikely squaredPositive :: ScaleFunction -> ScaleFunction squaredPositive sf = scaleByFunction sp sf where sp d | d > 0 = d * d | otherwise = d {-# Inline sp #-} {-# Inlinable squaredPositive #-} -- | Scale by temperature (for probability stuff) scaleTemperature :: Double -> ScaleFunction -> ScaleFunction scaleTemperature t sf = scaleByFunction (/t) sf {-# Inlinable scaleTemperature #-} scaleByFunction f sf = \frna trna -> let d = sf frna trna in f d {-# Inlinable scaleByFunction #-} -- | Basepair distance -- | Stupid fasta reader stupidReader :: FilePath -> IO ByteString stupidReader fp = do inp <- BS.lines <$> BS.readFile fp let xs = filter (\x -> not (BS.null x) && BS.head x /= '>') inp return $ BS.concat xs -- | @withDumpFile@ is like @idIO :: a -> IO a@ in that it returns the data -- we give to the function. However, in case the dump file exists, we read -- it and return its contents, instead of recalculating. If it does not -- exist, we dump the data in addition to returning it. This forces the -- @Landscape@. withDumpFile :: Handle -> FilePath -- ^ The path we store the serialized and compressed dump in -> ByteString -- ^ ancestral / origin sequence -> ByteString -- ^ destination sequence -> Landscape -- ^ the element which is to be serialized in the dump, or which would be -- the data in the dump -> IO Landscape -- ^ the data we put in, but maybe taken from the dump file withDumpFile oH fp ancestral current l = do dfe <- doesFileExist fp if dfe then do hPrintf oH "using database %s to load sequence information\n" fp ls <- fromFileJSON fp -- now we check if we have a sane DB file unless (landscapeOrigin ls == ancestral && landscapeDestination ls == current) $ do hPutStrLn oH "ancestral or target sequence do not match those stored in the work database" hPutStrLn oH $ "given ancestral: " ++ BS.unpack ancestral hPutStrLn oH $ "DB ancestral: " ++ (BS.unpack $ landscapeOrigin ls) hPutStrLn oH $ "given current: " ++ BS.unpack current hPutStrLn oH $ "DB current: " ++ (BS.unpack $ landscapeDestination ls) exitFailure return ls else do hPrintf oH "database %s does not exist! Folding all intermediate structures. This may take a while!\n" fp toFileJSON fp l return l -- | This function will run a subset of the possible backmutations. In -- particular, only those mutations for one particular backmutation column are -- being used. -- -- TODO less explicit transformer stack! runBackmutationVariants ∷ BM.ScaleFunction Double → FilePath -- ^ where the work db lives → [Char] -- ^ → Ancestral -- ^ The ancestral sequence from which we mutate away → Extant -- ^ The sequence to which to mutate to → Int -- ^ The backmutation / intermediate mutation we look at. Indexed with @[1..sequence length]@. -- TODO wrap in newtype that enforces this. We have some index structure saying "Start at 1". → ExceptT String IO () runBackmutationVariants scaleFun workdb alphabet ancestral extant ipos' = do let ipos = ipos' - 1 -- error guarding unless (ipos >=0) $ throwE "ipos can not be negative" unless (ipos < (BS.length $ getAncestral ancestral)) $ throwE "ipos larger than sequence length" unless (BS.length (getAncestral ancestral) == BS.length (getExtant extant)) $ throwE "ancestral and extant sequence do not have equal length" -- Load all sequences for the original problem -- they are needed anyway -- TODO i think @origSeqs'@ does not hold the original sequences, need to check tonight (seqCount, origSeqs', variants) ← createRNAlandscape2 alphabet (Right [ipos]) [] ancestral extant let origSeqs = Trie.fromList [ (s,()) | s ← origSeqs' ] varSeqs ← forM (alphabet \\ [getAncestral ancestral `BS.index` ipos, getExtant extant `BS.index` ipos]) $ \v → do let vsqs = Trie.fromList [ (s,()) | (i,c,ss) ← variants, c == v, s ← ss ] return (v, vsqs) -- liftIO $ print workdb -- TODO do only a single pass over the data -- origStrs ← liftIO $ filter (\r → rnaFoldSequence r `Trie.member` origSeqs) <$> map rna2dna <$> readRNAfoldFiles workdb allStrs ← liftIO $ filter (\r → rnaFoldSequence r `Trie.member` origSeqs || or [rnaFoldSequence r `Trie.member` vs | (_,vs) ← varSeqs] ) <$> map rna2dna <$> readRNAfoldFiles workdb let (rnas,_,_,_) = genSet ancestral extant Nothing allStrs -- The @ipos@ declares how many variants we have forM_ (alphabet \\ [getAncestral ancestral `BS.index` ipos, getExtant extant `BS.index` ipos]) $ \v → do let varSeqs = Trie.fromList [ (s,()) | (i,c,ss) ← variants, c == v, s ← ss ] varStrs ← liftIO $ filter (\r → rnaFoldSequence r `Trie.member` varSeqs) <$> map rna2dna <$> readRNAfoldFiles workdb let (ntrs,iposbitset,numMuts,mutpos) = genSet ancestral extant (Just (ipos,v)) allStrs --liftIO $ print (numMuts, iposbitset, rnas, ntrs, HM.size rnas, HM.size ntrs) let fmd = BM.forwardMinDist numMuts scaleFun iposbitset rnas ntrs let bts = BM.backtrackMinDist1 numMuts scaleFun iposbitset ipos' rnas ntrs mutpos fmd let evi = BM.forwardEvidence numMuts (partfun' scaleFun) iposbitset rnas ntrs liftIO $ printf "Unobserved Mutation Type: %s Position: %3d Nucleotide: %c Delta: %5.1f lnZ: %10.2f\n" (if iposbitset < 0 then "BM" else "IM" :: String) ipos' v (BM.forwardMinDistValue fmd) (ln $ BM.forwardEvidenceValue evi) liftIO $ mapM_ T.putStrLn $ take 1 bts return () return () mfeDelta' :: Bool → Bool → BM.ScaleFunction Double mfeDelta' mx sq frna trna = guardPositives mx sq $ t - f where t = rnaFoldMFEEner trna f = rnaFoldMFEEner frna {-# Inlinable mfeDelta' #-} centroidDelta' :: Bool → Bool → BM.ScaleFunction Double centroidDelta' mx sq frna trna = guardPositives mx sq $ t - f where t = rnaFoldCentroidEner trna f = rnaFoldCentroidEner frna {-# Inlinable centroidDelta' #-} guardPositives mx sq = (if sq then (\z → if z > 0 then z^2 else z) else id) . (if mx then max 0 else id) mfebpdist' ∷ Bool → Bool → BM.ScaleFunction Double mfebpdist' mx sq frna trna = fromIntegral $ d1Distance (bldD1S $ rnaFoldMFEStruc frna) (bldD1S $ rnaFoldMFEStruc trna) centroidbpdist' ∷ Bool → Bool → BM.ScaleFunction Double centroidbpdist' mx sq frna trna = fromIntegral $ d1Distance (bldD1S $ rnaFoldCentroidStruc frna) (bldD1S $ rnaFoldCentroidStruc trna) partfun' ∷ BM.ScaleFunction Double → BM.ScaleFunction (Log Double) partfun' f frna trna = Exp . negate $ f frna trna {-# Inlineable partfun' #-} rna2dna ∷ RNAfoldResult → RNAfoldResult rna2dna r = r { rnaFoldSequence = BS.map go $ rnaFoldSequence r } where go x =let x' = toUpper x in if x' == 'U' then 'T' else x' -- | Given ancestral and extant sequence, and possibly an intermediate -- mutation, as well as a list of intermediates create the set to RNA structure -- mapping. -- -- TODO some of the functions here should go into @Lib-SequencePolymorphism@. -- -- TODO should run within @ExceptT@ ! genSet ∷ Ancestral → Extant → Maybe (Int,Char) -- ^ If @Just@ then the 0-based position and character of the intermediate -- mutation. → [RNAfoldResult] → (HM.HashMap Int RNAfoldResult, Int, Int, B.BimapHashMap Int Int) genSet (Ancestral a') (Extant e') v xs = (HM.fromList kv, ipos, B.size posbit, posbit) where kv = [ (b, maybe (kvErr b) id $ HM.lookup (pat2str b) lkupRes) | b ← bits ] kvErr b = error $ show (lkupRes, sort $ map rnaFoldSequence xs, v, posbit, b, bits, pat2str b) -- update a/e based on if we have the intermediate mutation set up. a = maybe a' (\(i,c) → unpackedChars.ix i .~ c $ a') v e = maybe e' (\(i,c) → unpackedChars.ix i .~ c $ e') v -- all positions where the two bytestrings differ, together with the -- differing characters ks = filter (\(_,i,j) → i/=j) $ zip3 [0∷Int ..] (BS.unpack a') (BS.unpack e') -- turn into a bijection of actual position (first) and bit in bitset -- (second) posbit ∷ B.BimapHashMap Int Int posbit = B.fromList $ zip (ks^..traverse._1) [0∷Int ..] -- all bit patterns bits = [0 .. 2^B.size posbit - 1] -- convert a bit pattern to an actual string, to be looked up. Start -- with the ancestral sequence and for each @1@, modify the character -- into the one encountered in the extant sequence. pat2str ∷ Int → ByteString pat2str = let go s k = unpackedChars.ix (lk k) .~ (e `BS.index` lk k) $ s lk = maybe (error "lk") id . B.lookupR posbit in foldl' go a . activeBitsL -- lookup from sequence to RNAfoldResult lkupRes = HM.fromList [ (rnaFoldSequence x,x) | x ← xs ] -- is the global mutation intermediate? Yes: then >= 0 is the bitset -- element to be returned here ipos = maybe (-1) (\(z,_) → if BS.index a' z /= BS.index e' z then (maybe (error "ipos") id $ B.lookupL posbit z) else -1) v -- | Run the intermediate / backmutation order variant. This variant is slow, -- and requires large pre-calculated files, we parallelize and aggregate as -- much as possible. -- -- TODO read monad ?! {- runBackmutationVariants ∷ Int → [Char] → GlobalBackmutations → [BackmutationCol] → Ancestral → Extant → ExceptT String IO () runBackmutationVariants aggregate alphabet globback backcols ancestral extant = do -- Load all sequences for the original problem -- they are needed anyway (seqCount, origSeqs', variants) ← createRNAlandscape2 alphabet globback backcols ancestral extant let origSeqs = Trie.fromList [ (s,()) | s ← origSeqs' ] origStrs ← filter (\r → rnaFoldSequence r `Trie.member` origSeqs) <$> readRNAfoldFiles (error "workdb") let rnas = error "bitset -> rnafoldresult data" -- Group into sets of @aggregate@ elements for sequence aggregation let ass = chunksOf aggregate variants forM_ ass $ \as → do -- the required sequences are given by @origSeqs@ but modified at the appropriate position let aggrSeqss = map (\(p,n,xs) → (p,n,Trie.fromList [ (x,()) | x ← xs ])) as let allss = foldl' (\z (_,_,x) → Trie.unionL z x) Trie.empty aggrSeqss -- read in the structures for all as aggrStr ← filter (\r → rnaFoldSequence r `Trie.member` allss) <$> readRNAfoldFiles (error "workdb") let go (p,n,xs) = (p,n) where -- prepare the @ntrs@ data structure for each as ntrs = undefined $ filter (\r → rnaFoldSequence r `Trie.member` xs) aggrStr fMnD = BM.forwardMinDist (error "number of known mutations") (error "scale function") rnas ntrs fwdZ = BM.forwardEvidence (error "number of known mutations") (error "scale function for evidence") rnas ntrs -- parallel calculation and output for the different cases let rs = Par.parMap Par.rdeepseq go aggrSeqss forM_ rs $ \r → do return () return () -- print each output return () -}