module BioInf.MutationOrder
( module BioInf.MutationOrder
, FillWeight (..)
, FillStyle (..)
, ScaleFunction (..)
) where
import qualified Data.Vector.Unboxed as VU
import Data.Tuple (swap)
import Control.Monad (unless,forM_,when)
import Data.Bits
import Data.ByteString (ByteString)
import Data.Function (on)
import Data.List (groupBy,sortBy)
import Data.Ord (comparing)
import Numeric.Log
import qualified Data.ByteString.Char8 as BS
import qualified Data.HashMap.Strict as HM
import qualified Data.Map.Strict as M
import qualified Data.Text as T
import qualified Data.Text.IO as T
import System.Directory (doesFileExist)
import System.Exit (exitFailure)
import Text.Printf
import Control.Arrow (first,second)
import System.IO (withFile,IOMode(WriteMode),hPutStrLn,Handle)
import System.Exit (exitSuccess)
import ADP.Fusion.Term.Edge.Type (From(..),To(..))
import Data.PrimitiveArray (fromEdgeBoundaryFst, EdgeBoundary(..), (:.)(..), getBoundary)
import Data.PrimitiveArray.ScoreMatrix
import qualified Data.PrimitiveArray as PA
import Diagrams.TwoD.ProbabilityGrid
import qualified Data.Bijection.HashMap as B
import qualified ShortestPath.SHP.Edge.MinDist as SHP
import Biobase.Secondary.Diagrams (d1Distance)
import BioInf.MutationOrder.EdgeProb
import BioInf.MutationOrder.MinDist
import BioInf.MutationOrder.RNA
runMutationOrder verbose fw fs fwdScaleFunction probScaleFunction cooptCount cooptPrint lkupFile outprefix workdb temperature equalStart [ancestralFP,currentFP] = do
dfe <- doesFileExist (outprefix ++ ".run")
when dfe $ do
printf "%s.run exists, ending now!\n" outprefix
exitSuccess
withFile (outprefix ++ ".run") WriteMode $ \oH -> do
printf "%s.run job started!\n" outprefix
ancestral <- stupidReader ancestralFP
current <- stupidReader currentFP
lkup <- case lkupFile of {Nothing -> return Nothing; Just f -> Just <$> qlines f}
ls <- withDumpFile oH workdb ancestral current . fst $ createRNAlandscape lkup verbose ancestral current
let mpks = sortBy (comparing snd) . B.toList $ mutationPositions ls
let bitToNuc = M.fromList $ map (swap . first (+1)) mpks
let nn = length mpks
hPrintf oH "number of mutations: %d\n" $ mutationCount ls
hPrintf oH "\n%s\n\n" $ replicate 80 '='
let (e,bs) = runCoOptDist fwdScaleFunction ls
let (ecount,countcount) = runCount fwdScaleFunction ls
let (printbs,countbs) = splitAt cooptPrint bs
hPrintf oH "Best energy gain: %10.4f\n" e
hPrintf oH "Number of co-optimal paths: %10d\n" countcount
forM_ printbs (T.hPutStrLn oH)
hPrintf oH "%s\n\n" $ replicate 80 '='
hPrintf oH "Chain end probabilities:\n"
let fps = boundaryPartFunLast Nothing probScaleFunction ls
forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" (mp+1)
hPrintf oH "\n"
forM_ (bpNormalized fps) $ \(_, Exp p) -> hPrintf oH " %6.4f" (exp p)
hPrintf oH "\n\n%s\n\n" $ replicate 80 '='
hPrintf oH "Restricted chain end probabilities\n"
let rbps = map (\(mp,k) -> (mp,k,boundaryPartFunLast (Just k) probScaleFunction ls)) mpks
let firstlastUn = M.fromList [ ((mp+1,bitToNuc M.! getBoundary l), logp)
| (mp,k,bp) <- rbps, (l,logp) <- bpUnnormalized bp
]
let firstlastZ = Numeric.Log.sum [ bpTotal bp | (_,_,bp) <- rbps ]
let firstlastLogP = M.map (/firstlastZ) firstlastUn
let firstlastP = M.map (exp . ln) firstlastLogP
let rowMarginals = M.mapKeysWith (+) fst firstlastP
let colMarginals = M.mapKeysWith (+) snd firstlastP
hPrintf oH " "
forM_ (M.elems bitToNuc) $ \mut -> hPrintf oH "%6d " mut
hPrintf oH " Σ\n"
forM_ (M.elems bitToNuc) $ \frst -> do
hPrintf oH "%4d " frst
forM_ (M.elems bitToNuc) $ \lst -> hPrintf oH "%6.4f " (firstlastP M.! (frst,lst))
hPrintf oH " %6.4f\n" $ rowMarginals M.! frst
hPrintf oH "Σ "
forM_ (M.elems colMarginals) $ hPrintf oH "%6.4f "
hPrintf oH "\n\n"
hPrintf oH "divergence from proper normalization: %10.8f\n" (1 Prelude.sum firstlastP)
hPrintf oH "row marginal sum %10.8f\n" (Prelude.sum rowMarginals)
hPrintf oH "col marginal sum %10.8f\n" (Prelude.sum colMarginals)
hPrintf oH "\n%s\n\n" $ replicate 80 '='
let (ibs,eps) = edgeProbPartFun probScaleFunction ls
hPrintf oH "pairwise next mutation probabilities:\n\n"
hPrintf oH " "
forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" k
hPutStrLn oH ""
hPrintf oH " "
forM_ mpks $ \(mp,k) -> hPrintf oH " %6d" (mp+1)
hPutStrLn oH ""
forM_ (zip (groupBy ((==) `on` (fromEdgeBoundaryFst . fst)) eps) mpks) $ \(rps,(mp,k)) -> do
let (eb,_) = head rps
hPrintf oH "%3d %3d" k (mp+1)
forM_ rps $ \(eb,Exp p) -> hPrintf oH (" %6.4f") (exp p)
hPrintf oH " %6.4f" (Prelude.sum $ map (exp . ln . snd) rps)
hPrintf oH "\n"
let colSums = M.fromListWith (+) [ (c,p) | ((_ :-> c),p) <- eps ]
hPrintf oH " Σ "
forM_ (M.toList colSums) $ \(c,Exp p) -> hPrintf oH (" %6.4f") (exp p)
hPutStrLn oH "\n"
gridFile [SVG,EPS] (outprefix ++ "-edge") fw fs nn nn (map (show . (+1) . fst) mpks) (map (show . (+1) . fst) mpks) (map snd eps)
hPrintf oH "\n%s\n\n" $ replicate 80 '='
let eplStartWeight = if equalStart
then Prelude.map (const 1) $ M.elems rowMarginals
else Prelude.map (Exp . log) $ M.elems rowMarginals
let eprobsLast = edgeProbScoreMatrix ls eplStartWeight eps
let (Exp maxprobLast,lastLogProbs,mpbtLast') = SHP.runMaxEdgeProbLast eprobsLast
let mpbtLast = map reverse mpbtLast'
hPrintf oH "Fraction of optimal choice for each final mutation:\n"
forM_ lastLogProbs $ \(PA.Boundary b, _) -> hPrintf oH " %6d" $ bitToNuc M.! b
hPrintf oH "\n"
forM_ lastLogProbs $ \(_, p) -> hPrintf oH " %6.4f" $ exp $ ln (p / Exp maxprobLast)
hPrintf oH "\n\n"
hPrintf oH "Maximal edge log-probability sum: %6.4f (P = %10.8f) with at least %d co-optimal paths\n" maxprobLast (exp maxprobLast) (length $ take cooptCount mpbtLast)
hPutStrLn oH "(first mutation to extant species)\n"
forM_ (take cooptPrint mpbtLast) $ \bt -> do
let extractMut (SHP.BTnode (_:.To n)) = n
extractMut (SHP.BTedge (From ff:.To tt)) = tt
let mutationOrder = tail $ scanl (\set mut -> set `setBit` extractMut mut) zeroBits bt
let prettyPrint mut k = do
let rna = rnas ls HM.! mut
hPrintf oH " %3s %s\n %s MFE %6.4f\n %s CNT %6.4f\n"
(maybe "anc" (show . (+1) . fst . (!!) mpks) k)
(BS.unpack $ primarySequence rna)
(BS.unpack $ mfeStructure rna)
(mfeEnergy rna)
(BS.unpack $ centroidStructure rna)
(centroidEnergy rna)
hPutStrLn oH $ replicate 8 ' ' ++ (take (BS.length $ primarySequence rna) . concat $ zipWith (\xs x -> xs ++ show x) (repeat $ " . ") (drop 1 $ cycle [0..9]))
prettyPrint zeroBits Nothing
forM_ (zip bt mutationOrder) $ \case
(SHP.BTnode (_:.To n),mut) -> prettyPrint mut (Just n)
(SHP.BTedge (From ff:.To tt),mut) -> prettyPrint mut (Just tt)
hPutStrLn oH ""
hPutStrLn oH ""
let meaOrder =
let go = \case SHP.BTnode (_:.To n) -> n
SHP.BTedge (From ff:.To tt) -> tt
in map go $ concat $ take 1 mpbtLast
let meaAnno = map (\k -> map (show . (+1) . fst) mpks !! k) meaOrder
let meaEps = [ (ee !! k) !! l | let ee = groupBy ((==) `on` (fromEdgeBoundaryFst . fst)) eps, k <- meaOrder, l <- meaOrder ]
gridFile [SVG,EPS] (outprefix ++ "-edge-meaorder") fw fs nn nn meaAnno meaAnno (map snd meaEps)
posScaled :: Double -> Double -> ScaleFunction -> ScaleFunction
posScaled l s = scaleByFunction go where
go d | d >= l = d ** s
| otherwise = d
basepairDistanceMFE :: ScaleFunction
basepairDistanceMFE frna trna = fromIntegral $ d1Distance (mfeD1S frna) (mfeD1S trna)
basepairDistanceCentroid :: ScaleFunction
basepairDistanceCentroid frna trna = fromIntegral $ d1Distance (centroidD1S frna) (centroidD1S trna)
mfeDelta :: ScaleFunction
mfeDelta frna trna = mfeEnergy trna mfeEnergy frna
centroidDelta :: ScaleFunction
centroidDelta frna trna = centroidEnergy trna centroidEnergy frna
squaredPositive :: ScaleFunction -> ScaleFunction
squaredPositive sf = scaleByFunction sp sf where
sp d
| d > 0 = d * d
| otherwise = d
scaleTemperature :: Double -> ScaleFunction -> ScaleFunction
scaleTemperature t sf = scaleByFunction (/t) sf
scaleByFunction f sf = \frna trna ->
let d = sf frna trna
in f d
stupidReader :: FilePath -> IO ByteString
stupidReader fp = do
inp <- BS.lines <$> BS.readFile fp
let xs = filter (\x -> not (BS.null x) && BS.head x /= '>') inp
return $ BS.concat xs
withDumpFile
:: Handle
-> FilePath
-> ByteString
-> ByteString
-> Landscape
-> IO Landscape
withDumpFile oH fp ancestral current l = do
dfe <- doesFileExist fp
if dfe then do
hPrintf oH "using database %s to load sequence information\n" fp
ls <- fromFileJSON fp
unless (landscapeOrigin ls == ancestral && landscapeDestination ls == current) $ do
hPutStrLn oH "ancestral or target sequence do not match those stored in the work database"
hPutStrLn oH $ "given ancestral: " ++ BS.unpack ancestral
hPutStrLn oH $ "DB ancestral: " ++ (BS.unpack $ landscapeOrigin ls)
hPutStrLn oH $ "given current: " ++ BS.unpack current
hPutStrLn oH $ "DB current: " ++ (BS.unpack $ landscapeDestination ls)
exitFailure
return ls
else do
hPrintf oH "database %s does not exist! Folding all intermediate structures. This may take a while!\n" fp
toFileJSON fp l
return l