{-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE DeriveDataTypeable #-} -- Based on: Discriminatory Power of RNA Family Models, Christian Hoener zu -- Siederdissen and Ivo Hofacker, 2010, accepted for eccb10: -- -- Preprint: -- -- http://www.tbi.univie.ac.at/newpapers/abstracts/abstractTBI-p-2010-5.html -- NOTE always use coverage analysis to find out, if we really used all code -- paths (in long models, if a path is not taken, there is a bug) -- NOTE when comparing hits with cmsearch, use the following commandline: -- -- cmsearch --no-null3 --cyk --fil-no-hmm --fil-no-qdb -- -- --no-null3 : important, the test sequence is so short that null3 can easily -- generate scores that are way off! remember, we are interested in a sequence -- that is typically embedded in something large -- -- --fil-no-hmm, --fil-no-qdb: do not use heuristics for speedup, they -- sometimes hide results (in at least one case) -- -- (--toponly): if the comparison was done onesided -- -- (-g): if you want to compare globally -- {{{ module descriptor module Main where import Data.Array.IArray import Text.Printf import Control.Monad import Debug.Trace import System.Environment (getArgs) import System.Console.CmdArgs import Data.List (maximumBy,nub,sort) import Control.Arrow (first,second,(***)) import Biobase.Infernal.CM import Biobase.Infernal.CM.Import import Biobase.RNA hiding (nucE) import qualified Biobase.RNA as RNA import Debug.Trace.Tools -- }}} -- * optimization functions -- {{{ type of the optimization functions type StateID = Int -- TODO should this go into BiobaseCM? type CM' = CM () () type Opt a = ( CM' -> StateID -> a -- E , CM' -> StateID -> Double -> a -> a -- lbegin , CM' -> StateID -> Double -> a -> a -- S , CM' -> StateID -> Double -> a -> a -- D , CM' -> StateID -> Double -> Emission -> a -> a -- MP , CM' -> StateID -> Double -> Emission -> a -> a -- ML , CM' -> StateID -> Double -> Emission -> a -> a -- IL , CM' -> StateID -> Double -> Emission -> a -> a -- MR , CM' -> StateID -> Double -> Emission -> a -> a -- IR , CM' -> StateID -> a -> a -> a -- B , [(a,a)] -> [(a,a)] -- optimization , a -> String -- finalize, make pretty for output ) -- }}} -- {{{ optimization functions -- | calculates the cyk optimal score over both models. cykMaxiMin :: Opt Double cykMaxiMin = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = 0 lbegin _ _ t s = t + s start _ _ t s = t + s delete _ _ t s = t + s matchP _ _ t (EmitP _ _ e) s = t + e + s matchL _ _ t (EmitS _ e) s = t + e + s insertL _ _ t (EmitS _ e) s = t + e + s matchR _ _ t (EmitS _ e) s = t + e + s insertR _ _ t (EmitS _ e) s = t + e + s branch _ _ s t = s + t opt [] = [] opt xs = [maximumBy (\(a,b) (c,d) -> (min a b) `compare` (min c d)) xs] finalize s = show s -- | return the nucleotide sequence leading to the score. uses an optional -- endmarker to denote end states. the string is the same for both models. this -- is the only Opt function, currently, for which this is true. rnaString :: Bool -> Opt [Nucleotide] rnaString endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = [RNA.nucE | endmarker] lbegin _ _ _ s = s start _ _ _ s = s delete _ _ _ s = s matchP _ _ _ (EmitP k1 k2 _) s = [k1] ++ s ++ [k2] matchL _ _ _ (EmitS k _) s = k : s insertL _ _ _ (EmitS k _) s = k : s matchR _ _ _ (EmitS k _) s = s ++ [k] insertR _ _ _ (EmitS k _) s = s ++ [k] branch _ _ s t = s ++ t opt = id finalize s = if endmarker then concatMap f s else concatMap show s f x | x==RNA.nucE = "_" | otherwise = show x -- | dotbracket notation, again with an endmarker, to see the secondary -- structure corresponding to the rnastring. dotBracket :: Bool -> Opt String dotBracket endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = ['_' | endmarker] lbegin _ _ _ s = s start _ _ _ s = s delete _ _ _ s = s matchP _ _ _ _ s = "(" ++ s ++ ")" matchL _ _ _ _ s = '.' : s insertL _ _ _ _ s = ',' : s matchR _ _ _ _ s = s ++ "." insertR _ _ _ _ s = s ++ "," branch _ _ s t = s ++ t opt = id finalize s = s -- | show the nodes which were visited to get the score. the last node can -- occur multiple times. if it does, local end transitions were used. visitedNodes :: Opt [Int] visitedNodes = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end cm k = [snode (states cm ! k)] lbegin cm k _ s = s start cm k _ s = snode (states cm ! k) : s delete cm k _ s = snode (states cm ! k) : s matchP cm k _ _ s = snode (states cm ! k) : s matchL cm k _ _ s = snode (states cm ! k) : s insertL cm k _ _ s = snode (states cm ! k) : s matchR cm k _ _ s = snode (states cm ! k) : s insertR cm k _ _ s = snode (states cm ! k) : s branch cm k s t = snode (states cm ! k) : (s ++ t) opt = id -- NOTE do not sort, do not nub ! finalize xs = "Nodes: " ++ show xs -- NOTE do not sort, do not nub ! -- | detailed output of the different states, that were visited. extendedOutput :: Opt String extendedOutput = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end cm sid = printf "E %5d %5d" sid (snode (states cm ! sid)) lbegin cm sid t s = printf "lbegin %5d %5d %7.3f \n%s" sid (snode (states cm ! sid)) t s start cm sid t s = printf "S %5d %5d %7.3f \n%s" sid (snode (states cm ! sid)) t s delete cm sid t s = printf "D %5d %5d %7.3f \n%s" sid (snode (states cm ! sid)) t s matchP cm sid t (EmitP k1 k2 e) s = printf "MP %5d %5d %7.3f %7.3f %1s %1s\n%s" sid (snode (states cm ! sid)) t e (show k1) (show k2) s matchL cm sid t (EmitS k e) s = printf "ML %5d %5d %7.3f %7.3f %1s\n%s" sid (snode (states cm ! sid)) t e (show k) s insertL cm sid t (EmitS k e) s = printf "IL %5d %5d %7.3f %7.3f %1s\n%s" sid (snode (states cm ! sid)) t e (show k) s matchR cm sid t (EmitS k e) s = printf "MR %5d %5d %7.3f %7.3f %1s\n%s" sid (snode (states cm ! sid)) t e (show k) s insertR cm sid t (EmitS k e) s = printf "IR %5d %5d %7.3f %7.3f %1s\n%s" sid (snode (states cm ! sid)) t e (show k) s branch cm sid s t = printf "B %5d %5d\n%s\n%s" sid (snode (states cm ! sid)) s t opt = id finalize s = "\nLabel State Node Trans Emis\n\n" ++ s (<*>) :: Eq a => Opt a -> Opt b -> Opt (a,b) algA <*> algB = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where (endA,lbeginA,startA,deleteA,matchPA,matchLA,insertLA,matchRA,insertRA,branchA,optA,finalizeA) = algA (endB,lbeginB,startB,deleteB,matchPB,matchLB,insertLB,matchRB,insertRB,branchB,optB,finalizeB) = algB end cm k = (endA cm k, endB cm k) lbegin cm k t (sA,sB) = (lbeginA cm k t sA, lbeginB cm k t sB) start cm k t (sA,sB) = (startA cm k t sA, startB cm k t sB) delete cm k t (sA,sB) = (deleteA cm k t sA, deleteB cm k t sB) matchP cm k t e (sA,sB) = (matchPA cm k t e sA, matchPB cm k t e sB) matchL cm k t e (sA,sB) = (matchLA cm k t e sA, matchLB cm k t e sB) insertL cm k t e (sA,sB) = (insertLA cm k t e sA, insertLB cm k t e sB) matchR cm k t e (sA,sB) = (matchRA cm k t e sA, matchRB cm k t e sB) insertR cm k t e (sA,sB) = (insertRA cm k t e sA, insertRB cm k t e sB) branch cm k (sA,sB) (tA,tB) = (branchA cm k sA tA, branchB cm k sB tB) opt xs = [((xl1,xl2),(xr1,xr2)) | (xl1,xr1) <- nub $ optA [(yl1,yr1) | ((yl1,yl2),(yr1,yr2)) <- xs] , (xl2,xr2) <- optB [(yl2,yr2) | ((yl1,yl2),(yr1,yr2)) <- xs, (yl1,yr1) == (xl1,xr1)] ] finalize (sA,sB) = finalizeA sA ++ "\n" ++ finalizeB sB -- }}} -- * recursion in two CMs simultanously -- {{{ main recursion recurse :: Opt a -> CM' -> CM' -> Array (Int,Int) [(a,a)] recurse (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) m1 m2 = locarr where loc k1 k2 | cmType m1 == CMProb || cmType m2 == CMProb = error "both models need to be score type models" | otherwise = opt $ do r <- arr ! (k1, k2) return $ (lbegin m1 k1 lb1 *** lbegin m2 k2 lb2) r where lb1 = localBegin m1 ! k1 lb2 = localBegin m2 ! k2 rec k1 k2 -- | t1 == E && t2 == E = [(end m1 k1, end m2 k2)] -- | t1 == S && t2 == S = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] r <- arr ! (c1, c2) return $ (start m1 k1 tr1 *** start m2 k2 tr2) r | t1 == D && t2 == D = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] r <- arr ! (c1, c2) return $ (delete m1 k1 tr1 *** delete m2 k2 tr2) r -- match pair emitting states | t1 == MP && t2 == MP = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] (e1,e2) <- zip (semission s1) (semission s2) r <- arr ! (c1, c2) return $ (matchP m1 k1 tr1 e1 *** matchP m2 k2 tr2 e2) r -- match left emitting states | t1 `elem` lstates && t2 `elem` lstates = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] guard $ c1 /= k1 || c2 /= k2 (e1,e2) <- zip (semission s1) (semission s2) r <- arr ! (c1, c2) let f = if t1 == ML then matchL else insertL let g = if t2 == ML then matchL else insertL return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r -- match right emitting states | t1 `elem` rstates && t2 `elem` rstates = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] guard $ c1 /= k1 || c2 /= k2 (e1,e2) <- zip (semission s1) (semission s2) r <- arr ! (c1, c2) let f = if t1 == MR then matchR else insertR let g = if t2 == MR then matchR else insertR return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r -- if one state is E, we can only delete states, except for another S state, which will go into local end -- it is not possible to use an emitting state on the right as those would require emitting on the left, too! | t1 == E && t2 `elem` [D,S] = opt $ do Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] r <- arr ! (k1,c2) return $ if t2 == D then second (delete m2 k2 tr2) r else second (start m2 k2 tr2) r -- the other way around with D,E | t1 `elem` [D,S] && t2 == E = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] r <- arr ! (c1,k2) return $ if t1 == D then first (delete m1 k1 tr1) r else first (start m1 k1 tr1) r -- two branching states | t1 == B && t2 == B = opt $ let [Branch l1, Branch r1] = schildren s1 [Branch l2, Branch r2] = schildren s2 in -- both branches are matched do (s1,s2) <- arr ! (l1,l2) -- left branch (m1,m2) (t1,t2) <- arr ! (r1,r2) -- right branch (m1,m2) return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- (m1,m2) ++ do (t1,s2) <- arr ! (r1,l2) -- match right branch of m1 with left branch of m2 -- local ends for other branches x <- arr ! (ls1,ls2) let (s1,t2) = (delete m1 l1 le1 *** delete m2 l2 le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (s1,t2) <- arr ! (l1,r2) x <- arr ! (ls1,ls2) let (t1,s2) = (delete m1 l1 le1 *** delete m2 l2 le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- branch - non-branch | t1 == B && t2 /= B = opt $ let [Branch l, Branch r] = schildren s1 in do (s1,s2) <- arr ! (l,k2) -- left branch and m2 x <- arr ! (ls1,ls2) -- dont do anything for ls2, since we do not have to -- delete a branch in model 2. let (t1,t2) = first (delete m1 r le1) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (t1,t2) <- arr ! (r,k2) -- right branch and m2 x <- arr ! (ls1,ls2) let (s1,s2) = first (delete m1 l le1) x -- delete left branch in m1 return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- branch - non-branch | t1 /= B && t2 == B = opt $ let [Branch l, Branch r] = schildren s2 in do (s1,s2) <- arr ! (k1,l) x <- arr ! (ls1,ls2) let (t1,t2) = second (delete m2 r le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (t1,t2) <- arr ! (k1,r) x <- arr ! (ls1,ls2) let (s1,s2) = second (delete m2 l le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- S state versus any | t1 == S = opt $ do Transition c1 tr1 <- schildren s1 ++ [Transition ls1 le1 | acceptLE le1] r <- arr ! (c1, k2) return $ first (start m1 k1 tr1) r -- S state versus any | t2 == S = opt $ do Transition c2 tr2 <- schildren s2 ++ [Transition ls2 le2 | acceptLE le2] r <- arr ! (k1, c2) return $ second (start m2 k2 tr2) r -- | otherwise = [] where s1 = states m1 ! k1 s2 = states m2 ! k2 t1 = stype s1 t2 = stype s2 le1 = localEnd m1 ! k1 le2 = localEnd m2 ! k2 ls1 = snd . bounds $ states m1 -- last state (E) ls2 = snd . bounds $ states m2 lstates = [ML,IL] rstates = [MR,IR] acceptLE x | cmType m1 == CMScore && x > (-1/0) = True | cmType m1 == CMProb && x /= 0 = True | otherwise = False locarr = (array ((0,0),(sn1,sn2)) [((k1,k2),loc k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]]) `asTypeOf` arr arr = (array ((0,0),(sn1,sn2)) [((k1,k2),rec k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]]) `asTypeOf` locarr (_,sn1) = bounds $ states m1 (_,sn2) = bounds $ states m2 -- }}} -- {{{ main -- TODO add an option to filter by minimal score (default: -1000000) if the -- filter is on, we get a result only, if the score is above the threshold data Options = Options { global :: Bool , invert :: Bool , pbegin :: Double , pend :: Double , endmarker :: Bool , nobeginilir :: Bool , models :: [String] } deriving (Show,Data,Typeable) options = Options { global = False &= help "global model comparison" , invert = False &= help "TODO invert second model (read right to left)" , pbegin = 0.05 &= help "aggregate local begin probability" , pend = 0.05 &= help "aggregate local end probability" , endmarker = False &= help "add an endmarker into the rnastring to denote local ends" , nobeginilir = False &= help "trailing left or right nucleotides change the score" , models = def &= args -- &= help "path to exactly two covariance models" } &= summary "CMCompare: Discriminatory Power of RNA Family Models" &= help "(c) 2010, Christian Hoener zu Siederdissen and Ivo Hofacker\nchoener@tbi.univie.ac.at\nLicensed under the GPLv3\n" &= verbosity -- TODO put fixTransition in BiobaseInfernal, rename and whatnot -- | what is this? this sets the transition score in the root to 0 for the -- transition into IL and IR. For CM comparison, this makes sense. Consider two -- cms .(.) and (.); given "acag" both should score about the same, but the -- second can only, if IL eats a nucleotide. for single CM search, this is -- taken care of by the DP algorithm which doesn't work here. fixTransition :: CM' -> CM' fixTransition x = x{states = ss // [(0,rtnew)]} where ss = states x rt = ss ! 0 tr = schildren rt trnew = [Transition 1 0, Transition 2 0] ++ drop 2 tr rtnew = rt{schildren = trnew} applyIf c f = if c then f else id answers optf m1 m2 = map (finalize *** finalize) . opt . concat . elems $ recurse optf m1 m2 where (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) = optf results optf m1 m2 = opt . concat . elems $ recurse optf m1 m2 where (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) = optf main = do Options{..} <- cmdArgs options normal <- isNormal loud <- isLoud let quiet = not normal unless (length models == 2) $ do fail "give exactly two CMs" let [a,b] = models Right [m1'] <- fromFile a Right [m2'] <- fromFile b let m1 = applyIf (not nobeginilir) fixTransition $ applyIf (not global) (cmMakeLocal pbegin pend) m1' let m2 = applyIf (not nobeginilir) fixTransition $ applyIf (not global) (cmMakeLocal pbegin pend) m2' let pr = (\(x,y) -> putStrLn x >> putStrLn "" >> putStrLn y >> putStrLn "++++++++++++") when (quiet && not normal) $ do let (a1,a2) = head $ results cykMaxiMin m1 m2 printf "%s %s %10.3f %10.3f\n" a b a1 a2 when (normal && not loud) $ do let ((((cyk1,vn1),rna1),db1),(((cyk2,vn2),_),db2)) = head $ results (cykMaxiMin <*> visitedNodes <*> rnaString endmarker <*> dotBracket endmarker) m1 m2 let rnaBoth = concatMap f rna1 where f x | x==RNA.nucE = "_" | otherwise = show x printf "%s %s %10.3f %10.3f %s %s %s %s %s\n" a b cyk1 cyk2 rnaBoth db1 db2 (show vn1) (show vn2) when loud . mapM_ pr $ answers (cykMaxiMin <*> visitedNodes <*> rnaString endmarker <*> dotBracket endmarker <*> extendedOutput) m1 m2 -- }}} -- * Helper functions -- {{{ -- | summation in logspace -- TODO time for a new library ;) logSum x y = h + log (1 + exp (l-h)) where h = max x y l = min x y -- }}}