{-# LANGUAGE RankNTypes #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE DeriveDataTypeable #-} -- | This program compares two Infernal covariance models with each other. -- Based on the Infernal CM scoring mechanism, a Link sequence and Link score -- are calculated. The Link sequence is defined as the sequence scoring highest -- in both models simultanuously. -- -- The complete algorithm is described in: -- -- "Christian Höner zu Siederdissen, and Ivo L. Hofacker. 2010. Discriminatory -- power of RNA family models. Bioinformatics 26, no. 18: 453–59." -- -- -- -- -- -- NOTE always use coverage analysis to find out, if we really used all code -- paths (in long models, if a path is not taken, there is a bug) -- NOTE when comparing hits with cmsearch, use the following commandline: -- -- cmsearch --no-null3 --cyk --fil-no-hmm --fil-no-qdb -- -- --no-null3 : important, the test sequence is so short that null3 can easily -- generate scores that are way off! remember, we are interested in a sequence -- that is typically embedded in something large -- -- --fil-no-hmm, --fil-no-qdb: do not use heuristics for speedup, they -- sometimes hide results (in at least one case) -- -- (--toponly): if the comparison was done onesided -- -- (-g): if you want to compare globally module BioInf.CMCompare where import Control.Arrow (first,second,(***)) import Control.Lens import Control.Monad import Data.Array.IArray import Data.List (maximumBy,nub,sort) import qualified Data.Map as M import System.Console.CmdArgs import System.Environment (getArgs) import Text.Printf import Biobase.Primary import Biobase.SElab.CM import Biobase.SElab.CM.Import import Biobase.SElab.Types -- * optimization functions -- | Type of the optimization functions. type Opt a = ( CM -> StateID -> a -- E , CM -> StateID -> BitScore -> a -> a -- lbegin , CM -> StateID -> BitScore -> a -> a -- S , CM -> StateID -> BitScore -> a -> a -- D , CM -> StateID -> BitScore -> (Char,Char,BitScore) -> a -> a -- MP , CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a -- ML , CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a -- IL , CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a -- MR , CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a -- IR , CM -> StateID -> a -> a -> a -- B , [(a,a)] -> [(a,a)] -- optimization , a -> String -- finalize, make pretty for output ) -- | Calculates the cyk optimal score over both models. cykMaxiMin :: Opt BitScore cykMaxiMin = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = 0 lbegin _ _ t s = t + s start _ _ t s = t + s delete _ _ t s = t + s matchP _ _ t (_,_,e) s = t + e + s matchL _ _ t (_,e) s = t + e + s insertL _ _ t (_,e) s = t + e + s matchR _ _ t (_,e) s = t + e + s insertR _ _ t (_,e) s = t + e + s branch _ _ s t = s + t opt [] = [] opt xs = [maximumBy (\(a,b) (c,d) -> (min a b) `compare` (min c d)) xs] -- (xs `using` parList rdeepseq)] finalize s = show s -- | Return the nucleotide sequence leading to the score. uses an optional -- endmarker to denote end states. the string is the same for both models. this -- is the only Opt function, currently, for which this is true. rnaString :: Bool -> Opt [Char] rnaString endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = ['N' | endmarker] lbegin _ _ _ s = s start _ _ _ s = s delete _ _ _ s = s matchP _ _ _ (k1,k2,_) s = [k1] ++ s ++ [k2] matchL _ _ _ (k,_) s = k : s insertL _ _ _ (k,_) s = k : s matchR _ _ _ (k,_) s = s ++ [k] insertR _ _ _ (k,_) s = s ++ [k] branch _ _ s t = s ++ t opt = id finalize s = if endmarker then concatMap f s else concatMap show s f x | x=='N' = "_" | otherwise = show x -- | Dotbracket notation, again with an endmarker, to see the secondary -- structure corresponding to the rnastring. dotBracket :: Bool -> Opt String dotBracket endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end _ _ = ['_' | endmarker] lbegin _ _ _ s = s start _ _ _ s = s delete _ _ _ s = s matchP _ _ _ _ s = "(" ++ s ++ ")" matchL _ _ _ _ s = '.' : s insertL _ _ _ _ s = ',' : s matchR _ _ _ _ s = s ++ "." insertR _ _ _ _ s = s ++ "," branch _ _ s t = s ++ t opt = id finalize s = s -- | Show the nodes which were visited to get the score. the last node can -- occur multiple times. if it does, local end transitions were used. visitedNodes :: Opt [NodeID] visitedNodes = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end cm k = [((cm^.states) M.! k) ^. nodeID] lbegin cm k _ s = s start cm k _ s = ((cm^.states) M.! k) ^. nodeID : s delete cm k _ s = ((cm^.states) M.! k) ^. nodeID : s matchP cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s matchL cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s insertL cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s matchR cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s insertR cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s branch cm k s t = ((cm^.states) M.! k) ^. nodeID : (s ++ t) opt = id -- NOTE do not sort, do not nub ! finalize xs = (show $ map unNodeID xs) -- NOTE do not sort, do not nub ! -- | Detailed output of the different states, that were visited. extendedOutput :: Opt String extendedOutput = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where end cm sid = printf "E %5d %5d" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) lbegin cm sid t s = printf "lbegin %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s start cm sid t s = printf "S %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s delete cm sid t s = printf "D %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s matchP cm sid t (k1,k2,e) s = printf "MP %5d %5d %7.3f %7.3f %1s %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k1) (show k2) s matchL cm sid t (k,e) s = printf "ML %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s insertL cm sid t (k,e) s = printf "IL %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s matchR cm sid t (k,e) s = printf "MR %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s insertR cm sid t (k,e) s = printf "IR %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s branch cm sid s t = printf "B %5d %5d\n%s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid) ^. nodeID) s t opt = id finalize s = "\nLabel State Node Trans Emis\n\n" ++ s -- | Algebra product operation. (<*>) :: Eq a => Opt a -> Opt b -> Opt (a,b) algA <*> algB = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where (endA,lbeginA,startA,deleteA,matchPA,matchLA,insertLA,matchRA,insertRA,branchA,optA,finalizeA) = algA (endB,lbeginB,startB,deleteB,matchPB,matchLB,insertLB,matchRB,insertRB,branchB,optB,finalizeB) = algB end cm k = (endA cm k, endB cm k) lbegin cm k t (sA,sB) = (lbeginA cm k t sA, lbeginB cm k t sB) start cm k t (sA,sB) = (startA cm k t sA, startB cm k t sB) delete cm k t (sA,sB) = (deleteA cm k t sA, deleteB cm k t sB) matchP cm k t e (sA,sB) = (matchPA cm k t e sA, matchPB cm k t e sB) matchL cm k t e (sA,sB) = (matchLA cm k t e sA, matchLB cm k t e sB) insertL cm k t e (sA,sB) = (insertLA cm k t e sA, insertLB cm k t e sB) matchR cm k t e (sA,sB) = (matchRA cm k t e sA, matchRB cm k t e sB) insertR cm k t e (sA,sB) = (insertRA cm k t e sA, insertRB cm k t e sB) branch cm k (sA,sB) (tA,tB) = (branchA cm k sA tA, branchB cm k sB tB) opt xs = [((xl1,xl2),(xr1,xr2)) | (xl1,xr1) <- nub $ optA [(yl1,yr1) | ((yl1,yl2),(yr1,yr2)) <- xs] , (xl2,xr2) <- optB [(yl2,yr2) | ((yl1,yl2),(yr1,yr2)) <- xs, (yl1,yr1) == (xl1,xr1)] ] finalize (sA,sB) = finalizeA sA ++ "\n" ++ finalizeB sB -- * The grammar for CM comparison. -- | Recursion in two CMs simultanously. recurse :: Bool -> Opt a -> CM -> CM -> Array (StateID,StateID) [(a,a)] recurse fastIns (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) m1 m2 = locarr where loc k1 k2 | otherwise = opt $ do r <- arr ! (k1, k2) return $ (lbegin m1 k1 lb1 *** lbegin m2 k2 lb2) r where lb1 = M.findWithDefault (BitScore (-10000)) k1 (m1^.localBegin) lb2 = M.findWithDefault (BitScore (-10000)) k2 (m2^.localBegin) rec k1 k2 = let xyz = rec' k1 k2 in xyz -- traceShow ("rec",k1,((m1^.states) M.! k1) ^. stateType,k2,((m2^.states) M.! k2) ^. stateType) xyz rec' k1 k2 -- | t1 == E && t2 == E = [(end m1 k1, end m2 k2)] -- | t1 == S && t2 == S = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] r <- arr ! (c1, c2) return $ (start m1 k1 tr1 *** start m2 k2 tr2) r | t1 == D && t2 == D = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] r <- arr ! (c1, c2) return $ (delete m1 k1 tr1 *** delete m2 k2 tr2) r -- match pair emitting states | t1 == MP && t2 == MP = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] (e1,e2) <- zip (s1 ^. emits ^. pair) (s2 ^. emits ^. pair) r <- arr ! (c1, c2) return $ (matchP m1 k1 tr1 e1 *** matchP m2 k2 tr2 e2) r -- match left emitting states | t1 `elem` lstates && t2 `elem` lstates = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] guard $ (not fastIns && (c1 /= k1 || c2 /= k2)) || (fastIns && c1/=k1 && c2/=k2) (e1,e2) <- zip (s1 ^. emits ^. single) (s2 ^. emits ^. single) r <- arr ! (c1, c2) let f = if t1 == ML then matchL else insertL let g = if t2 == ML then matchL else insertL return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r -- match right emitting states | t1 `elem` rstates && t2 `elem` rstates = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] guard $ (not fastIns && (c1 /= k1 || c2 /= k2)) || (fastIns && c1/=k1 && c2/=k2) (e1,e2) <- zip (s1 ^. emits ^. single) (s2 ^. emits ^. single) r <- arr ! (c1, c2) let f = if t1 == MR then matchR else insertR let g = if t2 == MR then matchR else insertR return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r -- if one state is E, we can only delete states, except for another S state, which will go into local end -- it is not possible to use an emitting state on the right as those would require emitting on the left, too! | t1 == E && t2 `elem` [D,S] = opt $ do (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] r <- arr ! (k1,c2) return $ if t2 == D then second (delete m2 k2 tr2) r else second (start m2 k2 tr2) r -- the other way around with D,E | t1 `elem` [D,S] && t2 == E = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le2)] r <- arr ! (c1,k2) return $ if t1 == D then first (delete m1 k1 tr1) r else first (start m1 k1 tr1) r -- two branching states | t1 == B && t2 == B = opt $ let [(l1,_),(r1,_)] = s1 ^. transitions [(l2,_),(r2,_)] = s2 ^. transitions in -- both branches are matched do (s1,s2) <- arr ! (l1,l2) -- left branch (m1,m2) (t1,t2) <- arr ! (r1,r2) -- right branch (m1,m2) return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- (m1,m2) ++ do (t1,s2) <- arr ! (r1,l2) -- match right branch of m1 with left branch of m2 -- local ends for other branches x <- arr ! (ls1,ls2) let (s1,t2) = (delete m1 l1 le1 *** delete m2 l2 le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (s1,t2) <- arr ! (l1,r2) x <- arr ! (ls1,ls2) let (t1,s2) = (delete m1 l1 le1 *** delete m2 l2 le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- branch - non-branch | t1 == B && t2 /= B = opt $ let [(l,_), (r,_)] = s1 ^. transitions in do (s1,s2) <- arr ! (l,k2) -- left branch and m2 x <- arr ! (ls1,ls2) -- dont do anything for ls2, since we do not have to -- delete a branch in model 2. let (t1,t2) = first (delete m1 r le1) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (t1,t2) <- arr ! (r,k2) -- right branch and m2 x <- arr ! (ls1,ls2) let (s1,s2) = first (delete m1 l le1) x -- delete left branch in m1 return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- branch - non-branch | t1 /= B && t2 == B = opt $ let [(l,_), (r,_)] = s2 ^. transitions in do (s1,s2) <- arr ! (k1,l) x <- arr ! (ls1,ls2) let (t1,t2) = second (delete m2 r le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) ++ do (t1,t2) <- arr ! (k1,r) x <- arr ! (ls1,ls2) let (s1,s2) = second (delete m2 l le2) x return (branch m1 k1 s1 t1, branch m2 k2 s2 t2) -- S state versus any | t1 == S = opt $ do (c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)] r <- arr ! (c1, k2) return $ first (start m1 k1 tr1) r -- S state versus any | t2 == S = opt $ do (c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)] r <- arr ! (k1, c2) return $ second (start m2 k2 tr2) r -- | otherwise = [] where s1 = (m1 ^. states) M.! k1 s2 = (m2 ^. states) M.! k2 t1 = s1 ^. stateType t2 = s2 ^. stateType le1 = M.findWithDefault (BitScore (-10000)) k1 (m1^.localEnd) le2 = M.findWithDefault (BitScore (-10000)) k2 (m2^.localEnd) ls1 = sn1 ls2 = sn2 lstates = [ML,IL] rstates = [MR,IR] locarr = (array ((0,0),(sn1,sn2)) [((k1,k2),loc k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]]) arr = (array ((0,0),(sn1,sn2)) [((k1,k2),rec k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]]) `asTypeOf` locarr sn1 = fst . M.findMax $ m1 ^. states sn2 = fst . M.findMax $ m2 ^. states