module BioInf.CMCompare where
import Control.Arrow (first,second,(***))
import Control.Lens
import Control.Monad
import Data.Array.IArray
import Data.List (maximumBy,nub,sort)
import qualified Data.Map as M
import System.Console.CmdArgs
import System.Environment (getArgs)
import Text.Printf
import Biobase.Primary
import Biobase.SElab.CM
import Biobase.SElab.CM.Import
import Biobase.SElab.Types
type Opt a =
( CM -> StateID -> a
, CM -> StateID -> BitScore -> a -> a
, CM -> StateID -> BitScore -> a -> a
, CM -> StateID -> BitScore -> a -> a
, CM -> StateID -> BitScore -> (Char,Char,BitScore) -> a -> a
, CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a
, CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a
, CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a
, CM -> StateID -> BitScore -> (Char,BitScore) -> a -> a
, CM -> StateID -> a -> a -> a
, [(a,a)] -> [(a,a)]
, a -> String
)
cykMaxiMin :: Opt BitScore
cykMaxiMin = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
end _ _ = 0
lbegin _ _ t s = t + s
start _ _ t s = t + s
delete _ _ t s = t + s
matchP _ _ t (_,_,e) s = t + e + s
matchL _ _ t (_,e) s = t + e + s
insertL _ _ t (_,e) s = t + e + s
matchR _ _ t (_,e) s = t + e + s
insertR _ _ t (_,e) s = t + e + s
branch _ _ s t = s + t
opt [] = []
opt xs = [maximumBy (\(a,b) (c,d) -> (min a b) `compare` (min c d)) xs]
finalize s = show s
rnaString :: Bool -> Opt [Char]
rnaString endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
end _ _ = ['N' | endmarker]
lbegin _ _ _ s = s
start _ _ _ s = s
delete _ _ _ s = s
matchP _ _ _ (k1,k2,_) s = [k1] ++ s ++ [k2]
matchL _ _ _ (k,_) s = k : s
insertL _ _ _ (k,_) s = k : s
matchR _ _ _ (k,_) s = s ++ [k]
insertR _ _ _ (k,_) s = s ++ [k]
branch _ _ s t = s ++ t
opt = id
finalize s = if endmarker
then concatMap f s
else concatMap show s
f x
| x=='N' = "_"
| otherwise = show x
dotBracket :: Bool -> Opt String
dotBracket endmarker = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
end _ _ = ['_' | endmarker]
lbegin _ _ _ s = s
start _ _ _ s = s
delete _ _ _ s = s
matchP _ _ _ _ s = "(" ++ s ++ ")"
matchL _ _ _ _ s = '.' : s
insertL _ _ _ _ s = ',' : s
matchR _ _ _ _ s = s ++ "."
insertR _ _ _ _ s = s ++ ","
branch _ _ s t = s ++ t
opt = id
finalize s = s
visitedNodes :: Opt [NodeID]
visitedNodes = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
end cm k = [((cm^.states) M.! k) ^. nodeID]
lbegin cm k _ s = s
start cm k _ s = ((cm^.states) M.! k) ^. nodeID : s
delete cm k _ s = ((cm^.states) M.! k) ^. nodeID : s
matchP cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s
matchL cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s
insertL cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s
matchR cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s
insertR cm k _ _ s = ((cm^.states) M.! k) ^. nodeID : s
branch cm k s t = ((cm^.states) M.! k) ^. nodeID : (s ++ t)
opt = id
finalize xs = (show $ map unNodeID xs)
extendedOutput :: Opt String
extendedOutput = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
end cm sid = printf "E %5d %5d" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID)
lbegin cm sid t s = printf "lbegin %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s
start cm sid t s = printf "S %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s
delete cm sid t s = printf "D %5d %5d %7.3f \n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) s
matchP cm sid t (k1,k2,e) s = printf "MP %5d %5d %7.3f %7.3f %1s %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k1) (show k2) s
matchL cm sid t (k,e) s = printf "ML %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s
insertL cm sid t (k,e) s = printf "IL %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s
matchR cm sid t (k,e) s = printf "MR %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s
insertR cm sid t (k,e) s = printf "IR %5d %5d %7.3f %7.3f %1s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid)^.nodeID) (unBitScore t) (unBitScore e) (show k) s
branch cm sid s t = printf "B %5d %5d\n%s\n%s" (unStateID sid) (unNodeID $ ((cm^.states) M.! sid) ^. nodeID) s t
opt = id
finalize s = "\nLabel State Node Trans Emis\n\n" ++ s
(<*>) :: Eq a => Opt a -> Opt b -> Opt (a,b)
algA <*> algB = (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) where
(endA,lbeginA,startA,deleteA,matchPA,matchLA,insertLA,matchRA,insertRA,branchA,optA,finalizeA) = algA
(endB,lbeginB,startB,deleteB,matchPB,matchLB,insertLB,matchRB,insertRB,branchB,optB,finalizeB) = algB
end cm k = (endA cm k, endB cm k)
lbegin cm k t (sA,sB) = (lbeginA cm k t sA, lbeginB cm k t sB)
start cm k t (sA,sB) = (startA cm k t sA, startB cm k t sB)
delete cm k t (sA,sB) = (deleteA cm k t sA, deleteB cm k t sB)
matchP cm k t e (sA,sB) = (matchPA cm k t e sA, matchPB cm k t e sB)
matchL cm k t e (sA,sB) = (matchLA cm k t e sA, matchLB cm k t e sB)
insertL cm k t e (sA,sB) = (insertLA cm k t e sA, insertLB cm k t e sB)
matchR cm k t e (sA,sB) = (matchRA cm k t e sA, matchRB cm k t e sB)
insertR cm k t e (sA,sB) = (insertRA cm k t e sA, insertRB cm k t e sB)
branch cm k (sA,sB) (tA,tB) = (branchA cm k sA tA, branchB cm k sB tB)
opt xs = [((xl1,xl2),(xr1,xr2)) | (xl1,xr1) <- nub $ optA [(yl1,yr1) | ((yl1,yl2),(yr1,yr2)) <- xs]
, (xl2,xr2) <- optB [(yl2,yr2) | ((yl1,yl2),(yr1,yr2)) <- xs, (yl1,yr1) == (xl1,xr1)]
]
finalize (sA,sB) = finalizeA sA ++ "\n" ++ finalizeB sB
recurse :: Bool -> Opt a -> CM -> CM -> Array (StateID,StateID) [(a,a)]
recurse fastIns (end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize) m1 m2 = locarr where
loc k1 k2
| otherwise = opt $ do
r <- arr ! (k1, k2)
return $ (lbegin m1 k1 lb1 *** lbegin m2 k2 lb2) r
where
lb1 = M.findWithDefault (BitScore (10000)) k1 (m1^.localBegin)
lb2 = M.findWithDefault (BitScore (10000)) k2 (m2^.localBegin)
rec k1 k2 = let xyz = rec' k1 k2
in xyz
rec' k1 k2
| t1 == E && t2 == E = [(end m1 k1, end m2 k2)]
| t1 == S && t2 == S = opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
r <- arr ! (c1, c2)
return $ (start m1 k1 tr1 *** start m2 k2 tr2) r
| t1 == D && t2 == D = opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
r <- arr ! (c1, c2)
return $ (delete m1 k1 tr1 *** delete m2 k2 tr2) r
| t1 == MP && t2 == MP
= opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
(e1,e2) <- zip (s1 ^. emits ^. pair) (s2 ^. emits ^. pair)
r <- arr ! (c1, c2)
return $ (matchP m1 k1 tr1 e1 *** matchP m2 k2 tr2 e2) r
| t1 `elem` lstates && t2 `elem` lstates
= opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
guard $ (not fastIns && (c1 /= k1 || c2 /= k2)) || (fastIns && c1/=k1 && c2/=k2)
(e1,e2) <- zip (s1 ^. emits ^. single) (s2 ^. emits ^. single)
r <- arr ! (c1, c2)
let f = if t1 == ML then matchL else insertL
let g = if t2 == ML then matchL else insertL
return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r
| t1 `elem` rstates && t2 `elem` rstates
= opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
guard $ (not fastIns && (c1 /= k1 || c2 /= k2)) || (fastIns && c1/=k1 && c2/=k2)
(e1,e2) <- zip (s1 ^. emits ^. single) (s2 ^. emits ^. single)
r <- arr ! (c1, c2)
let f = if t1 == MR then matchR else insertR
let g = if t2 == MR then matchR else insertR
return $ (f m1 k1 tr1 e1 *** g m2 k2 tr2 e2) r
| t1 == E && t2 `elem` [D,S] = opt $ do
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
r <- arr ! (k1,c2)
return $ if t2 == D then second (delete m2 k2 tr2) r else second (start m2 k2 tr2) r
| t1 `elem` [D,S] && t2 == E = opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le2)]
r <- arr ! (c1,k2)
return $ if t1 == D then first (delete m1 k1 tr1) r else first (start m1 k1 tr1) r
| t1 == B && t2 == B = opt $
let
[(l1,_),(r1,_)] = s1 ^. transitions
[(l2,_),(r2,_)] = s2 ^. transitions
in
do
(s1,s2) <- arr ! (l1,l2)
(t1,t2) <- arr ! (r1,r2)
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
++
do
(t1,s2) <- arr ! (r1,l2)
x <- arr ! (ls1,ls2)
let (s1,t2) = (delete m1 l1 le1 *** delete m2 l2 le2) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
++
do
(s1,t2) <- arr ! (l1,r2)
x <- arr ! (ls1,ls2)
let (t1,s2) = (delete m1 l1 le1 *** delete m2 l2 le2) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
| t1 == B && t2 /= B = opt $
let
[(l,_), (r,_)] = s1 ^. transitions
in
do
(s1,s2) <- arr ! (l,k2)
x <- arr ! (ls1,ls2)
let (t1,t2) = first (delete m1 r le1) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
++
do
(t1,t2) <- arr ! (r,k2)
x <- arr ! (ls1,ls2)
let (s1,s2) = first (delete m1 l le1) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
| t1 /= B && t2 == B = opt $
let
[(l,_), (r,_)] = s2 ^. transitions
in
do
(s1,s2) <- arr ! (k1,l)
x <- arr ! (ls1,ls2)
let (t1,t2) = second (delete m2 r le2) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
++
do
(t1,t2) <- arr ! (k1,r)
x <- arr ! (ls1,ls2)
let (s1,s2) = second (delete m2 l le2) x
return (branch m1 k1 s1 t1, branch m2 k2 s2 t2)
| t1 == S = opt $ do
(c1,tr1) <- s1 ^. transitions ++ [(ls1,le1)]
r <- arr ! (c1, k2)
return $ first (start m1 k1 tr1) r
| t2 == S = opt $ do
(c2,tr2) <- s2 ^. transitions ++ [(ls2,le2)]
r <- arr ! (k1, c2)
return $ second (start m2 k2 tr2) r
| otherwise = []
where
s1 = (m1 ^. states) M.! k1
s2 = (m2 ^. states) M.! k2
t1 = s1 ^. stateType
t2 = s2 ^. stateType
le1 = M.findWithDefault (BitScore (10000)) k1 (m1^.localEnd)
le2 = M.findWithDefault (BitScore (10000)) k2 (m2^.localEnd)
ls1 = sn1
ls2 = sn2
lstates = [ML,IL]
rstates = [MR,IR]
locarr = (array ((0,0),(sn1,sn2)) [((k1,k2),loc k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]])
arr = (array ((0,0),(sn1,sn2)) [((k1,k2),rec k1 k2) | k1 <- [0 .. sn1], k2 <- [0 .. sn2]]) `asTypeOf` locarr
sn1 = fst . M.findMax $ m1 ^. states
sn2 = fst . M.findMax $ m2 ^. states