\documentclass[a4paper]{article}
\usepackage{haskell}
\pagestyle{myheadings}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Splice Graph Construction}
This module builds splice graphs from {\tt WordMap}s. Each word
$w_1w_2w_3...w_n$ in the map represents an edge from $w_1w_2..w_{n1}$
to $w_2w_3..w_n$. It also generates the greedy paths representing
consensus sequences.
\begin{code}
module SpliceGraph (Path, greedy_paths, path_sequence, show_graph) where
import WordMap (WordMap,Key,wmlookup,get_k,genkeysN,
n2i,i2n,i2s,shift_left,shift_right,genkeys)
import qualified Data.Map as Map hiding (filter, map)
import Data.List (nub, sort, elem)
import Data.Maybe(fromJust,maybeToList,listToMaybe)
import Data.Set hiding (filter,map,null)
import EST(EST,label)
import Gene
import Indexed
import Suffix(Suffix,getEST,getPos)
import Util (foldl', sortOn)
type Branch = Key
type Path = [Key]
mlookup :: WordMap -> Key -> [Suffix]
mlookup = wmlookup
_log :: t -> ()
_log _ = ()
\end{code}
\subsection{Greedy (and less greedy) paths}
We find the remaining greediest path by starting with the most common
edge, and traversing in each direction. This is iterated, until no
edge heavy enough remains.
\begin{code}
greedy_paths :: Int
-> WordMap
-> [(String, Int, Int, Path)]
greedy_paths k wm = greedy_paths' k wm empty
greedy_paths' :: Int -> WordMap -> Set Key -> [(String,Int,Int,Path)]
greedy_paths' minw wm exc =
let (w0,weight) = heaviest_except exc wm
path = greedy_path wm w0
exc' = Data.Set.union exc $ fromList path
in if weight >= minw && weight > 0
then (i2s w0,weight,score_path wm path,path)
: greedy_paths' minw wm exc'
else []
score_path :: WordMap -> Path -> Int
score_path wm = foldr (+) 0 . map (fst . count_unique . fj . (flip Map.lookup) wm)
where fj (Just xs) = xs
fj Nothing = []
\end{code}
Finding the heaviest edge in the word map {\em not} already in a path.
The ''used'' edges are stored in a set of keys.
\begin{code}
heaviest_except :: Set Key -> WordMap -> (Key,Int)
heaviest_except exceptions = h' (error "h_e undef",0) . (Map.assocs)
where h' (current,weight) ((x,ss):rest) =
if (not $ x `member` exceptions) && count_u ss > weight
then h' (x,count_u ss) rest
else h' (current,weight) rest
h' (current,weight) [] = (current,weight)
count_u = (\(x,y)->x `div` y) . count_unique
count_unique :: [Suffix] -> (Int,Int)
count_unique ss = let count = foldl' uq Map.empty ss
in (Map.size count, maxFM count)
where
uq counts s = case (flip Map.lookup) counts e of
Nothing -> Map.insert e 1 counts
Just x -> Map.insert e (x+1) counts
where e = getEST s
maxFM = maximum . map snd . (Map.assocs)
\end{code}
\newpage
\subsection{Calculating a greedy path}
Calculating the greediest path in a WordMap, given a Key as a
starting point.
\begin{code}
mkseqpos :: Suffix -> (EST, Int)
mkseqpos s = (getEST s, getPos s)
greedy_path :: WordMap -> Key -> Path
greedy_path wm w0 =
let sm0 = Map.fromList $ map mkseqpos $ mlookup wm w0
wm0 = Map.mapWithKey (\_ _ -> get_k wm) sm0
(lks,rks) = mkpath wm (sm0,wm0,sm0) ([w0],[w0])
path = lks ++ drop 1 (reverse rks)
in
path
\end{code}
\subsection{The pathfinding algorithm}
We define the SeqMap to keep track of used positions in sequences, to
aid us in chosing branches in the graph. Among other things, this
helps us to avoid looping. (This can easily happen if you have two
sequences "v...w" and "w...v").
\begin{code}
type SeqMap = (Map.Map EST Int,Map.Map EST Int,Map.Map EST Int)
left :: (t, t1, t2) -> t
left (l,_,_) = l
right :: (t, t1, t2) -> t2
right (_,_,r) = r
weight :: (t, t1, t2) -> t1
weight (_,w,_) = w
data Direction = R | L deriving Eq
data Alt = Alt { alt_key :: Key, score :: Int,
support, follow_on, all_compat :: [Suffix] }
deriving (Eq,Show)
alt :: Alt
alt = Alt (error "Alt with undef. alt_key") (error "Alt with undef. score")
(error "Alt with undef. support") (error "Alt with undef. follow_on")
(error "Alt with undef. all_compat")
\end{code}
The actual algorithm for finding a path. Determines all alternatives,
and chooses the best one. When no more alternatives can be found
(i.e. all sequences end or are N), "extend" is called to try a bit
harder, or to finish the job.
This can probably be optimized quite a bit (50\%) by retaining unused
Alt's instead of recalculating them each iteration.
\begin{code}
mkpath :: WordMap -> SeqMap -> (Path,Path) -> (Path,Path)
mkpath wm sm (left,right) = let
k = get_k wm
lws = filter nonE $
map (mkAlt wm sm L . shift_right k (head left)) [A,C,G,T]
rws = filter nonE $
map (mkAlt wm sm R . shift_left k (head right)) [A,C,G,T]
nonE = not . null . all_compat
continue (Left k) =
mkpath wm (updateSM sm (Left k)) (alt_key k:left,right)
continue (Right k) =
mkpath wm (updateSM sm (Right k)) (left,alt_key k:right)
in _log (lws `seq` rws `seq` "ALTS:\nleft: "++
show (map (i2s . alt_key) lws) ++ "\nright: "
++ show (map (i2s . alt_key) rws)) `seq`
case (lws,rws) of
([],[]) -> extend wm sm (left,right)
([x],[]) -> continue (Left x)
([],[x]) -> continue (Right x)
(_,_) -> continue (bestbranch (lws,rws))
bestbranch :: ([Alt],[Alt]) -> Either Alt Alt
bestbranch (l,[]) = Left (best l)
bestbranch ([],r) = Right (best r)
bestbranch (l,r) = let (bl,br) = (best l, best r)
in _log (bl `seq` br `seq` "bb: = "++show (bl,br)) `seq`
if best [bl,br] == bl then Left bl else Right br
best :: [Alt] -> Alt
best = head . sortOn (negate . score)
\end{code}
Fixme: if the best branch has $>$ minw sequences in follow\_on, take it
else take highest follow\_on supported by compats (exchg rate?)
\subsection{Making Alts}
Constructing the Alt data structure. This contains the necessary
information for determining branch choices.
\begin{code}
mkAlt :: WordMap -> SeqMap -> Direction -> Key -> Alt
mkAlt wm sm d k =
calc_score sm with_follow
where
lookup = concat . maybeToList . (flip Map.lookup) wm
with_support = alt { alt_key = k, support = lookup k }
with_compat = with_support { all_compat = filter (isCompat sm d)
(support with_support) }
with_follow = with_compat { follow_on = filter follows
(support with_compat) }
follows s = let (e,p) = mkseqpos s
l = (flip Map.lookup) (left sm) e
r = (flip Map.lookup) (right sm) e
in case d of
L -> l /= Nothing && fromJust l == p+1
R -> r /= Nothing && fromJust r == p1
isCompat :: SeqMap -> Direction -> Suffix -> Bool
isCompat sm d s = let (e,p) = mkseqpos s
l = (flip Map.lookup) (left sm) e
r = (flip Map.lookup) (right sm) e
in case d of
L -> l == Nothing || fromJust l > p
R -> r == Nothing || fromJust r < p
calc_score :: SeqMap -> Alt -> Alt
calc_score (_l,w,_r) alt = alt { score = sum $ getweights $ follow_on alt}
where fj Nothing = 0
fj (Just w) = w
getweights =
map (\x -> (max ((fj . (flip Map.lookup) w . getEST) x) 42))
\end{code}
\subsection{Extending paths}
When no more matches can be found, see if we can extend it
with the remains of prematurely terminated sequences.
Todo: should probably use make Alts to determine the best branch...
\begin{code}
extend :: WordMap -> SeqMap -> ([Key],[Key]) -> ([Key],[Key])
extend wm sm (l:ls,r:rs) = _log ("extend: "++i2s l++","++i2s r) `seq`
let ssl = filter (isEdge sm L) $ filter (not.isStart) $ mlookup wm l
ssr = filter (isEdge sm R) $ filter (not.isEnd) $ mlookup wm r
next_l = longest $ map words_left ssl
next_r = longest $ map words_right ssr
in _log (show ((ssl,ssr),(sh next_l,sh next_r))) `seq`
case (next_l,next_r) of
(Nothing,Nothing) -> _log (show $ (Map.assocs) $ weight sm) `seq`
(l:ls,r:rs)
(Nothing,Just (e,x)) -> mkpath wm
(updateSM sm (Right $ mkAlt wm sm R $ snd $ head x))
(l:ls, reverse (mkNs wm R e ssr r (snd $ head x)) ++ rs)
(Just (e,x),_) -> mkpath wm
(updateSM sm (Left $ mkAlt wm sm L $ snd $ last x))
(mkNs wm L e ssl (snd $ last x) l ++ ls, r:rs)
where
k = get_k wm
longest :: [(EST,[(Int,Key)])] -> Maybe (EST,[(Int,Key)])
longest = listToMaybe . sortOn (negate.length.snd) . filter (not.null.snd)
words_right, words_left :: Suffix -> (EST,[(Int,Key)])
words_right s = (getEST s, genkeys k (1+getPos s) (getEST s))
words_left s = (getEST s, takeWhile ((getPos s >).fst) $
genkeys k 0 (getEST s))
isStart = (k >) . getPos
isEnd = (2*k >) . len
sh Nothing = []
sh (Just (e,xs)) = map (\(p,key)->(label e,p,i2s key)) xs
extend _ _ _ = error "Cannot extend an empty path"
isEdge :: SeqMap -> Direction -> Suffix -> Bool
isEdge sm L s = (flip Map.lookup) (left sm) (getEST s) == Just (getPos s)
isEdge sm R s = (flip Map.lookup) (right sm) (getEST s) == Just (getPos s)
\end{code}
Make a set of intermediate keys from left to right key, taken from the
sequence chosen to "extend" the path.
(This is more than a bit ugly!)
\begin{code}
mkNs :: WordMap -> Direction -> EST -> [Suffix] -> Key -> Key -> [Key]
mkNs wm d est ss lk rk = let
k = length $ i2n lk
thisESTpos = getPos . head . filter ((==est).getEST)
in _log ("mkNs: "++show (thisESTpos (mlookup wm lk),thisESTpos ss,thisESTpos (mlookup wm rk))) `seq`
if d == L
then map snd $ takeWhile ((thisESTpos ss>=).fst)
$ genkeysN k (thisESTpos (mlookup wm lk)) est
else map snd $ takeWhile ((thisESTpos (mlookup wm rk)>=).fst)
$ genkeysN k (thisESTpos ss) est
\end{code}
Update the sequence map to incorporate the chosen alternative.
Using intermediate lists $\rightarrow$ opportunity for optimization.
For all sequences already in the SM, update left bounds.
For all sequences *not* in the SM, add it to both left and right.
\begin{code}
updateSM :: SeqMap -> Either Alt Alt -> SeqMap
updateSM sm (Left alt) = foldl' add1 sm ps
where
ps = map (\s -> (getEST s, getPos s)) (all_compat alt)
add1 (l,w,r) (e,p) = case (flip Map.lookup) l e of
Nothing -> (Map.insert e p l, Map.insert e k w, Map.insert e p r)
Just pos -> (Map.insert e p l, Map.insertWith (+) e (min (posp) k) w, r)
where k = length $ i2n $ alt_key alt
updateSM sm (Right alt) = foldl' add1 sm ps
where
ps = map (\s -> (getEST s, getPos s)) (all_compat alt)
add1 (l,w,r) (e,p) = case (flip Map.lookup) r e of
Nothing -> (Map.insert e p l, Map.insert e k w, Map.insert e p r)
Just pos -> (l, Map.insertWith (+) e (min (ppos) k) w, Map.insert e p r)
where k = length $ i2n $ alt_key alt
path_sequence :: Path -> [Gene]
path_sequence (x:xs) =
i2n x ++ map (last . i2n) xs
path_sequence [] = error "No path with no keys"
\end{code}
\newpage
\section{Graph Output}
Write output suitable for GraphViz to produce nice pictures from.
We need to collapse nodes with indegree and outdegree of one, in order
to avoid excessive size of the graph (although GraphViz handles them
fine).
\begin{code}
show_graph :: Int -> WordMap -> [[Branch]] -> String
show_graph minw wm bs = comments ++ header ++
(unlines $ print_nodes k nodepairs) ++
(unlines $ map (print_edge minw bs wm) nodepairs)
++ footer
where header = "digraph splicegraph {\n"++
"rankdir = LR;\n" ++
"orientation = land;\n"
footer = "}\n"
comments = "// Produced by xtract\n"
nodepairs = optimize $ node2str $ map edge2nodes $ edges wm
k = get_k wm
edge2nodes :: [Gene] -> ([Gene],[Gene])
edge2nodes gs = (take (length gs1) gs, tail gs)
edges :: WordMap -> [[Gene]]
edges wm = map (i2n . fst) $ (Map.assocs) wm
node2str :: (Show a, Show a1) => [([a], [a1])] -> [(String, String)]
node2str = map (\(x,y)->(concatMap show x,concatMap show y))
print_nodes :: Int -> [(String,String)] -> [String]
print_nodes k ns = map shownode $ unique ns
where shownode x = x ++ " [ label=\""
++ (case indegree ns x of
0 -> x
_ -> drop (k2) x)
++ "\" shape=\"box\" ]"
unique = nub . sort . concatUnzip
concatUnzip ((a,b):abs) = a:b: concatUnzip abs
concatUnzip [] = []
print_edge :: Int -> [[Branch]] -> WordMap -> (String,String) -> String
print_edge minw bs wm (n1,n2) = n1 ++ " -> " ++ n2 ++ " [ "
++ (let x = length $ mlookup wm key in
if x == 1 then " style=\"dotted\""
else if x > minw then " style=\"bold\"" else "")
++ " color=\""
++ (if key `elem` (bs!!0) then "red\""
else "black\"")
++ "]; // "
where k = get_k wm
key = n2i $ map (read.(:[]))
(reverse ((head $ drop (k2) n2) : take (k1) (reverse n1)))
type EdgeMap a = Map.Map [a] [[a]]
type Edge a = ([a],[a])
optimize :: (Show a, Eq a, Ord a) => [Edge a] -> [Edge a]
optimize es = opt k (fwd_edges, rev_edges)
where
k = length $ fst $ head es
fwd_edges = (foldr . uncurry . Map.insertWith) (flip (++)) Map.empty $ map ( \(x,y) -> (x,[y])) es
rev_edges = (foldr . uncurry . Map.insertWith) (flip (++)) Map.empty $ map ( \(x,y) -> (y,[x])) es
opt :: (Show a, Ord a) => Int -> (EdgeMap a,EdgeMap a) -> [Edge a]
opt l edges =
let sgls = [(n0,n1) | (n0,[n1]) <- (Map.assocs) $ fst edges
, (flip Map.lookup) (snd edges) n1 == Just [n0]]
in case sgls of
((e0,e1):_) ->
let new = e0 ++ drop (l1) e1
f = (flip Map.delete) (fst edges) e0
r = (flip Map.delete) (snd edges) e1
i' = (flip Map.lookup) r e0
incoming = case i' of {Just x -> x; Nothing -> []}
o' = (flip Map.lookup) f e1
outgoing = case o' of {Just x -> x; Nothing -> []}
f' = replace f e0 new incoming
f'' = (flip Map.delete) f' e1
f''' = if outgoing == [] then (flip Map.delete) f'' new
else Map.insert new outgoing f''
r' = replace r e1 new outgoing
r'' = (flip Map.delete) r' e0
r''' = if incoming==[] then (flip Map.delete) r'' new
else Map.insert new incoming r''
in
opt l (f''',r''')
[] ->
e2l edges
replace :: (Show a,Ord a) => EdgeMap a -> [a] -> [a] -> [[a]] -> EdgeMap a
replace fm _ _ [] = fm
replace fm old new (l:ls) =
let ts = (flip Map.lookup) fm l
rep1 old new x = if x == old then new else x
fm' = Map.insert l (map (rep1 old new)
$ fromJust ts) fm
in
if ts == Nothing
then replace fm old new ls
else replace fm' old new ls
e2l :: (EdgeMap a,EdgeMap a) -> [Edge a]
e2l fms = let es = (Map.assocs) $ fst fms
in concatMap (\(s,ts) -> map ((,) s) ts) es
indegree, outdegree :: (Eq a) => [Edge a] -> [a] -> Int
indegree ns n = length $ filter ((==n).snd) ns
outdegree ns n = length $ filter ((==n).fst) ns
indeg, outdeg :: (Ord a) => (EdgeMap a, EdgeMap a)
-> [a] -> Int
indeg ns n = case (flip Map.lookup) (snd ns) n of Nothing -> 0
Just x -> length x
outdeg ns n = case (flip Map.lookup) (fst ns) n of Nothing -> 0
Just x -> length x
precs :: (Eq b) => [(a, b)] -> b -> [a]
precs ns n = map fst $ filter ((==n).snd) ns
otest1 :: [(String, String)]
otest1 = [("abc","bcd"), ("bcd","cde"), ("cde", "deg"), ("cde","def") ]
ofact1 :: [(String, String)]
ofact1 = [("abcde","deg"), ("abcde", "def")]
\end{code}
\end{document}