{- | Annotate sequences with GO terms, via BLASTX hits to UniProt -} module Main where import Data.ByteString.Lazy.Char8 (ByteString,unpack,pack) import qualified Data.ByteString.Lazy.Char8 as B import Data.Map (Map) import qualified Data.Map as M import Data.Set (Set) import qualified Data.Set as S import Data.List (intersperse,sortBy,partition,group,nub) import Control.Monad import Data.Maybe import System.IO import System.Directory (doesDirectoryExist, createDirectory) import Data.Array.Unboxed import Data.Word import Bio.Sequence.GOA import qualified Bio.Alignment.BlastData as BD import Bio.Alignment.BlastData (BlastResult) import Bio.Alignment.BlastFlat import Bio.Alignment.BlastXML import Options import Html main :: IO () main = getOptions >>= annotate -- >>= tabulate -- a sequence name and a list of annotation and bit? scores type AnnotatedSequence = (ByteString, [(Annotation,Int)]) {- | Read files and return list of annotated sequences 1. build a map of GO terms to description (25K lines - tiny) 2. collect all proteins names from the XML file (1 pass over .xml) 3. build a map of proteins to GO terms (1 pass over gene_assoc..), and add GO term desc 4. revisit .xml, output: sequence, hits?, go terms/summary,...whatever -} annotate :: Options -> IO () -- [AnnotatedSequence] annotate opts = do -- (Opts (Just go) (Just goa),[xml],[]) let readXMLs fs = return . concat =<< mapM readXML fs when (null (inputs opts)) $ fail "No input files specified!" -- goterms are not necessary when we don't also use goanns (goterms,goanns) <- case goann opts of Nothing -> return (M.empty,M.empty) Just goa -> do ts <- case godef opts of Nothing -> return M.empty Just godb -> do t <- return . goTerms =<< (count opts) 1000 "reading GO terms: " =<< readGO godb when (isNothing $ M.lookup (GO 0) t) $ return () -- strictness hack return t proteins <- getProts opts when (S.member (B.empty) proteins == False) $ return () -- strictness mt <- readGOA goa >>= (count opts) 1000 "reading GO annotations: " >>= return . protTerms proteins when (isNothing $ M.lookup B.empty mt) $ return () -- strictness, again return (ts,mt) let tabulator = tabulate opts goterms goanns case format opts of Csv -> readXMLs (inputs opts) >>= csvize opts tabulator . concatMap results Html -> readXMLs (inputs opts) >>= htmlize opts tabulator -- ---------------------------------------------------------------------- -- Build tables of annotation information -- ---------------------------------------------------------------------- -- | Parse GO.terms_and_ids into a map goTerms :: [GoDef] -> Map GoTerm GoDef goTerms = M.fromList . map go2keyval where go2keyval gd@(GoDef gt _ _) = (gt,gd) -- | Read blast xml output and collect UniProt hits getProts :: Options -> IO (Set UniProtAcc) getProts opts = do hs <- mapM (\f -> (count opts) 100 ("reading accessions from '"++f++"': ") . concatMap BD.hits . concatMap BD.results =<< readXML f) (inputs opts) return . S.fromList . map (chop . BD.subject) $ concat hs -- | Convert "UniProt_ABCDEF Blah blah blah" to "ABCDEF" -- Warning: only tested on this format, may break! chop :: ByteString -> ByteString chop = B.drop 1 . B.dropWhile (/='_') . head . B.words -- | Using GO definitions and the set of uniprot terms, scan the associations file and -- snarf all relevant data. Requires the annotations to be grouped by protein. protTerms :: (Set UniProtAcc) -> [Annotation] -> Map UniProtAcc (UArray Int Word16) protTerms ps as = M.fromAscList . map toArray . partitions . filter isMember $ as where toArray (Ann up (GO gt1) _:xs) = let gs = nub (gt1:[ gt | Ann _ (GO gt) _ <- xs ]) a = listArray (0,length gs-1) $ map fromIntegral gs in a `seq` (pack . unpack $ up,a) toArray [] = error "cannot read an empty list of annotations!" isMember (Ann up _ _) = up `S.member` ps partitions xs@(Ann x _ _:_) = let (p1,pps) = span (\(Ann y _ _) -> y==x) xs in p1:partitions pps partitions [] = [] -- --------------------------------------------------------------------- -- | Implement CSV output -- --------------------------------------------------------------------- csvize :: Options -> ([String], BlastRecord -> [[String]]) -> [BlastRecord] -> IO () htmlize :: Options -> ([String], BlastRecord -> [[String]]) -> [BlastResult] -> IO () csvize opts (header,writer) brs = do hs <- count opts 10 "Generating output: " brs let csvFormat = unlines . map (concat . intersperse "," . map maybeQuote) maybeQuote = filter (/=',') output opts $ csvFormat (header : concatMap writer hs) htmlize opts (header,writer) brs = do doesDirectoryExist htmldir >>= \d -> if d then when (verb opts) $ hPutStrLn stderr "Warning: reusing exisiting directory 'blast.d'" else createDirectory htmldir brs' <- count opts 10 "Generating output: " $ concatMap results brs withFile "index.html" WriteMode $ \h -> do hPutStr h (htmlheader (head brs) header) mapM (mkHtml h writer) brs' hPutStr h htmlfooter -- | read file and return a header, and a function for generating output tabulate :: Options -> Map GoTerm GoDef -> Map UniProtAcc (UArray Int Word16) -> ([String], BlastRecord -> [[String]]) tabulate opts gds pts = let header = ["Query","from","to","Target","Description","from","to","ident","bitscore","E-value","direction"] ++ if isJust (goann opts) then ["GO terms"] ++ case select opts of All -> [] _ -> ["indirect GO"] else [] wr = case format opts of Html -> showTop Csv -> case select opts of All -> showAll Top -> showTop Reg -> showReg in (header, wr (isJust (goann opts)) gds pts) -- a writer generates one or more output records from a BlastRecord type Writer = Bool -> Map GoTerm GoDef -> Map UniProtAcc (UArray Int Word16) -> (BlastRecord -> [[String]]) showAll, showTop, showReg :: Writer -- nb: the returns are due to the type of flatten :: [BlastRecord] -> .. showAll go gds pts = map show1 . flatten . return where show1 bf = showFlat bf ++ if go then [showGo gds pts [subject bf]] else [] showTop go g p = showTop' go g p . flatten . return showTop' :: Bool -> Map GoTerm GoDef -> Map UniProtAcc (UArray Int Word16) -> [BlastFlat] -> [[String]] showTop' go gds pts = map show1 . select_first where show1 (bf,go1,go2) = showFlat bf ++ if go then [go1,go2] else [] select_first [] = [] -- take all hits for this query select_first (x:xs) = let (as,zs) = span (\y -> query x == query y) xs (bs,ys) = span (\y -> subject x == subject y) as ysubs = map head $ group $ map subject ys in (merge (x:bs), showGo gds pts [subject x],showGo gds pts ysubs) : select_first zs -- merge all hits against the same subject merge :: [BlastFlat] -> BlastFlat merge (x:xs) = let (rest,_) = partition (\y -> e_val y == e_val x) xs q1 = minimum (map q_from (x:rest)) q2 = maximum (map q_to (x:rest)) s1 = minimum (map h_from (x:rest)) s2 = maximum (map h_to (x:rest)) in x { q_from = q1, q_to = q2, h_from = s1, h_to = s2 } merge [] = error "needs at least one blast hit to generate output" showReg go gds pts = concatMap (showTop' go gds pts) . select_region . flatten . return where select_region [] = [] select_region (x:xs) = let (this' ,next) = span ((query x==).query) xs (this'',next') = partition ((dir x==).dir) this' x' = merge (x:takeWhile ((subject x==).subject) this'') this = sortBy (compare `on` q_from) $ this'' in regions [x'] (q_from x', q_to x') this ++ select_region (next'++next) regions :: [BlastFlat] -> (Int,Int) -> [BlastFlat] -> [[BlastFlat]] regions x (a,b) (y:ys) = if overlap (a,b) y then regions (y:x) (min a (q_from y), max b (q_to y)) ys else reverse x : regions [y] (q_from y, q_to y) ys regions x (_,_) [] = [reverse x] overlap (a,b) s = q_from s < (a+b) `div` 2 || q_to s < b + (a+b) `div` 2 f `on` g = \x y -> f (g x) (g y) -- | generate one record (i.e. line of output) showFlat :: BlastFlat -> [String] showFlat bf = [ head $ words $ unpack $ query bf , show $ q_from bf, show $ q_to bf , head $ words $ unpack $ subject bf , unpack $ B.dropWhile (==' ') $ B.dropWhile (/=' ') $ subject bf , show $ h_from bf, show $ h_to bf , show (fst . identity $ bf), show (bits bf), show (e_val bf) , dir bf ] dir :: BlastFlat -> String dir bf = case aux bf of Strands p p' -> if p==p' then "Fwd" else "Rev" Frame d _ -> if d==Minus then "Rev" else "Fwd" showGo :: Map GoTerm GoDef -> Map UniProtAcc (UArray Int Word16) -> [UniProtAcc] -> String showGo gds pts gs = showGD . unions . map fromJust . filter isJust . map (flip M.lookup pts) . map chop $ gs where showGD = concatMap (\gd -> show (GO $ fromIntegral gd) ++ showGT (M.lookup (GO $ fromIntegral gd) gds)) showGT (Just (GoDef _ str cls)) = " ("++show cls++": "++unpack str++") " showGT Nothing = " " unions = nub . concatMap elems