{- | This module implements a parser for BLAST results. This module is DEPRECATED. It is *very* recommended that you run blast with XML output instaed, and use the BlastXML module to parse it. Don't say I didn't warn you! BLAST is a tool for searching in (biological) sequences for similarity. This library is tested against NCBI-blast version 2.2.14. There exist several independent versions, so expect some incompatbilities if you're using a different BLAST version. The format is straightforward (and non-recursive), and this implementation uses a simple line-based, hierarchical parser. For more information on BLAST, check -} {-# LANGUAGE CPP #-} module Bio.Alignment.Blast {-# DEPRECATED "Use XML output and the BlastXML module" #-} (parse) where import Prelude hiding (lines,words) import qualified Data.ByteString.Lazy.Char8 as B import Data.ByteString.Lazy.Char8 (ByteString,isPrefixOf,lines,words) import Data.List (groupBy) import Data.Char (isDigit) import Bio.Alignment.BlastData import Bio.Util (splitWhen) -- String constant used in parsing str_query, str_gt, str_score, str_refer, str_datab, str_search :: ByteString str_query = B.pack "Query= " str_gt = B.pack ">" str_score = B.pack " Score = " str_refer = B.pack "Reference: " str_datab = B.pack "Database: " str_search= B.pack "Searching" #define BUG(C_,M_) (error ("Program error - '"++C_++"' failed: "++M_++". Location: "++__FILE__++" line: "++ show (__LINE__::Int))) #define readX (\s -> case [ x | (x,t) <- reads (B.unpack s), ("","") <- lex t] of { [x] -> x ; [] -> BUG("read",("no parse for "++B.unpack s)); _ -> BUG("read","ambigous parse")}) -------------------------------------------------- -- Splitting up the input in its relevant parts -- -------------------------------------------------- queries :: [ByteString] -> [[ByteString]] queries = splitWhen (isPrefixOf str_query) qhits :: [ByteString] -> [[ByteString]] qhits = splitWhen (isPrefixOf str_gt) hmatches :: [ByteString] -> [[ByteString]] hmatches = splitWhen (isPrefixOf str_score) -------------------------------------------------- -- parsing each part -- -------------------------------------------------- -- top level parsing function parse :: ByteString -> BlastResult parse s = let (p:qs) = queries $ lines s br = parse_preamble p rs = map parse_query qs in br { results = rs } -- parse metadata from the preamble parse_preamble :: [ByteString] -> BlastResult parse_preamble [] = error "Blast: can't parse preamble - empty input" parse_preamble (l:ls) = BlastResult { blastprogram = w1, blastversion = w2, blastdate = w3 , blastreferences = B.concat $ map (B.drop (B.length str_refer)) rs , database = B.drop (B.length str_datab) d , dbsequences = -1, dbchars = -1, results = [] } where [w1,w2,w3] = words l records = map B.unlines $ splitWhen (\p -> or $ map (\x -> isPrefixOf x p) [str_refer,str_datab,str_search]) ls rs = filter (isPrefixOf str_refer) records d = case filter (isPrefixOf str_datab) records of [d1] -> d1; _ -> B.pack "" -- parse the blast record for one query sequence parse_query :: [ByteString] -> BlastRecord parse_query ls = let ((s1:s2:_):hs) = qhits ls in BlastRecord { query = B.drop (B.length str_query) s1 , qlength = readX $ B.takeWhile isDigit $ (!!1) $ B.split '(' s2 , hits = map parse_hit hs } -- parse a hit against a sequence in the data base parse_hit :: [ByteString] -> BlastHit parse_hit ls = let (h:ms) = hmatches ls in BlastHit { subject = B.concat h, slength = -1, matches = map parse_match ms} -- parse a match between a query and a subject -- the format is a bit variable, it seems parse_match :: [ByteString] -> BlastMatch parse_match (l1:l2:l3:_) = let hdr = words $ B.concat [l1,l2,l3] [bs,ev,ident,str1,str2] = map (hdr!!) [2,7,10] ++ map (reverse hdr!!) [2,0] -- Strand is last, ignore optional Gap identX = let [nom,_:denom] = groupBy (const isDigit) $ B.unpack ident in (read nom, read denom) in BlastMatch { bits = readX bs, e_val = readX ev, identity = identX, aux = Strands (readX str1) (readX str2), q_from = ee, q_to = ee, h_from = ee, h_to = ee } where ee = error "hit coordinates is not supported - use XML format" parse_match _ = error "Blast: parse_match: pattern match failed"