{- | Parse blast XML output.

   If you use a recent version of NCBI BLAST and specify XML output (blastall -m 7),
   this module should be able to parse the result into a hierarchical 'BlastResult'
   structure.

   While the process may consume a bit of memory, the parsing is lazy,
   and file sizes of several gigabytes can be parsed (see e.g. the
   xml2x tool for an example).  To parse XML, we use
   'Text.HTML.TagSoup'. 
-}

module Bio.Alignment.BlastXML (readXML) where

import Bio.Alignment.BlastData

import qualified Data.ByteString.Lazy.Char8 as B
import Text.HTML.TagSoup
import Control.Monad
import Control.Parallel
import Data.List (isPrefixOf)

type STag = Tag String

-- | Parse BLAST results in XML format
readXML :: FilePath -> IO [BlastResult]
readXML fp = do 
    fc <- readFile fp
    when (not $ isPrefixOf "<?xml" fc) 
             $ error ("Bio.Sequence.BlastXML.readXML:\n   The file '"
                      ++fp++"' does not look like an XML file - aborting!")
    let ts = parseTags fc
        (h:iters) = breaks (\t -> isTagOpenName "Iteration" t || isTagOpenName "Hit" t) ts
    return [xml2br h iters]

-- | breaks p = groupBy (const (not.p))
breaks :: (a -> Bool) -> [a] -> [[a]]
breaks p (x:xs) = let first = x : takeWhile (not.p) xs
                      rest  = dropWhile (not.p) xs
                  in  rest `par` first : if null rest then [] else breaks p rest
breaks _ []     = []

getFrom :: [STag] -> String -> String
getFrom list tag = let xs = sections (isTagOpenName tag) list 
                   in if null xs || null (head xs) || (null . drop 1 . head) xs 
                      then error ("Couldn't find tag '"++show tag++"' in\n"++showSome list)
                      else case xs !! 0 !! 1 of 
                             TagText s -> s
                             x -> error ("Unexpeced tag: "++ show x)

-- Use pattern match since 'length' is strict, defeating the purpose.
showSome :: [STag] -> String
showSome a@(_:_:_:_:_:_:_) = (init . show . take 5 $ a)++" ... ]"
showSome a                 = show a

xml2br :: [STag] -> [[STag]] -> BlastResult
xml2br h is = BlastResult { blastprogram = get "BlastOutput_program"
                          , blastversion = bv
                          , blastdate = bd 
                          , blastreferences = get "BlastOutput_reference"
                          , database = get "BlastOutput_db"
                          , dbsequences = 0
                          , dbchars = 0
                          , results = map iter2rec $ breaks (isTagOpenName "Iteration" . head) is
                          }
    where (bv,bd) = B.break (=='[') $ get "BlastOutput_version"
          get = B.pack . getFrom h

iter2rec :: [[STag]] -> BlastRecord
iter2rec (i:hs) = BlastRecord 
              { query = B.pack $ get "Iteration_query-def"
              , qlength = read $ get "Iteration_query-len"
              , hits = map hit2hit hs
              }
    where get = getFrom i
iter2rec [] = error "iter2rec: got empty list of sections!"

hit2hit :: [STag] -> BlastHit
hit2hit hs = BlastHit 
             { subject = B.pack $ get "Hit_def"
             , slength = read $ get "Hit_len"
             , matches = map hsp2match $ partitions (isTagOpenName "Hsp") hs
             }
    where get = getFrom hs


hsp2match :: [STag] -> BlastMatch
hsp2match ms = BlastMatch
               { bits   = read $ get "Hsp_bit-score"
               , e_val  = read $ get "Hsp_evalue"
               , q_from = read $ get "Hsp_query-from"
               , q_to   = read $ get "Hsp_query-to"
               , h_from = read $ get "Hsp_hit-from"
               , h_to   = read $ get "Hsp_hit-to"
               , identity = (read $ get "Hsp_identity", read $ get "Hsp_align-len")
               -- blastx has query-frame ±1..3 
               -- tblastn has hit-frame
               -- blastn has both hit and query
               -- tblastx has query-frame = 1, hit-frame ±1..3
               , aux = case sections (isTagOpenName "Hsp_hit-frame") ms of
                         [] -> mkFrame $ get "Hsp_query-frame"
                         [(_o:TagText hf:_c)] -> case sections (isTagOpenName "Hsp_query-frame") ms of 
                                                   [] -> mkFrame hf
                                                   [(__o:TagText qf:__c)] -> mkStrands hf qf
                                                   e -> error ("hsp2match: should be tagopen/text/close:\n"++show e)
                         e -> error ("hsp2match: failed to determine frame:\n"++show e)
               }
    where get = getFrom ms
          mkFrame f = Frame (strand' $ signum $ read f) (abs $ read f)
          mkStrands h q = Strands (strand' $ read h) (strand' $ read q)
          -- ignore frame also for tblastx hits (it can be reconstructed from location)
          strand' :: Int -> Strand
          strand' s = if s > 0 then Plus else Minus