{-# LANGUAGE Arrows #-}

-- | Parse Multiple EM for Motif Elicitation (MEME) XML output.
--   xml parsing is done with the HXT libary.
--   For more information on MEME consult: <http://meme.nbcr.net/meme/>
module Bio.MemeXML ( parseXML
                   , atTag
                   , atId
                   , rstrip
                   , getMemeResult
                   , getTrainingSet
                   , getMemeAlphabet
                   , getLetter
                   , getAmbigs
                   , getLetterFrequencies
                   , getModel
                   , getBackgroundFrequencies
                   , getScannedSiteSummary
                   , getScannedSites
                   , getScannedSite
                   , getSequence
                   , getMotif
                   , getContributingSite
                   , module Bio.MemeData) where

import Bio.MemeData
import Text.XML.HXT.Core
import Data.Char (isSpace)    

-- | Parse XML results in XML format
parseXML :: String -> IOStateArrow s b XmlTree              
parseXML = readDocument [ withValidate no
                        , withRemoveWS yes  -- throw away formating WS
                        ] 
           
-- | gets all subtrees with the specified tag name
atTag :: ArrowXml a =>  String -> a XmlTree XmlTree
atTag tag = deep (isElem >>> hasName tag)

-- | gets all subtrees with the specified id attribute
atId :: ArrowXml a =>  String -> a XmlTree XmlTree
atId elementId = deep (isElem >>> hasAttrValue "id" (== elementId))

-- | removes whitespace characters                 
rstrip :: String -> String
rstrip = reverse . dropWhile isSpace . reverse

-- | gets the whole memexml document tree 
getMemeResult :: ArrowXml a => a XmlTree MemeResult  
getMemeResult = atTag "MEME" >>> 
  proc memeResult -> do
  memeResult_version <- getAttrValue "version" -< memeResult
  memeResult_release <- getAttrValue "release" -< memeResult
  memeResult_training_set  <- getTrainingSet -< memeResult
  memeResult_model  <- getModel -< memeResult
  memeResult_motifs  <- listA getMotif -< memeResult
  memeResult_scanned_site_summary  <- getScannedSiteSummary -< memeResult
  returnA -< MemeResult {
    memeversion = memeResult_version,
    memerelease = memeResult_release,
    trainingset = memeResult_training_set,
    model = memeResult_model,
    motifs = memeResult_motifs,
    scannedSiteSummary = memeResult_scanned_site_summary}
  
-- | get the Training set subtree
getTrainingSet :: ArrowXml a => a XmlTree TrainingSet  
getTrainingSet = atTag "training_set" >>> 
  proc trainingSet -> do
  trainingSet_datafile <- getAttrValue "datafile" -< trainingSet
  trainingSet_length <- getAttrValue "length" -< trainingSet
  trainingSet_alphabet <- getMemeAlphabet -< trainingSet
  trainingSet_ambigs <- getAmbigs -< trainingSet
  trainingSet_sequences <- listA getSequence -< trainingSet
  trainingSet_letterfrequencies <- getLetterFrequencies -< trainingSet
  returnA -< TrainingSet {
    trainingsetDatafile = trainingSet_datafile,
    trainingsetLength = read trainingSet_length :: Int,
    trainingsetAlphabet = trainingSet_alphabet,
    trainingsetAmbigs = trainingSet_ambigs,
    trainingsetSequences = trainingSet_sequences,
    trainingsetLetterFrequencies = trainingSet_letterfrequencies}

getMemeAlphabet :: ArrowXml a => a XmlTree MemeAlphabet
getMemeAlphabet = atTag "alphabet" >>> 
  proc memealphabet -> do
    alphabet_id <- getAttrValue "id" -< memealphabet
    alphabet_length <- getAttrValue "length" -< memealphabet
    alphabet_letters <- listA getLetter -< memealphabet
    returnA -< MemeAlphabet {
      memeAlphabetId = alphabet_id, 
      memeAlphabetLength = read alphabet_length :: Int,
      memeAlphabetLetters = alphabet_letters}

getLetter :: ArrowXml a => a XmlTree MemeLetter
getLetter = atTag "letter" >>> 
  proc memeletter -> do
    letter_id <- getAttrValue "id" -< memeletter
    letter_symbol <- getAttrValue "symbol" -< memeletter
    returnA -< MemeLetter {
      memeLetterId = letter_id, 
      memeLetterSymbol = read letter_symbol :: Char}

getAmbigs :: ArrowXml a => a XmlTree MemeAmbigs
getAmbigs = atTag "ambigs" >>> 
  proc memeambigs -> do
    ambigs_letters <- listA getLetter -< memeambigs
    returnA -< MemeAmbigs {
      memeAmbigLetters = ambigs_letters} 

getLetterFrequencies :: ArrowXml a => a XmlTree LetterFrequencies
getLetterFrequencies = atTag "letter_frequencies" >>> 
  proc memeletterfrequencies -> do
    letterfrequencies_alphabetarray <- getAlphabetArray -< memeletterfrequencies
    returnA -< LetterFrequencies {
      letterFrequenciesAlphabetArray = letterfrequencies_alphabetarray} 
  
getAlphabetArray :: ArrowXml a => a XmlTree AlphabetArray
getAlphabetArray =  atTag "alphabet_array" >>> 
  proc memealphabetarray -> do
    alphabetarray_values <- listA getAlphabetArrayValues -< memealphabetarray
    returnA -< AlphabetArray {
      alphabetArrayValues = alphabetarray_values} 

getAlphabetArrayValues :: ArrowXml a => a XmlTree AlphabetArrayValue
getAlphabetArrayValues =  atTag "alphabet_array_value" >>> 
  proc memealphabetarrayvalue -> do
    alphabetarrayvalue_id <- getAttrValue "letter_id" -< memealphabetarrayvalue
    alphabetarrayvalue_frequency <- getText <<< getChildren -< memealphabetarrayvalue
    returnA -< AlphabetArrayValue {
      letterId = alphabetarrayvalue_id,
      frequency = read alphabetarrayvalue_frequency :: Double} 

-- | get the model subtree    
getModel :: ArrowXml a => a XmlTree Model
getModel = atTag "model" >>>
  proc mememodel -> do
     model_command_line <- getText <<< getChildren <<< atTag "command_line" -< mememodel
     model_host <- getText <<< getChildren <<< atTag "host" -< mememodel
     model_type <- getText <<< getChildren <<< atTag "type" -< mememodel
     model_nmotifs <- getText <<< getChildren <<< atTag "nmotifs" -< mememodel
     model_evalue_threshold <- getText <<< getChildren <<< atTag "evalue_threshold" -< mememodel
     model_object_function <- getText <<< getChildren <<< atTag "object_function" -< mememodel
     model_min_width <- getText <<< getChildren <<< atTag "min_width" -< mememodel
     model_max_width <- getText <<< getChildren <<< atTag "max_width" -< mememodel
     model_minic <- getText <<< getChildren <<< atTag "minic" -< mememodel
     model_wg <- getText <<< getChildren <<< atTag "wg" -< mememodel
     model_ws <- getText <<< getChildren <<< atTag "ws"-< mememodel
     model_endgaps <- getText <<< getChildren <<< atTag "endgaps" -< mememodel
     model_minsites <- getText <<< getChildren <<< atTag "minsites" -< mememodel
     model_maxsites <- getText <<< getChildren <<< atTag "maxsites" -< mememodel
     model_wnsites <- getText <<< getChildren <<< atTag "wnsites" -< mememodel
     model_prob <- getText <<< getChildren <<< atTag "prob" -< mememodel
     model_spmap <- getText <<< getChildren <<< atTag "spmap" -< mememodel
     model_spfuzz <- getText <<< getChildren <<< atTag "spfuzz" -< mememodel
     model_prior <- getText <<< getChildren <<< atTag "prior"-< mememodel
     model_beta <- getText <<< getChildren <<< atTag "beta" -< mememodel
     model_maxiter <- getText <<< getChildren <<< atTag "maxiter" -< mememodel
     model_distance <- getText <<< getChildren <<< atTag "distance" -< mememodel
     model_num_sequences <- getText <<< getChildren <<< atTag "num_sequences" -< mememodel
     model_num_positions <- getText <<< getChildren <<< atTag "num_positions" -< mememodel
     model_seed <- getText <<< getChildren <<< atTag "seed" -< mememodel
     model_seqfrac <- getText <<< getChildren <<< atTag "seqfrac" -< mememodel
     model_strands <- getText <<< getChildren <<< atTag "strands" -< mememodel
     model_priors_file <- getText <<< getChildren <<< atTag "priors_file" -< mememodel
     model_reason_for_stopping <- getText <<< getChildren <<< atTag "reason_for_stopping"-< mememodel
     model_background_frequencies <- getBackgroundFrequencies -< mememodel
     returnA -< Model {
        commandLine = model_command_line,
        host = model_host,
        modelType = model_type,
        nmotifs = read model_nmotifs :: Int,
        evalueThreshold = model_evalue_threshold,
        objectFunction = model_object_function,
        minWidth = read model_min_width :: Int,
        maxWidth = read model_max_width :: Int,
        minic = read model_minic :: Double,
        wg = read model_wg :: Int,
        ws = read model_ws :: Int,
        endGaps = model_endgaps,
        minSites = read model_minsites :: Int,
        maxSites = read model_maxsites :: Int,
        wnSites = read model_wnsites :: Double,
        prob = read model_prob :: Int,
        spMap = model_spmap,
        spFuzz = read model_spfuzz :: Double,
        prior = model_prior,
        beta = read model_beta :: Double,
        maxiter = read model_maxiter :: Int,
        distance = read model_distance :: Double,
        numSequences =  read model_num_sequences :: Int,
        numPositions = read model_num_positions :: Int,
        seed = read model_seed :: Int,
        seqfrac = model_seqfrac,
        strands =  model_strands,
        priorsFile =  model_priors_file,
        reasonForStopping =  model_reason_for_stopping,
        backgroundFrequencies = model_background_frequencies}

getBackgroundFrequencies :: ArrowXml a => a XmlTree BackgroundFrequencies
getBackgroundFrequencies =  atTag "background_frequencies" >>> 
  proc memebackgroundfrequencies -> do
    backgroundfrequencies_source <- getAttrValue "source" -< memebackgroundfrequencies
    backgroundfrequencies_alphabetarray <- getAlphabetArray -<  memebackgroundfrequencies
    returnA -< BackgroundFrequencies {
      source = backgroundfrequencies_source,
      backgroundFrequenciesAlphabetArray = backgroundfrequencies_alphabetarray} 

getSequence :: ArrowXml a => a XmlTree Sequence          
getSequence = atTag "sequence" >>> 
  proc nucleotideSequence -> do
    nucleotide_SeqId <- getAttrValue "id" -< nucleotideSequence
    nucleotide_SeqName <- getAttrValue "name" -< nucleotideSequence
    nucleotide_SeqLength <- getAttrValue "length" -< nucleotideSequence
    nucleotide_SeqWeight <- getAttrValue "weight" -< nucleotideSequence
    returnA -< Sequence {
      sequenceId = nucleotide_SeqId, 
      sequenceName = nucleotide_SeqName,
      sequenceLength = read nucleotide_SeqLength :: Int,
      sequenceWeight = read nucleotide_SeqWeight :: Double}

-- | get a result motif
getMotif :: ArrowXml a => a XmlTree Motif  
getMotif = atTag "motif"  >>> 
  proc mememotif -> do
    motif_id <- getAttrValue "id" -< mememotif
    motif_name <- getAttrValue "name" -< mememotif
    motif_width <- getAttrValue "width" -< mememotif
    motif_sites <- getAttrValue "sites" -< mememotif
    motif_ic <- getAttrValue "ic" -< mememotif
    motif_re <- getAttrValue "re" -< mememotif
    motif_llr <- getAttrValue "llr" -< mememotif
    motif_e_value <- getAttrValue "e_value" -< mememotif
    motif_bayes_threshold <- getAttrValue "bayes_threshold" -< mememotif
    motif_elapsed_time <- getAttrValue "elapsed_time" -< mememotif
    motif_scores <- getScores -< mememotif
    motif_propabilities <- getPropabilities -< mememotif
    regex <- getText <<< getChildren <<<  atTag "regular_expression" -< mememotif
    contributingsites <- listA getContributingSite -< mememotif
    returnA -< Motif {
      motifId = motif_id, 
      motifName = motif_name,
      motifWidth = read motif_width :: Int,
      motifSites = read motif_sites :: Int,
      motifIc = read motif_ic :: Double,
      motifRe = read motif_re :: Double,
      motifLlr = read motif_llr :: Int, 
      motifEvalue = read motif_e_value :: Double,
      motifBayesTreshold = read motif_bayes_threshold :: Double,
      motifElapsedTime = read motif_elapsed_time :: Double,
      -- regex field contains 2 linebreaks
      motifRegularexpression = filter (/= '\n') regex,
      motifScores = motif_scores,
      motifProbabilites = motif_propabilities,
      motifContributingsites = contributingsites }

getScores :: ArrowXml a => a XmlTree Scores
getScores = atTag "scores" >>>
  proc memescores -> do
  scores_alphabetmatrix <-getAlphabetMatrix  -< memescores
  returnA -< Scores {
    scoreAlphabetMatrix = scores_alphabetmatrix}

getPropabilities :: ArrowXml a => a XmlTree Propabilities
getPropabilities = atTag "propabilities" >>>
  proc memepropabilities -> do
  propabilities_alphabetmatrix <-getAlphabetMatrix  -< memepropabilities
  returnA -< Propabilities {
    propabilitiesAlphabetMatrix = propabilities_alphabetmatrix}

getAlphabetMatrix :: ArrowXml a => a XmlTree AlphabetMatrix
getAlphabetMatrix = atTag "alphabet_matrix" >>>
  proc memeaalphabetmatrix -> do
    alphabet_matrix <- listA getAlphabetArray -< memeaalphabetmatrix
    returnA -< AlphabetMatrix {
      alphabetMatrixArrays = alphabet_matrix} 
   
getContributingSite :: ArrowXml a => a XmlTree ContributingSite  
getContributingSite = atTag "contributing_site" >>> 
  proc contributingsite -> do
  contributing_site_id <- getAttrValue "sequence_id" -< contributingsite
  contributing_site_position <- getAttrValue "position" -< contributingsite
  contributing_site_strand <- getAttrValue "strand" -< contributingsite
  contributing_site_pvalue <- getAttrValue "pvalue" -< contributingsite
  contributing_site_left_flank <- getText <<< getChildren <<< atTag "left_flank"  -< contributingsite
  contributing_site <- getSite  -< contributingsite
  contributing_site_right_flank <- getText <<< getChildren <<< atTag "right_flank"  -< contributingsite
  returnA -<  ContributingSite {
    contributingSiteId = contributing_site_id,
    contributingSitePosition = read contributing_site_position :: Int,
    contributingSiteStrand = contributing_site_strand,
    contributingSitePvalue = read contributing_site_pvalue :: Double,
    contributingSiteLeftFlank = contributing_site_left_flank,
    contributingSite = contributing_site,
    contributingSiteRightFlank = contributing_site_right_flank}

getSite :: ArrowXml a => a XmlTree Site
getSite = atTag "site" >>>
   proc site -> do
   site_letterreferences <- listA getSiteLetterReference -< site
   returnA -< Site {
     siteLetterReferences = site_letterreferences}
 
getSiteLetterReference :: ArrowXml a => a XmlTree LetterReference
getSiteLetterReference = atTag "letter_ref" >>>
  proc letterreference -> do
  letterid <- getAttrValue "letter_id" -< letterreference
  returnA -< LetterReference {
    letterReference = letterid}
   
getScannedSiteSummary :: ArrowXml a => a XmlTree ScannedSiteSummary
getScannedSiteSummary = atTag "scanned_sites_summary" >>>
  proc scannedsitesummary  -> do
    scannedsitesummary_p_tresh <- getAttrValue "p_thresh" -< scannedsitesummary
    scannedsites <- listA getScannedSites -< scannedsitesummary
    returnA -< ScannedSiteSummary{
      pThresh = read scannedsitesummary_p_tresh :: Double,
      scannedSites = scannedsites}

getScannedSites :: ArrowXml a => a XmlTree ScannedSites
getScannedSites = atTag "scanned_sites" >>>
  proc scannedsites  -> do
    scannedsites_sequence_id <- getAttrValue "sequence_id" -< scannedsites
    scannedsites_pvalue <- getAttrValue "pvalue" -< scannedsites
    scannedsites_num_sites <- getAttrValue "num_sites" -< scannedsites       
    scannedsites_array <- listA getScannedSite -< scannedsites
    returnA -< ScannedSites{
      scannedsitesSequenceId = scannedsites_sequence_id,
      scannedSitesPvalue = read scannedsites_pvalue :: Double,
      numSites = read scannedsites_num_sites :: Int,
      scannedSiteArray = scannedsites_array}
                               
getScannedSite :: ArrowXml a => a XmlTree ScannedSite
getScannedSite = atTag "scanned_site" >>>
  proc scannedsite  -> do
    scannedsite_motif_id <- getAttrValue "motif_id" -< scannedsite
    scannedsite_strand <- getAttrValue "strand" -< scannedsite
    scannedsite_position <- getAttrValue "position" -< scannedsite     
    scannedsite_sitepvalue <- getAttrValue "pvalue" -< scannedsite
    returnA -< ScannedSite{
      scannedsiteMotifId = scannedsite_motif_id,
      strand = scannedsite_strand,
      position = read scannedsite_position :: Int,
      scannedSitePvalue = read scannedsite_sitepvalue :: Double}