src/Bio/Sequence/Phd.hs

module Bio.Sequence.Phd(Phd(..), readPhd, readPhdTags) where

import Bio.Core.Sequence
import Bio.Sequence.PhdData

import Text.ParserCombinators.Parsec hiding (label)

import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString.Lazy.Char8 as LBC

import Data.Ix
import Data.Int (Int64)
import Data.List
import Data.Maybe
import Data.Text (Text)
import Data.Binary (encode)
import System.IO

-- | Parse a .phd file, extracting the contents as a PHD
readPhd :: FilePath -> IO Phd
readPhd f = return . mkPhd =<< readFile f

readPhdTags :: FilePath -> IO (Maybe [PhdTag])
readPhdTags f = return . (mkPhdTags . lines) =<< readFile f

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO Phd
hReadPhd h = return . mkPhd =<< hGetContents h

-- | The actual phd parser.

mkPhd :: String -> Phd
mkPhd inp = 
  let (hd:fs)                = filter (not . null) . lines $ inp
      (magic,label)          = splitAt 15 hd
      (comment'',seqAndTags) = break (== "BEGIN_DNA") fs
      comment'               = init $ tail comment''
      comment                = map (tail . snd) (map (break (==' ')) comment')
      (seq', tags')          = break (== "END_DNA") seqAndTags
      seq                    = drop 1 seq'
      tags                   = init $ tail tags'
      fields                 = words . unlines $ comment
      sdata                  = filter ((==3).length) . map words $ seq
      err                    = error "failed to parse quality value"
      dna                    = concat $ map (!!0) $ sdata
      qual                   = map ((\x -> read x :: Int) . (!!1)) $ sdata
      traceInd               = map ((\x -> read x :: Int) . (!!2)) $ sdata
  in if (magic == "BEGIN_SEQUENCE ") then Phd (mkComment  comment) 
                                              (mkDNABlock label dna qual traceInd)
                                              (mkPhdTags tags)
                                            
     else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE"
     --   Todo: also check that we have a BEGIN_DNA/END_DNA region there.

mkComment :: [String] -> Comment
mkComment com = Comment { chromatFile        = com!!0
                        , abiThumbprint      = com!!1
                        , phredVersion       = com!!2
                        , callMethod         = com!!3
                        , qualityLevels      = read (com!!4) :: Int
                        , time               = com!!5
                        , traceArrayMinIndex = read (com!!6) :: Int
                        , traceArrayMaxIndex = read (com!!7) :: Int
                        , trim               = com!!8
                        , chem               = com!!9
                        , dye                = com!!10 }

mkDNABlock :: String -> String -> [Int] -> [Int] -> DNABlock
mkDNABlock l d q t = DNABlock { label        = l
                              , bases        = SeqData  { unSD = LBC.pack d }
                              , qualities    = QualData { unQD = encode   q }
                              , traceIndices = t }

mkPhdTags :: [String] -> Maybe [PhdTag]
mkPhdTags phdLines = case groupByTags phdLines of
                       [] -> Nothing
                       _ -> Just (map (fromJust . mkOnePhdTag) (groupByTags phdLines))

groupByTags :: [String] -> [[String]]
groupByTags [] = []
groupByTags xs =
  let begin_indices = elemIndices "BEGIN_TAG" xs
      end_indices   = elemIndices "END_TAG" xs
      tag_spans     = zip begin_indices end_indices
      grouping      = \x -> drop (fst (tag_spans!!x)) (take (snd (tag_spans!!x) + 1) xs)
  in  map grouping (Data.Ix.range (0, (length tag_spans) -1)) 

mkOnePhdTag :: [String] -> Maybe PhdTag
mkOnePhdTag td = case length td of 
                   0 -> Nothing
                   9 -> Just PhdTag { tagType = drop 6 (td!!1)
                                   , source  = drop 8 (td!!2)
                                   , unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3)))
                                   , date    = drop 6 (td!!4)
                                   , comment = if td!!6 == "BEGIN_TAG" then ""
                                               else td!!6 
                                   }
                   _ -> Just PhdTag { tagType = drop 6 (td!!1)
                                   , source  = drop 8 (td!!2)            
                                   , unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3)))          
                                   , date    = drop 6 (td!!4)                           
                                   , comment = ""
                                   }