module Bio.Sequence.Phd(PHD(..), readPhd) where import Bio.Core.Sequence import Bio.Sequence.PhdData import Text.ParserCombinators.Parsec hiding (label) import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as LB import qualified Data.ByteString.Lazy.Char8 as LBC import Data.Ix import Data.Int (Int64) import Data.List import Data.Text (Text) import Data.Binary (encode) import System.IO -- | Parse a .phd file, extracting the contents as a PHD readPhd :: FilePath -> IO PHD readPhd f = return . mkPhd =<< readFile f -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO PHD hReadPhd h = return . mkPhd =<< hGetContents h -- | The actual phd parser. mkPhd :: String -> PHD mkPhd inp = let (hd:fs) = filter (not . null) . lines $ inp (magic,label) = splitAt 15 hd (comment'',seqAndTags) = break (== "BEGIN_DNA") fs comment' = init $ tail comment'' comment = map (tail . snd) (map (break (==' ')) comment') (seq', tags') = break (== "END_DNA") seqAndTags seq = drop 1 seq' tags = init $ tail tags' fields = words . unlines $ comment sdata = filter ((==3).length) . map words $ seq err = error "failed to parse quality value" dna = concat $ map (!!0) $ sdata qual = map ((\x -> read x :: Int) . (!!1)) $ sdata traceInd = map ((\x -> read x :: Int) . (!!2)) $ sdata in if (magic == "BEGIN_SEQUENCE ") then PHD (mkComment comment) (mkDNABlock label dna qual traceInd) (mkPhdTags tags) else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE" -- Todo: also check that we have a BEGIN_DNA/END_DNA region there. mkComment :: [String] -> Comment mkComment com = Comment { chromatFile = com!!0 , abiThumbprint = com!!1 , phredVersion = com!!2 , callMethod = com!!3 , qualityLevels = read (com!!4) :: Int , time = com!!5 , traceArrayMinIndex = read (com!!6) :: Int , traceArrayMaxIndex = read (com!!7) :: Int , trim = com!!8 , chem = com!!9 , dye = com!!10 } mkDNABlock :: String -> String -> [Int] -> [Int] -> DNABlock mkDNABlock l d q t = DNABlock { label = l , bases = SeqData { unSD = LBC.pack d } , qualities = QualData { unQD = encode q } , traceIndices = t } mkPhdTags :: [String] -> Maybe [PhdTag] mkPhdTags [""] = Nothing mkPhdTags td = Just (map mkOnePhdTag (groupByTags td)) groupByTags :: [String] -> [[String]] groupByTags [] = [] groupByTags xs = let begin_indices = elemIndices "BEGIN_TAG" xs end_indices = elemIndices "END_TAG" xs tag_spans = zip begin_indices end_indices grouping = \x -> drop (fst (tag_spans!!x)) (take (snd (tag_spans!!x) + 1) xs) in map grouping (Data.Ix.range (0, (length tag_spans) -1)) mkOnePhdTag :: [String] -> PhdTag mkOnePhdTag td = if length td == 9 then PhdTag { tagType = drop 6 (td!!1) , source = drop 8 (td!!2) , unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3))) , date = drop 6 (td!!4) , comment = if td!!6 == "BEGIN_TAG" then "" else td!!6 } else PhdTag { tagType = drop 6 (td!!1) , source = drop 8 (td!!2) , unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3))) , date = drop 6 (td!!4) , comment = ""}