module Bio.Sequence.Phd(Phd(..), readPhd, readPhdTags) where
import Bio.Core.Sequence
import Bio.Sequence.PhdData
import Text.ParserCombinators.Parsec hiding (label)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString.Lazy.Char8 as LBC
import Data.Ix
import Data.Int (Int64)
import Data.List
import Data.Maybe
import Data.Text (Text)
import Data.Binary (encode)
import System.IO
readPhd :: FilePath -> IO Phd
readPhd f = return . mkPhd =<< readFile f
readPhdTags :: FilePath -> IO (Maybe [PhdTag])
readPhdTags f = return . (mkPhdTags . lines) =<< readFile f
hReadPhd :: Handle -> IO Phd
hReadPhd h = return . mkPhd =<< hGetContents h
mkPhd :: String -> Phd
mkPhd inp =
let (hd:fs) = filter (not . null) . lines $ inp
(magic,label) = splitAt 15 hd
(comment'',seqAndTags) = break (== "BEGIN_DNA") fs
comment' = init $ tail comment''
comment = map (tail . snd) (map (break (==' ')) comment')
(seq', tags') = break (== "END_DNA") seqAndTags
seq = drop 1 seq'
tags = init $ tail tags'
fields = words . unlines $ comment
sdata = filter ((==3).length) . map words $ seq
err = error "failed to parse quality value"
dna = concat $ map (!!0) $ sdata
qual = map ((\x -> read x :: Int) . (!!1)) $ sdata
traceInd = map ((\x -> read x :: Int) . (!!2)) $ sdata
in if (magic == "BEGIN_SEQUENCE ") then Phd (mkComment comment)
(mkDNABlock label dna qual traceInd)
(mkPhdTags tags)
else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE"
mkComment :: [String] -> Comment
mkComment com = Comment { chromatFile = com!!0
, abiThumbprint = com!!1
, phredVersion = com!!2
, callMethod = com!!3
, qualityLevels = read (com!!4) :: Int
, time = com!!5
, traceArrayMinIndex = read (com!!6) :: Int
, traceArrayMaxIndex = read (com!!7) :: Int
, trim = com!!8
, chem = com!!9
, dye = com!!10 }
mkDNABlock :: String -> String -> [Int] -> [Int] -> DNABlock
mkDNABlock l d q t = DNABlock { label = l
, bases = SeqData { unSD = LBC.pack d }
, qualities = QualData { unQD = encode q }
, traceIndices = t }
mkPhdTags :: [String] -> Maybe [PhdTag]
mkPhdTags phdLines = case groupByTags phdLines of
[] -> Nothing
_ -> Just (map (fromJust . mkOnePhdTag) (groupByTags phdLines))
groupByTags :: [String] -> [[String]]
groupByTags [] = []
groupByTags xs =
let begin_indices = elemIndices "BEGIN_TAG" xs
end_indices = elemIndices "END_TAG" xs
tag_spans = zip begin_indices end_indices
grouping = \x -> drop (fst (tag_spans!!x)) (take (snd (tag_spans!!x) + 1) xs)
in map grouping (Data.Ix.range (0, (length tag_spans) 1))
mkOnePhdTag :: [String] -> Maybe PhdTag
mkOnePhdTag td = case length td of
0 -> Nothing
9 -> Just PhdTag { tagType = drop 6 (td!!1)
, source = drop 8 (td!!2)
, unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3)))
, date = drop 6 (td!!4)
, comment = if td!!6 == "BEGIN_TAG" then ""
else td!!6
}
_ -> Just PhdTag { tagType = drop 6 (td!!1)
, source = drop 8 (td!!2)
, unpaddedReadPosition = map (\x -> Offset {unOff = read x :: Int64}) (words (drop 19 (td!!3)))
, date = drop 6 (td!!4)
, comment = ""
}