module Bio.Sequence.Phd (readPhd,hReadPhd) where
import Bio.Sequence.SeqData
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString as BBB
import System.IO
-- | Read the file at the given path and parse its contents as PHD data.
--
-- The contents are obtained with the lazy 'B.readFile' and handed to
-- 'mkPhd' unchanged.
readPhd :: FilePath -> IO Sequence
readPhd path = fmap mkPhd (B.readFile path)
-- | Parse PHD data from an already opened handle.
--
-- The remaining contents of the handle are obtained with the lazy
-- 'B.hGetContents' and handed to 'mkPhd' unchanged.
hReadPhd :: Handle -> IO Sequence
hReadPhd handle = fmap mkPhd (B.hGetContents handle)
-- | Parse the contents of a PHD file into a 'Sequence'.
--
-- Blank lines are ignored throughout. The first remaining line is the
-- header: its first 15 bytes are dropped (presumably the
-- @"BEGIN_SEQUENCE "@ prefix -- the prefix itself is not checked) and the
-- rest becomes the label. Every line before @BEGIN_DNA@ that does not
-- contain @_COMMENT@ is split into words and appended to the label,
-- separated by single spaces.
--
-- Base calls are the lines strictly between @BEGIN_DNA@ and @END_DNA@ that
-- consist of exactly three words; the first word is the base, the second
-- the quality value, and the third is ignored. Stopping at @END_DNA@ keeps
-- three-word lines that follow the DNA section from being mistaken for
-- base calls.
--
-- Quality values are narrowed to one byte each with 'fromIntegral', so
-- values outside 0..255 wrap around.
--
-- Calls 'error' when the input has no non-blank lines, or when a quality
-- field does not start with an integer.
mkPhd :: B.ByteString -> Sequence
mkPhd inp =
  case filter (not . B.null) (B.lines inp) of
    [] -> error "failed to parse PHD data: empty input"
    (hd : fs) ->
      let (comment, sd) = break (== B.pack "BEGIN_DNA") fs
          -- 'sd' still starts with the BEGIN_DNA marker (if present); skip
          -- it and keep only the lines up to the END_DNA marker.
          dnaLines = takeWhile (/= B.pack "END_DNA") (drop 1 sd)
          label = B.drop 15 hd
          fields = B.words . B.unlines
                 . filter (not . isSubstr (B.pack "_COMMENT")) $ comment
          -- Pattern matching on exactly three words replaces the partial
          -- 'head' and '!!' lookups.
          calls = [(base, q) | [base, q, _] <- map B.words dnaLines]
          err = error "failed to parse quality value"
          toQual = maybe err (fromIntegral . fst) . B.readInt
          qual = BB.fromChunks [BBB.pack (map (toQual . snd) calls)]
      -- Forcing 'qual' first makes a malformed quality value fail as soon
      -- as the result is demanded, not when the quality data is first read.
      in qual `seq` Seq (compact $ B.unwords (label : fields))
                        (compact $ B.concat $ map fst calls)
                        (Just qual)
-- | @isSubstr needle haystack@ is 'True' when @needle@ occurs anywhere in
-- @haystack@, i.e. when it is a prefix of at least one suffix of
-- @haystack@. An empty @needle@ always matches.
isSubstr :: B.ByteString -> B.ByteString -> Bool
isSubstr needle haystack =
  or [needle `B.isPrefixOf` suffix | suffix <- B.tails haystack]
-- | Rebuild a lazy 'B.ByteString' from the strict concatenation of all of
-- its chunks. The byte content is unchanged; only the chunking differs.
compact :: B.ByteString -> B.ByteString
compact lazyBytes = B.fromChunks [BBB.concat (B.toChunks lazyBytes)]