module Bio.Sequence.Phd (readPhd,hReadPhd) where
import Bio.Sequence.SeqData
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString as BBB
import System.IO
-- | Read the PHD file at the given path and parse its contents with 'mkPhd'.
readPhd :: FilePath -> IO (Sequence Nuc)
readPhd f = fmap mkPhd (B.readFile f)
-- | Read PHD-formatted data from an already opened handle and parse it
-- with 'mkPhd'.
hReadPhd :: Handle -> IO (Sequence Nuc)
hReadPhd h = fmap mkPhd (B.hGetContents h)
-- | Build a 'Sequence' from the complete text of a PHD file.
--
-- Blank lines are ignored.  The first remaining line must start with
-- @"BEGIN_SEQUENCE "@; whatever follows that prefix becomes the first word
-- of the sequence header.  The lines before @BEGIN_DNA@ (minus any line
-- containing @_COMMENT@) are split into words and appended to the header.
-- Between @BEGIN_DNA@ and @END_DNA@, every line consisting of exactly three
-- whitespace-separated fields contributes its first field to the sequence
-- data and its second field (parsed as an integer) to the quality data.
--
-- Calls 'error' when the header line is missing or malformed (including
-- empty or all-blank input), and when a quality field is not an integer.
-- Quality values are stored one byte each, so integers outside 0..255 wrap.
mkPhd :: B.ByteString -> (Sequence Nuc)
mkPhd inp =
  case filter (not . B.null) (B.lines inp) of
    -- No non-blank lines at all: report it as a format error rather than
    -- letting a lazy pattern match blow up with an opaque message.
    [] -> badFormat
    (hd:fs) ->
      let (comment, sd) = break (== B.pack "BEGIN_DNA") fs
          (sd', _td) = break (== B.pack "END_DNA") sd
          -- "BEGIN_SEQUENCE " is exactly 15 characters long.
          (magic, label) = B.splitAt 15 hd
          fields = B.words . B.unlines
                 . filter (not . isSubstr (B.pack "_COMMENT")) $ comment
          -- Keep only three-field rows; this also drops the BEGIN_DNA marker
          -- line that 'break' leaves at the front of sd'.
          sdata = [ (base, q) | [base, q, _] <- map B.words sd' ]
          err = error "failed to parse quality value"
          qual = BB.fromChunks
                   [ BBB.pack [ maybe err (fromIntegral . fst) (B.readInt q)
                              | (_, q) <- sdata ] ]
      in if magic == B.pack "BEGIN_SEQUENCE "
           -- Force the quality data so that parse errors surface as soon as
           -- the result is demanded, not when the qualities are first read.
           then qual `seq` Seq (compact $ B.unwords (label:fields))
                               (compact $ B.concat $ map fst sdata)
                               (Just qual)
           else badFormat
  where
    badFormat = error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE"
-- | @isSubstr s str@ is 'True' when @s@ occurs anywhere inside @str@,
-- checked by testing @s@ as a prefix of every suffix of @str@.
isSubstr :: B.ByteString -> B.ByteString -> Bool
isSubstr s str = or [ s `B.isPrefixOf` suffix | suffix <- B.tails str ]
-- | Rebuild a lazy 'B.ByteString' from a single strict chunk holding the
-- concatenation of all of its original chunks.  The contents are unchanged.
compact :: B.ByteString -> B.ByteString
compact lazy = B.fromChunks [BBB.concat (B.toChunks lazy)]