{- | Module: Bio.Sequence.Phd Parse phd files (phred base calling output). -} module Bio.Sequence.Phd (readPhd,hReadPhd) where import Bio.Sequence.SeqData import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BB import qualified Data.ByteString as BBB -- this is getting silly! import System.IO -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO (Sequence Nuc) readPhd f = return . mkPhd =<< B.readFile f -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO (Sequence Nuc) hReadPhd h = return . mkPhd =<< B.hGetContents h -- | The actual phd parser. -- aesthetics is not a major design goal... -- but error checking really should have been. Sigh. mkPhd :: B.ByteString -> (Sequence Nuc) mkPhd inp = let (hd:fs) = filter (not . B.null) . B.lines $ inp (comment,sd) = break (==B.pack "BEGIN_DNA") fs (sd', td) = break (==B.pack "END_DNA") sd (magic,label) = B.splitAt 15 hd more_magic = magic == B.pack "BEGIN_SEQUENCE " fields = B.words . B.unlines . filter (not . isSubstr (B.pack "_COMMENT")) $ comment sdata = filter ((==3).length) . map B.words $ sd' err = error "failed to parse quality value" qual = BB.fromChunks [BBB.pack . map (maybe err (fromIntegral . fst) . B.readInt . (!!1)) $ sdata] in if more_magic then qual `seq` (Seq (compact $ B.unwords (label:fields)) (compact $ B.concat $ map head sdata) (Just qual)) else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE" -- Todo: also check that we have a BEGIN_DNA/END_DNA region there. isSubstr :: B.ByteString -> B.ByteString -> Bool isSubstr s = any (B.isPrefixOf s) . B.tails -- | Pack bytestring segments into a single bytestring -- Allows the (rest of the) file contents to be GC'ed compact :: B.ByteString -> B.ByteString compact = B.fromChunks . return . BBB.concat . B.toChunks