{- | 
   Module: Bio.Sequence.Phd

   Parse phd files (phred base calling output).
-}

module Bio.Sequence.Phd (readPhd,hReadPhd) where

import Bio.Sequence.SeqData

import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString as BBB  -- this is getting silly!

import System.IO

-- | Parse a .phd file, extracting the contents as a Sequence
readPhd :: FilePath -> IO (Sequence Nuc)
readPhd f = return . mkPhd =<< B.readFile f

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO (Sequence Nuc)
hReadPhd h = return . mkPhd =<< B.hGetContents h

-- | The actual phd parser.

--  aesthetics is not a major design goal...
--  but error checking really should have been.  Sigh.
mkPhd :: B.ByteString -> (Sequence Nuc)
mkPhd inp = 
  let (hd:fs) = filter (not . B.null) . B.lines $ inp
      (comment,sd) = break (==B.pack "BEGIN_DNA") fs
      (sd', _td) = break (==B.pack "END_DNA") sd
      (magic,label) = B.splitAt 15 hd
      more_magic = magic == B.pack "BEGIN_SEQUENCE "
      fields = B.words . B.unlines
               . filter (not . isSubstr (B.pack "_COMMENT")) $ comment
      sdata = filter ((==3).length) . map B.words $ sd'
      err = error "failed to parse quality value"
      qual = BB.fromChunks [BBB.pack . map (maybe err (fromIntegral . fst) . B.readInt . (!!1)) $ sdata]
  in if more_magic then qual `seq` (Seq (compact $ B.unwords (label:fields)) 
                                    (compact $ B.concat $ map head sdata)
                                    (Just qual))
     else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE"
     --   Todo: also check that we have a BEGIN_DNA/END_DNA region there.

isSubstr :: B.ByteString -> B.ByteString -> Bool
isSubstr s = any (B.isPrefixOf s) . B.tails

-- | Pack bytestring segments into a single bytestring
--   Allows the (rest of the) file contents to be GC'ed
compact :: B.ByteString -> B.ByteString
compact = B.fromChunks . return . BBB.concat . B.toChunks