{- |
   Module: Bio.Sequence.FastQ

   Support the FastQ format that combines sequence and quality. See:

   * <http://www.bioperl.org/wiki/FASTQ_sequence_format>

   Of course, this is yet another vaguely defined pseudo-standard with 
   conflicting definitions.  Of course Solexa had to go and invent a different, 
   but indistinguishably so, way to do it:

   * <http://www.bcgsc.ca/pipermail/ssrformat/2007-March/000137.html>

   * <http://maq.sourceforge.net/fastq.shtml>

   Currently, we only support the non-Solexa FastQ, adding\/subtracting 33 for 
   the quality values.

   As far as I know, FastQ is only used for nucleotide sequences, never amino acid.
-}

module Bio.Sequence.FastQ 
    (
     -- * Reading FastQ
    readFastQ, hReadFastQ, parse
     -- * Writing FastQ
    , writeFastQ, hWriteFastQ, unparse
    ) where

import System.IO
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import Data.List (unfoldr)

import Bio.Sequence.SeqData

readFastQ :: FilePath -> IO [Sequence Nuc]
readFastQ f = (go . B.lines) `fmap` B.readFile f 

hReadFastQ :: Handle -> IO [Sequence Nuc]
hReadFastQ h = (go . B.lines) `fmap` B.hGetContents h

go :: [B.ByteString] -> [Sequence Nuc]
go = map (either error id) . unfoldr parse

-- | Parse one FastQ entry, suitable for using in 'unfoldr' over
--   'B.lines' from a file
parse :: [B.ByteString] -> Maybe (Either String (Sequence Nuc), [B.ByteString])
parse (h1:sd:h2:sq:rest) = 
    case (B.uncons h1,B.uncons h2) of
      (Just ('@',h1name), Just ('+',h2name))
          | h1name == h2name -> Just (Right $ Seq h1name sd (Just (BB.map (subtract 33) sq)), rest)
          | otherwise        -> Just (Left $ "Bio.Sequence.FastQ: name mismatch:" ++ showStanza, rest)
      _                      -> Just (Left $ "Bio.Sequence.FastQ: illegal FastQ format:" ++ showStanza, rest)
    where showStanza = unlines $ map B.unpack [ h1, sd, h2, sq ]
parse [] = Nothing
parse fs = let showStanza = unlines (map B.unpack fs)
               err = Left $ "Bio.Sequence.FastQ: illegal number of lines in FastQ format: " ++ showStanza
           in Just (err, [])

writeFastQ :: FilePath -> [Sequence a] -> IO ()
writeFastQ f = B.writeFile f . B.concat . map unparse

hWriteFastQ :: Handle -> [Sequence a] -> IO ()
hWriteFastQ h = B.hPut h . B.concat . map unparse

-- helper function for writing
unparse :: Sequence a -> B.ByteString
unparse (Seq h sd (Just sq)) = 
    B.unlines [B.cons '@' h, sd, B.cons '+' h, BB.map (+33) sq]
unparse (Seq h _ Nothing) = error ("Bio.Sequence.FastQ: sequence " ++ show (B.unpack h) 
                                   ++" doesn not have quality data!")