{- |
   Module: Bio.Sequence.FastQ

   Support the FastQ format that combines sequence and quality. See:

   * <http://www.bioperl.org/wiki/FASTQ_sequence_format>

   Of course, this is yet another vaguely defined pseudo-standard with
   conflicting definitions.  Solexa then went and invented not one, but two
   different, and mutually indistinguishable, ways to do it:

   * <http://www.bcgsc.ca/pipermail/ssrformat/2007-March/000137.html>

   * <http://maq.sourceforge.net/fastq.shtml>

   * <http://en.wikipedia.org/wiki/FASTQ_format>

   Sanger-style FastQ-format is supported with the (h)read/writeSangerQ functions,
   and the new Illumina/Solexa-style with (h)read/writeIllumina.

   As far as I know, FastQ is only used for nucleotide sequences, never amino acid.
-}


module Bio.Sequence.FastQ
    (
     -- * Reading FastQ
    readFastQ, hReadFastQ, parse
     -- * Writing FastQ
    , writeFastQ, hWriteFastQ

    -- * Sanger-style quality information
    , readSangerQ, hReadSangerQ
    , writeSangerQ, hWriteSangerQ
    
    -- * Illumina (>v1.3)-style quality information
    , readIllumina, hReadIllumina
    , writeIllumina, hWriteIllumina
    ) where

import System.IO
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import Data.List (unfoldr)
import Bio.Core.Sequence

-- | A single FastQ record: its label, its sequence data and its
--   quality data.  As produced by 'parse', the label is the header
--   line without its leading marker character, and every quality byte
--   has already had 33 subtracted from it.
data Sequence = Seq SeqLabel SeqData QualData

-- | A FastQ record exposes its label and its sequence data; its length
--   is the length of the sequence data.
instance BioSeq Sequence where
  seqlabel (Seq name _ _) = name
  seqdata (Seq _ bases _) = bases
  -- This definition should really be the class default.
  seqlength fq = Offset (B.length (unSD (seqdata fq)))
  
-- | The quality data of a FastQ record is its third field.
instance BioSeqQual Sequence where
  seqqual fq = case fq of
    Seq _ _ quals -> quals
  
-- The generic FastQ readers and writers are deprecated: they treat the
-- quality values as Sanger-style, so callers are pointed at the
-- explicitly named SangerQ and Illumina variants instead.
{-# DEPRECATED readFastQ, hReadFastQ, writeFastQ, hWriteFastQ  "FastQ assumes Sanger-style quality info use {read,write}SangerQ or -Illumina instead" #-}

-- | Read all records from the named FastQ file.  The Sanger variant is
--   'readFastQ' unchanged; the Illumina variant additionally lowers
--   every quality value by 31 (presumably the gap between the Illumina
--   offset of 64 and the Sanger offset of 33 that 'parse' removes).
readSangerQ, readIllumina :: FilePath -> IO [Sequence]
readSangerQ path = readFastQ path
readIllumina path = fmap (addQual (negate 31)) (readFastQ path)

-- | Read all records from an open handle.  The Sanger variant is
--   'hReadFastQ' unchanged; the Illumina variant additionally lowers
--   every quality value by 31.
hReadSangerQ, hReadIllumina :: Handle -> IO [Sequence]
hReadSangerQ hdl = hReadFastQ hdl
hReadIllumina hdl = fmap (addQual (negate 31)) (hReadFastQ hdl)

-- | Write records to the named file.  The Sanger variant is
--   'writeFastQ' unchanged; the Illumina variant first raises every
--   quality value by 31 and then writes the result with 'writeFastQ'.
writeSangerQ, writeIllumina :: FilePath -> [Sequence] -> IO ()
writeSangerQ path recs = writeFastQ path recs
writeIllumina path recs = writeFastQ path (addQual 31 recs)

-- | Write records to an open handle.  The Sanger variant is
--   'hWriteFastQ' unchanged; the Illumina variant first raises every
--   quality value by 31 and then writes the result with 'hWriteFastQ'.
hWriteSangerQ, hWriteIllumina :: Handle -> [Sequence] -> IO ()
hWriteSangerQ hdl recs = hWriteFastQ hdl recs
hWriteIllumina hdl recs = hWriteFastQ hdl (addQual 31 recs)

-- | Add the given amount to every quality value of every record.  The
--   label and sequence data are left untouched; the addition is applied
--   to each byte of the quality data.
addQual :: Qual -> [Sequence] -> [Sequence]
addQual (Qual delta) = map shiftRecord
  where
    -- Rebuild the record around its shifted quality data.
    shiftRecord (Seq label bases quals) = Seq label bases (shiftQuals quals)
    shiftQuals (QualData raw) = QualData (BB.map (+ delta) raw)

-- | Read every record from the named FastQ file.  The file contents
--   are read as a lazy 'B.ByteString', split into lines and handed to
--   the record parser.
readFastQ :: FilePath -> IO [Sequence]
readFastQ path = do
  contents <- B.readFile path
  return (go (B.lines contents))

-- | Read every record from an open handle.  The handle contents are
--   read as a lazy 'B.ByteString', split into lines and handed to the
--   record parser.
hReadFastQ :: Handle -> IO [Sequence]
hReadFastQ hdl = do
  contents <- B.hGetContents hdl
  return (go (B.lines contents))

-- | Turn the lines of a FastQ file into records by applying 'parse'
--   repeatedly.  A parse failure is turned into a call to 'error'
--   carrying the parser's message.
go :: [B.ByteString] -> [Sequence]
go ls = map unwrap (unfoldr parse ls)
  where
    unwrap (Left msg) = error msg
    unwrap (Right fq) = fq

-- | Parse one FastQ entry, suitable for using in 'unfoldr' over
--   'B.lines' from a file.
--
--   An entry is exactly four lines: a header starting with an at sign,
--   the sequence, a separator starting with a plus sign (optionally
--   followed by a repeat of the header name), and the quality string.
--   Every quality byte has 33 subtracted from it.
--
--   The result is 'Nothing' when no lines are left.  Otherwise it is a
--   pair of the parsed record (or a 'Left' describing the problem,
--   including the offending lines) and the lines that follow the entry.
--   An entry is rejected when the two names disagree, when the sequence
--   and the quality string differ in length, when a marker character
--   is missing, or when fewer than four lines remain (in which case all
--   remaining lines are consumed).
parse :: [B.ByteString] -> Maybe (Either String (Sequence), [B.ByteString])
parse (h1:sd:h2:sq:rest) =
    case (B.uncons h1,B.uncons h2) of
      -- The fast path: four-line format
      (Just ('@',h1name), Just ('+',h2name))
          | h1name /= h2name && not (B.null h2name)
            -> failure "Bio.Sequence.FastQ: name mismatch:"
          -- FastQ requires one quality symbol per sequence symbol;
          -- accepting a mismatch would hand callers a record whose
          -- sequence and quality cannot be paired up.
          | B.length sd /= B.length sq
            -> failure "Bio.Sequence.FastQ: sequence/quality length mismatch:"
          | otherwise
            -> Just (Right $ Seq (SeqLabel h1name) (SeqData sd) (QualData (BB.map (subtract 33) sq)), rest)
      _     -> failure "Bio.Sequence.FastQ: illegal FastQ format:"
    where showStanza = unlines $ map B.unpack [ h1, sd, h2, sq ]
          -- Report a bad entry but keep going with the lines after it.
          failure msg = Just (Left $ msg ++ showStanza, rest)
parse [] = Nothing
parse fs = let showStanza = unlines (map B.unpack fs)
               err = Left $ "Bio.Sequence.FastQ: illegal number of lines in FastQ format: " ++ showStanza
           in Just (err, [])

-- | Write the records to the named file: each record is rendered with
--   @toFastQ@ and the rendered pieces are concatenated into one lazy
--   'B.ByteString' before being written.
writeFastQ :: FilePath -> [Sequence] -> IO ()
writeFastQ path recs = B.writeFile path rendered
  where
    rendered = B.concat (map toFastQ recs)

-- | Write the records to an open handle: each record is rendered with
--   @toFastQ@ and the rendered pieces are concatenated into one lazy
--   'B.ByteString' before being written.
hWriteFastQ :: Handle -> [Sequence] -> IO ()
hWriteFastQ hdl recs = B.hPut hdl rendered
  where
    rendered = B.concat (map toFastQ recs)