{- |
   Module: Bio.Sequence.Fasta

   This module incorporates functionality for reading and writing
   sequence data in the Fasta format.
   Each sequence consists of a header (with a '>' prefix)
   and a set of lines containing the sequence data.
-}

module Bio.Sequence.Fasta
    (
    -- * Reading and writing plain FASTA files
    readFasta, writeFasta, hReadFasta, hWriteFasta
    -- * Reading and writing quality files
    , readQual, writeQual, hWriteQual
    -- * Combining FASTA and quality files
    , readFastaQual, hWriteFastaQual, writeFastaQual
    -- * Counting sequences in a FASTA file
    , countSeqs
    -- * Helper function for reading your own sequences
    , mkSeqs
) where


-- import Data.Char (isSpace)
import Data.List (groupBy,intersperse)
import Data.Int
import Data.Maybe
import System.IO
import System.IO.Unsafe
import Control.Monad

import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BB
import Data.ByteString.Lazy.Char8 (ByteString)

import Bio.Sequence.SeqData

splitsAt :: Offset -> ByteString -> [ByteString]
splitsAt n s = let (s1,s2) = B.splitAt n s
               in if B.null s2 then [s1] else s1 : splitsAt n s2

{-
-- ugly?
class SeqType sd where
    toSeq :: sd -> sd -> Sequence
    fromSeq :: Sequence -> (sd,sd)

instance SeqType B.ByteString where
    toSeq = Seq
    fromSeq (Seq x y) = (x,y)

instance SeqType BS.ByteString where
    toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s])
    fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks
-}

-- | Lazily read sequences from a FASTA-formatted file
readFasta :: FilePath -> IO [Sequence]
readFasta f = do
  ls <- getLines f
  return (mkSeqs ls)

-- | Write sequences to a FASTA-formatted file.
--   Line length is 60.
writeFasta :: FilePath -> [Sequence] -> IO ()
writeFasta f ss = do
  h <- openFile f WriteMode
  hWriteFasta h ss
  hClose h

-- | Read quality data for sequences to a file.
readQual :: FilePath -> IO [Sequence]
readQual f = do
  ls <- getLines f
  return (mkQual ls)

-- | Write quality data for sequences to a file.
writeQual :: FilePath -> [Sequence] -> IO ()
writeQual f ss = do
  h <- openFile f WriteMode
  hWriteQual h ss
  hClose h

-- | Read sequence and associated quality.  Will error if
--   the sequences and qualites do not match one-to-one in sequence.
readFastaQual :: FilePath -> FilePath -> IO [Sequence]
readFastaQual  s q = do
  ss <- readFasta s
  qs <- readQual q
  return ss
  -- warning: assumes correct qual file!
  let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z))
          | seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: "
                                                ++show (seqlabel s1,seqlabel s2))
          | B.length y /= B.length z   = error ("mismatching sequence and quality lengths:"
                                                ++ show (seqlabel s1,B.length y,B.length z))
          | otherwise   = Seq x y (Just z)
      mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information"
  return $ zipWith mkseq ss qs

-- | Write sequence and quality data simulatnously
--   This may be more laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO ()
writeFastaQual f q ss = do
  hf <- openFile f WriteMode
  hq <- openFile q WriteMode
  hWriteFastaQual hf hq ss
  hClose hq
  hClose hf

hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO ()
hWriteFastaQual hf hq = mapM_ wFQ
    where wFQ s = wFasta hf s >> wQual hq s

-- | Lazily read sequence from handle
hReadFasta :: Handle -> IO [Sequence]
hReadFasta h = do
  ls <- hGetLines h
  return (mkSeqs ls)

-- | Write sequences in FASTA format to a handle.
hWriteFasta :: Handle -> [Sequence] -> IO ()
hWriteFasta h = mapM_ (wFasta h)

wHead :: Handle -> SeqData -> IO ()
wHead h l = do
   B.hPut h $ B.pack ">"
   B.hPut h l
   B.hPut h $ B.pack "\n"

wFasta :: Handle -> Sequence -> IO ()
wFasta h (Seq l d _) = do
  wHead h l
  let ls = splitsAt 60 d
  mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls
  B.hPut h $ B.pack "\n"

hWriteQual :: Handle -> [Sequence] -> IO ()
hWriteQual h = mapM_ (wQual h)

wQual :: Handle -> Sequence -> IO ()
wQual h (Seq l _ (Just q)) = do
  wHead h l
  let qls = splitsAt 20 q
      qs = B.pack . unwords . map show . BB.unpack
  mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls
wQual _ (Seq _ _ Nothing) = return ()

-- ByteString operations
-- These aren't (or possible weren't) provided by the FPS library.
-- Implement line-based IO (in retrospect it'd be simpler
-- and better to just use 'lines'.)

-- lazily read lines from file
getLines :: FilePath -> IO [ByteString]
getLines f = do
  h <- openFile f ReadMode
  hGetLines' (hClose h) h

-- lazily read lines from handle
hGetLines :: Handle -> IO [ByteString]
hGetLines = hGetLines' (return ())

-- add an optional handle-closing parameter
hGetLines' :: IO () -> Handle -> IO [ByteString]
hGetLines' c h = do
  e <- hIsEOF h
  if e then c >> return []
       else do l' <- BS.hGetLine h
               let l = B.fromChunks $ if BS.null l' then [] else  [l']
               ls <- unsafeInterleaveIO $ hGetLines' c h
               return (l:ls)

-- | Convert a list of FASTA-formatted lines into a list of sequences.
--   Blank lines are ignored.
--   Comment lines start with "#" are allowed between sequences (and ignored).
--   Lines starting with ">" initiate a new sequence.
mkSeqs :: [ByteString] -> [Sequence]
mkSeqs = map mkSeq . blocks

mkSeq :: [ByteString] -> Sequence
mkSeq (l:ls) = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing
    where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s
mkSeq [] = error "empty input to mkSeq"

mkQual :: [ByteString] -> [Sequence]
mkQual = map f . blocks
    where f (l:ls) = Seq (B.drop 1 l) B.empty
                     (Just $ BB.pack (map int (B.words $ B.unlines ls)))
          f [] = error "mkQual: empty quality data"
          int = fromIntegral . fst . fromJust' . B.readInt
          fromJust' = maybe (error "Error in qual format") id

-- | Split lines into blocks starting with '>' characters
--   Filter out # comments (but not semicolons?)
blocks :: [ByteString] -> [[ByteString]]
blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . filter (not . B.null)

countSeqs :: FilePath -> IO Int
countSeqs f = do
  ss <- B.readFile f
  let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss
  return (length hdrs)