Haskell Code by HsColour

{- | Read (and write?) the SFF file format used by
   Roche\/454 sequencing to store flowgram data.

   A flowgram is a series of values (intensities) representing homopolymer runs of
   A,G,C, and T in a fixed cycle, and usually displayed as a histogram.

   The Staden Package contains an io_lib, with a C routine for parsing this format.
   According to comments in the sources, the io_lib implementation is based on a file
   called getsff.c, which I've been unable to track down.

   It is believed that all values are stored big endian.
-}

module Bio.Sequence.SFF ( SFF(..), CommonHeader(..)
                        , ReadHeader(..), ReadBlock(..)
                        , readSFF, writeSFF
                        , sffToSequence
                        , test, convert
                        , Flow, Qual, Index
                        ) where

import Bio.Sequence.SeqData

import Data.Int
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Data.ByteString (ByteString)
import Control.Monad (when,replicateM,replicateM_)

import Data.List (intersperse)
import Data.Binary
import Data.Binary.Get (getByteString)
import qualified Data.Binary.Get as G
import Data.Binary.Put (putByteString)

import Text.Printf (printf)

-- | The type of flowgram value
type Flow = Int16
type Index = Word8

-- Global variables holding static information
magic :: Int32
magic = 0x2e736666

versions :: [Int32]
versions = [1]


readSFF :: FilePath -> IO SFF
readSFF f = return . decode =<< LB.readFile f

sffToSequence :: SFF -> [Sequence]
sffToSequence (SFF ch rs) = map r2s rs
    where r2s r = clip (read_name $ read_header r, bases r, quality r)
          lb x = LB.fromChunks [x]
          clip (n,s,q) = let (k,s2) = B.splitAt (fromIntegral $ key_length ch) s
                         in if k==key ch then Seq (lb n) (lb s2) (Just $ lb $ B.drop (fromIntegral $ key_length ch) q)
                                else error ("Couldn't match key in sequence "++BC.unpack n++" ("++BC.unpack k++" vs. "++BC.unpack (key ch)++")!")

writeSFF :: FilePath -> SFF -> IO ()
writeSFF = encodeFile

-- --------------------------------------------------
-- | test serialization by output'ing the header and first two reads 
--   in an SFF, and the same after a decode + encode cycle.
test :: FilePath -> IO ()
test file = do 
  (SFF h rs) <- readSFF file 
  let sff = (SFF h (take 2 rs))
  putStrLn $ show $ sff
  putStrLn ""
  putStrLn $ show $ (decode $ encode sff :: SFF)

-- --------------------------------------------------
-- | Convert a file by decoding it and re-encoding it
--   This will lose the index (which isn't really necessary)
convert :: FilePath -> IO ()
convert file = writeSFF (file++".out") =<< readSFF file

-- | Generalized function for padding
pad :: Integral a => a -> Put
pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8 

-- | Generalized function to skip padding
skip :: Integral a => a -> Get ()
skip = G.skip . fromIntegral

-- | The data structure storing the contents of an SFF file (modulo the index)
data SFF = SFF !CommonHeader [ReadBlock]

instance Show SFF where 
    show (SFF h rs) = (show h ++ "Read Blocks:\n\n" ++ concatMap show rs)

instance Binary SFF where
    get = do
      -- Parse CommonHeader
      chead <- get
      -- Parse ReadHeaders
      rds <- replicateM (fromIntegral (num_reads chead))
                                   (do 
                                      rh <- get :: Get ReadHeader
                                      let nb = fromIntegral $ num_bases rh
                                          fl = fromIntegral $ flow_length chead
                                      fg <- getByteString (2*fl)
                                      fi <- getByteString nb
                                      bs <- getByteString nb
                                      qty <- getByteString nb
                                      let l = (fl*2+nb*3) `mod` 8
                                      when (l > 0) (skip (8-l))
                                      return (ReadBlock rh (decodeArray fg) fi bs qty)
                                   )
      return (SFF chead rds)

    put (SFF hd rds) = do
      put hd
      mapM_ (putRB (fromIntegral $ flow_length hd)) rds

-- | A ReadBlock can't be an instance of Binary directly, since it depends on
--   information from the CommonHeader.
putRB :: Int -> ReadBlock -> Put
putRB fl rb = do
  put (read_header rb)
  mapM_ put (flowgram rb)
  putByteString (flow_index rb)
  putByteString (bases rb)
  putByteString (quality rb)
  let nb = fromIntegral $ num_bases $ read_header rb
      l = (fl*2+nb*3) `mod` 8
  when (l > 0) (pad (8-l))

-- | What the name and type says.
decodeArray :: ByteString -> [Flow]
decodeArray = dec . map fromIntegral . B.unpack 
    where dec (d1:d2:rest) = d1*256+d2 : dec rest
          dec [] = []
          dec _  = error "odd flowgram length?!"

-- ----------------------------------------------------------
-- | SFF has a 31-byte common header
--   Todo: remove items that are derivable (counters, magic, etc)
--   cheader_lenght points to the first read header.
--   Also, the format is open to having the index anywhere between reads,
--   we should really keep count and check for each read.  In practice, it
--   seems to be places after the reads.
--   
--   The following two fields are considered part of the header, but as
--   they are static, they are not part of the data structure
--     magic   :: Word32   -- ^ 0x2e736666, i.e. the string ".sff"
--     version :: Word32   -- ^ 0x00000001

data CommonHeader = CommonHeader {
          index_offset                            :: Int64    -- ^ Points to a text(?) section
        , index_length, num_reads                 :: Int32
        , key_length, flow_length                 :: Int16
        , flowgram_fmt                            :: Word8
        , flow, key                               :: ByteString 
        }

instance Show CommonHeader where
    show (CommonHeader io il nr kl fl fmt f k) =
        "Common Header:\n\n" ++ (unlines $ map ("    "++) 
                                 ["index_off:\t"++show io ++"\tindex_len:\t"++show il
                                 ,"num_reads:\t"++show nr
                                 ,"key_len:\t"  ++show kl ++ "\tflow_len:\t"++show fl
                                 ,"format\t:"   ++show fmt
                                 ,"flow\t:"     ++BC.unpack f
                                 ,"key\t:"      ++BC.unpack k
                                 , ""
                                 ])

instance Binary CommonHeader where
    get = do { m <- get ; when (m /= magic)   $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic)
             ; v <- get ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions))
             ; io <- get ; ixl <- get ; nrd <- get
             ; chl <- get ; kl <- get ; fl <- get ; fmt <- get
             ; fw <- getByteString (fromIntegral fl)
             ; k  <- getByteString (fromIntegral kl)
             ; skip (chl-(31+fl+kl)) -- skip to boundary
             ; return (CommonHeader io ixl nrd kl fl fmt fw k)
             }

    put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0 } in
        do { let cl = 31+fl+kl
                 l = cl `mod` 8
                 padding = if l > 0 then 8-l else 0
           ; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt
           ; putByteString f; putByteString k
           ; pad padding -- skip to boundary
           }

-- ---------------------------------------------------------- 
-- | Each Read has a fixed read header
data ReadHeader = ReadHeader {
      name_length                           :: Int16
    , num_bases                             :: Int32
    , clip_qual_left, clip_qual_right
    , clip_adapter_left, clip_adapter_right :: Int16
    , read_name                             :: ByteString
}

instance Show ReadHeader where
    show (ReadHeader nl nb cql cqr cal car rn) =
        ("    Read Header:\n" ++) $ unlines $ map ("        "++) 
                    [ "name_len:\t"++show nl, "num_bases:\t"++show nb
                    , "clip_qual:\t"++show cql++"..."++show cqr
                    , "clip_adap:\t"++show cal++"..."++show car
                    , "read name:\t"++BC.unpack rn
                    , "" 
                    ]

instance Binary ReadHeader where
    get = do
      { rhl <- get; nl <- get; nb <- get
      ; cql <- get; cqr <- get ; cal <- get ; car <- get
      ; n <- getByteString (fromIntegral nl)
      ; skip (rhl - (16+ nl))
      ; return (ReadHeader nl nb cql cqr cal car n)
      }
    put (ReadHeader nl nb cql cqr cal car rn) = 
        do { let rl = 16+nl
                 l = rl `mod` 8
                 padding = if l > 0 then 8-l else 0
           ; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car
           ; putByteString rn 
           ; pad padding
           }

-- ----------------------------------------------------------
-- | This contains the actual flowgram for a single read.
data ReadBlock = ReadBlock {
      read_header                :: ReadHeader
    -- The data block
    , flowgram                   :: [Flow]
    , flow_index, bases, quality :: ByteString
    }

instance Show ReadBlock where
    show (ReadBlock h f i b q) =
        show h ++ unlines (map ("     "++) 
            ["flowgram:\t"++show f
            , "index:\t"++(concat . intersperse " " . map show . B.unpack) i
            , "bases:\t"++BC.unpack b
            , "quality:\t"++(concat . intersperse " " . map show . B.unpack) q
            , ""
            ])