{- | Read (and write?) the SFF file format used by Roche\/454 sequencing to store flowgram data. A flowgram is a series of values (intensities) representing homopolymer runs of A,G,C, and T in a fixed cycle, and usually displayed as a histogram. The Staden Package contains an io_lib, with a C routine for parsing this format. According to comments in the sources, the io_lib implementation is based on a file called getsff.c, which I've been unable to track down. It is believed that all values are stored big endian. -} module Bio.Sequence.SFF ( SFF(..), CommonHeader(..) , ReadHeader(..), ReadBlock(..) , readSFF, writeSFF, writeSFF' , sffToSequence , test, convert , Flow, Qual, Index, SeqData, QualData , ReadName (..), decodeReadName, encodeReadName ) where import Bio.Sequence.SeqData import Bio.Sequence.SFF_name import Data.Int import qualified Data.ByteString.Lazy as LB import qualified Data.ByteString.Lazy.Char8 as LBC import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC import Data.ByteString (ByteString) import Control.Monad (when,replicateM,replicateM_) import Data.List (intersperse) import Data.Binary import Data.Binary.Get (getByteString,getLazyByteString) import qualified Data.Binary.Get as G import Data.Binary.Put (putByteString,putLazyByteString) import Text.Printf (printf) import System.IO -- | The type of flowgram value type Flow = Int16 type Index = Word8 -- Global variables holding static information magic :: Int32 magic = 0x2e736666 versions :: [Int32] versions = [1] readSFF :: FilePath -> IO SFF readSFF f = return . decode =<< LB.readFile f sffToSequence :: SFF -> [Sequence Nuc] sffToSequence (SFF ch rs) = map r2s rs where r2s r = clip (read_name $ read_header r, bases r, quality r) lb x = LB.fromChunks [x] clip (n,s,q) = let (k,s2) = LB.splitAt (fromIntegral $ key_length ch) s in if k==lb (key ch) then Seq (lb n) s2 (Just $ LB.drop (fromIntegral $ key_length ch) q) else error ("Couldn't match key in sequence "++BC.unpack n++" ("++LBC.unpack k++" vs. "++BC.unpack (key ch)++")!") -- | Write an 'SFF' to the specified file name writeSFF :: FilePath -> SFF -> IO () writeSFF = encodeFile -- | Write an 'SFF' to the specified file name, but go back and -- update the read count. Useful if you want to output a lazy -- stream of 'ReadBlock's. writeSFF' :: FilePath -> SFF -> IO () writeSFF' f (SFF hs rs) = do h <- openFile f WriteMode LBC.hPut h $ encode hs c <- writeReads h (fromIntegral $ flow_length hs) rs putStrLn ("Count: "++show c) hSeek h AbsoluteSeek 20 LBC.hPut h $ encode c hClose h writeReads :: Handle -> Int -> [ReadBlock] -> IO Int32 writeReads _ _ [] = return 0 writeReads h i (r:rs) = do LBC.hPut h $ encode (RBI i r) c <- writeReads h i rs return (c+1) data RBI = RBI Int ReadBlock -- | Wrapper for ReadBlocks since they need additional information instance Binary RBI where put (RBI c r) = do putRB c r get = undefined -- -------------------------------------------------- -- | test serialization by output'ing the header and first two reads -- in an SFF, and the same after a decode + encode cycle. test :: FilePath -> IO () test file = do (SFF h rs) <- readSFF file let sff = (SFF h (take 2 rs)) putStrLn $ show $ sff putStrLn "" putStrLn $ show $ (decode $ encode sff :: SFF) -- -------------------------------------------------- -- | Convert a file by decoding it and re-encoding it -- This will lose the index (which isn't really necessary) convert :: FilePath -> IO () convert file = writeSFF (file++".out") =<< readSFF file -- | Generalized function for padding pad :: Integral a => a -> Put pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8 -- | Generalized function to skip padding skip :: Integral a => a -> Get () skip = G.skip . fromIntegral -- | The data structure storing the contents of an SFF file (modulo the index) data SFF = SFF !CommonHeader [ReadBlock] instance Show SFF where show (SFF h rs) = (show h ++ "Read Blocks:\n\n" ++ concatMap show rs) instance Binary SFF where get = do -- Parse CommonHeader chead <- get -- Parse ReadHeaders rds <- replicateM (fromIntegral (num_reads chead)) (do rh <- get :: Get ReadHeader let nb = fromIntegral $ num_bases rh nb' = fromIntegral $ num_bases rh fl = fromIntegral $ flow_length chead fg <- getByteString (2*fl) fi <- getByteString nb bs <- getLazyByteString nb' qty <- getLazyByteString nb' let l = (fl*2+nb*3) `mod` 8 when (l > 0) (skip (8-l)) return (ReadBlock rh (decodeArray fg) fi bs qty) ) return (SFF chead rds) put (SFF hd rds) = do put hd mapM_ (put . RBI (fromIntegral $ flow_length hd)) rds -- | A ReadBlock can't be an instance of Binary directly, since it depends on -- information from the CommonHeader. putRB :: Int -> ReadBlock -> Put putRB fl rb = do put (read_header rb) mapM_ put (flowgram rb) -- ensure that flowgram has correct lenght replicateM (2*(fl-length (flowgram rb))) (put (0::Word8)) putByteString (flow_index rb) putLazyByteString (bases rb) putLazyByteString (quality rb) let nb = fromIntegral $ num_bases $ read_header rb l = (fl*2+nb*3) `mod` 8 when (l > 0) (pad (8-l)) -- | What the name and type says. decodeArray :: ByteString -> [Flow] decodeArray = dec . map fromIntegral . B.unpack where dec (d1:d2:rest) = d1*256+d2 : dec rest dec [] = [] dec _ = error "odd flowgram length?!" -- ---------------------------------------------------------- -- | SFF has a 31-byte common header -- Todo: remove items that are derivable (counters, magic, etc) -- cheader_lenght points to the first read header. -- Also, the format is open to having the index anywhere between reads, -- we should really keep count and check for each read. In practice, it -- seems to be places after the reads. -- -- The following two fields are considered part of the header, but as -- they are static, they are not part of the data structure -- magic :: Word32 -- ^ 0x2e736666, i.e. the string ".sff" -- version :: Word32 -- ^ 0x00000001 data CommonHeader = CommonHeader { index_offset :: Int64 -- ^ Points to a text(?) section , index_length, num_reads :: Int32 , key_length, flow_length :: Int16 , flowgram_fmt :: Word8 , flow, key :: ByteString } instance Show CommonHeader where show (CommonHeader io il nr kl fl fmt f k) = "Common Header:\n\n" ++ (unlines $ map (" "++) ["index_off:\t"++show io ++"\tindex_len:\t"++show il ,"num_reads:\t"++show nr ,"key_len:\t" ++show kl ++ "\tflow_len:\t"++show fl ,"format\t:" ++show fmt ,"flow\t:" ++BC.unpack f ,"key\t:" ++BC.unpack k , "" ]) instance Binary CommonHeader where get = do { m <- get ; when (m /= magic) $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic) ; v <- get ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions)) ; io <- get ; ixl <- get ; nrd <- get ; chl <- get ; kl <- get ; fl <- get ; fmt <- get ; fw <- getByteString (fromIntegral fl) ; k <- getByteString (fromIntegral kl) ; skip (chl-(31+fl+kl)) -- skip to boundary ; return (CommonHeader io ixl nrd kl fl fmt fw k) } put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0 } in do { let cl = 31+fl+kl l = cl `mod` 8 padding = if l > 0 then 8-l else 0 ; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt ; putByteString f; putByteString k ; pad padding -- skip to boundary } -- ---------------------------------------------------------- -- | Each Read has a fixed read header data ReadHeader = ReadHeader { name_length :: Int16 , num_bases :: Int32 , clip_qual_left, clip_qual_right , clip_adapter_left, clip_adapter_right :: Int16 , read_name :: ByteString } instance Show ReadHeader where show (ReadHeader nl nb cql cqr cal car rn) = (" Read Header:\n" ++) $ unlines $ map (" "++) [ "name_len:\t"++show nl, "num_bases:\t"++show nb , "clip_qual:\t"++show cql++"..."++show cqr , "clip_adap:\t"++show cal++"..."++show car , "read name:\t"++BC.unpack rn , "" ] instance Binary ReadHeader where get = do { rhl <- get; nl <- get; nb <- get ; cql <- get; cqr <- get ; cal <- get ; car <- get ; n <- getByteString (fromIntegral nl) ; skip (rhl - (16+ nl)) ; return (ReadHeader nl nb cql cqr cal car n) } put (ReadHeader nl nb cql cqr cal car rn) = do { let rl = 16+nl l = rl `mod` 8 padding = if l > 0 then 8-l else 0 ; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car ; putByteString rn ; pad padding } -- ---------------------------------------------------------- -- | This contains the actual flowgram for a single read. data ReadBlock = ReadBlock { read_header :: ReadHeader -- The data block , flowgram :: [Flow] , flow_index :: ByteString , bases :: SeqData , quality :: QualData } instance Show ReadBlock where show (ReadBlock h f i b q) = show h ++ unlines (map (" "++) ["flowgram:\t"++show f , "index:\t"++(concat . intersperse " " . map show . B.unpack) i , "bases:\t"++LBC.unpack b , "quality:\t"++(concat . intersperse " " . map show . LB.unpack) q , "" ])