{- | Read and write the SFF file format used by Roche\/454 sequencing to
   store flowgram data.

   A flowgram is a series of values (intensities) representing homopolymer
   runs of A, G, C, and T in a fixed cycle, and usually displayed as a
   histogram.

   This file is based on information in the Roche FLX manual.  Among other
   sources for information about the format, are The Staden Package, which
   contains an io_lib with a C routine for parsing this format.  According
   to comments in the sources, the io_lib implementation is based on a file
   called getsff.c, which I've been unable to track down.  Other software
   parsing SFFs are QIIME, sff_extract, and Celera's sffToCa.

   It is believed that all values are stored big endian.
-}

module Bio.Sequence.SFF ( SFF(..), CommonHeader(..)
                        , ReadHeader(..), ReadBlock(..)
                        , readSFF, writeSFF, writeSFF', recoverSFF
                        -- , sffToSequence, rbToSequence
                        , trim, trimFromTo, trimKey, trimAdapter
                        , baseToFlowPos, flowToBasePos
                        , trimFlows
                        , test, convert, flowgram
                        , masked_bases, cumulative_index
                        , packFlows, unpackFlows
                        , Flow, Qual, Index, SeqData, QualData
                        , ReadName (..), decodeReadName, encodeReadName
                        , putRB, getRB
                        ) where

import Bio.Core.Sequence
import Bio.Sequence.SFF_name

import Data.Int
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString.Lazy.Char8 as LBC
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Data.ByteString (ByteString)

import Control.Monad (when,replicateM,replicateM_)
import Data.List (intersperse)
import Data.Binary
import Data.Binary.Get (getByteString,getLazyByteString,runGetState)
import qualified Data.Binary.Get as G
import Data.Binary.Put (putByteString,putLazyByteString)
import Data.Char (toUpper, toLower)
import Text.Printf (printf)
import System.IO

-- | The type of a single flowgram value.  Stored in the file as a
--   16-bit (big endian, per the module header note) signed integer.
type Flow = Int16

-- | One entry of a read's flow index: the per-base increment into the
--   flowgram (one byte per called base).
type Index = Word8

-- Global variables holding static information

-- | An SFF file always starts with this magic number.
magic :: Int32
magic = 0x2e736666   -- the ASCII bytes ".sff"

-- | Version is always 1.
versions :: [Int32]
versions = [1]

-- | Read an SFF file.  The common header is decoded strictly; the list
--   of read blocks is produced lazily from the (lazily read) file
--   contents via 'getBlocks'.
readSFF :: FilePath -> IO SFF
readSFF f = do file <- LB.readFile f
               let (header, remaining, _) = runGetState (get::Get CommonHeader) file 0
                   blocks = getBlocks header (fromIntegral $ num_reads header) remaining
               return (SFF header blocks)

-- Decode 'n' consecutive read blocks from the remaining input, one
-- 'runGetState' step per block, building the list lazily.
getBlocks :: CommonHeader -> Int -> LB.ByteString -> [ReadBlock]
getBlocks header n str
    | n == 0 = []
    | otherwise = case runGetState (getBlock $ fromIntegral $ flow_length $ header) str 0 of
                    (block, remaining, _) -> block : getBlocks header (n-1) remaining

-- Decode a single read block: its header, then the data sized by the
-- number of flows from the common header.
getBlock :: Int -> Get ReadBlock
getBlock flows = get >>= getRB flows

{-
-- | Extract the read without the initial (TCAG) key.
trimKey :: CommonHeader -> Sequence Nuc -> Maybe (Sequence Nuc)
trimKey ch (Seq n s q) = let (k,s2) = LB.splitAt (fromIntegral $ key_length ch) s
                         in if LBC.map toLower k==LBC.map toLower (LB.fromChunks [key ch])
                            then Just $ Seq n s2 (liftM (LB.drop (fromIntegral $ key_length ch)) q)
                            else Nothing -- error ("Couldn't match key in sequence "++LBC.unpack n++" ("++LBC.unpack k++" vs. "++BC.unpack (key ch)++")!")
-}

instance BioSeq ReadBlock where
  seqid rb = SeqLabel $ LB.fromChunks [read_name $ read_header rb]
  -- Header is the read name plus its decoded components (date, time,
  -- region, x:y location) when the name parses; otherwise just the name.
  seqheader rb = hdr
    where n = read_name $ read_header rb
          hdr = case decodeReadName n of
                  Just h -> SeqLabel $ LBC.unwords [LB.fromChunks [n], LBC.pack $ show $ date h
                                                   ,LBC.pack $ show $ time h, LBC.pack $ show $ region h
                                                   ,LBC.pack $ (show (x_loc h) ++ ":"++show (y_loc h))]
                  Nothing -> seqid rb
  -- Quality-clipped region (1-based, inclusive clip points) in upper
  -- case, the clipped flanks in lower case; total length is preserved.
  seqdata rb = let h = read_header rb
                   (left,right) = (clip_qual_left h, clip_qual_right h)
                   (a,b) = LB.splitAt (fromIntegral right) $ unSD $ bases rb
                   (c,d) = LB.splitAt (fromIntegral left-1) a
               in SeqData $ LBC.concat [LBC.map toLower c, LBC.map toUpper d,LBC.map toLower b]
  seqlength rb = fromIntegral $ num_bases $ read_header rb

instance BioSeqQual ReadBlock where
  seqqual = quality

{-
-- | Extract the sequences from an 'SFF' data structure.
sffToSequence :: SFF -> [Sequence Nuc]
sffToSequence (SFF _ rs) = map rbToSequence rs

-- | Extract the sequence information from a 'ReadBlock'.
rbToSequence :: ReadBlock -> Sequence Nuc
rbToSequence r = Seq (LB.fromChunks [read_name h
                                    ,BC.pack (" qclip: "++show left ++".."++show right)])
                     (seqdata r)
                     (Just $ seqqual r)
    where h = read_header r
          (left,right) = (clip_qual_left h, clip_qual_right h)
-}

-- | Trim a 'ReadBlock' limiting the number of flows.  If writing to
--   an SFF file, make sure you update the 'CommonHeader' accordingly.
--   See @examples/Flx.hs@ for how to use this.
trimFlows :: Integral i => i -> ReadBlock -> ReadBlock
trimFlows l rb = rb { read_header = rh { num_bases = fromIntegral n
                                       , clip_qual_right = min cqr $ fromIntegral n
                                       }
                    , flow_data = B.take (2*fromIntegral l) (flow_data rb) -- two bytes per flow value
                    , flow_index = B.take n (flow_index rb)
                    , bases = SeqData $ LB.take (fromIntegral n) (unSD $ bases rb)
                    , quality = QualData $ LB.take (fromIntegral n) (unQD $ quality rb)
                    }
  -- n: number of whole bases called strictly before flow 'l'
  where n = (flowToBasePos rb l)-1
        rh = read_header rb
        cqr = clip_qual_right rh

-- trimming the flowgram is necessary, but how to deal with the shift in flow
-- sequence - i.e. what to do when trimming "splits" a flow into trimmed/untrimmed bases?

-- | Trim a read to specific sequence position, inclusive bounds.
--   The header's clip points are shifted into the new coordinates; a
--   resulting left clip of one, or a right clip at/past the new length,
--   is stored as zero (meaning "no clip").  The flowgram is trimmed to
--   the corresponding flow range and zero-padded back to its original
--   byte length.
trimFromTo :: (Integral i) => i -> i -> ReadBlock -> ReadBlock
trimFromTo x r rd = let l = x-1
                        trim_seq = LB.drop (fromIntegral l) . LB.take (fromIntegral r)
                        trim_seq' = B.drop (fromIntegral l) . B.take (fromIntegral r)
                        -- flow values are two bytes wide, hence the (2*)
                        trim_flw = B.drop ((2*) $ fromIntegral $ baseToFlowPos rd l) .
                                   B.take ((2*) $ fromIntegral $ baseToFlowPos rd r)
                        new_flw = trim_flw (flow_data rd)
                        padding = B.replicate (B.length (flow_data rd) - B.length new_flw) 0
                        rh = read_header rd
                        [r',l'] = map fromIntegral [r,l]
                        numbs = max 0 $ r'-l'
                        oneToZero e = if e==1 then 0 else e -- clip of one means no clip
                        nToZero e = if e >= numbs then 0 else e -- clip more than numbs means no clip
                        rh' = rh { num_bases = fromIntegral numbs
                                 , clip_qual_left = oneToZero $ min numbs $ max 0 $ clip_qual_left rh-l'
                                 , clip_qual_right = nToZero $ max 0 $ min (clip_qual_right rh-l') (r'-l'+1)
                                 , clip_adapter_left = oneToZero $ min numbs $ max 0 $ clip_adapter_left rh-l'
                                 , clip_adapter_right = nToZero $ max 0 $ min (clip_adapter_right rh-l') (r'-l'+1)
                                 }
                    in rd { read_header = rh'
                          , flow_data = B.concat [new_flw, padding]
                          , flow_index = trim_seq' (flow_index rd)
                          , bases = SeqData $ trim_seq $ unSD $ bases rd
                          , quality = QualData $ trim_seq $ unQD $ quality rd
                          }

-- | Trim a read according to clipping information
trim :: ReadBlock -> ReadBlock
trim rb = let rh = read_header rb
          in trimFromTo (clip_qual_left rh) (clip_qual_right rh) rb

-- | Trim adapters from a read.  A right clip of zero means "no clip",
--   in which case the read is kept to its full length.
trimAdapter :: ReadBlock -> ReadBlock
trimAdapter r = trimFromTo (clip_adapter_left rh) (if car == 0 then fromIntegral (num_bases rh) else car) r
  where rh = read_header r
        car = clip_adapter_right rh

-- | Trim the key (i.e. first four bases)
--   NOTE(review): hard-codes the standard four-base key; the actual key
--   length is available in the 'CommonHeader' -- confirm for
--   non-standard keys.
trimKey :: ReadBlock -> ReadBlock
trimKey r = trimFromTo 5 (num_bases $ read_header r) r

-- | Convert a flow position to the corresponding sequence position:
--   counts the prefix sums of the flow index (starting from 0) that do
--   not exceed @fp@.
flowToBasePos :: Integral i => ReadBlock -> i -> Int
flowToBasePos rd fp = length $ takeWhile (<=fp) $ scanl (+) 0 $ map fromIntegral $ B.unpack $ flow_index rd

-- | Convert a sequence position to the corresponding flow position:
--   the sum of the first @sp@ flow-index increments.
baseToFlowPos :: Integral i => ReadBlock -> i -> Int
baseToFlowPos rd sp = sum $ map fromIntegral $ B.unpack $ B.take (fromIntegral sp) $ flow_index rd

-- | Read an SFF file, but be resilient against errors.
recoverSFF :: FilePath -> IO SFF
recoverSFF f = return . unRecovered . decode =<< LB.readFile f

-- | Write an 'SFF' to the specified file name
writeSFF :: FilePath -> SFF -> IO ()
writeSFF = encodeFile

-- | Write an 'SFF' to the specified file name, but go back and
--   update the read count.  Useful if you want to output a lazy
--   stream of 'ReadBlock's.  Returns the number of reads written.
writeSFF' :: FilePath -> SFF -> IO Int
writeSFF' f (SFF hs rs) = do
  h <- openFile f WriteMode
  LBC.hPut h $ encode hs
  c <- writeReads h (fromIntegral $ flow_length hs) rs
  -- Byte 20 is the offset of num_reads in the common header:
  -- magic (4) + version (4) + index_offset (8) + index_length (4).
  hSeek h AbsoluteSeek 20
  LBC.hPut h $ encode c
  hClose h
  return $ fromIntegral c

-- | Write 'ReadBlock's to a file handle, returning the number written.
writeReads :: Handle -> Int -> [ReadBlock] -> IO Int32
writeReads _ _ [] = return 0
writeReads h i xs = go 0 xs
    where go c (r:rs) = do
            LBC.hPut h $ encode (RBI i r)
            let c' = c+1
            c' `seq` go c' rs   -- keep the counter strict to avoid a thunk chain
          go c [] = return c

-- | Wrapper for ReadBlocks since they need additional information
--   (the flow count from the 'CommonHeader') to be serialized.
data RBI = RBI Int ReadBlock

instance Binary RBI where
    put (RBI c r) = putRB c r
    get = undefined  -- decoding goes through 'getRB', which needs the flow count

-- --------------------------------------------------
-- | test serialization by output'ing the header and first two reads
--   in an SFF, and the same after a decode + encode cycle.
test :: FilePath -> IO ()
test file = do
  (SFF h rs) <- readSFF file
  let sff = SFF h (take 2 rs)
  print sff
  putStrLn ""
  print (decode $ encode sff :: SFF)

-- --------------------------------------------------
-- | Convert a file by decoding it and re-encoding it
--   This will lose the index (which isn't really necessary)
convert :: FilePath -> IO ()
convert file = writeSFF (file++".out") =<< readSFF file

-- | Generalized function for padding: write @x@ zero bytes.
pad :: Integral a => a -> Put
pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8

-- | Generalized function to skip padding on input (shadows 'G.skip'
--   with an 'Integral'-polymorphic count).
skip :: Integral a => a -> Get ()
skip = G.skip . fromIntegral

-- | The data structure storing the contents of an SFF file (modulo the index)
data SFF = SFF !CommonHeader [ReadBlock]

instance Show SFF where
  show (SFF h rs) = show h ++ "Read Blocks:\n\n" ++ concatMap show rs

instance Binary SFF where
  get = do
    -- Parse CommonHeader
    chead <- get
    -- Get the ReadBlocks
    rds <- replicateM (fromIntegral (num_reads chead))
           (do rh <- get :: Get ReadHeader
               getRB (fromIntegral $ flow_length chead) rh)
    return (SFF chead rds)
  put (SFF hd rds) = do
    put hd
    mapM_ (put . RBI (fromIntegral $ flow_length hd)) rds

-- | Helper function for decoding a 'ReadBlock'.
{-# INLINE getRB #-}
getRB :: Int -> ReadHeader -> Get ReadBlock
getRB fl rh = do
  let nb = fromIntegral $ num_bases rh
      nb' = fromIntegral $ num_bases rh  -- same count, at the type getLazyByteString expects
  fg <- getByteString (2*fl)             -- two bytes per flow value
  fi <- getByteString nb
  bs <- getLazyByteString nb'
  qty <- getLazyByteString nb'
  -- each read record is padded to an eight-byte boundary
  let l = (fl*2+nb*3) `mod` 8
  when (l > 0) (skip (8-l))
  return (ReadBlock rh fg fi (SeqData bs) (QualData qty))

-- | A ReadBlock can't be an instance of Binary directly, since it depends on
--   information from the CommonHeader.
putRB :: Int -> ReadBlock -> Put
putRB fl rb = do
  put (read_header rb)
  putByteString (flow_data rb)
  -- ensure that flowgram has correct length (zero-fill up to 2*fl bytes)
  replicateM_ (2*fl-B.length (flow_data rb)) (put (0::Word8))
  putByteString (flow_index rb)
  putLazyByteString (unSD $ bases rb)
  putLazyByteString (unQD $ quality rb)
  -- pad to an eight-byte boundary, mirroring 'getRB'
  let nb = fromIntegral $ num_bases $ read_header rb
      l = (fl*2+nb*3) `mod` 8
  when (l > 0) (pad (8-l))

-- | Unpack the flow_data field into a list of flow values
--   (two big-endian bytes per value).
unpackFlows :: ByteString -> [Flow]
unpackFlows = dec . map fromIntegral . B.unpack
    where dec (d1:d2:rest) = d1*256+d2 : dec rest
          dec [] = []
          dec _ = error "odd flowgram length?!"

-- | Pack a list of flows into the corresponding binary structure (the flow_data field)
packFlows :: [Flow] -> ByteString
packFlows = B.pack . map fromIntegral . merge
    where merge (x:xs) = let (a,b) = x `divMod` 256 in a:b:merge xs
          merge [] = []

-- ----------------------------------------------------------
-- | SFF has a 31-byte common header
--
--   The format is open to having the index anywhere between reads,
--   we should really keep count and check for each read.  In practice, it
--   seems to be places after the reads.
--
--   The following two fields are considered part of the header, but as
--   they are static, they are not part of the data structure
--
-- @
--    magic   :: Word32   -- 0x2e736666, i.e. the string \".sff\"
--    version :: Word32   -- 0x00000001
-- @
data CommonHeader = CommonHeader
    { index_offset :: Int64               -- ^ Points to a text(?) section
    , index_length, num_reads :: Int32
    , key_length, flow_length :: Int16
    , flowgram_fmt :: Word8
    , flow, key :: ByteString
    }

instance Show CommonHeader where
  show (CommonHeader io il nr kl fl fmt f k) =
      "Common Header:\n\n" ++ (unlines $ map (" "++)
      ["index_off:\t"++show io ++"\tindex_len:\t"++show il
      ,"num_reads:\t"++show nr
      ,"key_len:\t" ++show kl ++ "\tflow_len:\t"++show fl
      ,"format\t:" ++show fmt
      ,"flow\t:" ++BC.unpack f
      ,"key\t:" ++BC.unpack k
      , ""
      ])

instance Binary CommonHeader where
  get = do { m <- get
           ; when (m /= magic) $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic)
           ; v <- get
           ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions))
           ; io <- get; ixl <- get; nrd <- get; chl <- get; kl <- get; fl <- get; fmt <- get
           ; fw <- getByteString (fromIntegral fl)
           ; k <- getByteString (fromIntegral kl)
           ; skip (chl-(31+fl+kl)) -- skip to boundary
           ; return (CommonHeader io ixl nrd kl fl fmt fw k)
           }
  -- Serialization zeroes the index fields (the index is not written),
  -- and rounds the header length up to an eight-byte boundary.
  put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0, index_length = 0 }
           in do { let cl = 31+fl+kl
                       l = cl `mod` 8
                       padding = if l > 0 then 8-l else 0
                 ; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt
                 ; putByteString f; putByteString k
                 ; pad padding -- skip to boundary
                 }

-- ----------------------------------------------------------
-- | Each Read has a fixed read header, containing various information.
data ReadHeader = ReadHeader
    { name_length :: Int16
    , num_bases :: Int32
    , clip_qual_left, clip_qual_right
    , clip_adapter_left, clip_adapter_right :: Int16
    , read_name :: ByteString
    }

instance Show ReadHeader where
  show (ReadHeader nl nb cql cqr cal car rn) =
    (" Read Header:\n" ++) $ unlines $ map (" "++)
    [ "name_len:\t"++show nl, "num_bases:\t"++show nb
    , "clip_qual:\t"++show cql++"..."++show cqr
    , "clip_adap:\t"++show cal++"..."++show car
    , "read name:\t"++BC.unpack rn
    , ""
    ]

instance Binary ReadHeader where
  get = do { rhl <- get; nl <- get; nb <- get
           ; cql <- get; cqr <- get; cal <- get; car <- get
           ; n <- getByteString (fromIntegral nl)
           ; skip (rhl - (16+ nl)) -- fixed part is 16 bytes; rest is padding
           ; return (ReadHeader nl nb cql cqr cal car n)
           }
  put (ReadHeader nl nb cql cqr cal car rn) =
    do { let rl = 16+nl
             l = rl `mod` 8
             padding = if l > 0 then 8-l else 0
       ; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car
       ; putByteString rn
       ; pad padding
       }

-- ----------------------------------------------------------
-- | This contains the actual flowgram for a single read.
data ReadBlock = ReadBlock
    { read_header :: ! ReadHeader
    -- The data block
    , flow_data :: ! ByteString -- nb! use unpackFlows for this
    , flow_index :: ! ByteString
    , bases :: ! SeqData
    , quality :: ! QualData
    }

-- | Helper function to access the flowgram
flowgram :: ReadBlock -> [Flow]
flowgram = unpackFlows . flow_data

-- | Extract the sequence with masked bases in lower case.  The header's
--   quality clip points (1-based, inclusive) delimit the upper-case
--   region; the result has the same length as the input sequence.
masked_bases :: ReadBlock -> SeqData
masked_bases rb =
    let l = fromIntegral $ clip_qual_left $ read_header rb
        r = fromIntegral $ clip_qual_right $ read_header rb
        SeqData s = bases rb
    in SeqData $ LBC.concat [ LBC.map toLower $ LBC.take (l-1) s
                            -- the kept region is positions l..r, i.e. r-l+1
                            -- bytes (taking r bytes here would duplicate
                            -- bases whenever l > 1); this matches 'seqdata'
                            -- in the BioSeq instance, which preserves length
                            , LBC.take (r-l+1) (LBC.drop (l-1) s)
                            , LBC.map toLower $ LBC.drop r s]

-- | Extract the index as absolute coordinates, not relative.
cumulative_index :: ReadBlock -> [Int]
cumulative_index = scanl1 (+) . map fromIntegral . B.unpack . flow_index

instance Show ReadBlock where
  show (ReadBlock h f i (SeqData b) (QualData q)) =
    show h ++ unlines (map (" "++)
                       ["flowgram:\t"++show (unpackFlows f)
                       , "index:\t"++(concat . intersperse " " . map show . B.unpack) i
                       , "bases:\t"++LBC.unpack b
                       , "quality:\t"++(concat . intersperse " " . map show . LB.unpack) q
                       , ""
                       ])

-- ------------------------------------------------------------
-- | RSFF wraps an SFF to provide an instance of Binary with some more error checking.
data RSFF = RSFF { unRecovered :: SFF }

instance Binary RSFF where
  get = do -- Parse CommonHeader
           chead <- get
           -- Get the first read block (decoded without sanity checks; its
           -- name supplies the four-character prefix used to validate the
           -- remaining headers)
           r1 <- do rh <- get
                    getRB (fromIntegral $ flow_length chead) rh
           -- Get subsequent read blocks
           -- NOTE(review): this decodes 'num_reads' further blocks on top
           -- of r1, i.e. num_reads+1 in total -- looks like an off-by-one;
           -- confirm whether this should be (num_reads - 1).
           rds <- replicateM (fromIntegral (num_reads chead))
                  (do rh <- getSaneHeader (take 4 $ BC.unpack $ read_name $ read_header r1)
                      getRB (fromIntegral $ flow_length chead) rh)
           return (RSFF $ SFF chead (r1:rds))
  put = error "You should not serialize an RSFF"

-- | This allows us to decode the constant parts of the read header for
--   verifying its correctness.
data PartialReadHeader = PartialReadHeader
  { _pread_header_lenght :: Int16  -- note: field name typo ("lenght"), kept as-is
  , _pname_length :: Int16
  , _pnum_bases :: Int32
  , _pclip_qual_left, _pclip_qual_right
  , _clip_adapter_left, _pclip_adapter_right :: Int16
  , _pread_name :: ByteString      -- length four
  }

instance Binary PartialReadHeader where
  get = do { rhl <- get; nl <- get; nb <- get; ql <- get; qr <- get; al <- get; ar <- get
           ; rn <- getByteString 4  -- only the first four name bytes are needed for checking
           ; return (PartialReadHeader rhl nl nb ql qr al ar rn) }
  put = error "You should not serialize a PartialReadHeader"

-- | Ensure that the header we're decoding matches our expectations:
--   read the 20 fixed bytes and hand off to 'decodeSaneH'.
getSaneHeader :: String -> Get ReadHeader
getSaneHeader prefix = do buf <- getLazyByteString 20
                          decodeSaneH prefix buf

-- | Decode a 'ReadHeader', verifying that the data make sense.
-- | Decode a 'ReadHeader' from the 20 fixed bytes already read, verifying
--   that the data make sense: a plausible header length (>= 20), a
--   positive name length, and a read name starting with the expected
--   prefix.  On a mismatch, the 20-byte window is slid forward one input
--   byte at a time (consuming the skipped bytes) until a sane header is
--   found.
decodeSaneH :: String -> LBC.ByteString -> Get ReadHeader
decodeSaneH prefix buf = do
  let PartialReadHeader rhl nl _nb _ql _qr _al _ar rn = decode buf
  -- 'and . zipWith (==)' compares only up to the shorter of the two,
  -- i.e. a prefix match against the four name bytes
  if rhl >= 20 && nl > 0 && and (zipWith (==) prefix (BC.unpack rn))
    then do -- plausible header: fetch the remainder and decode the whole thing
            buf2 <- getLazyByteString (fromIntegral rhl-20)
            return (decode $ LB.concat [buf,buf2])
    else do -- skip one byte, try again
            x <- getLazyByteString 1
            decodeSaneH prefix (LBC.concat [buf,x])