module Bio.Sequence.SFF ( SFF(..), CommonHeader(..)
, ReadHeader(..), ReadBlock(..)
, readSFF, writeSFF
, sffToSequence
, test, convert
, Flow, Qual, Index
) where
import Bio.Sequence.SeqData
import Data.Int
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Data.ByteString (ByteString)
import Control.Monad (when,replicateM,replicateM_)
import Data.List (intersperse)
import Data.Binary
import Data.Binary.Get (getByteString)
import qualified Data.Binary.Get as G
import Data.Binary.Put (putByteString)
import Text.Printf (printf)
type Flow = Int16
type Index = Word8
magic :: Int32
magic = 0x2e736666
versions :: [Int32]
versions = [1]
readSFF :: FilePath -> IO SFF
readSFF f = return . decode =<< LB.readFile f
sffToSequence :: SFF -> [Sequence]
sffToSequence (SFF ch rs) = map r2s rs
where r2s r = clip (read_name $ read_header r, bases r, quality r)
lb x = LB.fromChunks [x]
clip (n,s,q) = let (k,s2) = B.splitAt (fromIntegral $ key_length ch) s
in if k==key ch then Seq (lb n) (lb s2) (Just $ lb $ B.drop (fromIntegral $ key_length ch) q)
else error ("Couldn't match key in sequence "++BC.unpack n++" ("++BC.unpack k++" vs. "++BC.unpack (key ch)++")!")
writeSFF :: FilePath -> SFF -> IO ()
writeSFF = encodeFile
test :: FilePath -> IO ()
test file = do
(SFF h rs) <- readSFF file
let sff = (SFF h (take 2 rs))
putStrLn $ show $ sff
putStrLn ""
putStrLn $ show $ (decode $ encode sff :: SFF)
convert :: FilePath -> IO ()
convert file = writeSFF (file++".out") =<< readSFF file
pad :: Integral a => a -> Put
pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8
skip :: Integral a => a -> Get ()
skip = G.skip . fromIntegral
data SFF = SFF !CommonHeader [ReadBlock]
instance Show SFF where
show (SFF h rs) = (show h ++ "Read Blocks:\n\n" ++ concatMap show rs)
instance Binary SFF where
get = do
chead <- get
rds <- replicateM (fromIntegral (num_reads chead))
(do
rh <- get :: Get ReadHeader
let nb = fromIntegral $ num_bases rh
fl = fromIntegral $ flow_length chead
fg <- getByteString (2*fl)
fi <- getByteString nb
bs <- getByteString nb
qty <- getByteString nb
let l = (fl*2+nb*3) `mod` 8
when (l > 0) (skip (8l))
return (ReadBlock rh (decodeArray fg) fi bs qty)
)
return (SFF chead rds)
put (SFF hd rds) = do
put hd
mapM_ (putRB (fromIntegral $ flow_length hd)) rds
putRB :: Int -> ReadBlock -> Put
putRB fl rb = do
put (read_header rb)
mapM_ put (flowgram rb)
putByteString (flow_index rb)
putByteString (bases rb)
putByteString (quality rb)
let nb = fromIntegral $ num_bases $ read_header rb
l = (fl*2+nb*3) `mod` 8
when (l > 0) (pad (8l))
decodeArray :: ByteString -> [Flow]
decodeArray = dec . map fromIntegral . B.unpack
where dec (d1:d2:rest) = d1*256+d2 : dec rest
dec [] = []
dec _ = error "odd flowgram length?!"
data CommonHeader = CommonHeader {
index_offset :: Int64
, index_length, num_reads :: Int32
, key_length, flow_length :: Int16
, flowgram_fmt :: Word8
, flow, key :: ByteString
}
instance Show CommonHeader where
show (CommonHeader io il nr kl fl fmt f k) =
"Common Header:\n\n" ++ (unlines $ map (" "++)
["index_off:\t"++show io ++"\tindex_len:\t"++show il
,"num_reads:\t"++show nr
,"key_len:\t" ++show kl ++ "\tflow_len:\t"++show fl
,"format\t:" ++show fmt
,"flow\t:" ++BC.unpack f
,"key\t:" ++BC.unpack k
, ""
])
instance Binary CommonHeader where
get = do { m <- get ; when (m /= magic) $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic)
; v <- get ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions))
; io <- get ; ixl <- get ; nrd <- get
; chl <- get ; kl <- get ; fl <- get ; fmt <- get
; fw <- getByteString (fromIntegral fl)
; k <- getByteString (fromIntegral kl)
; skip (chl(31+fl+kl))
; return (CommonHeader io ixl nrd kl fl fmt fw k)
}
put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0 } in
do { let cl = 31+fl+kl
l = cl `mod` 8
padding = if l > 0 then 8l else 0
; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt
; putByteString f; putByteString k
; pad padding
}
data ReadHeader = ReadHeader {
name_length :: Int16
, num_bases :: Int32
, clip_qual_left, clip_qual_right
, clip_adapter_left, clip_adapter_right :: Int16
, read_name :: ByteString
}
instance Show ReadHeader where
show (ReadHeader nl nb cql cqr cal car rn) =
(" Read Header:\n" ++) $ unlines $ map (" "++)
[ "name_len:\t"++show nl, "num_bases:\t"++show nb
, "clip_qual:\t"++show cql++"..."++show cqr
, "clip_adap:\t"++show cal++"..."++show car
, "read name:\t"++BC.unpack rn
, ""
]
instance Binary ReadHeader where
get = do
{ rhl <- get; nl <- get; nb <- get
; cql <- get; cqr <- get ; cal <- get ; car <- get
; n <- getByteString (fromIntegral nl)
; skip (rhl (16+ nl))
; return (ReadHeader nl nb cql cqr cal car n)
}
put (ReadHeader nl nb cql cqr cal car rn) =
do { let rl = 16+nl
l = rl `mod` 8
padding = if l > 0 then 8l else 0
; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car
; putByteString rn
; pad padding
}
data ReadBlock = ReadBlock {
read_header :: ReadHeader
, flowgram :: [Flow]
, flow_index, bases, quality :: ByteString
}
instance Show ReadBlock where
show (ReadBlock h f i b q) =
show h ++ unlines (map (" "++)
["flowgram:\t"++show f
, "index:\t"++(concat . intersperse " " . map show . B.unpack) i
, "bases:\t"++BC.unpack b
, "quality:\t"++(concat . intersperse " " . map show . B.unpack) q
, ""
])