module Bio.Sequence.SFF ( SFF(..), CommonHeader(..)
, ReadHeader(..), ReadBlock(..)
, readSFF, writeSFF, writeSFF', recoverSFF
, sffToSequence, trim, trimFromTo, trimKey
, baseToFlowPos, flowToBasePos
, test, convert, flowgram
, packFlows, unpackFlows
, Flow, Qual, Index, SeqData, QualData
, ReadName (..), decodeReadName, encodeReadName
) where
import Bio.Sequence.SeqData
import Bio.Sequence.SFF_name
import Data.Int
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString.Lazy.Char8 as LBC
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Data.ByteString (ByteString)
import Control.Monad (when,replicateM,replicateM_,liftM)
import Data.List (intersperse)
import Data.Binary
import Data.Binary.Get (getByteString,getLazyByteString)
import qualified Data.Binary.Get as G
import Data.Binary.Put (putByteString,putLazyByteString)
import Data.Char (toUpper, toLower)
import Text.Printf (printf)
import System.IO
type Flow = Int16
type Index = Word8
magic :: Int32
magic = 0x2e736666
versions :: [Int32]
versions = [1]
readSFF :: FilePath -> IO SFF
readSFF f = return . decode =<< LB.readFile f
trimKey :: CommonHeader -> Sequence Nuc -> Maybe (Sequence Nuc)
trimKey ch (Seq n s q) = let (k,s2) = LB.splitAt (fromIntegral $ key_length ch) s
in if LBC.map toLower k==LBC.map toLower (LB.fromChunks [key ch])
then Just $ Seq n s2 (liftM (LB.drop (fromIntegral $ key_length ch)) q)
else Nothing
sffToSequence :: SFF -> [Sequence Nuc]
sffToSequence (SFF ch rs) = map r2s rs
where r2s r = clip (read_header r, bases r, quality r)
clip (h, s, q) = let (left,right) = (clip_qual_left h,clip_qual_right h)
split x = let (a,b) = LB.splitAt (fromIntegral right) x
(c,d) = LB.splitAt (fromIntegral left1) a
in [c,d,b]
in Seq (LB.fromChunks [read_name h ,BC.pack (" qclip: "++show left++".."++show right)])
(let [a,b,c] = split s in LBC.concat [LBC.map toLower a,LBC.map toUpper b,LBC.map toLower c])
(Just q)
trimFromTo :: (Integral i) => i -> i -> ReadBlock -> ReadBlock
trimFromTo l r rd = let trim_seq = LB.drop (fromIntegral l) . LB.take (fromIntegral r)
trim_seq' = B.drop (fromIntegral l) . B.take (fromIntegral r)
trim_flw = B.drop ((2*) $ fromIntegral $ baseToFlowPos rd l) . B.take ((2*) $ fromIntegral $ baseToFlowPos rd r)
in rd { read_header = read_header rd
, flow_data = trim_flw (flow_data rd)
, flow_index = trim_seq' (flow_index rd)
, bases = trim_seq (bases rd)
, quality = trim_seq (quality rd)
}
trim :: ReadBlock -> ReadBlock
trim rb = let rh = read_header rb in trimFromTo (clip_qual_left rh1) (clip_qual_right rh) rb
flowToBasePos :: Integral i => ReadBlock -> i -> Int
flowToBasePos rd fp = length $ takeWhile (<fp) $ scanl (+) 0 $ map fromIntegral $ B.unpack $ flow_index rd
baseToFlowPos :: Integral i => ReadBlock -> i -> Int
baseToFlowPos rd sp = sum $ map fromIntegral $ B.unpack $ B.take (fromIntegral sp) $ flow_index rd
recoverSFF :: FilePath -> IO SFF
recoverSFF f = return . unRecovered . decode =<< LB.readFile f
writeSFF :: FilePath -> SFF -> IO ()
writeSFF = encodeFile
writeSFF' :: FilePath -> SFF -> IO Int
writeSFF' f (SFF hs rs) = do
h <- openFile f WriteMode
LBC.hPut h $ encode hs
c <- writeReads h (fromIntegral $ flow_length hs) rs
hSeek h AbsoluteSeek 20
LBC.hPut h $ encode c
hClose h
return $ fromIntegral c
writeReads :: Handle -> Int -> [ReadBlock] -> IO Int32
writeReads _ _ [] = return 0
writeReads h i (r:rs) = do
LBC.hPut h $ encode (RBI i r)
c <- writeReads h i rs
return $! (c+1)
data RBI = RBI Int ReadBlock
instance Binary RBI where
put (RBI c r) = do
putRB c r
get = undefined
test :: FilePath -> IO ()
test file = do
(SFF h rs) <- readSFF file
let sff = (SFF h (take 2 rs))
putStrLn $ show $ sff
putStrLn ""
putStrLn $ show $ (decode $ encode sff :: SFF)
convert :: FilePath -> IO ()
convert file = writeSFF (file++".out") =<< readSFF file
pad :: Integral a => a -> Put
pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8
skip :: Integral a => a -> Get ()
skip = G.skip . fromIntegral
data SFF = SFF !CommonHeader [ReadBlock]
instance Show SFF where
show (SFF h rs) = (show h ++ "Read Blocks:\n\n" ++ concatMap show rs)
instance Binary SFF where
get = do
chead <- get
rds <- replicateM (fromIntegral (num_reads chead))
(do
rh <- get :: Get ReadHeader
getRB chead rh
)
return (SFF chead rds)
put (SFF hd rds) = do
put hd
mapM_ (put . RBI (fromIntegral $ flow_length hd)) rds
getRB :: CommonHeader -> ReadHeader -> Get ReadBlock
getRB chead rh = do
let nb = fromIntegral $ num_bases rh
nb' = fromIntegral $ num_bases rh
fl = fromIntegral $ flow_length chead
fg <- getByteString (2*fl)
fi <- getByteString nb
bs <- getLazyByteString nb'
qty <- getLazyByteString nb'
let l = (fl*2+nb*3) `mod` 8
when (l > 0) (skip (8l))
return (ReadBlock rh fg fi bs qty)
putRB :: Int -> ReadBlock -> Put
putRB fl rb = do
put (read_header rb)
putByteString (flow_data rb)
replicateM (2*flB.length (flow_data rb)) (put (0::Word8))
putByteString (flow_index rb)
putLazyByteString (bases rb)
putLazyByteString (quality rb)
let nb = fromIntegral $ num_bases $ read_header rb
l = (fl*2+nb*3) `mod` 8
when (l > 0) (pad (8l))
unpackFlows :: ByteString -> [Flow]
unpackFlows = dec . map fromIntegral . B.unpack
where dec (d1:d2:rest) = d1*256+d2 : dec rest
dec [] = []
dec _ = error "odd flowgram length?!"
packFlows :: [Flow] -> ByteString
packFlows = B.pack . map fromIntegral . merge
where merge (x:xs) = let (a,b) = x `divMod` 256 in a:b:merge xs
merge [] = []
data CommonHeader = CommonHeader {
index_offset :: Int64
, index_length, num_reads :: Int32
, key_length, flow_length :: Int16
, flowgram_fmt :: Word8
, flow, key :: ByteString
}
instance Show CommonHeader where
show (CommonHeader io il nr kl fl fmt f k) =
"Common Header:\n\n" ++ (unlines $ map (" "++)
["index_off:\t"++show io ++"\tindex_len:\t"++show il
,"num_reads:\t"++show nr
,"key_len:\t" ++show kl ++ "\tflow_len:\t"++show fl
,"format\t:" ++show fmt
,"flow\t:" ++BC.unpack f
,"key\t:" ++BC.unpack k
, ""
])
instance Binary CommonHeader where
get = do { m <- get ; when (m /= magic) $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic)
; v <- get ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions))
; io <- get ; ixl <- get ; nrd <- get
; chl <- get ; kl <- get ; fl <- get ; fmt <- get
; fw <- getByteString (fromIntegral fl)
; k <- getByteString (fromIntegral kl)
; skip (chl(31+fl+kl))
; return (CommonHeader io ixl nrd kl fl fmt fw k)
}
put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0 } in
do { let cl = 31+fl+kl
l = cl `mod` 8
padding = if l > 0 then 8l else 0
; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt
; putByteString f; putByteString k
; pad padding
}
data ReadHeader = ReadHeader {
name_length :: Int16
, num_bases :: Int32
, clip_qual_left, clip_qual_right
, clip_adapter_left, clip_adapter_right :: Int16
, read_name :: ByteString
}
instance Show ReadHeader where
show (ReadHeader nl nb cql cqr cal car rn) =
(" Read Header:\n" ++) $ unlines $ map (" "++)
[ "name_len:\t"++show nl, "num_bases:\t"++show nb
, "clip_qual:\t"++show cql++"..."++show cqr
, "clip_adap:\t"++show cal++"..."++show car
, "read name:\t"++BC.unpack rn
, ""
]
instance Binary ReadHeader where
get = do
{ rhl <- get; nl <- get; nb <- get
; cql <- get; cqr <- get ; cal <- get ; car <- get
; n <- getByteString (fromIntegral nl)
; skip (rhl (16+ nl))
; return (ReadHeader nl nb cql cqr cal car n)
}
put (ReadHeader nl nb cql cqr cal car rn) =
do { let rl = 16+nl
l = rl `mod` 8
padding = if l > 0 then 8l else 0
; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car
; putByteString rn
; pad padding
}
data ReadBlock = ReadBlock {
read_header :: ! ReadHeader
, flow_data :: ! ByteString
, flow_index :: ! ByteString
, bases :: ! SeqData
, quality :: ! QualData
}
flowgram :: ReadBlock -> [Flow]
flowgram = unpackFlows . flow_data
instance Show ReadBlock where
show (ReadBlock h f i b q) =
show h ++ unlines (map (" "++)
["flowgram:\t"++show (unpackFlows f)
, "index:\t"++(concat . intersperse " " . map show . B.unpack) i
, "bases:\t"++LBC.unpack b
, "quality:\t"++(concat . intersperse " " . map show . LB.unpack) q
, ""
])
data RSFF = RSFF { unRecovered :: SFF }
instance Binary RSFF where
get = do
chead <- get
r1 <- do rh <- get
getRB chead rh
rds <- replicateM (fromIntegral (num_reads chead))
(do rh <- getSaneHeader (take 4 $ BC.unpack $ read_name $ read_header r1)
getRB chead rh)
return (RSFF $ SFF chead (r1:rds))
put = error "You should not serialize an RSFF"
data PartialReadHeader = PartialReadHeader {
_pread_header_lenght :: Int16
, _pname_length :: Int16
, _pnum_bases :: Int32
, _pclip_qual_left, _pclip_qual_right
, _clip_adapter_left, _pclip_adapter_right :: Int16
, _pread_name :: ByteString
}
instance Binary PartialReadHeader where
get = do { rhl <- get; nl <- get; nb <- get; ql <- get; qr <- get; al <- get; ar <- get; rn <- getByteString 4
; return (PartialReadHeader rhl nl nb ql qr al ar rn) }
put = error "You should not serialize a PartialReadHeader"
getSaneHeader :: String -> Get ReadHeader
getSaneHeader prefix = do
buf <- getLazyByteString 20
decodeSaneH prefix buf
decodeSaneH :: String -> LBC.ByteString -> Get ReadHeader
decodeSaneH prefix buf = do
let PartialReadHeader rhl nl _nb _ql _qr _al _ar rn = decode buf
if rhl >= 20 && nl > 0 && all id (zipWith (==) prefix (BC.unpack rn))
then do buf2 <- getLazyByteString (fromIntegral rhl20)
return (decode $ LB.concat [buf,buf2])
else do x <- getLazyByteString 1
decodeSaneH prefix (LBC.concat [buf,x])