module Bio.Sequence.SFF ( SFF(..), CommonHeader(..)
, ReadHeader(..), ReadBlock(..)
, readSFF, writeSFF, writeSFF', recoverSFF
, trim, trimFromTo
, baseToFlowPos, flowToBasePos
, trimFlows
, test, convert, flowgram
, masked_bases, cumulative_index
, packFlows, unpackFlows
, Flow, Qual, Index, SeqData, QualData
, ReadName (..), decodeReadName, encodeReadName
, putRB, getRB
) where
import Bio.Core.Sequence
import Bio.Sequence.SFF_name
import Data.Int
import qualified Data.ByteString.Lazy as LB
import qualified Data.ByteString.Lazy.Char8 as LBC
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Data.ByteString (ByteString)
import Control.Monad (when,replicateM,replicateM_)
import Data.List (intersperse)
import Data.Binary
import Data.Binary.Get (getByteString,getLazyByteString,runGetState)
import qualified Data.Binary.Get as G
import Data.Binary.Put (putByteString,putLazyByteString)
import Data.Char (toUpper, toLower)
import Text.Printf (printf)
import System.IO
type Flow = Int16
type Index = Word8
magic :: Int32
magic = 0x2e736666
versions :: [Int32]
versions = [1]
readSFF :: FilePath -> IO SFF
readSFF f = do
file <- LB.readFile f
let (header, remaining, _) = runGetState (get::Get CommonHeader) file 0
blocks = getBlocks header (fromIntegral $ num_reads header) remaining
return (SFF header blocks)
getBlocks :: CommonHeader -> Int -> LB.ByteString -> [ReadBlock]
getBlocks header n str
| n == 0 = []
| otherwise = case runGetState (getBlock $ fromIntegral $ flow_length $ header) str 0 of
(block, remaining, _) -> block : getBlocks header (n1) remaining
getBlock :: Int -> Get ReadBlock
getBlock flows = get >>= getRB flows
instance BioSeq ReadBlock where
seqid rb = SeqLabel $ LB.fromChunks [read_name $ read_header rb]
seqheader rb = hdr
where n = read_name $ read_header rb
hdr = case decodeReadName n of
Just h -> SeqLabel $ LBC.unwords
[LB.fromChunks [n], LBC.pack $ show $ date h
,LBC.pack $ show $ time h, LBC.pack $ show $ region h
,LBC.pack $ (show (x_loc h) ++ ":"++show (y_loc h))]
Nothing -> seqid rb
seqdata rb = let h = read_header rb
(left,right) = (clip_qual_left h, clip_qual_right h)
(a,b) = LB.splitAt (fromIntegral right) $ unSD $ bases rb
(c,d) = LB.splitAt (fromIntegral left1) a
in SeqData $ LBC.concat [LBC.map toLower c, LBC.map toUpper d,LBC.map toLower b]
seqlength rb = fromIntegral $ num_bases $ read_header rb
instance BioSeqQual ReadBlock where
seqqual = quality
trimFlows :: Integral i => i -> ReadBlock -> ReadBlock
trimFlows l rb = rb { read_header = rh { num_bases = fromIntegral n
, clip_qual_right = min cqr $ fromIntegral n
}
, flow_data = B.take (2*fromIntegral l) (flow_data rb)
, flow_index = B.take n (flow_index rb)
, bases = SeqData $ LB.take (fromIntegral n) (unSD $ bases rb)
, quality = QualData $ LB.take (fromIntegral n) (unQD $ quality rb)
}
where n = (flowToBasePos rb l)1
rh = read_header rb
cqr = clip_qual_right rh
trimFromTo :: (Integral i) => i -> i -> ReadBlock -> ReadBlock
trimFromTo x r rd = let
l = x1
trim_seq = LB.drop (fromIntegral l) . LB.take (fromIntegral r)
trim_seq' = B.drop (fromIntegral l) . B.take (fromIntegral r)
trim_flw = B.drop ((2*) $ fromIntegral $ baseToFlowPos rd l) . B.take ((2*) $ fromIntegral $ baseToFlowPos rd r)
new_flw = trim_flw (flow_data rd)
padding = B.replicate (B.length (flow_data rd) B.length new_flw) 0
rh = read_header rd
[r',l'] = map fromIntegral [r,l]
rh' = rh { num_bases = fromIntegral (r'l')
, clip_qual_left = max 0 $ clip_qual_left rhl'
, clip_qual_right = min (clip_qual_right rhl') (r'l'+1)
}
in rd { read_header = rh'
, flow_data = B.concat [new_flw, padding]
, flow_index = trim_seq' (flow_index rd)
, bases = SeqData $ trim_seq $ unSD $ bases rd
, quality = QualData $ trim_seq $ unQD $ quality rd
}
trim :: ReadBlock -> ReadBlock
trim rb = let rh = read_header rb in trimFromTo (clip_qual_left rh) (clip_qual_right rh) rb
flowToBasePos :: Integral i => ReadBlock -> i -> Int
flowToBasePos rd fp = length $ takeWhile (<=fp) $ scanl (+) 0 $ map fromIntegral $ B.unpack $ flow_index rd
baseToFlowPos :: Integral i => ReadBlock -> i -> Int
baseToFlowPos rd sp = sum $ map fromIntegral $ B.unpack $ B.take (fromIntegral sp) $ flow_index rd
recoverSFF :: FilePath -> IO SFF
recoverSFF f = return . unRecovered . decode =<< LB.readFile f
writeSFF :: FilePath -> SFF -> IO ()
writeSFF = encodeFile
writeSFF' :: FilePath -> SFF -> IO Int
writeSFF' f (SFF hs rs) = do
h <- openFile f WriteMode
LBC.hPut h $ encode hs
c <- writeReads h (fromIntegral $ flow_length hs) rs
hSeek h AbsoluteSeek 20
LBC.hPut h $ encode c
hClose h
return $ fromIntegral c
writeReads :: Handle -> Int -> [ReadBlock] -> IO Int32
writeReads _ _ [] = return 0
writeReads h i xs = go 0 xs
where go c (r:rs) = do
LBC.hPut h $ encode (RBI i r)
let c' = c+1
c' `seq` go c' rs
go c [] = return c
data RBI = RBI Int ReadBlock
instance Binary RBI where
put (RBI c r) = do
putRB c r
get = undefined
test :: FilePath -> IO ()
test file = do
(SFF h rs) <- readSFF file
let sff = (SFF h (take 2 rs))
putStrLn $ show $ sff
putStrLn ""
putStrLn $ show $ (decode $ encode sff :: SFF)
convert :: FilePath -> IO ()
convert file = writeSFF (file++".out") =<< readSFF file
pad :: Integral a => a -> Put
pad x = replicateM_ (fromIntegral x) (put zero) where zero = 0 :: Word8
skip :: Integral a => a -> Get ()
skip = G.skip . fromIntegral
data SFF = SFF !CommonHeader [ReadBlock]
instance Show SFF where
show (SFF h rs) = (show h ++ "Read Blocks:\n\n" ++ concatMap show rs)
instance Binary SFF where
get = do
chead <- get
rds <- replicateM (fromIntegral (num_reads chead))
(do
rh <- get :: Get ReadHeader
getRB (fromIntegral $ flow_length chead) rh
)
return (SFF chead rds)
put (SFF hd rds) = do
put hd
mapM_ (put . RBI (fromIntegral $ flow_length hd)) rds
getRB :: Int -> ReadHeader -> Get ReadBlock
getRB fl rh = do
let nb = fromIntegral $ num_bases rh
nb' = fromIntegral $ num_bases rh
fg <- getByteString (2*fl)
fi <- getByteString nb
bs <- getLazyByteString nb'
qty <- getLazyByteString nb'
let l = (fl*2+nb*3) `mod` 8
when (l > 0) (skip (8l))
return (ReadBlock rh fg fi (SeqData bs) (QualData qty))
putRB :: Int -> ReadBlock -> Put
putRB fl rb = do
put (read_header rb)
putByteString (flow_data rb)
replicateM_ (2*flB.length (flow_data rb)) (put (0::Word8))
putByteString (flow_index rb)
putLazyByteString (unSD $ bases rb)
putLazyByteString (unQD $ quality rb)
let nb = fromIntegral $ num_bases $ read_header rb
l = (fl*2+nb*3) `mod` 8
when (l > 0) (pad (8l))
unpackFlows :: ByteString -> [Flow]
unpackFlows = dec . map fromIntegral . B.unpack
where dec (d1:d2:rest) = d1*256+d2 : dec rest
dec [] = []
dec _ = error "odd flowgram length?!"
packFlows :: [Flow] -> ByteString
packFlows = B.pack . map fromIntegral . merge
where merge (x:xs) = let (a,b) = x `divMod` 256 in a:b:merge xs
merge [] = []
data CommonHeader = CommonHeader {
index_offset :: Int64
, index_length, num_reads :: Int32
, key_length, flow_length :: Int16
, flowgram_fmt :: Word8
, flow, key :: ByteString
}
instance Show CommonHeader where
show (CommonHeader io il nr kl fl fmt f k) =
"Common Header:\n\n" ++ (unlines $ map (" "++)
["index_off:\t"++show io ++"\tindex_len:\t"++show il
,"num_reads:\t"++show nr
,"key_len:\t" ++show kl ++ "\tflow_len:\t"++show fl
,"format\t:" ++show fmt
,"flow\t:" ++BC.unpack f
,"key\t:" ++BC.unpack k
, ""
])
instance Binary CommonHeader where
get = do { m <- get ; when (m /= magic) $ error (printf "Incorrect magic number - got %8x, expected %8x" m magic)
; v <- get ; when (not (v `elem` versions)) $ error (printf "Unexpected version - got %d, supported are: %s" v (unwords $ map show versions))
; io <- get ; ixl <- get ; nrd <- get
; chl <- get ; kl <- get ; fl <- get ; fmt <- get
; fw <- getByteString (fromIntegral fl)
; k <- getByteString (fromIntegral kl)
; skip (chl(31+fl+kl))
; return (CommonHeader io ixl nrd kl fl fmt fw k)
}
put ch = let CommonHeader io il nr kl fl fmt f k = ch { index_offset = 0 } in
do { let cl = 31+fl+kl
l = cl `mod` 8
padding = if l > 0 then 8l else 0
; put magic; put (last versions); put io; put il; put nr; put (cl+padding); put kl; put fl; put fmt
; putByteString f; putByteString k
; pad padding
}
data ReadHeader = ReadHeader {
name_length :: Int16
, num_bases :: Int32
, clip_qual_left, clip_qual_right
, clip_adapter_left, clip_adapter_right :: Int16
, read_name :: ByteString
}
instance Show ReadHeader where
show (ReadHeader nl nb cql cqr cal car rn) =
(" Read Header:\n" ++) $ unlines $ map (" "++)
[ "name_len:\t"++show nl, "num_bases:\t"++show nb
, "clip_qual:\t"++show cql++"..."++show cqr
, "clip_adap:\t"++show cal++"..."++show car
, "read name:\t"++BC.unpack rn
, ""
]
instance Binary ReadHeader where
get = do
{ rhl <- get; nl <- get; nb <- get
; cql <- get; cqr <- get ; cal <- get ; car <- get
; n <- getByteString (fromIntegral nl)
; skip (rhl (16+ nl))
; return (ReadHeader nl nb cql cqr cal car n)
}
put (ReadHeader nl nb cql cqr cal car rn) =
do { let rl = 16+nl
l = rl `mod` 8
padding = if l > 0 then 8l else 0
; put (rl+padding); put nl; put nb; put cql; put cqr; put cal; put car
; putByteString rn
; pad padding
}
data ReadBlock = ReadBlock {
read_header :: ! ReadHeader
, flow_data :: ! ByteString
, flow_index :: ! ByteString
, bases :: ! SeqData
, quality :: ! QualData
}
flowgram :: ReadBlock -> [Flow]
flowgram = unpackFlows . flow_data
masked_bases :: ReadBlock -> SeqData
masked_bases rb = let
l = fromIntegral $ clip_qual_left $ read_header rb
r = fromIntegral $ clip_qual_right $ read_header rb
SeqData s = bases rb
in SeqData $ LBC.concat [ LBC.map toLower $ LBC.take (l1) s
, LBC.take r (LBC.drop (l1) s)
, LBC.map toLower $ LBC.drop r s]
cumulative_index :: ReadBlock -> [Int]
cumulative_index = scanl1 (+) . map fromIntegral . B.unpack . flow_index
instance Show ReadBlock where
show (ReadBlock h f i (SeqData b) (QualData q)) =
show h ++ unlines (map (" "++)
["flowgram:\t"++show (unpackFlows f)
, "index:\t"++(concat . intersperse " " . map show . B.unpack) i
, "bases:\t"++LBC.unpack b
, "quality:\t"++(concat . intersperse " " . map show . LB.unpack) q
, ""
])
data RSFF = RSFF { unRecovered :: SFF }
instance Binary RSFF where
get = do
chead <- get
r1 <- do rh <- get
getRB (fromIntegral $ flow_length chead) rh
rds <- replicateM (fromIntegral (num_reads chead))
(do rh <- getSaneHeader (take 4 $ BC.unpack $ read_name $ read_header r1)
getRB (fromIntegral $ flow_length chead) rh)
return (RSFF $ SFF chead (r1:rds))
put = error "You should not serialize an RSFF"
data PartialReadHeader = PartialReadHeader {
_pread_header_lenght :: Int16
, _pname_length :: Int16
, _pnum_bases :: Int32
, _pclip_qual_left, _pclip_qual_right
, _clip_adapter_left, _pclip_adapter_right :: Int16
, _pread_name :: ByteString
}
instance Binary PartialReadHeader where
get = do { rhl <- get; nl <- get; nb <- get; ql <- get; qr <- get; al <- get; ar <- get; rn <- getByteString 4
; return (PartialReadHeader rhl nl nb ql qr al ar rn) }
put = error "You should not serialize a PartialReadHeader"
getSaneHeader :: String -> Get ReadHeader
getSaneHeader prefix = do
buf <- getLazyByteString 20
decodeSaneH prefix buf
decodeSaneH :: String -> LBC.ByteString -> Get ReadHeader
decodeSaneH prefix buf = do
let PartialReadHeader rhl nl _nb _ql _qr _al _ar rn = decode buf
if rhl >= 20 && nl > 0 && all id (zipWith (==) prefix (BC.unpack rn))
then do buf2 <- getLazyByteString (fromIntegral rhl20)
return (decode $ LB.concat [buf,buf2])
else do x <- getLazyByteString 1
decodeSaneH prefix (LBC.concat [buf,x])