module Bio.Sequence.TwoBit
( decode2Bit, read2Bit, hRead2Bit
) where
import Bio.Sequence.SeqData
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString.Lazy.Char8 as B
import System.IO
import Control.Monad
import Data.Char
import Data.Binary
import Data.Int
import Data.List
import Data.Bits
import Test.QuickCheck hiding (check)
type ByteString = B.ByteString
default_magic, default_version :: Word32
default_magic = 0x1A412743
default_version = 0
check :: Monad m => (a -> Bool) -> a -> m a
check p x = if (p x) then return x else fail "check failed"
bswap :: Integral a => Int -> a -> a
bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (nlength s) 0
bytes :: Integral a => a -> [Word8]
bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256
in if q == 0 && r == 0 then Nothing
else Just (fromIntegral r,q))
unbytes :: Integral a => [Word8] -> a
unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral
instance Arbitrary Word8 where
arbitrary = choose (0,255::Int) >>= return . fromIntegral
prop_bswap :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
prop_bswap a1 a2 a3 a4 = (bswap 4 . decode . BB.pack) [a1,a2,a3,a4] == ((decode . BB.pack) [a4,a3,a2,a1] :: Word32)
data Header = Header { swap :: Bool, version, count, reserved :: Word32 }
instance Show Header where show (Header _ v c r) = "H "++show (v,c,r)
instance Binary Header where
get = do
m <- get
v <- get >>= check (==default_version)
c <- get
r <- get
let s = if m==default_magic then id else if m==bswap 4 default_magic then bswap 4
else error "2bit decode: incorrect magic number"
return (Header (m/=default_magic) (s v) (s c) (s r))
put (Header m v c r) = do
put default_magic
put default_version
put c
put (0 :: Word32)
data Entry = Entry { name :: ByteString, offset :: Word32 }
deriving Show
swapEntry e = e { offset = bswap 4 (offset e) }
instance Binary Entry where
get = do
len <- getWord8
name <- replicateM (fromIntegral len) getWord8
offset <- get
return (Entry (BB.pack name) offset)
put (Entry bs offset) = do
let l = fromIntegral $ B.length bs :: Word8
put l
mapM_ put $ BB.unpack bs
put offset
data Entries = Entries Header [Entry]
instance Show Entries where show (Entries h es) = unlines (show h : map show es)
instance Binary Entries where
get = do
h <- get
es <- replicateM (fromIntegral $ count h) get
return (Entries h $ if swap h then map swapEntry es else es)
data SR = SR { dnaSize, nBlockCount :: Word32
, nBlockStarts, nBlockSizes :: [Word32]
, maskBlockCount :: Word32
, maskBlockStarts, maskBlockSizes :: [Word32]
, packedDna :: [Word8]
, reserved2 :: Word32 }
deriving Show
newtype SRBE = SRBE SR deriving Show
newtype SRLE = SRLE SR deriving Show
instance Binary SRBE where
get = undefined
instance Binary SRLE where
get = do
dz <- get :: Get Word32
nc <- get :: Get Word32
let n = fromIntegral $ bswap 4 nc
nbs <- replicateM n get
nbsz <- replicateM n get
mc <- get :: Get Word32
let m = fromIntegral $ bswap 4 mc
mbs <- replicateM m get
mbsz <- replicateM m get
_reserved <- get :: Get Word32
let d = fromIntegral $ bswap 4 dz
pdna <- replicateM ((d+3) `div` 4) get
return (SRLE $ SR (bswap 4 dz) (bswap 4 nc)
(map (bswap 4) nbs) (map (bswap 4) nbsz)
(bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved))
put (SRLE sr) = do
put (bswap 4 $ dnaSize sr)
put (bswap 4 $ nBlockCount sr)
mapM_ (put . bswap 4) (nBlockStarts sr)
mapM_ (put . bswap 4) (nBlockSizes sr)
put (bswap 4 $ maskBlockCount sr)
mapM_ (put . bswap 4) (maskBlockStarts sr)
mapM_ (put . bswap 4) (maskBlockSizes sr)
put (0::Word32)
mapM_ put (packedDna sr)
fromSR :: SR -> ByteString
fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna)
where
low = combine (maskBlockStarts sr) (maskBlockSizes sr)
ns = combine (nBlockStarts sr) (nBlockSizes sr)
combine starts lengths = concatMap (\(p,l) -> [p..p+l1]) $ zip starts lengths
dna = decodeDNA $ packedDna sr
decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03])
dec1 x = case x of 0 -> 'T'; 1 -> 'C'; 2 -> 'A'; 3 -> 'G'; _ -> error ("can't decode value "++show x)
go (_,_,_,[]) = Nothing
go (pos,(l:ls),(n:ns),(d:ds))
| pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds))
| pos == l = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds))
| pos == n = Just ('N',(pos+1,l:ls,ns,ds))
| otherwise = Just (dec1 d, (pos+1,l:ls,n:ns,ds))
go (pos,[],n:ns,d:ds)
| pos == n = Just ('N',(pos+1,[],ns,ds))
| otherwise = Just (dec1 d, (pos+1,[],n:ns,ds))
go (pos,l:ls,[],d:ds)
| pos == l = Just (toLower (dec1 d),(pos+1,ls,[],ds))
| otherwise = Just (dec1 d, (pos+1,l:ls,[],ds))
go (pos,[],[],d:ds) = Just (dec1 d, (pos+1,[],[],ds))
decode2Bit :: B.ByteString -> [Sequence]
decode2Bit cs = let (Entries h es) = decode cs :: Entries
ms = map (fromIntegral . offset) es
(c:chunks) = zipWith () ms (0:ms)
ss = splits chunks $ B.drop c cs
in map ($ Nothing) $ zipWith Seq (map name es)
$ map fromSR $ case swap h of
True -> map (unSRLE.decode) ss
False -> map (unSRBE.decode) ss
splits [] cs = [cs]
splits (e:es) cs = let (this,rest) = B.splitAt e cs
in this : splits es rest
unSRBE (SRBE x) = x
unSRLE (SRLE x) = x
toSR :: ByteString -> SR
toSR bs = undefined
read2Bit :: FilePath -> IO [Sequence]
read2Bit f = B.readFile f >>= return . decode2Bit
hRead2Bit :: Handle -> IO [Sequence]
hRead2Bit h = B.hGetContents h >>= return . decode2Bit
write2Bit :: FilePath -> [Sequence] -> IO ()
write2Bit = undefined
hWrite2Bit :: Handle -> [Sequence] -> IO ()
hWrite2Bit = undefined