{- | This module implements the 2bit format for sequences. Based on: Note! the description is not accurate, it is missing a reserved word in each sequence record. There are also other, completely different ideas of the 2bit format, e.g. -} module Bio.Sequence.TwoBit ( decode2Bit, read2Bit, hRead2Bit ) where import Bio.Sequence.SeqData import qualified Data.ByteString.Lazy as BB import qualified Data.ByteString.Lazy.Char8 as B import System.IO import Control.Monad import Data.Char import Data.Binary import Data.Int import Data.List import Data.Bits import Test.QuickCheck hiding (check) -- QC 1.0 -- import Test.QuickCheck hiding ((.&.)) -- QC 2.0 type ByteString = B.ByteString -- constants default_magic, default_version :: Word32 default_magic = 0x1A412743 default_version = 0 -- binary extras check :: Monad m => (a -> Bool) -> a -> m a check p x = if (p x) then return x else fail "check failed" bswap :: Integral a => Int -> a -> a bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (n-length s) 0 bytes :: Integral a => a -> [Word8] bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256 in if q == 0 && r == 0 then Nothing else Just (fromIntegral r,q)) unbytes :: Integral a => [Word8] -> a unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral instance Arbitrary Word8 where arbitrary = choose (0,255::Int) >>= return . fromIntegral prop_bswap :: Word8 -> Word8 -> Word8 -> Word8 -> Bool prop_bswap a1 a2 a3 a4 = (bswap 4 . decode . BB.pack) [a1,a2,a3,a4] == ((decode . BB.pack) [a4,a3,a2,a1] :: Word32) -- basic data types data Header = Header { swap :: Bool, version, count, reserved :: Word32 } instance Show Header where show (Header _ v c r) = "H "++show (v,c,r) instance Binary Header where get = do m <- get v <- get >>= check (==default_version) c <- get r <- get let s = if m==default_magic then id else if m==bswap 4 default_magic then bswap 4 else error "2bit decode: incorrect magic number" return (Header (m/=default_magic) (s v) (s c) (s r)) put (Header m v c r) = do put default_magic put default_version put c put (0 :: Word32) data Entry = Entry { name :: ByteString, offset :: Word32 } deriving Show swapEntry e = e { offset = bswap 4 (offset e) } instance Binary Entry where get = do len <- getWord8 name <- replicateM (fromIntegral len) getWord8 offset <- get return (Entry (BB.pack name) offset) put (Entry bs offset) = do let l = fromIntegral $ B.length bs :: Word8 put l mapM_ put $ BB.unpack bs put offset data Entries = Entries Header [Entry] instance Show Entries where show (Entries h es) = unlines (show h : map show es) instance Binary Entries where get = do h <- get es <- replicateM (fromIntegral $ count h) get return (Entries h $ if swap h then map swapEntry es else es) {- dnaSize - number of bases of DNA in the sequence nBlockCount - the number of blocks of Ns in the file (representing unknown sequence) nBlockStarts - the starting position for each block of Ns nBlockSizes - the size of each block of Ns maskBlockCount - the number of masked (lower-case) blocks maskBlockStarts - the starting position for each masked block maskBlockSizes - the size of each masked block packedDna - the DNA packed to two bits per base -} data SR = SR { dnaSize, nBlockCount :: Word32 , nBlockStarts, nBlockSizes :: [Word32] , maskBlockCount :: Word32 , maskBlockStarts, maskBlockSizes :: [Word32] , packedDna :: [Word8] , reserved2 :: Word32 } deriving Show -- big- and little-endian variants (what a mess) newtype SRBE = SRBE SR deriving Show newtype SRLE = SRLE SR deriving Show instance Binary SRBE where get = undefined instance Binary SRLE where get = do dz <- get :: Get Word32 nc <- get :: Get Word32 let n = fromIntegral $ bswap 4 nc nbs <- replicateM n get nbsz <- replicateM n get mc <- get :: Get Word32 let m = fromIntegral $ bswap 4 mc mbs <- replicateM m get mbsz <- replicateM m get _reserved <- get :: Get Word32 -- !!!! oops? let d = fromIntegral $ bswap 4 dz pdna <- replicateM ((d+3) `div` 4) get return (SRLE $ SR (bswap 4 dz) (bswap 4 nc) (map (bswap 4) nbs) (map (bswap 4) nbsz) (bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved)) -- should this happen? Why not just write default format? put (SRLE sr) = do put (bswap 4 $ dnaSize sr) put (bswap 4 $ nBlockCount sr) mapM_ (put . bswap 4) (nBlockStarts sr) mapM_ (put . bswap 4) (nBlockSizes sr) put (bswap 4 $ maskBlockCount sr) mapM_ (put . bswap 4) (maskBlockStarts sr) mapM_ (put . bswap 4) (maskBlockSizes sr) put (0::Word32) mapM_ put (packedDna sr) -- used to convert to/from sequence data in the Sequence data structure fromSR :: SR -> ByteString fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna) where low = combine (maskBlockStarts sr) (maskBlockSizes sr) ns = combine (nBlockStarts sr) (nBlockSizes sr) combine starts lengths = concatMap (\(p,l) -> [p..p+l-1]) $ zip starts lengths dna = decodeDNA $ packedDna sr decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03]) dec1 x = case x of 0 -> 'T'; 1 -> 'C'; 2 -> 'A'; 3 -> 'G'; _ -> error ("can't decode value "++show x) go (_,_,_,[]) = Nothing go (pos,(l:ls),(n:ns),(d:ds)) | pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds)) | pos == l = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds)) | pos == n = Just ('N',(pos+1,l:ls,ns,ds)) | otherwise = Just (dec1 d, (pos+1,l:ls,n:ns,ds)) go (pos,[],n:ns,d:ds) | pos == n = Just ('N',(pos+1,[],ns,ds)) | otherwise = Just (dec1 d, (pos+1,[],n:ns,ds)) go (pos,l:ls,[],d:ds) | pos == l = Just (toLower (dec1 d),(pos+1,ls,[],ds)) | otherwise = Just (dec1 d, (pos+1,l:ls,[],ds)) go (pos,[],[],d:ds) = Just (dec1 d, (pos+1,[],[],ds)) -- go x = error (show x) -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: B.ByteString -> [Sequence] decode2Bit cs = let (Entries h es) = decode cs :: Entries ms = map (fromIntegral . offset) es (c:chunks) = zipWith (-) ms (0:ms) ss = splits chunks $ B.drop c cs in map ($ Nothing) $ zipWith Seq (map name es) $ map fromSR $ case swap h of True -> map (unSRLE.decode) ss False -> map (unSRBE.decode) ss splits [] cs = [cs] splits (e:es) cs = let (this,rest) = B.splitAt e cs in this : splits es rest unSRBE (SRBE x) = x unSRLE (SRLE x) = x toSR :: ByteString -> SR toSR bs = undefined -- | Extract sequences from a file in 2bit format. read2Bit :: FilePath -> IO [Sequence] read2Bit f = B.readFile f >>= return . decode2Bit -- | Extract sequences in the 2bit format from a handle. hRead2Bit :: Handle -> IO [Sequence] hRead2Bit h = B.hGetContents h >>= return . decode2Bit -- | Write sequences to file in the 2bit format. write2Bit :: FilePath -> [Sequence] -> IO () write2Bit = undefined -- | Write sequences to a handle in the 2bit format. hWrite2Bit :: Handle -> [Sequence] -> IO () hWrite2Bit = undefined