Haskell Code by HsColour

{- |
   This module implements the 2bit format for sequences.

   Based on: <http://genome.ucsc.edu/FAQ/FAQformat#format7>
   Note! the description is not accurate, it is missing a reserved word
         in each sequence record.

   There are also other, completely different ideas of the 2bit format, e.g.
      <http://jcomeau.freeshell.org/www/genome/2bitformat.html>
-}


module Bio.Sequence.TwoBit
   ( decode2Bit, read2Bit, hRead2Bit
   ) where

import Bio.Sequence.SeqData
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString.Lazy.Char8 as B

import System.IO
import Control.Monad

import Data.Char
import Data.Binary
import Data.Int
import Data.List
import Data.Bits

import Test.QuickCheck hiding (check)    -- QC 1.0
-- import Test.QuickCheck hiding ((.&.)) -- QC 2.0

type ByteString = B.ByteString

-- constants
default_magic, default_version :: Word32
default_magic   = 0x1A412743
default_version = 0

-- binary extras
check :: Monad m => (a -> Bool) ->  a -> m a
check p x = if (p x) then return x else fail "check failed"

bswap :: Integral a => Int -> a -> a
bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (n-length s) 0

bytes :: Integral a => a -> [Word8]
bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256
                                 in if q == 0 && r == 0 then Nothing
                                    else Just (fromIntegral r,q))
unbytes :: Integral a => [Word8] -> a
unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral

-- Conflicts with Bio.Util.TestBase
-- instance Arbitrary Word8 where
--    arbitrary = choose (0,255::Int) >>= return . fromIntegral

-- prop_bswap :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
-- prop_bswap a1 a2 a3 a4 = (bswap 4 . decode . BB.pack) [a1,a2,a3,a4] == ((decode . BB.pack) [a4,a3,a2,a1] :: Word32)

-- basic data types
data Header = Header { swap :: Bool,  version, count, reserved :: Word32 }
instance Show Header where show (Header _ v c r) = "H "++show (v,c,r)

instance Binary Header where
    get = do
      m <- get
      v <- get >>= check (==default_version)
      c <- get
      r <- get
      let s = if m==default_magic then id else if m==bswap 4 default_magic then bswap 4
                                               else error "2bit decode: incorrect magic number"
      return (Header (m/=default_magic) (s v) (s c) (s r))
    put (Header m v c r) = do
      put default_magic
      put default_version
      put c
      put (0 :: Word32)

data Entry  = Entry  { name :: ByteString, offset :: Word32 }
              deriving Show

swapEntry e = e { offset = bswap 4 (offset e) }

instance Binary Entry where
    get = do
      len <- getWord8
      name <- replicateM (fromIntegral len) getWord8
      offset <- get
      return (Entry (BB.pack name) offset)
    put (Entry bs offset) = do
      let l = fromIntegral $ B.length bs :: Word8
      put l
      mapM_ put $ BB.unpack bs
      put offset

data Entries = Entries Header [Entry]
instance Show Entries where show (Entries h es) = unlines (show h : map show es)

instance Binary Entries where
    get = do
      h <- get
      es <- replicateM (fromIntegral $ count h) get
      return (Entries h $ if swap h then map swapEntry es else es)

{-
 dnaSize - number of bases of DNA in the sequence
 nBlockCount - the number of blocks of Ns in the file (representing unknown sequence)
 nBlockStarts - the starting position for each block of Ns
 nBlockSizes - the size of each block of Ns
 maskBlockCount - the number of masked (lower-case) blocks
 maskBlockStarts - the starting position for each masked block
 maskBlockSizes - the size of each masked block
 packedDna - the DNA packed to two bits per base
-}

data SR = SR { dnaSize, nBlockCount :: Word32
                         , nBlockStarts, nBlockSizes :: [Word32]
                         , maskBlockCount :: Word32
                         , maskBlockStarts, maskBlockSizes :: [Word32]
                         , packedDna :: [Word8]
                         , reserved2 :: Word32 }
        deriving Show

-- big- and little-endian variants (what a mess)
newtype SRBE = SRBE SR deriving Show
newtype SRLE = SRLE SR deriving Show

instance Binary SRBE where
    get = undefined

instance Binary SRLE where
    get = do
      dz <- get :: Get Word32
      nc <- get :: Get Word32
      let n = fromIntegral $ bswap 4 nc
      nbs <- replicateM n get
      nbsz <- replicateM n get
      mc <- get :: Get Word32
      let m = fromIntegral $ bswap 4 mc
      mbs <- replicateM m get
      mbsz <- replicateM m get
      _reserved <- get :: Get Word32 -- !!!! oops?
      let d = fromIntegral $ bswap 4 dz
      pdna <- replicateM ((d+3) `div` 4)  get
      return (SRLE $ SR (bswap 4 dz) (bswap 4 nc)
              (map (bswap 4) nbs) (map (bswap 4) nbsz)
              (bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved))
    -- should this happen?  Why not just write default format?
    put (SRLE sr) = do
      put (bswap 4 $ dnaSize sr)
      put (bswap 4 $ nBlockCount sr)
      mapM_ (put . bswap 4) (nBlockStarts sr)
      mapM_ (put . bswap 4) (nBlockSizes sr)
      put (bswap 4 $ maskBlockCount sr)
      mapM_ (put . bswap 4) (maskBlockStarts sr)
      mapM_ (put . bswap 4) (maskBlockSizes sr)
      put (0::Word32)
      mapM_ put (packedDna sr)

-- used to convert to/from sequence data in the Sequence data structure
fromSR :: SR -> ByteString
fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna)
    where
    low = combine (maskBlockStarts sr) (maskBlockSizes sr)
    ns  = combine (nBlockStarts sr) (nBlockSizes sr)
    combine starts lengths = concatMap (\(p,l) -> [p..p+l-1]) $ zip starts lengths

    dna = decodeDNA $ packedDna sr
    decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03])
    dec1 x = case x of 0 -> 'T'; 1 -> 'C'; 2 -> 'A'; 3 -> 'G'; _ -> error ("can't decode value "++show x)

    go (_,_,_,[]) = Nothing
    go (pos,(l:ls),(n:ns),(d:ds))
        | pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds))
        | pos == l             = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds))
        |             pos == n = Just ('N',(pos+1,l:ls,ns,ds))
        | otherwise            = Just (dec1 d, (pos+1,l:ls,n:ns,ds))
    go (pos,[],n:ns,d:ds)
        |             pos == n = Just ('N',(pos+1,[],ns,ds))
        | otherwise            = Just (dec1 d, (pos+1,[],n:ns,ds))
    go (pos,l:ls,[],d:ds)
        | pos == l             = Just (toLower (dec1 d),(pos+1,ls,[],ds))
        | otherwise            = Just (dec1 d, (pos+1,l:ls,[],ds))
    go (pos,[],[],d:ds)        = Just (dec1 d, (pos+1,[],[],ds))
--    go x = error (show x)

-- | Parse a (lazy) ByteString as sequences in the 2bit format.
decode2Bit :: B.ByteString -> [Sequence]
decode2Bit cs = let (Entries h es) = decode cs :: Entries
                    ms = map (fromIntegral . offset) es
                    (c:chunks) = zipWith (-) ms (0:ms)
                    ss = splits chunks $ B.drop c cs
               in map ($ Nothing) $ zipWith Seq (map name es)
                      $ map fromSR $ case swap h of
                                       True -> map (unSRLE.decode) ss
                                       False -> map (unSRBE.decode) ss

splits [] cs = [cs]
splits (e:es) cs = let (this,rest) = B.splitAt e cs
                   in this : splits es rest

unSRBE (SRBE x) = x
unSRLE (SRLE x) = x

toSR :: ByteString -> SR
toSR bs = undefined

-- | Extract sequences from a file in 2bit format.
read2Bit  :: FilePath -> IO [Sequence]
read2Bit f = B.readFile f >>= return . decode2Bit

-- | Extract sequences in the 2bit format from a handle.
hRead2Bit :: Handle   -> IO [Sequence]
hRead2Bit h = B.hGetContents h >>= return . decode2Bit

-- | Write sequences to file in the 2bit format.
write2Bit  :: FilePath -> [Sequence] -> IO ()
write2Bit = undefined

-- | Write sequences to a handle in the 2bit format.
hWrite2Bit :: Handle   -> [Sequence] -> IO ()
hWrite2Bit = undefined