-- | Would you believe it?  The 2bit format stores blocks of Ns in a table at
-- the beginning of a sequence, then packs four bases into a byte.  So it
-- is neither possible nor necessary to store Ns in the main sequence, and
-- you would think they aren't stored there, right?  And they aren't.
-- Instead Ts are stored which the reader has to replace with Ns.
--
-- The sensible way to treat these is probably to just say there are two
-- kinds of implied annotation (repeats and large gaps for a typical
-- genome), which can be interpreted in whatever way fits.  And that's why
-- we have 'Mask' and 'getSubseqWith'.

module Bio.TwoBit (
        TwoBitFile(..),
        TwoBitSequence(..),
        openTwoBit,

        getFwdSubseqWith,
        getSubseq,
        getSubseqWith,
        getSubseqAscii,
        getSubseqMasked,
        getLazySubseq,
        getFragment,
        getFwdSubseqV,
        getSeqnames,
        lookupSequence,
        getSeqLength,
        clampPosition,
        getRandomSeq,

        takeOverlap,
        mergeBlocks,
        Mask(..)
    ) where

import           Bio.Prelude         hiding ( left, right, chr )
import           Bio.Util.MMap
import           Bio.Util.Storable
import           Control.Monad.Trans.State  ( StateT(..), get, evalStateT )
import qualified Data.ByteString                as B
import qualified Data.IntMap.Strict             as I
import qualified Data.HashMap.Lazy              as M
import qualified Data.Vector.Unboxed            as U
import           Foreign.C.Types            ( CChar )
import           Foreign.ForeignPtr.Unsafe  ( unsafeForeignPtrToPtr )

data TwoBitFile = TBF {
    TwoBitFile -> ForeignPtr CChar
tbf_raw :: ForeignPtr CChar,
    -- This map is intentionally lazy.  May or may not be important.
    TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs :: !(M.HashMap Bytes TwoBitSequence)
}

data TwoBitSequence = TBS { TwoBitSequence -> IntMap Int
tbs_n_blocks   :: !(I.IntMap Int)
                          , TwoBitSequence -> IntMap Int
tbs_m_blocks   :: !(I.IntMap Int)
                          , TwoBitSequence -> Int
tbs_dna_offset :: {-# UNPACK #-} !Int
                          , TwoBitSequence -> Int
tbs_dna_size   :: {-# UNPACK #-} !Int }

-- | Brings a 2bit file into memory.  The file is mmap'ed, so it will
-- not work on streams that are not actual files.  It's also unsafe if
-- the file is modified in any way.
openTwoBit :: FilePath -> IO TwoBitFile
openTwoBit :: FilePath -> IO TwoBitFile
openTwoBit fp :: FilePath
fp = do
        (_sz :: Int
_sz, raw :: ForeignPtr CChar
raw) <- FilePath -> IO (Int, ForeignPtr CChar)
forall a. FilePath -> IO (Int, ForeignPtr a)
mmapFile FilePath
fp
        ForeignPtr CChar -> (Ptr CChar -> IO TwoBitFile) -> IO TwoBitFile
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr CChar
raw ((Ptr CChar -> IO TwoBitFile) -> IO TwoBitFile)
-> (Ptr CChar -> IO TwoBitFile) -> IO TwoBitFile
forall a b. (a -> b) -> a -> b
$
              StateT (Ptr CChar) IO TwoBitFile -> Ptr CChar -> IO TwoBitFile
forall (m :: * -> *) s a. Monad m => StateT s m a -> s -> m a
evalStateT (StateT (Ptr CChar) IO TwoBitFile -> Ptr CChar -> IO TwoBitFile)
-> StateT (Ptr CChar) IO TwoBitFile -> Ptr CChar -> IO TwoBitFile
forall a b. (a -> b) -> a -> b
$ do
                    Word32
sig <- Get Word32
forall a. Num a => Get a
getWord32be
                    Get Int
getWord32 <- case Word32
sig :: Word32 of
                            0x1A412743 -> Get Int -> StateT (Ptr CChar) IO (Get Int)
forall (m :: * -> *) a. Monad m => a -> m a
return Get Int
forall a. Num a => Get a
getWord32be
                            0x4327411A -> Get Int -> StateT (Ptr CChar) IO (Get Int)
forall (m :: * -> *) a. Monad m => a -> m a
return Get Int
forall a. Num a => Get a
getWord32le
                            _          -> FilePath -> StateT (Ptr CChar) IO (Get Int)
forall (m :: * -> *) a. MonadFail m => FilePath -> m a
fail (FilePath -> StateT (Ptr CChar) IO (Get Int))
-> FilePath -> StateT (Ptr CChar) IO (Get Int)
forall a b. (a -> b) -> a -> b
$ "invalid .2bit signature " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ Word32 -> FilePath -> FilePath
forall a. (Integral a, Show a) => a -> FilePath -> FilePath
showHex Word32
sig []

                    Int
version <- Get Int
getWord32
                    Bool -> StateT (Ptr CChar) IO () -> StateT (Ptr CChar) IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (Int
version Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0) (StateT (Ptr CChar) IO () -> StateT (Ptr CChar) IO ())
-> StateT (Ptr CChar) IO () -> StateT (Ptr CChar) IO ()
forall a b. (a -> b) -> a -> b
$ FilePath -> StateT (Ptr CChar) IO ()
forall (m :: * -> *) a. MonadFail m => FilePath -> m a
fail (FilePath -> StateT (Ptr CChar) IO ())
-> FilePath -> StateT (Ptr CChar) IO ()
forall a b. (a -> b) -> a -> b
$ "wrong .2bit version " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ Int -> FilePath
forall a. Show a => a -> FilePath
show Int
version

                    Int
nseqs <- Get Int
getWord32
                    Int
_reserved <- Get Int
getWord32

                    ForeignPtr CChar -> HashMap Bytes TwoBitSequence -> TwoBitFile
TBF ForeignPtr CChar
raw (HashMap Bytes TwoBitSequence -> TwoBitFile)
-> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence)
-> StateT (Ptr CChar) IO TwoBitFile
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (HashMap Bytes TwoBitSequence
 -> Int -> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence))
-> HashMap Bytes TwoBitSequence
-> [Int]
-> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence)
forall (t :: * -> *) (m :: * -> *) b a.
(Foldable t, Monad m) =>
(b -> a -> m b) -> b -> t a -> m b
foldM (\ix :: HashMap Bytes TwoBitSequence
ix _ -> do !Bytes
key <- Get Int
forall a. Num a => Get a
getWord8 Get Int
-> (Int -> StateT (Ptr CChar) IO Bytes)
-> StateT (Ptr CChar) IO Bytes
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Int -> StateT (Ptr CChar) IO Bytes
getByteString
                                                   !Int
off <- Get Int
getWord32
                                                   HashMap Bytes TwoBitSequence
-> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence)
forall (m :: * -> *) a. Monad m => a -> m a
return (HashMap Bytes TwoBitSequence
 -> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence))
-> HashMap Bytes TwoBitSequence
-> StateT (Ptr CChar) IO (HashMap Bytes TwoBitSequence)
forall a b. (a -> b) -> a -> b
$! Bytes
-> TwoBitSequence
-> HashMap Bytes TwoBitSequence
-> HashMap Bytes TwoBitSequence
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert Bytes
key (ForeignPtr CChar -> Get Int -> Int -> TwoBitSequence
mkBlockIndex ForeignPtr CChar
raw Get Int
getWord32 Int
off) HashMap Bytes TwoBitSequence
ix
                                      ) HashMap Bytes TwoBitSequence
forall k v. HashMap k v
M.empty [1..Int
nseqs]

type Get = StateT (Ptr CChar) IO

getWord8, getWord32be, getWord32le :: Num a => Get a
getWord8 :: Get a
getWord8    = (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall s (m :: * -> *) a. (s -> m (a, s)) -> StateT s m a
StateT ((Ptr CChar -> IO (a, Ptr CChar)) -> Get a)
-> (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall a b. (a -> b) -> a -> b
$ \p :: Ptr CChar
p -> Ptr CChar -> IO Word32
forall a. Ptr a -> IO Word32
peekUnalnWord32BE  Ptr CChar
p IO Word32 -> (Word32 -> IO (a, Ptr CChar)) -> IO (a, Ptr CChar)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \w :: Word32
w -> (a, Ptr CChar) -> IO (a, Ptr CChar)
forall (m :: * -> *) a. Monad m => a -> m a
return (Word32 -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
w, Ptr CChar -> Int -> Ptr CChar
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr CChar
p 1)
getWord32be :: Get a
getWord32be = (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall s (m :: * -> *) a. (s -> m (a, s)) -> StateT s m a
StateT ((Ptr CChar -> IO (a, Ptr CChar)) -> Get a)
-> (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall a b. (a -> b) -> a -> b
$ \p :: Ptr CChar
p -> Ptr CChar -> IO Word32
forall a. Ptr a -> IO Word32
peekUnalnWord32BE  Ptr CChar
p IO Word32 -> (Word32 -> IO (a, Ptr CChar)) -> IO (a, Ptr CChar)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \w :: Word32
w -> (a, Ptr CChar) -> IO (a, Ptr CChar)
forall (m :: * -> *) a. Monad m => a -> m a
return (Word32 -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
w, Ptr CChar -> Int -> Ptr CChar
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr CChar
p 4)
getWord32le :: Get a
getWord32le = (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall s (m :: * -> *) a. (s -> m (a, s)) -> StateT s m a
StateT ((Ptr CChar -> IO (a, Ptr CChar)) -> Get a)
-> (Ptr CChar -> IO (a, Ptr CChar)) -> Get a
forall a b. (a -> b) -> a -> b
$ \p :: Ptr CChar
p -> Ptr CChar -> IO Word32
forall a. Ptr a -> IO Word32
peekUnalnWord32LE  Ptr CChar
p IO Word32 -> (Word32 -> IO (a, Ptr CChar)) -> IO (a, Ptr CChar)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \w :: Word32
w -> (a, Ptr CChar) -> IO (a, Ptr CChar)
forall (m :: * -> *) a. Monad m => a -> m a
return (Word32 -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word32
w, Ptr CChar -> Int -> Ptr CChar
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr CChar
p 4)

getByteString :: Int -> Get Bytes
getByteString :: Int -> StateT (Ptr CChar) IO Bytes
getByteString l :: Int
l = (Ptr CChar -> IO (Bytes, Ptr CChar)) -> StateT (Ptr CChar) IO Bytes
forall s (m :: * -> *) a. (s -> m (a, s)) -> StateT s m a
StateT ((Ptr CChar -> IO (Bytes, Ptr CChar))
 -> StateT (Ptr CChar) IO Bytes)
-> (Ptr CChar -> IO (Bytes, Ptr CChar))
-> StateT (Ptr CChar) IO Bytes
forall a b. (a -> b) -> a -> b
$ \p :: Ptr CChar
p -> CStringLen -> IO Bytes
B.packCStringLen (Ptr CChar
p,Int
l) IO Bytes
-> (Bytes -> IO (Bytes, Ptr CChar)) -> IO (Bytes, Ptr CChar)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \s :: Bytes
s -> (Bytes, Ptr CChar) -> IO (Bytes, Ptr CChar)
forall (m :: * -> *) a. Monad m => a -> m a
return (Bytes
s, Ptr CChar -> Int -> Ptr CChar
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr CChar
p Int
l)

mkBlockIndex :: ForeignPtr CChar -> Get Int -> Int -> TwoBitSequence
mkBlockIndex :: ForeignPtr CChar -> Get Int -> Int -> TwoBitSequence
mkBlockIndex raw :: ForeignPtr CChar
raw getWord32 :: Get Int
getWord32 ofs :: Int
ofs =
    IO TwoBitSequence -> TwoBitSequence
forall a. IO a -> a
unsafeDupablePerformIO (IO TwoBitSequence -> TwoBitSequence)
-> IO TwoBitSequence -> TwoBitSequence
forall a b. (a -> b) -> a -> b
$
    ForeignPtr CChar
-> (Ptr CChar -> IO TwoBitSequence) -> IO TwoBitSequence
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr CChar
raw ((Ptr CChar -> IO TwoBitSequence) -> IO TwoBitSequence)
-> (Ptr CChar -> IO TwoBitSequence) -> IO TwoBitSequence
forall a b. (a -> b) -> a -> b
$ \praw :: Ptr CChar
praw ->
    StateT (Ptr CChar) IO TwoBitSequence
-> Ptr CChar -> IO TwoBitSequence
forall (m :: * -> *) s a. Monad m => StateT s m a -> s -> m a
evalStateT StateT (Ptr CChar) IO TwoBitSequence
getBlock (Ptr CChar -> Int -> Ptr CChar
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr CChar
praw Int
ofs)
  where
    getBlock :: StateT (Ptr CChar) IO TwoBitSequence
getBlock = do Ptr CChar
p0 <- StateT (Ptr CChar) IO (Ptr CChar)
forall (m :: * -> *) s. Monad m => StateT s m s
get
                  Int
ds <- Get Int
getWord32
                  [(Int, Int)]
nb <- StateT (Ptr CChar) IO [(Int, Int)]
readBlockList
                  [(Int, Int)]
mb <- StateT (Ptr CChar) IO [(Int, Int)]
readBlockList
                  Int
_  <- Get Int
getWord32
                  Ptr CChar
p1 <- StateT (Ptr CChar) IO (Ptr CChar)
forall (m :: * -> *) s. Monad m => StateT s m s
get
                  TwoBitSequence -> StateT (Ptr CChar) IO TwoBitSequence
forall (m :: * -> *) a. Monad m => a -> m a
return (TwoBitSequence -> StateT (Ptr CChar) IO TwoBitSequence)
-> TwoBitSequence -> StateT (Ptr CChar) IO TwoBitSequence
forall a b. (a -> b) -> a -> b
$! IntMap Int -> IntMap Int -> Int -> Int -> TwoBitSequence
TBS ([(Int, Int)] -> IntMap Int
forall a. [(Int, a)] -> IntMap a
I.fromList [(Int, Int)]
nb) ([(Int, Int)] -> IntMap Int
forall a. [(Int, a)] -> IntMap a
I.fromList [(Int, Int)]
mb) (Int
ofs Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Ptr CChar -> Ptr CChar -> Int
forall a b. Ptr a -> Ptr b -> Int
minusPtr Ptr CChar
p1 Ptr CChar
p0) Int
ds

    readBlockList :: StateT (Ptr CChar) IO [(Int, Int)]
readBlockList = Get Int
getWord32 Get Int
-> (Int -> StateT (Ptr CChar) IO [(Int, Int)])
-> StateT (Ptr CChar) IO [(Int, Int)]
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \n :: Int
n -> ([Int] -> [Int] -> [(Int, Int)])
-> StateT (Ptr CChar) IO [Int]
-> StateT (Ptr CChar) IO [Int]
-> StateT (Ptr CChar) IO [(Int, Int)]
forall (m :: * -> *) a1 a2 r.
Monad m =>
(a1 -> a2 -> r) -> m a1 -> m a2 -> m r
liftM2 [Int] -> [Int] -> [(Int, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip (Int -> Get Int -> StateT (Ptr CChar) IO [Int]
forall (m :: * -> *) a. Monad m => Int -> m a -> m [a]
repM Int
n Get Int
getWord32) (Int -> Get Int -> StateT (Ptr CChar) IO [Int]
forall (m :: * -> *) a. Monad m => Int -> m a -> m [a]
repM Int
n Get Int
getWord32)

-- | Repeat monadic action @n@ times.  Returns result in reverse(!)
-- order, but doesn't build a huge list of thunks in memory.
repM :: Monad m => Int -> m a -> m [a]
repM :: Int -> m a -> m [a]
repM n0 :: Int
n0 m :: m a
m = [a] -> Int -> m [a]
forall t. (Eq t, Num t) => [a] -> t -> m [a]
go [] Int
n0
  where
    go :: [a] -> t -> m [a]
go acc :: [a]
acc 0 = [a] -> m [a]
forall (m :: * -> *) a. Monad m => a -> m a
return [a]
acc
    go acc :: [a]
acc n :: t
n = m a
m m a -> (a -> m [a]) -> m [a]
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \x :: a
x -> a
x a -> m [a] -> m [a]
forall a b. a -> b -> b
`seq` [a] -> t -> m [a]
go (a
xa -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc) (t
nt -> t -> t
forall a. Num a => a -> a -> a
-1)

takeOverlap :: Int -> I.IntMap Int -> [(Int,Int)]
takeOverlap :: Int -> IntMap Int -> [(Int, Int)]
takeOverlap k :: Int
k m :: IntMap Int
m = ((Int, Int) -> Bool) -> [(Int, Int)] -> [(Int, Int)]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Int, Int) -> Bool
far_left ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$
                  ([(Int, Int)] -> [(Int, Int)])
-> (((Int, Int), IntMap Int) -> [(Int, Int)] -> [(Int, Int)])
-> Maybe ((Int, Int), IntMap Int)
-> [(Int, Int)]
-> [(Int, Int)]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [(Int, Int)] -> [(Int, Int)]
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id (\(kv :: (Int, Int)
kv,_) -> (:) (Int, Int)
kv) (IntMap Int -> Maybe ((Int, Int), IntMap Int)
forall a. IntMap a -> Maybe ((Int, a), IntMap a)
I.maxViewWithKey IntMap Int
left) ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$
                  ([(Int, Int)] -> [(Int, Int)])
-> (Int -> [(Int, Int)] -> [(Int, Int)])
-> Maybe Int
-> [(Int, Int)]
-> [(Int, Int)]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [(Int, Int)] -> [(Int, Int)]
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id (\v :: Int
v -> (:) (Int
k,Int
v)) Maybe Int
middle ([(Int, Int)] -> [(Int, Int)]) -> [(Int, Int)] -> [(Int, Int)]
forall a b. (a -> b) -> a -> b
$
                  IntMap Int -> [(Int, Int)]
forall a. IntMap a -> [(Int, a)]
I.toAscList IntMap Int
right
  where
    (left :: IntMap Int
left, middle :: Maybe Int
middle, right :: IntMap Int
right) = Int -> IntMap Int -> (IntMap Int, Maybe Int, IntMap Int)
forall a. Int -> IntMap a -> (IntMap a, Maybe a, IntMap a)
I.splitLookup Int
k IntMap Int
m
    far_left :: (Int, Int) -> Bool
far_left (s :: Int
s,l :: Int
l) = Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
k

data Mask = None | Soft | Hard | Both deriving (Mask -> Mask -> Bool
(Mask -> Mask -> Bool) -> (Mask -> Mask -> Bool) -> Eq Mask
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Mask -> Mask -> Bool
$c/= :: Mask -> Mask -> Bool
== :: Mask -> Mask -> Bool
$c== :: Mask -> Mask -> Bool
Eq, Eq Mask
Eq Mask =>
(Mask -> Mask -> Ordering)
-> (Mask -> Mask -> Bool)
-> (Mask -> Mask -> Bool)
-> (Mask -> Mask -> Bool)
-> (Mask -> Mask -> Bool)
-> (Mask -> Mask -> Mask)
-> (Mask -> Mask -> Mask)
-> Ord Mask
Mask -> Mask -> Bool
Mask -> Mask -> Ordering
Mask -> Mask -> Mask
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: Mask -> Mask -> Mask
$cmin :: Mask -> Mask -> Mask
max :: Mask -> Mask -> Mask
$cmax :: Mask -> Mask -> Mask
>= :: Mask -> Mask -> Bool
$c>= :: Mask -> Mask -> Bool
> :: Mask -> Mask -> Bool
$c> :: Mask -> Mask -> Bool
<= :: Mask -> Mask -> Bool
$c<= :: Mask -> Mask -> Bool
< :: Mask -> Mask -> Bool
$c< :: Mask -> Mask -> Bool
compare :: Mask -> Mask -> Ordering
$ccompare :: Mask -> Mask -> Ordering
$cp1Ord :: Eq Mask
Ord, Int -> Mask
Mask -> Int
Mask -> [Mask]
Mask -> Mask
Mask -> Mask -> [Mask]
Mask -> Mask -> Mask -> [Mask]
(Mask -> Mask)
-> (Mask -> Mask)
-> (Int -> Mask)
-> (Mask -> Int)
-> (Mask -> [Mask])
-> (Mask -> Mask -> [Mask])
-> (Mask -> Mask -> [Mask])
-> (Mask -> Mask -> Mask -> [Mask])
-> Enum Mask
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
enumFromThenTo :: Mask -> Mask -> Mask -> [Mask]
$cenumFromThenTo :: Mask -> Mask -> Mask -> [Mask]
enumFromTo :: Mask -> Mask -> [Mask]
$cenumFromTo :: Mask -> Mask -> [Mask]
enumFromThen :: Mask -> Mask -> [Mask]
$cenumFromThen :: Mask -> Mask -> [Mask]
enumFrom :: Mask -> [Mask]
$cenumFrom :: Mask -> [Mask]
fromEnum :: Mask -> Int
$cfromEnum :: Mask -> Int
toEnum :: Int -> Mask
$ctoEnum :: Int -> Mask
pred :: Mask -> Mask
$cpred :: Mask -> Mask
succ :: Mask -> Mask
$csucc :: Mask -> Mask
Enum, Int -> Mask -> FilePath -> FilePath
[Mask] -> FilePath -> FilePath
Mask -> FilePath
(Int -> Mask -> FilePath -> FilePath)
-> (Mask -> FilePath)
-> ([Mask] -> FilePath -> FilePath)
-> Show Mask
forall a.
(Int -> a -> FilePath -> FilePath)
-> (a -> FilePath) -> ([a] -> FilePath -> FilePath) -> Show a
showList :: [Mask] -> FilePath -> FilePath
$cshowList :: [Mask] -> FilePath -> FilePath
show :: Mask -> FilePath
$cshow :: Mask -> FilePath
showsPrec :: Int -> Mask -> FilePath -> FilePath
$cshowsPrec :: Int -> Mask -> FilePath -> FilePath
Show)

getFwdSubseqWith :: TwoBitFile -> TwoBitSequence                -- raw data, sequence
                 -> (Word8 -> Mask -> a)                        -- mask function
                 -> Int -> [a]                                  -- start, lazy result
getFwdSubseqWith :: TwoBitFile -> TwoBitSequence -> (Word8 -> Mask -> a) -> Int -> [a]
getFwdSubseqWith TBF{..} TBS{..} nt :: Word8 -> Mask -> a
nt start :: Int
start =
    [(Int, Int, Mask)] -> Int -> [Word8] -> [a]
do_mask (Int -> IntMap Int -> [(Int, Int)]
takeOverlap Int
start IntMap Int
tbs_n_blocks [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
`mergeBlocks` Int -> IntMap Int -> [(Int, Int)]
takeOverlap Int
start IntMap Int
tbs_m_blocks) Int
start ([Word8] -> [a]) -> (Ptr Word8 -> [Word8]) -> Ptr Word8 -> [a]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.
    Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop (Int
start Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. 3) ([Word8] -> [Word8])
-> (Ptr Word8 -> [Word8]) -> Ptr Word8 -> [Word8]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.
    Ptr Word8 -> [Word8]
forall a. (Bits a, Num a, Storable a) => Ptr a -> [a]
toDNA (Ptr Word8 -> [a]) -> Ptr Word8 -> [a]
forall a b. (a -> b) -> a -> b
$ Ptr CChar -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
plusPtr (ForeignPtr CChar -> Ptr CChar
forall a. ForeignPtr a -> Ptr a
unsafeForeignPtrToPtr ForeignPtr CChar
tbf_raw) (Int
tbs_dna_offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
start Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` 2))
  where
    toDNA :: Ptr a -> [a]
toDNA p :: Ptr a
p = let !b :: a
b = IO a -> a
forall a. IO a -> a
unsafeDupablePerformIO (IO a -> a) -> IO a -> a
forall a b. (a -> b) -> a -> b
$ do Ptr a -> IO a
forall a. Storable a => Ptr a -> IO a
peek Ptr a
p IO a -> IO () -> IO a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ForeignPtr CChar -> IO ()
forall a. ForeignPtr a -> IO ()
touchForeignPtr ForeignPtr CChar
tbf_raw
              in [ 3 a -> a -> a
forall a. Bits a => a -> a -> a
.&. (a
b a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
x) | Int
x <- [6,4,2,0] ] [a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++ Ptr a -> [a]
toDNA (Ptr a -> Int -> Ptr a
forall a b. Ptr a -> Int -> Ptr b
plusPtr Ptr a
p 1)

    do_mask :: [(Int, Int, Mask)] -> Int -> [Word8] -> [a]
do_mask            _ _ [] = []
    do_mask [          ] _ ws :: [Word8]
ws = (Word8 -> a) -> [Word8] -> [a]
forall a b. (a -> b) -> [a] -> [b]
map (Word8 -> Mask -> a
`nt` Mask
None) [Word8]
ws
    do_mask ((s :: Int
s,l :: Int
l,m :: Mask
m):is :: [(Int, Int, Mask)]
is) p :: Int
p ws :: [Word8]
ws
        | Int
p Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
s     = (Word8 -> a) -> [Word8] -> [a]
forall a b. (a -> b) -> [a] -> [b]
map (Word8 -> Mask -> a
`nt` Mask
None) (Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take  (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
p)  [Word8]
ws) [a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++ [(Int, Int, Mask)] -> Int -> [Word8] -> [a]
do_mask ((Int
s,Int
l,Mask
m)(Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
:[(Int, Int, Mask)]
is)  Int
s   (Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop  (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
p)  [Word8]
ws)
        | Bool
otherwise = (Word8 -> a) -> [Word8] -> [a]
forall a b. (a -> b) -> [a] -> [b]
map (Word8 -> Mask -> a
`nt`    Mask
m) (Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
take (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
lInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
p) [Word8]
ws) [a] -> [a] -> [a]
forall a. [a] -> [a] -> [a]
++ [(Int, Int, Mask)] -> Int -> [Word8] -> [a]
do_mask          [(Int, Int, Mask)]
is (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l) (Int -> [Word8] -> [Word8]
forall a. Int -> [a] -> [a]
drop (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
lInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
p) [Word8]
ws)

-- | Merge blocks of Ns and blocks of Ms into single list of blocks with
-- masking annotation.  Gaps remain.  Used internally only.
mergeBlocks :: [(Int,Int)] -> [(Int,Int)] -> [(Int,Int,Mask)]
mergeBlocks :: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks ((_,0):nbs :: [(Int, Int)]
nbs) mbs :: [(Int, Int)]
mbs = [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks [(Int, Int)]
nbs [(Int, Int)]
mbs
mergeBlocks nbs :: [(Int, Int)]
nbs ((_,0):mbs :: [(Int, Int)]
mbs) = [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks [(Int, Int)]
nbs [(Int, Int)]
mbs

mergeBlocks ((ns :: Int
ns,nl :: Int
nl):nbs :: [(Int, Int)]
nbs) ((ms :: Int
ms,ml :: Int
ml):mbs :: [(Int, Int)]
mbs)
    | Int
ns Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
ms   = let l :: Int
l = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
msInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
ns) Int
nl in (Int
ns,Int
l, Mask
Hard) (Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks ((Int
nsInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l,Int
nlInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
l)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
nbs) ((Int
ms,Int
ml)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
mbs)
    | Int
ms Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
ns   = let l :: Int
l = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
nsInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
ms) Int
ml in (Int
ms,Int
l, Mask
Soft) (Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks ((Int
ns,Int
nl)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
nbs) ((Int
msInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l,Int
mlInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
l)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
mbs)
    | Bool
otherwise = let l :: Int
l = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
nl Int
ml in (Int
ns,Int
l, Mask
Both) (Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks ((Int
nsInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l,Int
nlInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
l)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
nbs) ((Int
msInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l,Int
mlInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
l)(Int, Int) -> [(Int, Int)] -> [(Int, Int)]
forall a. a -> [a] -> [a]
:[(Int, Int)]
mbs)

mergeBlocks ((ns :: Int
ns,nl :: Int
nl):nbs :: [(Int, Int)]
nbs) [] = (Int
ns,Int
nl, Mask
Hard) (Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks [(Int, Int)]
nbs []
mergeBlocks [] ((ms :: Int
ms,ml :: Int
ml):mbs :: [(Int, Int)]
mbs) = (Int
ms,Int
ml, Mask
Soft) (Int, Int, Mask) -> [(Int, Int, Mask)] -> [(Int, Int, Mask)]
forall a. a -> [a] -> [a]
: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int, Mask)]
mergeBlocks [] [(Int, Int)]
mbs

mergeBlocks [     ] [     ] = []


-- | Extract a subsequence and apply masking.  TwoBit files can
-- represent two kinds of masking (hard and soft), where hard masking is
-- usually realized by replacing everything by Ns and soft masking is
-- done by lowercasing.  Here, we take a user supplied function to apply
-- masking.
getSubseqWith :: (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith :: (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith maskf :: Nucleotide -> Mask -> a
maskf tbf :: TwoBitFile
tbf Range{ r_pos :: Range -> Position
r_pos = Pos { p_seq :: Position -> Bytes
p_seq = Bytes
chr, p_start :: Position -> Int
p_start = Int
start }, r_length :: Range -> Int
r_length = Int
len } = do
    let sq1 :: TwoBitSequence
sq1 = TwoBitSequence -> Maybe TwoBitSequence -> TwoBitSequence
forall a. a -> Maybe a -> a
fromMaybe (FilePath -> TwoBitSequence
forall a. HasCallStack => FilePath -> a
error (FilePath -> TwoBitSequence) -> FilePath -> TwoBitSequence
forall a b. (a -> b) -> a -> b
$ Bytes -> FilePath
forall s. Unpack s => s -> FilePath
unpack Bytes
chr FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ " doesn't exist") (Maybe TwoBitSequence -> TwoBitSequence)
-> Maybe TwoBitSequence -> TwoBitSequence
forall a b. (a -> b) -> a -> b
$ Bytes -> HashMap Bytes TwoBitSequence -> Maybe TwoBitSequence
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
chr (TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs TwoBitFile
tbf)
    let go :: (Word8 -> Mask -> a) -> Int -> [a]
go = TwoBitFile -> TwoBitSequence -> (Word8 -> Mask -> a) -> Int -> [a]
forall a.
TwoBitFile -> TwoBitSequence -> (Word8 -> Mask -> a) -> Int -> [a]
getFwdSubseqWith TwoBitFile
tbf TwoBitSequence
sq1
    if Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0
        then [a] -> [a]
forall a. [a] -> [a]
reverse ([a] -> [a]) -> [a] -> [a]
forall a b. (a -> b) -> a -> b
$ Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
len ([a] -> [a]) -> [a] -> [a]
forall a b. (a -> b) -> a -> b
$ (Word8 -> Mask -> a) -> Int -> [a]
forall a. (Word8 -> Mask -> a) -> Int -> [a]
go (Nucleotide -> Mask -> a
maskf (Nucleotide -> Mask -> a)
-> (Word8 -> Nucleotide) -> Word8 -> Mask -> a
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Word8 -> Nucleotide
cmp_nt) (-Int
startInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
len)
        else           Int -> [a] -> [a]
forall a. Int -> [a] -> [a]
take Int
len ([a] -> [a]) -> [a] -> [a]
forall a b. (a -> b) -> a -> b
$ (Word8 -> Mask -> a) -> Int -> [a]
forall a. (Word8 -> Mask -> a) -> Int -> [a]
go (Nucleotide -> Mask -> a
maskf (Nucleotide -> Mask -> a)
-> (Word8 -> Nucleotide) -> Word8 -> Mask -> a
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Word8 -> Nucleotide
fwd_nt)   Int
start
  where
    fwd_nt :: Word8 -> Nucleotide
fwd_nt = [Nucleotide] -> Int -> Nucleotide
forall a. [a] -> Int -> a
(!!) [Nucleotide
nucT, Nucleotide
nucC, Nucleotide
nucA, Nucleotide
nucG] (Int -> Nucleotide) -> (Word8 -> Int) -> Word8 -> Nucleotide
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
    cmp_nt :: Word8 -> Nucleotide
cmp_nt = [Nucleotide] -> Int -> Nucleotide
forall a. [a] -> Int -> a
(!!) [Nucleotide
nucA, Nucleotide
nucG, Nucleotide
nucT, Nucleotide
nucC] (Int -> Nucleotide) -> (Word8 -> Int) -> Word8 -> Nucleotide
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
{-# INLINE getSubseqWith #-}

-- | Works only in forward direction.
getLazySubseq :: TwoBitFile -> Position -> [Nucleotide]
getLazySubseq :: TwoBitFile -> Position -> [Nucleotide]
getLazySubseq tbf :: TwoBitFile
tbf Pos{ p_seq :: Position -> Bytes
p_seq = Bytes
chr, p_start :: Position -> Int
p_start = Int
start } = do
    let sq1 :: TwoBitSequence
sq1 = TwoBitSequence -> Maybe TwoBitSequence -> TwoBitSequence
forall a. a -> Maybe a -> a
fromMaybe (FilePath -> TwoBitSequence
forall a. HasCallStack => FilePath -> a
error (FilePath -> TwoBitSequence) -> FilePath -> TwoBitSequence
forall a b. (a -> b) -> a -> b
$ Bytes -> FilePath
forall s. Unpack s => s -> FilePath
unpack Bytes
chr FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ " doesn't exist") (Maybe TwoBitSequence -> TwoBitSequence)
-> Maybe TwoBitSequence -> TwoBitSequence
forall a b. (a -> b) -> a -> b
$ Bytes -> HashMap Bytes TwoBitSequence -> Maybe TwoBitSequence
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
chr (TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs TwoBitFile
tbf)
    let go :: (Word8 -> Mask -> a) -> Int -> [a]
go  = TwoBitFile -> TwoBitSequence -> (Word8 -> Mask -> a) -> Int -> [a]
forall a.
TwoBitFile -> TwoBitSequence -> (Word8 -> Mask -> a) -> Int -> [a]
getFwdSubseqWith TwoBitFile
tbf TwoBitSequence
sq1
    if Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0
        then FilePath -> [Nucleotide]
forall a. HasCallStack => FilePath -> a
error "sorry, can't go backwards"
        else (Word8 -> Mask -> Nucleotide) -> Int -> [Nucleotide]
forall a. (Word8 -> Mask -> a) -> Int -> [a]
go Word8 -> Mask -> Nucleotide
forall a p. Integral a => a -> p -> Nucleotide
fwd_nt Int
start
  where
    fwd_nt :: a -> p -> Nucleotide
fwd_nt n :: a
n _ = [Nucleotide
nucT, Nucleotide
nucC, Nucleotide
nucA, Nucleotide
nucG] [Nucleotide] -> Int -> Nucleotide
forall a. [a] -> Int -> a
!! a -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
n
{-# INLINE getLazySubseq #-}


-- | Extract a subsequence without masking.
getSubseq :: TwoBitFile -> Range -> [Nucleotide]
getSubseq :: TwoBitFile -> Range -> [Nucleotide]
getSubseq = (Nucleotide -> Mask -> Nucleotide)
-> TwoBitFile -> Range -> [Nucleotide]
forall a. (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith Nucleotide -> Mask -> Nucleotide
forall a b. a -> b -> a
const
{-# INLINE getSubseq #-}

-- | Extract a subsequence with typical masking:  soft masking is
-- ignored, hard masked regions are replaced with Ns.
getSubseqMasked :: TwoBitFile -> Range -> [Nucleotides]
getSubseqMasked :: TwoBitFile -> Range -> [Nucleotides]
getSubseqMasked = (Nucleotide -> Mask -> Nucleotides)
-> TwoBitFile -> Range -> [Nucleotides]
forall a. (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith Nucleotide -> Mask -> Nucleotides
mymask
  where
    mymask :: Nucleotide -> Mask -> Nucleotides
mymask n :: Nucleotide
n None = Nucleotide -> Nucleotides
nucToNucs Nucleotide
n
    mymask n :: Nucleotide
n Soft = Nucleotide -> Nucleotides
nucToNucs Nucleotide
n
    mymask _ Hard = Nucleotides
nucsN
    mymask _ Both = Nucleotides
nucsN
{-# INLINE getSubseqMasked #-}

-- | Extract a subsequence with masking for biologists:  soft masking is
-- done by lowercasing, hard masking by printing an N.
getSubseqAscii :: TwoBitFile -> Range -> String
getSubseqAscii :: TwoBitFile -> Range -> FilePath
getSubseqAscii = (Nucleotide -> Mask -> Char) -> TwoBitFile -> Range -> FilePath
forall a. (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith Nucleotide -> Mask -> Char
mymask
  where
    mymask :: Nucleotide -> Mask -> Char
mymask n :: Nucleotide
n None = Nucleotide -> Char
showNucleotide Nucleotide
n
    mymask n :: Nucleotide
n Soft = Char -> Char
toLower (Nucleotide -> Char
showNucleotide Nucleotide
n)
    mymask _ Hard = 'N'
    mymask _ Both = 'N'
{-# INLINE getSubseqAscii #-}


getSeqnames :: TwoBitFile -> [Bytes]
getSeqnames :: TwoBitFile -> [Bytes]
getSeqnames = HashMap Bytes TwoBitSequence -> [Bytes]
forall k v. HashMap k v -> [k]
M.keys (HashMap Bytes TwoBitSequence -> [Bytes])
-> (TwoBitFile -> HashMap Bytes TwoBitSequence)
-> TwoBitFile
-> [Bytes]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs

lookupSequence :: TwoBitFile -> Bytes -> Maybe TwoBitSequence
lookupSequence :: TwoBitFile -> Bytes -> Maybe TwoBitSequence
lookupSequence tbf :: TwoBitFile
tbf sq :: Bytes
sq = Bytes -> HashMap Bytes TwoBitSequence -> Maybe TwoBitSequence
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
sq (HashMap Bytes TwoBitSequence -> Maybe TwoBitSequence)
-> (TwoBitFile -> HashMap Bytes TwoBitSequence)
-> TwoBitFile
-> Maybe TwoBitSequence
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs (TwoBitFile -> Maybe TwoBitSequence)
-> TwoBitFile -> Maybe TwoBitSequence
forall a b. (a -> b) -> a -> b
$ TwoBitFile
tbf

getSeqLength :: TwoBitFile -> Bytes -> Int
getSeqLength :: TwoBitFile -> Bytes -> Int
getSeqLength tbf :: TwoBitFile
tbf chr :: Bytes
chr =
    Int -> (TwoBitSequence -> Int) -> Maybe TwoBitSequence -> Int
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (FilePath -> Int
forall a. HasCallStack => FilePath -> a
error (FilePath -> Int) -> FilePath -> Int
forall a b. (a -> b) -> a -> b
$ Bytes -> FilePath -> FilePath
forall a. Show a => a -> FilePath -> FilePath
shows Bytes
chr " doesn't exist") TwoBitSequence -> Int
tbs_dna_size (Maybe TwoBitSequence -> Int) -> Maybe TwoBitSequence -> Int
forall a b. (a -> b) -> a -> b
$
    Bytes -> HashMap Bytes TwoBitSequence -> Maybe TwoBitSequence
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bytes
chr (TwoBitFile -> HashMap Bytes TwoBitSequence
tbf_seqs TwoBitFile
tbf)

-- | limits a range to a position within the actual sequence
clampPosition :: TwoBitFile -> Range -> Range
clampPosition :: TwoBitFile -> Range -> Range
clampPosition tbf :: TwoBitFile
tbf (Range (Pos n :: Bytes
n start :: Int
start) len :: Int
len) = Position -> Int -> Range
Range (Bytes -> Int -> Position
Pos Bytes
n Int
start') (Int
end' Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start')
  where
    size :: Int
size   = TwoBitFile -> Bytes -> Int
getSeqLength TwoBitFile
tbf Bytes
n
    start' :: Int
start' = if Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0 then Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
start (-Int
size) else Int
start
    end' :: Int
end'   = Int -> Int -> Int
forall a. Ord a => a -> a -> a
min (Int
start Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len) (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ if Int
start Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0 then 0 else Int
size


-- | Sample a piece of random sequence uniformly from the genome.
-- Only pieces that are not hard masked are sampled, soft masking is
-- allowed, but not reported.
-- On a 32bit platform, this will fail for genomes larger than 1G bases.
-- However, if you're running this code on a 32bit platform, you have
-- bigger problems to worry about.
getRandomSeq :: TwoBitFile                   -- ^ 2bit file
             -> Int                          -- ^ desired length
             -> (Int -> g -> (Int, g))       -- ^ draw random int below limit
             -> g                            -- ^ RNG
             -> ((Range, [Nucleotide]), g)   -- ^ position, sequence, new RNG
getRandomSeq :: TwoBitFile
-> Int -> (Int -> g -> (Int, g)) -> g -> ((Range, [Nucleotide]), g)
getRandomSeq tbf :: TwoBitFile
tbf len :: Int
len rndInt :: Int -> g -> (Int, g)
rndInt = g -> ((Range, [Nucleotide]), g)
draw
  where
    names :: [Bytes]
names = TwoBitFile -> [Bytes]
getSeqnames TwoBitFile
tbf
    lengths :: [Int]
lengths = (Bytes -> Int) -> [Bytes] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (TwoBitFile -> Bytes -> Int
getSeqLength TwoBitFile
tbf) [Bytes]
names
    total :: Int
total = [Int] -> Int
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum [Int]
lengths
    frags :: IntMap Bytes
frags = [(Int, Bytes)] -> IntMap Bytes
forall a. [(Int, a)] -> IntMap a
I.fromList ([(Int, Bytes)] -> IntMap Bytes) -> [(Int, Bytes)] -> IntMap Bytes
forall a b. (a -> b) -> a -> b
$ [Int] -> [Bytes] -> [(Int, Bytes)]
forall a b. [a] -> [b] -> [(a, b)]
zip ((Int -> Int -> Int) -> Int -> [Int] -> [Int]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) 0 [Int]
lengths) [Bytes]
names

    draw :: g -> ((Range, [Nucleotide]), g)
draw g0 :: g
g0 | Bool
good      = ((Range
r', [Nucleotide]
sq), g
gn)
            | Bool
otherwise = g -> ((Range, [Nucleotide]), g)
draw g
gn
      where
        (p0 :: Int
p0, gn :: g
gn) = Int -> g -> (Int, g)
rndInt (2Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
total) g
g0
        p :: Int
p = Int
p0 Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` 1
        (o :: Int
o,s :: Bytes
s) = ((Int, Bytes), IntMap Bytes) -> (Int, Bytes)
forall a b. (a, b) -> a
fst (((Int, Bytes), IntMap Bytes) -> (Int, Bytes))
-> ((Int, Bytes), IntMap Bytes) -> (Int, Bytes)
forall a b. (a -> b) -> a -> b
$ Maybe ((Int, Bytes), IntMap Bytes) -> ((Int, Bytes), IntMap Bytes)
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe ((Int, Bytes), IntMap Bytes)
 -> ((Int, Bytes), IntMap Bytes))
-> Maybe ((Int, Bytes), IntMap Bytes)
-> ((Int, Bytes), IntMap Bytes)
forall a b. (a -> b) -> a -> b
$ IntMap Bytes -> Maybe ((Int, Bytes), IntMap Bytes)
forall a. IntMap a -> Maybe ((Int, a), IntMap a)
I.maxViewWithKey (IntMap Bytes -> Maybe ((Int, Bytes), IntMap Bytes))
-> IntMap Bytes -> Maybe ((Int, Bytes), IntMap Bytes)
forall a b. (a -> b) -> a -> b
$ (IntMap Bytes, IntMap Bytes) -> IntMap Bytes
forall a b. (a, b) -> a
fst ((IntMap Bytes, IntMap Bytes) -> IntMap Bytes)
-> (IntMap Bytes, IntMap Bytes) -> IntMap Bytes
forall a b. (a -> b) -> a -> b
$ Int -> IntMap Bytes -> (IntMap Bytes, IntMap Bytes)
forall a. Int -> IntMap a -> (IntMap a, IntMap a)
I.split (Int
pInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) IntMap Bytes
frags
        r' :: Range
r' = (if Int -> Bool
forall a. Integral a => a -> Bool
odd Int
p0 then Range -> Range
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else Range -> Range
reverseRange) (Range -> Range) -> Range -> Range
forall a b. (a -> b) -> a -> b
$ TwoBitFile -> Range -> Range
clampPosition TwoBitFile
tbf (Range -> Range) -> Range -> Range
forall a b. (a -> b) -> a -> b
$ Position -> Int -> Range
Range (Bytes -> Int -> Position
Pos Bytes
s (Int
pInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
o)) Int
len
        sq :: [Nucleotide]
sq = [Maybe Nucleotide] -> [Nucleotide]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe Nucleotide] -> [Nucleotide])
-> [Maybe Nucleotide] -> [Nucleotide]
forall a b. (a -> b) -> a -> b
$ (Nucleotide -> Mask -> Maybe Nucleotide)
-> TwoBitFile -> Range -> [Maybe Nucleotide]
forall a. (Nucleotide -> Mask -> a) -> TwoBitFile -> Range -> [a]
getSubseqWith Nucleotide -> Mask -> Maybe Nucleotide
forall a. a -> Mask -> Maybe a
mask2maybe TwoBitFile
tbf Range
r'
        good :: Bool
good = Range -> Int
r_length Range
r' Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len Bool -> Bool -> Bool
&& [Nucleotide] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotide]
sq Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len

        mask2maybe :: a -> Mask -> Maybe a
mask2maybe n :: a
n None = a -> Maybe a
forall a. a -> Maybe a
Just a
n
        mask2maybe n :: a
n Soft = a -> Maybe a
forall a. a -> Maybe a
Just a
n
        mask2maybe _ Hard = Maybe a
forall a. Maybe a
Nothing
        mask2maybe _ Both = Maybe a
forall a. Maybe a
Nothing

-- | Gets a fragment from a 2bit file.  The result always has the
-- desired length; if necessary, it is padded with Ns.  Be careful about
-- the unconventional encoding: 0..4 == TCAGN
getFragment :: TwoBitFile -> Bytes -> Int -> Int -> U.Vector Word8
getFragment :: TwoBitFile -> Bytes -> Int -> Int -> Vector Word8
getFragment tbf :: TwoBitFile
tbf chr :: Bytes
chr p :: Int
p l :: Int
l =
    case TwoBitFile -> Bytes -> Maybe TwoBitSequence
lookupSequence TwoBitFile
tbf Bytes
chr of
        Nothing  -> Int -> Word8 -> Vector Word8
forall a. Unbox a => Int -> a -> Vector a
U.replicate Int
l 4
        Just tbs :: TwoBitSequence
tbs -> TwoBitFile -> TwoBitSequence -> Int -> Int -> Vector Word8
getFwdSubseqV TwoBitFile
tbf TwoBitSequence
tbs Int
p Int
l

-- Careful about weird encoding: 0..4 == TCAGN
getFwdSubseqV :: TwoBitFile -> TwoBitSequence -> Int -> Int -> U.Vector Word8
getFwdSubseqV :: TwoBitFile -> TwoBitSequence -> Int -> Int -> Vector Word8
getFwdSubseqV TBF{..} TBS{..} start :: Int
start len :: Int
len = Int
-> ((Int, [(Int, Int)]) -> Maybe (Word8, (Int, [(Int, Int)])))
-> (Int, [(Int, Int)])
-> Vector Word8
forall a b. Unbox a => Int -> (b -> Maybe (a, b)) -> b -> Vector a
U.unfoldrN Int
len (Int, [(Int, Int)]) -> Maybe (Word8, (Int, [(Int, Int)]))
forall a.
(Storable a, Bits a, Num a) =>
(Int, [(Int, Int)]) -> Maybe (a, (Int, [(Int, Int)]))
step (Int, [(Int, Int)])
ini
  where
    ini :: (Int, [(Int, Int)])
ini = (Int
start, Int -> IntMap Int -> [(Int, Int)]
takeOverlap Int
start IntMap Int
tbs_n_blocks)

    step :: (Int, [(Int, Int)]) -> Maybe (a, (Int, [(Int, Int)]))
step (off :: Int
off, nbs :: [(Int, Int)]
nbs)
        | Int
off Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0                   = (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (4, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [(Int, Int)]
nbs))
        | Int
off Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
tbs_dna_size       = (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (4, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [(Int, Int)]
nbs))
        | Bool
otherwise = case [(Int, Int)]
nbs of
            [        ]             -> (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (a
y, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [ ]))
            (s :: Int
s,l :: Int
l):nbs' :: [(Int, Int)]
nbs' | Int
off Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
s   -> (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (a
y, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [(Int, Int)]
nbs))
                       | Int
off Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
l -> (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (4, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [(Int, Int)]
nbs))
                       | Bool
otherwise -> (a, (Int, [(Int, Int)])) -> Maybe (a, (Int, [(Int, Int)]))
forall a. a -> Maybe a
Just (a
y, (Int -> Int
forall a. Enum a => a -> a
succ Int
off, [(Int, Int)]
nbs'))
      where
        x :: a
x = IO a -> a
forall a. IO a -> a
unsafeDupablePerformIO (IO a -> a) -> IO a -> a
forall a b. (a -> b) -> a -> b
$
            ForeignPtr CChar -> (Ptr CChar -> IO a) -> IO a
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr CChar
tbf_raw ((Ptr CChar -> IO a) -> IO a) -> (Ptr CChar -> IO a) -> IO a
forall a b. (a -> b) -> a -> b
$ \p :: Ptr CChar
p ->
            Ptr CChar -> Int -> IO a
forall a b. Storable a => Ptr b -> Int -> IO a
peekByteOff Ptr CChar
p (Int
tbs_dna_offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
off Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftR` 2)
        y :: a
y = a
x a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` (6 Int -> Int -> Int
forall a. Num a => a -> a -> a
- 2 Int -> Int -> Int
forall a. Num a => a -> a -> a
* (Int
off Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. 3)) a -> a -> a
forall a. Bits a => a -> a -> a
.&. 3     -- T,C,A,G