{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}

module Bio.Seq.IO
    ( Genome
    , openGenome
    , closeGenome
    , withGenome
    , getSeq
    , getChrom
    , getChrSizes
    , mkIndex
    ) where

import Control.Exception (bracket)
import Conduit
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Lazy as M
import Data.List.Split
import System.IO

import Bio.Seq
import Bio.Utils.Misc (readInt)
import Bio.Data.Fasta (fastaReader)

-- | The first 2048 bytes are header. Header consists of a magic string,
-- followed by chromosome information. Example:
-- <HASKELLBIOINFORMATICS>\nCHR1 START SIZE
data Genome = Genome !Handle !IndexTable !Int

type IndexTable = M.HashMap B.ByteString (Int, Int)

magic :: B.ByteString
magic :: ByteString
magic = ByteString
"<HASKELLBIOINFORMATICS_7d2c5gxhg934>"

openGenome :: FilePath -> IO Genome
openGenome :: FilePath -> IO Genome
openGenome FilePath
fl = do
    Handle
h <- FilePath -> IOMode -> IO Handle
openFile FilePath
fl IOMode
ReadMode
    ByteString
sig <- Handle -> IO ByteString
B.hGetLine Handle
h
    if ByteString
sig ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ByteString
magic
        then do
            ByteString
header <- Handle -> IO ByteString
B.hGetLine Handle
h
            Genome -> IO Genome
forall (m :: * -> *) a. Monad m => a -> m a
return (Genome -> IO Genome) -> Genome -> IO Genome
forall a b. (a -> b) -> a -> b
$ Handle -> IndexTable -> Int -> Genome
Genome Handle
h (ByteString -> IndexTable
getIndex ByteString
header) (ByteString -> Int
B.length ByteString
sig Int -> Int -> Int
forall a. Num a => a -> a -> a
+ ByteString -> Int
B.length ByteString
header Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
2)
        else FilePath -> IO Genome
forall a. HasCallStack => FilePath -> a
error FilePath
"Bio.Seq.Query.openGenome: Incorrect format"
  where
    getIndex :: ByteString -> IndexTable
getIndex = [(ByteString, (Int, Int))] -> IndexTable
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
M.fromList ([(ByteString, (Int, Int))] -> IndexTable)
-> (ByteString -> [(ByteString, (Int, Int))])
-> ByteString
-> IndexTable
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([ByteString] -> (ByteString, (Int, Int)))
-> [[ByteString]] -> [(ByteString, (Int, Int))]
forall a b. (a -> b) -> [a] -> [b]
map [ByteString] -> (ByteString, (Int, Int))
f ([[ByteString]] -> [(ByteString, (Int, Int))])
-> (ByteString -> [[ByteString]])
-> ByteString
-> [(ByteString, (Int, Int))]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [ByteString] -> [[ByteString]]
forall e. Int -> [e] -> [[e]]
chunksOf Int
3 ([ByteString] -> [[ByteString]])
-> (ByteString -> [ByteString]) -> ByteString -> [[ByteString]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
B.words
      where
        f :: [ByteString] -> (ByteString, (Int, Int))
f [ByteString
k, ByteString
v, ByteString
l] = (ByteString
k, (ByteString -> Int
readInt ByteString
v, ByteString -> Int
readInt ByteString
l))
        f [ByteString]
_ = FilePath -> (ByteString, (Int, Int))
forall a. HasCallStack => FilePath -> a
error FilePath
"error"
{-# INLINE openGenome #-}

closeGenome :: Genome -> IO ()
closeGenome :: Genome -> IO ()
closeGenome (Genome Handle
h IndexTable
_ Int
_) = Handle -> IO ()
hClose Handle
h
{-# INLINE closeGenome #-}

withGenome :: FilePath -> (Genome -> IO a) -> IO a
withGenome :: FilePath -> (Genome -> IO a) -> IO a
withGenome FilePath
fl Genome -> IO a
fn = IO Genome -> (Genome -> IO ()) -> (Genome -> IO a) -> IO a
forall a b c. IO a -> (a -> IO b) -> (a -> IO c) -> IO c
bracket (FilePath -> IO Genome
openGenome FilePath
fl) Genome -> IO ()
closeGenome Genome -> IO a
fn
{-# INLINE withGenome #-}

-- | A query is represented by a tuple: (chr, start, end) and is
-- zero-based index, half-close-half-open
type Query = (B.ByteString, Int, Int)

-- | Retrieve sequence.
getSeq :: BioSeq s a => Genome -> Query -> IO (Either String (s a))
getSeq :: Genome -> Query -> IO (Either FilePath (s a))
getSeq (Genome Handle
h IndexTable
index Int
headerSize) (ByteString
chr, Int
start, Int
end) = case ByteString -> IndexTable -> Maybe (Int, Int)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup ByteString
chr IndexTable
index of
    Just (Int
chrStart, Int
chrSize) ->
        if Int
end Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
chrSize
            then Either FilePath (s a) -> IO (Either FilePath (s a))
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath (s a) -> IO (Either FilePath (s a)))
-> Either FilePath (s a) -> IO (Either FilePath (s a))
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath (s a)
forall a b. a -> Either a b
Left (FilePath -> Either FilePath (s a))
-> FilePath -> Either FilePath (s a)
forall a b. (a -> b) -> a -> b
$ FilePath
"Bio.Seq.getSeq: out of index: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ Int -> FilePath
forall a. Show a => a -> FilePath
show Int
end FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++
                FilePath
">" FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ Int -> FilePath
forall a. Show a => a -> FilePath
show Int
chrSize
            else do
                Handle -> SeekMode -> Integer -> IO ()
hSeek Handle
h SeekMode
AbsoluteSeek (Integer -> IO ()) -> Integer -> IO ()
forall a b. (a -> b) -> a -> b
$ Int -> Integer
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Integer) -> Int -> Integer
forall a b. (a -> b) -> a -> b
$ Int
headerSize Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
chrStart Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
start
                ByteString -> Either FilePath (s a)
forall (seq :: * -> *) alphabet.
BioSeq seq alphabet =>
ByteString -> Either FilePath (seq alphabet)
fromBS (ByteString -> Either FilePath (s a))
-> IO ByteString -> IO (Either FilePath (s a))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Handle -> Int -> IO ByteString
B.hGet Handle
h (Int
end Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
start)
    Maybe (Int, Int)
_ -> Either FilePath (s a) -> IO (Either FilePath (s a))
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath (s a) -> IO (Either FilePath (s a)))
-> Either FilePath (s a) -> IO (Either FilePath (s a))
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath (s a)
forall a b. a -> Either a b
Left (FilePath -> Either FilePath (s a))
-> FilePath -> Either FilePath (s a)
forall a b. (a -> b) -> a -> b
$ FilePath
"Bio.Seq.getSeq: Cannot find " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ ByteString -> FilePath
forall a. Show a => a -> FilePath
show ByteString
chr
{-# INLINE getSeq #-}

-- | Retrieve whole chromosome.
getChrom :: Genome -> B.ByteString -> IO (Either String (DNA IUPAC))
getChrom :: Genome -> ByteString -> IO (Either FilePath (DNA IUPAC))
getChrom Genome
g ByteString
chr = case ByteString -> [(ByteString, Int)] -> Maybe Int
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ByteString
chr [(ByteString, Int)]
chrSize of
    Just Int
s -> Genome -> Query -> IO (Either FilePath (DNA IUPAC))
forall (s :: * -> *) a.
BioSeq s a =>
Genome -> Query -> IO (Either FilePath (s a))
getSeq Genome
g (ByteString
chr, Int
0, Int
s)
    Maybe Int
_ -> Either FilePath (DNA IUPAC) -> IO (Either FilePath (DNA IUPAC))
forall (m :: * -> *) a. Monad m => a -> m a
return (Either FilePath (DNA IUPAC) -> IO (Either FilePath (DNA IUPAC)))
-> Either FilePath (DNA IUPAC) -> IO (Either FilePath (DNA IUPAC))
forall a b. (a -> b) -> a -> b
$ FilePath -> Either FilePath (DNA IUPAC)
forall a b. a -> Either a b
Left FilePath
"Unknown chromosome"
  where
    chrSize :: [(ByteString, Int)]
chrSize = Genome -> [(ByteString, Int)]
getChrSizes Genome
g
{-# INLINE getChrom #-}

-- | Retrieve chromosome size information.
getChrSizes :: Genome -> [(B.ByteString, Int)]
getChrSizes :: Genome -> [(ByteString, Int)]
getChrSizes (Genome Handle
_ IndexTable
table Int
_) = ((ByteString, (Int, Int)) -> (ByteString, Int))
-> [(ByteString, (Int, Int))] -> [(ByteString, Int)]
forall a b. (a -> b) -> [a] -> [b]
map (\(ByteString
k, (Int
_, Int
l)) -> (ByteString
k, Int
l)) ([(ByteString, (Int, Int))] -> [(ByteString, Int)])
-> (IndexTable -> [(ByteString, (Int, Int))])
-> IndexTable
-> [(ByteString, Int)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IndexTable -> [(ByteString, (Int, Int))]
forall k v. HashMap k v -> [(k, v)]
M.toList (IndexTable -> [(ByteString, Int)])
-> IndexTable -> [(ByteString, Int)]
forall a b. (a -> b) -> a -> b
$ IndexTable
table
{-# INLINE getChrSizes #-}

-- | Indexing a genome.
mkIndex :: [FilePath]    -- ^ A list of fasta files. Each file can have multiple
                         -- chromosomes.
        -> FilePath      -- ^ output file
        -> IO ()
mkIndex :: [FilePath] -> FilePath -> IO ()
mkIndex [FilePath]
fls FilePath
outFl = FilePath -> IOMode -> (Handle -> IO ()) -> IO ()
forall r. FilePath -> IOMode -> (Handle -> IO r) -> IO r
withFile FilePath
outFl IOMode
WriteMode ((Handle -> IO ()) -> IO ()) -> (Handle -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Handle
outH -> do
    ([(ByteString, Int)]
chrs, [ByteString]
dnas) <- ([((ByteString, Int), ByteString)]
-> ([(ByteString, Int)], [ByteString])
forall a b. [(a, b)] -> ([a], [b])
unzip ([((ByteString, Int), ByteString)]
 -> ([(ByteString, Int)], [ByteString]))
-> ([[((ByteString, Int), ByteString)]]
    -> [((ByteString, Int), ByteString)])
-> [[((ByteString, Int), ByteString)]]
-> ([(ByteString, Int)], [ByteString])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[((ByteString, Int), ByteString)]]
-> [((ByteString, Int), ByteString)]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat) ([[((ByteString, Int), ByteString)]]
 -> ([(ByteString, Int)], [ByteString]))
-> IO [[((ByteString, Int), ByteString)]]
-> IO ([(ByteString, Int)], [ByteString])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (FilePath -> IO [((ByteString, Int), ByteString)])
-> [FilePath] -> IO [[((ByteString, Int), ByteString)]]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM FilePath -> IO [((ByteString, Int), ByteString)]
readSeq [FilePath]
fls
    Handle -> ByteString -> IO ()
B.hPutStr Handle
outH (ByteString -> IO ()) -> ByteString -> IO ()
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ByteString
B.unlines [ByteString
magic, [(ByteString, Int)] -> ByteString
mkHeader [(ByteString, Int)]
chrs]
    (ByteString -> IO ()) -> [ByteString] -> IO ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (Handle -> ByteString -> IO ()
B.hPutStr Handle
outH) [ByteString]
dnas
  where
    readSeq :: FilePath -> IO [((ByteString, Int), ByteString)]
readSeq FilePath
fl = ResourceT IO [((ByteString, Int), ByteString)]
-> IO [((ByteString, Int), ByteString)]
forall (m :: * -> *) a. MonadUnliftIO m => ResourceT m a -> m a
runResourceT (ResourceT IO [((ByteString, Int), ByteString)]
 -> IO [((ByteString, Int), ByteString)])
-> ResourceT IO [((ByteString, Int), ByteString)]
-> IO [((ByteString, Int), ByteString)]
forall a b. (a -> b) -> a -> b
$ ConduitT () Void (ResourceT IO) [((ByteString, Int), ByteString)]
-> ResourceT IO [((ByteString, Int), ByteString)]
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (ResourceT IO) [((ByteString, Int), ByteString)]
 -> ResourceT IO [((ByteString, Int), ByteString)])
-> ConduitT
     () Void (ResourceT IO) [((ByteString, Int), ByteString)]
-> ResourceT IO [((ByteString, Int), ByteString)]
forall a b. (a -> b) -> a -> b
$ FilePath
-> ConduitT () (ByteString, [ByteString]) (ResourceT IO) ()
forall i.
FilePath -> ConduitT i (ByteString, [ByteString]) (ResourceT IO) ()
fastaReader FilePath
fl ConduitT () (ByteString, [ByteString]) (ResourceT IO) ()
-> ConduitM
     (ByteString, [ByteString])
     Void
     (ResourceT IO)
     [((ByteString, Int), ByteString)]
-> ConduitT
     () Void (ResourceT IO) [((ByteString, Int), ByteString)]
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT
  (ByteString, [ByteString])
  ((ByteString, Int), ByteString)
  (ResourceT IO)
  ()
conduit ConduitT
  (ByteString, [ByteString])
  ((ByteString, Int), ByteString)
  (ResourceT IO)
  ()
-> ConduitM
     ((ByteString, Int), ByteString)
     Void
     (ResourceT IO)
     [((ByteString, Int), ByteString)]
-> ConduitM
     (ByteString, [ByteString])
     Void
     (ResourceT IO)
     [((ByteString, Int), ByteString)]
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM
  ((ByteString, Int), ByteString)
  Void
  (ResourceT IO)
  [((ByteString, Int), ByteString)]
forall (m :: * -> *) a o. Monad m => ConduitT a o m [a]
sinkList
      where
        conduit :: ConduitT
  (ByteString, [ByteString])
  ((ByteString, Int), ByteString)
  (ResourceT IO)
  ()
conduit = ((ByteString, [ByteString])
 -> ConduitT
      (ByteString, [ByteString])
      ((ByteString, Int), ByteString)
      (ResourceT IO)
      ())
-> ConduitT
     (ByteString, [ByteString])
     ((ByteString, Int), ByteString)
     (ResourceT IO)
     ()
forall (m :: * -> *) i o r.
Monad m =>
(i -> ConduitT i o m r) -> ConduitT i o m ()
awaitForever (((ByteString, [ByteString])
  -> ConduitT
       (ByteString, [ByteString])
       ((ByteString, Int), ByteString)
       (ResourceT IO)
       ())
 -> ConduitT
      (ByteString, [ByteString])
      ((ByteString, Int), ByteString)
      (ResourceT IO)
      ())
-> ((ByteString, [ByteString])
    -> ConduitT
         (ByteString, [ByteString])
         ((ByteString, Int), ByteString)
         (ResourceT IO)
         ())
-> ConduitT
     (ByteString, [ByteString])
     ((ByteString, Int), ByteString)
     (ResourceT IO)
     ()
forall a b. (a -> b) -> a -> b
$ \(ByteString
chrName, [ByteString]
seqs) -> do
            let dna :: ByteString
dna = [ByteString] -> ByteString
B.concat [ByteString]
seqs
            ((ByteString, Int), ByteString)
-> ConduitT
     (ByteString, [ByteString])
     ((ByteString, Int), ByteString)
     (ResourceT IO)
     ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (([ByteString] -> ByteString
forall a. [a] -> a
head ([ByteString] -> ByteString) -> [ByteString] -> ByteString
forall a b. (a -> b) -> a -> b
$ ByteString -> [ByteString]
B.words ByteString
chrName, ByteString -> Int
B.length ByteString
dna), ByteString
dna)
{-# INLINE mkIndex #-}

mkHeader :: [(B.ByteString, Int)] -> B.ByteString
mkHeader :: [(ByteString, Int)] -> ByteString
mkHeader [(ByteString, Int)]
xs = [ByteString] -> ByteString
B.unwords([ByteString] -> ByteString)
-> (([ByteString], Int) -> [ByteString])
-> ([ByteString], Int)
-> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.([ByteString], Int) -> [ByteString]
forall a b. (a, b) -> a
fst (([ByteString], Int) -> ByteString)
-> ([ByteString], Int) -> ByteString
forall a b. (a -> b) -> a -> b
$ (([ByteString], Int) -> (ByteString, Int) -> ([ByteString], Int))
-> ([ByteString], Int)
-> [(ByteString, Int)]
-> ([ByteString], Int)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl ([ByteString], Int) -> (ByteString, Int) -> ([ByteString], Int)
forall b.
(Show b, Num b) =>
([ByteString], b) -> (ByteString, b) -> ([ByteString], b)
f ([], Int
0) [(ByteString, Int)]
xs
  where
    f :: ([ByteString], b) -> (ByteString, b) -> ([ByteString], b)
f ([ByteString]
s, b
i) (ByteString
s', b
i') = ([ByteString]
s [ByteString] -> [ByteString] -> [ByteString]
forall a. [a] -> [a] -> [a]
++ [ByteString
s', FilePath -> ByteString
B.pack (FilePath -> ByteString) -> FilePath -> ByteString
forall a b. (a -> b) -> a -> b
$ b -> FilePath
forall a. Show a => a -> FilePath
show b
i, FilePath -> ByteString
B.pack (FilePath -> ByteString) -> FilePath -> ByteString
forall a b. (a -> b) -> a -> b
$ b -> FilePath
forall a. Show a => a -> FilePath
show b
i'], b
i b -> b -> b
forall a. Num a => a -> a -> a
+ b
i')
{-# INLINE mkHeader #-}