{-# OPTIONS_GHC -fno-warn-orphans #-}

module Bio.ABI.Decode
  ( decodeRawSequence
  , decodeRawSequence'
  ) where

import           Control.Applicative        (many)
import           Data.Bifunctor             (bimap)
import           Data.Binary.Get            (getInt16be, runGetOrFail)
import           Data.ByteString            as BS (ByteString)
import           Data.ByteString.Lazy       as BSL (ByteString, fromStrict)
import           Data.ByteString.Lazy.Char8 as BSL8 (unpack)
import           Data.Char                  (ord)
import           Data.Functor               ((<&>))
import           Data.Int                   (Int16)
import           Data.List                  (find)
import           Data.Text                  (Text, pack)
import           Data.Vector                (Vector, fromList)
import           Hyrax.Abif                 (Abif (..), Directory (..))
import           Hyrax.Abif.Read            (getAbif)

import           Bio.Sequence               (SequenceDecodable (..),
                                             weightedSequence)
import           Bio.Sequence.Basecalled    (BasecalledSequence,
                                             BasecalledSequenceWithRawData (..))

-- | Decode ABIF file with additional raw data that may be required for later processing.
decodeRawSequence :: BSL.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence :: ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence ByteString
bs = do
    Abif
abif <- ByteString -> Either Text Abif
getAbif ByteString
bs
    String
sequence' <- Abif -> Either Text String
extractSequence Abif
abif
    [Double]
quality' <- Abif -> Either Text [Double]
extractQuality Abif
abif
    BasecalledSequence
bsSequence <- [Element BasecalledSequence]
-> [Weight BasecalledSequence] -> Either Text BasecalledSequence
forall s (m :: * -> *).
(IsWeightedSequence s, MonadError Text m) =>
[Element s] -> [Weight s] -> m s
weightedSequence String
[Element BasecalledSequence]
sequence' [Double]
[Weight BasecalledSequence]
quality'

    Vector Int16
bsRawG <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
9 Abif
abif Either Text Directory
-> (Directory -> Either Text (Vector Int16))
-> Either Text (Vector Int16)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawA <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
10 Abif
abif Either Text Directory
-> (Directory -> Either Text (Vector Int16))
-> Either Text (Vector Int16)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawT <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
11 Abif
abif Either Text Directory
-> (Directory -> Either Text (Vector Int16))
-> Either Text (Vector Int16)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawC <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
12 Abif
abif Either Text Directory
-> (Directory -> Either Text (Vector Int16))
-> Either Text (Vector Int16)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray

    Vector Int
bsPeakLocations <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PLOC" Int
2 Abif
abif Either Text Directory
-> (Directory -> Either Text (Vector Int16))
-> Either Text (Vector Int16)
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray Either Text (Vector Int16)
-> (Vector Int16 -> Vector Int) -> Either Text (Vector Int)
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> (Int16 -> Int) -> Vector Int16 -> Vector Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Int16 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

    BasecalledSequenceWithRawData
-> Either Text BasecalledSequenceWithRawData
forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequenceWithRawData :: BasecalledSequence
-> Vector Int16
-> Vector Int16
-> Vector Int16
-> Vector Int16
-> Vector Int
-> BasecalledSequenceWithRawData
BasecalledSequenceWithRawData{Vector Int
Vector Int16
BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
..}

-- | Same as 'decodeRawSequence', for strict @ByteString@.
decodeRawSequence' :: BS.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence' :: ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence' = ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence (ByteString -> Either Text BasecalledSequenceWithRawData)
-> (ByteString -> ByteString)
-> ByteString
-> Either Text BasecalledSequenceWithRawData
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BSL.fromStrict

-- | Discards raw data information.
instance SequenceDecodable BasecalledSequenceWithRawData BasecalledSequence where
  sequenceDecode :: BasecalledSequenceWithRawData -> Either Text BasecalledSequence
sequenceDecode = BasecalledSequence -> Either Text BasecalledSequence
forall (f :: * -> *) a. Applicative f => a -> f a
pure (BasecalledSequence -> Either Text BasecalledSequence)
-> (BasecalledSequenceWithRawData -> BasecalledSequence)
-> BasecalledSequenceWithRawData
-> Either Text BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
. BasecalledSequenceWithRawData -> BasecalledSequence
bsSequence

-- | Converts 'Data.ByteString.Lazy.ByteString' (that should be content of ABI file)
-- into 'BasecalledSequence'.
--
instance SequenceDecodable BSL.ByteString BasecalledSequence where
    sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence
    sequenceDecode :: ByteString -> Either Text BasecalledSequence
sequenceDecode ByteString
bs = do
        Abif
abif      <- ByteString -> Either Text Abif
getAbif ByteString
bs
        String
sequence' <- Abif -> Either Text String
extractSequence Abif
abif
        [Double]
quality'  <- Abif -> Either Text [Double]
extractQuality  Abif
abif
        [Element BasecalledSequence]
-> [Weight BasecalledSequence] -> Either Text BasecalledSequence
forall s (m :: * -> *).
(IsWeightedSequence s, MonadError Text m) =>
[Element s] -> [Weight s] -> m s
weightedSequence String
[Element BasecalledSequence]
sequence' [Double]
[Weight BasecalledSequence]
quality'

-- | Converts 'Data.ByteString.ByteString' (that should be content of ABI file)
-- into 'BasecalledSequence'.
--
instance SequenceDecodable BS.ByteString BasecalledSequence where
    sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence
    sequenceDecode :: ByteString -> Either Text BasecalledSequence
sequenceDecode = ByteString -> Either Text BasecalledSequence
forall a s. SequenceDecodable a s => a -> Either Text s
sequenceDecode (ByteString -> Either Text BasecalledSequence)
-> (ByteString -> ByteString)
-> ByteString
-> Either Text BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BSL.fromStrict

-------------------------------------------------------------------------------
-- INTERNAL
-------------------------------------------------------------------------------

-- | Extracts sequence from ABI file.
--
extractSequence :: Abif -> Either Text String
extractSequence :: Abif -> Either Text String
extractSequence Abif
abif = Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PBAS" Int
1 Abif
abif Either Text Directory
-> (Directory -> String) -> Either Text String
forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> ByteString -> String
BSL8.unpack (ByteString -> String)
-> (Directory -> ByteString) -> Directory -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData Either Text String
-> (String -> Either Text String) -> Either Text String
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= String -> Either Text String
checkACGT

-- | Extracts quality from ABI file.
-- Number are encoded with letters, thus we have function @fromIntegral . ord@.
--
extractQuality :: Abif -> Either Text [Double]
extractQuality :: Abif -> Either Text [Double]
extractQuality Abif
abif = (Char -> Double) -> String -> [Double]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Double) -> (Char -> Int) -> Char -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord) (String -> [Double])
-> (Directory -> String) -> Directory -> [Double]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
BSL8.unpack (ByteString -> String)
-> (Directory -> ByteString) -> Directory -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData (Directory -> [Double])
-> Either Text Directory -> Either Text [Double]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PCON" Int
1 Abif
abif

-- | Checks that all chars are from alphabet ACGT
--
checkACGT :: String -> Either Text String
checkACGT :: String -> Either Text String
checkACGT String
str | (Char -> Bool) -> String -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Char -> Bool
validChar String
str = String -> Either Text String
forall a b. b -> Either a b
Right String
str
              | Bool
otherwise         = Text -> Either Text String
forall a b. a -> Either a b
Left Text
"Bio.ABI.Decode: could not parse sequence"
  where
    validChar :: Char -> Bool
    validChar :: Char -> Bool
validChar Char
ch = Char
ch Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'A', Char
'C', Char
'G', Char
'T']

-- | Looks into ABI file and extract data by 'Directory' name.
--
findDataByDirectory
  :: Text -- ^ Directory name
  -> Int  -- ^ 1-based directory index
  -> Abif
  -> Either Text Directory
findDataByDirectory :: Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
dirName Int
dirIndex Abif
abif =
    let directoryM :: Maybe Directory
directoryM = (Directory -> Bool) -> [Directory] -> Maybe Directory
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (\Directory{Int
[Text]
ByteString
Text
ElemType
dTagName :: Directory -> Text
dTagNum :: Directory -> Int
dElemType :: Directory -> ElemType
dElemTypeCode :: Directory -> Int
dElemTypeDesc :: Directory -> Text
dElemSize :: Directory -> Int
dElemNum :: Directory -> Int
dDataSize :: Directory -> Int
dDataOffset :: Directory -> Int
dDataDebug :: Directory -> [Text]
dDataDebug :: [Text]
dData :: ByteString
dDataOffset :: Int
dDataSize :: Int
dElemNum :: Int
dElemSize :: Int
dElemTypeDesc :: Text
dElemTypeCode :: Int
dElemType :: ElemType
dTagNum :: Int
dTagName :: Text
dData :: Directory -> ByteString
..} -> Text
dTagName Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
dirName Bool -> Bool -> Bool
&& Int
dTagNum Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
dirIndex) ([Directory] -> Maybe Directory)
-> (Abif -> [Directory]) -> Abif -> Maybe Directory
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Abif -> [Directory]
aDirs (Abif -> Maybe Directory) -> Abif -> Maybe Directory
forall a b. (a -> b) -> a -> b
$ Abif
abif
    in Either Text Directory
-> (Directory -> Either Text Directory)
-> Maybe Directory
-> Either Text Directory
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Text -> Either Text Directory
forall a b. a -> Either a b
Left Text
errorMsg) Directory -> Either Text Directory
forall a b. b -> Either a b
Right Maybe Directory
directoryM
  where
    errorMsg :: Text
    errorMsg :: Text
errorMsg = Text
"Bio.ABI.Decode: could not find directory " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
dirName Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> Text
" with index " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> String -> Text
pack (Int -> String
forall a. Show a => a -> String
show Int
dirIndex)

decodeShortArray :: Directory -> Either Text (Vector Int16)
decodeShortArray :: Directory -> Either Text (Vector Int16)
decodeShortArray =
  ((ByteString, ByteOffset, String) -> Text)
-> ((ByteString, ByteOffset, [Int16]) -> Vector Int16)
-> Either
     (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16])
-> Either Text (Vector Int16)
forall (p :: * -> * -> *) a b c d.
Bifunctor p =>
(a -> b) -> (c -> d) -> p a c -> p b d
bimap
    (\(ByteString
_, ByteOffset
_, String
msg) -> Text
"Data.ABI.Decode: could not decode short array: " Text -> Text -> Text
forall a. Semigroup a => a -> a -> a
<> String -> Text
pack String
msg)
    (\(ByteString
_, ByteOffset
_, [Int16]
lst) -> [Int16] -> Vector Int16
forall a. [a] -> Vector a
fromList [Int16]
lst)
  (Either
   (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16])
 -> Either Text (Vector Int16))
-> (Directory
    -> Either
         (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16]))
-> Directory
-> Either Text (Vector Int16)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Get [Int16]
-> ByteString
-> Either
     (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16])
forall a.
Get a
-> ByteString
-> Either
     (ByteString, ByteOffset, String) (ByteString, ByteOffset, a)
runGetOrFail (Get Int16 -> Get [Int16]
forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Get Int16
getInt16be)
  (ByteString
 -> Either
      (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16]))
-> (Directory -> ByteString)
-> Directory
-> Either
     (ByteString, ByteOffset, String) (ByteString, ByteOffset, [Int16])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData