{-# OPTIONS_GHC -fno-warn-orphans #-}

module Bio.ABI.Decode
  ( decodeRawSequence
  , decodeRawSequence'
  ) where

import           Control.Applicative        (many)
import           Data.Bifunctor             (bimap)
import           Data.Binary.Get            (getInt16be, runGetOrFail)
import           Data.ByteString            as BS (ByteString)
import           Data.ByteString.Lazy       as BSL (ByteString, fromStrict)
import           Data.ByteString.Lazy.Char8 as BSL8 (unpack)
import           Data.Char                  (ord)
import           Data.Functor               ((<&>))
import           Data.Int                   (Int16)
import           Data.List                  (find)
import           Data.Text                  (Text, pack)
import           Data.Vector                (Vector, fromList)
import           Hyrax.Abif                 (Abif (..), Directory (..))
import           Hyrax.Abif.Read            (getAbif)

import           Bio.Sequence               (SequenceDecodable (..),
                                             weightedSequence)
import           Bio.Sequence.Basecalled    (BasecalledSequence,
                                             BasecalledSequenceWithRawData (..))

-- | Decode ABIF file with additional raw data that may be required for later processing.
decodeRawSequence :: BSL.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence :: ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence ByteString
bs = do
    Abif
abif <- ByteString -> Either Text Abif
getAbif ByteString
bs
    String
sequence' <- Abif -> Either Text String
extractSequence Abif
abif
    [Double]
quality' <- Abif -> Either Text [Double]
extractQuality Abif
abif
    BasecalledSequence
bsSequence <- forall s (m :: * -> *).
(IsWeightedSequence s, MonadError Text m) =>
[Element s] -> [Weight s] -> m s
weightedSequence String
sequence' [Double]
quality'

    Vector Int16
bsRawG <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
9 Abif
abif forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawA <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
10 Abif
abif forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawT <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
11 Abif
abif forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray
    Vector Int16
bsRawC <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"DATA" Int
12 Abif
abif forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray

    Vector Int
bsPeakLocations <- Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PLOC" Int
2 Abif
abif forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Directory -> Either Text (Vector Int16)
decodeShortArray forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a b. (Integral a, Num b) => a -> b
fromIntegral

    forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequenceWithRawData{Vector Int
Vector Int16
BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
..}

-- | Same as 'decodeRawSequence', for strict @ByteString@.
decodeRawSequence' :: BS.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence' :: ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence' = ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BSL.fromStrict

-- | Discards raw data information.
instance SequenceDecodable BasecalledSequenceWithRawData BasecalledSequence where
  sequenceDecode :: BasecalledSequenceWithRawData -> Either Text BasecalledSequence
sequenceDecode = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall b c a. (b -> c) -> (a -> b) -> a -> c
. BasecalledSequenceWithRawData -> BasecalledSequence
bsSequence

-- | Converts 'Data.ByteString.Lazy.ByteString' (that should be content of ABI file)
-- into 'BasecalledSequence'.
--
instance SequenceDecodable BSL.ByteString BasecalledSequence where
    sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence
    sequenceDecode :: ByteString -> Either Text BasecalledSequence
sequenceDecode ByteString
bs = do
        Abif
abif      <- ByteString -> Either Text Abif
getAbif ByteString
bs
        String
sequence' <- Abif -> Either Text String
extractSequence Abif
abif
        [Double]
quality'  <- Abif -> Either Text [Double]
extractQuality  Abif
abif
        forall s (m :: * -> *).
(IsWeightedSequence s, MonadError Text m) =>
[Element s] -> [Weight s] -> m s
weightedSequence String
sequence' [Double]
quality'

-- | Converts 'Data.ByteString.ByteString' (that should be content of ABI file)
-- into 'BasecalledSequence'.
--
instance SequenceDecodable BS.ByteString BasecalledSequence where
    sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence
    sequenceDecode :: ByteString -> Either Text BasecalledSequence
sequenceDecode = forall a s. SequenceDecodable a s => a -> Either Text s
sequenceDecode forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BSL.fromStrict

-------------------------------------------------------------------------------
-- INTERNAL
-------------------------------------------------------------------------------

-- | Extracts sequence from ABI file.
--
extractSequence :: Abif -> Either Text String
extractSequence :: Abif -> Either Text String
extractSequence Abif
abif = Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PBAS" Int
1 Abif
abif forall (f :: * -> *) a b. Functor f => f a -> (a -> b) -> f b
<&> ByteString -> String
BSL8.unpack forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= String -> Either Text String
checkACGT

-- | Extracts quality from ABI file.
-- Number are encoded with letters, thus we have function @fromIntegral . ord@.
--
extractQuality :: Abif -> Either Text [Double]
extractQuality :: Abif -> Either Text [Double]
extractQuality Abif
abif = forall a b. (a -> b) -> [a] -> [b]
map (forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord) forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> String
BSL8.unpack forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
"PCON" Int
1 Abif
abif

-- | Checks that all chars are from alphabet ACGT
--
checkACGT :: String -> Either Text String
checkACGT :: String -> Either Text String
checkACGT String
str | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Char -> Bool
validChar String
str = forall a b. b -> Either a b
Right String
str
              | Bool
otherwise         = forall a b. a -> Either a b
Left Text
"Bio.ABI.Decode: could not parse sequence"
  where
    validChar :: Char -> Bool
    validChar :: Char -> Bool
validChar Char
ch = Char
ch forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'A', Char
'C', Char
'G', Char
'T']

-- | Looks into ABI file and extract data by 'Directory' name.
--
findDataByDirectory
  :: Text -- ^ Directory name
  -> Int  -- ^ 1-based directory index
  -> Abif
  -> Either Text Directory
findDataByDirectory :: Text -> Int -> Abif -> Either Text Directory
findDataByDirectory Text
dirName Int
dirIndex Abif
abif =
    let directoryM :: Maybe Directory
directoryM = forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Maybe a
find (\Directory{Int
[Text]
ByteString
Text
ElemType
dTagName :: Directory -> Text
dTagNum :: Directory -> Int
dElemType :: Directory -> ElemType
dElemTypeCode :: Directory -> Int
dElemTypeDesc :: Directory -> Text
dElemSize :: Directory -> Int
dElemNum :: Directory -> Int
dDataSize :: Directory -> Int
dDataOffset :: Directory -> Int
dDataDebug :: Directory -> [Text]
dDataDebug :: [Text]
dData :: ByteString
dDataOffset :: Int
dDataSize :: Int
dElemNum :: Int
dElemSize :: Int
dElemTypeDesc :: Text
dElemTypeCode :: Int
dElemType :: ElemType
dTagNum :: Int
dTagName :: Text
dData :: Directory -> ByteString
..} -> Text
dTagName forall a. Eq a => a -> a -> Bool
== Text
dirName Bool -> Bool -> Bool
&& Int
dTagNum forall a. Eq a => a -> a -> Bool
== Int
dirIndex) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Abif -> [Directory]
aDirs forall a b. (a -> b) -> a -> b
$ Abif
abif
    in forall b a. b -> (a -> b) -> Maybe a -> b
maybe (forall a b. a -> Either a b
Left Text
errorMsg) forall a b. b -> Either a b
Right Maybe Directory
directoryM
  where
    errorMsg :: Text
    errorMsg :: Text
errorMsg = Text
"Bio.ABI.Decode: could not find directory " forall a. Semigroup a => a -> a -> a
<> Text
dirName forall a. Semigroup a => a -> a -> a
<> Text
" with index " forall a. Semigroup a => a -> a -> a
<> String -> Text
pack (forall a. Show a => a -> String
show Int
dirIndex)

decodeShortArray :: Directory -> Either Text (Vector Int16)
decodeShortArray :: Directory -> Either Text (Vector Int16)
decodeShortArray =
  forall (p :: * -> * -> *) a b c d.
Bifunctor p =>
(a -> b) -> (c -> d) -> p a c -> p b d
bimap
    (\(ByteString
_, ByteOffset
_, String
msg) -> Text
"Data.ABI.Decode: could not decode short array: " forall a. Semigroup a => a -> a -> a
<> String -> Text
pack String
msg)
    (\(ByteString
_, ByteOffset
_, [Int16]
lst) -> forall a. [a] -> Vector a
fromList [Int16]
lst)
  forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a.
Get a
-> ByteString
-> Either
     (ByteString, ByteOffset, String) (ByteString, ByteOffset, a)
runGetOrFail (forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Get Int16
getInt16be)
  forall b c a. (b -> c) -> (a -> b) -> a -> c
. Directory -> ByteString
dData