{-# OPTIONS_GHC -fno-warn-orphans #-}
module Bio.ABI.Decode
( decodeRawSequence
, decodeRawSequence'
) where
import Control.Applicative (many)
import Data.Bifunctor (bimap)
import Data.Binary.Get (getInt16be, runGetOrFail)
import Data.ByteString as BS (ByteString)
import Data.ByteString.Lazy as BSL (ByteString, fromStrict)
import Data.ByteString.Lazy.Char8 as BSL8 (unpack)
import Data.Char (ord)
import Data.Functor ((<&>))
import Data.Int (Int16)
import Data.List (find)
import Data.Text (Text, pack)
import Data.Vector (Vector, fromList)
import Hyrax.Abif (Abif (..), Directory (..))
import Hyrax.Abif.Read (getAbif)
import Bio.Sequence (SequenceDecodable (..),
weightedSequence)
import Bio.Sequence.Basecalled (BasecalledSequence,
BasecalledSequenceWithRawData (..))
decodeRawSequence :: BSL.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence bs = do
abif <- getAbif bs
sequence' <- extractSequence abif
quality' <- extractQuality abif
bsSequence <- weightedSequence sequence' quality'
bsRawG <- findDataByDirectory "DATA" 9 abif >>= decodeShortArray
bsRawA <- findDataByDirectory "DATA" 10 abif >>= decodeShortArray
bsRawT <- findDataByDirectory "DATA" 11 abif >>= decodeShortArray
bsRawC <- findDataByDirectory "DATA" 12 abif >>= decodeShortArray
bsPeakLocations <- findDataByDirectory "PLOC" 2 abif >>= decodeShortArray <&> fmap fromIntegral
return BasecalledSequenceWithRawData{..}
decodeRawSequence' :: BS.ByteString -> Either Text BasecalledSequenceWithRawData
decodeRawSequence' = decodeRawSequence . BSL.fromStrict
instance SequenceDecodable BasecalledSequenceWithRawData BasecalledSequence where
sequenceDecode = pure . bsSequence
instance SequenceDecodable BSL.ByteString BasecalledSequence where
sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence
sequenceDecode bs = do
abif <- getAbif bs
sequence' <- extractSequence abif
quality' <- extractQuality abif
weightedSequence sequence' quality'
instance SequenceDecodable BS.ByteString BasecalledSequence where
sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence
sequenceDecode = sequenceDecode . BSL.fromStrict
extractSequence :: Abif -> Either Text String
extractSequence abif = findDataByDirectory "PBAS" 1 abif <&> BSL8.unpack . dData >>= checkACGT
extractQuality :: Abif -> Either Text [Double]
extractQuality abif = map (fromIntegral . ord) . BSL8.unpack . dData <$> findDataByDirectory "PCON" 1 abif
checkACGT :: String -> Either Text String
checkACGT str | all validChar str = Right str
| otherwise = Left "Bio.ABI.Decode: could not parse sequence"
where
validChar :: Char -> Bool
validChar ch = ch `elem` ['A', 'C', 'G', 'T']
findDataByDirectory
:: Text
-> Int
-> Abif
-> Either Text Directory
findDataByDirectory dirName dirIndex abif =
let directoryM = find (\Directory{..} -> dTagName == dirName && dTagNum == dirIndex) . aDirs $ abif
in maybe (Left errorMsg) Right directoryM
where
errorMsg :: Text
errorMsg = "Bio.ABI.Decode: could not find directory " <> dirName <> " with index " <> pack (show dirIndex)
decodeShortArray :: Directory -> Either Text (Vector Int16)
decodeShortArray =
bimap
(\(_, _, msg) -> "Data.ABI.Decode: could not decode short array: " <> pack msg)
(\(_, _, lst) -> fromList lst)
. runGetOrFail (many getInt16be)
. dData