{-# OPTIONS_GHC -fno-warn-orphans #-} module Bio.ABI.Decode ( decodeRawSequence , decodeRawSequence' ) where import Control.Applicative (many) import Data.Bifunctor (bimap) import Data.Binary.Get (getInt16be, runGetOrFail) import Data.ByteString as BS (ByteString) import Data.ByteString.Lazy as BSL (ByteString, fromStrict) import Data.ByteString.Lazy.Char8 as BSL8 (unpack) import Data.Char (ord) import Data.Functor ((<&>)) import Data.Int (Int16) import Data.List (find) import Data.Text (Text, pack) import Data.Vector (Vector, fromList) import Hyrax.Abif (Abif (..), Directory (..)) import Hyrax.Abif.Read (getAbif) import Bio.Sequence (SequenceDecodable (..), weightedSequence) import Bio.Sequence.Basecalled (BasecalledSequence, BasecalledSequenceWithRawData (..)) -- | Decode ABIF file with additional raw data that may be required for later processing. decodeRawSequence :: BSL.ByteString -> Either Text BasecalledSequenceWithRawData decodeRawSequence bs = do abif <- getAbif bs sequence' <- extractSequence abif quality' <- extractQuality abif bsSequence <- weightedSequence sequence' quality' bsRawG <- findDataByDirectory "DATA" 9 abif >>= decodeShortArray bsRawA <- findDataByDirectory "DATA" 10 abif >>= decodeShortArray bsRawT <- findDataByDirectory "DATA" 11 abif >>= decodeShortArray bsRawC <- findDataByDirectory "DATA" 12 abif >>= decodeShortArray bsPeakLocations <- findDataByDirectory "PLOC" 2 abif >>= decodeShortArray <&> fmap fromIntegral return BasecalledSequenceWithRawData{..} -- | Same as 'decodeRawSequence', for strict @ByteString@. decodeRawSequence' :: BS.ByteString -> Either Text BasecalledSequenceWithRawData decodeRawSequence' = decodeRawSequence . BSL.fromStrict -- | Discards raw data information. instance SequenceDecodable BasecalledSequenceWithRawData BasecalledSequence where sequenceDecode = pure . bsSequence -- | Converts 'Data.ByteString.Lazy.ByteString' (that should be content of ABI file) -- into 'BasecalledSequence'. -- instance SequenceDecodable BSL.ByteString BasecalledSequence where sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence sequenceDecode bs = do abif <- getAbif bs sequence' <- extractSequence abif quality' <- extractQuality abif weightedSequence sequence' quality' -- | Converts 'Data.ByteString.ByteString' (that should be content of ABI file) -- into 'BasecalledSequence'. -- instance SequenceDecodable BS.ByteString BasecalledSequence where sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence sequenceDecode = sequenceDecode . BSL.fromStrict ------------------------------------------------------------------------------- -- INTERNAL ------------------------------------------------------------------------------- -- | Extracts sequence from ABI file. -- extractSequence :: Abif -> Either Text String extractSequence abif = findDataByDirectory "PBAS" 1 abif <&> BSL8.unpack . dData >>= checkACGT -- | Extracts quality from ABI file. -- Number are encoded with letters, thus we have function @fromIntegral . ord@. -- extractQuality :: Abif -> Either Text [Double] extractQuality abif = map (fromIntegral . ord) . BSL8.unpack . dData <$> findDataByDirectory "PCON" 1 abif -- | Checks that all chars are from alphabet ACGT -- checkACGT :: String -> Either Text String checkACGT str | all validChar str = Right str | otherwise = Left "Bio.ABI.Decode: could not parse sequence" where validChar :: Char -> Bool validChar ch = ch `elem` ['A', 'C', 'G', 'T'] -- | Looks into ABI file and extract data by 'Directory' name. -- findDataByDirectory :: Text -- ^ Directory name -> Int -- ^ 1-based directory index -> Abif -> Either Text Directory findDataByDirectory dirName dirIndex abif = let directoryM = find (\Directory{..} -> dTagName == dirName && dTagNum == dirIndex) . aDirs $ abif in maybe (Left errorMsg) Right directoryM where errorMsg :: Text errorMsg = "Bio.ABI.Decode: could not find directory " <> dirName <> " with index " <> pack (show dirIndex) decodeShortArray :: Directory -> Either Text (Vector Int16) decodeShortArray = bimap (\(_, _, msg) -> "Data.ABI.Decode: could not decode short array: " <> pack msg) (\(_, _, lst) -> fromList lst) . runGetOrFail (many getInt16be) . dData