{-# OPTIONS_GHC -fno-warn-orphans #-}

-- | 'SequenceDecodable' instances that turn the raw bytes of an ABI file
-- into a 'BasecalledSequence'.
--
-- The module exports no names; importing it only brings the instances into
-- scope. The class and the types involved are all imported from other
-- modules, which is why the orphan-instance warning is switched off above.
--
module Bio.ABI.Decode () where

import Bio.Sequence (SequenceDecodable (..), weightedSequence)
import Bio.Sequence.Basecalled (BasecalledSequence)
import Data.ByteString as BS (ByteString)
import Data.ByteString.Lazy as BSL (ByteString, fromStrict)
import Data.ByteString.Lazy.Char8 as BSL8 (unpack)
import Data.Char (ord)
import Data.List (elem, find)
import Data.Maybe (maybe)
import Data.Text (Text)
import Hyrax.Abif (Abif (..), Directory (..))
import Hyrax.Abif.Read (getAbif)

-- | Decodes a lazy 'Data.ByteString.Lazy.ByteString' (expected to hold the
-- content of an ABI file) into a 'BasecalledSequence'.
--
-- The steps run in order and the first 'Left' aborts the decoding:
-- parse the bytes with 'getAbif', read the base calls, read the quality
-- values, and finally combine both with 'weightedSequence'.
--
instance SequenceDecodable BSL.ByteString BasecalledSequence where
    sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence
    sequenceDecode rawBytes = do
        parsed <- getAbif rawBytes
        bases  <- extractSequence parsed
        scores <- extractQuality parsed
        weightedSequence bases scores

-- | Decodes a strict 'Data.ByteString.ByteString' (expected to hold the
-- content of an ABI file) into a 'BasecalledSequence'.
--
-- The input is converted to a lazy byte string and handed to the lazy
-- instance.
--
instance SequenceDecodable BS.ByteString BasecalledSequence where
    sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence
    sequenceDecode strictBytes = sequenceDecode (BSL.fromStrict strictBytes)

-------------------------------------------------------------------------------
-- INTERNAL
-------------------------------------------------------------------------------

-- | Reads the base calls from the @PBAS@ directory of a parsed ABI file.
--
-- Returns 'Left' when the directory is missing or when its content has a
-- character outside of the ACGT alphabet.
--
extractSequence :: Abif -> Either Text String
extractSequence abif = checkACGT =<< findDataByDirectory "PBAS" abif

-- | Reads the quality values from the @PCON@ directory of a parsed ABI file.
--
-- The values are stored as characters, so each one is converted to a number
-- through its character code (@fromIntegral . ord@).
--
extractQuality :: Abif -> Either Text [Double]
extractQuality abif = fmap (map charToScore) (findDataByDirectory "PCON" abif)
  where
    charToScore :: Char -> Double
    charToScore = fromIntegral . ord

-- | Accepts a string only if every character is one of @A@, @C@, @G@, @T@.
--
-- The accepted string is returned unchanged; the empty string is accepted.
--
checkACGT :: String -> Either Text String
checkACGT bases =
    if all isNucleotide bases
        then Right bases
        else Left "Bio.ABI.Decode: could not parse sequence"
  where
    isNucleotide :: Char -> Bool
    isNucleotide = (`elem` ['A', 'C', 'G', 'T'])

-- | Finds the first 'Directory' of the ABI file whose tag name equals the
-- given name and returns its data unpacked into a 'String'.
--
-- Returns 'Left' with a message naming the directory when none matches.
--
findDataByDirectory :: Text -> Abif -> Either Text String
findDataByDirectory dirName abif =
    maybe (Left notFound) (Right . directoryContents) matchingDirectory
  where
    matchingDirectory :: Maybe Directory
    matchingDirectory = find hasRequestedName (aDirs abif)

    hasRequestedName :: Directory -> Bool
    hasRequestedName directory = dTagName directory == dirName

    directoryContents :: Directory -> String
    directoryContents directory = BSL8.unpack (dData directory)

    notFound :: Text
    notFound = "Bio.ABI.Decode: could not find directory " <> dirName