module Bio.Util.Text
    ( Unpack(..)
    , w2c, c2w
    , decodeBytes
    , encodeBytes
    , decompressGzip
    ) where

import BasePrelude
import Data.ByteString.Internal     ( c2w, w2c )
import Data.Text.Encoding           ( encodeUtf8, decodeUtf8With )

import qualified Codec.Compression.Zlib.Internal as Z
import qualified Data.ByteString.Char8           as S
import qualified Data.ByteString.Lazy            as L
import qualified Data.ByteString.Lazy.Char8      as C ( unpack )
import qualified Data.ByteString.Lazy.Internal   as L ( ByteString(..) )
import qualified Data.Text                       as T

-- | Class of things that can be unpacked into 'String's.  Kind of the
-- opposite of 'IsString'.
class Unpack s where unpack :: s -> String

instance Unpack L.ByteString where unpack :: ByteString -> String
unpack = ByteString -> String
C.unpack
instance Unpack S.ByteString where unpack :: ByteString -> String
unpack = ByteString -> String
S.unpack
instance Unpack T.Text       where unpack :: Text -> String
unpack = Text -> String
T.unpack
instance Unpack String       where unpack :: String -> String
unpack = String -> String
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id


-- | Converts 'Bytes' into 'Text'.  This uses UTF8, but if there is an
-- error, it pretends it was Latin1.  Evil as this is, it tends to Just
-- Work on files where nobody ever wasted a thought on encodings.
decodeBytes :: S.ByteString -> T.Text
decodeBytes :: ByteString -> Text
decodeBytes = OnDecodeError -> ByteString -> Text
decodeUtf8With ((Maybe Word8 -> Maybe Char) -> OnDecodeError
forall a b. a -> b -> a
const ((Maybe Word8 -> Maybe Char) -> OnDecodeError)
-> (Maybe Word8 -> Maybe Char) -> OnDecodeError
forall a b. (a -> b) -> a -> b
$ (Word8 -> Char) -> Maybe Word8 -> Maybe Char
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Word8 -> Char
w2c)

-- | Converts 'Text' into 'Bytes'.  This uses UTF8.
encodeBytes :: T.Text -> S.ByteString
encodeBytes :: Text -> ByteString
encodeBytes = Text -> ByteString
encodeUtf8


-- | Decompresses Gzip or Bgzf and passes everything else on.  In
-- reality, it simply decompresses Gzip, and when done, looks for
-- another Gzip stream.  Since there is a small chance to attempt
-- decompression of an uncompressed stream, the original data is
-- returned in case of an error.
decompressGzip :: L.ByteString -> L.ByteString
decompressGzip :: ByteString -> ByteString
decompressGzip s :: ByteString
s = case ByteString -> Maybe (Word8, ByteString)
L.uncons ByteString
s of
    Just (31, s' :: ByteString
s') -> case ByteString -> Maybe (Word8, ByteString)
L.uncons ByteString
s' of
        Just (139,_) -> (ByteString -> ByteString -> ByteString)
-> (ByteString -> ByteString)
-> (DecompressError -> ByteString)
-> (forall s. DecompressStream (ST s))
-> ByteString
-> ByteString
forall a.
(ByteString -> a -> a)
-> (ByteString -> a)
-> (DecompressError -> a)
-> (forall s. DecompressStream (ST s))
-> ByteString
-> a
Z.foldDecompressStreamWithInput ByteString -> ByteString -> ByteString
L.Chunk ByteString -> ByteString
decompressGzip (ByteString -> DecompressError -> ByteString
forall a b. a -> b -> a
const ByteString
s)
                        (Format -> DecompressParams -> DecompressStream (ST s)
forall s. Format -> DecompressParams -> DecompressStream (ST s)
Z.decompressST Format
Z.gzipOrZlibFormat DecompressParams
Z.defaultDecompressParams) ByteString
s
        _            -> ByteString
s
    _                -> ByteString
s