{-# LANGUAGE CPP #-} {-# LANGUAGE ExplicitForAll #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE Trustworthy #-} {-# LANGUAGE TypeSynonymInstances #-} {-# OPTIONS_GHC -Wno-orphans #-} -- | This module implements type class which allow to have conversion to and -- from 'Text', 'String' and 'ByteString' types (including both strict and lazy -- versions). Usually you need to export 'Text' modules qualified and use -- 'T.pack' \/ 'T.unpack' functions to convert to\/from 'Text'. Now you can -- just use 'toText' \/ 'toString' functions. module Universum.String.Conversion ( -- * Convenient type aliases LText , LByteString -- * Conversion type classes , ConvertUtf8 (..) , ToString (..) , ToLText (..) , ToText (..) -- * Show and read functions , readEither , show ) where import Data.Bifunctor (first) import Data.Either (Either) import Data.Function (id, (.)) import Data.String (String) import qualified Data.Text.Internal as T import qualified Data.Text.Internal.Fusion.Common as TF import Universum.Functor ((<$>)) import Universum.String.Reexport (ByteString, IsString, Read, Text, fromString) import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as LB import qualified Data.ByteString.Lazy.UTF8 as LBU import qualified Data.ByteString.UTF8 as BU import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.Encoding.Error as T import qualified Data.Text.Lazy as LT import qualified Data.Text.Lazy.Encoding as LT import qualified Text.Read (readEither) import qualified GHC.Show as Show (Show (show)) -- $setup -- >>> :set -XTypeApplications -XOverloadedStrings -- >>> import Universum.Base (Int) -- >>> import Universum.Function (($)) -- >>> import Universum.Print (putStrLn) -- | Type synonym for 'Data.Text.Lazy.Text'. type LText = LT.Text -- | Type synonym for 'Data.ByteString.Lazy.ByteString'. type LByteString = LB.ByteString -- | Type class for conversion to utf8 representation of text. class ConvertUtf8 a b where -- | Encode as utf8 string (usually 'B.ByteString'). -- -- >>> encodeUtf8 @Text @ByteString "патак" -- "\208\191\208\176\209\130\208\176\208\186" encodeUtf8 :: a -> b -- | Decode from utf8 string. -- -- >>> decodeUtf8 @Text @ByteString "\208\191\208\176\209\130\208\176\208\186" -- "\1087\1072\1090\1072\1082" -- >>> putStrLn $ decodeUtf8 @Text @ByteString "\208\191\208\176\209\130\208\176\208\186" -- патак decodeUtf8 :: b -> a {- | Decode as utf8 string but returning execption if byte sequence is malformed. #if MIN_VERSION_text(1,2,3) >>> decodeUtf8 @Text @ByteString "\208\208\176\209\130\208\176\208\186" "\65533\1072\1090\1072\1082" #else >>> decodeUtf8 @Text @ByteString "\208\208\176\209\130\208\176\208\186" "\65533\65533\1090\1072\1082" #endif >>> decodeUtf8Strict @Text @ByteString "\208\208\176\209\130\208\176\208\186" Left Cannot decode byte '\xd0': Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream -} decodeUtf8Strict :: b -> Either T.UnicodeException a instance ConvertUtf8 String B.ByteString where encodeUtf8 = BU.fromString decodeUtf8 = BU.toString decodeUtf8Strict = (T.unpack <$>) . decodeUtf8Strict instance ConvertUtf8 T.Text B.ByteString where encodeUtf8 = T.encodeUtf8 decodeUtf8 = T.decodeUtf8With T.lenientDecode decodeUtf8Strict = T.decodeUtf8' instance ConvertUtf8 LT.Text B.ByteString where encodeUtf8 = LB.toStrict . encodeUtf8 decodeUtf8 = LT.decodeUtf8With T.lenientDecode . LB.fromStrict decodeUtf8Strict = decodeUtf8Strict . LB.fromStrict instance ConvertUtf8 String LB.ByteString where encodeUtf8 = LBU.fromString decodeUtf8 = LBU.toString decodeUtf8Strict = (T.unpack <$>) . decodeUtf8Strict instance ConvertUtf8 T.Text LB.ByteString where encodeUtf8 = LB.fromStrict . T.encodeUtf8 decodeUtf8 = T.decodeUtf8With T.lenientDecode . LB.toStrict decodeUtf8Strict = T.decodeUtf8' . LB.toStrict instance ConvertUtf8 LT.Text LB.ByteString where encodeUtf8 = LT.encodeUtf8 decodeUtf8 = LT.decodeUtf8With T.lenientDecode decodeUtf8Strict = LT.decodeUtf8' -- | Type class for converting other strings to 'T.Text'. class ToText a where toText :: a -> T.Text instance ToText String where toText = T.pack instance ToText T.Text where toText = id instance ToText LT.Text where toText = LT.toStrict -- | Type class for converting other strings to 'LT.Text'. class ToLText a where toLText :: a -> LT.Text instance ToLText String where toLText = LT.pack instance ToLText T.Text where toLText = LT.fromStrict instance ToLText LT.Text where toLText = id -- | Type class for converting other strings to 'String'. class ToString a where toString :: a -> String instance ToString String where toString = id instance ToString T.Text where toString = T.unpack instance ToString LT.Text where toString = LT.unpack {- @toString . toText@ pattern may occur quite often after inlining because we tend to use 'Text' rather than 'String' in function signatures, but there are still some libraries which use 'String's and thus make us perform conversions back and forth. Note that @toString . toText@ is not strictly equal to identity function, see explanation in the comment below. -} {-# RULES "pack/unpack" [~0] forall s. T.unpack (T.pack s) = s #-} {- [Note toString-toText-rewritting] We can do even better if take rules defined in 'Data.Text' into account. Quoting investigation of @int-index: If we look at @unpack@ and @pack@ they are defined as @ unpack = S.unstreamList . stream {-# INLINE [1] unpack #-} pack = unstream . S.map safe . S.streamList {-# INLINE [1] pack #-} @ After they get inlined, the rule seems to be @ (S.unstreamList . stream) ((unstream . S.map safe . S.streamList) a) @ If we also inline function composition, we get @ S.unstreamList (stream (unstream (S.map safe (S.streamList a)))) @ `stream` and `unstream` surely cancel out via this rule: @ {-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-} @ So we are left with @ S.unstreamList (S.map safe (S.streamList a)) @ Now, what's this 'safe' function? Turns out it's defined as @ safe :: Char -> Char safe c | ord c .&. 0x1ff800 /= 0xd800 = c | otherwise = '\xfffd' @ Aha, so it's mapping some codepoints to @'\xfffd'@! There's a comment on top of it to explain this: ``` -- UTF-16 surrogate code points are not included in the set of Unicode -- scalar values, but are unfortunately admitted as valid 'Char' -- values by Haskell. They cannot be represented in a 'Text'. This -- function remaps those code points to the Unicode replacement -- character (U+FFFD, \'�\'), and leaves other code points -- unchanged. ``` This logic is lost with the mentioned rewrite rule. Not a huge loss, but it does mean that this rewrite rule isn't meaning preserving. We hope that in most cases it's fine. And if it's not, one can mark his function using either @pack@ or @unpack@ with @NOINLINE@ pragma to prevent the rule from firing. So, eventually, we add the following rule: -} {-# RULES "pack/unpack internal" [1] forall s. TF.unstreamList (TF.map T.safe (TF.streamList s)) = s #-} {- In case if GHC didn't manage to inline and rewrite everything in the remaining phases (@Data.Text.pack@ is inlined at 1-st phase), we still have "pack/unpack" rule. Hopefully, one of them will fire. -} {- The opposite rule is safe to have because 'T.safe' /is/ the identity function for strings made up from valid characters, and text is guaranteed to have only valid ones. However, for this case there is no @unstream (stream s) = id@ rule, so we don't delve deep into internals. As long as @stream@ and @unstream@ only perform conversion between text and stream of characters, they should be safe to collapse. -} {-# RULES "unpack/pack" [~0] forall s. T.pack (T.unpack s) = s #-} -- | Polymorhpic version of 'Text.Read.readEither'. -- -- >>> readEither @Text @Int "123" -- Right 123 -- >>> readEither @Text @Int "aa" -- Left "Prelude.read: no parse" readEither :: (ToString a, Read b) => a -> Either Text b readEither = first toText . Text.Read.readEither . toString -- | Generalized version of 'Prelude.show'. show :: forall b a . (Show.Show a, IsString b) => a -> b show x = fromString (Show.show x) {-# SPECIALIZE show :: Show.Show a => a -> Text #-} {-# SPECIALIZE show :: Show.Show a => a -> LText #-} {-# SPECIALIZE show :: Show.Show a => a -> ByteString #-} {-# SPECIALIZE show :: Show.Show a => a -> LByteString #-} {-# SPECIALIZE show :: Show.Show a => a -> String #-}