-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | ByteString ↔ Text converter based on GHC.IO.Encoding
--
-- Please see the README on GitHub at
-- https://github.com/msakai/bytestring-encoding#readme
@package bytestring-encoding
@version 0.1.1.0
-- | ByteString ↔ Text converter based on
-- GHC.IO.Encoding.
module Data.ByteString.Lazy.Encoding
-- | Encode a lazy Text into a lazy ByteString using a given
-- TextEncoding.
encode :: TextEncoding -> Text -> ByteString
-- | Decode a lazy ByteString to a lazy Text using a given
-- TextEncoding.
decode :: TextEncoding -> ByteString -> Text
-- | A TextEncoding is a specification of a conversion scheme
-- between sequences of bytes and sequences of Unicode characters.
--
-- For example, UTF-8 is an encoding of Unicode characters into a
-- sequence of bytes. The TextEncoding for UTF-8 is utf8.
data TextEncoding
-- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes directly to
-- the first 256 Unicode code points, and is thus not a complete Unicode
-- encoding. An attempt to write a character greater than '\255'
-- to a Handle using the latin1 encoding will result in an
-- error.
latin1 :: TextEncoding
-- | The UTF-8 Unicode encoding
utf8 :: TextEncoding
-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
-- sequence 0xEF 0xBB 0xBF). This encoding behaves like utf8,
-- except that on input, the BOM sequence is ignored at the beginning of
-- the stream, and on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but is sometimes
-- used to identify the encoding of a file.
utf8_bom :: TextEncoding
-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf16 :: TextEncoding
-- | The UTF-16 Unicode encoding (little-endian)
utf16le :: TextEncoding
-- | The UTF-16 Unicode encoding (big-endian)
utf16be :: TextEncoding
-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf32 :: TextEncoding
-- | The UTF-32 Unicode encoding (little-endian)
utf32le :: TextEncoding
-- | The UTF-32 Unicode encoding (big-endian)
utf32be :: TextEncoding
-- | The Unicode encoding of the current locale
--
-- This is the initial locale encoding: if it has been subsequently
-- changed by setLocaleEncoding this value will not reflect that
-- change.
localeEncoding :: TextEncoding
-- | An encoding in which Unicode code points are translated to bytes by
-- taking the code point modulo 256. When decoding, bytes are translated
-- directly into the equivalent code point.
--
-- This encoding never fails in either direction. However, encoding
-- discards information, so encode followed by decode is not the
-- identity.
char8 :: TextEncoding
-- | Look up the named Unicode encoding. May fail with
--
--
-- - isDoesNotExistError if the encoding is unknown
--
--
-- The set of known encodings is system-dependent, but includes at least:
--
--
-- - UTF-8
-- - UTF-16, UTF-16BE, UTF-16LE
-- - UTF-32, UTF-32BE, UTF-32LE
--
--
-- There is additional notation (borrowed from GNU iconv) for specifying
-- how illegal characters are handled:
--
--
-- - a suffix of //IGNORE, e.g. UTF-8//IGNORE, will
-- cause all illegal sequences on input to be ignored, and on output will
-- drop all code points that have no representation in the target
-- encoding.
-- - a suffix of //TRANSLIT will choose a replacement
-- character for illegal sequences or code points.
-- - a suffix of //ROUNDTRIP will use a PEP383-style escape
-- mechanism to represent any invalid bytes in the input as Unicode
-- codepoints (specifically, as lone surrogates, which are normally
-- invalid in UTF-32). Upon output, these special codepoints are detected
-- and turned back into the corresponding original byte.
--
--
-- In theory, this mechanism allows arbitrary data to be roundtripped via
-- a String with no loss of data. In practice, there are two
-- limitations to be aware of:
--
--
-- - This only stands a chance of working for an encoding which is an
-- ASCII superset, as for security reasons we refuse to escape any bytes
-- smaller than 128. Many encodings of interest are ASCII supersets (in
-- particular, you can assume that the locale encoding is an ASCII
-- superset) but many (such as UTF-16) are not.
-- - If the underlying encoding is not itself roundtrippable, this
-- mechanism can fail. Roundtrippable encodings are those which have an
-- injective mapping into Unicode. Almost all encodings meet this
-- criterion, but some do not. Notably, Shift-JIS (CP932) and Big5 contain
-- several different encodings of the same Unicode codepoint.
--
--
-- On Windows, you can access supported code pages with the prefix
-- CP; for example, "CP1250".
mkTextEncoding :: String -> IO TextEncoding
-- | ByteString ↔ Text converter based on
-- GHC.IO.Encoding.
module Data.ByteString.Encoding
-- | Encode a strict Text into a strict ByteString using a
-- given TextEncoding.
encode :: TextEncoding -> Text -> ByteString
-- | Decode a strict ByteString to a strict Text using a given
-- TextEncoding.
decode :: TextEncoding -> ByteString -> Text
-- | A TextEncoding is a specification of a conversion scheme
-- between sequences of bytes and sequences of Unicode characters.
--
-- For example, UTF-8 is an encoding of Unicode characters into a
-- sequence of bytes. The TextEncoding for UTF-8 is utf8.
data TextEncoding
-- | The Latin1 (ISO8859-1) encoding. This encoding maps bytes directly to
-- the first 256 Unicode code points, and is thus not a complete Unicode
-- encoding. An attempt to write a character greater than '\255'
-- to a Handle using the latin1 encoding will result in an
-- error.
latin1 :: TextEncoding
-- | The UTF-8 Unicode encoding
utf8 :: TextEncoding
-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
-- sequence 0xEF 0xBB 0xBF). This encoding behaves like utf8,
-- except that on input, the BOM sequence is ignored at the beginning of
-- the stream, and on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but is sometimes
-- used to identify the encoding of a file.
utf8_bom :: TextEncoding
-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf16 :: TextEncoding
-- | The UTF-16 Unicode encoding (little-endian)
utf16le :: TextEncoding
-- | The UTF-16 Unicode encoding (big-endian)
utf16be :: TextEncoding
-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf32 :: TextEncoding
-- | The UTF-32 Unicode encoding (little-endian)
utf32le :: TextEncoding
-- | The UTF-32 Unicode encoding (big-endian)
utf32be :: TextEncoding
-- | The Unicode encoding of the current locale
--
-- This is the initial locale encoding: if it has been subsequently
-- changed by setLocaleEncoding this value will not reflect that
-- change.
localeEncoding :: TextEncoding
-- | An encoding in which Unicode code points are translated to bytes by
-- taking the code point modulo 256. When decoding, bytes are translated
-- directly into the equivalent code point.
--
-- This encoding never fails in either direction. However, encoding
-- discards information, so encode followed by decode is not the
-- identity.
char8 :: TextEncoding
-- | Look up the named Unicode encoding. May fail with
--
--
-- - isDoesNotExistError if the encoding is unknown
--
--
-- The set of known encodings is system-dependent, but includes at least:
--
--
-- - UTF-8
-- - UTF-16, UTF-16BE, UTF-16LE
-- - UTF-32, UTF-32BE, UTF-32LE
--
--
-- There is additional notation (borrowed from GNU iconv) for specifying
-- how illegal characters are handled:
--
--
-- - a suffix of //IGNORE, e.g. UTF-8//IGNORE, will
-- cause all illegal sequences on input to be ignored, and on output will
-- drop all code points that have no representation in the target
-- encoding.
-- - a suffix of //TRANSLIT will choose a replacement
-- character for illegal sequences or code points.
-- - a suffix of //ROUNDTRIP will use a PEP383-style escape
-- mechanism to represent any invalid bytes in the input as Unicode
-- codepoints (specifically, as lone surrogates, which are normally
-- invalid in UTF-32). Upon output, these special codepoints are detected
-- and turned back into the corresponding original byte.
--
--
-- In theory, this mechanism allows arbitrary data to be roundtripped via
-- a String with no loss of data. In practice, there are two
-- limitations to be aware of:
--
--
-- - This only stands a chance of working for an encoding which is an
-- ASCII superset, as for security reasons we refuse to escape any bytes
-- smaller than 128. Many encodings of interest are ASCII supersets (in
-- particular, you can assume that the locale encoding is an ASCII
-- superset) but many (such as UTF-16) are not.
-- - If the underlying encoding is not itself roundtrippable, this
-- mechanism can fail. Roundtrippable encodings are those which have an
-- injective mapping into Unicode. Almost all encodings meet this
-- criterion, but some do not. Notably, Shift-JIS (CP932) and Big5 contain
-- several different encodings of the same Unicode codepoint.
--
--
-- On Windows, you can access supported code pages with the prefix
-- CP; for example, "CP1250".
mkTextEncoding :: String -> IO TextEncoding