Safe Haskell	Safe-Inferred
Language	Haskell2010

Parser.Lathe.Encoding.UTF8

Contents

Byte-order mark
UTF-8

Description

Functions for parsing UTF-8.

Parsing of UTF-8 code points is broken down into two steps to allow for full or partial validation of characters as opposed to full Char conversions.

The following is an example of parsing a UTF-8 code point into a character:

data Error = EoF | Malformed | Surrogate | …

charUtf8 :: Parser Error Char
charUtf8 = do
  u <- unitUtf8 (\_ -> Malformed) EoF
  case u of
    UTF8_1 u1 -> pure $! fromUtf8 u1
    UTF8_2 u2 -> contUtf8_2 Malformed EoF u2
    UTF8_3 u3 -> do
      p <- contUtf8_3 (\_ -> Malformed) EoF u3
      if isSurrogate p
        then err Surrogate
        else pure $! fromUtf8 p

    UTF8_4 u4 -> contUtf8_4 (\_ -> Malformed) EoF u4

Synopsis

utf8BOM :: e -> e -> Parser e ()
newtype UTF8Unit (n :: Nat) = UTF8Unit Word8
newtype UTF8Point (n :: Nat) = UTF8Point Word32
fromUtf8 :: UTF8Point n -> Char
isSurrogate :: UTF8Point 3 -> Bool
data UTF8Branch
- = UTF8_1 !(UTF8Point 1)
- | UTF8_2 !(UTF8Unit 2)
- | UTF8_3 !(UTF8Unit 3)
- | UTF8_4 !(UTF8Unit 4)
data UTF8Error_1
- = UTF8Continuation
- | UTF8Overlong_2
- | UTF8Overflow_1
- | UTF8Invalid
unitUtf8 :: (UTF8Error_1 -> e) -> e -> Parser e UTF8Branch
contUtf8_2 :: e -> e -> UTF8Unit 2 -> Parser e Char
skipUtf8_2 :: e -> e -> UTF8Unit 2 -> Parser e ()
data UTF8Error_3
contUtf8_3 :: (UTF8Error_3 -> e) -> e -> UTF8Unit 3 -> Parser e (UTF8Point 3)
skipUtf8_3 :: (UTF8Error_3 -> e) -> e -> UTF8Unit 3 -> Parser e Bool
data UTF8Error_4
contUtf8_4 :: (UTF8Error_4 -> e) -> e -> UTF8Unit 4 -> Parser e Char
skipUtf8_4 :: (UTF8Error_4 -> e) -> e -> UTF8Unit 4 -> Parser e ()

Byte-order mark

utf8BOM Source #

Arguments

:: e	Malformed.
-> e	Reached end.
-> Parser e ()

Consume 3 bytes that represent a UTF-8 byte-order mark.

UTF-8

newtype UTF8Unit (n :: Nat) Source #

First UTF-8 code unit. n represents the total number of code units in this code point.

Constructors

UTF8Unit Word8

newtype UTF8Point (n :: Nat) Source #

A Unicode code point.

Constructors

UTF8Point Word32

fromUtf8 :: UTF8Point n -> Char Source #

Convert a code point into a Char.

isSurrogate :: UTF8Point 3 -> Bool Source #

Check whether a 3-unit code point lies in the surrogate range (U+D800 to U+DFFF).

First code unit

data UTF8Branch Source #

UTF-8 branching based on the first code unit.

Constructors

UTF8_1 !(UTF8Point 1)
UTF8_2 !(UTF8Unit 2)
UTF8_3 !(UTF8Unit 3)
UTF8_4 !(UTF8Unit 4)

data UTF8Error_1 Source #

Errors that may be encountered when parsing the first UTF-8 code unit.

Constructors

UTF8Continuation	(byte 1 is `10xxxxxx`) Continuation code unit.
UTF8Overlong_2	(byte 1 is `1100000x`) Overlong 2-unit code point.
UTF8Overflow_1	(byte 1 is `111101xx`, `xx` is not `00`) Invalid 4-unit code point.
UTF8Invalid	(byte 1 is `11111xxx`) Invalid first code unit.

Instances

Instances details

Show UTF8Error_1 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods showsPrec :: Int -> UTF8Error_1 -> ShowS # show :: UTF8Error_1 -> String # showList :: [UTF8Error_1] -> ShowS #
Eq UTF8Error_1 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods (==) :: UTF8Error_1 -> UTF8Error_1 -> Bool # (/=) :: UTF8Error_1 -> UTF8Error_1 -> Bool #

unitUtf8 Source #

Arguments

:: (UTF8Error_1 -> e)	Malformed
-> e	Reached end.
-> Parser e UTF8Branch

Consume 1 byte that represents the first code unit of a UTF-8 code point.

Second code unit

contUtf8_2 Source #

Arguments

:: e	(byte 2 is not `10xxxxxx`) Non-continuation second code unit.
-> e	Reached end.
-> UTF8Unit 2
-> Parser e Char

Consume 1 byte that represents the second code unit of a 2-unit UTF-8 code point and convert the two units into a Char.

skipUtf8_2 Source #

Arguments

:: e	(byte 2 is not `10xxxxxx`) Non-continuation second code unit.
-> e	Reached end.
-> UTF8Unit 2
-> Parser e ()

Consume 1 byte that represents the second code unit of a 2-unit UTF-8 code point.

Third code unit

data UTF8Error_3 Source #

Errors that may be encountered when parsing the third UTF-8 code unit.

Constructors

UTF8Incomplete_2_3	(byte 2 is not `10xxxxxx`) Non-continuation second code unit.
UTF8Overlong_3	(byte 1 is `11100000`, byte 2 is `100xxxxx`) Overlong 3-unit code point.
UTF8Incomplete_3_3	(byte 3 is not `10xxxxxx`) Non-continuation third code unit.

Instances

Instances details

Show UTF8Error_3 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods showsPrec :: Int -> UTF8Error_3 -> ShowS # show :: UTF8Error_3 -> String # showList :: [UTF8Error_3] -> ShowS #
Eq UTF8Error_3 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods (==) :: UTF8Error_3 -> UTF8Error_3 -> Bool # (/=) :: UTF8Error_3 -> UTF8Error_3 -> Bool #

contUtf8_3 Source #

Arguments

:: (UTF8Error_3 -> e)	Malformed.
-> e	Reached end.
-> UTF8Unit 3
-> Parser e (UTF8Point 3)

Consume 2 bytes that represent the second and third code units of a 3-unit UTF-8 code point and convert the three units into a code point.

skipUtf8_3 Source #

Arguments

:: (UTF8Error_3 -> e)	Malformed.
-> e	Reached end.
-> UTF8Unit 3
-> Parser e Bool	`True` if the skipped code point lies in the surrogate range (`U+D800` to `U+DFFF`).

Consume 2 bytes that represent the second and third code units of a 3-unit UTF-8 code point.

Fourth code unit

data UTF8Error_4 Source #

Errors that may be encountered when parsing the fourth UTF-8 code unit.

Constructors

UTF8Incomplete_2_4	(byte 2 is not `10xxxxxx`) Non-continuation second code unit.
UTF8Overlong_4	(byte 1 is `11110000`, byte 2 is `1000xxxx`) Overlong 4-unit code point.
UTF8Overflow_2	(byte 1 is `11110100`, byte 2 is not `1000xxxx`) Invalid 4-unit code point.
UTF8Incomplete_3_4	(byte 3 is not `10xxxxxx`) Non-continuation third code unit.
UTF8Incomplete_4_4	(byte 4 is not `10xxxxxx`) Non-continuation fourth code unit.

Instances

Instances details

Show UTF8Error_4 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods showsPrec :: Int -> UTF8Error_4 -> ShowS # show :: UTF8Error_4 -> String # showList :: [UTF8Error_4] -> ShowS #
Eq UTF8Error_4 Source #
Instance details Defined in Parser.Lathe.Encoding.UTF8 Methods (==) :: UTF8Error_4 -> UTF8Error_4 -> Bool # (/=) :: UTF8Error_4 -> UTF8Error_4 -> Bool #

contUtf8_4 Source #

Arguments

:: (UTF8Error_4 -> e)	Malformed.
-> e	Reached end.
-> UTF8Unit 4
-> Parser e Char

Consume 3 bytes that represent the second to fourth code units of a 4-unit UTF-8 code point and convert the four units into a Char.

skipUtf8_4 Source #

Arguments

:: (UTF8Error_4 -> e)	Malformed.
-> e	Reached end.
-> UTF8Unit 4
-> Parser e ()

Consume 3 bytes that represent the second to fourth code units of a 4-unit UTF-8 code point.