Safe Haskell	Safe-Inferred
Language	Haskell2010

Parser.Lathe.Encoding.UTF16

Contents

Byte-order mark
UTF-16
- Parsers
  - Continue
  - Skip

Description

Functions for parsing UTF-16, both little-endian and big-endian.

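Parsing of a UTF-16 code point is broken down into two steps: first the leading code unit is parsed, then the code point is either continued (converted into a Char) or skipped. This allows characters to be validated fully or partially without always converting them to Char.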

The following is an example of parsing a little-endian UTF-16 code point into a character:

data Error = EoF | Malformed | …

charUtf16LE :: Parser Error Char
charUtf16LE = do
  u <- unitUtf16LE Malformed EoF
  case u of
    UTF16_1 u1 -> pure $! fromUtf16 u1
    UTF16_2 u2 -> contUtf16LE_2 Malformed EoF u2

Synopsis

utf16BOM :: e -> e -> Parser e ByteOrder
newtype UTF16Unit (byteOrder :: ByteOrder) (n :: Nat) = UTF16Unit Word16
newtype UTF16Point (n :: Nat) = UTF16Point Word32
fromUtf16 :: UTF16Point n -> Char
data UTF16Branch (byteOrder :: ByteOrder)
- = UTF16_1 !(UTF16Point 1)
- | UTF16_2 !(UTF16Unit byteOrder 2)
unitUtf16BE :: e -> e -> Parser e (UTF16Branch 'BigEndian)
unitUtf16LE :: e -> e -> Parser e (UTF16Branch 'LittleEndian)
contUtf16BE_2 :: e -> e -> UTF16Unit 'BigEndian 2 -> Parser e Char
contUtf16LE_2 :: e -> e -> UTF16Unit 'LittleEndian 2 -> Parser e Char
skipUtf16BE_2 :: e -> e -> UTF16Unit 'BigEndian 2 -> Parser e ()
skipUtf16LE_2 :: e -> e -> UTF16Unit 'LittleEndian 2 -> Parser e ()

Byte-order mark

utf16BOM Source #

Arguments

:: e	Malformed.
-> e	Reached end.
-> Parser e ByteOrder

Consume 2 bytes that represent a UTF-16 byte-order mark and return the corresponding ByteOrder.

UTF-16

newtype UTF16Unit (byteOrder :: ByteOrder) (n :: Nat) Source #

First UTF-16 code unit. n represents the total number of code units in this code point.

Constructors

UTF16Unit Word16

newtype UTF16Point (n :: Nat) Source #

UTF-16 code point.

Constructors

UTF16Point Word32

fromUtf16 :: UTF16Point n -> Char Source #

Convert a UTF-16 code point into a Char.

Parsers

data UTF16Branch (byteOrder :: ByteOrder) Source #

UTF-16 branching based on the first code unit.

Constructors

UTF16_1 !(UTF16Point 1)
UTF16_2 !(UTF16Unit byteOrder 2)

unitUtf16BE Source #

Arguments

:: e	Code unit is a low surrogate.
-> e	Reached end.
-> Parser e (UTF16Branch 'BigEndian)

Consume 2 bytes that represent the first code unit of a big-endian UTF-16 code point.

unitUtf16LE Source #

Arguments

:: e	Code unit is a low surrogate.
-> e	Reached end.
-> Parser e (UTF16Branch 'LittleEndian)

Consume 2 bytes that represent the first code unit of a little-endian UTF-16 code point.

Continue

contUtf16BE_2 Source #

Arguments

:: e	Code unit is not a low surrogate.
-> e	Reached end.
-> UTF16Unit 'BigEndian 2
-> Parser e Char

Consume 2 bytes that represent the second code unit of a 2-unit big-endian UTF-16 code point and convert the two units into a Char.

contUtf16LE_2 Source #

Arguments

:: e	Code unit is not a low surrogate.
-> e	Reached end.
-> UTF16Unit 'LittleEndian 2
-> Parser e Char

Consume 2 bytes that represent the second code unit of a 2-unit little-endian UTF-16 code point and convert the two units into a Char.

Skip

skipUtf16BE_2 Source #

Arguments

:: e	Code unit is not a low surrogate.
-> e	Reached end.
-> UTF16Unit 'BigEndian 2
-> Parser e ()

Consume 2 bytes that represent the second code unit of a 2-unit big-endian UTF-16 code point.

skipUtf16LE_2 Source #

Arguments

:: e	Code unit is not a low surrogate.
-> e	Reached end.
-> UTF16Unit 'LittleEndian 2
-> Parser e ()

Consume 2 bytes that represent the second code unit of a 2-unit little-endian UTF-16 code point.