Copyright	(c) 2024 Pierre Le Marre
Maintainer	dev@wismill.eu
Stability	experimental
Safe Haskell	Safe-Inferred
Language	GHC2021

Unicode.CharacterDatabase.Parser.UnicodeData

Description

Parser for UnicodeData.txt.

Since: 0.1.0

Synopsis

parse :: ByteString -> [Entry]
data Entry = Entry {
- range :: !CodePointRange
- details :: !CharDetails
}
data CharDetails = CharDetails {
- name :: !ShortByteString
- generalCategory :: !GeneralCategory
- combiningClass :: !Word8
- bidiClass :: !ShortByteString
- bidiMirrored :: !Bool
- decomposition :: !Decomposition
- numericValue :: !NumericValue
- simpleUpperCaseMapping :: !(Maybe Char)
- simpleLowerCaseMapping :: !(Maybe Char)
- simpleTitleCaseMapping :: !(Maybe Char)
}
data GeneralCategory where
- Lu
- Ll
- Lt
- Lm
- Lo
- Mn
- Mc
- Me
- Nd
- Nl
- No
- Pc
- Pd
- Ps
- Pe
- Pi
- Pf
- Po
- Sm
- Sc
- Sk
- So
- Zs
- Zl
- Zp
- Cc
- Cf
- Cs
- Co
- Cn
- pattern DefaultGeneralCategory :: GeneralCategory
data DecompositionType
- = Canonical
- | Compat
- | Font
- | NoBreak
- | Initial
- | Medial
- | Final
- | Isolated
- | Circle
- | Super
- | Sub
- | Vertical
- | Wide
- | Narrow
- | Small
- | Square
- | Fraction
data Decomposition
- = Self
- | Decomposition {
  - decompositionType :: !DecompositionType
  - decompositionMapping :: ![Char]
  }
data NumericValue
- = NotNumeric
- | Integer !Integer
- | Rational !Rational

Documentation

parse :: ByteString -> [Entry] Source #

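Parser for the UnicodeData.txt file. As the example below shows, a pair of "<…, First>" / "<…, Last>" lines is returned as a single range entry.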

>>> :{
traverse_ print . parse $
  "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
  \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
  \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
  \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
:}
Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}

Since: 0.1.0

data Entry Source #

An entry in UnicodeData.txt.

Since: 0.1.0

Constructors

Entry
Fields range :: !CodePointRange details :: !CharDetails

Instances

Instances details

Show Entry Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods showsPrec :: Int -> Entry -> ShowS # show :: Entry -> String # showList :: [Entry] -> ShowS #
Eq Entry Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods (==) :: Entry -> Entry -> Bool # (/=) :: Entry -> Entry -> Bool #

data CharDetails Source #

Core characteristics of a Unicode code point

Since: 0.1.0

Constructors

CharDetails

Fields

name :: !ShortByteString
In the case of a range, this is the range’s name (e.g. "Tangut Ideograph"), not the name of each individual character. Prefer the names from DerivedName.txt.
generalCategory :: !GeneralCategory
combiningClass :: !Word8
The value is in the range 0..254.
bidiClass :: !ShortByteString
bidiMirrored :: !Bool
decomposition :: !Decomposition
numericValue :: !NumericValue
simpleUpperCaseMapping :: !(Maybe Char)
simpleLowerCaseMapping :: !(Maybe Char)
simpleTitleCaseMapping :: !(Maybe Char)

Instances

Instances details

Show CharDetails Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods showsPrec :: Int -> CharDetails -> ShowS # show :: CharDetails -> String # showList :: [CharDetails] -> ShowS #
Eq CharDetails Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods (==) :: CharDetails -> CharDetails -> Bool # (/=) :: CharDetails -> CharDetails -> Bool #

data GeneralCategory Source #

See: https://www.unicode.org/reports/tr44/#General_Category

Since: 0.1.0

Constructors

Lu	Letter, Uppercase
Ll	Letter, Lowercase
Lt	Letter, Titlecase
Lm	Letter, Modifier
Lo	Letter, Other
Mn	Mark, Non-Spacing
Mc	Mark, Spacing Combining
Me	Mark, Enclosing
Nd	Number, Decimal
Nl	Number, Letter
No	Number, Other
Pc	Punctuation, Connector
Pd	Punctuation, Dash
Ps	Punctuation, Open
Pe	Punctuation, Close
Pi	Punctuation, Initial quote
Pf	Punctuation, Final quote
Po	Punctuation, Other
Sm	Symbol, Math
Sc	Symbol, Currency
Sk	Symbol, Modifier
So	Symbol, Other
Zs	Separator, Space
Zl	Separator, Line
Zp	Separator, Paragraph
Cc	Other, Control
Cf	Other, Format
Cs	Other, Surrogate
Co	Other, Private Use
Cn	Other, Not Assigned

Bundled Patterns

pattern DefaultGeneralCategory :: GeneralCategory

Instances

Instances details

Bounded GeneralCategory Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods minBound :: GeneralCategory # maxBound :: GeneralCategory #
Enum GeneralCategory Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods succ :: GeneralCategory -> GeneralCategory # pred :: GeneralCategory -> GeneralCategory # toEnum :: Int -> GeneralCategory # fromEnum :: GeneralCategory -> Int # enumFrom :: GeneralCategory -> [GeneralCategory] # enumFromThen :: GeneralCategory -> GeneralCategory -> [GeneralCategory] # enumFromTo :: GeneralCategory -> GeneralCategory -> [GeneralCategory] # enumFromThenTo :: GeneralCategory -> GeneralCategory -> GeneralCategory -> [GeneralCategory] #
Read GeneralCategory Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods readsPrec :: Int -> ReadS GeneralCategory # readList :: ReadS [GeneralCategory] # readPrec :: ReadPrec GeneralCategory # readListPrec :: ReadPrec [GeneralCategory] #
Show GeneralCategory Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods showsPrec :: Int -> GeneralCategory -> ShowS # show :: GeneralCategory -> String # showList :: [GeneralCategory] -> ShowS #
Eq GeneralCategory Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods (==) :: GeneralCategory -> GeneralCategory -> Bool # (/=) :: GeneralCategory -> GeneralCategory -> Bool #

data DecompositionType Source #

See: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

Since: 0.1.0

Constructors

Canonical
Compat
Font
NoBreak
Initial
Medial
Final
Isolated
Circle
Super
Sub
Vertical
Wide
Narrow
Small
Square
Fraction

Instances

Instances details

Show DecompositionType Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods showsPrec :: Int -> DecompositionType -> ShowS # show :: DecompositionType -> String # showList :: [DecompositionType] -> ShowS #
Eq DecompositionType Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods (==) :: DecompositionType -> DecompositionType -> Bool # (/=) :: DecompositionType -> DecompositionType -> Bool #

data Decomposition Source #

Unicode decomposition of a code point

Since: 0.1.0

Constructors

Self
Decomposition
Fields decompositionType :: !DecompositionType decompositionMapping :: ![Char]

Instances

Instances details

Show Decomposition Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods showsPrec :: Int -> Decomposition -> ShowS # show :: Decomposition -> String # showList :: [Decomposition] -> ShowS #
Eq Decomposition Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.UnicodeData Methods (==) :: Decomposition -> Decomposition -> Bool # (/=) :: Decomposition -> Decomposition -> Bool #

data NumericValue Source #

Numeric value of a code point, if relevant

Since: 0.1.0

Constructors

NotNumeric
Integer !Integer
Rational !Rational

Instances

Instances details

Show NumericValue Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.Internal Methods showsPrec :: Int -> NumericValue -> ShowS # show :: NumericValue -> String # showList :: [NumericValue] -> ShowS #
Eq NumericValue Source #
Instance details Defined in Unicode.CharacterDatabase.Parser.Internal Methods (==) :: NumericValue -> NumericValue -> Bool # (/=) :: NumericValue -> NumericValue -> Bool #