unicode-data-parser-0.3.0: Parsers for Unicode Character Database (UCD) files
Copyright: (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability: experimental
Safe Haskell: Safe-Inferred
Language: GHC2021

Unicode.CharacterDatabase.Parser.UnicodeData

Description

Parser for UnicodeData.txt.

Since: 0.1.0

Synopsis

Documentation

parse :: ByteString -> [Entry] Source #

Parser for the UnicodeData.txt file.

>>> :{
traverse_ print . parse $
  "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
  \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
  \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
  \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
:}
Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}

Since: 0.1.0

data Entry Source #

An entry in UnicodeData.txt.

Since: 0.1.0

Constructors

Entry 

Instances

Instances details
Show Entry Source # 
Instance details

Defined in Unicode.CharacterDatabase.Parser.UnicodeData

Methods

showsPrec :: Int -> Entry -> ShowS #

show :: Entry -> String #

showList :: [Entry] -> ShowS #

Eq Entry Source # 
Instance details

Defined in Unicode.CharacterDatabase.Parser.UnicodeData

Methods

(==) :: Entry -> Entry -> Bool #

(/=) :: Entry -> Entry -> Bool #

data CharDetails Source #

Core characteristics of a Unicode code point

Since: 0.1.0

Constructors

CharDetails 

Fields

data GeneralCategory Source #

Constructors

Lu

Letter, Uppercase

Ll

Letter, Lowercase

Lt

Letter, Titlecase

Lm

Letter, Modifier

Lo

Letter, Other

Mn

Mark, Non-Spacing

Mc

Mark, Spacing Combining

Me

Mark, Enclosing

Nd

Number, Decimal

Nl

Number, Letter

No

Number, Other

Pc

Punctuation, Connector

Pd

Punctuation, Dash

Ps

Punctuation, Open

Pe

Punctuation, Close

Pi

Punctuation, Initial quote

Pf

Punctuation, Final quote

Po

Punctuation, Other

Sm

Symbol, Math

Sc

Symbol, Currency

Sk

Symbol, Modifier

So

Symbol, Other

Zs

Separator, Space

Zl

Separator, Line

Zp

Separator, Paragraph

Cc

Other, Control

Cf

Other, Format

Cs

Other, Surrogate

Co

Other, Private Use

Cn

Other, Not Assigned

Bundled Patterns

pattern DefaultGeneralCategory :: GeneralCategory 

data Decomposition Source #

Unicode decomposition of a code point

Since: 0.1.0

data NumericValue Source #

Numeric value of a code point, if relevant

Since: 0.1.0