Copyright | (c) 2024 Pierre Le Marre |
---|---|
Maintainer | dev@wismill.eu |
Stability | experimental |
Safe Haskell | Safe-Inferred |
Language | GHC2021 |
Parser for UnicodeData.txt.
Since: 0.1.0
Synopsis
- parse :: HasCallStack => ByteString -> [Entry]
- data Entry = Entry {
  - range :: !CodePointRange
  - details :: !CharDetails
  - }
- data CharDetails = CharDetails {
  - name :: !ShortByteString
  - generalCategory :: !GeneralCategory
  - combiningClass :: !Word8
  - bidiClass :: !ShortByteString
  - bidiMirrored :: !Bool
  - decomposition :: !Decomposition
  - numericValue :: !NumericValue
  - simpleUpperCaseMapping :: !(Maybe Char)
  - simpleLowerCaseMapping :: !(Maybe Char)
  - simpleTitleCaseMapping :: !(Maybe Char)
  - }
- data GeneralCategory where
- data DecompositionType
- data Decomposition
- = Self
- | Decomposition { }
- data NumericValue
- = NotNumeric
- | Integer !Integer
- | Rational !Rational
Documentation
parse :: HasCallStack => ByteString -> [Entry] Source #
Parser for the UnicodeData.txt file.
>>> :{
traverse_ print . parse $
  "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
  \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
  \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
  \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
:}
Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Since: 0.1.0
data Entry Source #
An entry in UnicodeData.txt.
Since: 0.1.0
Entry | |
- range :: !CodePointRange
- details :: !CharDetails
data CharDetails Source #
Core characteristics of a Unicode code point
Since: 0.1.0
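CharDetails | |
- name :: !ShortByteString
- generalCategory :: !GeneralCategory
- combiningClass :: !Word8
- bidiClass :: !ShortByteString
- bidiMirrored :: !Bool
- decomposition :: !Decomposition
- numericValue :: !NumericValue
- simpleUpperCaseMapping :: !(Maybe Char)
- simpleLowerCaseMapping :: !(Maybe Char)
- simpleTitleCaseMapping :: !(Maybe Char)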
Instances
Show CharDetails Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData showsPrec :: Int -> CharDetails -> ShowS # show :: CharDetails -> String # showList :: [CharDetails] -> ShowS # | |
Eq CharDetails Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData (==) :: CharDetails -> CharDetails -> Bool # (/=) :: CharDetails -> CharDetails -> Bool # |
data GeneralCategory Source #
See: https://www.unicode.org/reports/tr44/#General_Category
Since: 0.1.0
Lu | Letter, Uppercase |
Ll | Letter, Lowercase |
Lt | Letter, Titlecase |
Lm | Letter, Modifier |
Lo | Letter, Other |
Mn | Mark, Non-Spacing |
Mc | Mark, Spacing Combining |
Me | Mark, Enclosing |
Nd | Number, Decimal |
Nl | Number, Letter |
No | Number, Other |
Pc | Punctuation, Connector |
Pd | Punctuation, Dash |
Ps | Punctuation, Open |
Pe | Punctuation, Close |
Pi | Punctuation, Initial quote |
Pf | Punctuation, Final quote |
Po | Punctuation, Other |
Sm | Symbol, Math |
Sc | Symbol, Currency |
Sk | Symbol, Modifier |
So | Symbol, Other |
Zs | Separator, Space |
Zl | Separator, Line |
Zp | Separator, Paragraph |
Cc | Other, Control |
Cf | Other, Format |
Cs | Other, Surrogate |
Co | Other, Private Use |
Cn | Other, Not Assigned |
pattern DefaultGeneralCategory :: GeneralCategory |
Instances
data DecompositionType Source #
Canonical | |
Compat | |
Font | |
NoBreak | |
Initial | |
Medial | |
Final | |
Isolated | |
Circle | |
Super | |
Sub | |
Vertical | |
Wide | |
Narrow | |
Small | |
Square | |
Fraction |
Instances
Show DecompositionType Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData showsPrec :: Int -> DecompositionType -> ShowS # show :: DecompositionType -> String # showList :: [DecompositionType] -> ShowS # | |
Eq DecompositionType Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData (==) :: DecompositionType -> DecompositionType -> Bool # (/=) :: DecompositionType -> DecompositionType -> Bool # |
data Decomposition Source #
Unicode decomposition of a code point
Since: 0.1.0
Instances
Show Decomposition Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData showsPrec :: Int -> Decomposition -> ShowS # show :: Decomposition -> String # showList :: [Decomposition] -> ShowS # | |
Eq Decomposition Source # | |
Defined in Unicode.CharacterDatabase.Parser.UnicodeData (==) :: Decomposition -> Decomposition -> Bool # (/=) :: Decomposition -> Decomposition -> Bool # |
data NumericValue Source #
Numeric value of a code point, if relevant
Since: 0.1.0
Instances
Show NumericValue Source # | |
Defined in Unicode.CharacterDatabase.Parser.Internal showsPrec :: Int -> NumericValue -> ShowS # show :: NumericValue -> String # showList :: [NumericValue] -> ShowS # | |
Eq NumericValue Source # | |
Defined in Unicode.CharacterDatabase.Parser.Internal (==) :: NumericValue -> NumericValue -> Bool # (/=) :: NumericValue -> NumericValue -> Bool # |