-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Parsers for Unicode Character Database (UCD) files
--
-- This package provides simple ByteString parsers for the
-- Unicode character database (UCD).
--
-- See the Unicode Standard Annex #44 for further details.
@package unicode-data-parser
@version 0.2.0
-- | Miscellaneous bits common to various parsers
module Unicode.CharacterDatabase.Parser.Common
-- | Parse a code point formatted as hexadecimal
--
-- Warning: raises an error on invalid input.
--
--
-- >>> parseCodePoint "0061"
-- 'a'
--
parseCodePoint :: ShortByteString -> Char
-- | Parse a list of code points
parseCodePointList :: ShortByteString -> [Char]
-- | A Unicode code point range
data CodePointRange
-- | A single code point
SingleChar :: !Char -> CodePointRange
[$sel:start:SingleChar] :: CodePointRange -> !Char
-- | A range of code points, given by its first and last code points
CharRange :: !Char -> !Char -> CodePointRange
[$sel:start:SingleChar] :: CodePointRange -> !Char
[$sel:end:SingleChar] :: CodePointRange -> !Char
-- | Numeric value of a code point, if relevant
data NumericValue
NotNumeric :: NumericValue
Integer :: !Integer -> NumericValue
Rational :: !Rational -> NumericValue
-- | Parse space-separated list, similar to words.
parseList :: ShortByteString -> [ShortByteString]
-- | Default values for properties.
module Unicode.CharacterDatabase.Parser.Properties.Defaults
-- | Default age
defaultAge :: ShortByteString
-- | Default general category
defaultGeneralCategory :: ShortByteString
-- | Default script
defaultScript :: ShortByteString
-- | Parser for properties files with multiple properties.
module Unicode.CharacterDatabase.Parser.Properties.Multiple
-- | A parser for properties files with multiple properties
parse :: ByteString -> [Entry]
-- | An entry from a properties file with multiple properties
--
--
-- >>> parse "0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>"
-- [Entry {range = CharRange {start = '\t', end = '\r'}, property = "White_Space", value = Nothing}]
--
-- >>> parse "061C ; Bidi_Control # Cf ARABIC LETTER MARK"
-- [Entry {range = SingleChar {start = '\1564'}, property = "Bidi_Control", value = Nothing}]
--
-- >>> parse "037A ; FC_NFKC; 0020 03B9 # Lm GREEK YPOGEGRAMMENI"
-- [Entry {range = SingleChar {start = '\890'}, property = "FC_NFKC", value = Just "0020 03B9"}]
--
data Entry
Entry :: !CodePointRange -> !ShortByteString -> !Maybe ShortByteString -> Entry
-- | Code point, or range of code points, the entry applies to
[$sel:range:Entry] :: Entry -> !CodePointRange
-- | Name of the property, e.g. @White_Space@
[$sel:property:Entry] :: Entry -> !ShortByteString
-- | Value of the property when the line has a third field (e.g.
-- @0020 03B9@ for @FC_NFKC@ in the examples); 'Nothing' otherwise
[$sel:value:Entry] :: Entry -> !Maybe ShortByteString
-- Class instances provided for the types of this module.
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.Properties.Multiple.Entry
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.Properties.Multiple.Entry
-- | Parser for properties files with a single property.
module Unicode.CharacterDatabase.Parser.Properties.Single
-- | A parser for properties files with one value per entry
--
-- The text after the comment marker @#@ is not part of the result.
-- The whole value field is kept as a single string, even when it contains
-- several space-separated items (see the @Arab Copt@ example below).
--
--
-- >>> parse "102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK"
-- [Entry {range = SingleChar {start = '\66272'}, value = "Arab Copt"}]
--
-- >>> parse "1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA"
-- [Entry {range = SingleChar {start = '\7415'}, value = "Beng"}]
--
-- >>> parse "1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW"
-- [Entry {range = CharRange {start = '\7390', end = '\7391'}, value = "Deva"}]
--
-- >>> parse "1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA"
-- [Entry {range = SingleChar {start = '\7376'}, value = "Beng Deva Gran Knda"}]
--
parse :: ByteString -> [Entry]
-- | An entry from a properties file with one value per entry
data Entry
Entry :: !CodePointRange -> !ShortByteString -> Entry
-- | Code point, or range of code points, the entry applies to
[$sel:range:Entry] :: Entry -> !CodePointRange
-- | Value field of the entry, as a single string
[$sel:value:Entry] :: Entry -> !ShortByteString
-- | A parser for properties files with multiple values per entry
--
-- Each resulting entry carries a non-empty list of values.
--
-- NOTE(review): presumably the value field is split on spaces (e.g.
-- @Arab Copt@ giving two values) — confirm against the implementation.
parseMultipleValues :: ByteString -> [EntryMultipleValues]
-- | An entry from a properties file with multiple values per entry
data EntryMultipleValues
EntryMultipleValues :: !CodePointRange -> !NonEmpty ShortByteString -> EntryMultipleValues
-- | Code point, or range of code points, the entry applies to
[$sel:range:EntryMultipleValues] :: EntryMultipleValues -> !CodePointRange
-- | Values of the entry; there is always at least one
[$sel:values:EntryMultipleValues] :: EntryMultipleValues -> !NonEmpty ShortByteString
-- Class instances provided for the types of this module.
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.Properties.Single.Entry
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.Properties.Single.Entry
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.Properties.Single.EntryMultipleValues
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.Properties.Single.EntryMultipleValues
-- | Parser for UnicodeData.txt.
module Unicode.CharacterDatabase.Parser.UnicodeData
-- | Parser for UnicodeData.txt file
--
-- As the Tangut example below shows, a pair of @\<..., First\>@ and
-- @\<..., Last\>@ lines is returned as a single 'CharRange' entry whose name
-- is stripped of the angle brackets and of the @First@/@Last@ marker.
--
--
-- >>> :{
-- traverse_ print . parse $
-- "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
-- \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
-- \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
-- \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
-- :}
-- Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
-- Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
-- Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
--
parse :: ByteString -> [Entry]
-- | An entry in UnicodeData.txt.
data Entry
Entry :: !CodePointRange -> !CharDetails -> Entry
-- | Code point, or range of code points, described by the entry
[$sel:range:Entry] :: Entry -> !CodePointRange
-- | Characteristics shared by every code point of the range
[$sel:details:Entry] :: Entry -> !CharDetails
-- | Core characteristics of a Unicode code point
data CharDetails
CharDetails :: !ShortByteString -> !GeneralCategory -> !Word8 -> !ShortByteString -> !Bool -> !Decomposition -> !NumericValue -> !Maybe Char -> !Maybe Char -> !Maybe Char -> CharDetails
-- | In case of a range, the range’s name. It is better to use the names
-- from DerivedName.txt.
[$sel:name:CharDetails] :: CharDetails -> !ShortByteString
-- | General category of the code point
[$sel:generalCategory:CharDetails] :: CharDetails -> !GeneralCategory
-- | Value in the range 0..254
[$sel:combiningClass:CharDetails] :: CharDetails -> !Word8
-- | Bidirectional class, kept as the raw text of the field (e.g. @L@, @ON@)
[$sel:bidiClass:CharDetails] :: CharDetails -> !ShortByteString
-- | Bidi mirrored flag; the @N@ of the 'parse' examples is read as 'False'
[$sel:bidiMirrored:CharDetails] :: CharDetails -> !Bool
-- | Decomposition of the code point; 'Self' in the 'parse' examples whose
-- decomposition field is empty
[$sel:decomposition:CharDetails] :: CharDetails -> !Decomposition
-- | Numeric value; 'NotNumeric' in the 'parse' examples, whose numeric
-- fields are empty
[$sel:numericValue:CharDetails] :: CharDetails -> !NumericValue
-- | Simple uppercase mapping, if any
[$sel:simpleUpperCaseMapping:CharDetails] :: CharDetails -> !Maybe Char
-- | Simple lowercase mapping, if any (e.g. @Just \'a\'@ for @A@)
[$sel:simpleLowerCaseMapping:CharDetails] :: CharDetails -> !Maybe Char
-- | Simple titlecase mapping, if any
[$sel:simpleTitleCaseMapping:CharDetails] :: CharDetails -> !Maybe Char
-- | General category of a code point. Each constructor is named after the
-- two-letter abbreviation of the category it stands for.
--
-- See: https://www.unicode.org/reports/tr44/#General_Category
data GeneralCategory
-- | Letter, Uppercase
Lu :: GeneralCategory
-- | Letter, Lowercase
Ll :: GeneralCategory
-- | Letter, Titlecase
Lt :: GeneralCategory
-- | Letter, Modifier
Lm :: GeneralCategory
-- | Letter, Other
Lo :: GeneralCategory
-- | Mark, Non-Spacing
Mn :: GeneralCategory
-- | Mark, Spacing Combining
Mc :: GeneralCategory
-- | Mark, Enclosing
Me :: GeneralCategory
-- | Number, Decimal
Nd :: GeneralCategory
-- | Number, Letter
Nl :: GeneralCategory
-- | Number, Other
No :: GeneralCategory
-- | Punctuation, Connector
Pc :: GeneralCategory
-- | Punctuation, Dash
Pd :: GeneralCategory
-- | Punctuation, Open
Ps :: GeneralCategory
-- | Punctuation, Close
Pe :: GeneralCategory
-- | Punctuation, Initial quote
Pi :: GeneralCategory
-- | Punctuation, Final quote
Pf :: GeneralCategory
-- | Punctuation, Other
Po :: GeneralCategory
-- | Symbol, Math
Sm :: GeneralCategory
-- | Symbol, Currency
Sc :: GeneralCategory
-- | Symbol, Modifier
Sk :: GeneralCategory
-- | Symbol, Other
So :: GeneralCategory
-- | Separator, Space
Zs :: GeneralCategory
-- | Separator, Line
Zl :: GeneralCategory
-- | Separator, Paragraph
Zp :: GeneralCategory
-- | Other, Control
Cc :: GeneralCategory
-- | Other, Format
Cf :: GeneralCategory
-- | Other, Surrogate
Cs :: GeneralCategory
-- | Other, Private Use
Co :: GeneralCategory
-- | Other, Not Assigned
Cn :: GeneralCategory
-- | Pattern synonym for the default general category.
--
-- NOTE(review): presumably stands for 'Cn', the category of code points not
-- listed in UnicodeData.txt — confirm against the implementation.
pattern DefaultGeneralCategory :: GeneralCategory
-- | Kind of decomposition carried by a 'Decomposition'.
--
-- In the example of 'parse', the @\<compat\>@ tag of the decomposition field
-- is read as 'Compat'.
--
-- NOTE(review): presumably 'Canonical' stands for a mapping without tag and
-- the other constructors for the remaining tags listed at the link below —
-- confirm against the implementation.
--
-- See:
-- https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
data DecompositionType
Canonical :: DecompositionType
Compat :: DecompositionType
Font :: DecompositionType
NoBreak :: DecompositionType
Initial :: DecompositionType
Medial :: DecompositionType
Final :: DecompositionType
Isolated :: DecompositionType
Circle :: DecompositionType
Super :: DecompositionType
Sub :: DecompositionType
Vertical :: DecompositionType
Wide :: DecompositionType
Narrow :: DecompositionType
Small :: DecompositionType
Square :: DecompositionType
Fraction :: DecompositionType
-- | Unicode decomposition of a code point
data Decomposition
-- | No decomposition mapping; this is the result in the 'parse' examples
-- whose decomposition field is empty
Self :: Decomposition
-- | A decomposition of the given type into a sequence of code points
Decomposition :: !DecompositionType -> ![Char] -> Decomposition
-- | Type of the decomposition. Only the 'Decomposition' constructor has this
-- field; 'Self' has none.
[$sel:decompositionType:Self] :: Decomposition -> !DecompositionType
-- | Code points of the mapping. Only the 'Decomposition' constructor has
-- this field; 'Self' has none.
[$sel:decompositionMapping:Self] :: Decomposition -> ![Char]
-- | Numeric value of a code point, if relevant
data NumericValue
-- | The code point has no numeric value
NotNumeric :: NumericValue
-- | Numeric value stored as an 'Integer'
Integer :: !Integer -> NumericValue
-- | Numeric value stored as a 'Rational'
Rational :: !Rational -> NumericValue
-- Class instances provided for the types of this module.
instance GHC.Read.Read Unicode.CharacterDatabase.Parser.UnicodeData.GeneralCategory
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.UnicodeData.GeneralCategory
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.UnicodeData.GeneralCategory
instance GHC.Enum.Enum Unicode.CharacterDatabase.Parser.UnicodeData.GeneralCategory
instance GHC.Enum.Bounded Unicode.CharacterDatabase.Parser.UnicodeData.GeneralCategory
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.UnicodeData.DecompositionType
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.UnicodeData.DecompositionType
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.UnicodeData.Decomposition
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.UnicodeData.Decomposition
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.UnicodeData.CharDetails
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.UnicodeData.CharDetails
instance GHC.Show.Show Unicode.CharacterDatabase.Parser.UnicodeData.Entry
instance GHC.Classes.Eq Unicode.CharacterDatabase.Parser.UnicodeData.Entry