{- | Copyright : (c) 2024 Pierre Le Marre Maintainer: dev@wismill.eu Stability : experimental Parser for [UnicodeData.txt](https://www.unicode.org/reports/tr44/#UnicodeData.txt). @since 0.1.0 -} module Unicode.CharacterDatabase.Parser.UnicodeData ( parse, Entry (..), CharDetails (..), GeneralCategory (.., DefaultGeneralCategory), DecompositionType (..), Decomposition (..), NumericValue (..), ) where import Data.ByteString qualified as B import Data.ByteString.Char8 qualified as B8 import Data.ByteString.Short qualified as BS import Data.List qualified as L import Data.Word (Word8) import GHC.Stack (HasCallStack) import Unicode.CharacterDatabase.Parser.Internal ( CodePointRange (..), NumericValue (..), parseBoolValue, parseCodePoint, parseCodePointM, parseNumericValue, pattern Comma, pattern NewLine, pattern SemiColon, ) {- $setup >>> import Data.Foldable (traverse_) -} ------------------------------------------------------------------------------- -- Types ------------------------------------------------------------------------------- {- | See: https://www.unicode.org/reports/tr44/#General_Category @since 0.1.0 -} data GeneralCategory = -- | Letter, Uppercase Lu | -- | Letter, Lowercase Ll | -- | Letter, Titlecase Lt | -- | Letter, Modifier Lm | -- | Letter, Other Lo | -- | Mark, Non-Spacing Mn | -- | Mark, Spacing Combining Mc | -- | Mark, Enclosing Me | -- | Number, Decimal Nd | -- | Number, Letter Nl | -- | Number, Other No | -- | Punctuation, Connector Pc | -- | Punctuation, Dash Pd | -- | Punctuation, Open Ps | -- | Punctuation, Close Pe | -- | Punctuation, Initial quote Pi | -- | Punctuation, Final quote Pf | -- | Punctuation, Other Po | -- | Symbol, Math Sm | -- | Symbol, Currency Sc | -- | Symbol, Modifier Sk | -- | Symbol, Other So | -- | Separator, Space Zs | -- | Separator, Line Zl | -- | Separator, Paragraph Zp | -- | Other, Control Cc | -- | Other, Format Cf | -- | Other, Surrogate Cs | -- | Other, Private Use Co | -- | Other, Not Assigned Cn deriving (Bounded, Enum, Eq, Show, Read) pattern DefaultGeneralCategory ∷ GeneralCategory pattern DefaultGeneralCategory = Cn {- | See: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings @since 0.1.0 -} data DecompositionType = Canonical | Compat | Font | NoBreak | Initial | Medial | Final | Isolated | Circle | Super | Sub | Vertical | Wide | Narrow | Small | Square | Fraction deriving (Show, Eq) {- | Unicode decomposition of a code point @since 0.1.0 -} data Decomposition = Self | Decomposition { decompositionType ∷ !DecompositionType , decompositionMapping ∷ ![Char] } deriving (Show, Eq) {- | Core characteristics of a Unicode code point @since 0.1.0 -} data CharDetails = CharDetails { name ∷ !BS.ShortByteString {- ^ In case of a range, the range’s name. It is better to use the names from @DerivedName.txt@. -} , generalCategory ∷ !GeneralCategory , combiningClass ∷ !Word8 -- ^ Value in the range 0..254 , bidiClass ∷ !BS.ShortByteString , bidiMirrored ∷ !Bool , decomposition ∷ !Decomposition , numericValue ∷ !NumericValue , simpleUpperCaseMapping ∷ !(Maybe Char) , simpleLowerCaseMapping ∷ !(Maybe Char) , simpleTitleCaseMapping ∷ !(Maybe Char) } deriving (Eq, Show) {- | An entry in @UnicodeData.txt@. @since 0.1.0 -} data Entry = Entry { range ∷ !CodePointRange , details ∷ !CharDetails } deriving (Eq, Show) -------------------------------------------------------------------------------- -- Parser -------------------------------------------------------------------------------- data PendingUnicodeDataRange = NoRange | -- | A partial range for entry with a name as: @\@ FirstCode !BS.ShortByteString !Char !CharDetails data UnicodeDataAcc = UnicodeDataAcc !B.ByteString !PendingUnicodeDataRange data RawEntry = Complete !Entry | Incomplete !PendingUnicodeDataRange {- | Parser for [UnicodeData.txt file](https://www.unicode.org/reports/tr44/#UnicodeData.txt) >>> :{ traverse_ print . parse $ "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\ \00A8;DIAERESIS;Sk;0;ON; 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\ \17000;;Lo;0;L;;;;;N;;;;;\n\ \187F7;;Lo;0;L;;;;;N;;;;;\n" :} Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}} Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} @since 0.1.0 -} parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry] parse = L.unfoldr go . (`UnicodeDataAcc` NoRange) where go ∷ UnicodeDataAcc → Maybe (Entry, UnicodeDataAcc) go (UnicodeDataAcc raw pending) | B.null raw = Nothing | otherwise = case B.span (/= NewLine) raw of (B8.strip → line, B.drop 1 → raw') | B.null line → go (UnicodeDataAcc raw' pending) | otherwise → case combine pending (parseDetailedChar line) of Complete dc → Just (dc, UnicodeDataAcc raw' NoRange) Incomplete pending' → go (UnicodeDataAcc raw' pending') {- | Combine with previous line A range requires 2 continuous entries with respective names: * @\@ * @\@ See: https://www.unicode.org/reports/tr44/#Name -} combine ∷ (HasCallStack) ⇒ PendingUnicodeDataRange → (Char, CharDetails) → RawEntry combine = \case NoRange → \(ch, dc) → case BS.span (/= Comma) dc.name of (charRange, suffix) | suffix == ", First>" → Incomplete (FirstCode charRange ch dc) _ → Complete (Entry (SingleChar ch) dc) FirstCode range1 ch1 dc1 → \(ch2, dc2) → case BS.span (/= Comma) dc2.name of (range2, suffix) | suffix == ", Last>" → if range1 == range2 && ch1 < ch2 then Complete (Entry (CharRange ch1 ch2) dc1{name = BS.drop 1 range1}) else error $ "Cannot create range: incompatible ranges" <> show (dc1, dc2) _ → error $ "Cannot create range: missing entry corresponding to: " <> show range1 -- | Parse a single entry of @UnicodeData.txt@ parseDetailedChar ∷ (HasCallStack) ⇒ B.ByteString → (Char, CharDetails) parseDetailedChar line = ( parseCodePoint codePoint , CharDetails{..} ) where (codePoint, line1) = B.span (/= SemiColon) line (BS.toShort → name, line2) = B.span (/= SemiColon) (B.tail line1) (rawGeneralCategory, line3) = B.span (/= SemiColon) (B.tail line2) generalCategory = read (B8.unpack rawGeneralCategory) (rawCombiningClass, line4) = B.span (/= SemiColon) (B.tail line3) combiningClass = read (B8.unpack rawCombiningClass) (BS.toShort → bidiClass, line5) = B.span (/= SemiColon) (B.tail line4) (rawDecomposition, line6) = B.span (/= SemiColon) (B.tail line5) decomposition = parseDecomposition rawDecomposition (__decimal, line7) = B.span (/= SemiColon) (B.tail line6) (__digit, line8) = B.span (/= SemiColon) (B.tail line7) (numeric, line9) = B.span (/= SemiColon) (B.tail line8) numericValue = parseNumericValue numeric (parseBoolValue → bidiMirrored, line10) = B.span (/= SemiColon) (B.tail line9) (__uni1Name, line11) = B.span (/= SemiColon) (B.tail line10) (__iso, line12) = B.span (/= SemiColon) (B.tail line11) (rawUpperCase, line13) = B.span (/= SemiColon) (B.tail line12) (rawLowerCase, line14) = B.span (/= SemiColon) (B.tail line13) rawTitleCase = B.tail line14 simpleUpperCaseMapping = parseCodePointM rawUpperCase simpleLowerCaseMapping = parseCodePointM rawLowerCase simpleTitleCaseMapping = parseCodePointM rawTitleCase -- | See: https://www.unicode.org/reports/tr44/#Decomposition_Type parseDecomposition ∷ (HasCallStack) ⇒ B.ByteString → Decomposition parseDecomposition (B8.words → wrds) | null wrds = Self | otherwise = go wrds where go = \case [] → error ("parseDecomposition: invalid entry: " <> show wrds) ys@(x : xs) → case parseDecompositionType x of Canonical → Decomposition Canonical (parseCodePoints ys) other → Decomposition other (parseCodePoints xs) parseCodePoints = map parseCodePoint parseDecompositionType = \case "" → Compat "" → Circle "" → Final "" → Font "" → Fraction "" → Initial "" → Isolated "" → Medial "" → Narrow "" → NoBreak "" → Small "" → Square "" → Sub "" → Super "" → Vertical "" → Wide _ → Canonical -------------------------------------------------------------------------------- -- Doctest -------------------------------------------------------------------------------- -- TODO: add more examples and move to proper test suite {- $ >>> parse "0000;;Cc;0;BN;;;;;N;NULL;;;;" [Entry {range = SingleChar {start = '\NUL'}, details = CharDetails {name = "", generalCategory = Cc, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}] >>> parse "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;" [Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}] >>> parse "00A8;DIAERESIS;Sk;0;ON; 0020 0308;;;;N;SPACING DIAERESIS;;;;" [Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}] >>> parse "1E8E;LATIN CAPITAL LETTER Y WITH DOT ABOVE;Lu;0;L;0059 0307;;;;N;;;;1E8F;" [Entry {range = SingleChar {start = '\7822'}, details = CharDetails {name = "LATIN CAPITAL LETTER Y WITH DOT ABOVE", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "Y\775"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just '\7823', simpleTitleCaseMapping = Nothing}}] >>> parse "320E;PARENTHESIZED HANGUL KIYEOK A;So;0;L; 0028 1100 1161 0029;;;;N;PARENTHESIZED HANGUL GA;;;;" [Entry {range = SingleChar {start = '\12814'}, details = CharDetails {name = "PARENTHESIZED HANGUL KIYEOK A", generalCategory = So, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = "(\4352\4449)"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}] >>> parse "FDFA;ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM;Lo;0;AL; 0635 0644 0649 0020 0627 0644 0644 0647 0020 0639 0644 064A 0647 0020 0648 0633 0644 0645;;;;N;ARABIC LETTER SALLALLAHOU ALAYHE WASALLAM;;;;" [Entry {range = SingleChar {start = '\65018'}, details = CharDetails {name = "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM", generalCategory = Lo, combiningClass = 0, bidiClass = "AL", bidiMirrored = False, decomposition = Decomposition {decompositionType = Isolated, decompositionMapping = "\1589\1604\1609 \1575\1604\1604\1607 \1593\1604\1610\1607 \1608\1587\1604\1605"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}] -} {- $ Consecutive single chars >>> parse "1F34E;RED APPLE;So;0;ON;;;;;N;;;;;\n1F34F;GREEN APPLE;So;0;ON;;;;;N;;;;;" [Entry {range = SingleChar {start = '\127822'}, details = CharDetails {name = "RED APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}},Entry {range = SingleChar {start = '\127823'}, details = CharDetails {name = "GREEN APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}] -} {- $ Consecutive ranges >>> :{ parse "30000;;Lo;0;L;;;;;N;;;;;\n\ \3134A;;Lo;0;L;;;;;N;;;;;\n\ \31350;;Lo;0;L;;;;;N;;;;;\n\ \323AF;;Lo;0;L;;;;;N;;;;;" == [ Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ] :} True -} {- $ Range bewtween single chars >>> vietnamese_alternate_reading_mark_nhay = Entry {range = SingleChar {start = '\94193'}, details = CharDetails {name = "VIETNAMESE ALTERNATE READING MARK NHAY", generalCategory = Mc, combiningClass = 6, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} >>> parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;" == [vietnamese_alternate_reading_mark_nhay] True >>> tangut_ideograph = Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} >>> parse "17000;;Lo;0;L;;;;;N;;;;;\n187F7;;Lo;0;L;;;;;N;;;;;" == [tangut_ideograph] True >>> tangut_component_001 = Entry {range = SingleChar {start = '\100352'}, details = CharDetails {name = "TANGUT COMPONENT-001", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} >>> parse "18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;" == [tangut_component_001] True >>> :{ parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;\n\ \17000;;Lo;0;L;;;;;N;;;;;\n\ \187F7;;Lo;0;L;;;;;N;;;;;\n\ \18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;" == [vietnamese_alternate_reading_mark_nhay, tangut_ideograph, tangut_component_001] :} True -} {- $ Multiple consecutive ranges >>> :{ parse "2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;\n\ \30000;;Lo;0;L;;;;;N;;;;;\n\ \3134A;;Lo;0;L;;;;;N;;;;;\n\ \31350;;Lo;0;L;;;;;N;;;;;\n\ \323AF;;Lo;0;L;;;;;N;;;;;\n\ \E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;;" == [ Entry {range = SingleChar {start = '\195101'}, details = CharDetails {name = "CJK COMPATIBILITY IDEOGRAPH-2FA1D", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "\173568"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} , Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} , Entry {range = SingleChar {start = '\917505'}, details = CharDetails {name = "LANGUAGE TAG", generalCategory = Cf, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ] :} True -}