{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability : experimental

Parser for [UnicodeData.txt](https://www.unicode.org/reports/tr44/#UnicodeData.txt).

@since 0.1.0
-}
module Unicode.CharacterDatabase.Parser.UnicodeData (
  parse,
  Entry (..),
  CharDetails (..),
  GeneralCategory (.., DefaultGeneralCategory),
  DecompositionType (..),
  Decomposition (..),
  NumericValue (..),
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L
import Data.Word (Word8)

import Unicode.CharacterDatabase.Parser.Internal (
  CodePointRange (..),
  NumericValue (..),
  parseBoolValue,
  parseCodePoint,
  parseCodePointM,
  parseNumericValue,
  pattern Comma,
  pattern NewLine,
  pattern SemiColon,
 )

{- $setup
>>> import Data.Foldable (traverse_)
-}

-------------------------------------------------------------------------------
-- Types
-------------------------------------------------------------------------------

{- | See: https://www.unicode.org/reports/tr44/#General_Category

@since 0.1.0
-}
data GeneralCategory
  = -- | Letter, Uppercase
    Lu
  | -- | Letter, Lowercase
    Ll
  | -- | Letter, Titlecase
    Lt
  | -- | Letter, Modifier
    Lm
  | -- | Letter, Other
    Lo
  | -- | Mark, Non-Spacing
    Mn
  | -- | Mark, Spacing Combining
    Mc
  | -- | Mark, Enclosing
    Me
  | -- | Number, Decimal
    Nd
  | -- | Number, Letter
    Nl
  | -- | Number, Other
    No
  | -- | Punctuation, Connector
    Pc
  | -- | Punctuation, Dash
    Pd
  | -- | Punctuation, Open
    Ps
  | -- | Punctuation, Close
    Pe
  | -- | Punctuation, Initial quote
    Pi
  | -- | Punctuation, Final quote
    Pf
  | -- | Punctuation, Other
    Po
  | -- | Symbol, Math
    Sm
  | -- | Symbol, Currency
    Sc
  | -- | Symbol, Modifier
    Sk
  | -- | Symbol, Other
    So
  | -- | Separator, Space
    Zs
  | -- | Separator, Line
    Zl
  | -- | Separator, Paragraph
    Zp
  | -- | Other, Control
    Cc
  | -- | Other, Format
    Cf
  | -- | Other, Surrogate
    Cs
  | -- | Other, Private Use
    Co
  | -- | Other, Not Assigned
    Cn
  deriving (GeneralCategory
GeneralCategory -> GeneralCategory -> Bounded GeneralCategory
forall a. a -> a -> Bounded a
$cminBound :: GeneralCategory
minBound :: GeneralCategory
$cmaxBound :: GeneralCategory
maxBound :: GeneralCategory
Bounded, Int -> GeneralCategory
GeneralCategory -> Int
GeneralCategory -> [GeneralCategory]
GeneralCategory -> GeneralCategory
GeneralCategory -> GeneralCategory -> [GeneralCategory]
GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
(GeneralCategory -> GeneralCategory)
-> (GeneralCategory -> GeneralCategory)
-> (Int -> GeneralCategory)
-> (GeneralCategory -> Int)
-> (GeneralCategory -> [GeneralCategory])
-> (GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> (GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> (GeneralCategory
    -> GeneralCategory -> GeneralCategory -> [GeneralCategory])
-> Enum GeneralCategory
forall a.
(a -> a)
-> (a -> a)
-> (Int -> a)
-> (a -> Int)
-> (a -> [a])
-> (a -> a -> [a])
-> (a -> a -> [a])
-> (a -> a -> a -> [a])
-> Enum a
$csucc :: GeneralCategory -> GeneralCategory
succ :: GeneralCategory -> GeneralCategory
$cpred :: GeneralCategory -> GeneralCategory
pred :: GeneralCategory -> GeneralCategory
$ctoEnum :: Int -> GeneralCategory
toEnum :: Int -> GeneralCategory
$cfromEnum :: GeneralCategory -> Int
fromEnum :: GeneralCategory -> Int
$cenumFrom :: GeneralCategory -> [GeneralCategory]
enumFrom :: GeneralCategory -> [GeneralCategory]
$cenumFromThen :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromThen :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
$cenumFromTo :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromTo :: GeneralCategory -> GeneralCategory -> [GeneralCategory]
$cenumFromThenTo :: GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
enumFromThenTo :: GeneralCategory
-> GeneralCategory -> GeneralCategory -> [GeneralCategory]
Enum, GeneralCategory -> GeneralCategory -> Bool
(GeneralCategory -> GeneralCategory -> Bool)
-> (GeneralCategory -> GeneralCategory -> Bool)
-> Eq GeneralCategory
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: GeneralCategory -> GeneralCategory -> Bool
== :: GeneralCategory -> GeneralCategory -> Bool
$c/= :: GeneralCategory -> GeneralCategory -> Bool
/= :: GeneralCategory -> GeneralCategory -> Bool
Eq, Int -> GeneralCategory -> ShowS
[GeneralCategory] -> ShowS
GeneralCategory -> [Char]
(Int -> GeneralCategory -> ShowS)
-> (GeneralCategory -> [Char])
-> ([GeneralCategory] -> ShowS)
-> Show GeneralCategory
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> GeneralCategory -> ShowS
showsPrec :: Int -> GeneralCategory -> ShowS
$cshow :: GeneralCategory -> [Char]
show :: GeneralCategory -> [Char]
$cshowList :: [GeneralCategory] -> ShowS
showList :: [GeneralCategory] -> ShowS
Show, ReadPrec [GeneralCategory]
ReadPrec GeneralCategory
Int -> ReadS GeneralCategory
ReadS [GeneralCategory]
(Int -> ReadS GeneralCategory)
-> ReadS [GeneralCategory]
-> ReadPrec GeneralCategory
-> ReadPrec [GeneralCategory]
-> Read GeneralCategory
forall a.
(Int -> ReadS a)
-> ReadS [a] -> ReadPrec a -> ReadPrec [a] -> Read a
$creadsPrec :: Int -> ReadS GeneralCategory
readsPrec :: Int -> ReadS GeneralCategory
$creadList :: ReadS [GeneralCategory]
readList :: ReadS [GeneralCategory]
$creadPrec :: ReadPrec GeneralCategory
readPrec :: ReadPrec GeneralCategory
$creadListPrec :: ReadPrec [GeneralCategory]
readListPrec :: ReadPrec [GeneralCategory]
Read)

pattern DefaultGeneralCategory  GeneralCategory
pattern $mDefaultGeneralCategory :: forall {r}. GeneralCategory -> ((# #) -> r) -> ((# #) -> r) -> r
$bDefaultGeneralCategory :: GeneralCategory
DefaultGeneralCategory = Cn

{- | See: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

@since 0.1.0
-}
data DecompositionType
  = Canonical
  | Compat
  | Font
  | NoBreak
  | Initial
  | Medial
  | Final
  | Isolated
  | Circle
  | Super
  | Sub
  | Vertical
  | Wide
  | Narrow
  | Small
  | Square
  | Fraction
  deriving (Int -> DecompositionType -> ShowS
[DecompositionType] -> ShowS
DecompositionType -> [Char]
(Int -> DecompositionType -> ShowS)
-> (DecompositionType -> [Char])
-> ([DecompositionType] -> ShowS)
-> Show DecompositionType
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DecompositionType -> ShowS
showsPrec :: Int -> DecompositionType -> ShowS
$cshow :: DecompositionType -> [Char]
show :: DecompositionType -> [Char]
$cshowList :: [DecompositionType] -> ShowS
showList :: [DecompositionType] -> ShowS
Show, DecompositionType -> DecompositionType -> Bool
(DecompositionType -> DecompositionType -> Bool)
-> (DecompositionType -> DecompositionType -> Bool)
-> Eq DecompositionType
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: DecompositionType -> DecompositionType -> Bool
== :: DecompositionType -> DecompositionType -> Bool
$c/= :: DecompositionType -> DecompositionType -> Bool
/= :: DecompositionType -> DecompositionType -> Bool
Eq)

{- | Unicode decomposition of a code point

@since 0.1.0
-}
data Decomposition
  = Self
  | Decomposition
      { Decomposition -> DecompositionType
decompositionType  !DecompositionType
      , Decomposition -> [Char]
decompositionMapping  ![Char]
      }
  deriving (Int -> Decomposition -> ShowS
[Decomposition] -> ShowS
Decomposition -> [Char]
(Int -> Decomposition -> ShowS)
-> (Decomposition -> [Char])
-> ([Decomposition] -> ShowS)
-> Show Decomposition
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Decomposition -> ShowS
showsPrec :: Int -> Decomposition -> ShowS
$cshow :: Decomposition -> [Char]
show :: Decomposition -> [Char]
$cshowList :: [Decomposition] -> ShowS
showList :: [Decomposition] -> ShowS
Show, Decomposition -> Decomposition -> Bool
(Decomposition -> Decomposition -> Bool)
-> (Decomposition -> Decomposition -> Bool) -> Eq Decomposition
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Decomposition -> Decomposition -> Bool
== :: Decomposition -> Decomposition -> Bool
$c/= :: Decomposition -> Decomposition -> Bool
/= :: Decomposition -> Decomposition -> Bool
Eq)

{- | Core characteristics of a Unicode code point

@since 0.1.0
-}
data CharDetails
  = CharDetails
  { CharDetails -> ShortByteString
name  !BS.ShortByteString
  -- ^ In case of a range, the range’s name.
  -- It is better to use the names from @DerivedName.txt@.
  , CharDetails -> GeneralCategory
generalCategory  !GeneralCategory
  , CharDetails -> Word8
combiningClass  !Word8
  -- ^ Value in the range 0..254
  , CharDetails -> ShortByteString
bidiClass  !BS.ShortByteString
  , CharDetails -> Bool
bidiMirrored  !Bool
  , CharDetails -> Decomposition
decomposition  !Decomposition
  , CharDetails -> NumericValue
numericValue  !NumericValue
  , CharDetails -> Maybe Char
simpleUpperCaseMapping  !(Maybe Char)
  , CharDetails -> Maybe Char
simpleLowerCaseMapping  !(Maybe Char)
  , CharDetails -> Maybe Char
simpleTitleCaseMapping  !(Maybe Char)
  }
  deriving (CharDetails -> CharDetails -> Bool
(CharDetails -> CharDetails -> Bool)
-> (CharDetails -> CharDetails -> Bool) -> Eq CharDetails
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CharDetails -> CharDetails -> Bool
== :: CharDetails -> CharDetails -> Bool
$c/= :: CharDetails -> CharDetails -> Bool
/= :: CharDetails -> CharDetails -> Bool
Eq, Int -> CharDetails -> ShowS
[CharDetails] -> ShowS
CharDetails -> [Char]
(Int -> CharDetails -> ShowS)
-> (CharDetails -> [Char])
-> ([CharDetails] -> ShowS)
-> Show CharDetails
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CharDetails -> ShowS
showsPrec :: Int -> CharDetails -> ShowS
$cshow :: CharDetails -> [Char]
show :: CharDetails -> [Char]
$cshowList :: [CharDetails] -> ShowS
showList :: [CharDetails] -> ShowS
Show)

{- | An entry in @UnicodeData.txt@.

@since 0.1.0
-}
data Entry = Entry
  { Entry -> CodePointRange
range  !CodePointRange
  , Entry -> CharDetails
details  !CharDetails
  }
  deriving (Entry -> Entry -> Bool
(Entry -> Entry -> Bool) -> (Entry -> Entry -> Bool) -> Eq Entry
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Entry -> Entry -> Bool
== :: Entry -> Entry -> Bool
$c/= :: Entry -> Entry -> Bool
/= :: Entry -> Entry -> Bool
Eq, Int -> Entry -> ShowS
[Entry] -> ShowS
Entry -> [Char]
(Int -> Entry -> ShowS)
-> (Entry -> [Char]) -> ([Entry] -> ShowS) -> Show Entry
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Entry -> ShowS
showsPrec :: Int -> Entry -> ShowS
$cshow :: Entry -> [Char]
show :: Entry -> [Char]
$cshowList :: [Entry] -> ShowS
showList :: [Entry] -> ShowS
Show)

--------------------------------------------------------------------------------
-- Parser
--------------------------------------------------------------------------------

data PendingUnicodeDataRange
  = NoRange
  | -- | A partial range for entry with a name as: @\<RANGE_IDENTIFIER, First\>@
    FirstCode !BS.ShortByteString !Char !CharDetails

data UnicodeDataAcc = UnicodeDataAcc !B.ByteString !PendingUnicodeDataRange

data RawEntry = Complete !Entry | Incomplete !PendingUnicodeDataRange

{- | Parser for [UnicodeData.txt file](https://www.unicode.org/reports/tr44/#UnicodeData.txt)

>>> :{
traverse_ print . parse $
  "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
  \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
  \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
  \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
:}
Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}

@since 0.1.0
-}
parse  B.ByteString  [Entry]
parse :: ByteString -> [Entry]
parse = (UnicodeDataAcc -> Maybe (Entry, UnicodeDataAcc))
-> UnicodeDataAcc -> [Entry]
forall b a. (b -> Maybe (a, b)) -> b -> [a]
L.unfoldr UnicodeDataAcc -> Maybe (Entry, UnicodeDataAcc)
go (UnicodeDataAcc -> [Entry])
-> (ByteString -> UnicodeDataAcc) -> ByteString -> [Entry]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ByteString -> PendingUnicodeDataRange -> UnicodeDataAcc
`UnicodeDataAcc` PendingUnicodeDataRange
NoRange)
 where
  go  UnicodeDataAcc  Maybe (Entry, UnicodeDataAcc)
  go :: UnicodeDataAcc -> Maybe (Entry, UnicodeDataAcc)
go (UnicodeDataAcc ByteString
raw PendingUnicodeDataRange
pending)
    | ByteString -> Bool
B.null ByteString
raw = Maybe (Entry, UnicodeDataAcc)
forall a. Maybe a
Nothing
    | Bool
otherwise = case (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
NewLine) ByteString
raw of
        (ByteString -> ByteString
B8.strip  ByteString
line, Int -> ByteString -> ByteString
B.drop Int
1  ByteString
raw')
          | ByteString -> Bool
B.null ByteString
line  UnicodeDataAcc -> Maybe (Entry, UnicodeDataAcc)
go (ByteString -> PendingUnicodeDataRange -> UnicodeDataAcc
UnicodeDataAcc ByteString
raw' PendingUnicodeDataRange
pending)
          | Bool
otherwise  case PendingUnicodeDataRange -> (Char, CharDetails) -> RawEntry
combine PendingUnicodeDataRange
pending (ByteString -> (Char, CharDetails)
parseDetailedChar ByteString
line) of
              Complete Entry
dc  (Entry, UnicodeDataAcc) -> Maybe (Entry, UnicodeDataAcc)
forall a. a -> Maybe a
Just (Entry
dc, ByteString -> PendingUnicodeDataRange -> UnicodeDataAcc
UnicodeDataAcc ByteString
raw' PendingUnicodeDataRange
NoRange)
              Incomplete PendingUnicodeDataRange
pending'  UnicodeDataAcc -> Maybe (Entry, UnicodeDataAcc)
go (ByteString -> PendingUnicodeDataRange -> UnicodeDataAcc
UnicodeDataAcc ByteString
raw' PendingUnicodeDataRange
pending')

{- | Combine with previous line

A range requires 2 continuous entries with respective names:

* @\<RANGE_IDENTIFIER, First\>@
* @\<RANGE_IDENTIFIER, Last\>@

See: https://www.unicode.org/reports/tr44/#Name
-}
combine  PendingUnicodeDataRange  (Char, CharDetails)  RawEntry
combine :: PendingUnicodeDataRange -> (Char, CharDetails) -> RawEntry
combine = \case
  PendingUnicodeDataRange
NoRange  \(Char
ch, CharDetails
dc)  case (Word8 -> Bool)
-> ShortByteString -> (ShortByteString, ShortByteString)
BS.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
Comma) CharDetails
dc.name of
    (ShortByteString
charRange, ShortByteString
suffix) | ShortByteString
suffix ShortByteString -> ShortByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ShortByteString
", First>"  PendingUnicodeDataRange -> RawEntry
Incomplete (ShortByteString -> Char -> CharDetails -> PendingUnicodeDataRange
FirstCode ShortByteString
charRange Char
ch CharDetails
dc)
    (ShortByteString, ShortByteString)
_  Entry -> RawEntry
Complete (CodePointRange -> CharDetails -> Entry
Entry (Char -> CodePointRange
SingleChar Char
ch) CharDetails
dc)
  FirstCode ShortByteString
range1 Char
ch1 CharDetails
dc1  \(Char
ch2, CharDetails
dc2)  case (Word8 -> Bool)
-> ShortByteString -> (ShortByteString, ShortByteString)
BS.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
Comma) CharDetails
dc2.name of
    (ShortByteString
range2, ShortByteString
suffix)
      | ShortByteString
suffix ShortByteString -> ShortByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ShortByteString
", Last>" 
          if ShortByteString
range1 ShortByteString -> ShortByteString -> Bool
forall a. Eq a => a -> a -> Bool
== ShortByteString
range2 Bool -> Bool -> Bool
&& Char
ch1 Char -> Char -> Bool
forall a. Ord a => a -> a -> Bool
< Char
ch2
            then Entry -> RawEntry
Complete (CodePointRange -> CharDetails -> Entry
Entry (Char -> Char -> CodePointRange
CharRange Char
ch1 Char
ch2) CharDetails
dc1{name = BS.drop 1 range1})
            else [Char] -> RawEntry
forall a. HasCallStack => [Char] -> a
error ([Char] -> RawEntry) -> [Char] -> RawEntry
forall a b. (a -> b) -> a -> b
$ [Char]
"Cannot create range: incompatible ranges" [Char] -> ShowS
forall a. Semigroup a => a -> a -> a
<> (CharDetails, CharDetails) -> [Char]
forall a. Show a => a -> [Char]
show (CharDetails
dc1, CharDetails
dc2)
    (ShortByteString, ShortByteString)
_  [Char] -> RawEntry
forall a. HasCallStack => [Char] -> a
error ([Char] -> RawEntry) -> [Char] -> RawEntry
forall a b. (a -> b) -> a -> b
$ [Char]
"Cannot create range: missing <range, Last> entry corresponding to: " [Char] -> ShowS
forall a. Semigroup a => a -> a -> a
<> ShortByteString -> [Char]
forall a. Show a => a -> [Char]
show ShortByteString
range1

-- | Parse a single entry of @UnicodeData.txt@
parseDetailedChar  B.ByteString  (Char, CharDetails)
parseDetailedChar :: ByteString -> (Char, CharDetails)
parseDetailedChar ByteString
line =
  ( ByteString -> Char
parseCodePoint ByteString
codePoint
  , CharDetails{Bool
Maybe Char
Word8
ShortByteString
NumericValue
Decomposition
GeneralCategory
$sel:name:CharDetails :: ShortByteString
$sel:generalCategory:CharDetails :: GeneralCategory
$sel:combiningClass:CharDetails :: Word8
$sel:bidiClass:CharDetails :: ShortByteString
$sel:bidiMirrored:CharDetails :: Bool
$sel:decomposition:CharDetails :: Decomposition
$sel:numericValue:CharDetails :: NumericValue
$sel:simpleUpperCaseMapping:CharDetails :: Maybe Char
$sel:simpleLowerCaseMapping:CharDetails :: Maybe Char
$sel:simpleTitleCaseMapping:CharDetails :: Maybe Char
name :: ShortByteString
generalCategory :: GeneralCategory
combiningClass :: Word8
bidiClass :: ShortByteString
decomposition :: Decomposition
numericValue :: NumericValue
bidiMirrored :: Bool
simpleUpperCaseMapping :: Maybe Char
simpleLowerCaseMapping :: Maybe Char
simpleTitleCaseMapping :: Maybe Char
..}
  )
 where
  (ByteString
codePoint, ByteString
line1) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) ByteString
line
  (ByteString -> ShortByteString
BS.toShort  ShortByteString
name, ByteString
line2) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line1)
  (ByteString
rawGeneralCategory, ByteString
line3) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line2)
  generalCategory :: GeneralCategory
generalCategory = [Char] -> GeneralCategory
forall a. Read a => [Char] -> a
read (ByteString -> [Char]
B8.unpack ByteString
rawGeneralCategory)
  (ByteString
rawCombiningClass, ByteString
line4) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line3)
  combiningClass :: Word8
combiningClass = [Char] -> Word8
forall a. Read a => [Char] -> a
read (ByteString -> [Char]
B8.unpack ByteString
rawCombiningClass)
  (ByteString -> ShortByteString
BS.toShort  ShortByteString
bidiClass, ByteString
line5) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line4)
  (ByteString
rawDecomposition, ByteString
line6) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line5)
  decomposition :: Decomposition
decomposition = ByteString -> Decomposition
parseDecomposition ByteString
rawDecomposition
  (ByteString
__decimal, ByteString
line7) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line6)
  (ByteString
__digit, ByteString
line8) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line7)
  (ByteString
numeric, ByteString
line9) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line8)
  numericValue :: NumericValue
numericValue = ByteString -> NumericValue
parseNumericValue ByteString
numeric
  (ByteString -> Bool
parseBoolValue  Bool
bidiMirrored, ByteString
line10) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line9)
  (ByteString
__uni1Name, ByteString
line11) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line10)
  (ByteString
__iso, ByteString
line12) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line11)
  (ByteString
rawUpperCase, ByteString
line13) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line12)
  (ByteString
rawLowerCase, ByteString
line14) = (Word8 -> Bool) -> ByteString -> (ByteString, ByteString)
B.span (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
SemiColon) (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line13)
  rawTitleCase :: ByteString
rawTitleCase = HasCallStack => ByteString -> ByteString
ByteString -> ByteString
B.tail ByteString
line14
  simpleUpperCaseMapping :: Maybe Char
simpleUpperCaseMapping = ByteString -> Maybe Char
parseCodePointM ByteString
rawUpperCase
  simpleLowerCaseMapping :: Maybe Char
simpleLowerCaseMapping = ByteString -> Maybe Char
parseCodePointM ByteString
rawLowerCase
  simpleTitleCaseMapping :: Maybe Char
simpleTitleCaseMapping = ByteString -> Maybe Char
parseCodePointM ByteString
rawTitleCase

-- | See: https://www.unicode.org/reports/tr44/#Decomposition_Type
parseDecomposition  B.ByteString  Decomposition
parseDecomposition :: ByteString -> Decomposition
parseDecomposition (ByteString -> [ByteString]
B8.words  [ByteString]
wrds)
  | [ByteString] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [ByteString]
wrds = Decomposition
Self
  | Bool
otherwise = [ByteString] -> Decomposition
go [ByteString]
wrds
 where
  go :: [ByteString] -> Decomposition
go = \case
    []  [Char] -> Decomposition
forall a. HasCallStack => [Char] -> a
error ([Char]
"parseDecomposition: invalid entry: " [Char] -> ShowS
forall a. Semigroup a => a -> a -> a
<> [ByteString] -> [Char]
forall a. Show a => a -> [Char]
show [ByteString]
wrds)
    ys :: [ByteString]
ys@(ByteString
x : [ByteString]
xs)  case ByteString -> DecompositionType
parseDecompositionType ByteString
x of
      DecompositionType
Canonical  DecompositionType -> [Char] -> Decomposition
Decomposition DecompositionType
Canonical ([ByteString] -> [Char]
parseCodePoints [ByteString]
ys)
      DecompositionType
other  DecompositionType -> [Char] -> Decomposition
Decomposition DecompositionType
other ([ByteString] -> [Char]
parseCodePoints [ByteString]
xs)

  parseCodePoints :: [ByteString] -> [Char]
parseCodePoints = (ByteString -> Char) -> [ByteString] -> [Char]
forall a b. (a -> b) -> [a] -> [b]
map ByteString -> Char
parseCodePoint

  parseDecompositionType :: ByteString -> DecompositionType
parseDecompositionType = \case
    ByteString
"<compat>"  DecompositionType
Compat
    ByteString
"<circle>"  DecompositionType
Circle
    ByteString
"<final>"  DecompositionType
Final
    ByteString
"<font>"  DecompositionType
Font
    ByteString
"<fraction>"  DecompositionType
Fraction
    ByteString
"<initial>"  DecompositionType
Initial
    ByteString
"<isolated>"  DecompositionType
Isolated
    ByteString
"<medial>"  DecompositionType
Medial
    ByteString
"<narrow>"  DecompositionType
Narrow
    ByteString
"<noBreak>"  DecompositionType
NoBreak
    ByteString
"<small>"  DecompositionType
Small
    ByteString
"<square>"  DecompositionType
Square
    ByteString
"<sub>"  DecompositionType
Sub
    ByteString
"<super>"  DecompositionType
Super
    ByteString
"<vertical>"  DecompositionType
Vertical
    ByteString
"<wide>"  DecompositionType
Wide
    ByteString
_  DecompositionType
Canonical

--------------------------------------------------------------------------------
-- Doctest
--------------------------------------------------------------------------------

-- TODO: add more examples and move to proper test suite

{- $
>>> parse "0000;<control>;Cc;0;BN;;;;;N;NULL;;;;"
[Entry {range = SingleChar {start = '\NUL'}, details = CharDetails {name = "<control>", generalCategory = Cc, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
[Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}]

>>> parse "00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;"
[Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "1E8E;LATIN CAPITAL LETTER Y WITH DOT ABOVE;Lu;0;L;0059 0307;;;;N;;;;1E8F;"
[Entry {range = SingleChar {start = '\7822'}, details = CharDetails {name = "LATIN CAPITAL LETTER Y WITH DOT ABOVE", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "Y\775"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just '\7823', simpleTitleCaseMapping = Nothing}}]

>>> parse "320E;PARENTHESIZED HANGUL KIYEOK A;So;0;L;<compat> 0028 1100 1161 0029;;;;N;PARENTHESIZED HANGUL GA;;;;"
[Entry {range = SingleChar {start = '\12814'}, details = CharDetails {name = "PARENTHESIZED HANGUL KIYEOK A", generalCategory = So, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = "(\4352\4449)"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "FDFA;ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM;Lo;0;AL;<isolated> 0635 0644 0649 0020 0627 0644 0644 0647 0020 0639 0644 064A 0647 0020 0648 0633 0644 0645;;;;N;ARABIC LETTER SALLALLAHOU ALAYHE WASALLAM;;;;"
[Entry {range = SingleChar {start = '\65018'}, details = CharDetails {name = "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM", generalCategory = Lo, combiningClass = 0, bidiClass = "AL", bidiMirrored = False, decomposition = Decomposition {decompositionType = Isolated, decompositionMapping = "\1589\1604\1609 \1575\1604\1604\1607 \1593\1604\1610\1607 \1608\1587\1604\1605"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]
-}

{- $
Consecutive single chars

>>> parse "1F34E;RED APPLE;So;0;ON;;;;;N;;;;;\n1F34F;GREEN APPLE;So;0;ON;;;;;N;;;;;"
[Entry {range = SingleChar {start = '\127822'}, details = CharDetails {name = "RED APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}},Entry {range = SingleChar {start = '\127823'}, details = CharDetails {name = "GREEN APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]
-}

{- $
Consecutive ranges
>>> :{
parse "30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;\n\
      \3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;\n\
      \31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;\n\
      \323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;"
      ==
      [ Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ]
:}
True
-}

{- $
Range bewtween single chars

>>> vietnamese_alternate_reading_mark_nhay = Entry {range = SingleChar {start = '\94193'}, details = CharDetails {name = "VIETNAMESE ALTERNATE READING MARK NHAY", generalCategory = Mc, combiningClass = 6, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;" == [vietnamese_alternate_reading_mark_nhay]
True
>>> tangut_ideograph = Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;" == [tangut_ideograph]
True
>>> tangut_component_001 = Entry {range = SingleChar {start = '\100352'}, details = CharDetails {name = "TANGUT COMPONENT-001", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;" == [tangut_component_001]
True
>>> :{
parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;\n\
      \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
      \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n\
      \18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;"
      == [vietnamese_alternate_reading_mark_nhay, tangut_ideograph, tangut_component_001]
:}
True
-}

{- $
Multiple consecutive ranges

>>> :{
parse "2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;\n\
      \30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;\n\
      \3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;\n\
      \31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;\n\
      \323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;\n\
      \E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;;"
      ==
      [ Entry {range = SingleChar {start = '\195101'}, details = CharDetails {name = "CJK COMPATIBILITY IDEOGRAPH-2FA1D", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "\173568"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = SingleChar {start = '\917505'}, details = CharDetails {name = "LANGUAGE TAG", generalCategory = Cf, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ]
:}
True
-}