{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability : experimental

Parser for [UnicodeData.txt](https://www.unicode.org/reports/tr44/#UnicodeData.txt).

@since 0.1.0
-}
module Unicode.CharacterDatabase.Parser.UnicodeData (
  parse,
  Entry (..),
  CharDetails (..),
  GeneralCategory (.., DefaultGeneralCategory),
  DecompositionType (..),
  Decomposition (..),
  NumericValue (..),
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.ByteString.Short qualified as BS
import Data.List qualified as L
import Data.Word (Word8)
import GHC.Stack (HasCallStack)

import Unicode.CharacterDatabase.Parser.Internal (
  CodePointRange (..),
  NumericValue (..),
  parseBoolValue,
  parseCodePoint,
  parseCodePointM,
  parseNumericValue,
  pattern Comma,
  pattern NewLine,
  pattern SemiColon,
 )

{- $setup
>>> import Data.Foldable (traverse_)
-}

-------------------------------------------------------------------------------
-- Types
-------------------------------------------------------------------------------

{- | See: https://www.unicode.org/reports/tr44/#General_Category

@since 0.1.0
-}
data GeneralCategory
  = -- | Letter, Uppercase
    Lu
  | -- | Letter, Lowercase
    Ll
  | -- | Letter, Titlecase
    Lt
  | -- | Letter, Modifier
    Lm
  | -- | Letter, Other
    Lo
  | -- | Mark, Non-Spacing
    Mn
  | -- | Mark, Spacing Combining
    Mc
  | -- | Mark, Enclosing
    Me
  | -- | Number, Decimal
    Nd
  | -- | Number, Letter
    Nl
  | -- | Number, Other
    No
  | -- | Punctuation, Connector
    Pc
  | -- | Punctuation, Dash
    Pd
  | -- | Punctuation, Open
    Ps
  | -- | Punctuation, Close
    Pe
  | -- | Punctuation, Initial quote
    Pi
  | -- | Punctuation, Final quote
    Pf
  | -- | Punctuation, Other
    Po
  | -- | Symbol, Math
    Sm
  | -- | Symbol, Currency
    Sc
  | -- | Symbol, Modifier
    Sk
  | -- | Symbol, Other
    So
  | -- | Separator, Space
    Zs
  | -- | Separator, Line
    Zl
  | -- | Separator, Paragraph
    Zp
  | -- | Other, Control
    Cc
  | -- | Other, Format
    Cf
  | -- | Other, Surrogate
    Cs
  | -- | Other, Private Use
    Co
  | -- | Other, Not Assigned
    Cn
  deriving (Bounded, Enum, Eq, Show, Read)

pattern DefaultGeneralCategory ∷ GeneralCategory
pattern DefaultGeneralCategory = Cn

{- | See: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

@since 0.1.0
-}
data DecompositionType
  = Canonical
  | Compat
  | Font
  | NoBreak
  | Initial
  | Medial
  | Final
  | Isolated
  | Circle
  | Super
  | Sub
  | Vertical
  | Wide
  | Narrow
  | Small
  | Square
  | Fraction
  deriving (Show, Eq)

{- | Unicode decomposition of a code point

@since 0.1.0
-}
data Decomposition
  = Self
  | Decomposition
      { decompositionType ∷ !DecompositionType
      , decompositionMapping ∷ ![Char]
      }
  deriving (Show, Eq)

{- | Core characteristics of a Unicode code point

@since 0.1.0
-}
data CharDetails
  = CharDetails
  { name ∷ !BS.ShortByteString
  {- ^ In case of a range, the range’s name.
  It is better to use the names from @DerivedName.txt@.
  -}
  , generalCategory ∷ !GeneralCategory
  , combiningClass ∷ !Word8
  -- ^ Value in the range 0..254
  , bidiClass ∷ !BS.ShortByteString
  , bidiMirrored ∷ !Bool
  , decomposition ∷ !Decomposition
  , numericValue ∷ !NumericValue
  , simpleUpperCaseMapping ∷ !(Maybe Char)
  , simpleLowerCaseMapping ∷ !(Maybe Char)
  , simpleTitleCaseMapping ∷ !(Maybe Char)
  }
  deriving (Eq, Show)

{- | An entry in @UnicodeData.txt@.

@since 0.1.0
-}
data Entry = Entry
  { range ∷ !CodePointRange
  , details ∷ !CharDetails
  }
  deriving (Eq, Show)

--------------------------------------------------------------------------------
-- Parser
--------------------------------------------------------------------------------

data PendingUnicodeDataRange
  = NoRange
  | -- | A partial range for entry with a name as: @\<RANGE_IDENTIFIER, First\>@
    FirstCode !BS.ShortByteString !Char !CharDetails

data UnicodeDataAcc = UnicodeDataAcc !B.ByteString !PendingUnicodeDataRange

data RawEntry = Complete !Entry | Incomplete !PendingUnicodeDataRange

{- | Parser for [UnicodeData.txt file](https://www.unicode.org/reports/tr44/#UnicodeData.txt)

>>> :{
traverse_ print . parse $
  "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n\
  \00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;\n\
  \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
  \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n"
:}
Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}
Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}

@since 0.1.0
-}
parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry]
parse = L.unfoldr go . (`UnicodeDataAcc` NoRange)
 where
  go ∷ UnicodeDataAcc → Maybe (Entry, UnicodeDataAcc)
  go (UnicodeDataAcc raw pending)
    | B.null raw = Nothing
    | otherwise = case B.span (/= NewLine) raw of
        (B8.strip → line, B.drop 1 → raw')
          | B.null line → go (UnicodeDataAcc raw' pending)
          | otherwise → case combine pending (parseDetailedChar line) of
              Complete dc → Just (dc, UnicodeDataAcc raw' NoRange)
              Incomplete pending' → go (UnicodeDataAcc raw' pending')

{- | Combine with previous line

A range requires 2 continuous entries with respective names:

* @\<RANGE_IDENTIFIER, First\>@
* @\<RANGE_IDENTIFIER, Last\>@

See: https://www.unicode.org/reports/tr44/#Name
-}
combine ∷ (HasCallStack) ⇒ PendingUnicodeDataRange → (Char, CharDetails) → RawEntry
combine = \case
  NoRange → \(ch, dc) → case BS.span (/= Comma) dc.name of
    (charRange, suffix) | suffix == ", First>" → Incomplete (FirstCode charRange ch dc)
    _ → Complete (Entry (SingleChar ch) dc)
  FirstCode range1 ch1 dc1 → \(ch2, dc2) → case BS.span (/= Comma) dc2.name of
    (range2, suffix)
      | suffix == ", Last>" →
          if range1 == range2 && ch1 < ch2
            then Complete (Entry (CharRange ch1 ch2) dc1{name = BS.drop 1 range1})
            else error $ "Cannot create range: incompatible ranges" <> show (dc1, dc2)
    _ → error $ "Cannot create range: missing <range, Last> entry corresponding to: " <> show range1

-- | Parse a single entry of @UnicodeData.txt@
parseDetailedChar ∷ (HasCallStack) ⇒ B.ByteString → (Char, CharDetails)
parseDetailedChar line =
  ( parseCodePoint codePoint
  , CharDetails{..}
  )
 where
  (codePoint, line1) = B.span (/= SemiColon) line
  (BS.toShort → name, line2) = B.span (/= SemiColon) (B.tail line1)
  (rawGeneralCategory, line3) = B.span (/= SemiColon) (B.tail line2)
  generalCategory = read (B8.unpack rawGeneralCategory)
  (rawCombiningClass, line4) = B.span (/= SemiColon) (B.tail line3)
  combiningClass = read (B8.unpack rawCombiningClass)
  (BS.toShort → bidiClass, line5) = B.span (/= SemiColon) (B.tail line4)
  (rawDecomposition, line6) = B.span (/= SemiColon) (B.tail line5)
  decomposition = parseDecomposition rawDecomposition
  (__decimal, line7) = B.span (/= SemiColon) (B.tail line6)
  (__digit, line8) = B.span (/= SemiColon) (B.tail line7)
  (numeric, line9) = B.span (/= SemiColon) (B.tail line8)
  numericValue = parseNumericValue numeric
  (parseBoolValue → bidiMirrored, line10) = B.span (/= SemiColon) (B.tail line9)
  (__uni1Name, line11) = B.span (/= SemiColon) (B.tail line10)
  (__iso, line12) = B.span (/= SemiColon) (B.tail line11)
  (rawUpperCase, line13) = B.span (/= SemiColon) (B.tail line12)
  (rawLowerCase, line14) = B.span (/= SemiColon) (B.tail line13)
  rawTitleCase = B.tail line14
  simpleUpperCaseMapping = parseCodePointM rawUpperCase
  simpleLowerCaseMapping = parseCodePointM rawLowerCase
  simpleTitleCaseMapping = parseCodePointM rawTitleCase

-- | See: https://www.unicode.org/reports/tr44/#Decomposition_Type
parseDecomposition ∷ (HasCallStack) ⇒ B.ByteString → Decomposition
parseDecomposition (B8.words → wrds)
  | null wrds = Self
  | otherwise = go wrds
 where
  go = \case
    [] → error ("parseDecomposition: invalid entry: " <> show wrds)
    ys@(x : xs) → case parseDecompositionType x of
      Canonical → Decomposition Canonical (parseCodePoints ys)
      other → Decomposition other (parseCodePoints xs)

  parseCodePoints = map parseCodePoint

  parseDecompositionType = \case
    "<compat>" → Compat
    "<circle>" → Circle
    "<final>" → Final
    "<font>" → Font
    "<fraction>" → Fraction
    "<initial>" → Initial
    "<isolated>" → Isolated
    "<medial>" → Medial
    "<narrow>" → Narrow
    "<noBreak>" → NoBreak
    "<small>" → Small
    "<square>" → Square
    "<sub>" → Sub
    "<super>" → Super
    "<vertical>" → Vertical
    "<wide>" → Wide
    _ → Canonical

--------------------------------------------------------------------------------
-- Doctest
--------------------------------------------------------------------------------

-- TODO: add more examples and move to proper test suite

{- $
>>> parse "0000;<control>;Cc;0;BN;;;;;N;NULL;;;;"
[Entry {range = SingleChar {start = '\NUL'}, details = CharDetails {name = "<control>", generalCategory = Cc, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;"
[Entry {range = SingleChar {start = 'A'}, details = CharDetails {name = "LATIN CAPITAL LETTER A", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just 'a', simpleTitleCaseMapping = Nothing}}]

>>> parse "00A8;DIAERESIS;Sk;0;ON;<compat> 0020 0308;;;;N;SPACING DIAERESIS;;;;"
[Entry {range = SingleChar {start = '\168'}, details = CharDetails {name = "DIAERESIS", generalCategory = Sk, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = " \776"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "1E8E;LATIN CAPITAL LETTER Y WITH DOT ABOVE;Lu;0;L;0059 0307;;;;N;;;;1E8F;"
[Entry {range = SingleChar {start = '\7822'}, details = CharDetails {name = "LATIN CAPITAL LETTER Y WITH DOT ABOVE", generalCategory = Lu, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "Y\775"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Just '\7823', simpleTitleCaseMapping = Nothing}}]

>>> parse "320E;PARENTHESIZED HANGUL KIYEOK A;So;0;L;<compat> 0028 1100 1161 0029;;;;N;PARENTHESIZED HANGUL GA;;;;"
[Entry {range = SingleChar {start = '\12814'}, details = CharDetails {name = "PARENTHESIZED HANGUL KIYEOK A", generalCategory = So, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Compat, decompositionMapping = "(\4352\4449)"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]

>>> parse "FDFA;ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM;Lo;0;AL;<isolated> 0635 0644 0649 0020 0627 0644 0644 0647 0020 0639 0644 064A 0647 0020 0648 0633 0644 0645;;;;N;ARABIC LETTER SALLALLAHOU ALAYHE WASALLAM;;;;"
[Entry {range = SingleChar {start = '\65018'}, details = CharDetails {name = "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM", generalCategory = Lo, combiningClass = 0, bidiClass = "AL", bidiMirrored = False, decomposition = Decomposition {decompositionType = Isolated, decompositionMapping = "\1589\1604\1609 \1575\1604\1604\1607 \1593\1604\1610\1607 \1608\1587\1604\1605"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]
-}

{- $
Consecutive single chars

>>> parse "1F34E;RED APPLE;So;0;ON;;;;;N;;;;;\n1F34F;GREEN APPLE;So;0;ON;;;;;N;;;;;"
[Entry {range = SingleChar {start = '\127822'}, details = CharDetails {name = "RED APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}},Entry {range = SingleChar {start = '\127823'}, details = CharDetails {name = "GREEN APPLE", generalCategory = So, combiningClass = 0, bidiClass = "ON", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}]
-}

{- $
Consecutive ranges
>>> :{
parse "30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;\n\
      \3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;\n\
      \31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;\n\
      \323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;"
      ==
      [ Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ]
:}
True
-}

{- $
Range bewtween single chars

>>> vietnamese_alternate_reading_mark_nhay = Entry {range = SingleChar {start = '\94193'}, details = CharDetails {name = "VIETNAMESE ALTERNATE READING MARK NHAY", generalCategory = Mc, combiningClass = 6, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;" == [vietnamese_alternate_reading_mark_nhay]
True
>>> tangut_ideograph = Entry {range = CharRange {start = '\94208', end = '\100343'}, details = CharDetails {name = "Tangut Ideograph", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;" == [tangut_ideograph]
True
>>> tangut_component_001 = Entry {range = SingleChar {start = '\100352'}, details = CharDetails {name = "TANGUT COMPONENT-001", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
>>> parse "18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;" == [tangut_component_001]
True
>>> :{
parse "16FF1;VIETNAMESE ALTERNATE READING MARK NHAY;Mc;6;L;;;;;N;;;;;\n\
      \17000;<Tangut Ideograph, First>;Lo;0;L;;;;;N;;;;;\n\
      \187F7;<Tangut Ideograph, Last>;Lo;0;L;;;;;N;;;;;\n\
      \18800;TANGUT COMPONENT-001;Lo;0;L;;;;;N;;;;;"
      == [vietnamese_alternate_reading_mark_nhay, tangut_ideograph, tangut_component_001]
:}
True
-}

{- $
Multiple consecutive ranges

>>> :{
parse "2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;\n\
      \30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;\n\
      \3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;\n\
      \31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;\n\
      \323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;\n\
      \E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;;"
      ==
      [ Entry {range = SingleChar {start = '\195101'}, details = CharDetails {name = "CJK COMPATIBILITY IDEOGRAPH-2FA1D", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Decomposition {decompositionType = Canonical, decompositionMapping = "\173568"}, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\196608', end = '\201546'}, details = CharDetails {name = "CJK Ideograph Extension G", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = CharRange {start = '\201552', end = '\205743'}, details = CharDetails {name = "CJK Ideograph Extension H", generalCategory = Lo, combiningClass = 0, bidiClass = "L", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}}
      , Entry {range = SingleChar {start = '\917505'}, details = CharDetails {name = "LANGUAGE TAG", generalCategory = Cf, combiningClass = 0, bidiClass = "BN", bidiMirrored = False, decomposition = Self, numericValue = NotNumeric, simpleUpperCaseMapping = Nothing, simpleLowerCaseMapping = Nothing, simpleTitleCaseMapping = Nothing}} ]
:}
True
-}