{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability : experimental

Parser for [CaseFolding.txt](https://www.unicode.org/reports/tr44/#CaseFolding.txt)

@since 0.3.0
-}
module Unicode.CharacterDatabase.Parser.CaseFolding (
  parse,
  Entry (..),
  CaseFoldingType (..),
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.List qualified as L
import GHC.Stack (HasCallStack)

import Unicode.CharacterDatabase.Parser.Internal (
  parseCodePoint,
  withParser,
  pattern HashTag,
  pattern SemiColon,
 )

{- | An entry from @CaseFolding.txt@ file

@since 0.3.0
-}
data Entry = Entry
  { char ∷ !Char
  , caseFoldingType ∷ !CaseFoldingType
  , caseFolding ∷ ![Char]
  }
  deriving (Eq, Show)

{- | Type of case folding

@since 0.3.0
-}
data CaseFoldingType
  = -- | /Common case folding/, common mappings shared by both simple and full mappings.
    CommonCaseFolding
  | -- | /Full case folding/, mappings that cause strings to grow in length.
    FullCaseFolding
  | -- | /Simple case folding/, mappings to single characters where different from 'FullCaseFolding'
    SimpleCaseFolding
  | -- | /Special case/ for uppercase I and dotted uppercase I
    SpecialCaseFolding
  deriving (Eq, Ord, Show)

{- | A parser for @CaseFolding.txt@

>>> parse "0041; C; 0061; # LATIN CAPITAL LETTER A"
[Entry {char = 'A', caseFoldingType = CommonCaseFolding, caseFolding = "a"}]
>>> parse "0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE"
[Entry {char = '\304', caseFoldingType = FullCaseFolding, caseFolding = "i\775"}]
>>> parse "0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE"
[Entry {char = '\304', caseFoldingType = SpecialCaseFolding, caseFolding = "i"}]

@since 0.3.0
-}
parse ∷ (HasCallStack) ⇒ B.ByteString → [Entry]
parse = L.unfoldr (withParser parseSpecialCasingLine)

{- | File format:

@<code>; <status>; <mapping>; # <name>@
-}
parseSpecialCasingLine ∷ (HasCallStack) ⇒ B.ByteString → Maybe Entry
parseSpecialCasingLine line
  | B.null line || B.head line == HashTag = Nothing
  | otherwise = Just Entry{..}
 where
  (rawChar, line1) = B.span (/= SemiColon) line
  char = parseCodePoint rawChar
  (rawCaseFoldType, line2) = B.span (/= SemiColon) (B.tail line1)
  caseFoldingType = case B8.strip rawCaseFoldType of
    "C" → CommonCaseFolding
    "F" → FullCaseFolding
    "S" → SimpleCaseFolding
    "T" → SpecialCaseFolding
    ty → error ("Unsupported case folding type: " <> show ty)
  (rawCaseFolding, _) = B.span (/= SemiColon) (B.tail line2)
  caseFolding = toChars rawCaseFolding
  toChars = fmap parseCodePoint . B8.words