{- |
Copyright : (c) 2024 Pierre Le Marre
Maintainer: dev@wismill.eu
Stability : experimental

Miscellaneous bits common to various parsers
-}
module Unicode.CharacterDatabase.Parser.Internal (
  -- * Word8 patterns
  pattern Asterisk,
  pattern Comma,
  pattern HashTag,
  pattern NewLine,
  pattern Period,
  pattern SemiColon,
  pattern Slash,

  -- * Parser helpers
  withParser,

  -- * Code point
  parseCodePoint,
  parseCodePointM,

  -- * Range
  CodePointRange (..),
  parseCodePointRange,
  parseCodePointRange',

  -- * Numeric value
  NumericValue (..),
  parseNumericValue,

  -- * Boolean value
  parseBoolValue,
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.Char (chr)
import Data.Ratio ((%))
import Data.Word (Word8)

--------------------------------------------------------------------------------
-- Word8 patterns
--------------------------------------------------------------------------------

-- | Line feed, @'\\n'@ (byte @0x0a@)
pattern NewLine ∷ Word8
pattern NewLine = 0x0a

-- | Number sign, @#@ (byte @0x23@)
pattern HashTag ∷ Word8
pattern HashTag = 0x23

-- | Asterisk, @*@ (byte @0x2a@)
pattern Asterisk ∷ Word8
pattern Asterisk = 0x2a

-- | Comma, @,@ (byte @0x2c@)
pattern Comma ∷ Word8
pattern Comma = 0x2c

-- | Full stop, @.@ (byte @0x2e@)
pattern Period ∷ Word8
pattern Period = 0x2e

-- | Solidus, @\/@ (byte @0x2f@)
pattern Slash ∷ Word8
pattern Slash = 0x2f

-- | Semicolon, @;@ (byte @0x3b@)
pattern SemiColon ∷ Word8
pattern SemiColon = 0x3b

--------------------------------------------------------------------------------
-- Parse helpers
--------------------------------------------------------------------------------

{- | Extract the next entry from raw input, one line at a time.

The input is cut at the first 'NewLine' byte; the line is stripped of
surrounding white space and handed to the given parser. Lines for which the
parser returns 'Nothing' are skipped. The result pairs the parsed entry with
the input remaining after that line, or is 'Nothing' once the input is
exhausted without producing an entry.
-}
withParser ∷ (B.ByteString → Maybe a) → B.ByteString → Maybe (a, B.ByteString)
withParser parse = nextEntry
 where
  nextEntry input
    | B.null input = Nothing
    | otherwise =
        case B.break (== NewLine) input of
          (rawLine, afterLine) →
            -- Skip the newline separator itself (no-op on the last line)
            let remaining = B.drop 1 afterLine
             in case parse (B8.strip rawLine) of
                  Just entry → Just (entry, remaining)
                  Nothing → nextEntry remaining

--------------------------------------------------------------------------------
-- Code point parser
--------------------------------------------------------------------------------

{- | Parse a code point formatted as hexadecimal

The digits are given without any prefix, e.g. @0061@. Trailing white space is
tolerated.

/Warning:/ raises an error (via 'error', quoting the offending input) on
invalid input: an empty string, non-hexadecimal characters, or a value above
'maxBound' for 'Char'.

>>> parseCodePoint "0061"
'a'

@since 0.1.0
-}
parseCodePoint ∷ B.ByteString → Char
parseCodePoint raw =
  -- Same acceptance rule as 'read': a single parse with nothing but white
  -- space left over. Reading at 'Integer' rather than 'Int' prevents overlong
  -- digit strings from silently wrapping around to a valid code point.
  case [n | (n, rest) ← reads (B8.unpack ("0x" <> raw)), ("", "") ← lex rest] of
    [n] | n <= maxCodePoint → chr (fromInteger n)
    _ → error ("parseCodePoint: Cannot parse: " <> show raw)
 where
  maxCodePoint ∷ Integer
  maxCodePoint = toInteger (fromEnum (maxBound ∷ Char))

{- | Optional variant of 'parseCodePoint': an empty string yields 'Nothing',
anything else is parsed as a hexadecimal code point.

/Warning:/ raises an error on invalid non-empty input. The parsed 'Char' is
wrapped lazily, so the error surfaces when that 'Char' is evaluated.

>>> parseCodePointM "0061"
Just 'a'
>>> parseCodePointM ""
Nothing

See also: 'parseCodePoint'.

@since 0.1.0
-}
parseCodePointM ∷ B.ByteString → Maybe Char
parseCodePointM raw =
  if B.null raw
    then Nothing
    else Just (parseCodePoint raw)

--------------------------------------------------------------------------------
-- Code point range parser
--------------------------------------------------------------------------------

{- | A Unicode code point range

Either a lone code point ('SingleChar') or a pair of bounds ('CharRange').
Both constructors carry a 'start' field.

/Note:/ the 'end' field exists only in 'CharRange'; applying the 'end'
selector to a 'SingleChar' value raises a runtime error.

@since 0.1.0
-}
data CodePointRange
  = SingleChar {start ∷ !Char}
  | CharRange {start ∷ !Char, end ∷ !Char}
  deriving (Eq, Ord, Show)

{- | Parse either a single code point @AAAA@ or a range @AAAA..BBBB@

Input without a 'Period' byte yields a 'SingleChar'; otherwise the two bytes
following the first code point are skipped as the @..@ separator and the
remainder is parsed as the second code point of a 'CharRange'.

/Warning:/ raises an error on invalid input, see 'parseCodePoint'.

@since 0.1.0
-}
parseCodePointRange ∷ B.ByteString → CodePointRange
parseCodePointRange raw =
  let (firstHex, remainder) = B.break (== Period) raw
      firstChar = parseCodePoint firstHex
   in if B.null remainder
        then SingleChar firstChar
        else CharRange firstChar (parseCodePoint (B.drop 2 remainder))

{- | Parse a mandatory @AAAA..BBBB@ range into its two code points

The input is cut at the first 'Period' byte; the two bytes from there on are
skipped as the @..@ separator.

/Warning:/ raises an error on invalid input, see 'parseCodePoint'. This
includes input without a @..@ separator, since the second code point is then
parsed from an empty string.

@since 0.1.0
-}
parseCodePointRange' ∷ B.ByteString → (Char, Char)
parseCodePointRange' raw =
  case B.break (== Period) raw of
    (lowerHex, remainder) →
      (parseCodePoint lowerHex, parseCodePoint (B.drop 2 remainder))

--------------------------------------------------------------------------------
-- Numeric value parser
--------------------------------------------------------------------------------

{- | Numeric value of a code point, if relevant

As produced by 'parseNumericValue':

* 'NotNumeric': the raw field is empty.
* 'Integer': the raw field contains no @\/@.
* 'Rational': the raw field has the form @numerator\/denominator@.

@since 0.1.0
-}
data NumericValue
  = NotNumeric
  | Integer !Integer
  | Rational !Rational
  deriving (Eq, Show)

{- | Parse a numeric value field

An empty field is 'NotNumeric', a field containing a 'Slash' byte is read as
@numerator\/denominator@ into a 'Rational', and anything else is read as an
'Integer'.

/Warning:/ the numbers are read with 'read', which raises an error on
invalid input.

See: https://www.unicode.org/reports/tr44/#Numeric_Value
-}
parseNumericValue ∷ B.ByteString → NumericValue
parseNumericValue raw
  | B.null raw = NotNumeric
  | B.null slashAndDenominator = Integer (readInteger raw)
  | otherwise =
      Rational (readInteger numerator % readInteger (B.drop 1 slashAndDenominator))
 where
  -- The second component is empty exactly when the field has no slash
  (numerator, slashAndDenominator) = B.break (== Slash) raw
  readInteger ∷ B.ByteString → Integer
  readInteger = read . B8.unpack

--------------------------------------------------------------------------------
-- Boolean value parser
--------------------------------------------------------------------------------

{- | Parse a boolean field: @Y@ is 'True' and @N@ is 'False'.

/Warning:/ raises an error, quoting the input, on any other value.
-}
parseBoolValue ∷ B.ByteString → Bool
parseBoolValue raw
  | raw == "Y" = True
  | raw == "N" = False
  | otherwise = error ("parseBoolValue: Cannot parse: " <> show raw)