{- | Copyright : (c) 2024 Pierre Le Marre Maintainer: dev@wismill.eu Stability : experimental Miscellaneous bits common to various parsers -} module Unicode.CharacterDatabase.Parser.Internal ( -- * Word8 patterns pattern Asterisk, pattern Comma, pattern HashTag, pattern NewLine, pattern Period, pattern SemiColon, pattern Slash, -- * Parser helpers withParser, -- * Code point parseCodePoint, parseCodePointM, -- * Range CodePointRange (..), parseCodePointRange, parseCodePointRange', -- * Numeric value NumericValue (..), parseNumericValue, -- * Boolean value parseBoolValue, ) where import Data.ByteString qualified as B import Data.ByteString.Char8 qualified as B8 import Data.Char (chr) import Data.Maybe (fromMaybe) import Data.Ratio ((%)) import Data.Word (Word8) import GHC.Stack (HasCallStack) -------------------------------------------------------------------------------- -- Char8 patterns -------------------------------------------------------------------------------- -- | @'\\n'@ pattern NewLine ∷ Word8 pattern NewLine = 0x0a -- | @#@ pattern HashTag ∷ Word8 pattern HashTag = 0x23 -- | @*@ pattern Asterisk ∷ Word8 pattern Asterisk = 0x2a -- | @,@ pattern Comma ∷ Word8 pattern Comma = 0x2c -- | @.@ pattern Period ∷ Word8 pattern Period = 0x2e -- | @\/@ pattern Slash ∷ Word8 pattern Slash = 0x2f -- | @;@ pattern SemiColon ∷ Word8 pattern SemiColon = 0x3b -------------------------------------------------------------------------------- -- Parse helpers -------------------------------------------------------------------------------- -- | Use the given parser to parse each line withParser ∷ (HasCallStack) ⇒ (B.ByteString → Maybe a) → B.ByteString → Maybe (a, B.ByteString) withParser parse = go . skipUtf8BOM where go raw | B.null raw = Nothing | otherwise = case B.span (/= NewLine) raw of (B8.strip → line, B.drop 1 → raw') → case parse line of Nothing → go raw' Just entry → Just (entry, raw') skipUtf8BOM ∷ B.ByteString → B.ByteString skipUtf8BOM raw = fromMaybe raw (B.stripPrefix "\xEF\xBB\xBF" raw) -------------------------------------------------------------------------------- -- Code point parser -------------------------------------------------------------------------------- {- | Parse a code point formatted as hexadecimal /Warning:/ raise an error on invalid input. >>> parseCodePoint "0061" 'a' @since 0.1.0 -} parseCodePoint ∷ (HasCallStack) ⇒ B.ByteString → Char parseCodePoint = chr . read . B8.unpack . ("0x" <>) {- | Parse a code point formatted as hexadecimal, or return 'Nothing' on an empty string. /Warning:/ raise an error on invalid input. >>> parseCodePointM "0061" Just 'a' >>> parseCodePointM "" Nothing See also: 'parseCodePoint'. @since 0.1.0 -} parseCodePointM ∷ (HasCallStack) ⇒ B.ByteString → Maybe Char parseCodePointM raw | B.null raw = Nothing | otherwise = Just (parseCodePoint raw) -------------------------------------------------------------------------------- -- Code point range parser -------------------------------------------------------------------------------- {- | A Unicode code point range @since 0.1.0 -} data CodePointRange = SingleChar {start ∷ !Char} | CharRange {start ∷ !Char, end ∷ !Char} deriving (Eq, Show) {- | This should be used only for non-overlapping ranges and expect ranges to be well-formed, i.e. @start < end@. >>> compare (SingleChar 'A') (SingleChar 'A') EQ >>> compare (SingleChar 'A') (SingleChar 'B') LT >>> compare (SingleChar 'B') (SingleChar 'A') GT >>> compare (SingleChar 'A') (CharRange 'A' 'B') LT >>> compare (SingleChar 'B') (CharRange 'A' 'B') GT >>> compare (SingleChar 'C') (CharRange 'A' 'B') GT >>> compare (SingleChar 'A') (CharRange 'A' 'A') EQ >>> compare (SingleChar 'A') (CharRange 'B' 'C') LT >>> compare (CharRange 'A' 'B') (SingleChar 'A') GT >>> compare (CharRange 'A' 'B') (SingleChar 'B') LT >>> compare (CharRange 'A' 'B') (SingleChar 'C') LT >>> compare (CharRange 'B' 'C') (SingleChar 'A') GT >>> compare (CharRange 'A' 'A') (SingleChar 'A') EQ >>> compare (CharRange 'A' 'B') (CharRange 'A' 'B') EQ >>> compare (CharRange 'A' 'B') (CharRange 'C' 'D') LT >>> compare (CharRange 'A' 'B') (CharRange 'B' 'C') LT >>> compare (CharRange 'A' 'C') (CharRange 'B' 'D') LT >>> compare (CharRange 'A' 'D') (CharRange 'B' 'C') LT >>> compare (CharRange 'C' 'D') (CharRange 'A' 'B') GT >>> compare (CharRange 'B' 'D') (CharRange 'A' 'B') GT >>> compare (CharRange 'B' 'C') (CharRange 'A' 'D') GT -} instance Ord CodePointRange where compare = \case SingleChar c1 → \case SingleChar c2 → compare c1 c2 CharRange c3 c4 | c3 == c4 && c4 == c1 → EQ | c1 <= c3 → LT | otherwise → GT CharRange c1 c2 → \case SingleChar c3 | c1 == c2 && c2 == c3 → EQ | c3 <= c1 → GT | otherwise → LT CharRange c3 c4 | c2 < c3 → LT | c1 > c4 → GT | otherwise → case compare c1 c3 of EQ → compare c2 c4 o → o {- | Parse @AAAA..BBBB@ range or single code point @since 0.1.0 -} parseCodePointRange ∷ (HasCallStack) ⇒ B.ByteString → CodePointRange parseCodePointRange raw = case B.span (/= Period) raw of (parseCodePoint → ch1, rest) | B.null rest → SingleChar ch1 | otherwise → CharRange ch1 (parseCodePoint (B.drop 2 rest)) {- | Parse @AAAA..BBBB@ range @since 0.1.0 -} parseCodePointRange' ∷ (HasCallStack) ⇒ B.ByteString → (Char, Char) parseCodePointRange' raw = case B.span (/= Period) raw of (parseCodePoint → ch1, rest) → (ch1, parseCodePoint (B.drop 2 rest)) -------------------------------------------------------------------------------- -- Numeric value parser -------------------------------------------------------------------------------- {- | Numeric value of a code point, if relevant @since 0.1.0 -} data NumericValue = NotNumeric | Integer !Integer | Rational !Rational deriving (Eq, Show) -- | See: https://www.unicode.org/reports/tr44/#Numeric_Value parseNumericValue ∷ (HasCallStack) ⇒ B.ByteString → NumericValue parseNumericValue raw | B.null raw = NotNumeric | B.elem Slash raw = case B.span (/= Slash) raw of (num, denum) → Rational (readB num % (readB . B.drop 1) denum) where readB = read . B8.unpack | otherwise = Integer (read (B8.unpack raw)) -------------------------------------------------------------------------------- -- Boolean value parser -------------------------------------------------------------------------------- -- | Parse boolean values ‘Y’ and ‘N’. parseBoolValue ∷ (HasCallStack) ⇒ B.ByteString → Bool parseBoolValue = \case "Y" → True "N" → False raw → error ("parseBoolValue: Cannot parse: " <> show raw)