{- |
Copyright   : (c) 2024 Pierre Le Marre
Maintainer  : dev@wismill.eu
Stability   : experimental

Miscellaneous bits common to various parsers
-}
module Unicode.CharacterDatabase.Parser.Internal (
    -- * Word8 patterns
    pattern Asterisk,
    pattern Comma,
    pattern HashTag,
    pattern NewLine,
    pattern Period,
    pattern SemiColon,
    pattern Slash,

    -- * Parser helpers
    withParser,

    -- * Code point
    parseCodePoint,
    parseCodePointM,

    -- * Range
    CodePointRange (..),
    parseCodePointRange,
    parseCodePointRange',

    -- * Numeric value
    NumericValue (..),
    parseNumericValue,

    -- * Boolean value
    parseBoolValue,
) where

import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as B8
import Data.Char (chr)
import Data.Ratio ((%))
import Data.Word (Word8)

--------------------------------------------------------------------------------
-- Word8 patterns
--------------------------------------------------------------------------------

-- | Byte @0x0a@: @'\\n'@
pattern NewLine ∷ Word8
pattern NewLine = 0x0a

-- | Byte @0x23@: @#@
pattern HashTag ∷ Word8
pattern HashTag = 0x23

-- | Byte @0x2a@: @*@
pattern Asterisk ∷ Word8
pattern Asterisk = 0x2a

-- | Byte @0x2c@: @,@
pattern Comma ∷ Word8
pattern Comma = 0x2c

-- | Byte @0x2e@: @.@
pattern Period ∷ Word8
pattern Period = 0x2e

-- | Byte @0x2f@: @\/@
pattern Slash ∷ Word8
pattern Slash = 0x2f

-- | Byte @0x3b@: @;@
pattern SemiColon ∷ Word8
pattern SemiColon = 0x3b

--------------------------------------------------------------------------------
-- Parse helpers
--------------------------------------------------------------------------------

{- | Produce the next entry of a line-oriented input.

The input is consumed one line at a time, lines being separated by 'NewLine'.
Each line is stripped of surrounding whitespace and handed to the given parser;
lines for which the parser returns 'Nothing' are skipped.

Returns the first successfully parsed entry together with the input remaining
after its line, or 'Nothing' once the input is exhausted. The result has the
shape of an @unfoldr@ step function.
-}
withParser ∷ (B.ByteString → Maybe a) → B.ByteString → Maybe (a, B.ByteString)
withParser parse = nextEntry
  where
    nextEntry input
        | B.null input = Nothing
        | otherwise =
            let (rawLine, afterLine) = B.break (== NewLine) input
                -- Skip the line terminator itself (no-op on the last line).
                remaining = B.drop 1 afterLine
             in maybe
                    (nextEntry remaining)
                    (\entry → Just (entry, remaining))
                    (parse (B8.strip rawLine))

--------------------------------------------------------------------------------
-- Code point parser
--------------------------------------------------------------------------------

{- | Parse a code point written as hexadecimal digits.

A @0x@ prefix is prepended to the input before it is read, so the input itself
must not carry one.

/Warning:/ raise an error on invalid input.

>>> parseCodePoint "0061"
'a'

@since 0.1.0
-}
parseCodePoint ∷ B.ByteString → Char
parseCodePoint raw = chr (read (B8.unpack ("0x" <> raw)))

{- | Like 'parseCodePoint', but an empty string yields 'Nothing' instead of
an error.

/Warning:/ raise an error on invalid, non-empty input.

>>> parseCodePointM "0061"
Just 'a'
>>> parseCodePointM ""
Nothing

See also: 'parseCodePoint'.

@since 0.1.0
-}
parseCodePointM ∷ B.ByteString → Maybe Char
parseCodePointM raw =
    if B.null raw
        then Nothing
        else Just (parseCodePoint raw)

--------------------------------------------------------------------------------
-- Code point range parser
--------------------------------------------------------------------------------

{- | A Unicode code point range: either a single code point or a pair of
bounds.

@since 0.1.0
-}
data CodePointRange
    = SingleChar {start ∷ !Char}
    | CharRange {start ∷ !Char, end ∷ !Char}
    deriving (Eq, Ord, Show)

{- | Parse either a @AAAA..BBBB@ range or a single code point.

The input is split at the first 'Period'. When nothing follows, the result is
a 'SingleChar'; otherwise the two separator bytes are dropped and the rest is
parsed as the upper bound of a 'CharRange'.

/Warning:/ raise an error on invalid input (see 'parseCodePoint').

@since 0.1.0
-}
parseCodePointRange ∷ B.ByteString → CodePointRange
parseCodePointRange raw
    | B.null separatorAndUpper = SingleChar lower
    | otherwise = CharRange lower (parseCodePoint (B.drop 2 separatorAndUpper))
  where
    (lowerText, separatorAndUpper) = B.break (== Period) raw
    lower = parseCodePoint lowerText

{- | Parse a @AAAA..BBBB@ range into its two bounds.

Unlike 'parseCodePointRange', a separator is always expected: the text before
the first 'Period' is the first bound, and the text found two bytes after it
is the second bound.

/Warning:/ each component raises an error when forced on invalid input
(see 'parseCodePoint').

@since 0.1.0
-}
parseCodePointRange' ∷ B.ByteString → (Char, Char)
parseCodePointRange' raw = case B.break (== Period) raw of
    (lowerText, separatorAndUpper) →
        ( parseCodePoint lowerText
        , parseCodePoint (B.drop 2 separatorAndUpper)
        )

--------------------------------------------------------------------------------
-- Numeric value parser
--------------------------------------------------------------------------------

{- | Numeric value of a code point, if relevant

@since 0.1.0
-}
data NumericValue
    = NotNumeric
    | Integer !Integer
    | Rational !Rational
    deriving (Eq, Show)

{- | Parse a numeric value field.

* An empty field is 'NotNumeric'.
* A field containing a 'Slash' is a 'Rational': numerator before the first
  slash, denominator after it.
* Anything else is read as an 'Integer'.

/Warning:/ raise an error on invalid input.

See: https://www.unicode.org/reports/tr44/#Numeric_Value
-}
parseNumericValue ∷ B.ByteString → NumericValue
parseNumericValue raw
    | B.null raw = NotNumeric
    | not (B.null slashAndDenominator) =
        Rational (readInteger numerator % readInteger (B.drop 1 slashAndDenominator))
    | otherwise = Integer (readInteger raw)
  where
    -- The second component is non-empty exactly when the field has a slash.
    (numerator, slashAndDenominator) = B.break (== Slash) raw

    readInteger ∷ B.ByteString → Integer
    readInteger = read . B8.unpack

--------------------------------------------------------------------------------
-- Boolean value parser
--------------------------------------------------------------------------------

{- | Parse boolean values ‘Y’ and ‘N’.

/Warning:/ raise an error on any other input.
-}
parseBoolValue ∷ B.ByteString → Bool
parseBoolValue raw
    | raw == "Y" = True
    | raw == "N" = False
    | otherwise = error ("parseBoolValue: Cannot parse: " <> show raw)