-- |
-- Module      :  Data.Text.Parser
-- License     :  CC0-1.0
--
-- Maintainer  :  mordae@anilinux.org
-- Stability   :  unstable
-- Portability :  non-portable (ghc)
--
-- This module provides a parser for unicode 'Text'.
--
--   * If you\'d like to parse ASCII text, you might want to take a look at
--     "Data.ByteString.Parser.Char8". It is much, much faster.
--
--   * If you\'d like to parse byte sequences, look instead at the
--     "Data.ByteString.Parser".
--

module Data.Text.Parser
  ( Parser(..)
  , Result(..)
  , parseOnly

    -- * Chars
  , char
  , notChar
  , anyChar
  , satisfy
  , space
  , isSpace
  , skipSpace
  , peekChar

    -- * Strings
  , string
  , stringCI
  , Data.Text.Parser.take
  , scan
  , runScanner
  , Data.Text.Parser.takeWhile
  , takeWhile1
  , takeTill
  , takeTill1

    -- * Numbers
  , signed
  , decimal
  , hexadecimal
  , octal
  , fractional

    -- * Combinators
  , provided
  , choice
  , branch
  , Snack.Combinators.count
  , optional
  , eitherP
  , option
  , many
  , many1
  , manyTill
  , sepBy
  , sepBy1
  , wrap
  , match
  , label
  , unlabel
  , validate

    -- * End Of Input
  , takeText
  , peekText
  , endOfInput
  , atEnd

    -- * Position
  , offset
  , position
  , explain
  , Explanation(..)

    -- * Miscelaneous
    -- |
    -- These are all generic methods, but since I sometimes forget about them,
    -- it is nice to have them listed here for reference what writing parsers.
  , Control.Applicative.empty
  , pure
  , guard
  , when
  , unless
  , void
  )
where
  import Prelude hiding (null, length, splitAt, take)

  import Control.Applicative
  import Control.Monad

  import Data.Char
  import Data.Maybe
  import Data.List qualified as List

  import Data.Text as T
  import Data.Text.Unsafe as T
  import Data.Text.Encoding as T

  import Data.ByteString qualified as BS
  import Data.ByteString.Lex.Fractional qualified as LF
  import Data.ByteString.Lex.Integral qualified as LI

  import Snack.Combinators


  -- |
  -- Result represents either success or some kind of failure.
  --
  -- You can find the problematic offset by subtracting length of the
  -- remainder from length of the original input.
  --
  data Result a
    = Success a {-# UNPACK #-} !Text
      -- ^ Parser successfully matched the input.
      --   Produces the parsing result and the remainder of the input.

    | Failure [String] {-# UNPACK #-} !Text
      -- ^ Parser failed to match the input.
      --   Produces list of expected inputs and the corresponding remainder.

    | Error String {-# UNPACK #-} !Text {-# UNPACK #-} !Int
      -- ^ Parser ran into an error. Either syntactic or a validation one.

    deriving (Eq, Show)

  instance Functor Result where
    {-# INLINE fmap #-}
    fmap fn (Success res more) = Success (fn res) more
    fmap _  (Failure expected more) = Failure expected more
    fmap _  (Error reason more len) = Error reason more len


  -- |
  -- Parser for 'Text' inputs.
  --
  newtype Parser a =
    Parser
      { runParser :: Text -> Result a
        -- ^ Run the parser on specified input.
      }

  instance Functor Parser where
    {-# INLINE fmap #-}
    fmap fn Parser{runParser} = Parser \inp ->
      fmap fn (runParser inp)

  instance Applicative Parser where
    {-# INLINE pure #-}
    pure x = Parser \inp -> Success x inp

    {-# INLINE (<*>) #-}
    (Parser runFn) <*> (Parser runArg) = Parser \inp ->
      case runFn inp of
        Success fn rest -> fmap fn (runArg rest)
        Failure expected more -> Failure expected more
        Error reason more len -> Error reason more len

  instance Alternative Parser where
    {-# INLINE empty #-}
    empty = Parser \inp -> Failure [] inp

    -- |
    -- Tries the right branch only if the left brach produces Failure.
    -- Does not mask Error.
    --
    {-# INLINE (<|>) #-}
    (Parser runLeft) <|> (Parser runRight) = Parser \inp ->
      case runLeft inp of
        Success res more -> Success res more
        Error reason more len -> Error reason more len
        Failure expected more ->
          case runRight inp of
            Success res' more' -> Success res' more'
            Error reason' more' len' -> Error reason' more' len'
            Failure expected' more' ->
              -- Longer match (shorter remainder) wins.
              case length more `compare` length more' of
                LT -> Failure expected more
                EQ -> Failure (expected <> expected') more
                GT -> Failure expected' more'

  instance Monad Parser where
    {-# INLINE (>>=) #-}
    (Parser runLeft) >>= right = Parser \inp ->
      case runLeft inp of
        Success res more -> runParser (right res) more
        Failure expected more -> Failure expected more
        Error reason more len -> Error reason more len

  instance MonadPlus Parser


  -- |
  -- Accepts a single, matching character.
  --
  {-# INLINE CONLIKE char #-}
  char :: Char -> Parser Char
  char c = label (show c) $ satisfy (c ==)


  -- |
  -- Accepts a single, differing character.
  --
  {-# INLINE CONLIKE notChar #-}
  notChar :: Char -> Parser Char
  notChar c = satisfy (c /=)


  -- |
  -- Discards the remaining input and returns just the parse result.
  -- You might want to combine it with 'endOfInput' for the best effect.
  --
  -- Example:
  --
  -- @
  -- parseOnly (pContacts \<* endOfInput) bstr
  -- @
  --
  {-# INLINE CONLIKE parseOnly #-}
  parseOnly :: Parser a -> Text -> Either String a
  parseOnly par = \inp ->
    case runParser par inp of
      Success res _ -> Right res
      Error reason _ _ -> Left reason
      Failure expected _ ->
        case expected of
          [] -> Left $ "Unexpected input."
          ex -> Left $ "Expected " <> List.intercalate ", " ex <> "."


  -- |
  -- Accepts a single character.
  --
  {-# INLINE anyChar #-}
  anyChar :: Parser Char
  anyChar = Parser \inp ->
    if null inp
       then Failure ["any char"] inp
       else Success (unsafeHead inp) (unsafeTail inp)


  -- |
  -- Accepts a single character matching the predicate.
  --
  {-# INLINE CONLIKE satisfy #-}
  satisfy :: (Char -> Bool) -> Parser Char
  satisfy isOk = Parser \inp ->
    if null inp
       then Failure ["more input"] inp
       else let c = unsafeHead inp
             in if isOk c
                   then Success c (unsafeTail inp)
                   else Failure [] inp


  -- |
  -- Accepts a single unicode white space character.
  -- See 'isSpace' for details.
  --
  {-# INLINE space #-}
  space :: Parser Char
  space = label "space" $ satisfy isSpace


  -- |
  -- Accepts multiple unicode white space characters.
  -- See 'isSpace' for details.
  --
  {-# INLINE skipSpace #-}
  skipSpace :: Parser ()
  skipSpace = void $ Data.Text.Parser.takeWhile isSpace


  -- |
  -- Peeks ahead, but does not consume.
  --
  -- Be careful, peeking behind end of the input fails.
  -- You might want to check using 'atEnd' beforehand.
  --
  {-# INLINE peekChar #-}
  peekChar :: Parser Char
  peekChar = Parser \inp ->
    if null inp
       then Failure ["more input"] inp
       else Success (unsafeHead inp) inp


  -- |
  -- Accepts a matching string.
  --
  {-# INLINE CONLIKE string #-}
  string :: Text -> Parser Text
  string str = Parser \inp ->
    let (pfx, sfx) = splitAt (length str) inp
     in case pfx == str of
          True -> Success pfx sfx
          False -> Failure [show str] inp


  -- |
  -- Same as 'string', but case insensitive.
  --
  {-# INLINE CONLIKE stringCI #-}
  stringCI :: Text -> Parser Text
  stringCI str = Parser \inp ->
    let (pfx, sfx) = splitAt (length str) inp
     in case toCaseFold pfx == toCaseFold str of
          True -> Success pfx sfx
          False -> Failure [show str] inp


  -- |
  -- Accepts given number of characters.
  -- Fails when not enough characters are available.
  --
  {-# INLINE CONLIKE take #-}
  take :: Int -> Parser Text
  take n = Parser \inp ->
    if n > length inp
       then Failure [show n <> " more characters"] inp
       else let (pfx, more) = splitAt n inp
             in Success pfx more


  -- |
  -- Scans ahead statefully and then accepts whatever characters the scanner liked.
  -- Scanner returns 'Nothing' to mark end of the acceptable extent.
  --
  {-# INLINE CONLIKE scan #-}
  scan :: s -> (s -> Char -> Maybe s) -> Parser Text
  scan state scanner = fst <$> runScanner state scanner


  -- |
  -- Like 'scan', but also returns the final scanner state.
  --
  {-# INLINE CONLIKE runScanner #-}
  runScanner :: s -> (s -> Char -> Maybe s) -> Parser (Text, s)
  runScanner state scanner = Parser \inp -> loop inp state 0
    where
      loop inp !st !n =
        case n >= lengthWord8 inp of
          True -> Success (inp, st) mempty
          False ->
            case iter inp n of
              Iter c n' ->
                case scanner st c of
                  Nothing -> Success (takeWord8 n inp, st) (dropWord8 n inp)
                  Just st' -> loop inp st' (n + n')


  -- |
  -- Efficiently consume as long as the input characters match the predicate.
  -- An inverse of 'takeTill'.
  --
  {-# INLINE CONLIKE takeWhile #-}
  takeWhile :: (Char -> Bool) -> Parser Text
  takeWhile test = takeTill (not . test)


  -- |
  -- Like 'Data.Text.Parser.takeWhile', but requires at least a single character.
  --
  {-# INLINE CONLIKE takeWhile1 #-}
  takeWhile1 :: (Char -> Bool) -> Parser Text
  takeWhile1 test = Data.Text.Parser.takeWhile test `provided` (not . null)


  -- |
  -- Efficiently consume until a character matching the predicate is found.
  -- An inverse of 'Data.Text.Parser.takeWhile'.
  --
  {-# INLINE CONLIKE takeTill #-}
  takeTill :: (Char -> Bool) -> Parser Text
  takeTill test = Parser \inp ->
    let n = fromMaybe (length inp) $ findIndex test inp
        (pfx, more) = splitAt n inp
     in Success pfx more


  -- |
  -- Same as 'takeTill', but requires at least a single character.
  --
  {-# INLINE CONLIKE takeTill1 #-}
  takeTill1 :: (Char -> Bool) -> Parser Text
  takeTill1 test = Data.Text.Parser.takeTill test `provided` (not . null)


  -- |
  -- Makes the parser not only return the result, but also the original
  -- matched extent.
  --
  {-# INLINE CONLIKE match #-}
  match :: Parser a -> Parser (Text, a)
  match par = Parser \inp ->
    case runParser par inp of
      Failure expected more -> Failure expected more
      Error reason more len -> Error reason more len
      Success res more ->
        let n = length more
         in Success (T.take n inp, res) more


  -- |
  -- Names an extent of the parser.
  --
  -- When the extent returns a Failure, details are discarded and replaced
  -- with the extent as a whole.
  --
  -- When the extent returns an Error, it is adjusted to cover the whole
  -- extent, but the reason is left intact.
  --
  -- You should strive to make labeled extents as small as possible,
  -- approximately of a typical token size. For example:
  --
  -- @
  -- pString = label \"string\" $ pStringContents \`wrap\` char \'\"\'
  -- @
  --
  {-# INLINE CONLIKE label #-}
  label :: String -> Parser a -> Parser a
  label lbl par = Parser \inp ->
    case runParser par inp of
      Success res more -> Success res more
      Failure _expected _more -> Failure [lbl] inp
      Error reason more len ->
        let len' = len + (length inp - length more)
         in Error reason inp len'


  -- |
  -- Un-names an extent of the parser.
  --
  -- Same as 'label', but removes any expected values upon Failure.
  -- Very useful to mark comments and optional whitespace with.
  --
  {-# INLINE CONLIKE unlabel #-}
  unlabel :: Parser a -> Parser a
  unlabel par = Parser \inp ->
    case runParser par inp of
      Success res more -> Success res more
      Failure _expected _more -> Failure [] inp
      Error reason more len ->
        let len' = len + (length inp - length more)
         in Error reason inp len'


  -- |
  -- Validate parser result and turn it into an Error upon failure.
  --
  {-# INLINE CONLIKE validate #-}
  validate :: (a -> Either String b) -> Parser a -> Parser b
  validate test par = Parser \inp ->
    case runParser par inp of
      Failure expected more -> Failure expected more
      Error reason more len -> Error reason more len
      Success res more ->
        case test res of
          Right res' -> Success res' more
          Left reason -> Error reason inp (length inp - length more)


  -- |
  -- Given list of matchers and parsers, runs the first parser whose matcher
  -- succeeds on the input. This pattern makes for a simpler alternative to
  -- @try@ used in other parser combinator libraries.
  --
  -- Example:
  --
  -- @
  -- pProperty = branch [ ( string "public" <* skipSpace
  --                      , \_ -> Property Public <$> pToken
  --                      )
  --                    , ( string "private" <* skipSpace
  --                      , \_ -> Property Private <$> pToken
  --                      )
  --                    ]
  -- @
  --
  {-# INLINE CONLIKE branch #-}
  branch :: [(Parser a, a -> Parser b)] -> Parser b
  branch [] = Parser \inp -> Failure [] inp
  branch ((Parser test, finish) : alts) =
    Parser \inp ->
      case test inp of
        Success res more -> runParser (finish res) more
        Error reason more len -> Error reason more len
        Failure _expected _more -> runParser (branch alts) inp


  -- |
  -- Accept whatever input remains.
  --
  {-# INLINE takeText #-}
  takeText :: Parser Text
  takeText = Parser \inp -> Success inp mempty


  -- |
  -- Peek at whatever input remains.
  --
  {-# INLINE peekText #-}
  peekText :: Parser Text
  peekText = Parser \inp -> Success inp inp


  -- |
  -- Accepts end of input and fails if we are not there yet.
  --
  {-# INLINE endOfInput #-}
  endOfInput :: Parser ()
  endOfInput = Parser \case
    inp | null inp  -> Success () inp
    inp             -> Failure ["end of input"] inp


  -- |
  -- Returns whether we are at the end of the input yet.
  --
  {-# INLINE atEnd #-}
  atEnd :: Parser Bool
  atEnd = Parser \inp -> Success (null inp) inp


  -- |
  -- Accepts optional @\'+\'@ or @\'-\'@ character and then applies it to
  -- the following parser result.
  --
  {-# INLINE signed #-}
  signed :: (Num a) => Parser a -> Parser a
  signed runNumber = (char '-' *> fmap negate runNumber)
                 <|> (char '+' *> runNumber)
                 <|> (runNumber)


  {-# INLINE CONLIKE unsafeWithUtf8 #-}
  unsafeWithUtf8 :: (BS.ByteString -> Maybe (a, BS.ByteString))
                 -> Text -> Maybe (a, Text)
  unsafeWithUtf8 bspar = \inp ->
    let bstr = encodeUtf8 inp
     in case bspar bstr of
          Nothing -> Nothing
          Just (x, more) ->
            -- This should be perfectly safe as long as the embedded
            -- parser returns the actual remaining input and not some
            -- random chunk of bytes.
            let n = lengthWord8 inp - BS.length more
             in Just (x, dropWord8 n inp)


  -- |
  -- Accepts an integral number in the decimal format.
  --
  {-# INLINE decimal #-}
  decimal :: (Integral a) => Parser a
  decimal = Parser \inp ->
    case unsafeWithUtf8 LI.readDecimal inp of
      Just (res, more) -> Success res more
      Nothing -> Failure ["decimal"] inp


  -- |
  -- Accepts an integral number in the hexadecimal format in either case.
  -- Does not look for @0x@ or similar prefixes.
  --
  {-# INLINE hexadecimal #-}
  hexadecimal :: (Integral a) => Parser a
  hexadecimal = Parser \inp ->
    case unsafeWithUtf8 LI.readHexadecimal inp of
      Just (res, more) -> Success res more
      Nothing -> Failure ["hexadecimal"] inp


  -- |
  -- Accepts an integral number in the octal format.
  --
  {-# INLINE octal #-}
  octal :: (Integral a) => Parser a
  octal = Parser \inp ->
    case unsafeWithUtf8 LI.readOctal inp of
      Just (res, more) -> Success res more
      Nothing -> Failure ["octal"] inp


  -- |
  -- Accepts a fractional number as a decimal optinally followed by a colon
  -- and the fractional part. Does not support exponentiation.
  --
  {-# INLINE fractional #-}
  fractional :: (Fractional a) => Parser a
  fractional = Parser \inp ->
    case unsafeWithUtf8 LF.readDecimal inp of
      Just (res, more) -> Success res more
      Nothing -> Failure ["fractional"] inp


  -- |
  -- Calculate offset from the original input and the remainder.
  --
  offset :: Text -> Text -> Int
  offset inp more = length inp - length more


  -- |
  -- Determine @(line, column)@ from the original input and the remainder.
  --
  -- Counts line feed characters leading to the 'offset', so only use it
  -- on your slow path. For example when describing parsing errors.
  --
  position :: Text -> Text -> (Int, Int)
  position inp more = (succ line, succ column)
    where
      column = length lastLine
      lastLine = takeWhileEnd ('\n' /=) leader
      line = T.count "\n" leader
      leader = dropEnd (length more) inp


  -- |
  -- More precise 'Result' description produced by 'explain'.
  --
  data Explanation
    = Explanation
      { exSource       :: String
        -- ^ Name of the source file.
      , exSpanFrom     :: (Int, Int)
        -- ^ Line and column where the problem starts.
      , exSpanTo       :: (Int, Int)
        -- ^ Line and column where the problem ends.
      , exMessage      :: String
        -- ^ Message associated with the problem.
      }
    deriving (Eq, Show)


  -- |
  -- Process the result for showing it to the user.
  --
  explain :: String -> Text -> Result a -> Explanation
  explain src inp (Success _ more) =
    Explanation { exSource   = src
                , exSpanFrom = pos
                , exSpanTo   = pos
                , exMessage  = "Parsed successfully up to this point."
                }
      where
        pos = position inp more


  explain src inp (Failure expected more) =
    Explanation { exSource   = src
                , exSpanFrom = pos
                , exSpanTo   = pos
                , exMessage =
                    case expected of
                      [] -> "Unexpected input."
                      ex -> "Expected " <> List.intercalate ", " ex <> "."
                }
      where
        pos = position inp more

  explain src inp (Error reason more len) =
    Explanation { exSource   = src
                , exSpanFrom = from
                , exSpanTo   = to
                , exMessage  = reason
                }
      where
        from = position inp more
        to   = position inp (T.drop len more)


-- vim:set ft=haskell sw=2 ts=2 et: