-- #ignore-exports
-------------------------------------------------------------------------------
-- |
-- Module      :  Text.Yaml.Reference
-- Copyright   :  (c) Oren Ben-Kiki 2007
-- License     :  LGPL
--
-- Maintainer  :  yaml-oren@ben-kiki.org
-- Stability   :  alpha
-- Portability :  portable
--
-- Implementation of the YAML syntax as defined in the YAML specification.
-- Actually this file contains the parsing framework and includes (using CPP)
-- the actual productions from @Reference.bnf@.
--
-- The parsing framework is fully streaming (generates output tokens
-- \"immediately\"). The memory leak that existed in previous versions has
-- been plugged.
-------------------------------------------------------------------------------

module Text.Yaml.Reference
  ( -- Basic parsing:
    Code,
    Token,
    Tokenizer,
    yaml,
    -- For testing:
    Context,
    Chomp,
    tokenizer,
    tokenizerWithN,
    tokenizerWithC,
    tokenizerWithT,
    tokenizerWithNC,
    tokenizerWithNT,
    tokenizerNames,
    showTokens
  ) where

import           Control.Monad
import qualified Data.ByteString.Lazy.Char8 as C
import           Data.Char
import qualified Data.DList as D
import qualified Data.Map as Map
import           Text.Regex
import           Debug.Trace
import qualified Prelude
import           Prelude hiding ((/), (*), (+), (-), (^))

-- * Generic operators
--
-- ** Numeric operators
--
-- We rename the four numerical operators @+@ @-@ @*@ @\/@ to start with @.@
-- (@.+@, @.-@, @.*@, @.\/@). This allows us to use the originals for BNF
-- notation (we also hijack the @^@ operator). This is not a generally
-- recommended practice. It is justified in this case since we have very few
-- arithmetic operations, and a lot of BNF rules which this makes extremely
-- readable.

infixl 6 .+
-- | \".+\" is the numeric addition (we use \"+\" for postfix \"one or more\").
(.+) = (Prelude.+)

infixl 6 .-
-- | \".-\" is the numeric subtraction (we use \"-\" for infix \"and not\").
(.-) = (Prelude.-)

infixl 7 .*
-- | \".*\" is the numeric multiplication (we use \"*\" for postfix \"zero or
-- more\").
(.*) = (Prelude.*)

infixl 7 ./
-- | \"./\" is the numeric division (we use \"/\" for infix \"or\").
(./) = (Prelude./)

-- ** Record field access
--
-- We also define @|>@ for record access for increased readability.

infixl 9 |>
-- | @record |> field@ is the same as @field record@, but is more readable.
(|>) :: record -> (record -> value) -> value
record |> field = field record

-- * UTF decoding
--
-- This really should be factored out to the standard libraries. Since it isn't
-- there, we get to tailor it exactly to our needs. We use lazy byte strings as
-- input, which should give reasonable I\/O performance when reading large
-- files. The output is a normal 'Char' list which is easy to work with and
-- should be efficient enough as long as the 'Parser' does its job right.

-- | Recognized Unicode encodings. UTF-32 isn't required by YAML parsers.
data Encoding = UTF8    -- ^ UTF-8 encoding (or ASCII)
              | UTF16LE -- ^ UTF-16 little endian
              | UTF16BE -- ^ UTF-16 big endian
  deriving Show

-- | @decode bytes@ automatically detects the 'Encoding' used and converts the
-- /bytes/ to Unicode characters.
decode :: C.ByteString -> (Encoding, [Char])
decode text = (encoding, undoEncoding encoding text)
  where encoding = detectEncoding $ C.unpack $ C.take 2 text
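-- A minimal usage sketch (ours, not part of the original module): 'decode'
-- detects the encoding from the leading bytes and lazily decodes the rest.
-- The input literal is ours, for illustration only.
--
-- decodeExample evaluates to (UTF8, "- a\n").
decodeExample :: (Encoding, [Char])
decodeExample = decode $ C.pack "- a\n"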
-- | @detectEncoding text@ examines the first few chars (bytes) of the /text/
-- to deduce the Unicode encoding used according to the YAML spec.
detectEncoding :: [Char] -> Encoding
detectEncoding text =
  case text of
    '\xFF':'\xFE':_ -> UTF16LE
    '\xFE':'\xFF':_ -> UTF16BE
    _               -> UTF8

-- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode
-- characters according to the /encoding/.
undoEncoding :: Encoding -> C.ByteString -> [Char]
undoEncoding encoding bytes =
  case encoding of
    UTF8    -> undoUTF8 bytes
    UTF16LE -> combinePairs $ undoUTF16LE bytes
    UTF16BE -> combinePairs $ undoUTF16BE bytes

-- ** UTF-16 decoding

-- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to
-- a single Unicode character.
combinePairs :: [Char] -> [Char]
combinePairs [] = []
combinePairs (lead:rest)
  | '\xD800' <= lead && lead <= '\xDBFF' = combineLead lead rest
  | '\xDC00' <= lead && lead <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate"
  | otherwise                            = lead:(combinePairs rest)

-- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the
-- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues
-- combining surrogate pairs.
combineLead :: Char -> [Char] -> [Char]
combineLead lead [] = error "UTF-16 contains lead surrogate as final character"
combineLead lead (trail:rest)
  | '\xDC00' <= trail && trail <= '\xDFFF' = (combineSurrogates lead trail):combinePairs rest
  | otherwise                              = error "UTF-16 contains lead surrogate without trail surrogate"

-- | @surrogateOffset@ is copied from the Unicode FAQs.
surrogateOffset :: Int
surrogateOffset = 0x10000 .- (0xD800 .* 1024) .- 0xDC00

-- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a
-- single Unicode character.
combineSurrogates :: Char -> Char -> Char
combineSurrogates lead trail = chr $ (ord lead) .* 1024 .+ (ord trail) .+ surrogateOffset

-- | @hasFewerThan n bytes@ checks whether there are fewer than /n/ /bytes/
-- left to read.
hasFewerThan :: Int -> C.ByteString -> Bool
hasFewerThan n bytes
  | n == 1 = C.null bytes
  | n > 1  = C.null bytes || hasFewerThan (n .- 1) (C.tail bytes)

-- | @undoUTF16LE bytes@ decodes a UTF-16-LE /bytes/ stream to Unicode chars.
undoUTF16LE :: C.ByteString -> [Char]
undoUTF16LE bytes
  | C.null bytes         = []
  | hasFewerThan 2 bytes = error "UTF-16-LE input contains odd number of bytes"
  | otherwise            = let low    = C.head bytes
                               bytes' = C.tail bytes
                               high   = C.head bytes'
                               rest   = C.tail bytes'
                           in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16LE rest)

-- | @undoUTF16BE bytes@ decodes a UTF-16-BE /bytes/ stream to Unicode chars.
undoUTF16BE :: C.ByteString -> [Char]
undoUTF16BE bytes
  | C.null bytes         = []
  | hasFewerThan 2 bytes = error "UTF-16-BE input contains odd number of bytes"
  | otherwise            = let high   = C.head bytes
                               bytes' = C.tail bytes
                               low    = C.head bytes'
                               rest   = C.tail bytes'
                           in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16BE rest)

-- ** UTF-8 decoding

-- | @undoUTF8 bytes@ decodes a UTF-8 /bytes/ stream to Unicode chars.
undoUTF8 :: C.ByteString -> [Char]
undoUTF8 bytes
  | C.null bytes = []
  | otherwise    = let first = C.head bytes
                       rest  = C.tail bytes
                   in case () of
                        _ | first < '\x80' -> first:(undoUTF8 rest)
                          | first < '\xC0' -> error "UTF-8 input contains invalid first byte"
                          | first < '\xE0' -> decodeTwoUTF8 first rest
                          | first < '\xF0' -> decodeThreeUTF8 first rest
                          | first < '\xF8' -> decodeFourUTF8 first rest
                          | otherwise      -> error "UTF-8 input contains invalid first byte"
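-- A quick sanity check of the surrogate arithmetic above (ours, not part of
-- the original module): the UTF-16 pair U+D834 U+DD1E must combine to
-- U+1D11E (MUSICAL SYMBOL G CLEF).
--
-- surrogatesExample evaluates to True.
surrogatesExample :: Bool
surrogatesExample = combineSurrogates '\xD834' '\xDD1E' == '\x1D11E'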
-- | @decodeTwoUTF8 first bytes@ decodes a two-byte UTF-8 character, where the
-- /first/ byte is already available and the second is the head of the /bytes/,
-- and then continues to undo the UTF-8 encoding.
decodeTwoUTF8 :: Char -> C.ByteString -> [Char]
decodeTwoUTF8 first bytes
  | C.null bytes = error "UTF-8 double byte char is missing second byte at eof"
  | otherwise    = let second = C.head bytes
                       rest   = C.tail bytes
                   in case () of
                        _ | second < '\x80' || '\xBF' < second -> error "UTF-8 double byte char has invalid second byte"
                          | otherwise -> (combineTwoUTF8 first second):(undoUTF8 rest)

-- | @combineTwoUTF8 first second@ combines the /first/ and /second/ bytes of a
-- two-byte UTF-8 char into a single Unicode char.
combineTwoUTF8 :: Char -> Char -> Char
combineTwoUTF8 first second = chr(((ord first)  .- 0xC0) .* 64
                               .+ ((ord second) .- 0x80))

-- | @decodeThreeUTF8 first bytes@ decodes a three-byte UTF-8 character, where
-- the /first/ byte is already available and the second and third are the head
-- of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeThreeUTF8 :: Char -> C.ByteString -> [Char]
decodeThreeUTF8 first bytes
  | hasFewerThan 2 bytes = error "UTF-8 triple byte char is missing bytes at eof"
  | otherwise            = let second = C.head bytes
                               bytes' = C.tail bytes
                               third  = C.head bytes'
                               rest   = C.tail bytes'
                           in case () of
                                _ | second < '\x80' || '\xBF' < second -> error "UTF-8 triple byte char has invalid second byte"
                                  | third  < '\x80' || '\xBF' < third  -> error "UTF-8 triple byte char has invalid third byte"
                                  | otherwise -> (combineThreeUTF8 first second third):(undoUTF8 rest)

-- | @combineThreeUTF8 first second third@ combines the /first/, /second/ and
-- /third/ bytes of a three-byte UTF-8 char into a single Unicode char.
combineThreeUTF8 :: Char -> Char -> Char -> Char
combineThreeUTF8 first second third = chr(((ord first)  .- 0xE0) .* 4096
                                       .+ ((ord second) .- 0x80) .* 64
                                       .+ ((ord third)  .- 0x80))

-- | @decodeFourUTF8 first bytes@ decodes a four-byte UTF-8 character, where the
-- /first/ byte is already available and the second, third and fourth are the
-- head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeFourUTF8 :: Char -> C.ByteString -> [Char]
decodeFourUTF8 first bytes
  | hasFewerThan 3 bytes = error "UTF-8 quad byte char is missing bytes at eof"
  | otherwise            = let second  = C.head bytes
                               bytes'  = C.tail bytes
                               third   = C.head bytes'
                               bytes'' = C.tail bytes'
                               fourth  = C.head bytes''
                               rest    = C.tail bytes''
                           in case () of
                                _ | second < '\x80' || '\xBF' < second -> error "UTF-8 quad byte char has invalid second byte"
                                  | third  < '\x80' || '\xBF' < third  -> error "UTF-8 quad byte char has invalid third byte"
                                  | fourth < '\x80' || '\xBF' < fourth -> error "UTF-8 quad byte char has invalid fourth byte"
                                  | otherwise -> (combineFourUTF8 first second third fourth):(undoUTF8 rest)

-- | @combineFourUTF8 first second third fourth@ combines the /first/,
-- /second/, /third/ and /fourth/ bytes of a four-byte UTF-8 char into a
-- single Unicode char.
combineFourUTF8 :: Char -> Char -> Char -> Char -> Char
combineFourUTF8 first second third fourth = chr(((ord first)  .- 0xF0) .* 262144
                                             .+ ((ord second) .- 0x80) .* 4096
                                             .+ ((ord third)  .- 0x80) .* 64
                                             .+ ((ord fourth) .- 0x80))
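-- A worked check of the multi-byte arithmetic above (ours, not part of the
-- original module): the two-byte sequence 0xC3 0xA9 must decode to U+00E9
-- (LATIN SMALL LETTER E WITH ACUTE), since (0xC3 - 0xC0) * 64 + (0xA9 - 0x80)
-- = 3 * 64 + 41 = 233 = 0xE9.
--
-- utf8Example evaluates to True.
utf8Example :: Bool
utf8Example = combineTwoUTF8 '\xC3' '\xA9' == '\xE9'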
-- * Result tokens
--
-- The parsing result is a stream of tokens rather than a parse tree. The idea
-- is to convert the YAML input into \"byte codes\". These byte codes are
-- intended to be written into a byte codes file (or more likely a UNIX pipe)
-- for further processing.

-- | 'Token' codes.
data Code = Bom             -- ^ BOM, contains \"@TF8@\", \"@TF16LE@\" or \"@TF16BE@\".
          | Text            -- ^ Content text characters.
          | Meta            -- ^ Non-content (meta) text characters.
          | Break           -- ^ Line break preserved in content.
          | Continue        -- ^ Separation line break.
          | LineFeed        -- ^ Line break normalized to content line feed.
          | LineFold        -- ^ Line break folded to content space.
          | Indicator       -- ^ Character indicating structure.
          | White           -- ^ Separation white space.
          | Indent          -- ^ Indentation spaces.
          | DocumentStart   -- ^ Document start marker.
          | DocumentEnd     -- ^ Document end marker.
          | BeginEscape     -- ^ Begins escape sequence.
          | EndEscape       -- ^ Ends escape sequence.
          | BeginComment    -- ^ Begins comment.
          | EndComment      -- ^ Ends comment.
          | BeginDirective  -- ^ Begins directive.
          | EndDirective    -- ^ Ends directive.
          | BeginTag        -- ^ Begins tag.
          | EndTag          -- ^ Ends tag.
          | BeginHandle     -- ^ Begins tag handle.
          | EndHandle       -- ^ Ends tag handle.
          | BeginAnchor     -- ^ Begins anchor.
          | EndAnchor       -- ^ Ends anchor.
          | BeginProperties -- ^ Begins node properties.
          | EndProperties   -- ^ Ends node properties.
          | BeginAlias      -- ^ Begins alias.
          | EndAlias        -- ^ Ends alias.
          | BeginScalar     -- ^ Begins scalar content.
          | EndScalar       -- ^ Ends scalar content.
          | BeginSequence   -- ^ Begins sequence content.
          | EndSequence     -- ^ Ends sequence content.
          | BeginMapping    -- ^ Begins mapping content.
          | EndMapping      -- ^ Ends mapping content.
          | BeginPair       -- ^ Begins mapping key:value pair.
          | EndPair         -- ^ Ends mapping key:value pair.
          | BeginNode       -- ^ Begins complete node.
          | EndNode         -- ^ Ends complete node.
          | BeginDocument   -- ^ Begins document.
          | EndDocument     -- ^ Ends document.
          | BeginStream     -- ^ Begins YAML stream.
          | EndStream       -- ^ Ends YAML stream.
          | Error           -- ^ Parsing error at this point.
          | Unparsed        -- ^ The rest of the input at the error point.
          -- For testing:
          | Test            -- ^ Test characters otherwise unassigned.
          | Detected        -- ^ Detected parameter.
  deriving Eq

-- | @show code@ converts a 'Code' to the one-character YEAST token code char.
-- The list of byte codes is also documented in the @yaml2yeast@ program.
instance Show Code where
  show code = case code of
                Bom             -> "U"
                Text            -> "T"
                Meta            -> "t"
                Break           -> "B"
                Continue        -> "b"
                LineFeed        -> "L"
                LineFold        -> "l"
                Indicator       -> "I"
                White           -> "w"
                Indent          -> "i"
                DocumentStart   -> "K"
                DocumentEnd     -> "k"
                BeginEscape     -> "E"
                EndEscape       -> "e"
                BeginComment    -> "C"
                EndComment      -> "c"
                BeginDirective  -> "D"
                EndDirective    -> "d"
                BeginTag        -> "G"
                EndTag          -> "g"
                BeginHandle     -> "H"
                EndHandle       -> "h"
                BeginAnchor     -> "A"
                EndAnchor       -> "a"
                BeginProperties -> "P"
                EndProperties   -> "p"
                BeginAlias      -> "R"
                EndAlias        -> "r"
                BeginScalar     -> "S"
                EndScalar       -> "s"
                BeginSequence   -> "Q"
                EndSequence     -> "q"
                BeginMapping    -> "M"
                EndMapping      -> "m"
                BeginNode       -> "N"
                EndNode         -> "n"
                BeginPair       -> "X"
                EndPair         -> "x"
                BeginDocument   -> "O"
                EndDocument     -> "o"
                BeginStream     -> "Y"
                EndStream       -> "y"
                Error           -> "!"
                Unparsed        -> "-"
                Test            -> "?"
                Detected        -> "$"

-- | Parsed token.
data Token = Token {
    tCode :: Code,  -- ^ Specific token 'Code'.
    tText :: String -- ^ Contained input chars, if any.
  }

-- | @show token@ converts a 'Token' to a single YEAST line.
instance Show Token where
  show token = (show $ token|>tCode) ++ (escapeString $ token|>tText) ++ "\n"
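-- A tiny illustration (ours, not part of the original module): a token
-- renders as its one-character YEAST code followed by the escaped text and a
-- newline.
--
-- tokenExample evaluates to "T- a\n".
tokenExample :: String
tokenExample = show Token { tCode = Text, tText = "- a" }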
-- | @escapeString string@ escapes all the non-ASCII characters in the
-- /string/, as well as escaping the \"@\\@\" character, using the \"@\\xXX@\",
-- \"@\\uXXXX@\" and \"@\\UXXXXXXXX@\" escape sequences.
escapeString :: String -> String
escapeString [] = []
escapeString (first:rest)
  | ' ' <= first && first /= '\\' && first <= '~' = first:(escapeString rest)
  | first <= '\xFF'                               = "\\x" ++ (toHex 2 $ ord first) ++ (escapeString rest)
  | '\xFF' < first && first <= '\xFFFF'           = "\\u" ++ (toHex 4 $ ord first) ++ (escapeString rest)
  | otherwise                                     = "\\U" ++ (toHex 8 $ ord first) ++ (escapeString rest)

-- | @toHex digits int@ converts the /int/ to the specified number of
-- hexadecimal /digits/.
toHex :: Int -> Int -> String
toHex digits int
  | digits > 1  = (toHex (digits .- 1) (int `div` 16)) ++ [intToDigit $ int `mod` 16]
  | digits == 1 = [intToDigit int]

-- | @showTokens tokens@ converts a list of /tokens/ to a multi-line YEAST
-- text.
showTokens :: [Token] -> String
showTokens tokens = foldr (\ token text -> (show token) ++ text) "" tokens

-- * Parsing framework
--
-- Haskell has no shortage of parsing frameworks. We use our own because:
--
-- * Most available frameworks are inappropriate because of their focus on
--   building a parse tree, and completing all of it before any of it is
--   accessible to the caller. We return a stream of tokens, and would like
--   its head to be accessible as soon as possible to allow for streaming. To
--   do this with bounded memory usage we use a combination of continuation
--   passing style and difference lists for the collected tokens.
--
-- * Haskell makes it so easy to roll your own parsing framework. We need some
--   specialized machinery (limited lookahead, forbidden patterns). It is
--   possible to build these on top of existing frameworks but the end result
--   isn't much shorter than rolling our own.
--
-- Since we roll our own framework we don't bother with making it generalized,
-- so we maintain a single 'State' type rather than having a generic one that
-- contains a polymorphic \"UserState\" field etc.

-- | A 'Parser' is basically a function computing a 'Reply'.
data Parser result = Parser (State -> Reply result)

-- | The 'Result' of each invocation is either an error, the actual result, or
-- a continuation for computing the actual result.
data Result result = Failed String        -- ^ Parsing aborted with a failure.
                   | Result result        -- ^ Parsing completed with a result.
                   | More (Parser result) -- ^ Parsing is ongoing with a continuation.

-- Showing a 'Result' is only used in debugging.
instance (Show result) => Show (Result result) where
  show result = case result of
                  Failed message -> "Failed " ++ message
                  Result result  -> "Result " ++ (show result)
                  More _         -> "More"

-- | Each invocation of a 'Parser' yields a 'Reply'. The 'Result' is only one
-- part of the 'Reply'.
data Reply result = Reply {
    rResult :: !(Result result),  -- ^ Parsing result.
    rTokens :: !(D.DList Token),  -- ^ Tokens generated by the parser.
    rCommit :: !(Maybe String),   -- ^ Commitment to a decision point.
    rState  :: !State             -- ^ The updated parser state.
  }

-- Showing a 'Reply' is only used in debugging.
instance (Show result) => Show (Reply result) where
  show reply = "Result: "    ++ (show $ reply|>rResult)
            ++ ", Tokens: "  ++ (show $ D.toList $ reply|>rTokens)
            ++ ", Commit: "  ++ (show $ reply|>rCommit)
            ++ ", State: { " ++ (show $ reply|>rState) ++ "}"

-- A 'Pattern' is a parser that doesn't have an (interesting) result.
type Pattern = Parser ()
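-- A minimal sketch (ours, not part of the original module) of how a caller
-- can drive a 'Parser' to completion: keep following 'More' continuations and
-- collect the incrementally available tokens along the way. This mirrors what
-- 'patternTokenizer' below does, minus error handling.
runSketch :: Parser result -> State -> ([Token], Result result)
runSketch (Parser parser) state =
  let reply = parser state
      tokens = D.toList $ reply|>rTokens
  in case reply|>rResult of
       More parser' -> let (tokens', final) = runSketch parser' $ reply|>rState
                       in (tokens ++ tokens', final)
       final        -> (tokens, final)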
-- ** Parsing state

-- | The internal parser state. We don't bother with parameterising it with a
-- \"UserState\", we just bundle the generic and specific fields together (not
-- that it is that easy to draw the line - is @sLine@ generic or specific?).
data State = State {
    sName      :: !String,          -- ^ The input name for error messages.
    sEncoding  :: !Encoding,        -- ^ The input UTF encoding.
    sDecision  :: !String,          -- ^ Current decision name.
    sLimit     :: !Int,             -- ^ Lookahead characters limit.
    sForbidden :: !(Maybe Pattern), -- ^ Pattern we must not enter into.
    sIsPeek    :: !Bool,            -- ^ Disables token generation.
    sChars     :: ![Char],          -- ^ (Reversed) characters collected for a token.
    sOffset    :: !Int,             -- ^ Offset in characters in the input.
    sLine      :: !Int,             -- ^ Builds on YAML's line break definition.
    sColumn    :: !Int,             -- ^ Actually character number - we hate tabs.
    sCode      :: !Code,            -- ^ Of token we are collecting chars for.
    sLast      :: !Char,            -- ^ Last matched character.
    sInput     :: ![Char]           -- ^ The decoded input characters.
  }

-- Showing a 'State' is only used in debugging. Note that forcing dump of
-- @sInput@ will disable streaming it.
instance Show State where
  show state = "Name: "       ++ (show $ state|>sName)
            ++ ", Encoding: " ++ (show $ state|>sEncoding)
            ++ ", Decision: " ++ (show $ state|>sDecision)
            ++ ", Limit: "    ++ (show $ state|>sLimit)
            ++ ", IsPeek: "   ++ (show $ state|>sIsPeek)
            ++ ", Chars: >>>" ++ (reverse $ state|>sChars) ++ "<<<"
            ++ ", Offset: "   ++ (show $ state|>sOffset)
            ++ ", Line: "     ++ (show $ state|>sLine)
            ++ ", Column: "   ++ (show $ state|>sColumn)
            ++ ", Code: "     ++ (show $ state|>sCode)
            ++ ", Last: "     ++ (show $ state|>sLast)
            -- ++ ", Input: >>>" ++ (show $ state|>sInput) ++ "<<<"

-- | @initialState name input@ returns an initial 'State' for parsing the
-- /input/ (with /name/ for error messages).
initialState :: String -> C.ByteString -> State
initialState name input =
  let (encoding, decoded) = decode input
  in State { sName      = name,
             sEncoding  = encoding,
             sDecision  = "",
             sLimit     = -1,
             sForbidden = Nothing,
             sIsPeek    = False,
             sChars     = [],
             sOffset    = 0,
             sLine      = 1,
             sColumn    = 0,
             sCode      = Test,
             sLast      = ' ',
             sInput     = decoded }

-- *** Setters
--
-- We need four setter functions to pass them around as arguments. For some
-- reason, Haskell only generates getter functions.

-- | @setDecision decision state@ sets the @sDecision@ field to /decision/.
setDecision :: String -> State -> State
setDecision decision state = state { sDecision = decision }

-- | @setLimit limit state@ sets the @sLimit@ field to /limit/.
setLimit :: Int -> State -> State
setLimit limit state = state { sLimit = limit }

-- | @setForbidden forbidden state@ sets the @sForbidden@ field to /forbidden/.
setForbidden :: Maybe Pattern -> State -> State
setForbidden forbidden state = state { sForbidden = forbidden }

-- | @setCode code state@ sets the @sCode@ field to /code/.
setCode :: Code -> State -> State
setCode code state = state { sCode = code }

-- ** Implicit parsers
--
-- It is tedious to have to wrap each expected character (or character range)
-- in an explicit 'Parser' constructor. We let Haskell do that for us using a
-- 'Match' class.

-- | @Match parameter result@ specifies that we can convert the /parameter/ to
-- a 'Parser' returning the /result/.
class Match parameter result | parameter -> result where
  match :: parameter -> Parser result

-- | We don't need to convert a 'Parser', it already is one.
instance Match (Parser result) result where
  match = id

-- | We convert 'Char' to a parser for a character (that returns nothing).
instance Match Char () where
  match code = nextIf (== code)

-- | We convert a 'Char' tuple to a parser for a character range (that returns
-- nothing).
instance Match (Char, Char) () where
  match (low, high) = nextIf $ \ code -> low <= code && code <= high

-- | We convert 'String' to a parser for a sequence of characters (that returns
-- nothing).
instance Match String () where
  match = foldr (&) empty
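-- An illustration of the implicit conversions above (ours, not part of the
-- original module): character ranges and strings are usable wherever a parser
-- is expected, so BNF rules read naturally.
lowercaseSketch :: Pattern
lowercaseSketch = match ('a', 'z') -- any single lowercase ASCII letter

keywordSketch :: Pattern
keywordSketch = match "yaml"       -- the literal character sequence "yaml"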
-- ** Reply constructors

-- | @returnReply state result@ prepares a 'Reply' with the specified /state/
-- and /result/.
returnReply :: State -> result -> Reply result
returnReply state result = Reply { rResult = Result result,
                                   rTokens = D.empty,
                                   rCommit = Nothing,
                                   rState  = state }

-- | @tokenReply state token@ returns a 'Reply' containing the /state/ and
-- /token/. Any collected characters are cleared (either there are none, or we
-- put them in this token, or we don't want them).
tokenReply :: State -> Token -> Reply ()
tokenReply state token = Reply { rResult = Result (),
                                 rTokens = D.singleton token,
                                 rCommit = Nothing,
                                 rState  = state { sChars = [] } }

-- | @failReply state message@ prepares a 'Reply' with the specified /state/
-- and error /message/.
failReply :: State -> String -> Reply result
failReply state message = Reply { rResult = Failed $ state|>sName
                                         ++ ": line "    ++ (show $ state|>sLine)
                                         ++ ": column "  ++ (show $ state|>sColumn)
                                         ++ ": "         ++ message,
                                  rTokens = D.empty,
                                  rCommit = Nothing,
                                  rState  = state }

-- | @unexpectedReply state@ returns a @failReply@ for an unexpected character.
unexpectedReply :: State -> Reply result
unexpectedReply state = case state|>sInput of
                          (char:_) -> failReply state $ "Unexpected '" ++ [char] ++ "'"
                          []       -> failReply state "Unexpected end of input"

-- ** Parsing Monad

-- | Allow using the @do@ notation for our parsers, which makes for short and
-- sweet @do@ syntax when we want to examine the results (we typically don't).
instance Monad Parser where

  -- @return result@ does just that - return a /result/.
  return result = Parser $ \ state -> returnReply state result

  -- @left >>= right@ applies the /left/ parser, and if it didn't fail
  -- applies the /right/ one (well, the one /right/ returns).
  left >>= right = bindParser left right
    where bindParser (Parser left) right = Parser $ \ state ->
            let reply = left state
            in case reply|>rResult of
                 Failed message -> reply { rResult = Failed message }
                 Result value   -> reply { rResult = More $ right value }
                 More parser    -> reply { rResult = More $ bindParser parser right }

  -- @fail message@ does just that - fails with a /message/.
  fail message = Parser $ \ state -> failReply state message

-- ** Parsing operators
--
-- Here we reap the benefits of renaming the numerical operators. The operator
-- precedence, in decreasing strength:
--
-- @repeated % n@, @repeated <% n@, @match - rejected@, @match ! decision@,
-- @match ?! decision@, @choice ^ (first \/ second)@.
--
-- @match - first - second@ is @(match - first) - second@.
--
-- @first & second & third@ is @first & (second & third)@. Note that @first -
-- rejected & second@ is @(first - rejected) & second@, etc.
--
-- @match \/ alternative \/ otherwise@ is @match \/ (alternative \/
-- otherwise)@. Note that @first & second \/ third@ is @(first & second) \/
-- third@.
--
-- @( match *)@, @(match +)@, @(match ?)@ are the weakest and require the
-- surrounding @()@.

infix  3 ^
infix  3 %
infix  3 <%
infix  3 !
infix  3 ?!
infix  3 <?
infix  3 >?
infixl 3 -
infixr 2 &
infixr 1 /
infix  0 ?
infix  0 *
infix  0 +

-- | @parser % n@ repeats /parser/ exactly /n/ times.
(%) :: (Match match result) => match -> Int -> Pattern
parser % n
  | n <= 0 = empty
  | n > 0  = parser & parser % n .- 1

-- | @parser <% n@ matches fewer than /n/ occurrences of /parser/.
(<%) :: (Match match result) => match -> Int -> Pattern
parser <% n
  | n < 1  = fail "Fewer than 0 repetitions"
  | n == 1 = reject parser Nothing
  | n > 1  = "<%" ^ ( parser ! "<%" & parser <% n .- 1
                    / empty )

-- | @decision ^ (option \/ option \/ ...)@ provides a /decision/ name to the
-- choice about to be made, to allow @commit@ing to it.
(^) :: (Match match result) => String -> match -> Parser result
decision ^ parser = choice decision (match parser)

-- | @parser ! decision@ commits to /decision/ (in an option) after
-- successfully matching the /parser/.
(!) :: (Match match result) => match -> String -> Pattern
parser ! decision = parser & commit decision

-- | @parser ?! decision@ commits to /decision/ (in an option) if the current
-- position matches /parser/, without consuming any characters.
(?!) :: (Match match result) => match -> String -> Pattern
parser ?! decision = peek parser & commit decision

-- | @(lookbehind <?)@ matches the current point without consuming any
-- characters, if the previous character matches the /lookbehind/ parser
-- (single character positive lookbehind).
(<?) :: (Match match result) => match -> Parser result
(<?) lookbehind = prev lookbehind

-- | @(lookahead >?)@ matches the current point without consuming any
-- characters, if it matches the /lookahead/ parser (positive lookahead).
(>?) :: (Match match result) => match -> Parser result
(>?) lookahead = peek lookahead

-- | @parser - rejected@ matches /parser/, except if /rejected/ matches at this
-- point.
(-) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
parser - rejected = reject rejected Nothing & parser

-- | @before & after@ parses /before/ and, if it succeeds, parses /after/. This
-- basically invokes the monad's @>>=@ (bind) method.
(&) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result2
before & after = (match before) >> (match after)

-- | @first \/ second@ tries to parse /first/, and failing that parses
-- /second/, unless /first/ has committed in which case it fails immediately.
(/) :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result
first / second = Parser $ \ state ->
  let Parser parser = decide (match first) (match second)
  in parser state

-- | @(optional ?)@ tries to match /optional/, otherwise does nothing.
(?) :: (Match match result) => match -> Pattern
(?) optional = "?" ^ (optional & empty / empty)

-- | @(parser *)@ matches zero or more occurrences of /parser/, as long as each
-- one actually consumes input characters.
(*) :: (Match match result) => match -> Pattern
(*) parser = "*" ^ zomParser
  where zomParser = (nonEmpty parser ! "*" & zomParser) / empty

-- | @(parser +)@ matches one or more occurrences of /parser/, as long as each
-- one actually consumes input characters.
(+) :: (Match match result) => match -> Pattern
(+) parser = nonEmpty parser & (parser *)
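-- An illustrative BNF-style rule (ours, not part of the original module)
-- showing the renamed operators in action: a named choice that either matches
-- an @a@ followed by zero or more @b@s (committing to this option once the
-- @a@ is seen), or a single @c@.
ruleSketch :: Pattern
ruleSketch = "rule" ^ ( match 'a' ! "rule" & (match 'b' *)
                      / match 'c' )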
-- ** Basic parsers

-- | @decide first second@ tries to parse /first/, and failing that parses
-- /second/, unless /first/ has committed in which case it fails immediately.
decide :: Parser result -> Parser result -> Parser result
decide left right = Parser $ \ state ->
  let Parser parser = decideParser state D.empty left right
  in parser state
  where decideParser point tokens (Parser left) right = Parser $ \ state ->
          let reply = left state
              tokens' reply = D.append tokens $ reply|>rTokens
          in case (reply|>rResult, reply|>rCommit) of
               (Failed _,   _)       -> Reply { rState  = point,
                                                rTokens = D.empty,
                                                rResult = More right,
                                                rCommit = Nothing }
               (Result _,   _)       -> reply { rTokens = tokens' reply }
               (More left', Just _)  -> reply { rTokens = tokens' reply,
                                                rResult = More left' }
               (More left', Nothing) -> let Parser parser = decideParser point (tokens' reply) left' right
                                        in parser $ reply|>rState

-- | @choice decision parser@ provides a /decision/ name to the choice about to
-- be made in /parser/, to allow @commit@ing to it.
choice :: String -> Parser result -> Parser result
choice decision parser = Parser $ \ state ->
  let Parser parser' = choiceParser (state|>sDecision) decision parser
  in parser' state { sDecision = decision }
  where choiceParser parentDecision makingDecision (Parser parser) = Parser $ \ state ->
          let reply = parser state
              commit' = case reply|>rCommit of
                          Nothing -> Nothing
                          Just decision | decision == makingDecision -> Nothing
                                        | otherwise                  -> reply|>rCommit
              reply' = case reply|>rResult of
                         More parser' -> reply { rCommit = commit',
                                                 rResult = More $ choiceParser parentDecision makingDecision parser' }
                         _            -> reply { rCommit = commit',
                                                 rState  = (reply|>rState) { sDecision = parentDecision } }
          in reply'

-- | @prev parser@ succeeds if /parser/ matches at the previous character. It
-- does not consume any input.
prev :: (Match match result) => match -> Parser result
prev parser = Parser $ \ state ->
  prevParser state (match parser) state { sIsPeek = True,
                                          sChars  = state|>sLast : state|>sChars }
  where prevParser point (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> failReply point message
               Result value   -> returnReply point value
               More parser'   -> prevParser point parser' $ reply|>rState

-- | @peek parser@ succeeds if /parser/ matches at this point, but does not
-- consume any input.
peek :: (Match match result) => match -> Parser result
peek parser = Parser $ \ state ->
  peekParser state (match parser) state { sIsPeek = True }
  where peekParser point (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> failReply point message
               Result value   -> returnReply point value
               More parser'   -> peekParser point parser' $ reply|>rState

-- | @reject parser name@ fails if /parser/ matches at this point, and does
-- nothing otherwise. If /name/ is provided, it is used in the error message,
-- otherwise the message uses the current character.
reject :: (Match match result) => match -> Maybe String -> Pattern
reject parser name = Parser $ \ state ->
  rejectParser state name (match parser) state { sIsPeek = True }
  where rejectParser point name (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> returnReply point ()
               Result value   -> case name of
                                   Nothing   -> unexpectedReply point
                                   Just text -> failReply point $ "Unexpected " ++ text
               More parser'   -> rejectParser point name parser' $ reply|>rState
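-- A small lookahead sketch (ours, not part of the original module): succeed
-- at a point that is followed by a @#@ without consuming it, while refusing
-- to match if a digit is there instead.
hashAheadSketch :: Pattern
hashAheadSketch = reject ('0', '9') (Just "digit") & peek (match '#')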
-- | @nonEmpty parser@ succeeds if /parser/ matches some non-empty input
-- characters at this point.
nonEmpty :: (Match match result) => match -> Parser result
nonEmpty parser = Parser $ \ state ->
  let Parser parser' = nonEmptyParser (state|>sOffset) (match parser)
  in parser' state
  where nonEmptyParser offset (Parser parser) = Parser $ \ state ->
          let reply = parser state
              state' = reply|>rState
          in case reply|>rResult of
               Failed message -> reply
               Result value   -> if state'|>sOffset > offset
                                    then reply
                                    else failReply state' "Matched empty pattern"
               More parser'   -> reply { rResult = More $ nonEmptyParser offset parser' }

-- | @empty@ always matches without consuming any input.
empty :: Pattern
empty = return ()

-- | @eof@ matches the end of the input.
eof :: Pattern
eof = Parser $ \ state ->
  if state|>sInput == []
     then returnReply state ()
     else failReply state "Expected end of input"

-- | @sol@ matches the start of a line.
sol :: Pattern
sol = Parser $ \ state ->
  if state|>sColumn == 0
     then returnReply state ()
     else failReply state "Expected start of line"

-- ** State manipulation pseudo-parsers

-- | @commit decision@ commits the parser to all the decisions up to the most
-- recent parent /decision/. This makes all tokens generated in this parsing
-- path immediately available to the caller.
commit :: String -> Pattern
commit decision = Parser $ \ state ->
  Reply { rState  = state,
          rTokens = D.empty,
          rResult = Result (),
          rCommit = Just decision }

-- | @nextLine@ increments the @sLine@ counter and resets @sColumn@.
nextLine :: Pattern
nextLine = Parser $ \ state ->
  returnReply state { sLine = state|>sLine .+ 1, sColumn = 0 } ()

-- | @with setField getField value parser@ invokes the specified /parser/ with
-- the value of the specified field set to /value/ for the duration of the
-- invocation, using the /setField/ and /getField/ functions to manipulate it.
with :: (value -> State -> State) -> (State -> value) -> value -> Parser result -> Parser result
with setField getField value parser = Parser $ \ state ->
  let value' = getField state
      Parser parser' = value' `seq` withParser value' parser
  in parser' $ setField value state
  where withParser parentValue (Parser parser) = Parser $ \ state ->
          let reply = parser state
          in case reply|>rResult of
               Failed _     -> reply { rState = setField parentValue $ reply|>rState }
               Result _     -> reply { rState = setField parentValue $ reply|>rState }
               More parser' -> reply { rResult = More $ withParser parentValue parser' }

-- | @parser ``forbidding`` forbidden@ parses the specified /parser/ ensuring
-- that it does not contain anything matching the /forbidden/ parser.
forbidding :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
forbidding parser forbidden = with setForbidden sForbidden (Just $ forbidden & empty) (match parser)

-- | @parser ``limitedTo`` limit@ parses the specified /parser/
-- ensuring that it does not consume more than the /limit/ input chars.
limitedTo :: (Match match result) => match -> Int -> Parser result
limitedTo parser limit = with setLimit sLimit limit (match parser)
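-- A sketch (ours, not part of the original module) of the specialized
-- machinery above: match one or more lowercase letters while forbidding a
-- @#@ anywhere inside the match, within a lookahead budget of 10 characters.
-- The rule and the budget are ours, for illustration only.
boundedWordSketch :: Pattern
boundedWordSketch = ((match ('a', 'z') +) `forbidding` match '#') `limitedTo` 10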
-- ** Consuming input characters

-- | @nextIf test@ fails if the current position matches the 'State' forbidden
-- pattern or if the 'State' lookahead limit is reached. Otherwise it consumes
-- (and buffers) the next input char if it satisfies /test/.
nextIf :: (Char -> Bool) -> Pattern
nextIf test = Parser $ \ state ->
  case state|>sForbidden of
    Nothing     -> limitedNextIf state
    Just parser -> let Parser parser' = reject parser $ Just "forbidden pattern"
                       reply = parser' state { sForbidden = Nothing }
                   in case reply|>rResult of
                        Failed _ -> reply
                        Result _ -> limitedNextIf state
  where limitedNextIf state =
          case state|>sLimit of
            -1    -> consumeNextIf state
            0     -> failReply state "Lookahead limit reached"
            limit -> consumeNextIf state { sLimit = state|>sLimit .- 1 }
        consumeNextIf state =
          case state|>sInput of
            (char:rest) | test char -> let chars = if state|>sIsPeek
                                                      then []
                                                      else char:(state|>sChars)
                                           state' = state { sInput  = rest,
                                                            sLast   = char,
                                                            sChars  = chars,
                                                            sOffset = state|>sOffset .+ 1,
                                                            sColumn = state|>sColumn .+ 1 }
                                       in returnReply state' ()
                        | otherwise -> unexpectedReply state
            []                      -> unexpectedReply state

-- ** Producing tokens

-- | @finishToken@ places all collected text into a new token and begins a new
-- one, or does nothing if there are no collected characters.
finishToken :: Pattern
finishToken = Parser $ \ state ->
  if state|>sIsPeek
     then returnReply state ()
     else case state|>sChars of
            []          -> returnReply state ()
            chars@(_:_) -> tokenReply state Token { tCode = state|>sCode,
                                                    tText = reverse chars }

-- | @wrap parser@ invokes the /parser/, ensures any unclaimed input characters
-- are wrapped into a token (only happens when testing productions), ensures no
-- input is left unparsed, and returns the parser's result.
wrap :: (Match match result) => match -> Parser result
wrap parser = do result <- match parser
                 finishToken
                 eof
                 return result

-- | @consume parser@ invokes the /parser/ and then consumes all remaining
-- unparsed input characters.
consume :: (Match match result) => match -> Parser result
consume parser = do result <- match parser
                    finishToken
                    clearInput
                    return result
  where clearInput = Parser $ \ state -> returnReply state { sInput = [] } ()

-- | @token code parser@ places all text matched by /parser/ into a 'Token'
-- with the specified /code/ (unless it is empty). Note it collects the text
-- even if there is an error.
token :: (Match match result) => Code -> match -> Pattern
token code parser = finishToken & with setCode sCode code (parser & finishToken)

-- | @fake code text@ creates a token with the specified /code/ and \"fake\"
-- /text/ characters, instead of whatever characters are collected so far.
fake :: Code -> String -> Pattern
fake code text = Parser $ \ state ->
  if state|>sIsPeek
     then returnReply state ()
     else tokenReply state Token { tCode = code, tText = text }

-- | @meta parser@ collects the text matched by the specified /parser/ into a
-- @Meta@ token.
meta :: (Match match result) => match -> Pattern
meta parser = token Meta parser

-- | @indicator parser@ collects the text matched by the specified /parser/
-- into an @Indicator@ token.
indicator :: (Match match result) => match -> Pattern
indicator parser = token Indicator parser

-- | @text parser@ collects the text matched by the specified /parser/ into a
-- @Text@ token.
text :: (Match match result) => match -> Pattern
text parser = token Text parser

-- | @nest code@ returns an empty token with the specified begin\/end /code/ to
-- signal nesting.
nest :: Code -> Pattern
nest code = finishToken & nestParser code
  where nestParser code = Parser $ \ state ->
          if state|>sIsPeek
             then returnReply state ()
             else tokenReply state Token { tCode = code, tText = "" }
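-- A sketch (ours, not part of the original module) of token emission: wrap
-- one or more decimal digits into a single @Text@ token. The rule is ours,
-- for illustration only.
digitsTextSketch :: Pattern
digitsTextSketch = token Text (match ('0', '9') +)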
-- * Production parameters

-- | Production context.
data Context = BlockOut -- ^ Outside block sequence.
             | BlockIn  -- ^ Inside block sequence.
             | FlowOut  -- ^ Outside flow collection.
             | FlowIn   -- ^ Inside flow collection.
             | FlowKey  -- ^ Inside flow key.

-- | @show context@ converts a 'Context' to a 'String'.
instance Show Context where
  show context = case context of
                   BlockOut -> "block-out"
                   BlockIn  -> "block-in"
                   FlowOut  -> "flow-out"
                   FlowIn   -> "flow-in"
                   FlowKey  -> "flow-key"

-- | @read context@ converts a 'String' to a 'Context'. We trust our callers to
-- convert any @-@ characters into @_@ to allow the built-in @lex@ function to
-- handle the names as single identifiers.
instance Read Context where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                     "block_out" -> BlockOut
                     "block_in"  -> BlockIn
                     "flow_out"  -> FlowOut
                     "flow_in"   -> FlowIn
                     "flow_key"  -> FlowKey
                     _           -> error $ "unknown context: " ++ word

-- | Chomp method.
data Chomp = Strip -- ^ Remove all trailing line breaks.
           | Clip  -- ^ Keep first trailing line break.
           | Keep  -- ^ Keep all trailing line breaks.

-- | @show chomp@ converts a 'Chomp' to a 'String'.
instance Show Chomp where
  show chomp = case chomp of
                 Strip -> "strip"
                 Clip  -> "clip"
                 Keep  -> "keep"

-- | @read chomp@ converts a 'String' to a 'Chomp'.
instance Read Chomp where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                     "strip" -> Strip
                     "clip"  -> Clip
                     "keep"  -> Keep
                     _       -> error $ "unknown chomp: " ++ word

-- * Tokenizers
--
-- We encapsulate the 'Parser' inside a 'Tokenizer'. This allows us to hide the
-- implementation details from our callers.

-- | 'Tokenizer' converts a (named) input text into a list of 'Token'. Errors
-- are reported as tokens with the @Error@ 'Code', and the unparsed text
-- following an error may be attached as a final token.
type Tokenizer = String -> C.ByteString -> Bool -> [Token]

-- | @patternTokenizer pattern@ converts the /pattern/ to a simple 'Tokenizer'.
patternTokenizer :: Pattern -> Tokenizer
patternTokenizer pattern name input withFollowing =
  D.toList $ patternParser (wrap pattern) (initialState name input)
  where patternParser (Parser parser) state =
          let reply = parser state
              tokens = commitBugs reply
          in case reply|>rResult of
               Failed message -> errorTokens tokens message withFollowing $ reply|>rState|>sInput
               Result _       -> tokens
               More parser'   -> D.append tokens $ patternParser parser' $ reply|>rState

-- | @parserTokenizer what parser@ converts the /parser/ returning /what/ to a
-- simple 'Tokenizer' (only used for tests). The result is reported as a token
-- with the @Detected@ 'Code'.
parserTokenizer :: (Show result, Match match result) => String -> match -> Tokenizer
parserTokenizer what parser name input withFollowing =
  D.toList $ parserParser (wrap parser) (initialState name input)
  where parserParser (Parser parser) state =
          let reply = parser state
              tokens = commitBugs reply
          in case reply|>rResult of
               Failed message -> errorTokens tokens message withFollowing $ reply|>rState|>sInput
               Result value   -> D.append tokens $ D.singleton Token { tCode = Detected,
                                                                       tText = what ++ "=" ++ (show value) }
               More parser'   -> D.append tokens $ parserParser parser' $ reply|>rState
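-- A sketch (ours, not part of the original module): 'patternTokenizer' turns
-- any 'Pattern' into a 'Tokenizer'. The digit pattern and the "<test>" input
-- name are ours, for illustration only; running
--
--   digitsTokenizerSketch "<test>" (C.pack "123") False
--
-- yields a single @Text@ token containing "123".
digitsTokenizerSketch :: Tokenizer
digitsTokenizerSketch = patternTokenizer $ token Text (match ('0', '9') +)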
-- | @errorTokens tokens message withFollowing following@ appends an @Error@
-- token with the specified /message/ at the end of /tokens/, and if
-- /withFollowing/ also appends the unparsed text /following/ the error as a
-- final @Unparsed@ token.
errorTokens tokens message withFollowing following =
  let tokens' = D.append tokens $ D.singleton Token { tCode = Error, tText = message }
  in if withFollowing && following /= ""
        then D.append tokens' $ D.singleton Token { tCode = Unparsed, tText = following }
        else tokens'

-- | @commitBugs reply@ inserts an error token if a commit was made outside a
-- named choice. This should never happen outside tests.
commitBugs :: Reply result -> D.DList Token
commitBugs reply =
  let tokens = reply|>rTokens
  in case reply|>rCommit of
       Nothing     -> tokens
       Just commit -> D.append tokens $ D.singleton Token { tCode = Error,
                                                            tText = "Commit to '" ++ commit ++ "' was made outside it" }

-- | @yaml name input@ converts the Unicode /input/ (called /name/ in error
-- messages) to a list of 'Token' according to the YAML spec. This is it!
yaml :: Tokenizer
yaml = patternTokenizer l_yaml_stream

#ifdef REAL_CPP
-- This is how non-ancient C pre-processors do it.
#define STR(X) #X
#else
-- This only works with GHC's simplistic pre-processor (and ancient C
-- pre-processors).
#define STR(X) "X"
#endif

-- These allow us to avoid repeating the parser names.
#define PAT(PATTERN) pat STR(PATTERN) PATTERN
#define PAR(PARSER)  par STR(PARSER) PARSER
#define PAC(PARSER)  pac STR(PARSER) PARSER

-- | @pName name@ converts a parser name to the \"proper\" spec name.
pName :: String -> String
pName name = regexSub questionRegex "?" $ regexSub minusRegex "-" $ regexSub plusRegex "+" name
  where regexSub regex value text = subRegex regex text value
        questionRegex = mkRegex "'"
        minusRegex    = mkRegex "_"
        plusRegex     = mkRegex "__"
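-- A worked example (ours, not part of the original module) of the name
-- conversion above: @__@ becomes @+@ first, remaining @_@ become @-@, and
-- @'@ would become @?@.
--
-- pNameExample evaluates to "s-l+block-node".
pNameExample :: String
pNameExample = pName "s_l__block_node"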
-- | @tokenizers@ returns a mapping from a production name to a production
-- tokenizer.
tokenizers :: Map.Map String Tokenizer
tokenizers =
  PAR(c_chomping_indicator) "t" $ PAC(detect_inline_indentation) "m" $
  PAT(b_as_line_feed) $ PAT(b_carriage_return) $ PAT(b_char) $ PAT(b_generic) $
  PAT(b_l_folded_as_space) $ PAT(b_line_feed) $ PAT(b_line_separator) $
  PAT(b_next_line) $ PAT(b_non_content_any) $ PAT(b_non_content_generic) $
  PAT(b_normalized) $ PAT(b_paragraph_separator) $ PAT(b_specific) $
  PAT(c_alias) $ PAT(c_anchor) $ PAT(c_byte_order_mark) $ PAT(c_collect_entry) $
  PAT(c_comment) $ PAT(c_directive) $ PAT(c_document_end) $
  PAT(c_document_start) $ PAT(c_double_quote) $ PAT(c_escape) $ PAT(c_folded) $
  PAT(c_indicator) $ PAT(c_literal) $ PAT(c_mapping_end) $ PAT(c_mapping_key) $
  PAT(c_mapping_start) $ PAT(c_mapping_value) $ PAT(c_named_tag_handle) $
  PAT(c_nb_comment_text) $ PAT(c_non_specific_tag) $ PAT(c_ns_alias_node) $
  PAT(c_ns_anchor_property) $ PAT(c_ns_esc_char) $ PAT(c_ns_local_tag_prefix) $
  PAT(c_ns_shorthand_tag) $ PAT(c_ns_tag_property) $ PAT(c_primary_tag_handle) $
  PAT(c_printable) $ PAT(c_quoted_quote) $ PAT(c_reserved) $
  PAT(c_secondary_tag_handle) $ PAT(c_sequence_end) $ PAT(c_sequence_entry) $
  PAT(c_sequence_start) $ PAT(c_s_implicit_json_key) $ PAT(c_single_quote) $
  PAT(c_tag) $ PAT(c_tag_handle) $ PAT(c_verbatim_tag) $
  PAT(e_node) $ PAT(e_scalar) $
  PAT(l_comment) $ PAT(l_directive) $ PAT(l_document_prefix) $
  PAT(l_documents) $ PAT(l_document_suffix) $ PAT(l_explicit_document) $
  PAT(l_following_document) $ PAT(l_forbidden) $ PAT(l_implicit_document) $
  PAT(l_leading_document) $ PAT(l_yaml_stream) $
  PAT(nb_char) $ PAT(nb_double_char) $ PAT(nb_double_one_line) $
  PAT(nb_ns_double_in_line) $ PAT(nb_ns_single_in_line) $ PAT(nb_single_char) $
  PAT(nb_single_one_line) $
  PAT(ns_anchor_char) $ PAT(ns_anchor_name) $ PAT(ns_ascii_letter) $
  PAT(ns_char) $ PAT(ns_dec_digit) $ PAT(ns_directive_name) $
  PAT(ns_directive_parameter) $ PAT(ns_double_char) $
  PAT(ns_esc_16_bit) $ PAT(ns_esc_32_bit) $ PAT(ns_esc_8_bit) $
  PAT(ns_esc_backslash) $ PAT(ns_esc_backspace) $ PAT(ns_esc_bell) $
  PAT(ns_esc_carriage_return) $ PAT(ns_esc_double_quote) $ PAT(ns_esc_escape) $
  PAT(ns_esc_form_feed) $ PAT(ns_esc_horizontal_tab) $ PAT(ns_esc_line_feed) $
  PAT(ns_esc_line_separator) $ PAT(ns_esc_next_line) $
  PAT(ns_esc_non_breaking_space) $ PAT(ns_esc_null) $
  PAT(ns_esc_paragraph_separator) $ PAT(ns_esc_space) $
  PAT(ns_esc_vertical_tab) $ PAT(ns_global_tag_prefix) $ PAT(ns_hex_digit) $
  PAT(ns_plain_safe_in) $ PAT(ns_plain_safe_out) $ PAT(ns_reserved_directive) $
  PAT(ns_s_block_map_implicit_key) $ PAT(ns_s_implicit_yaml_key) $
  PAT(ns_single_char) $ PAT(ns_tag_char) $ PAT(ns_tag_directive) $
  PAT(ns_tag_prefix) $ PAT(ns_uri_char) $ PAT(ns_word_char) $
  PAT(ns_yaml_directive) $ PAT(ns_yaml_version) $
  PAT(s_b_comment) $ PAT(s_l_comments) $ PAT(s_separate_in_line) $
  PAT(s_space) $ PAT(s_tab) $ PAT(s_white) $
  Map.empty
  where pat name pattern     = Map.insert (pName name) $ patternTokenizer (match pattern)
        par name parser what = Map.insert (pName name) $ parserTokenizer what (match parser)
        pac name parser what = Map.insert (pName name) $ parserTokenizer what (consume parser)

-- | @tokenizer name@ converts the production with the specified /name/ to a
-- simple 'Tokenizer', or @Nothing@ if it isn't known.
tokenizer :: String -> (Maybe Tokenizer)
tokenizer name = Map.lookup name tokenizers

-- | @tokenizersWithN@ returns a mapping from a production name to a production
-- tokenizer (that takes an /n/ argument).
tokenizersWithN :: Map.Map String (Int -> Tokenizer)
tokenizersWithN =
  PAR(c_b_block_header) "(m,t)" $ PAC(detect_collection_indentation) "m" $
  PAC(detect_scalar_indentation) "m" $ PAR(c_indentation_indicator) "m" $
  PAR(count_spaces) "m" $
  PAT(b_l_spaced) $ PAT(b_nb_literal_next) $
  PAT(c_l_block_map_explicit_entry) $ PAT(c_l_block_map_explicit_key) $
  PAT(c_l_block_map_implicit_value) $ PAT(c_l_block_seq_entry) $
  PAT(c_l__folded) $ PAT(c_l__literal) $
  PAT(l_block_map_explicit_value) $ PAT(l__block_mapping) $
  PAT(l__block_sequence) $ PAT(l_keep_empty) $ PAT(l_nb_diff_lines) $
  PAT(l_nb_folded_lines) $ PAT(l_nb_literal_text) $ PAT(l_nb_same_lines) $
  PAT(l_nb_spaced_lines) $ PAT(l_strip_empty) $ PAT(l_trail_comments) $
  PAT(nb_double_multi_line) $ PAT(nb_single_multi_line) $
  PAT(ns_l_block_map_entry) $ PAT(ns_l_block_map_implicit_entry) $
  PAT(ns_l_in_line_mapping) $ PAT(ns_l_in_line_sequence) $
  PAT(s_block_line_prefix) $ PAT(s_flow_line_prefix) $ PAT(s_indent) $
  PAT(s_indent_le) $ PAT(s_indent_lt) $ PAT(s_l_double_any) $
  PAT(s_l_double_escaped) $ PAT(s_l_flow_folded) $ PAT(s_l__flow_in_block) $
  PAT(s_nb_folded_text) $ PAT(s_nb_spaced_text) $ PAT(s_ns_double_next_line) $
  PAT(s_ns_single_next_line) $ PAT(s_separate_lines) $
  Map.empty
  where pat name pattern     = Map.insert (pName name) (\ n -> patternTokenizer (match $ pattern n))
        par name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (match $ parser n))
        pac name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (consume $ parser n))
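-- A lookup sketch (ours, not part of the original module): fetch the
-- tokenizer for the @s-indent@ production directly from the map above and run
-- it at indentation level 2 over a two-space input. The "<test>" name and the
-- input are ours, for illustration only.
indentTokensSketch :: Maybe [Token]
indentTokensSketch =
  case Map.lookup "s-indent" tokenizersWithN of
    Just tokenizer -> Just $ tokenizer 2 "<test>" (C.pack "  ") False
    Nothing        -> Nothing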
-- | @tokenizerWithN name n@ converts the production (that requires an /n/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithN :: String -> Int -> Maybe Tokenizer
tokenizerWithN name n =
  case Map.lookup name tokenizersWithN of
    Just tokenizer -> Just $ tokenizer n
    Nothing        -> Nothing

-- | @tokenizersWithC@ returns a mapping from a production name to a production
-- tokenizer (that takes a /c/ argument).
tokenizersWithC :: Map.Map String (Context -> Tokenizer)
tokenizersWithC =
  PAT(nb_ns_plain_in_line) $ PAT(nb_plain_char) $ PAT(ns_plain_char) $
  PAT(ns_plain_first) $ PAT(ns_plain_one_line) $ PAT(ns_plain_safe) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ c -> patternTokenizer (match $ pattern c))

-- | @tokenizerWithC name c@ converts the production (that requires a /c/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithC :: String -> Context -> Maybe Tokenizer
tokenizerWithC name c =
  case Map.lookup name tokenizersWithC of
    Just tokenizer -> Just $ tokenizer c
    Nothing        -> Nothing

-- | @tokenizersWithT@ returns a mapping from a production name to a production
-- tokenizer (that takes a /t/ argument).
tokenizersWithT :: Map.Map String (Chomp -> Tokenizer)
tokenizersWithT =
  PAT(b_chomped_last) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ t -> patternTokenizer (match $ pattern t))

-- | @tokenizerWithT name t@ converts the production (that requires a /t/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithT :: String -> Chomp -> Maybe Tokenizer
tokenizerWithT name t =
  case Map.lookup name tokenizersWithT of
    Just tokenizer -> Just $ tokenizer t
    Nothing        -> Nothing

-- | @tokenizersWithNC@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /c/ arguments).
tokenizersWithNC :: Map.Map String (Int -> Context -> Tokenizer)
tokenizersWithNC =
  PAT(b_l_folded_any) $ PAT(b_l_folded_specific) $ PAT(b_l_folded_trimmed) $
  PAT(c_double_quoted) $ PAT(c_flow_json_content) $ PAT(c_flow_json_node) $
  PAT(c_flow_mapping) $ PAT(c_flow_sequence) $
  PAT(c_ns_flow_map_adjacent_value) $ PAT(c_ns_flow_map_empty_key_entry) $
  PAT(c_ns_flow_map_json_key_entry) $ PAT(c_ns_flow_map_separate_value) $
  PAT(c_ns_flow_pair_json_key_entry) $ PAT(c_ns_properties) $
  PAT(c_single_quoted) $ PAT(l_empty) $
  PAT(nb_double_text) $ PAT(nb_single_text) $
  PAT(ns_flow_content) $ PAT(ns_flow_map_entry) $
  PAT(ns_flow_map_explicit_entry) $ PAT(ns_flow_map_implicit_entry) $
  PAT(ns_flow_map_yaml_key_entry) $ PAT(ns_flow_node) $ PAT(ns_flow_pair) $
  PAT(ns_flow_pair_entry) $ PAT(ns_flow_pair_yaml_key_entry) $
  PAT(ns_flow_seq_entry) $ PAT(ns_flow_yaml_content) $ PAT(ns_flow_yaml_node) $
  PAT(ns_plain) $ PAT(ns_plain_multi_line) $ PAT(ns_s_flow_map_entries) $
  PAT(ns_s_flow_seq_entries) $
  PAT(s_l__block_collection) $ PAT(s_l__block_in_block) $
  PAT(s_l__block_indented) $ PAT(s_l__block_node) $ PAT(s_l__block_scalar) $
  PAT(s_line_prefix) $ PAT(s_ns_plain_next_line) $ PAT(s_separate) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ n c -> patternTokenizer (match $ pattern n c))
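-- A context-dependent lookup sketch (ours, not part of the original module):
-- the same plain-scalar production tokenizes differently inside a flow
-- collection. Note the underscore convention expected by the 'Read' 'Context'
-- instance.
plainFlowInSketch :: Maybe Tokenizer
plainFlowInSketch = tokenizerWithC "ns-plain-one-line" (read "flow_in")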
-- | @tokenizerWithNC name n c@ converts the production (that requires /n/ and
-- /c/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNC :: String -> Int -> Context -> Maybe Tokenizer
tokenizerWithNC name n c =
  case Map.lookup name tokenizersWithNC of
    Just tokenizer -> Just $ tokenizer n c
    Nothing        -> Nothing

-- | @tokenizersWithNT@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /t/ arguments).
tokenizersWithNT :: Map.Map String (Int -> Chomp -> Tokenizer)
tokenizersWithNT =
  PAT(l_chomped_empty) $ PAT(l_folded_content) $ PAT(l_literal_content) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ n t -> patternTokenizer (match $ pattern n t))

-- | @tokenizerWithNT name n t@ converts the production (that requires /n/ and
-- /t/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNT :: String -> Int -> Chomp -> Maybe Tokenizer
tokenizerWithNT name n t =
  case Map.lookup name tokenizersWithNT of
    Just tokenizer -> Just $ tokenizer n t
    Nothing        -> Nothing

-- | @tokenizerNames@ returns the list of all productions (tokenizers).
tokenizerNames :: [String]
tokenizerNames = (Map.keys tokenizers)
              ++ (Map.keys tokenizersWithN)
              ++ (Map.keys tokenizersWithC)
              ++ (Map.keys tokenizersWithT)
              ++ (Map.keys tokenizersWithNC)
              ++ (Map.keys tokenizersWithNT)

-- * Productions

-- ** BNF compatibility helpers

-- | @detect_utf_encoding@ doesn't actually detect the encoding; we just call
-- it this way to make the productions compatible with the spec. It simply
-- reports the encoding (which was already detected when we started parsing).
detect_utf_encoding = Parser $ \ state ->
  let text = case state|>sEncoding of
               UTF8    -> "TF8"
               UTF16LE -> "TF16LE"
               UTF16BE -> "TF16BE"
      Parser parser = fake Bom text
  in parser state { sColumn = state|>sColumn .- 1 }

-- | @na@ is the \"non-applicable\" indentation value. We use Haskell's
-- laziness to verify it really is never used.
na :: Int
na = error "Accessing non-applicable indentation"

-- | @asInteger@ returns the last consumed character, which is assumed to be a
-- decimal digit, as an integer.
asInteger :: Parser Int
asInteger = Parser $ \ state -> returnReply state $ ord (state|>sLast) .- 48

-- | @result value@ is the same as @return value@ except that we give the
-- Haskell type deduction the additional boost it needs to figure out this is
-- wrapped in a 'Parser'.
result :: result -> Parser result
result = return

#include "Reference.bnf"
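-- A minimal end-to-end sketch (ours, not part of the original module):
-- tokenize a small YAML document with the top-level 'yaml' tokenizer and
-- print the resulting YEAST lines. The "<example>" name and the input
-- document are ours, for illustration only.
exampleMain :: IO ()
exampleMain = putStr $ showTokens $ yaml "<example>" (C.pack "- one\n- two\n") False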