-- #ignore-exports
-------------------------------------------------------------------------------
-- |
-- Module      :  Text.Yaml.Reference
-- Copyright   :  (c) Oren Ben-Kiki 2007
-- License     :  LGPL
-- 
-- Maintainer  :  yaml-oren@ben-kiki.org
-- Stability   :  alpha
-- Portability :  portable
--
-- Implementation of the YAML syntax as defined in <http://www.yaml.org>.
-- Actually this file contains the parsing framework and includes (using CPP)
-- the actual productions from @Reference.bnf@.
--
-- The parsing framework is fully streaming (generates output tokens
-- \"immediately\"), but has a memory leak (actually retention) which causes it
-- to blow up on \"large\" files. To debug this with minimal syntax productions
-- use the @debug_leak@ production (@yes "#" | yaml2yeast -p debug-leak@).
-------------------------------------------------------------------------------

module Text.Yaml.Reference
  (
    -- Basic parsing:
    Code,
    Token,
    Tokenizer,
    yaml,
    -- For testing:
    Context,
    Style,
    Chomp,
    tokenizer,
    tokenizerWithN,
    tokenizerWithC,
    tokenizerWithS,
    tokenizerWithT,
    tokenizerWithNC,
    tokenizerWithNS,
    tokenizerWithNT,
    tokenizerNames,
    showTokens
  )
where

import           Control.Monad
import qualified Data.ByteString.Lazy.Char8 as C
import           Data.Char
import qualified Data.DList as D
import qualified Data.Map as Map
import           Text.Regex
import           Debug.Trace
import qualified Prelude
import           Prelude hiding ((/), (*), (+), (-), (^))

-- * Generic operators
--
-- ** Numeric operators
--
-- We rename the four numerical operators @+@ @-@ @*@ @\/@ to start with @.@
-- (@.+@, @.-@, @.*@, @.\/@). This allows us to use the originals for BNF
-- notation (we also hijack the @^@ operator). This is not a generally
-- recommended practice. It is justified in this case since we have very little
-- arithmetic operations, and a lot of BNF rules which this makes extremely
-- readable.

infixl 6 .+
-- | \".+\" is the numeric addition (we use \"+\" for postfix \"one or more\").
(.+) = (Prelude.+)

infixl 6 .-
-- | \".-\" is the numeric subtraction (we use \"-\" for infix \"and not\").
(.-) = (Prelude.-)

infixl 7 .*
-- | \".*\" is the numeric multiplication (we use \"*\" for postfix \"zero or
-- more\").
(.*) = (Prelude.*)

infixl 7 ./
-- | \"./\" is the numeric division (we use \"/\" for infix \"or\").
(./) = (Prelude./)

-- ** Record field access
--
-- We also define @|>@ for record access for increased readability.

infixl 9 |>
-- | @record |> field@ is the same as @field record@,  but is more readable.
(|>) :: record -> (record -> value) -> value
record |> field = field record

-- * UTF decoding
--
-- This really should be factored out to the standard libraries. Since it isn't
-- there, we get to tailor it exactly to our needs. We use lazy byte strings as
-- input, which should give reasonable I\/O performance when reading large
-- files. The output is a normal 'Char' list which is easy to work with and
-- should be efficient enough as long as the 'Parser' does its job right.

-- | Recognized Unicode encodings. UTF-32 isn't required by YAML parsers.
data Encoding = UTF8    -- ^ UTF-8 encoding (or ASCII)
              | UTF16LE -- ^ UTF-16 little endian
              | UTF16BE -- ^ UTF-16 big endian
    deriving Show

-- | @decode bytes@ automatically detects the 'Encoding' used and converts the
-- /bytes/ to Unicode characters.
decode :: C.ByteString -> (Encoding, [Char])
decode text = (encoding, undoEncoding encoding text)
  where encoding = detectEncoding $ C.unpack $ C.take 2 text

-- | @detectEncoding text@ examines the first few chars (bytes) of the /text/
-- to deduce the Unicode encoding used according to the YAML spec.
detectEncoding :: [Char] -> Encoding
detectEncoding text =
  case text of
    '\xFF':'\xFE':_ -> UTF16LE
    '\xFE':'\xFF':_ -> UTF16BE
    _               -> UTF8

-- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode
-- characters according to the /encoding/.
undoEncoding :: Encoding -> C.ByteString -> [Char]
undoEncoding encoding bytes =
  case encoding of
    UTF8    -> undoUTF8 bytes
    UTF16LE -> combinePairs $ undoUTF16LE bytes
    UTF16BE -> combinePairs $ undoUTF16BE bytes

-- ** UTF-16 decoding

-- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a
-- single Unicode character.
combinePairs :: [Char] -> [Char]
combinePairs []                          = []
combinePairs (lead:rest)
  | '\xD800' <= lead && lead <= '\xDBFF' = combineLead lead rest
  | '\xDC00' <= lead && lead <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate"
  | otherwise                            = lead:(combinePairs rest)

-- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the
-- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues
-- combining surrogate pairs.
combineLead :: Char -> [Char] -> [Char]
combineLead lead []                        = error "UTF-16 contains lead surrogate as final character"
combineLead lead (trail:rest)
  | '\xDC00' <= trail && trail <= '\xDFFF' = (combineSurrogates lead trail):combinePairs rest
  | otherwise                              = error "UTF-16 contains lead surrogate without trail surrogate"

-- | @surrogateOffset@ is copied from the Unicode FAQs.
surrogateOffset :: Int
surrogateOffset = 0x10000 .- (0xD800 .* 1024) .- 0xDC00

-- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single
-- Unicode character.
combineSurrogates :: Char -> Char -> Char
combineSurrogates lead trail = chr $ (ord lead) .* 1024 .+ (ord trail) .+ surrogateOffset

-- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/
-- left to read.
hasFewerThan :: Int -> C.ByteString -> Bool
hasFewerThan n bytes
  | n == 1 = C.null bytes
  | n > 1  = C.null bytes || hasFewerThan (n .- 1) (C.tail bytes)

-- | @undoUTF18LE bytes@ decoded a UTF-16-LE /bytes/ stream to Unicode chars.
undoUTF16LE :: C.ByteString -> [Char]
undoUTF16LE bytes
  | C.null bytes = []
  | hasFewerThan 2 bytes = error "UTF-16-LE input contains odd number of bytes"
  | otherwise = let low = C.head bytes
                    bytes' = C.tail bytes
                    high = C.head bytes'
                    rest = C.tail bytes'
                in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16LE rest)

-- | @undoUTF18BE bytes@ decoded a UTF-16-BE /bytes/ stream to Unicode chars.
undoUTF16BE :: C.ByteString -> [Char]
undoUTF16BE bytes
  | C.null bytes = []
  | hasFewerThan 2 bytes = error "UTF-16-BE input contains odd number of bytes"
  | otherwise = let high = C.head bytes
                    bytes' = C.tail bytes
                    low = C.head bytes'
                    rest = C.tail bytes'
                in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16BE rest)

-- ** UTF-8 decoding

-- | @undoUTF8 bytes@ decoded a UTF-8 /bytes/ stream to Unicode chars.
undoUTF8 :: C.ByteString -> [Char]
undoUTF8 bytes
  | C.null bytes = []
  | otherwise = let first = C.head bytes
                    rest = C.tail bytes
                in case () of
                      _ | first < '\x80' -> first:(undoUTF8 rest)
                        | first < '\xC0' -> error "UTF-8 input contains invalid first byte"
                        | first < '\xE0' -> decodeTwoUTF8 first rest
                        | first < '\xF0' -> decodeThreeUTF8 first rest
                        | first < '\xF8' -> decodeFourUTF8 first rest
                        | otherwise      -> error "UTF-8 input contains invalid first byte"

-- | @decodeTwoUTF8 first bytes@ decodes a two-byte UTF-8 character, where the
-- /first/ byte is already available and the second is the head of the /bytes/,
-- and then continues to undo the UTF-8 encoding.
decodeTwoUTF8 :: Char -> C.ByteString -> [Char]
decodeTwoUTF8 first bytes
  | C.null bytes = error "UTF-8 double byte char is missing second byte at eof"
  | otherwise = let second = C.head bytes
                    rest = C.tail bytes
                in case () of
                      _ | second < '\x80' || '\xBF' < second -> error "UTF-8 double byte char has invalid second byte"
                        | otherwise                          -> (combineTwoUTF8 first second):(undoUTF8 rest)

-- | @combineTwoUTF8 first second@ combines the /first/ and /second/ bytes of a
-- two-byte UTF-8 char into a single Unicode char.
combineTwoUTF8 :: Char -> Char -> Char
combineTwoUTF8 first second = chr(((ord first) .- 0xC0) .* 64
                               .+ ((ord second) .- 0x80))

-- | @decodeThreeUTF8 first bytes@ decodes a three-byte UTF-8 character, where
-- the /first/ byte is already available and the second and third are the head
-- of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeThreeUTF8 :: Char -> C.ByteString -> [Char]
decodeThreeUTF8 first bytes
  | hasFewerThan 2 bytes = error "UTF-8 triple byte char is missing bytes at eof"
  | otherwise = let second = C.head bytes
                    bytes' = C.tail bytes
                    third = C.head bytes'
                    rest = C.tail bytes'
                in case () of
                      _ | second < '\x80' || '\xBF' < second -> error "UTF-8 triple byte char has invalid second byte"
                        | third < '\x80' || '\xBF' < third   -> error "UTF-8 triple byte char has invalid third byte"
                        | otherwise                          -> (combineThreeUTF8 first second third):(undoUTF8 rest)

-- | @combineThreeUTF8 first second@ combines the /first/, /second/ and /third/
-- bytes of a three-byte UTF-8 char into a single Unicode char.
combineThreeUTF8 :: Char -> Char -> Char -> Char
combineThreeUTF8 first second third = chr(((ord first) .- 0xE0) .* 4096
                                       .+ ((ord second) .- 0x80) .* 64
                                       .+ ((ord third)  .- 0x80))

-- | @decodeFourUTF8 first bytes@ decodes a four-byte UTF-8 character, where the
-- /first/ byte is already available and the second, third and fourth are the
-- head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeFourUTF8 :: Char -> C.ByteString -> [Char]
decodeFourUTF8 first bytes
  | hasFewerThan 3 bytes = error "UTF-8 quad byte char is missing bytes at eof"
  | otherwise = let second = C.head bytes
                    bytes' = C.tail bytes
                    third = C.head bytes'
                    bytes'' = C.tail bytes'
                    fourth = C.head bytes''
                    rest = C.tail bytes''
                in case () of
                      _ | second < '\x80' || '\xBF' < second -> error "UTF-8 quad byte char has invalid second byte"
                        | third < '\x80' || '\xBF' < third   -> error "UTF-8 quad byte char has invalid third byte"
                        | third < '\x80' || '\xBF' < third   -> error "UTF-8 quad byte char has invalid fourth byte"
                        | otherwise                          -> (combineFourUTF8 first second third fourth):(undoUTF8 rest)

-- | @combineFourUTF8 first second@ combines the /first/, /second/ and /third/
-- bytes of a three-byte UTF-8 char into a single Unicode char.
combineFourUTF8 :: Char -> Char -> Char -> Char -> Char
combineFourUTF8 first second third fourth = chr(((ord first)  .- 0xF0) .* 262144
                                             .+ ((ord second) .- 0x80) .* 4096
                                             .+ ((ord third)  .- 0x80) .* 64
                                             .+ ((ord fourth) .- 0x80))

-- * Result tokens
--
-- The parsing result is a stream of tokens rather than a parse tree. The idea
-- is to convert the YAML input into \"byte codes\". These byte codes are
-- intended to be written into a byte codes file (or more likely a UNIX pipe)
-- for further processing.

-- | 'Token' codes.
data Code = Bom             -- ^ BOM, contains \"@TF8@\", \"@TF16LE@\" or \"@TF16BE@\".
          | Text            -- ^ Content text characters.
          | Meta            -- ^ Non-content (meta) text characters.
          | Break           -- ^ Line break preserved in content.
          | Continue        -- ^ Separation line break.
          | LineFeed        -- ^ Line break normalized to content line feed.
          | LineFold        -- ^ Line break folded to content space.
          | Indicator       -- ^ Character indicating structure.
          | White           -- ^ Separation white space.
          | Indent          -- ^ Indentation spaces.
          | DocumentStart   -- ^ Document start marker.
          | DocumentEnd     -- ^ Document end marker.
          | BeginEscape     -- ^ Begins escape sequence.
          | EndEscape       -- ^ Ends escape sequence.
          | BeginComment    -- ^ Begins comment.
          | EndComment      -- ^ Ends comment.
          | BeginDirective  -- ^ Begins directive.
          | EndDirective    -- ^ Ends directive.
          | BeginTag        -- ^ Begins tag.
          | EndTag          -- ^ Ends tag.
          | BeginHandle     -- ^ Begins tag handle.
          | EndHandle       -- ^ Ends tag handle.
          | BeginAnchor     -- ^ Begins anchor.
          | EndAnchor       -- ^ Ends anchor.
          | BeginProperties -- ^ Begins node properties.
          | EndProperties   -- ^ Ends node properties.
          | BeginAlias      -- ^ Begins alias.
          | EndAlias        -- ^ Ends alias.
          | BeginScalar     -- ^ Begins scalar content.
          | EndScalar       -- ^ Ends scalar content.
          | BeginSequence   -- ^ Begins sequence content.
          | EndSequence     -- ^ Ends sequence content.
          | BeginMapping    -- ^ Begins mapping content.
          | EndMapping      -- ^ Ends mapping content.
          | BeginPair       -- ^ Begins mapping key:value pair.
          | EndPair         -- ^ Ends mapping key:value pair.
          | BeginNode       -- ^ Begins complete node.
          | EndNode         -- ^ Ends complete node.
          | BeginDocument   -- ^ Begins document.
          | EndDocument     -- ^ Ends document.
          | BeginStream     -- ^ Begins YAML stream.
          | EndStream       -- ^ Ends YAML stream.
          | Error           -- ^ Parsing error at this point.
          -- For testing:.
          | Test            -- ^ Test characters otherwise unassigned.
          | Detected        -- ^ Detected parameter.
  deriving Eq

-- | @show code@ converts a 'Code' to the one-character YEAST token code char.
-- The list of byte codes is also documented in the @yaml2yeast@ program.
instance Show Code where
  show code = case code of
                   Bom             -> "U"
                   Text            -> "T"
                   Meta            -> "t"
                   Break           -> "B"
                   Continue        -> "b"
                   LineFeed        -> "L"
                   LineFold        -> "l"
                   Indicator       -> "I"
                   White           -> "w"
                   Indent          -> "i"
                   DocumentStart   -> "K"
                   DocumentEnd     -> "k"
                   BeginEscape     -> "E"
                   EndEscape       -> "e"
                   BeginComment    -> "C"
                   EndComment      -> "c"
                   BeginDirective  -> "D"
                   EndDirective    -> "d"
                   BeginTag        -> "G"
                   EndTag          -> "g"
                   BeginHandle     -> "H"
                   EndHandle       -> "h"
                   BeginAnchor     -> "A"
                   EndAnchor       -> "a"
                   BeginProperties -> "P"
                   EndProperties   -> "p"
                   BeginAlias      -> "R"
                   EndAlias        -> "r"
                   BeginScalar     -> "S"
                   EndScalar       -> "s"
                   BeginSequence   -> "Q"
                   EndSequence     -> "q"
                   BeginMapping    -> "M"
                   EndMapping      -> "m"
                   BeginNode       -> "N"
                   EndNode         -> "n"
                   BeginPair       -> "X"
                   EndPair         -> "x"
                   BeginDocument   -> "O"
                   EndDocument     -> "o"
                   BeginStream     -> "Y"
                   EndStream       -> "y"
                   Error           -> "!"
                   Test            -> "?"
                   Detected        -> "$"

-- | Parsed token.
data Token = Token {
    tCode :: Code,        -- ^ Specific token 'Code'.
    tText :: Maybe String -- ^ Contained input chars, if any.
  }

-- | @show token@ converts a 'Token' to a single YEAST line.
instance Show Token where
  show token = case token|>tText of
                    Nothing     -> (show $ token|>tCode) ++ "\n"
                    Just string -> (show $ token|>tCode) ++ (escapeString string) ++ "\n"

-- | @escapeString string@ escapes all the non-ASCII characters in the
-- /string/, as well as escaping the \"@\\@\" character, using the \"@\\xXX@\",
-- \"@\\uXXXX@\" and \"@\\UXXXXXXXX@\" escape sequences.
escapeString :: String -> String
escapeString []                                   = []
escapeString (first:rest)
  | ' ' <= first && first /= '\\' && first <= '~' = first:(escapeString rest)
  | first <= '\xFF'                               = "\\x" ++ (toHex 2 $ ord first) ++ (escapeString rest)
  | '\xFF' < first && first <= '\xFFFF'           = "\\u" ++ (toHex 4 $ ord first) ++ (escapeString rest)
  | otherwise                                     = "\\U" ++ (toHex 8 $ ord first) ++ (escapeString rest)

-- | @toHex digits int@ converts the /int/ to the specified number of
-- hexadecimal /digits/.
toHex :: Int -> Int -> String
toHex digits int
  | digits > 1  = (toHex (digits .- 1) (int `div` 16)) ++ [intToDigit $ int `mod` 16]
  | digits == 1 = [intToDigit int]

-- | @showTokens tokens@ converts a list of /tokens/ to a multi-line YEAST
-- text.
showTokens :: [Token] -> String
showTokens tokens = foldr (\ token text -> (show token) ++ text) "" tokens

-- * Parsing framework
--
-- Haskell has no shortage of parsing frameworks. We use our own because:
--
--  * Most available frameworks are inappropriate because of their focus on
--    building a parse tree, and completing all of it before any of it is
--    accessible to the caller. We return a stream of tokens, and would like
--    its head to be accessible as soon as possible to allow for streaming. To
--    do this we use a difference list (not a 'DList' - we use a \"real\"
--    difference list building on Haskell's lazy evaluation nature).
--
--  * Haskell makes it so easy to roll your own parsing framework. We need some
--    specialized machinery (limited lookahead, forbidden patterns). It is
--    possible to build these on top of existing frameworks but the end result
--    isn't much shorter than rolling our own.
--
-- Since we roll our own framework we don't bother with making it generalized,
-- so we maintain a single 'State' type rather than having a generic one that
-- contains a polymorphic \"UserState\" field etc.

-- | A 'Parser' is basically a function computing a /result/, while at the same
-- time accumulating a list of 'Token'. A result of @Nothing@ indicates
-- failure.
data Parser result = Parser (State -> (State, Maybe result))

-- A 'Pattern' is a parser that doesn't have an (interesting) result.
type Pattern = Parser ()

-- ** Parsing state

-- | The internal parser state. We don't bother with parameterising it with a
-- \"UserState\", we just bundle the generic and specific fields together (not
-- that it is that easy to draw the line - is @sLine@ generic or specific?).
-- Note that using 'DList' for @sTokens@ allows us to consume tokens as they
-- are generated, instead of waiting for the whole parsing process to complete.
-- Likewise for @sCommits@, which allows us to commit to a partially generated
-- tokens stream before it is fully generated.
data State = State {
    -- Fields that never change, or only change for nested parsers:
    sName         :: String,         -- ^ The input name for error messages.
    sEncoding     :: Encoding,       -- ^ The input UTF encoding.
    sDecision     :: String,         -- ^ Current decision name.
    sLimit        :: Int,            -- ^ Lookahead characters limit.
    sForbidden    :: Maybe Pattern,  -- ^ Pattern we must not enter into.
    sIsPeek       :: Bool,           -- ^ Disables token generation.
    -- Fields that get collected by consecutive parsers, then merged:
    sTokens       :: D.DList Token,  -- ^ Tokens collected from input.
    sCommits      :: D.DList String, -- ^ Committments collected while parsing.
    -- Fields that are incrementally modified by consecutive parsers:
    sConsumed     :: Bool,           -- ^ Consumed character markers.
    sChars        :: D.DList Char,   -- ^ Characters collected for a token.
    sMessage      :: Maybe String,   -- ^ If an error occurred.
    sLine         :: Int,            -- ^ Builds on YAML's line break definition.
    sColumn       :: Int,            -- ^ Actually character number - we hate tabs.
    sCode         :: Code,           -- ^ Of token we are collecting chars for.
    sLast         :: Char,           -- ^ Last matched character.
    sInput        :: [Char]          -- ^ The input decoded characters.
  }

-- Showing a 'State' is only used in debugging. Note that forcing certain
-- members (@sTokens@, @sCommits@) may disable streaming.
instance Show State where
  show state = "Name: " ++ (show $ state|>sName)
--          ++ ", Encoding: " ++ (show / state|>sEncoding)
            ++ ", Decision: " ++ (show $ state|>sDecision)
--          ++ ", Limit: " ++ (show / state|>sLimit)
--          ++ ", IsPeek: " ++ (show / state|>sIsPeek)
--          ++ ", Tokens:\n" ++ (showTokens / D.toList / state|>sTokens) ++ "...\n"
--          ++ ", Commits: " ++ (show / D.toList / state|>sCommits)
--          ++ ", Consumed: >>>" ++ (show / state|>sConsumed) ++ "<<<"
--          ++ ", Chars: >>>" ++ (D.toList / state|>sChars) ++ "<<<"
            ++ ", Message: " ++ (show $ state|>sMessage)
            ++ ", Line: " ++ (show $ state|>sLine)
            ++ ", Column: " ++ (show $ state|>sColumn)
            ++ ", Code: " ++ (show $ state|>sCode)
            ++ ", Last: >>>" ++ (show $ state|>sLast) ++ "<<<"
--          ++ ", Input: >>>" ++ (show / state|>sInput) ++ "<<<"

-- | @initialState name input@ returns an initial 'State' for parsing the
-- /input/ (with /name/ for error messages).
initialState :: String -> C.ByteString -> State
initialState name input = let (encoding, decoded) = decode input
                          in State { sName         = name,
                                     sEncoding     = encoding,
                                     sDecision     = "n/a",
                                     sLimit        = -1,
                                     sForbidden    = Nothing,
                                     sIsPeek       = False,
                                     sTokens       = D.empty,
                                     sCommits      = D.empty,
                                     sConsumed     = False,
                                     sChars        = D.empty,
                                     sMessage      = Nothing,
                                     sLine         = 1,
                                     sColumn       = 0,
                                     sCode         = Test,
                                     sLast         = ' ',
                                     sInput        = decoded }

-- *** Setters
--
-- We need setter functions instead of using @{ sXxx = value }@ because the
-- nice syntax causes the old state to be fully evaluated, killing the
-- streaming (lazy) functionality we need. Generating the setters by hand was
-- tedious, there's surely a way to do this using Template Haskell or DrIFT or
-- some such. Life is too short.

-- | @setDecision name state@ sets the @sDecision@ field to /decision/.
setDecision :: String -> State -> State
setDecision decision state = State { sName         = state|>sName,
                                     sEncoding     = state|>sEncoding,
                                     sDecision     = decision,
                                     sLimit        = state|>sLimit,
                                     sForbidden    = state|>sForbidden,
                                     sIsPeek       = state|>sIsPeek,
                                     sTokens       = state|>sTokens,
                                     sCommits      = state|>sCommits,
                                     sConsumed     = state|>sConsumed,
                                     sChars        = state|>sChars,
                                     sMessage      = state|>sMessage,
                                     sLine         = state|>sLine,
                                     sColumn       = state|>sColumn,
                                     sCode         = state|>sCode,
                                     sLast         = state|>sLast,
                                     sInput        = state|>sInput }

-- | @setLimit limit state@ sets the @sLimit@ field to /limit/.
setLimit :: Int -> State -> State
setLimit limit state = State { sName         = state|>sName,
                               sEncoding     = state|>sEncoding,
                               sDecision     = state|>sDecision,
                               sLimit        = limit,
                               sForbidden    = state|>sForbidden,
                               sIsPeek       = state|>sIsPeek,
                               sTokens       = state|>sTokens,
                               sCommits      = state|>sCommits,
                               sConsumed     = state|>sConsumed,
                               sChars        = state|>sChars,
                               sMessage      = state|>sMessage,
                               sLine         = state|>sLine,
                               sColumn       = state|>sColumn,
                               sCode         = state|>sCode,
                               sLast         = state|>sLast,
                               sInput        = state|>sInput }

-- | @setForbidden forbidden state@ sets the @sForbidden@ field to /forbidden/.
setForbidden :: Maybe Pattern -> State -> State
setForbidden forbidden state = State { sName         = state|>sName,
                                       sEncoding     = state|>sEncoding,
                                       sDecision     = state|>sDecision,
                                       sLimit        = state|>sLimit,
                                       sForbidden    = forbidden,
                                       sIsPeek       = state|>sIsPeek,
                                       sTokens       = state|>sTokens,
                                       sCommits      = state|>sCommits,
                                       sConsumed     = state|>sConsumed,
                                       sChars        = state|>sChars,
                                       sMessage      = state|>sMessage,
                                       sLine         = state|>sLine,
                                       sColumn       = state|>sColumn,
                                       sCode         = state|>sCode,
                                       sLast         = state|>sLast,
                                       sInput        = state|>sInput }

-- | @setIsPeek isPeek state@ sets the @sIsPeek@ field to /isPeek/.
setIsPeek :: Bool -> State -> State
setIsPeek isPeek state = State { sName         = state|>sName,
                                 sEncoding     = state|>sEncoding,
                                 sDecision     = state|>sDecision,
                                 sLimit        = state|>sLimit,
                                 sForbidden    = state|>sForbidden,
                                 sIsPeek       = isPeek,
                                 sTokens       = state|>sTokens,
                                 sCommits      = state|>sCommits,
                                 sConsumed     = state|>sConsumed,
                                 sChars        = state|>sChars,
                                 sMessage      = state|>sMessage,
                                 sLine         = state|>sLine,
                                 sColumn       = state|>sColumn,
                                 sCode         = state|>sCode,
                                 sLast         = state|>sLast,
                                 sInput        = state|>sInput }

-- | @setTokens tokens state@ sets the @sTokens@ field to /tokens/.
setTokens :: D.DList Token -> State -> State
setTokens tokens state = State { sName         = state|>sName,
                                 sEncoding     = state|>sEncoding,
                                 sDecision     = state|>sDecision,
                                 sLimit        = state|>sLimit,
                                 sForbidden    = state|>sForbidden,
                                 sIsPeek       = state|>sIsPeek,
                                 sTokens       = tokens,
                                 sCommits      = state|>sCommits,
                                 sConsumed     = state|>sConsumed,
                                 sChars        = state|>sChars,
                                 sMessage      = state|>sMessage,
                                 sLine         = state|>sLine,
                                 sColumn       = state|>sColumn,
                                 sCode         = state|>sCode,
                                 sLast         = state|>sLast,
                                 sInput        = state|>sInput }

-- | @setCommits commits state@ sets the @sCommits@ field to /commits/.
setCommits :: D.DList String -> State -> State
setCommits commits state = State { sName         = state|>sName,
                                   sEncoding     = state|>sEncoding,
                                   sDecision     = state|>sDecision,
                                   sLimit        = state|>sLimit,
                                   sForbidden    = state|>sForbidden,
                                   sIsPeek       = state|>sIsPeek,
                                   sTokens       = state|>sTokens,
                                   sCommits      = commits,
                                   sConsumed     = state|>sConsumed,
                                   sChars        = state|>sChars,
                                   sMessage      = state|>sMessage,
                                   sLine         = state|>sLine,
                                   sColumn       = state|>sColumn,
                                   sCode         = state|>sCode,
                                   sLast         = state|>sLast,
                                   sInput        = state|>sInput }

-- | @setChars chars state@ sets the @sChars@ field to /chars/.
setChars :: D.DList Char -> State -> State
setChars chars state = State { sName         = state|>sName,
                               sEncoding     = state|>sEncoding,
                               sDecision     = state|>sDecision,
                               sLimit        = state|>sLimit,
                               sForbidden    = state|>sForbidden,
                               sIsPeek       = state|>sIsPeek,
                               sTokens       = state|>sTokens,
                               sCommits      = state|>sCommits,
                               sConsumed     = state|>sConsumed,
                               sChars        = chars,
                               sMessage      = state|>sMessage,
                               sLine         = state|>sLine,
                               sColumn       = state|>sColumn,
                               sCode         = state|>sCode,
                               sLast         = state|>sLast,
                               sInput        = state|>sInput }

-- | @setConsumed consumed state@ sets the @sConsumed@ field to /consumed/.
setConsumed :: Bool -> State -> State
setConsumed consumed state = State { sName         = state|>sName,
                                     sEncoding     = state|>sEncoding,
                                     sDecision     = state|>sDecision,
                                     sLimit        = state|>sLimit,
                                     sForbidden    = state|>sForbidden,
                                     sIsPeek       = state|>sIsPeek,
                                     sTokens       = state|>sTokens,
                                     sCommits      = state|>sCommits,
                                     sConsumed     = consumed,
                                     sChars        = state|>sChars,
                                     sMessage      = state|>sMessage,
                                     sLine         = state|>sLine,
                                     sColumn       = state|>sColumn,
                                     sCode         = state|>sCode,
                                     sLast         = state|>sLast,
                                     sInput        = state|>sInput }

-- | @setMessage message state@ sets the @sMessage@ field to /message/.
setMessage :: Maybe String -> State -> State
setMessage message state = State { sName         = state|>sName,
                                   sEncoding     = state|>sEncoding,
                                   sDecision     = state|>sDecision,
                                   sLimit        = state|>sLimit,
                                   sForbidden    = state|>sForbidden,
                                   sIsPeek       = state|>sIsPeek,
                                   sTokens       = state|>sTokens,
                                   sCommits      = state|>sCommits,
                                   sConsumed     = state|>sConsumed,
                                   sChars        = state|>sChars,
                                   sMessage      = message,
                                   sLine         = state|>sLine,
                                   sColumn       = state|>sColumn,
                                   sCode         = state|>sCode,
                                   sLast         = state|>sLast,
                                   sInput        = state|>sInput }

-- | @setLine line state@ sets the @sLine@ field to /line/.
setLine :: Int -> State -> State
setLine line state = State { sName         = state|>sName,
                             sEncoding     = state|>sEncoding,
                             sDecision     = state|>sDecision,
                             sLimit        = state|>sLimit,
                             sForbidden    = state|>sForbidden,
                             sIsPeek       = state|>sIsPeek,
                             sTokens       = state|>sTokens,
                             sCommits      = state|>sCommits,
                             sConsumed     = state|>sConsumed,
                             sChars        = state|>sChars,
                             sMessage      = state|>sMessage,
                             sLine         = line,
                             sColumn       = state|>sColumn,
                             sCode         = state|>sCode,
                             sLast         = state|>sLast,
                             sInput        = state|>sInput }

-- | @setColumn line state@ sets the @sColumn@ field to /line/.
setColumn :: Int -> State -> State
setColumn column state = State { sName         = state|>sName,
                                 sEncoding     = state|>sEncoding,
                                 sDecision     = state|>sDecision,
                                 sLimit        = state|>sLimit,
                                 sForbidden    = state|>sForbidden,
                                 sIsPeek       = state|>sIsPeek,
                                 sTokens       = state|>sTokens,
                                 sCommits      = state|>sCommits,
                                 sConsumed     = state|>sConsumed,
                                 sChars        = state|>sChars,
                                 sMessage      = state|>sMessage,
                                 sLine         = state|>sLine,
                                 sColumn       = column,
                                 sCode         = state|>sCode,
                                 sLast         = state|>sLast,
                                 sInput        = state|>sInput }

-- | @setCode code state@ sets the @sCode@ field to /code/.
setCode :: Code -> State -> State
setCode code state = State { sName         = state|>sName,
                             sEncoding     = state|>sEncoding,
                             sDecision     = state|>sDecision,
                             sLimit        = state|>sLimit,
                             sForbidden    = state|>sForbidden,
                             sIsPeek       = state|>sIsPeek,
                             sTokens       = state|>sTokens,
                             sCommits      = state|>sCommits,
                             sConsumed     = state|>sConsumed,
                             sChars        = state|>sChars,
                             sMessage      = state|>sMessage,
                             sLine         = state|>sLine,
                             sColumn       = state|>sColumn,
                             sCode         = code,
                             sLast         = state|>sLast,
                             sInput        = state|>sInput }

-- | @setLast last state@ sets the @sLast@ field to /last/.
setLast :: Char -> State -> State
setLast last state = State { sName         = state|>sName,
                             sEncoding     = state|>sEncoding,
                             sDecision     = state|>sDecision,
                             sLimit        = state|>sLimit,
                             sForbidden    = state|>sForbidden,
                             sIsPeek       = state|>sIsPeek,
                             sTokens       = state|>sTokens,
                             sCommits      = state|>sCommits,
                             sConsumed     = state|>sConsumed,
                             sChars        = state|>sChars,
                             sMessage      = state|>sMessage,
                             sLine         = state|>sLine,
                             sColumn       = state|>sColumn,
                             sCode         = state|>sCode,
                             sLast         = last,
                             sInput        = state|>sInput }

-- | @setInput input state@ sets the @sInput@ field to /input/.
setInput :: [Char] -> State -> State
setInput input state = State { sName         = state|>sName,
                               sEncoding     = state|>sEncoding,
                               sDecision     = state|>sDecision,
                               sLimit        = state|>sLimit,
                               sForbidden    = state|>sForbidden,
                               sIsPeek       = state|>sIsPeek,
                               sTokens       = state|>sTokens,
                               sCommits      = state|>sCommits,
                               sConsumed     = state|>sConsumed,
                               sChars        = state|>sChars,
                               sMessage      = state|>sMessage,
                               sLine         = state|>sLine,
                               sColumn       = state|>sColumn,
                               sCode         = state|>sCode,
                               sLast         = state|>sLast,
                               sInput        = input }

-- ** Implicit parsers
--
-- It is tedious to have to wrap each expected character (or character range)
-- in an explicit 'Parse' constructor. We let Haskell do that for us using a
-- 'Match' class.

-- | @Match parameter result@ specifies that we can convert the /parameter/ to
-- a 'Parser' returning the /result/.
class Match parameter result | parameter -> result where
    match :: parameter -> Parser result

-- | @parse parser state@ applies the actual /parser/ match function to a
-- /state/.
parse :: (Match match result) => match -> State -> (State, Maybe result)
parse parser state = let Parser parser' = match parser
                     in parser' state

-- | We don't need to convert a 'Parser', it already is one.
instance Match (Parser result) result where
    match = id

-- | We convert 'Char' to a parser for a character (that returns nothing).
instance Match Char () where
    match code = nextIf (== code)

-- | We convert a 'Char' tuple to a parser for a character range (that returns
-- nothing).
instance Match (Char, Char) () where
    match (low, high) = nextIf (\ code -> low <= code && code <= high)

-- | We convert 'String' to a parser for a sequence of characters (that returns
-- nothing).
instance Match String () where
    match "" = empty
    match (first:rest) = match first & match rest

-- ** Parsing Monad

-- | Allow using the @do@ notation for our parsers, which makes for short and
-- sweet @do@ syntax when we want to examine the results (we typically don't).
--
-- We don't use the 'Monad' @fail@ method because we need access to the 'State'
-- on failure.
--
-- @return result@ does just that - return a /result/.
--
-- @left >>= right@ applies the /left/ parser, and if it didn't fail
-- applies the /right/ one (well, the one /right/ returns). To achieve
-- streaming, we need to construct the final state manually, \"regardless\" of
-- the result of the right parser, and append the collected tokens and
-- commitments from the left and right parser. This allows our caller to
-- start consuming tokens from the left parser before the right parser is
-- done.
--
-- This code seems to hang on to the old states for too long a time, causing
-- memory usage to grow up when creating long parser chains (e.g. using @*@).
instance Monad Parser where

  return result = Parser (\ state -> (state, Just result))

  left >>= right =
    Parser (\ originalState -> let (leftState, leftResult) = parse left originalState
                                   leftTokens = leftState|>sTokens
                                   leftCommits = leftState|>sCommits
                                   leftConsumed = leftState|>sConsumed
                                   leftState' = setTokens D.empty
                                              $ setCommits D.empty 
                                              $ setConsumed False
                                              $ leftState
                                   (rightState, rightResult) = case leftResult of
                                                                    Nothing    -> (leftState', Nothing)
                                                                    Just value -> parse (right value) leftState'
                                   rightTokens = rightState|>sTokens
                                   rightCommits = rightState|>sCommits
                                   rightConsumed = rightState|>sConsumed
                                   finalState = setTokens (D.append leftTokens rightTokens)
                                              $ setCommits (D.append leftCommits rightCommits)
                                              $ setConsumed (leftConsumed || rightConsumed)
                                              $ rightState
                               in (finalState, rightResult))

-- ** Parsing operators
--
-- Here we reap the benefits of renaming the numerical operators. Note that in
-- our specific case, we use the 'MonadPlus' to combine the results of
-- different parsers. Typically, parsing frameworks simply use the last result,
-- or leave the task of combining them to the rules themselves (using @do@
-- notation).
--
-- Operator precedence, in decreasing strength:
--
-- @repeated % n@, @repeated <% n@, @match - rejected@, @match ! decision@,
-- @match ?! decision@ are the strongest binding, and don't mix.
--
-- @match - first - second@ is @(match - first) - second@.
--
-- @first & second & third@ is @first & (second & third)@. Note that @first -
-- rejected & second@ is @(first - rejected) & second@, etc. d@ is @a & (b - c)
-- & d@.
--
-- @match \/ alternative \/ otherwise@ is @match \/ (alternative \/
-- otherwise)@. Note that @first & second \/ third@ is @(first & second) \/
-- third@.
--
-- @( match *)@, @(match +)@, @(match ?)@ are the weakest and require the
-- @()@.

infix  3 %
infix  3 <%
infix  3 ^
infix  3 !
infix  3 ?!
infixl 3 -
infixr 2 &
infixr 1 /
infix 0 ?
infix 0 *
infix 0 +

-- | @parser % n@ repeats /parser/ exactly /n/ times.
(%) :: (Match match result) => match -> Int -> Pattern
parser % n
  | n <= 0 = empty
  | n > 0  = parser & parser % n .- 1

-- | @decision ^ (option \/ option \/ ...)@ provides a /decision/ name to the
-- choice about to be made, to allow to @commit@ to it.
(^) :: (Match match result) => String -> match -> Parser result
decision ^ parser = decide decision parser

-- | @parser ! decision@ commits to /decision/ after successfully matching the
-- /parser/.
(!) :: (Match match result) => match -> String -> Pattern
parser ! decision = parser & commit decision

-- | @parser ?! decision@ commits to /decision/ if the current position matches
-- /parser/, without consuming any characters.
(?!) :: (Match match result) => match -> String -> Pattern
parser ?! decision = peek parser & commit decision

-- | @parser <% n@ matches fewer than /n/ occurrences of /parser/.
(<%) :: (Match match result) => match -> Int -> Pattern
parser <% n
  | n < 1 = failed "Fewer than 0 repetitions"
  | n == 1 = reject parser Nothing
  | n > 1  = parser & parser <% n .- 1 / empty

-- | @parser - rejected@ matches /parser/, except if /rejected/ matches at this
-- point.
(-) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
parser - rejected = reject rejected Nothing & parser

-- | @before & after@ parses /before/ and, if it succeeds, parses /after/. This
-- basically invokes the monad's @>>=@ method.
(&) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result2
before & after = (match before) >> (match after)

-- | @first \/ second@ tries to parse /first/, and failing that parses /second/,
-- unless /first/ has committed in which case is fails immediately. To achieve
-- streaming, we need to construct the final state manually, passing the
-- commitments through \"regardless\" of the results of either parsers. This
-- allows our caller to start consuming tokens from a committed option before
-- it is done parsing.
--
-- This code seems to hang on to the old states for too long a time, causing
-- memory usage to grow up when creating long parser chains (e.g. using @*@).
(/) :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result
first / second =
  Parser (\ originalState ->
           let decision = originalState|>sDecision
               originalTokens = originalState|>sTokens
               originalCommits = originalState|>sCommits
               originalConsumed = originalState|>sConsumed
               nestState = setDecision "n/a"
                         $ setTokens D.empty
                         $ setCommits D.empty
                         $ setConsumed False
                         $ originalState
               (firstState, firstResult) = parse first nestState
               (firstIsCommitted, firstCommits) = commitStatus "n/a" decision $ D.toList $ firstState|>sCommits
               (chosenState, chosenResult, chosenCommits) =
                 if firstIsCommitted
                    then (firstState, firstResult, firstCommits)
                    else case firstResult of
                              Just _  -> (firstState, firstResult, firstCommits)
                              Nothing -> let (secondState, secondResult) = parse second nestState
                                             secondCommits = snd $ commitStatus "n/a" decision $ D.toList $ secondState|>sCommits
                                         in  (secondState, secondResult, secondCommits)
               chosenTokens = chosenState|>sTokens
               chosenConsumed = chosenState|>sConsumed
               finalState = setTokens (D.append originalTokens chosenTokens)
                          $ setCommits (D.append originalCommits chosenCommits)
                          $ setConsumed (originalConsumed || chosenConsumed)
                          $ setDecision decision
                          $ chosenState
           in (finalState, chosenResult))
  where commitStatus :: String -> String -> [String] -> (Bool, D.DList String)
        commitStatus prev decision []                             = (False, D.empty)
        commitStatus prev decision (name:rest) | name == decision = (True, snd $ commitStatus prev decision rest)
                                               | name == prev     = (True, snd $ commitStatus prev decision rest)
                                               | otherwise        = (True, D.cons name $ snd $ commitStatus name decision rest)

-- | @(parser ?)@ (optional) tries to match /parser/, otherwise does nothing.
(?) :: (Match match result) => match -> Pattern
(?) parser = non_empty parser & empty / empty

-- | @(parser *)@ matches zero or more occurrences of /parser/.
(*) :: (Match match result) => match -> Pattern
(*) parser = "*" ^ ( non_empty parser ! "*" & (parser *) / empty )

-- | @(parser +)@ matches one or more occurrences of /parser/.
(+) :: (Match match result) => match -> Pattern
(+) parser = non_empty parser & (parser *)

-- ** Basic parsers

-- | @traced name parser@ traces all invocations to the parser. Is only used when
-- debugging.
traced :: (Match match result, Show result) => String -> match -> Parser result
traced name parser = Parser (\ state -> trace_reply name (parse parser (trace_call name state)))

-- | @trace_call name state@ traces the /state/ at the start of the call to /name/.
trace_call :: String -> State -> State
trace_call name state = trace ("Call " ++ name ++ " with " ++ (show state)) state

-- | @trace_call name reply@ traces the /reply/ from calling /name/.
trace_reply :: Show result => String -> (State, Maybe result) -> (State, Maybe result)
trace_reply name reply@(state, result) =
  case result of
       Nothing    -> trace ("Fail " ++ name ++ " with " ++ (show state)) reply
       Just value -> trace ("Done " ++ name ++ " with " ++ (show state)) reply

-- | @reject rejected name@ fails if /rejected/ matches at this point, and does
-- nothing otherwise. If /name/ is provided, it is used in the error message,
-- otherwise the messages uses the current character.
reject :: (Match match result) => match -> Maybe String -> Pattern
reject rejected name =
  Parser (\ state -> let (_, result) = parse rejected $ setIsPeek True state
                         parser' = case (result, name) of
                                        (Nothing, _)        -> empty
                                        (Just _, Nothing)   -> consumeNextIf (const False)
                                        (Just _, Just text) -> failed $ "Unexpected " ++ text
                     in parse parser' state)

-- | @peek parser@ succeeds if /parser/ matches at this point, but does not
-- consume any input.
peek :: (Match match result) => match -> Parser result
peek parser = Parser (\ state -> let (_, result) = parse parser $ setIsPeek True state
                                 in case result of
                                         Nothing -> (setFailed "Peek failed" state, Nothing)
                                         Just _  -> (state, result))

-- | @non_empty parser@ matches the same as /parser/ as long as it consumes some
-- characters.
non_empty :: (Match match result) => match -> Parser result
non_empty parser =
  Parser (\ originalState -> let nestState = setConsumed False originalState
                                 (parseState, parseResult) = parse parser nestState
                             in case (parseState|>sConsumed, parseResult) of
                                     (True,   _)       -> (parseState, parseResult)
                                     (False,  Nothing) -> (parseState, parseResult)
                                     (False,  Just _)  -> (setFailed "Parser does not consume characters" originalState, Nothing))

-- | @empty@ always matches without consuming any input.
empty :: Pattern
empty = return ()

-- | @setFailed state message@ sets the @sMessage@ field to the error /message/.
setFailed :: String -> State -> State
setFailed message state = setMessage (Just $ state|>sName
                                          ++ ": line " ++ (show $ state|>sLine)
                                          ++ ": column " ++ (show $ state|>sColumn)
                                          ++ ": " ++ message)
                                     state

-- | @failed message@ fails parsing with the specified /message/. Note this is
-- not the 'Monad' @fail@ method, which we do not use.
failed :: String -> Parser result
failed message = Parser (\ state -> (setFailed message state, Nothing))

-- | @eof@ matches the end of the input.
eof :: Pattern
eof = Parser (\ state -> if state|>sInput     == []
                            then (state, Just ())
                            else (setFailed "Expected end of input" state, Nothing))

-- | @sol@ matches the start of a line.
sol :: Pattern
sol = Parser (\ state -> if state|>sColumn == 0
                            then (state, Just ())
                            else (setFailed "Expected start of line" state, Nothing))

-- ** State manipulation pseudo-parsers

-- | @incrLine@ increments @sLine@ counter resets @sColumn@.
nextLine :: Pattern
nextLine = Parser (\ state -> (setLine (state|>sLine .+ 1)
                             $ setColumn 0
                             $ state,
                               Just ()))

-- | @with setField getField value parser@ invokes the specified /parser/ with
-- the value of the specified field set to /value/ for the duration of the
-- invocation, using the /setField/ and /getField/ functions to manipulate it.
with :: (value -> State -> State) -> (State -> value) -> value -> Parser result -> Parser result
with setField getField value parser =
  Parser (\ originalState -> let originalValue = getField originalState
                                 withState = setField value originalState
                                 (parseState, parseResult) = parse parser withState
                                 finalState = setField originalValue parseState
                             in (finalState, parseResult))

-- | @decide name (a / b / ...)@ names the contained decision point so it can be
-- addressed by later @commit@ calls.
decide :: (Match match result) => String -> match -> Parser result
decide decision parser = with setDecision sDecision decision $ match parser

-- | @commit name@ commits the parser to all the decisions up to the most recent
-- containing decision with the specified /name/. This makes all tokens
-- generated in this parsing path immediately available to the caller.
commit :: String -> Pattern
commit name = Parser (\ state -> (setCommits (D.snoc (state|>sCommits) name) state, Just ()))

-- | @parser ``forbidding`` pattern@ parses the specified /parser/ ensuring
-- that it does not contain anything matching the /forbidden/ parser.
forbidding :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
forbidding parser forbidden = with setForbidden sForbidden (Just $ forbidden & empty) (match parser)

-- | @parser ``limitedTo`` limit@ parses the specified /parser/
-- ensuring that it does not consume more than the /limit/ input chars.
limitedTo :: (Match match result) => match -> Int -> Parser result
limitedTo parser limit = with setLimit sLimit limit (match parser)

-- ** Consuming input characters

-- | @consumeNextIf test@ consumes and returns the next character if it satisfies
-- the /test/.
consumeNextIf :: (Char -> Bool) -> Pattern
consumeNextIf test = 
  Parser (\ state -> case state|>sInput of
                          (char:rest) | test char -> (setInput rest
                                                    $ setLast char
                                                    $ setColumn (state|>sColumn .+ 1)
                                                    $ setChars (D.snoc (state|>sChars) char)
                                                    $ setConsumed True
                                                    $ state,
                                                      Just ())
                                      | otherwise -> (setFailed ("Unexpected '" ++ [char] ++ "'") state, Nothing)
                          []                      -> (setFailed "Unexpected end of input" state, Nothing))

-- | @limitedNextIf test@ fails if the 'State' lookahead limit is reached.
-- Otherwise it consumes and returns the specified input char if it satisfies
-- /test/.
limitedNextIf :: (Char -> Bool) -> Pattern
limitedNextIf test =
  Parser (\ state -> case state|>sLimit of
                          -1    -> parse (consumeNextIf test) state
                          0     -> (setFailed "Lookahead limit reached" state, Nothing)
                          limit -> parse (consumeNextIf test) $ setLimit (limit .- 1) state)

-- | @nextIf test@ fails if the current position matches the 'State' forbidden
-- pattern or if the 'State' lookahead limit is reached. Otherwise it consumes
-- (and buffers) the next input char if it satisfies /test/.
nextIf :: (Char -> Bool) -> Pattern
nextIf test =
  Parser (\ state -> case state|>sForbidden of
                          Nothing     -> parse (limitedNextIf test) state
                          Just parser -> let (_, result) = parse parser $ setIsPeek True
                                                                        $ setForbidden Nothing
                                                                        $ state
                                         in case result of
                                                 Nothing -> parse (limitedNextIf test) state
                                                 Just _  -> (setFailed "Forbidden pattern detected" state, Nothing))

-- ** Producing tokens

-- | @finishToken@ places all collected text into a new token and begins a new
-- one, or does nothing if there are no collected characters.
finishToken :: Pattern
finishToken =
  Parser (\ state -> let chars = D.toList $ state|>sChars
                         token = Token { tCode = state|>sCode, tText = Just chars }
                     in if state|>sIsPeek || chars == ""
                           then (state, Just ())
                           else (setTokens (D.snoc (state|>sTokens) token)
                               $ setChars D.empty
                               $ state,
                                 Just ()))

-- | @wrap parser@ invokes the /parser/, ensures any unclaimed input characters
-- are wrapped into a token (only happens when testing productions), ensures no
-- input is left unparsed, and returns the parser's result.
wrap :: (Match match result) => match -> Parser result
wrap parser = do result <- match parser
                 finishToken & eof
                 return result

-- | @consume parser@ invokes the /parser/ and then consumes all remaining
-- unparsed input characters.
consume :: (Match match result) => match -> Parser result
consume parser = do result <- match parser
                    finishToken & clear_input
                    return result
                 where clear_input = Parser (\ state -> (setInput [] state, Just ()))

-- | @token code parser@ places all text matched by /parser/ into a 'Token' with
-- the specified /code/ (unless it is empty). Note it collects the text even if
-- there is an error.
token :: (Match match result) => Code -> match -> Pattern
token code parser = finishToken & with setCode sCode code (parser & finishToken)

-- | @fake code text@ creates a token with the specified /code/ and \"fake\"
-- /text/ characters.
fake :: Code -> String -> Pattern
fake code text =
  Parser (\ state -> if state|>sIsPeek
                        then (state, Just())
                        else let token = Token { tCode = code, tText = Just text }
                             in (setTokens (D.snoc (state|>sTokens) token) state, Just ()))

-- | @meta parser@ collects the text matched by the specified /parser/ into a
-- | @Meta@ token.
meta :: (Match match result) => match -> Pattern
meta parser = token Meta parser

-- | @indicator code@ collects the text matched by the specified /parser/ into an
-- @Indicator@ token.
indicator :: (Match match result) => match -> Pattern
indicator parser = token Indicator $ parser

-- | @text parser@  collects the text matched by the specified /parser/ into a
-- @Text@ token.
text :: (Match match result) => match -> Pattern
text parser = token Text parser

-- | @nest code@ returns an empty token with the specified begin\/end /code/ to
-- signal nesting.
nest :: Code -> Pattern
nest code =
  Parser (\ state -> if state|>sIsPeek
                        then (state, Just())
                        else let (state', _) = parse finishToken state
                                 token = Token { tCode = code, tText = Nothing }
                             in (setTokens (D.snoc (state'|>sTokens) token) state', Just ()))

-- * Production parameters

-- | Production context.
data Context = BlockOut     -- ^ Outside block mapping.
             | BlockIn      -- ^ Inside block mapping.
             | FlowOut      -- ^ Outside flow collection.
             | FlowIn       -- ^ Inside flow collection.
             | FlowKey      -- ^ Inside flow key.

-- | @show context@ converts a 'Context' to a 'String'.
instance Show Context where
  show context = case context of
                      BlockOut -> "block-out"
                      BlockIn  -> "block-in"
                      FlowOut  -> "flow-out"
                      FlowIn   -> "flow-in"
                      FlowKey  -> "flow-key"

-- | @read context@ converts a 'String' to a 'Context'. We trust our callers to
-- convert any @-@ characters into @_@ to allow the built-in @lex@ function to
-- handle the names as single identifiers.
instance Read Context where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                        "block_out" -> BlockOut
                        "block_in"  -> BlockIn
                        "flow_out"  -> FlowOut
                        "flow_in"   -> FlowIn
                        "flow_key"  -> FlowKey
                        _           -> error $ "unknown context: " ++ word

-- | Scalar style.
data Style = Plain   -- ^ Plain scalar.
           | Double  -- ^ Double quoted.
           | Single  -- ^ Single quoted.
           | Literal -- ^ Literal block.
           | Folded  -- ^ Folded block.

-- | @show style@ converts a 'Style' to a 'String'.
instance Show Style where
  show style = case style of
                    Plain   -> "plain"
                    Double  -> "double"
                    Single  -> "single"
                    Literal -> "literal"
                    Folded  -> "folded"

-- | @read style@ converts a 'String' to a 'Style'.
instance Read Style where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                        "plain"   -> Plain
                        "double"  -> Double
                        "single"  -> Single
                        "literal" -> Literal
                        "folded"  -> Folded
                        _         -> error $ "unknown style: " ++ word

-- | Chomp method.
data Chomp = Strip -- ^ Remove all trailing line breaks.
           | Clip  -- ^ Keep first trailing line break.
           | Keep  -- ^ Keep all trailing line breaks.

-- | @show chomp@ converts a 'Chomp' to a 'String'.
instance Show Chomp where
  show chomp = case chomp of
                    Strip -> "strip"
                    Clip  -> "clip"
                    Keep  -> "keep"

-- | @read chomp@ converts a 'String' to a 'Chomp'.
instance Read Chomp where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                        "strip" -> Strip
                        "clip"  -> Clip
                        "keep"  -> Keep
                        _       -> error $ "unknown chomp: " ++ word

-- * Tokenizers
--
-- We encapsulate the 'Parser' inside a 'Tokenizer'. This allows us to hide the
-- implementation details from our callers.

-- | 'Tokenizer' converts a (named) input text into a list of 'Token'. Errors
-- are reported as tokens.
type Tokenizer = String -> C.ByteString -> [Token]

-- | @patternTokenizer pattern@ converts the /pattern/ to a simple 'Tokenizer'.
-- Note how using difference lists we can tuck the error token at the end of
-- the result without preventing the streaming.
patternTokenizer :: Pattern -> Tokenizer
patternTokenizer pattern name input =
  let (state, result) = parse (wrap pattern) (initialState name input)
      last = case result of
                  Just _  -> D.empty
                  Nothing -> D.singleton Token { tCode = Error, tText = state|>sMessage }
      bugs = commitBugs $ D.toList $ state|>sCommits
      errors = D.append bugs last
  in D.toList $ D.append (state|>sTokens) errors

-- | @parserTokenizer what parser@ converts the /parser/ returning /what/ to a
-- simple 'Tokenizer' (only used for tests). Note how using difference lists we
-- can tuck the result or error token at the end of the result without
-- preventing the streaming.
parserTokenizer :: (Show result, Match match result) => String -> match -> Tokenizer
parserTokenizer what parser name input =
  let (state, result) = parse (wrap parser) (initialState name input)
      last = case result of
                  Just value -> Token { tCode = Detected, tText = Just $ what ++ "=" ++ (show value) }
                  Nothing    -> Token { tCode = Error, tText = state|>sMessage }
      bugs = commitBugs $ D.toList $ state|>sCommits
      errors = D.snoc bugs last
  in D.toList $ D.append (state|>sTokens) errors

-- | @commitBugs commits@ converts any @commit@ calls made outside the decision
-- they refer to into an error token. No such calls should exists outside
-- tests.
commitBugs :: [String] -> D.DList Token
commitBugs [] = D.empty
commitBugs (decision:rest) =
  D.cons Token { tCode = Error, tText = Just $ "Commit to '" ++ decision ++ "' was made outside it" }
       $ commitBugs rest

-- | @yaml name input@ converts the Unicode /input/ (called /name/ in error
-- messages) to a list of 'Token' according to the YAML spec. This is it!
yaml :: Tokenizer
yaml = patternTokenizer l_yaml_stream

-- CPP LINES CAUSE HADDOCK TO BARF --
#ifdef REAL_CPP
-- This is how non-ancient C pre-processor do it.
#define STR(X) #X
#else
-- This only works in GHC's simplistic GHC (and ancient C pre-processors).
#define STR(X) "X"
#endif
-- These allow us to avoid repeating the parser names.
#define PAT(PATTERN) pat STR(PATTERN) PATTERN
#define PAR(PARSER)  par STR(PARSER)  PARSER
#define PAC(PARSER)  pac STR(PARSER)  PARSER
-- CPP LINES CAUSE HADDOCK TO BARF --

-- | @pName name@ converts a parser name to the \"proper\" spec name.
pName :: String -> String
pName name = regexSub questionRegex "?"
           $ regexSub minusRegex "-"
           $ regexSub plusRegex "+" name
           where regexSub regex value text = subRegex regex text value
                 questionRegex = mkRegex "'"
                 minusRegex = mkRegex "_"
                 plusRegex = mkRegex "__"

-- | @tokenizers@ returns a mapping from a production name to a production
-- tokenizer.
tokenizers :: Map.Map String Tokenizer
tokenizers = PAR(c_chomping_indicator) "t"
           $ PAC(detect_inline_indentation) "m"
           $ PAT(debug_leak)
           $ PAT(b_as_line_feed)
           $ PAT(b_carriage_return)
           $ PAT(b_char)
           $ PAT(b_generic)
           $ PAT(b_ignored_any)
           $ PAT(b_ignored_generic)
           $ PAT(b_l_folded_as_space)
           $ PAT(b_line_feed)
           $ PAT(b_line_separator)
           $ PAT(b_next_line)
           $ PAT(b_normalized)
           $ PAT(b_paragraph_separator)
           $ PAT(b_specific)
           $ PAT(c_alias)
           $ PAT(c_anchor)
           $ PAT(c_byte_order_mark)
           $ PAT(c_collect_entry)
           $ PAT(c_comment)
           $ PAT(c_directive)
           $ PAT(c_document_end)
           $ PAT(c_document_start)
           $ PAT(c_double_quote)
           $ PAT(c_escape)
           $ PAT(c_flow_indicator)
           $ PAT(c_folded)
           $ PAT(c_indicator)
           $ PAT(c_literal)
           $ PAT(c_mapping_end)
           $ PAT(c_mapping_key)
           $ PAT(c_mapping_start)
           $ PAT(c_mapping_value)
           $ PAT(c_named_tag_handle)
           $ PAT(c_nb_comment_text)
           $ PAT(c_non_specific_tag)
           $ PAT(c_ns_alias)
           $ PAT(c_ns_anchor_property)
           $ PAT(c_ns_local_tag_prefix)
           $ PAT(c_ns_properties)
           $ PAT(c_ns_property)
           $ PAT(c_ns_shorthand_tag)
           $ PAT(c_ns_tag_property)
           $ PAT(c_primary_tag_handle)
           $ PAT(c_printable)
           $ PAT(c_quoted_quote)
           $ PAT(c_reserved)
           $ PAT(c_secondary_tag_handle)
           $ PAT(c_sequence_end)
           $ PAT(c_sequence_entry)
           $ PAT(c_sequence_start)
           $ PAT(c_single_quote)
           $ PAT(c_s_simple_json_key)
           $ PAT(c_tag)
           $ PAT(c_tag_handle)
           $ PAT(c_verbatim_tag)
           $ PAT(e_node)
           $ PAT(e_no_document)
           $ PAT(e_scalar)
           $ PAT(l_comment)
           $ PAT(l_directive)
           $ PAT(l_document_prefix)
           $ PAT(l_documents)
           $ PAT(l_document_suffix)
           $ PAT(l_explicit_document)
           $ PAT(l_following_document)
           $ PAT(l_forbidden)
           $ PAT(l_implicit_document)
           $ PAT(l_leading_document)
           $ PAT(l_yaml_stream)
           $ PAT(nb_char)
           $ PAT(nb_double_char)
           $ PAT(nb_single_char)
           $ PAT(ns_anchor_char)
           $ PAT(ns_anchor_name)
           $ PAT(ns_ascii_letter)
           $ PAT(ns_char)
           $ PAT(ns_dec_digit)
           $ PAT(ns_directive_name)
           $ PAT(ns_directive_parameter)
           $ PAT(ns_double_char)
           $ PAT(ns_esc_16_bit)
           $ PAT(ns_esc_32_bit)
           $ PAT(ns_esc_8_bit)
           $ PAT(ns_esc_backslash)
           $ PAT(ns_esc_backspace)
           $ PAT(ns_esc_bell)
           $ PAT(ns_esc_carriage_return)
           $ PAT(ns_esc_char)
           $ PAT(ns_esc_double_quote)
           $ PAT(ns_esc_escape)
           $ PAT(ns_esc_form_feed)
           $ PAT(ns_esc_horizontal_tab)
           $ PAT(ns_esc_line_feed)
           $ PAT(ns_esc_line_separator)
           $ PAT(ns_esc_next_line)
           $ PAT(ns_esc_non_breaking_space)
           $ PAT(ns_esc_null)
           $ PAT(ns_esc_paragraph_separator)
           $ PAT(ns_esc_space)
           $ PAT(ns_esc_vertical_tab)
           $ PAT(ns_global_tag_prefix)
           $ PAT(ns_hex_digit)
           $ PAT(ns_plain_safe_in)
           $ PAT(ns_plain_safe_out)
           $ PAT(ns_reserved_directive)
           $ PAT(ns_s_block_map_implicit_key)
           $ PAT(ns_single_char)
           $ PAT(ns_s_simple_yaml_key)
           $ PAT(ns_tag_char)
           $ PAT(ns_tag_directive)
           $ PAT(ns_tag_prefix)
           $ PAT(ns_uri_char)
           $ PAT(ns_word_char)
           $ PAT(ns_yaml_directive)
           $ PAT(ns_yaml_version)
           $ PAT(s_b_comment)
           $ PAT(s_b_double_escaped)
           $ PAT(s_l_comments)
           $ PAT(s_ns_double_chars)
           $ PAT(s_ns_single_chars)
           $ PAT(s_separate_in_line)
           $ PAT(s_space)
           $ PAT(s_tab)
           $ PAT(s_white)
           $ Map.empty
  where pat name pattern     = Map.insert (pName name) $ patternTokenizer     (match pattern)
        par name parser what = Map.insert (pName name) $ parserTokenizer what (match parser)
        pac name parser what = Map.insert (pName name) $ parserTokenizer what (consume parser)

-- | @tokenizer name@ converts the production with the specified /name/ to a
-- simple 'Tokenizer', or @Nothing@ if it isn't known.
tokenizer :: String -> (Maybe Tokenizer)
tokenizer name = Map.lookup name tokenizers

-- | @tokenizersWithN@ returns a mapping from a production name to a production
-- tokenizer (that takes an /n/ argument).
tokenizersWithN :: Map.Map String (Int -> Tokenizer)
tokenizersWithN = PAR(c_indentation_indicator) "m"
                $ PAR(count_spaces) "m"
                $ PAC(detect_collection_indentation) "m"
                $ PAC(detect_scalar_indentation) "m"
                $ PAT(b_l_literal_next)
                $ PAT(b_l_spaced)
                $ PAT(c_l_block_map_explicit_entry)
                $ PAT(c_l_block_map_explicit_key)
                $ PAT(c_l_block_map_implicit_value)
                $ PAT(c_l_block_seq_entry)
                $ PAT(c_l__folded)
                $ PAT(c_l__literal)
                $ PAT(l_block_map_explicit_value)
                $ PAT(l__block_mapping)
                $ PAT(l__block_sequence)
                $ PAT(l_keep_empty)
                $ PAT(l_nb_folded_lines)
                $ PAT(l_nb_literal_chars)
                $ PAT(l_nb_spaced_lines)
                $ PAT(l_nb_start_with_any)
                $ PAT(l_nb_start_with_folded)
                $ PAT(l_nb_start_with_spaced)
                $ PAT(l_strip_empty)
                $ PAT(l_trail_comments)
                $ PAT(ns_l_block_map_entry)
                $ PAT(ns_l_block_map_implicit_entry)
                $ PAT(ns_l_in_line_mapping)
                $ PAT(ns_l_in_line_sequence)
                $ PAT(s_double_multi)
                $ PAT(s_ignored_prefix_block)
                $ PAT(s_ignored_prefix_flow)
                $ PAT(s_indent)
                $ PAT(s_indent_le)
                $ PAT(s_indent_lt)
                $ PAT(s_l_double_any)
                $ PAT(s_l_flow_folded)
                $ PAT(s_l__flow_in_block)
                $ PAT(s_nb_folded_text)
                $ PAT(s_nb_spaced_text)
                $ PAT(s_ns_double_next)
                $ PAT(s_ns_single_next)
                $ PAT(s_separate_lines)
                $ PAT(s_single_multi)
                $ Map.empty
  where pat name pattern     = Map.insert (pName name) (\ n -> patternTokenizer     (match   $ pattern n))
        par name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (match   $ parser  n))
        pac name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (consume $ parser  n))

-- | @tokenizerWithN name n@ converts the production (that requires an /n/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithN :: String -> Int -> Maybe Tokenizer
tokenizerWithN name n =
  case Map.lookup name tokenizersWithN of
    Just tokenizer -> Just $ tokenizer n
    Nothing        -> Nothing

-- | @tokenizersWithC@ returns a mapping from a production name to a production
-- tokenizer (that takes a /c/ argument).
tokenizersWithC :: Map.Map String (Context -> Tokenizer)
tokenizersWithC = PAT(nb_plain_char)
                $ PAT(ns_plain_char)
                $ PAT(ns_plain_first)
                $ PAT(ns_plain_safe)
                $ PAT(ns_plain_single)
                $ PAT(s_ns_plain_chars)
                $ Map.empty
  where pat name pattern = Map.insert (pName name) (\ c -> patternTokenizer (match $ pattern c))

-- | @tokenizerWithC name c@ converts the production (that requires a /c/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithC :: String -> Context -> Maybe Tokenizer
tokenizerWithC name c =
  case Map.lookup name tokenizersWithC of
    Just tokenizer -> Just $ tokenizer c
    Nothing        -> Nothing

-- | @tokenizersWithS@ returns a mapping from a production name to a production
-- tokenizer (that takes a /s/ argument).
tokenizersWithS :: Map.Map String (Style -> Tokenizer)
tokenizersWithS = PAT(c_style_indicator)
                $ Map.empty
  where pat name pattern = Map.insert (pName name) (\ s -> patternTokenizer (match $ pattern s))

-- | @tokenizerWithS name s@ converts the production (that requires an /s/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithS :: String -> Style -> Maybe Tokenizer
tokenizerWithS name s =
  case Map.lookup name tokenizersWithS of
    Just tokenizer -> Just $ tokenizer s
    Nothing        -> Nothing

-- | @tokenizersWithT@ returns a mapping from a production name to a production
-- tokenizer (that takes a /t/ argument).
tokenizersWithT :: Map.Map String (Chomp -> Tokenizer)
tokenizersWithT = PAT(b_chomped_last)
                $ Map.empty
  where pat name pattern = Map.insert (pName name) (\ t -> patternTokenizer (match $ pattern t))

-- | @tokenizerWithT name t@ converts the production (that requires an /t/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithT :: String -> Chomp -> Maybe Tokenizer
tokenizerWithT name t = 
  case Map.lookup name tokenizersWithT of
    Just tokenizer -> Just $ tokenizer t
    Nothing        -> Nothing

-- | @tokenizersWithNC@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /c/ arguments).
tokenizersWithNC :: Map.Map String (Int -> Context -> Tokenizer)
tokenizersWithNC = PAT(c_double_quoted)
                 $ PAT(c_flow_json_content)
                 $ PAT(c_flow_json_node)
                 $ PAT(c_flow_mapping)
                 $ PAT(c_flow_sequence)
                 $ PAT(c_ns_flow_map_adjacent_value)
                 $ PAT(c_ns_flow_map_implicit_json)
                 $ PAT(c_ns_flow_map_separate_value)
                 $ PAT(c_ns_flow_map_single_json)
                 $ PAT(c_single_quoted)
                 $ PAT(nb_double_text)
                 $ PAT(nb_single_text)
                 $ PAT(ns_flow_content)
                 $ PAT(ns_flow_map_entry)
                 $ PAT(ns_flow_map_implicit_empty)
                 $ PAT(ns_flow_map_implicit_entry)
                 $ PAT(ns_flow_map_implicit_yaml)
                 $ PAT(ns_flow_map_single_entry)
                 $ PAT(ns_flow_map_single_pair)
                 $ PAT(ns_flow_map_single_yaml)
                 $ PAT(ns_flow_node)
                 $ PAT(ns_flow_seq_entry)
                 $ PAT(ns_flow_yaml_content)
                 $ PAT(ns_flow_yaml_node)
                 $ PAT(ns_plain)
                 $ PAT(ns_plain_multi)
                 $ PAT(ns_s_flow_map_entries)
                 $ PAT(ns_s_flow_seq_entries)
                 $ PAT(s_l__block_content)
                 $ PAT(s_l__block_in_block)
                 $ PAT(s_l__block_indented)
                 $ PAT(s_l__block_node)
                 $ PAT(s_ns_flow_map_explicit_empty)
                 $ PAT(s_ns_flow_map_explicit_entry)
                 $ PAT(s_ns_flow_map_explicit_json)
                 $ PAT(s_ns_flow_map_explicit_yaml)
                 $ PAT(s_ns_plain_next)
                 $ PAT(s_separate_in_flow)
                 $ Map.empty
  where pat name pattern = Map.insert (pName name) (\ n c -> patternTokenizer (match $ pattern n c))

-- | @tokenizerWithNC name n c@ converts the production (that requires /n/ and
-- /c/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNC :: String -> Int -> Context -> Maybe Tokenizer
tokenizerWithNC name n c =
  case Map.lookup name tokenizersWithNC of
    Just tokenizer -> Just $ tokenizer n c
    Nothing        -> Nothing

-- | @tokenizersWithNS@ returns a mapping from a production name to a production
-- tokenizer (that requires /n/ and /s/ arguments).
tokenizersWithNS :: Map.Map String (Int -> Style -> Tokenizer)
tokenizersWithNS = PAR(c_b__block_header) "(m,t)"
                 $ PAT(b_l_folded_any)
                 $ PAT(b_l_folded_specific)
                 $ PAT(b_l_folded_trimmed)
                 $ PAT(l_empty)
                 $ PAT(s_ignored_prefix)
                 $ Map.empty
  where pat name pattern     = Map.insert (pName name) (\ n s -> patternTokenizer     (match $ pattern n s))
        par name parser what = Map.insert (pName name) (\ n s -> parserTokenizer what (match $ parser  n s))

-- | @tokenizerWithNS name n s@ converts the production (that requires /n/ and
-- /s/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNS :: String -> Int -> Style -> Maybe Tokenizer
tokenizerWithNS name n s =
  case Map.lookup name tokenizersWithNS of
    Just tokenizer -> Just $ tokenizer n s
    Nothing        -> Nothing

-- | @tokenizersWithNT@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /t/ arguments).
tokenizersWithNT :: Map.Map String (Int -> Chomp -> Tokenizer)
tokenizersWithNT = PAT(l_chomped_empty)
                 $ PAT(l_folded_content)
                 $ PAT(l_literal_content)
                 $ Map.empty
  where pat name pattern = Map.insert (pName name) (\ n t -> patternTokenizer (match $ pattern n t))

-- | @tokenizerWithNT name n t@ converts the production (that requires /n/ and
-- /t/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNT :: String -> Int -> Chomp -> Maybe Tokenizer
tokenizerWithNT name n t =
  case Map.lookup name tokenizersWithNT of
    Just tokenizer -> Just $ tokenizer n t
    Nothing        -> Nothing

-- | @tokenizerNames@ returns the list of all productions (tokenizers).
tokenizerNames :: [String]
tokenizerNames = (Map.keys tokenizers)
              ++ (Map.keys tokenizersWithN)
              ++ (Map.keys tokenizersWithC)
              ++ (Map.keys tokenizersWithS)
              ++ (Map.keys tokenizersWithT)
              ++ (Map.keys tokenizersWithNC)
              ++ (Map.keys tokenizersWithNS)
              ++ (Map.keys tokenizersWithNT)

-- * Productions

-- ** BNF compatibility helpers

-- | @detect_utf_encoding@ doesn't actually detect the encoding, we just call it
-- this way to make the productions compatible with the spec. Instead it simply
-- reports the encoding (which was already detected when we started parsing).
detect_utf_encoding = Parser (\ state -> let text = case state|>sEncoding of
                                                         UTF8    -> "TF8"
                                                         UTF16LE -> "TF16LE"
                                                         UTF16BE -> "TF16BE"
                                         in parse (fake Bom text) $ setChars D.empty
                                                                  $ setColumn (state|>sColumn .- 1)
                                                                  $ state)

-- | @na@ is the \"non-applicable\" indentation value. We use Haskell's laziness
-- to verify it really is never used.
na :: Int
na = error "Accessing non-applicable indentation"

-- | @asInteger@ returns the last consumed character, which is assumed to be a
-- decimal digit, as an integer.
asInteger :: Parser Int
asInteger = Parser (\ state -> (state, Just $ ord (state|>sLast) .- 48))

-- | @result value@ is the same as /return value/ except that we give the
-- Haskell type deduction the additional boost it needs to figure out this is
-- wrapped in a 'Parser'.
result :: result -> Parser result
result = return

-- CPP LINES CAUSE HADDOCK TO BARF --
#include "Reference.bnf"
-- CPP LINES CAUSE HADDOCK TO BARF --

debug_leak = ( c_comment & b_line_feed *)