{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FunctionalDependencies #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE PostfixOperators #-} {-# LANGUAGE Safe #-} {-# LANGUAGE TypeSynonymInstances #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-incomplete-patterns #-} -- FIXME -- | -- Copyright: © Oren Ben-Kiki 2007, -- © Herbert Valerio Riedel 2015-2018 -- SPDX-License-Identifier: GPL-3.0 -- -- Tokenizer for the YAML 1.2 syntax as defined in . -- module Data.YAML.Token ( tokenize , Token(..) , Code(..) ) where import qualified Data.ByteString.Lazy.Char8 as BLC import Data.Char (chr, ord) import qualified Data.DList as D import Prelude hiding ((*), (+), (-), (/), (^)) import qualified Prelude import Util hiding (empty) -- * Generic operators -- -- ** Numeric operators -- -- We rename the four numerical operators @+@ @-@ @*@ @\/@ to start with @.@ -- (@.+@, @.-@, @.*@, @.\/@). This allows us to use the originals for BNF -- notation (we also hijack the @^@ operator). This is not a generally -- recommended practice. It is justified in this case since we have very little -- arithmetic operations, and a lot of BNF rules which this makes extremely -- readable. infixl 6 .+ -- | \".+\" is the numeric addition (we use \"+\" for postfix \"one or more\"). (.+) :: Int -> Int -> Int (.+) = (Prelude.+) infixl 6 .- -- | \".-\" is the numeric subtraction (we use \"-\" for infix \"and not\"). (.-) :: Int -> Int -> Int (.-) = (Prelude.-) infixl 7 .* -- | \".*\" is the numeric multiplication (we use \"*\" for postfix \"zero or -- more\"). (.*) :: Int -> Int -> Int (.*) = (Prelude.*) -- ** Record field access -- -- We also define @|>@ for record access for increased readability. infixl 9 |> -- | @record |> field@ is the same as @field record@, but is more readable. (|>) :: record -> (record -> value) -> value record |> field = field record -- * UTF decoding -- -- This really should be factored out to the standard libraries. Since it isn't -- there, we get to tailor it exactly to our needs. We use lazy byte strings as -- input, which should give reasonable I\/O performance when reading large -- files. The output is a normal 'Char' list which is easy to work with and -- should be efficient enough as long as the 'Parser' does its job right. -- | Recognized Unicode encodings. As of YAML 1.2 UTF-32 is also required. data Encoding = UTF8 -- ^ UTF-8 encoding (or ASCII) | UTF16LE -- ^ UTF-16 little endian | UTF16BE -- ^ UTF-16 big endian | UTF32LE -- ^ UTF-32 little endian | UTF32BE -- ^ UTF-32 big endian -- | @show encoding@ converts an 'Encoding' to the encoding name (with a "-") -- as used by most programs. instance Show Encoding where show UTF8 = "UTF-8" show UTF16LE = "UTF-16LE" show UTF16BE = "UTF-16BE" show UTF32LE = "UTF-32LE" show UTF32BE = "UTF-32BE" -- | @decode bytes@ automatically detects the 'Encoding' used and converts the -- /bytes/ to Unicode characters, with byte offsets. Note the offset is for -- past end of the character, not its beginning. decode :: BLC.ByteString -> (Encoding, [(Int, Char)]) decode text = (encoding, undoEncoding encoding text) where encoding = detectEncoding $ BLC.unpack $ BLC.take 4 text -- | @detectEncoding text@ examines the first few chars (bytes) of the /text/ -- to deduce the Unicode encoding used according to the YAML spec. detectEncoding :: [Char] -> Encoding detectEncoding text = case text of '\x00' : '\x00' : '\xFE' : '\xFF' : _ -> UTF32BE '\x00' : '\x00' : '\x00' : _ : _ -> UTF32BE '\xFF' : '\xFE' : '\x00' : '\x00' : _ -> UTF32LE _ : '\x00' : '\x00' : '\x00' : _ -> UTF32LE '\xFE' : '\xFF' : _ -> UTF16BE '\x00' : _ : _ -> UTF16BE '\xFF' : '\xFE' : _ -> UTF16LE _ : '\x00' : _ -> UTF16LE '\xEF' : '\xBB' : '\xBF' : _ -> UTF8 _ -> UTF8 -- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode -- characters according to the /encoding/. undoEncoding :: Encoding -> BLC.ByteString -> [(Int, Char)] undoEncoding encoding bytes = case encoding of UTF8 -> undoUTF8 bytes 0 UTF16LE -> combinePairs $ undoUTF16LE bytes 0 UTF16BE -> combinePairs $ undoUTF16BE bytes 0 UTF32LE -> combinePairs $ undoUTF32LE bytes 0 UTF32BE -> combinePairs $ undoUTF32BE bytes 0 -- ** UTF-32 decoding -- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/ -- left to read. hasFewerThan :: Int -> BLC.ByteString -> Bool hasFewerThan n bytes | n == 1 = BLC.null bytes | n > 1 = BLC.null bytes || hasFewerThan (n .- 1) (BLC.tail bytes) | otherwise = False -- | @undoUTF32LE bytes offset@ decoded a UTF-32LE /bytes/ stream to Unicode -- chars. undoUTF32LE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF32LE bytes offset | BLC.null bytes = [] | hasFewerThan 4 bytes = error "UTF-32LE input contains invalid number of bytes" | otherwise = let first = BLC.head bytes bytes' = BLC.tail bytes second = BLC.head bytes' bytes'' = BLC.tail bytes' third = BLC.head bytes'' bytes''' = BLC.tail bytes'' fourth = BLC.head bytes''' rest = BLC.tail bytes''' in (offset .+ 4, chr $ (ord first) .+ 256 .* ((ord second) .+ 256 .* ((ord third) .+ 256 .* ((ord fourth))))):(undoUTF32LE rest $ offset .+ 4) -- | @undoUTF32BE bytes offset@ decoded a UTF-32BE /bytes/ stream to Unicode -- chars. undoUTF32BE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF32BE bytes offset | BLC.null bytes = [] | hasFewerThan 4 bytes = error "UTF-32BE input contains invalid number of bytes" | otherwise = let first = BLC.head bytes bytes' = BLC.tail bytes second = BLC.head bytes' bytes'' = BLC.tail bytes' third = BLC.head bytes'' bytes''' = BLC.tail bytes'' fourth = BLC.head bytes''' rest = BLC.tail bytes''' in (offset .+ 4, chr $ (ord fourth) .+ 256 .* ((ord third) .+ 256 .* ((ord second) .+ 256 .* ((ord first))))):(undoUTF32BE rest $ offset .+ 4) -- ** UTF-16 decoding -- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a -- single Unicode character. combinePairs :: [(Int, Char)] -> [(Int, Char)] combinePairs [] = [] combinePairs (head@(_, head_char):tail) | '\xD800' <= head_char && head_char <= '\xDBFF' = combineLead head tail | '\xDC00' <= head_char && head_char <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate" | otherwise = head:(combinePairs tail) -- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the -- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues -- combining surrogate pairs. combineLead :: (Int, Char) -> [(Int, Char)] -> [(Int, Char)] combineLead _lead [] = error "UTF-16 contains lead surrogate as final character" combineLead (_, lead_char) ((trail_offset, trail_char):rest) | '\xDC00' <= trail_char && trail_char <= '\xDFFF' = (trail_offset, combineSurrogates lead_char trail_char):combinePairs rest | otherwise = error "UTF-16 contains lead surrogate without trail surrogate" -- | @surrogateOffset@ is copied from the Unicode FAQs. surrogateOffset :: Int surrogateOffset = 0x10000 .- (0xD800 .* 1024) .- 0xDC00 -- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single -- Unicode character. combineSurrogates :: Char -> Char -> Char combineSurrogates lead trail = chr $ (ord lead) .* 1024 .+ (ord trail) .+ surrogateOffset -- | @undoUTF18LE bytes offset@ decoded a UTF-16LE /bytes/ stream to Unicode -- chars. undoUTF16LE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF16LE bytes offset | BLC.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16LE input contains odd number of bytes" | otherwise = let low = BLC.head bytes bytes' = BLC.tail bytes high = BLC.head bytes' rest = BLC.tail bytes' in (offset .+ 2, chr $ (ord low) .+ (ord high) .* 256):(undoUTF16LE rest $ offset .+ 2) -- | @undoUTF18BE bytes offset@ decoded a UTF-16BE /bytes/ stream to Unicode -- chars. undoUTF16BE :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF16BE bytes offset | BLC.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16BE input contains odd number of bytes" | otherwise = let high = BLC.head bytes bytes' = BLC.tail bytes low = BLC.head bytes' rest = BLC.tail bytes' in (offset .+ 2, chr $ (ord low) .+ (ord high) .* 256):(undoUTF16BE rest $ offset .+ 2) -- ** UTF-8 decoding -- | @undoUTF8 bytes offset@ decoded a UTF-8 /bytes/ stream to Unicode chars. undoUTF8 :: BLC.ByteString -> Int -> [(Int, Char)] undoUTF8 bytes offset | BLC.null bytes = [] | otherwise = let first = BLC.head bytes rest = BLC.tail bytes in case () of _ | first < '\x80' -> (offset .+ 1, first):(undoUTF8 rest $ offset .+ 1) | first < '\xC0' -> error $ "UTF-8 input contains invalid first byte" | first < '\xE0' -> decodeTwoUTF8 first offset rest | first < '\xF0' -> decodeThreeUTF8 first offset rest | first < '\xF8' -> decodeFourUTF8 first offset rest | otherwise -> error $ "UTF-8 input contains invalid first byte" -- | @decodeTwoUTF8 first offset bytes@ decodes a two-byte UTF-8 character, -- where the /first/ byte is already available and the second is the head of -- the /bytes/, and then continues to undo the UTF-8 encoding. decodeTwoUTF8 :: Char -> Int -> BLC.ByteString -> [(Int, Char)] decodeTwoUTF8 first offset bytes | BLC.null bytes = error "UTF-8 double byte char is missing second byte at eof" | otherwise = let second = BLC.head bytes rest = BLC.tail bytes in case () of _ | second < '\x80' || '\xBF' < second -> error $ "UTF-8 double byte char has invalid second byte" | otherwise -> (offset .+ 2, combineTwoUTF8 first second):(undoUTF8 rest $ offset .+ 2) -- | @combineTwoUTF8 first second@ combines the /first/ and /second/ bytes of a -- two-byte UTF-8 char into a single Unicode char. combineTwoUTF8 :: Char -> Char -> Char combineTwoUTF8 first second = chr(((ord first) .- 0xC0) .* 64 .+ ((ord second) .- 0x80)) -- | @decodeThreeUTF8 first offset bytes@ decodes a three-byte UTF-8 character, -- where the /first/ byte is already available and the second and third are the -- head of the /bytes/, and then continues to undo the UTF-8 encoding. decodeThreeUTF8 :: Char -> Int -> BLC.ByteString -> [(Int, Char)] decodeThreeUTF8 first offset bytes | hasFewerThan 2 bytes = error "UTF-8 triple byte char is missing bytes at eof" | otherwise = let second = BLC.head bytes bytes' = BLC.tail bytes third = BLC.head bytes' rest = BLC.tail bytes' in case () of _ | second < '\x80' || '\xBF' < second -> error "UTF-8 triple byte char has invalid second byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 triple byte char has invalid third byte" | otherwise -> (offset .+ 3, combineThreeUTF8 first second third):(undoUTF8 rest $ offset .+ 3) -- | @combineThreeUTF8 first second@ combines the /first/, /second/ and /third/ -- bytes of a three-byte UTF-8 char into a single Unicode char. combineThreeUTF8 :: Char -> Char -> Char -> Char combineThreeUTF8 first second third = chr(((ord first) .- 0xE0) .* 4096 .+ ((ord second) .- 0x80) .* 64 .+ ((ord third) .- 0x80)) -- | @decodeFourUTF8 first offset bytes@ decodes a four-byte UTF-8 character, -- where the /first/ byte is already available and the second, third and fourth -- are the head of the /bytes/, and then continues to undo the UTF-8 encoding. decodeFourUTF8 :: Char -> Int -> BLC.ByteString -> [(Int, Char)] decodeFourUTF8 first offset bytes | hasFewerThan 3 bytes = error "UTF-8 quad byte char is missing bytes at eof" | otherwise = let second = BLC.head bytes bytes' = BLC.tail bytes third = BLC.head bytes' bytes'' = BLC.tail bytes' fourth = BLC.head bytes'' rest = BLC.tail bytes'' in case () of _ | second < '\x80' || '\xBF' < second -> error "UTF-8 quad byte char has invalid second byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 quad byte char has invalid third byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 quad byte char has invalid fourth byte" | otherwise -> (offset .+ 4, combineFourUTF8 first second third fourth):(undoUTF8 rest $ offset .+ 4) -- | @combineFourUTF8 first second@ combines the /first/, /second/ and /third/ -- bytes of a three-byte UTF-8 char into a single Unicode char. combineFourUTF8 :: Char -> Char -> Char -> Char -> Char combineFourUTF8 first second third fourth = chr(((ord first) .- 0xF0) .* 262144 .+ ((ord second) .- 0x80) .* 4096 .+ ((ord third) .- 0x80) .* 64 .+ ((ord fourth) .- 0x80)) -- * Result tokens -- -- The parsing result is a stream of tokens rather than a parse tree. The idea -- is to convert the YAML input into \"byte codes\". These byte codes are -- intended to be written into a byte codes file (or more likely a UNIX pipe) -- for further processing. -- | 'Token' codes. data Code = Bom -- ^ BOM, contains \"@TF8@\", \"@TF16LE@\", \"@TF32BE@\", etc. | Text -- ^ Content text characters. | Meta -- ^ Non-content (meta) text characters. | Break -- ^ Separation line break. | LineFeed -- ^ Line break normalized to content line feed. | LineFold -- ^ Line break folded to content space. | Indicator -- ^ Character indicating structure. | White -- ^ Separation white space. | Indent -- ^ Indentation spaces. | DirectivesEnd -- ^ Document start marker. | DocumentEnd -- ^ Document end marker. | BeginEscape -- ^ Begins escape sequence. | EndEscape -- ^ Ends escape sequence. | BeginComment -- ^ Begins comment. | EndComment -- ^ Ends comment. | BeginDirective -- ^ Begins directive. | EndDirective -- ^ Ends directive. | BeginTag -- ^ Begins tag. | EndTag -- ^ Ends tag. | BeginHandle -- ^ Begins tag handle. | EndHandle -- ^ Ends tag handle. | BeginAnchor -- ^ Begins anchor. | EndAnchor -- ^ Ends anchor. | BeginProperties -- ^ Begins node properties. | EndProperties -- ^ Ends node properties. | BeginAlias -- ^ Begins alias. | EndAlias -- ^ Ends alias. | BeginScalar -- ^ Begins scalar content. | EndScalar -- ^ Ends scalar content. | BeginSequence -- ^ Begins sequence content. | EndSequence -- ^ Ends sequence content. | BeginMapping -- ^ Begins mapping content. | EndMapping -- ^ Ends mapping content. | BeginPair -- ^ Begins mapping key:value pair. | EndPair -- ^ Ends mapping key:value pair. | BeginNode -- ^ Begins complete node. | EndNode -- ^ Ends complete node. | BeginDocument -- ^ Begins document. | EndDocument -- ^ Ends document. | BeginStream -- ^ Begins YAML stream. | EndStream -- ^ Ends YAML stream. | Error -- ^ Parsing error at this point. | Unparsed -- ^ Unparsed due to errors (or at end of test). | Detected -- ^ Detected parameter (for testing). deriving (Show,Eq) {- -- | @show code@ converts a 'Code' to the one-character YEAST token code char. -- The list of byte codes is also documented in the @yaml2yeast@ program. instance Show Code where show code = case code of Bom -> "U" Text -> "T" Meta -> "t" Break -> "b" LineFeed -> "L" LineFold -> "l" Indicator -> "I" White -> "w" Indent -> "i" DirectivesEnd -> "K" DocumentEnd -> "k" BeginEscape -> "E" EndEscape -> "e" BeginComment -> "C" EndComment -> "c" BeginDirective -> "D" EndDirective -> "d" BeginTag -> "G" EndTag -> "g" BeginHandle -> "H" EndHandle -> "h" BeginAnchor -> "A" EndAnchor -> "a" BeginProperties -> "P" EndProperties -> "p" BeginAlias -> "R" EndAlias -> "r" BeginScalar -> "S" EndScalar -> "s" BeginSequence -> "Q" EndSequence -> "q" BeginMapping -> "M" EndMapping -> "m" BeginNode -> "N" EndNode -> "n" BeginPair -> "X" EndPair -> "x" BeginDocument -> "O" EndDocument -> "o" Error -> "!" Unparsed -> "-" Detected -> "$" -} -- | Parsed token. data Token = Token { tByteOffset :: Int, -- ^ 0-base byte offset in stream. tCharOffset :: Int, -- ^ 0-base character offset in stream. tLine :: Int, -- ^ 1-based line number. tLineChar :: Int, -- ^ 0-based character in line. tCode :: Code, -- ^ Specific token 'Code'. tText :: String -- ^ Contained input chars, if any. } deriving Show -- * Parsing framework -- -- Haskell has no shortage of parsing frameworks. We use our own because: -- -- * Most available frameworks are inappropriate because of their focus on -- building a parse tree, and completing all of it before any of it is -- accessible to the caller. We return a stream of tokens, and would like -- its head to be accessible as soon as possible to allow for streaming. To -- do this with bounded memory usage we use a combination of continuation -- passing style and difference lists for the collected tokens. -- -- * Haskell makes it so easy to roll your own parsing framework. We need some -- specialized machinery (limited lookahead, forbidden patterns). It is -- possible to build these on top of existing frameworks but the end result -- isn't much shorter than rolling our own. -- -- Since we roll our own framework we don't bother with making it generalized, -- so we maintain a single 'State' type rather than having a generic one that -- contains a polymorphic \"UserState\" field etc. -- | A 'Parser' is basically a function computing a 'Reply'. data Parser result = Parser (State -> Reply result) -- | The 'Result' of each invocation is either an error, the actual result, or -- a continuation for computing the actual result. data Result result = Failed String -- ^ Parsing aborted with a failure. | Result result -- ^ Parsing completed with a result. | More (Parser result) -- ^ Parsing is ongoing with a continuation. -- Showing a 'Result' is only used in debugging. instance (Show result) => Show (Result result) where show result = case result of Failed message -> "Failed " ++ message Result result -> "Result " ++ (show result) More _ -> "More" -- | Each invication of a 'Parser' yields a 'Reply'. The 'Result' is only one -- part of the 'Reply'. data Reply result = Reply { rResult :: !(Result result), -- ^ Parsing result. rTokens :: !(D.DList Token), -- ^ Tokens generated by the parser. rCommit :: !(Maybe String), -- ^ Commitment to a decision point. rState :: !State -- ^ The updated parser state. } -- Showing a 'State' is only used in debugging. instance (Show result) => Show (Reply result) where show reply = "Result: " ++ (show $ reply|>rResult) ++ ", Tokens: " ++ (show $ D.toList $ reply|>rTokens) ++ ", Commit: " ++ (show $ reply|>rCommit) ++ ", State: { " ++ (show $ reply|>rState) ++ "}" -- A 'Pattern' is a parser that doesn't have an (interesting) result. type Pattern = Parser () -- ** Parsing state -- | The internal parser state. We don't bother with parameterising it with a -- \"UserState\", we just bundle the generic and specific fields together (not -- that it is that easy to draw the line - is @sLine@ generic or specific?). data State = State { sEncoding :: !Encoding, -- ^ The input UTF encoding. sDecision :: !String, -- ^ Current decision name. sLimit :: !Int, -- ^ Lookahead characters limit. sForbidden :: !(Maybe Pattern), -- ^ Pattern we must not enter into. sIsPeek :: !Bool, -- ^ Disables token generation. sIsSol :: !Bool, -- ^ Is at start of line? sChars :: ![Char], -- ^ (Reversed) characters collected for a token. sCharsByteOffset :: !Int, -- ^ Byte offset of first collected character. sCharsCharOffset :: !Int, -- ^ Char offset of first collected character. sCharsLine :: !Int, -- ^ Line of first collected character. sCharsLineChar :: !Int, -- ^ Character in line of first collected character. sByteOffset :: !Int, -- ^ Offset in bytes in the input. sCharOffset :: !Int, -- ^ Offset in characters in the input. sLine :: !Int, -- ^ Builds on YAML's line break definition. sLineChar :: !Int, -- ^ Character number in line. sCode :: !Code, -- ^ Of token we are collecting chars for. sLast :: !Char, -- ^ Last matched character. sInput :: ![(Int, Char)] -- ^ The decoded input characters. } -- Showing a 'State' is only used in debugging. Note that forcing dump of -- @sInput@ will disable streaming it. instance Show State where show state = "Encoding: " ++ (show $ state|>sEncoding) ++ ", Decision: " ++ (show $ state|>sDecision) ++ ", Limit: " ++ (show $ state|>sLimit) ++ ", IsPeek: " ++ (show $ state|>sIsPeek) ++ ", IsSol: " ++ (show $ state|>sIsSol) ++ ", Chars: >>>" ++ (reverse $ state|>sChars) ++ "<<<" ++ ", CharsByteOffset: " ++ (show $ state|>sCharsByteOffset) ++ ", CharsCharOffset: " ++ (show $ state|>sCharsCharOffset) ++ ", CharsLine: " ++ (show $ state|>sCharsLine) ++ ", CharsLineChar: " ++ (show $ state|>sCharsLineChar) ++ ", ByteOffset: " ++ (show $ state|>sByteOffset) ++ ", CharOffset: " ++ (show $ state|>sCharOffset) ++ ", Line: " ++ (show $ state|>sLine) ++ ", LineChar: " ++ (show $ state|>sLineChar) ++ ", Code: " ++ (show $ state|>sCode) ++ ", Last: " ++ (show $ state|>sLast) -- ++ ", Input: >>>" ++ (show $ state|>sInput) ++ "<<<" -- | @initialState name input@ returns an initial 'State' for parsing the -- /input/ (with /name/ for error messages). initialState :: BLC.ByteString -> State initialState input = State { sEncoding = encoding , sDecision = "" , sLimit = -1 , sForbidden = Nothing , sIsPeek = False , sIsSol = True , sChars = [] , sCharsByteOffset = -1 , sCharsCharOffset = -1 , sCharsLine = -1 , sCharsLineChar = -1 , sByteOffset = 0 , sCharOffset = 0 , sLine = 1 , sLineChar = 0 , sCode = Unparsed , sLast = ' ' , sInput = decoded } where (encoding, decoded) = decode input -- *** Setters -- -- We need four setter functions to pass them around as arguments. For some -- reason, Haskell only generates getter functions. -- | @setLimit limit state@ sets the @sLimit@ field to /limit/. setLimit :: Int -> State -> State setLimit limit state = state { sLimit = limit } -- | @setForbidden forbidden state@ sets the @sForbidden@ field to /forbidden/. setForbidden :: Maybe Pattern -> State -> State setForbidden forbidden state = state { sForbidden = forbidden } -- | @setCode code state@ sets the @sCode@ field to /code/. setCode :: Code -> State -> State setCode code state = state { sCode = code } -- ** Implicit parsers -- -- It is tedious to have to wrap each expected character (or character range) -- in an explicit 'Parse' constructor. We let Haskell do that for us using a -- 'Match' class. -- | @Match parameter result@ specifies that we can convert the /parameter/ to -- a 'Parser' returning the /result/. class Match parameter result | parameter -> result where match :: parameter -> Parser result -- | We don't need to convert a 'Parser', it already is one. instance Match (Parser result) result where match = id -- | We convert 'Char' to a parser for a character (that returns nothing). instance Match Char () where match code = nextIf (== code) -- | We convert a 'Char' tuple to a parser for a character range (that returns -- nothing). instance Match (Char, Char) () where match (low, high) = nextIf $ \ code -> low <= code && code <= high -- | We convert 'String' to a parser for a sequence of characters (that returns -- nothing). instance Match String () where match = foldr (&) empty -- ** Reply constructors -- | @returnReply state result@ prepares a 'Reply' with the specified /state/ -- and /result/. returnReply :: State -> result -> Reply result returnReply state result = Reply { rResult = Result result, rTokens = D.empty, rCommit = Nothing, rState = state } -- | @tokenReply state token@ returns a 'Reply' containing the /state/ and -- /token/. Any collected characters are cleared (either there are none, or we -- put them in this token, or we don't want them). tokenReply :: State -> Token -> Reply () tokenReply state token = Reply { rResult = Result (), rTokens = D.singleton token, rCommit = Nothing, rState = state { sCharsByteOffset = -1, sCharsCharOffset = -1, sCharsLine = -1, sCharsLineChar = -1, sChars = [] } } -- | @failReply state message@ prepares a 'Reply' with the specified /state/ -- and error /message/. failReply :: State -> String -> Reply result failReply state message = Reply { rResult = Failed message, rTokens = D.empty, rCommit = Nothing, rState = state } -- | @unexpectedReply state@ returns a @failReply@ for an unexpected character. unexpectedReply :: State -> Reply result unexpectedReply state = case state|>sInput of ((_, char):_) -> failReply state $ "Unexpected '" ++ [char] ++ "'" [] -> failReply state "Unexpected end of input" instance Functor Parser where fmap = liftM instance Applicative Parser where pure = return (<*>) = ap -- | Allow using the @do@ notation for our parsers, which makes for short and -- sweet @do@ syntax when we want to examine the results (we typically don't). instance Monad Parser where -- @return result@ does just that - return a /result/. return result = Parser $ \ state -> returnReply state result -- @left >>= right@ applies the /left/ parser, and if it didn't fail -- applies the /right/ one (well, the one /right/ returns). left >>= right = bindParser left right where bindParser (Parser left) right = Parser $ \ state -> let reply = left state in case reply|>rResult of Failed message -> reply { rResult = Failed message } Result value -> reply { rResult = More $ right value } More parser -> reply { rResult = More $ bindParser parser right } -- @fail message@ does just that - fails with a /message/. fail message = Parser $ \ state -> failReply state message -- ** Parsing operators -- -- Here we reap the benefits of renaming the numerical operators. The Operator -- precedence, in decreasing strength: -- -- @repeated % n@, @repeated <% n@, @match - rejected@, @match ! decision@, -- @match ?! decision@, @choice ^ (first \/ second)@. -- -- @match - first - second@ is @(match - first) - second@. -- -- @first & second & third@ is @first & (second & third)@. Note that @first - -- rejected & second@ is @(first - rejected) & second@, etc. -- -- @match \/ alternative \/ otherwise@ is @match \/ (alternative \/ -- otherwise)@. Note that @first & second \/ third@ is @(first & second) \/ -- third@. -- -- @( match *)@, @(match +)@, @(match ?)@, @(match ?)@, @(match -- >!)@, @(match ? infix 0 >! -- | @parser % n@ repeats /parser/ exactly /n/ times. (%) :: (Match match result) => match -> Int -> Pattern parser % n | n <= 0 = empty | n > 0 = parser & parser % n .- 1 -- | @parser <% n@ matches fewer than /n/ occurrences of /parser/. (<%) :: (Match match result) => match -> Int -> Pattern parser <% n | n < 1 = fail "Fewer than 0 repetitions" | n == 1 = reject parser Nothing | n > 1 = "<%" ^ ( parser ! "<%" & parser <% n .- 1 / empty ) -- | @decision ^ (option \/ option \/ ...)@ provides a /decision/ name to the -- choice about to be made, to allow to @commit@ to it. (^) :: (Match match result) => String -> match -> Parser result decision ^ parser = choice decision $ match parser -- | @parser ! decision@ commits to /decision/ (in an option) after -- successfully matching the /parser/. (!) :: (Match match result) => match -> String -> Pattern parser ! decision = parser & commit decision -- | @parser ?! decision@ commits to /decision/ (in an option) if the current -- position matches /parser/, without consuming any characters. (?!) :: (Match match result) => match -> String -> Pattern parser ?! decision = peek parser & commit decision -- | @lookbehind match -> Parser result (?@ matches the current point without consuming any characters -- if it matches the lookahead parser (positive lookahead) (>?) :: (Match match result) => match -> Parser result (>?) lookahead = peek lookahead -- | @lookahead >?@ matches the current point without consuming any characters -- if it matches the lookahead parser (negative lookahead) (>!) :: (Match match result) => match -> Pattern (>!) lookahead = reject lookahead Nothing -- | @parser - rejected@ matches /parser/, except if /rejected/ matches at this -- point. (-) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1 parser - rejected = reject rejected Nothing & parser -- | @before & after@ parses /before/ and, if it succeeds, parses /after/. This -- basically invokes the monad's @>>=@ (bind) method. (&) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result2 before & after = (match before) >> (match after) -- | @first \/ second@ tries to parse /first/, and failing that parses -- /second/, unless /first/ has committed in which case is fails immediately. (/) :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result first / second = Parser $ \ state -> let Parser parser = decide (match first) (match second) in parser state -- | @(optional ?)@ tries to match /parser/, otherwise does nothing. (?) :: (Match match result) => match -> Pattern (?) optional = (optional & empty) / empty -- | @(parser *)@ matches zero or more occurrences of /repeat/, as long as each -- one actually consumes input characters. (*) :: (Match match result) => match -> Pattern (*) parser = "*" ^ zomParser where zomParser = (parser ! "*" & zomParser) / empty -- | @(parser +)@ matches one or more occurrences of /parser/, as long as each -- one actually consumed input characters. (+) :: (Match match result) => match -> Pattern (+) parser = parser & (parser *) -- ** Basic parsers -- | @decide first second@ tries to parse /first/, and failing that parses -- /second/, unless /first/ has committed in which case is fails immediately. decide :: Parser result -> Parser result -> Parser result decide left right = Parser $ \ state -> let Parser parser = decideParser state D.empty left right in parser state where decideParser point tokens (Parser left) right = Parser $ \state -> let reply = left state tokens' reply = D.append tokens $ reply|>rTokens in case (reply|>rResult, reply|>rCommit) of (Failed _, _) -> Reply { rState = point, rTokens = D.empty, rResult = More right, rCommit = Nothing } (Result _, _) -> reply { rTokens = tokens' reply } (More left', Just _) -> reply { rTokens = tokens' reply, rResult = More left' } (More left', Nothing) -> let Parser parser = decideParser point (tokens' reply) left' right in parser $ reply|>rState -- | @choice decision parser@ provides a /decision/ name to the choice about to -- be made in /parser/, to allow to @commit@ to it. choice :: String -> Parser result -> Parser result choice decision parser = Parser $ \ state -> let Parser parser' = choiceParser (state|>sDecision) decision parser in parser' state { sDecision = decision } where choiceParser parentDecision makingDecision (Parser parser) = Parser $ \ state -> let reply = parser state commit' = case reply|>rCommit of Nothing -> Nothing Just decision | decision == makingDecision -> Nothing | otherwise -> reply|>rCommit reply' = case reply|>rResult of More parser' -> reply { rCommit = commit', rResult = More $ choiceParser parentDecision makingDecision parser' } _ -> reply { rCommit = commit', rState = (reply|>rState) { sDecision = parentDecision } } in reply' -- | @parser ``recovery`` pattern@ parses the specified /parser/; if it fails, -- it continues to the /recovery/ parser to recover. recovery :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result recovery pattern recover = Parser $ \ state -> let (Parser parser) = match pattern reply = parser state in if state|>sIsPeek then reply else case reply|>rResult of Result _ -> reply More more -> reply { rResult = More $ more `recovery` recover } Failed message -> reply { rResult = More $ fake Error message & unparsed & recover } where unparsed = let (Parser parser) = match finishToken in Parser $ \ state -> parser $ state { sCode = Unparsed } -- | @prev parser@ succeeds if /parser/ matches at the previous character. It -- does not consume any input. prev :: (Match match result) => match -> Parser result prev parser = Parser $ \ state -> prevParser state (match parser) state { sIsPeek = True, sInput = (-1, state|>sLast) : state|>sInput } where prevParser point (Parser parser) state = let reply = parser state in case reply|>rResult of Failed message -> failReply point message Result value -> returnReply point value More parser' -> prevParser point parser' $ reply|>rState -- | @peek parser@ succeeds if /parser/ matches at this point, but does not -- consume any input. peek :: (Match match result) => match -> Parser result peek parser = Parser $ \ state -> peekParser state (match parser) state { sIsPeek = True } where peekParser point (Parser parser) state = let reply = parser state in case reply|>rResult of Failed message -> failReply point message Result value -> returnReply point value More parser' -> peekParser point parser' $ reply|>rState -- | @reject parser name@ fails if /parser/ matches at this point, and does -- nothing otherwise. If /name/ is provided, it is used in the error message, -- otherwise the messages uses the current character. reject :: (Match match result) => match -> Maybe String -> Pattern reject parser name = Parser $ \ state -> rejectParser state name (match parser) state { sIsPeek = True } where rejectParser point name (Parser parser) state = let reply = parser state in case reply|>rResult of Failed _message -> returnReply point () Result _value -> case name of Nothing -> unexpectedReply point Just text -> failReply point $ "Unexpected " ++ text More parser' -> rejectParser point name parser' $ reply|>rState -- | @upto parser@ consumes all the character up to and not including the next -- point where the specified parser is a match. upto :: Pattern -> Pattern upto parser = ( ( parser >!) & nextIf (const True) *) -- | @nonEmpty parser@ succeeds if /parser/ matches some non-empty input -- characters at this point. nonEmpty :: (Match match result) => match -> Parser result nonEmpty parser = Parser $ \ state -> let Parser parser' = nonEmptyParser (state|>sCharOffset) (match parser) in parser' state where nonEmptyParser offset (Parser parser) = Parser $ \ state -> let reply = parser state state' = reply|>rState in case reply|>rResult of Failed _message -> reply Result _value -> if state'|>sCharOffset > offset then reply else failReply state' "Matched empty pattern" More parser' -> reply { rResult = More $ nonEmptyParser offset parser' } -- | @empty@ always matches without consuming any input. empty :: Pattern empty = return () -- | @eof@ matches the end of the input. eof :: Pattern eof = Parser $ \ state -> if state|>sInput == [] then returnReply state () else unexpectedReply state -- | @sol@ matches the start of a line. sol :: Pattern sol = Parser $ \ state -> if state|>sIsSol then returnReply state () else failReply state "Expected start of line" -- ** State manipulation pseudo-parsers -- | @commit decision@ commits the parser to all the decisions up to the most -- recent parent /decision/. This makes all tokens generated in this parsing -- path immediately available to the caller. commit :: String -> Pattern commit decision = Parser $ \ state -> Reply { rState = state, rTokens = D.empty, rResult = Result (), rCommit = Just decision } -- | @nextLine@ increments @sLine@ counter and resets @sLineChar@. nextLine :: Pattern nextLine = Parser $ \ state -> returnReply state { sIsSol = True, sLine = state|>sLine .+ 1, sLineChar = 0 } () -- | @with setField getField value parser@ invokes the specified /parser/ with -- the value of the specified field set to /value/ for the duration of the -- invocation, using the /setField/ and /getField/ functions to manipulate it. with :: (value -> State -> State) -> (State -> value) -> value -> Parser result -> Parser result with setField getField value parser = Parser $ \ state -> let value' = getField state Parser parser' = value' `seq` withParser value' parser in parser' $ setField value state where withParser parentValue (Parser parser) = Parser $ \ state -> let reply = parser state in case reply|>rResult of Failed _ -> reply { rState = setField parentValue $ reply|>rState } Result _ -> reply { rState = setField parentValue $ reply|>rState } More parser' -> reply { rResult = More $ withParser parentValue parser' } -- | @parser ``forbidding`` pattern@ parses the specified /parser/ ensuring -- that it does not contain anything matching the /forbidden/ parser. forbidding :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1 forbidding parser forbidden = with setForbidden sForbidden (Just $ forbidden & empty) (match parser) -- | @parser ``limitedTo`` limit@ parses the specified /parser/ -- ensuring that it does not consume more than the /limit/ input chars. limitedTo :: (Match match result) => match -> Int -> Parser result limitedTo parser limit = with setLimit sLimit limit (match parser) -- ** Consuming input characters -- | @nextIf test@ fails if the current position matches the 'State' forbidden -- pattern or if the 'State' lookahead limit is reached. Otherwise it consumes -- (and buffers) the next input char if it satisfies /test/. nextIf :: (Char -> Bool) -> Pattern nextIf test = Parser $ \ state -> case state|>sForbidden of Nothing -> limitedNextIf state Just parser -> let Parser parser' = reject parser $ Just "forbidden pattern" reply = parser' state { sForbidden = Nothing } in case reply|>rResult of Failed _ -> reply Result _ -> limitedNextIf state where limitedNextIf state = case state|>sLimit of -1 -> consumeNextIf state 0 -> failReply state "Lookahead limit reached" _limit -> consumeNextIf state { sLimit = state|>sLimit .- 1 } consumeNextIf state = case state|>sInput of ((offset, char):rest) | test char -> let chars = if state|>sIsPeek then [] else char:(state|>sChars) byte_offset = charsOf sByteOffset sCharsByteOffset char_offset = charsOf sCharOffset sCharsCharOffset line = charsOf sLine sCharsLine line_char = charsOf sLineChar sCharsLineChar is_sol = if char == '\xFEFF' then state|>sIsSol else False state' = state { sInput = rest, sLast = char, sChars = chars, sCharsByteOffset = byte_offset, sCharsCharOffset = char_offset, sCharsLine = line, sCharsLineChar = line_char, sIsSol = is_sol, sByteOffset = offset, sCharOffset = state|>sCharOffset .+ 1, sLineChar = state|>sLineChar .+ 1 } in returnReply state' () | otherwise -> unexpectedReply state [] -> unexpectedReply state where charsOf field charsField = if state|>sIsPeek then -1 else if state|>sChars == [] then state|>field else state|>charsField -- ** Producing tokens -- | @finishToken@ places all collected text into a new token and begins a new -- one, or does nothing if there are no collected characters. finishToken :: Pattern finishToken = Parser $ \ state -> let state' = state { sChars = [], sCharsByteOffset = -1, sCharsCharOffset = -1, sCharsLine = -1, sCharsLineChar = -1 } in if state|>sIsPeek then returnReply state' () else case state|>sChars of [] -> returnReply state' () chars@(_:_) -> tokenReply state' Token { tByteOffset = state|>sCharsByteOffset, tCharOffset = state|>sCharsCharOffset, tLine = state|>sCharsLine, tLineChar = state|>sCharsLineChar, tCode = state|>sCode, tText = reverse chars } -- | @wrap parser@ invokes the /parser/, ensures any unclaimed input characters -- are wrapped into a token (only happens when testing productions), ensures no -- input is left unparsed, and returns the parser's result. wrap :: (Match match result) => match -> Parser result wrap parser = do result <- match parser finishToken eof return result -- | @token code parser@ places all text matched by /parser/ into a 'Token' with -- the specified /code/ (unless it is empty). Note it collects the text even if -- there is an error. token :: (Match match result) => Code -> match -> Pattern token code parser = finishToken & with setCode sCode code (parser & finishToken) -- | @fake code text@ creates a token with the specified /code/ and \"fake\" -- /text/ characters, instead of whatever characters are collected so far. fake :: Code -> String -> Pattern fake code text = Parser $ \ state -> if state|>sIsPeek then returnReply state () else tokenReply state Token { tByteOffset = value state sByteOffset sCharsByteOffset, tCharOffset = value state sCharOffset sCharsCharOffset, tLine = value state sLine sCharsLine, tLineChar = value state sLineChar sCharsLineChar, tCode = code, tText = text } where value state field1 field2 = if field2 state == -1 then field1 state else field2 state -- | @meta parser@ collects the text matched by the specified /parser/ into a -- | @Meta@ token. meta :: (Match match result) => match -> Pattern meta parser = token Meta parser -- | @indicator code@ collects the text matched by the specified /parser/ into an -- @Indicator@ token. indicator :: (Match match result) => match -> Pattern indicator parser = token Indicator $ parser -- | @text parser@ collects the text matched by the specified /parser/ into a -- @Text@ token. text :: (Match match result) => match -> Pattern text parser = token Text parser -- | @emptyToken code@ returns an empty token. emptyToken :: Code -> Pattern emptyToken code = finishToken & parser code where parser code = Parser $ \ state -> if state|>sIsPeek then returnReply state () else tokenReply state Token { tByteOffset = state|>sByteOffset, tCharOffset = state|>sCharOffset, tLine = state|>sLine, tLineChar = state|>sLineChar, tCode = code, tText = "" } -- | @wrapTokens beginCode endCode parser@ wraps the specified /parser/ with -- matching /beginCode/ and /endCode/ tokens. wrapTokens :: Code -> Code -> Pattern -> Pattern wrapTokens beginCode endCode pattern = emptyToken beginCode & prefixErrorWith pattern (emptyToken endCode) & emptyToken endCode -- | @prefixErrorWith pattern prefix@ will invoke the @prefix@ parser if an -- error is detected during the @pattern@ parser, and then return the error. prefixErrorWith :: (Match match result) => match -> Pattern -> Parser result prefixErrorWith pattern prefix = Parser $ \ state -> let (Parser parser) = match pattern reply = parser state in case reply|>rResult of Result _ -> reply More more -> reply { rResult = More $ prefixErrorWith more prefix } Failed message -> reply { rResult = More $ prefix & (fail message :: Parser result) } -- * Production parameters -- | Production context. data Context = BlockOut -- ^ Outside block sequence. | BlockIn -- ^ Inside block sequence. | FlowOut -- ^ Outside flow collection. | FlowIn -- ^ Inside flow collection. | BlockKey -- ^ Implicit block key. | FlowKey -- ^ Implicit flow key. -- | @show context@ converts a 'Context' to a 'String'. instance Show Context where show context = case context of BlockOut -> "block-out" BlockIn -> "block-in" FlowOut -> "flow-out" FlowIn -> "flow-in" BlockKey -> "block-key" FlowKey -> "flow-key" -- | @read context@ converts a 'String' to a 'Context'. We trust our callers to -- convert any @-@ characters into @_@ to allow the built-in @lex@ function to -- handle the names as single identifiers. instance Read Context where readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ] where r word = case word of "block_out" -> BlockOut "block_in" -> BlockIn "flow_out" -> FlowOut "flow_in" -> FlowIn "block_key" -> BlockKey "flow_key" -> FlowKey _ -> error $ "unknown context: " ++ word -- | Chomp method. data Chomp = Strip -- ^ Remove all trailing line breaks. | Clip -- ^ Keep first trailing line break. | Keep -- ^ Keep all trailing line breaks. -- | @show chomp@ converts a 'Chomp' to a 'String'. instance Show Chomp where show chomp = case chomp of Strip -> "strip" Clip -> "clip" Keep -> "keep" -- | @read chomp@ converts a 'String' to a 'Chomp'. instance Read Chomp where readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ] where r word = case word of "strip" -> Strip "clip" -> Clip "keep" -> Keep _ -> error $ "unknown chomp: " ++ word -- * Tokenizers -- -- We encapsulate the 'Parser' inside a 'Tokenizer'. This allows us to hide the -- implementation details from our callers. -- | 'Tokenizer' converts a input text into a list of 'Token'. Errors -- are reported as tokens with the @Error@ 'Code', and the unparsed text -- following an error may be attached as a final token (if the @Bool@ is -- @True@). Note that tokens are available \"immediately\", allowing for -- streaming of large YAML files with memory requirements depending only on the -- YAML nesting level. type Tokenizer = BLC.ByteString -> Bool -> [Token] -- | @patternTokenizer pattern@ converts the /pattern/ to a simple 'Tokenizer'. patternTokenizer :: Pattern -> Tokenizer patternTokenizer pattern input withFollowing = D.toList $ patternParser (wrap pattern) (initialState input) where patternParser (Parser parser) state = let reply = parser state tokens = commitBugs reply state' = reply|>rState in case reply|>rResult of Failed message -> errorTokens tokens state' message withFollowing Result _ -> tokens More parser' -> D.append tokens $ patternParser parser' state' -- | @errorTokens tokens state message withFollowing@ appends an @Error@ token -- with the specified /message/ at the end of /tokens/, and if /withFollowing/ -- also appends the unparsed text following the error as a final @Unparsed@ -- token. errorTokens :: D.DList Token -> State -> String -> Bool -> D.DList Token errorTokens tokens state message withFollowing = let tokens' = D.append tokens $ D.singleton Token { tByteOffset = state|>sByteOffset, tCharOffset = state|>sCharOffset, tLine = state|>sLine, tLineChar = state|>sLineChar, tCode = Error, tText = message } in if withFollowing && state|>sInput /= [] then D.append tokens' $ D.singleton Token { tByteOffset = state|>sByteOffset, tCharOffset = state|>sCharOffset, tLine = state|>sLine, tLineChar = state|>sLineChar, tCode = Unparsed, tText = map snd $ state|>sInput } else tokens' -- | @commitBugs reply@ inserts an error token if a commit was made outside a -- named choice. This should never happen outside tests. commitBugs :: Reply result -> D.DList Token commitBugs reply = let tokens = reply|>rTokens state = reply|>rState in case reply|>rCommit of Nothing -> tokens Just commit -> D.append tokens $ D.singleton Token { tByteOffset = state|>sByteOffset, tCharOffset = state|>sCharOffset, tLine = state|>sLine, tLineChar = state|>sLineChar, tCode = Error, tText = "Commit to '" ++ commit ++ "' was made outside it" } -- | @'tokenize' input emit_unparsed@ -- converts the Unicode /input/ to a -- list of 'Token' according to the YAML 1.2 specification. -- -- Errors are reported as tokens with @'Error' :: 'Code'@, and the -- unparsed text following an error may be attached as a final 'Unparsed' token -- (if the /emit_unparsed/ argument is @True@). Note that tokens are available -- \"immediately\", allowing for streaming of large YAML files with -- memory requirements depending only on the YAML nesting level. tokenize :: BLC.ByteString -> Bool -> [Token] tokenize = patternTokenizer l_yaml_stream -- * Productions -- ** BNF compatibility helpers -- | @detect_utf_encoding@ doesn't actually detect the encoding, we just call it -- this way to make the productions compatible with the spec. Instead it simply -- reports the encoding (which was already detected when we started parsing). bom :: Match match1 result1 => match1 -> Parser () bom code = code & (Parser $ \ state -> let text = case state|>sEncoding of UTF8 -> "TF-8" UTF16LE -> "TF-16LE" UTF16BE -> "TF-16BE" UTF32LE -> "TF-32LE" UTF32BE -> "TF-32BE" Parser parser = fake Bom text in parser state) -- | @na@ is the \"non-applicable\" indentation value. We use Haskell's laziness -- to verify it really is never used. na :: Int na = error "Accessing non-applicable indentation" -- | @asInteger@ returns the last consumed character, which is assumed to be a -- decimal digit, as an integer. asInteger :: Parser Int asInteger = Parser $ \ state -> returnReply state $ ord (state|>sLast) .- 48 -- | @result value@ is the same as /return value/ except that we give the -- Haskell type deduction the additional boost it needs to figure out this is -- wrapped in a 'Parser'. result :: result -> Parser result result = return ---------------------------------------------------------------------------- -- ** Spec productions -- -- These are copied directly from the spec, with the sprinkling of -- additional token and decision point directives. -- 5.1 Character Set c_printable {- 1 -} = '\x9' / '\xA' / '\xD' / ('\x20', '\x7E') / '\x85' / ('\xA0', '\xD7FF') / ('\xE000', '\xFFFD') / ('\x10000', '\x10FFFF') nb_json {- 2 -} = '\x9' / ('\x20', '\x10FFFF') -- 5.2 Character Encodings c_byte_order_mark {- 3 -} = bom '\xFEFF' -- 5.3 Indicator Characters c_sequence_entry {- 4 -} = indicator '-' c_mapping_key {- 5 -} = indicator '?' c_mapping_value {- 6 -} = indicator ':' c_collect_entry {- 7 -} = indicator ',' c_sequence_start {- 8 -} = indicator '[' c_sequence_end {- 9 -} = indicator ']' c_mapping_start {- 10 -} = indicator '{' c_mapping_end {- 11 -} = indicator '}' c_comment {- 12 -} = indicator '#' c_anchor {- 13 -} = indicator '&' c_alias {- 14 -} = indicator '*' c_tag {- 15 -} = indicator '!' c_literal {- 16 -} = indicator '|' c_folded {- 17 -} = indicator '>' c_single_quote {- 18 -} = indicator '\'' c_double_quote {- 19 -} = indicator '"' c_directive {- 20 -} = indicator '%' c_reserved {- 21 -} = indicator ( '@' / '`' ) c_indicator {- 22 -} = c_sequence_entry / c_mapping_key / c_mapping_value / c_collect_entry / c_sequence_start / c_sequence_end / c_mapping_start / c_mapping_end / c_comment / c_anchor / c_alias / c_tag / c_literal / c_folded / c_single_quote / c_double_quote / c_directive / c_reserved c_flow_indicator {- 23 -} = c_collect_entry / c_sequence_start / c_sequence_end / c_mapping_start / c_mapping_end -- 5.4 Line Break Characters b_line_feed {- 24 -} = '\xA' b_carriage_return {- 25 -} = '\xD' b_char {- 26 -} = b_line_feed / b_carriage_return nb_char {- 27 -} = c_printable - b_char - c_byte_order_mark b_break {- 28 -} = ( b_carriage_return & b_line_feed / b_carriage_return / b_line_feed ) & nextLine b_as_line_feed {- 29 -} = token LineFeed b_break b_non_content {- 30 -} = token Break b_break -- 5.5 White Space Characters s_space {- 31 -} = '\x20' s_tab {- 32 -} = '\x9' s_white {- 33 -} = s_space / s_tab ns_char {- 34 -} = nb_char - s_white -- 5.6 Miscellaneous Characters ns_dec_digit {- 35 -} = ('\x30', '\x39') ns_hex_digit {- 36 -} = ns_dec_digit / ('\x41', '\x46') / ('\x61', '\x66') ns_ascii_letter {- 37 -} = ('\x41', '\x5A') / ('\x61', '\x7A') ns_word_char {- 38 -} = ns_dec_digit / ns_ascii_letter / '-' ns_uri_char {- 39 -} = "escape" ^ ( '%' ! "escape" & ns_hex_digit & ns_hex_digit / ns_word_char / '#' / ';' / '/' / '?' / ':' / '@' / '&' / '=' / '+' / '$' / ',' / '_' / '.' / '!' / '~' / '*' / '\'' / '(' / ')' / '[' / ']' ) ns_tag_char {- 40 -} = ns_uri_char - c_tag - c_flow_indicator -- 5.7 Escaped Characters c_escape {- 41 -} = indicator '\\' ns_esc_null {- 42 -} = meta '0' ns_esc_bell {- 43 -} = meta 'a' ns_esc_backspace {- 44 -} = meta 'b' ns_esc_horizontal_tab {- 45 -} = meta ( 't' / '\x9' ) ns_esc_line_feed {- 46 -} = meta 'n' ns_esc_vertical_tab {- 47 -} = meta 'v' ns_esc_form_feed {- 48 -} = meta 'f' ns_esc_carriage_return {- 49 -} = meta 'r' ns_esc_escape {- 50 -} = meta 'e' ns_esc_space {- 51 -} = meta '\x20' ns_esc_double_quote {- 52 -} = meta '"' ns_esc_slash {- 53 -} = meta '/' ns_esc_backslash {- 54 -} = meta '\\' ns_esc_next_line {- 55 -} = meta 'N' ns_esc_non_breaking_space {- 56 -} = meta '_' ns_esc_line_separator {- 57 -} = meta 'L' ns_esc_paragraph_separator {- 58 -} = meta 'P' ns_esc_8_bit {- 59 -} = indicator 'x' ! "escaped" & meta ( ns_hex_digit % 2 ) ns_esc_16_bit {- 60 -} = indicator 'u' ! "escaped" & meta ( ns_hex_digit % 4 ) ns_esc_32_bit {- 61 -} = indicator 'U' ! "escaped" & meta ( ns_hex_digit % 8 ) c_ns_esc_char {- 62 -} = wrapTokens BeginEscape EndEscape $ c_escape ! "escape" & "escaped" ^ ( ns_esc_null / ns_esc_bell / ns_esc_backspace / ns_esc_horizontal_tab / ns_esc_line_feed / ns_esc_vertical_tab / ns_esc_form_feed / ns_esc_carriage_return / ns_esc_escape / ns_esc_space / ns_esc_double_quote / ns_esc_slash / ns_esc_backslash / ns_esc_next_line / ns_esc_non_breaking_space / ns_esc_line_separator / ns_esc_paragraph_separator / ns_esc_8_bit / ns_esc_16_bit / ns_esc_32_bit ) -- 6.1 Indentation Spaces s_indent n {- 63 -} = token Indent ( s_space % n ) s_indent_lt n {- 64 -} = token Indent ( s_space <% n ) s_indent_le n {- 65 -} = token Indent ( s_space <% (n .+ 1) ) -- 6.2 Separation Spaces s_separate_in_line {- 66 -} = token White ( s_white +) / sol -- 6.3 Line Prefixes s_line_prefix n c {- 67 -} = case c of BlockOut -> s_block_line_prefix n BlockIn -> s_block_line_prefix n FlowOut -> s_flow_line_prefix n FlowIn -> s_flow_line_prefix n s_block_line_prefix n {- 68 -} = s_indent n s_flow_line_prefix n {- 69 -} = s_indent n & ( s_separate_in_line ?) -- 6.4 Empty Lines l_empty n c {- 70 -} = ( s_line_prefix n c / s_indent_lt n ) & b_as_line_feed -- 6.5 Line Folding b_l_trimmed n c {- 71 -} = b_non_content & ( l_empty n c +) b_as_space {- 72 -} = token LineFold b_break b_l_folded n c {- 73 -} = b_l_trimmed n c / b_as_space s_flow_folded n {- 74 -} = ( s_separate_in_line ?) & b_l_folded n FlowIn & s_flow_line_prefix n -- 6.6 Comments c_nb_comment_text {- 75 -} = wrapTokens BeginComment EndComment $ c_comment & meta ( nb_char *) b_comment {- 76 -} = b_non_content / eof s_b_comment {- 77 -} = ( s_separate_in_line & ( c_nb_comment_text ?) ?) & b_comment l_comment {- 78 -} = s_separate_in_line & ( c_nb_comment_text ?) & b_comment s_l_comments {- 79 -} = ( s_b_comment / sol ) & ( nonEmpty l_comment *) -- 6.7 Separation Lines s_separate n c {- 80 -} = case c of BlockOut -> s_separate_lines n BlockIn -> s_separate_lines n FlowOut -> s_separate_lines n FlowIn -> s_separate_lines n BlockKey -> s_separate_in_line FlowKey -> s_separate_in_line s_separate_lines n {- 81 -} = s_l_comments & s_flow_line_prefix n / s_separate_in_line -- 6.8 Directives l_directive {- 82 -} = ( wrapTokens BeginDirective EndDirective $ c_directive ! "doc" & "directive" ^ ( ns_yaml_directive / ns_tag_directive / ns_reserved_directive ) ) & s_l_comments ns_reserved_directive {- 83 -} = ns_directive_name & ( s_separate_in_line & ns_directive_parameter *) ns_directive_name {- 84 -} = meta ( ns_char +) ns_directive_parameter {- 85 -} = meta ( ns_char +) -- 6.8.1 Yaml Directives ns_yaml_directive {- 86 -} = meta [ 'Y', 'A', 'M', 'L' ] ! "directive" & s_separate_in_line & ns_yaml_version ns_yaml_version {- 87 -} = meta ( ( ns_dec_digit +) & '.' & ( ns_dec_digit +) ) -- 6.8.2 Tag Directives ns_tag_directive {- 88 -} = meta [ 'T', 'A', 'G' ] ! "directive" & s_separate_in_line & c_tag_handle & s_separate_in_line & ns_tag_prefix -- 6.8.2.1 Tag Handles c_tag_handle {- 89 -} = c_named_tag_handle / c_secondary_tag_handle / c_primary_tag_handle c_primary_tag_handle {- 90 -} = wrapTokens BeginHandle EndHandle $ c_tag c_secondary_tag_handle {- 91 -} = wrapTokens BeginHandle EndHandle $ c_tag & c_tag c_named_tag_handle {- 92 -} = wrapTokens BeginHandle EndHandle $ c_tag & meta ( ns_word_char +) & c_tag -- 6.8.2.2 Tag Prefixes ns_tag_prefix {- 93 -} = wrapTokens BeginTag EndTag $ ( c_ns_local_tag_prefix / ns_global_tag_prefix ) c_ns_local_tag_prefix {- 94 -} = c_tag & meta ( ns_uri_char *) ns_global_tag_prefix {- 95 -} = meta ( ns_tag_char & ( ns_uri_char *) ) -- 6.9 Node Properties c_ns_properties n c {- 96 -} = wrapTokens BeginProperties EndProperties $ ( c_ns_tag_property & ( s_separate n c & c_ns_anchor_property ?) ) / ( c_ns_anchor_property & ( s_separate n c & c_ns_tag_property ?) ) -- 6.9.1 Node Tags c_ns_tag_property {- 97 -} = wrapTokens BeginTag EndTag $ c_verbatim_tag / c_ns_shorthand_tag / c_non_specific_tag c_verbatim_tag {- 98 -} = c_tag & indicator '<' & meta ( ns_uri_char +) & indicator '>' c_ns_shorthand_tag {- 99 -} = c_tag_handle & meta ( ns_tag_char +) c_non_specific_tag {- 100 -} = c_tag -- 6.9.2 Node Anchors c_ns_anchor_property {- 101 -} = wrapTokens BeginAnchor EndAnchor $ c_anchor & ns_anchor_name ns_anchor_char {- 102 -} = ns_char - c_flow_indicator ns_anchor_name {- 103 -} = meta ( ns_anchor_char +) -- 7.1 Alias Nodes c_ns_alias_node {- 104 -} = wrapTokens BeginAlias EndAlias $ c_alias ! "node" & ns_anchor_name -- 7.2 Empty Nodes e_scalar {- 105 -} = wrapTokens BeginScalar EndScalar empty e_node {- 106 -} = wrapTokens BeginNode EndNode e_scalar -- 7.3.1 Double Quoted Style nb_double_char {- 107 -} = "escape" ^ ( c_ns_esc_char / ( nb_json - c_escape - c_double_quote ) ) ns_double_char {- 108 -} = nb_double_char - s_white c_double_quoted n c {- 109 -} = wrapTokens BeginScalar EndScalar $ c_double_quote ! "node" & text ( nb_double_text n c ) & c_double_quote nb_double_text n c {- 110 -} = case c of FlowOut -> nb_double_multi_line n FlowIn -> nb_double_multi_line n BlockKey -> nb_double_one_line FlowKey -> nb_double_one_line nb_double_one_line {- 111 -} = ( nb_double_char *) s_double_escaped n {- 112 -} = ( s_white *) & wrapTokens BeginEscape EndEscape ( c_escape ! "escape" & b_non_content ) & ( l_empty n FlowIn *) & s_flow_line_prefix n s_double_break n {- 113 -} = "escape" ^ ( s_double_escaped n / s_flow_folded n ) nb_ns_double_in_line {- 114 -} = ( ( s_white *) & ns_double_char *) s_double_next_line n {- 115 -} = s_double_break n & ( ns_double_char & nb_ns_double_in_line & ( s_double_next_line n / ( s_white *) ) ?) nb_double_multi_line n {- 116 -} = nb_ns_double_in_line & ( s_double_next_line n / ( s_white *) ) -- 7.3.2 Single Quoted Style c_quoted_quote {- 117 -} = wrapTokens BeginEscape EndEscape $ c_single_quote ! "escape" & meta '\'' nb_single_char {- 118 -} = "escape" ^ ( c_quoted_quote / ( nb_json - c_single_quote ) ) ns_single_char {- 119 -} = nb_single_char - s_white c_single_quoted n c {- 120 -} = wrapTokens BeginScalar EndScalar $ c_single_quote ! "node" & text ( nb_single_text n c ) & c_single_quote nb_single_text n c {- 121 -} = case c of FlowOut -> nb_single_multi_line n FlowIn -> nb_single_multi_line n BlockKey -> nb_single_one_line FlowKey -> nb_single_one_line nb_single_one_line {- 122 -} = ( nb_single_char *) nb_ns_single_in_line {- 123 -} = ( ( s_white *) & ns_single_char *) s_single_next_line n {- 124 -} = s_flow_folded n & ( ns_single_char & nb_ns_single_in_line & ( s_single_next_line n / ( s_white *) ) ?) nb_single_multi_line n {- 125 -} = nb_ns_single_in_line & ( s_single_next_line n / ( s_white *) ) -- 7.3.3 Plain Style ns_plain_first _c {- 126 -} = ns_char - c_indicator / ( ':' / '?' / '-' ) & ( ns_char >?) ns_plain_safe c {- 127 -} = case c of FlowOut -> ns_plain_safe_out FlowIn -> ns_plain_safe_in BlockKey -> ns_plain_safe_out FlowKey -> ns_plain_safe_in ns_plain_safe_out {- 128 -} = ns_char - c_mapping_value - c_comment ns_plain_safe_in {- 129 -} = ns_plain_safe_out - c_flow_indicator ns_plain_char c {- 130 -} = ns_plain_safe c / ( ns_char ?) ns_plain n c {- 131 -} = wrapTokens BeginScalar EndScalar $ text (case c of FlowOut -> ns_plain_multi_line n c FlowIn -> ns_plain_multi_line n c BlockKey -> ns_plain_one_line c FlowKey -> ns_plain_one_line c) nb_ns_plain_in_line c {- 132 -} = ( ( s_white *) & ns_plain_char c *) ns_plain_one_line c {- 133 -} = ns_plain_first c ! "node" & nb_ns_plain_in_line c s_ns_plain_next_line n c {- 134 -} = s_flow_folded n & ns_plain_char c & nb_ns_plain_in_line c ns_plain_multi_line n c {- 135 -} = ns_plain_one_line c & ( s_ns_plain_next_line n c *) -- 7.4 Flow Collection Styles in_flow c {- 136 -} = case c of FlowOut -> FlowIn FlowIn -> FlowIn BlockKey -> FlowKey FlowKey -> FlowKey -- 7.4.1 Flow Sequences c_flow_sequence n c {- 137 -} = wrapTokens BeginSequence EndSequence $ c_sequence_start ! "node" & ( s_separate n c ?) & ( ns_s_flow_seq_entries n (in_flow c) ?) & c_sequence_end ns_s_flow_seq_entries n c {- 138 -} = ns_flow_seq_entry n c & ( s_separate n c ?) & ( c_collect_entry & ( s_separate n c ?) & ( ns_s_flow_seq_entries n c ?) ?) ns_flow_seq_entry n c {- 139 -} = "pair" ^ ( ns_flow_pair n c / "node" ^ ns_flow_node n c ) -- 7.4.2 Flow Mappings c_flow_mapping n c {- 140 -} = wrapTokens BeginMapping EndMapping $ c_mapping_start ! "node" & ( s_separate n c ?) & ( ns_s_flow_map_entries n (in_flow c) ?) & c_mapping_end ns_s_flow_map_entries n c {- 141 -} = ns_flow_map_entry n c & ( s_separate n c ?) & ( c_collect_entry & ( s_separate n c ?) & ( ns_s_flow_map_entries n c ?) ?) ns_flow_map_entry n c {- 142 -} = wrapTokens BeginPair EndPair $ "key" ^ ( ( c_mapping_key ! "key" & s_separate n c & ns_flow_map_explicit_entry n c ) / ns_flow_map_implicit_entry n c ) ns_flow_map_explicit_entry n c {- 143 -} = ns_flow_map_implicit_entry n c / ( e_node & e_node ) ns_flow_map_implicit_entry n c {- 144 -} = "pair" ^ ( ns_flow_map_yaml_key_entry n c / c_ns_flow_map_empty_key_entry n c / c_ns_flow_map_json_key_entry n c ) ns_flow_map_yaml_key_entry n c {- 145 -} = ( "node" ^ ns_flow_yaml_node n c ) ! "pair" & ( ( ( s_separate n c ?) & c_ns_flow_map_separate_value n c ) / e_node ) c_ns_flow_map_empty_key_entry n c {- 146 -} = e_node & c_ns_flow_map_separate_value n c c_ns_flow_map_separate_value n c {- 147 -} = c_mapping_value & ( ns_char >!) ! "pair" & ( ( s_separate n c & ns_flow_node n c ) / e_node ) c_ns_flow_map_json_key_entry n c {- 148 -} = ( "node" ^ c_flow_json_node n c ) ! "pair" & ( ( ( s_separate n c ?) & c_ns_flow_map_adjacent_value n c ) / e_node ) c_ns_flow_map_adjacent_value n c {- 149 -} = c_mapping_value ! "pair" & ( ( ( s_separate n c ?) & ns_flow_node n c ) / e_node ) ns_flow_pair n c {- 150 -} = wrapTokens BeginMapping EndMapping $ wrapTokens BeginPair EndPair $ ( ( c_mapping_key ! "pair" & s_separate n c & ns_flow_map_explicit_entry n c ) / ns_flow_pair_entry n c ) ns_flow_pair_entry n c {- 151 -} = ( ns_flow_pair_yaml_key_entry n c / c_ns_flow_map_empty_key_entry n c / c_ns_flow_pair_json_key_entry n c ) ns_flow_pair_yaml_key_entry n c {- 152 -} = ns_s_implicit_yaml_key FlowKey & c_ns_flow_map_separate_value n c c_ns_flow_pair_json_key_entry n c {- 153 -} = c_s_implicit_json_key FlowKey & c_ns_flow_map_adjacent_value n c ns_s_implicit_yaml_key c {- 154 -} = ( "node" ^ ( ns_flow_yaml_node na c ) & ( s_separate_in_line ?) ) `limitedTo` 1024 c_s_implicit_json_key c {- 155 -} = ( "node" ^ ( c_flow_json_node na c ) & ( s_separate_in_line ?) ) `limitedTo` 1024 -- 7.5 Flow Nodes ns_flow_yaml_content n c {- 156 -} = ns_plain n c c_flow_json_content n c {- 157 -} = c_flow_sequence n c / c_flow_mapping n c / c_single_quoted n c / c_double_quoted n c ns_flow_content n c {- 158 -} = ns_flow_yaml_content n c / c_flow_json_content n c ns_flow_yaml_node n c {- 159 -} = wrapTokens BeginNode EndNode $ c_ns_alias_node / ns_flow_yaml_content n c / ( c_ns_properties n c & ( ( s_separate n c & ns_flow_yaml_content n c ) / e_scalar ) ) c_flow_json_node n c {- 160 -} = wrapTokens BeginNode EndNode $ ( c_ns_properties n c & s_separate n c ?) & c_flow_json_content n c ns_flow_node n c {- 161 -} = wrapTokens BeginNode EndNode $ c_ns_alias_node / ns_flow_content n c / ( c_ns_properties n c & ( ( s_separate n c & ns_flow_content n c ) / e_scalar ) ) -- 8.1.1 Block Scalar Headers c_b_block_header n {- 162 -} = "header" ^ ( do m <- c_indentation_indicator n t <- c_chomping_indicator ( s_white / b_char ) ?! "header" s_b_comment result (m, t) / do t <- c_chomping_indicator m <- c_indentation_indicator n s_b_comment result (m, t) ) -- 8.1.1.1 Block Indentation Indicator c_indentation_indicator n {- 163 -} = indicator ( ns_dec_digit - '0' ) & asInteger / detect_scalar_indentation n detect_scalar_indentation n = peek $ ( nb_char *) & ( b_non_content & ( l_empty n BlockIn *) ?) & count_spaces (-n) count_spaces n = (s_space & count_spaces (n .+ 1)) / result (max 1 n) -- 8.1.1.2 Chomping Indicator c_chomping_indicator {- 164 -} = indicator '-' & result Strip / indicator '+' & result Keep / result Clip end_block_scalar t = case t of Strip -> emptyToken EndScalar Clip -> emptyToken EndScalar Keep -> empty b_chomped_last t {- 165 -} = case t of Strip -> emptyToken EndScalar & b_non_content Clip -> b_as_line_feed & emptyToken EndScalar Keep -> b_as_line_feed l_chomped_empty n t {- 166 -} = case t of Strip -> l_strip_empty n Clip -> l_strip_empty n Keep -> l_keep_empty n l_strip_empty n {- 167 -} = ( s_indent_le n & b_non_content *) & ( l_trail_comments n ?) l_keep_empty n {- 168 -} = ( l_empty n BlockIn *) & emptyToken EndScalar & ( l_trail_comments n ?) l_trail_comments n {- 169 -} = s_indent_lt n & c_nb_comment_text & b_comment & ( nonEmpty l_comment *) -- 8.1.2 Literal Style c_l__literal n {- 170 -} = do emptyToken BeginScalar c_literal ! "node" (m, t) <- c_b_block_header n `prefixErrorWith` emptyToken EndScalar text ( l_literal_content (n .+ m) t ) l_nb_literal_text n {- 171 -} = ( l_empty n BlockIn *) & s_indent n & ( nb_char +) b_nb_literal_next n {- 172 -} = b_as_line_feed & l_nb_literal_text n l_literal_content n t {- 173 -} = ( ( l_nb_literal_text n & ( b_nb_literal_next n *) & b_chomped_last t ) / end_block_scalar t ) & l_chomped_empty n t -- 8.1.3 Folded Style c_l__folded n {- 174 -} = do emptyToken BeginScalar c_folded ! "node" (m, t) <- c_b_block_header n `prefixErrorWith` emptyToken EndScalar text ( l_folded_content (n .+ m) t ) s_nb_folded_text n {- 175 -} = s_indent n & ns_char ! "fold" & ( nb_char *) l_nb_folded_lines n {- 176 -} = s_nb_folded_text n & ( b_l_folded n BlockIn & s_nb_folded_text n *) s_nb_spaced_text n {- 177 -} = s_indent n & s_white ! "fold" & ( nb_char *) b_l_spaced n {- 178 -} = b_as_line_feed & ( l_empty n BlockIn *) l_nb_spaced_lines n {- 179 -} = s_nb_spaced_text n & ( b_l_spaced n & s_nb_spaced_text n *) l_nb_same_lines n {- 180 -} = ( l_empty n BlockIn *) & "fold" ^ ( l_nb_folded_lines n / l_nb_spaced_lines n ) l_nb_diff_lines n {- 181 -} = l_nb_same_lines n & ( b_as_line_feed & l_nb_same_lines n *) l_folded_content n t {- 182 -} = ( ( l_nb_diff_lines n & b_chomped_last t ) / end_block_scalar t ) & l_chomped_empty n t -- 8.2.1 Block Sequences detect_collection_indentation n = peek $ ( nonEmpty l_comment* ) & count_spaces (-n) detect_inline_indentation = peek $ count_spaces 0 l__block_sequence n {- 183 -} = do m <- detect_collection_indentation n wrapTokens BeginSequence EndSequence $ ( s_indent (n .+ m) & c_l_block_seq_entry (n .+ m) +) c_l_block_seq_entry n {- 184 -} = c_sequence_entry & ( ns_char >!) ! "node" & s_l__block_indented n BlockIn s_l__block_indented n c {- 185 -} = do m <- detect_inline_indentation "node" ^ ( ( s_indent m & ( ns_l_in_line_sequence (n .+ 1 .+ m) / ns_l_in_line_mapping (n .+ 1 .+ m) ) ) / s_l__block_node n c / ( e_node & ( s_l_comments ?) & unparsed (n .+ 1) ) ) `recovery` unparsed (n .+ 1) ns_l_in_line_sequence n {- 186 -} = wrapTokens BeginNode EndNode $ wrapTokens BeginSequence EndSequence $ c_l_block_seq_entry n & ( s_indent n & c_l_block_seq_entry n *) -- 8.2.2 Block Mappings l__block_mapping n = {- 187 -} do m <- detect_collection_indentation n wrapTokens BeginMapping EndMapping $ ( s_indent (n .+ m) & ns_l_block_map_entry (n .+ m) +) ns_l_block_map_entry n {- 188 -} = wrapTokens BeginPair EndPair $ c_l_block_map_explicit_entry n / ns_l_block_map_implicit_entry n c_l_block_map_explicit_entry n {- 189 -} = c_l_block_map_explicit_key n & ( l_block_map_explicit_value n / e_node ) c_l_block_map_explicit_key n {- 190 -} = c_mapping_key ! "node" & s_l__block_indented n BlockOut l_block_map_explicit_value n {- 191 -} = s_indent n & c_mapping_value & s_l__block_indented n BlockOut ns_l_block_map_implicit_entry n {- 192 -} = ( ns_s_block_map_implicit_key / e_node ) & c_l_block_map_implicit_value n ns_s_block_map_implicit_key {- 193 -} = c_s_implicit_json_key BlockKey / ns_s_implicit_yaml_key BlockKey c_l_block_map_implicit_value n {- 194 -} = c_mapping_value ! "node" & ( ( s_l__block_node n BlockOut / ( e_node & ( s_l_comments ?) & unparsed (n .+ 1) ) ) `recovery` unparsed (n .+ 1) ) ns_l_in_line_mapping n {- 195 -} = wrapTokens BeginNode EndNode $ wrapTokens BeginMapping EndMapping $ ns_l_block_map_entry n & ( s_indent n & ns_l_block_map_entry n *) -- 8.2.3 Block Nodes unparsed n = ( sol / unparsed_text & unparsed_break ) & ( nonEmpty ( unparsed_indent n & unparsed_text & unparsed_break ) *) unparsed_indent n = token Unparsed ( s_space % n ) unparsed_text = token Unparsed ( upto ( eof / c_forbidden / b_break ) ) unparsed_break = eof / peek c_forbidden / token Unparsed b_break / empty s_l__block_node n c {- 196 -} = s_l__block_in_block n c / s_l__flow_in_block n s_l__flow_in_block n {- 197 -} = s_separate (n .+ 1) FlowOut & ns_flow_node (n .+ 1) FlowOut & s_l_comments s_l__block_in_block n c {- 198 -} = wrapTokens BeginNode EndNode $ ( s_l__block_scalar n c / s_l__block_collection n c ) s_l__block_scalar n c {- 199 -} = s_separate (n .+ 1) c & ( c_ns_properties (n .+ 1) c & s_separate (n .+ 1) c ?) & ( c_l__literal n / c_l__folded n ) s_l__block_collection n c {- 200 -} = ( s_separate (n .+ 1) c & c_ns_properties (n .+ 1) c & ( s_l_comments >?) ?) & s_l_comments & ( l__block_sequence (seq_spaces n c) / l__block_mapping n ) seq_spaces n c {- 201 -} = case c of BlockOut -> n .- 1 BlockIn -> n -- 9.1.1 Document Prefix l_document_prefix {- 202 -} = ( c_byte_order_mark ?) & ( nonEmpty l_comment *) -- 9.1.2 Document Markers c_directives_end {- 203 -} = token DirectivesEnd [ '-', '-', '-' ] c_document_end {- 204 -} = token DocumentEnd [ '.', '.', '.' ] l_document_suffix {- 205 -} = c_document_end & s_l_comments c_forbidden {- 206 -} = sol & ( c_directives_end / c_document_end ) & ( b_char / s_white / eof ) -- 9.1.3 Explicit Documents l_bare_document {- 207 -} = "node" ^ s_l__block_node (-1) BlockIn `forbidding` c_forbidden -- 9.1.4 Explicit Documents l_explicit_document {- 208 -} = c_directives_end ! "doc" & ( ( l_bare_document / e_node & ( s_l_comments ?) & unparsed 0 ) `recovery` unparsed 0 ) -- 9.1.5 Directives Documents l_directives_document {- 209 -} = ( l_directive +) & l_explicit_document -- 9.2 Streams: l_any_document {- 210 -} = wrapTokens BeginDocument EndDocument $ "doc" ^ ( l_directives_document / l_explicit_document / l_bare_document ) `recovery` unparsed 0 l_yaml_stream {- 211 -} = ( nonEmpty l_document_prefix *) & ( eof / ( c_document_end & ( b_char / s_white / eof ) >?) / l_any_document ) & ( nonEmpty ( "more" ^ ( ( l_document_suffix ! "more" +) & ( nonEmpty l_document_prefix *) & ( eof / l_any_document ) / ( nonEmpty l_document_prefix *) & "doc" ^ ( wrapTokens BeginDocument EndDocument l_explicit_document ?) ) ) *)