-- #ignore-exports
-------------------------------------------------------------------------------
-- |
-- Module      :  Text.Yaml.Reference
-- Copyright   :  (c) Oren Ben-Kiki 2007
-- License     :  LGPL
--
-- Maintainer  :  yaml-oren@ben-kiki.org
-- Stability   :  alpha
-- Portability :  portable
--
-- Implementation of the YAML syntax as defined in the YAML specification.
-- Actually this file contains the parsing framework and includes (using CPP)
-- the actual productions from @Reference.bnf@.
--
-- The parsing framework is fully streaming (generates output tokens
-- \"immediately\"). The memory leak that existed in previous versions has
-- been plugged.
-------------------------------------------------------------------------------

module Text.Yaml.Reference
  ( -- Basic parsing:
    Code,
    Token,
    Tokenizer,
    yaml,
    -- For testing:
    Context,
    Chomp,
    tokenizer,
    tokenizerWithN,
    tokenizerWithC,
    tokenizerWithT,
    tokenizerWithNC,
    tokenizerWithNT,
    tokenizerNames,
    showTokens
  ) where

import           Control.Monad
import qualified Data.ByteString.Lazy.Char8 as C
import           Data.Char
import qualified Data.DList as D
import qualified Data.Map as Map
import           Text.Regex
import           Debug.Trace
import qualified Prelude
import           Prelude hiding ((/), (*), (+), (-), (^))

-- * Generic operators
--
-- ** Numeric operators
--
-- We rename the four numerical operators @+@ @-@ @*@ @\/@ to start with @.@
-- (@.+@, @.-@, @.*@, @.\/@). This allows us to use the originals for BNF
-- notation (we also hijack the @^@ operator). This is not a generally
-- recommended practice. It is justified in this case since we have very few
-- arithmetic operations, and a lot of BNF rules which this makes extremely
-- readable.

infixl 6 .+
-- | \".+\" is the numeric addition (we use \"+\" for postfix \"one or more\").
(.+) = (Prelude.+)

infixl 6 .-
-- | \".-\" is the numeric subtraction (we use \"-\" for infix \"and not\").
(.-) = (Prelude.-)

infixl 7 .*
-- | \".*\" is the numeric multiplication (we use \"*\" for postfix \"zero or
-- more\").
(.*) = (Prelude.*)

infixl 7 ./
-- | \"./\" is the numeric division (we use \"/\" for infix \"or\").
(./) = (Prelude./)

-- ** Record field access
--
-- We also define @|>@ for record access for increased readability.

infixl 9 |>
-- | @record |> field@ is the same as @field record@, but is more readable.
(|>) :: record -> (record -> value) -> value
record |> field = field record

-- * UTF decoding
--
-- This really should be factored out to the standard libraries. Since it isn't
-- there, we get to tailor it exactly to our needs. We use lazy byte strings as
-- input, which should give reasonable I\/O performance when reading large
-- files. The output is a normal 'Char' list which is easy to work with and
-- should be efficient enough as long as the 'Parser' does its job right.

-- | Recognized Unicode encodings. UTF-32 isn't required by YAML parsers.
data Encoding = UTF8    -- ^ UTF-8 encoding (or ASCII)
              | UTF16LE -- ^ UTF-16 little endian
              | UTF16BE -- ^ UTF-16 big endian
  deriving Show

-- | @decode bytes@ automatically detects the 'Encoding' used and converts the
-- /bytes/ to Unicode characters.
decode :: C.ByteString -> (Encoding, [Char])
decode text = (encoding, undoEncoding encoding text)
  where encoding = detectEncoding $ C.unpack $ C.take 2 text
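-- A minimal usage sketch (ours, not part of the original module): 'decode'
-- detects the encoding from the leading bytes and lazily decodes the rest.
-- The input literal is ours, for illustration only.
--
-- decodeExample evaluates to (UTF8, "- a\n").
decodeExample :: (Encoding, [Char])
decodeExample = decode $ C.pack "- a\n"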
-- | @detectEncoding text@ examines the first few chars (bytes) of the /text/
-- to deduce the Unicode encoding used according to the YAML spec.
detectEncoding :: [Char] -> Encoding
detectEncoding text =
  case text of
    '\xFF':'\xFE':_ -> UTF16LE
    '\xFE':'\xFF':_ -> UTF16BE
    _               -> UTF8

-- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode
-- characters according to the /encoding/.
undoEncoding :: Encoding -> C.ByteString -> [Char]
undoEncoding encoding bytes =
  case encoding of
    UTF8    -> undoUTF8 bytes
    UTF16LE -> combinePairs $ undoUTF16LE bytes
    UTF16BE -> combinePairs $ undoUTF16BE bytes

-- ** UTF-16 decoding

-- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to
-- a single Unicode character.
combinePairs :: [Char] -> [Char]
combinePairs [] = []
combinePairs (lead:rest)
  | '\xD800' <= lead && lead <= '\xDBFF' = combineLead lead rest
  | '\xDC00' <= lead && lead <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate"
  | otherwise                            = lead:(combinePairs rest)

-- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the
-- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues
-- combining surrogate pairs.
combineLead :: Char -> [Char] -> [Char]
combineLead lead [] = error "UTF-16 contains lead surrogate as final character"
combineLead lead (trail:rest)
  | '\xDC00' <= trail && trail <= '\xDFFF' = (combineSurrogates lead trail):combinePairs rest
  | otherwise                              = error "UTF-16 contains lead surrogate without trail surrogate"

-- | @surrogateOffset@ is copied from the Unicode FAQs.
surrogateOffset :: Int
surrogateOffset = 0x10000 .- (0xD800 .* 1024) .- 0xDC00

-- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a
-- single Unicode character.
combineSurrogates :: Char -> Char -> Char
combineSurrogates lead trail = chr $ (ord lead) .* 1024 .+ (ord trail) .+ surrogateOffset

-- | @hasFewerThan n bytes@ checks whether there are fewer than /n/ /bytes/
-- left to read.
hasFewerThan :: Int -> C.ByteString -> Bool
hasFewerThan n bytes
  | n == 1 = C.null bytes
  | n > 1  = C.null bytes || hasFewerThan (n .- 1) (C.tail bytes)

-- | @undoUTF16LE bytes@ decodes a UTF-16-LE /bytes/ stream to Unicode chars.
undoUTF16LE :: C.ByteString -> [Char]
undoUTF16LE bytes
  | C.null bytes         = []
  | hasFewerThan 2 bytes = error "UTF-16-LE input contains odd number of bytes"
  | otherwise            = let low    = C.head bytes
                               bytes' = C.tail bytes
                               high   = C.head bytes'
                               rest   = C.tail bytes'
                           in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16LE rest)

-- | @undoUTF16BE bytes@ decodes a UTF-16-BE /bytes/ stream to Unicode chars.
undoUTF16BE :: C.ByteString -> [Char]
undoUTF16BE bytes
  | C.null bytes         = []
  | hasFewerThan 2 bytes = error "UTF-16-BE input contains odd number of bytes"
  | otherwise            = let high   = C.head bytes
                               bytes' = C.tail bytes
                               low    = C.head bytes'
                               rest   = C.tail bytes'
                           in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16BE rest)

-- ** UTF-8 decoding

-- | @undoUTF8 bytes@ decodes a UTF-8 /bytes/ stream to Unicode chars.
undoUTF8 :: C.ByteString -> [Char]
undoUTF8 bytes
  | C.null bytes = []
  | otherwise    = let first = C.head bytes
                       rest  = C.tail bytes
                   in case () of
                        _ | first < '\x80' -> first:(undoUTF8 rest)
                          | first < '\xC0' -> error "UTF-8 input contains invalid first byte"
                          | first < '\xE0' -> decodeTwoUTF8 first rest
                          | first < '\xF0' -> decodeThreeUTF8 first rest
                          | first < '\xF8' -> decodeFourUTF8 first rest
                          | otherwise      -> error "UTF-8 input contains invalid first byte"
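-- A quick sanity check of the surrogate arithmetic above (ours, not part of
-- the original module): the UTF-16 pair U+D834 U+DD1E must combine to
-- U+1D11E (MUSICAL SYMBOL G CLEF).
--
-- surrogatesExample evaluates to True.
surrogatesExample :: Bool
surrogatesExample = combineSurrogates '\xD834' '\xDD1E' == '\x1D11E'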
-- | @decodeTwoUTF8 first bytes@ decodes a two-byte UTF-8 character, where the
-- /first/ byte is already available and the second is the head of the /bytes/,
-- and then continues to undo the UTF-8 encoding.
decodeTwoUTF8 :: Char -> C.ByteString -> [Char]
decodeTwoUTF8 first bytes
  | C.null bytes = error "UTF-8 double byte char is missing second byte at eof"
  | otherwise    = let second = C.head bytes
                       rest   = C.tail bytes
                   in case () of
                        _ | second < '\x80' || '\xBF' < second -> error "UTF-8 double byte char has invalid second byte"
                          | otherwise -> (combineTwoUTF8 first second):(undoUTF8 rest)

-- | @combineTwoUTF8 first second@ combines the /first/ and /second/ bytes of a
-- two-byte UTF-8 char into a single Unicode char.
combineTwoUTF8 :: Char -> Char -> Char
combineTwoUTF8 first second = chr(((ord first)  .- 0xC0) .* 64
                               .+ ((ord second) .- 0x80))

-- | @decodeThreeUTF8 first bytes@ decodes a three-byte UTF-8 character, where
-- the /first/ byte is already available and the second and third are the head
-- of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeThreeUTF8 :: Char -> C.ByteString -> [Char]
decodeThreeUTF8 first bytes
  | hasFewerThan 2 bytes = error "UTF-8 triple byte char is missing bytes at eof"
  | otherwise            = let second = C.head bytes
                               bytes' = C.tail bytes
                               third  = C.head bytes'
                               rest   = C.tail bytes'
                           in case () of
                                _ | second < '\x80' || '\xBF' < second -> error "UTF-8 triple byte char has invalid second byte"
                                  | third  < '\x80' || '\xBF' < third  -> error "UTF-8 triple byte char has invalid third byte"
                                  | otherwise -> (combineThreeUTF8 first second third):(undoUTF8 rest)

-- | @combineThreeUTF8 first second third@ combines the /first/, /second/ and
-- /third/ bytes of a three-byte UTF-8 char into a single Unicode char.
combineThreeUTF8 :: Char -> Char -> Char -> Char
combineThreeUTF8 first second third = chr(((ord first)  .- 0xE0) .* 4096
                                       .+ ((ord second) .- 0x80) .* 64
                                       .+ ((ord third)  .- 0x80))

-- | @decodeFourUTF8 first bytes@ decodes a four-byte UTF-8 character, where the
-- /first/ byte is already available and the second, third and fourth are the
-- head of the /bytes/, and then continues to undo the UTF-8 encoding.
decodeFourUTF8 :: Char -> C.ByteString -> [Char]
decodeFourUTF8 first bytes
  | hasFewerThan 3 bytes = error "UTF-8 quad byte char is missing bytes at eof"
  | otherwise            = let second  = C.head bytes
                               bytes'  = C.tail bytes
                               third   = C.head bytes'
                               bytes'' = C.tail bytes'
                               fourth  = C.head bytes''
                               rest    = C.tail bytes''
                           in case () of
                                _ | second < '\x80' || '\xBF' < second -> error "UTF-8 quad byte char has invalid second byte"
                                  | third  < '\x80' || '\xBF' < third  -> error "UTF-8 quad byte char has invalid third byte"
                                  | fourth < '\x80' || '\xBF' < fourth -> error "UTF-8 quad byte char has invalid fourth byte"
                                  | otherwise -> (combineFourUTF8 first second third fourth):(undoUTF8 rest)

-- | @combineFourUTF8 first second third fourth@ combines the /first/,
-- /second/, /third/ and /fourth/ bytes of a four-byte UTF-8 char into a
-- single Unicode char.
combineFourUTF8 :: Char -> Char -> Char -> Char -> Char
combineFourUTF8 first second third fourth = chr(((ord first)  .- 0xF0) .* 262144
                                             .+ ((ord second) .- 0x80) .* 4096
                                             .+ ((ord third)  .- 0x80) .* 64
                                             .+ ((ord fourth) .- 0x80))
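-- A worked check of the multi-byte arithmetic above (ours, not part of the
-- original module): the two-byte sequence 0xC3 0xA9 must decode to U+00E9
-- (LATIN SMALL LETTER E WITH ACUTE), since (0xC3 - 0xC0) * 64 + (0xA9 - 0x80)
-- = 3 * 64 + 41 = 233 = 0xE9.
--
-- utf8Example evaluates to True.
utf8Example :: Bool
utf8Example = combineTwoUTF8 '\xC3' '\xA9' == '\xE9'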
-- * Result tokens
--
-- The parsing result is a stream of tokens rather than a parse tree. The idea
-- is to convert the YAML input into \"byte codes\". These byte codes are
-- intended to be written into a byte codes file (or more likely a UNIX pipe)
-- for further processing.

-- | 'Token' codes.
data Code = Bom             -- ^ BOM, contains \"@TF8@\", \"@TF16LE@\" or \"@TF16BE@\".
          | Text            -- ^ Content text characters.
          | Meta            -- ^ Non-content (meta) text characters.
          | Break           -- ^ Line break preserved in content.
          | Continue        -- ^ Separation line break.
          | LineFeed        -- ^ Line break normalized to content line feed.
          | LineFold        -- ^ Line break folded to content space.
          | Indicator       -- ^ Character indicating structure.
          | White           -- ^ Separation white space.
          | Indent          -- ^ Indentation spaces.
          | DocumentStart   -- ^ Document start marker.
          | DocumentEnd     -- ^ Document end marker.
          | BeginEscape     -- ^ Begins escape sequence.
          | EndEscape       -- ^ Ends escape sequence.
          | BeginComment    -- ^ Begins comment.
          | EndComment      -- ^ Ends comment.
          | BeginDirective  -- ^ Begins directive.
          | EndDirective    -- ^ Ends directive.
          | BeginTag        -- ^ Begins tag.
          | EndTag          -- ^ Ends tag.
          | BeginHandle     -- ^ Begins tag handle.
          | EndHandle       -- ^ Ends tag handle.
          | BeginAnchor     -- ^ Begins anchor.
          | EndAnchor       -- ^ Ends anchor.
          | BeginProperties -- ^ Begins node properties.
          | EndProperties   -- ^ Ends node properties.
          | BeginAlias      -- ^ Begins alias.
          | EndAlias        -- ^ Ends alias.
          | BeginScalar     -- ^ Begins scalar content.
          | EndScalar       -- ^ Ends scalar content.
          | BeginSequence   -- ^ Begins sequence content.
          | EndSequence     -- ^ Ends sequence content.
          | BeginMapping    -- ^ Begins mapping content.
          | EndMapping      -- ^ Ends mapping content.
          | BeginPair       -- ^ Begins mapping key:value pair.
          | EndPair         -- ^ Ends mapping key:value pair.
          | BeginNode       -- ^ Begins complete node.
          | EndNode         -- ^ Ends complete node.
          | BeginDocument   -- ^ Begins document.
          | EndDocument     -- ^ Ends document.
          | BeginStream     -- ^ Begins YAML stream.
          | EndStream       -- ^ Ends YAML stream.
          | Error           -- ^ Parsing error at this point.
          | Unparsed        -- ^ The rest of the input at the error point.
          -- For testing:
          | Test            -- ^ Test characters otherwise unassigned.
          | Detected        -- ^ Detected parameter.
  deriving Eq

-- | @show code@ converts a 'Code' to the one-character YEAST token code char.
-- The list of byte codes is also documented in the @yaml2yeast@ program.
instance Show Code where
  show code = case code of
                Bom             -> "U"
                Text            -> "T"
                Meta            -> "t"
                Break           -> "B"
                Continue        -> "b"
                LineFeed        -> "L"
                LineFold        -> "l"
                Indicator       -> "I"
                White           -> "w"
                Indent          -> "i"
                DocumentStart   -> "K"
                DocumentEnd     -> "k"
                BeginEscape     -> "E"
                EndEscape       -> "e"
                BeginComment    -> "C"
                EndComment      -> "c"
                BeginDirective  -> "D"
                EndDirective    -> "d"
                BeginTag        -> "G"
                EndTag          -> "g"
                BeginHandle     -> "H"
                EndHandle       -> "h"
                BeginAnchor     -> "A"
                EndAnchor       -> "a"
                BeginProperties -> "P"
                EndProperties   -> "p"
                BeginAlias      -> "R"
                EndAlias        -> "r"
                BeginScalar     -> "S"
                EndScalar       -> "s"
                BeginSequence   -> "Q"
                EndSequence     -> "q"
                BeginMapping    -> "M"
                EndMapping      -> "m"
                BeginNode       -> "N"
                EndNode         -> "n"
                BeginPair       -> "X"
                EndPair         -> "x"
                BeginDocument   -> "O"
                EndDocument     -> "o"
                BeginStream     -> "Y"
                EndStream       -> "y"
                Error           -> "!"
                Unparsed        -> "-"
                Test            -> "?"
                Detected        -> "$"

-- | Parsed token.
data Token = Token {
    tCode :: Code,  -- ^ Specific token 'Code'.
    tText :: String -- ^ Contained input chars, if any.
  }

-- | @show token@ converts a 'Token' to a single YEAST line.
instance Show Token where
  show token = (show $ token|>tCode) ++ (escapeString $ token|>tText) ++ "\n"
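-- A tiny illustration (ours, not part of the original module): a token
-- renders as its one-character YEAST code followed by the escaped text and a
-- newline.
--
-- tokenExample evaluates to "T- a\n".
tokenExample :: String
tokenExample = show Token { tCode = Text, tText = "- a" }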
-- | @escapeString string@ escapes all the non-ASCII characters in the
-- /string/, as well as escaping the \"@\\@\" character, using the \"@\\xXX@\",
-- \"@\\uXXXX@\" and \"@\\UXXXXXXXX@\" escape sequences.
escapeString :: String -> String
escapeString [] = []
escapeString (first:rest)
  | ' ' <= first && first /= '\\' && first <= '~' = first:(escapeString rest)
  | first <= '\xFF'                               = "\\x" ++ (toHex 2 $ ord first) ++ (escapeString rest)
  | '\xFF' < first && first <= '\xFFFF'           = "\\u" ++ (toHex 4 $ ord first) ++ (escapeString rest)
  | otherwise                                     = "\\U" ++ (toHex 8 $ ord first) ++ (escapeString rest)

-- | @toHex digits int@ converts the /int/ to the specified number of
-- hexadecimal /digits/.
toHex :: Int -> Int -> String
toHex digits int
  | digits > 1  = (toHex (digits .- 1) (int `div` 16)) ++ [intToDigit $ int `mod` 16]
  | digits == 1 = [intToDigit int]

-- | @showTokens tokens@ converts a list of /tokens/ to a multi-line YEAST
-- text.
showTokens :: [Token] -> String
showTokens tokens = foldr (\ token text -> (show token) ++ text) "" tokens

-- * Parsing framework
--
-- Haskell has no shortage of parsing frameworks. We use our own because:
--
-- * Most available frameworks are inappropriate because of their focus on
--   building a parse tree, and completing all of it before any of it is
--   accessible to the caller. We return a stream of tokens, and would like
--   its head to be accessible as soon as possible to allow for streaming. To
--   do this with bounded memory usage we use a combination of continuation
--   passing style and difference lists for the collected tokens.
--
-- * Haskell makes it so easy to roll your own parsing framework. We need some
--   specialized machinery (limited lookahead, forbidden patterns). It is
--   possible to build these on top of existing frameworks but the end result
--   isn't much shorter than rolling our own.
--
-- Since we roll our own framework we don't bother with making it generalized,
-- so we maintain a single 'State' type rather than having a generic one that
-- contains a polymorphic \"UserState\" field etc.

-- | A 'Parser' is basically a function computing a 'Reply'.
data Parser result = Parser (State -> Reply result)

-- | The 'Result' of each invocation is either an error, the actual result, or
-- a continuation for computing the actual result.
data Result result = Failed String        -- ^ Parsing aborted with a failure.
                   | Result result        -- ^ Parsing completed with a result.
                   | More (Parser result) -- ^ Parsing is ongoing with a continuation.

-- Showing a 'Result' is only used in debugging.
instance (Show result) => Show (Result result) where
  show result = case result of
                  Failed message -> "Failed " ++ message
                  Result result  -> "Result " ++ (show result)
                  More _         -> "More"

-- | Each invocation of a 'Parser' yields a 'Reply'. The 'Result' is only one
-- part of the 'Reply'.
data Reply result = Reply {
    rResult :: !(Result result),  -- ^ Parsing result.
    rTokens :: !(D.DList Token),  -- ^ Tokens generated by the parser.
    rCommit :: !(Maybe String),   -- ^ Commitment to a decision point.
    rState  :: !State             -- ^ The updated parser state.
  }

-- Showing a 'Reply' is only used in debugging.
instance (Show result) => Show (Reply result) where
  show reply = "Result: "    ++ (show $ reply|>rResult)
            ++ ", Tokens: "  ++ (show $ D.toList $ reply|>rTokens)
            ++ ", Commit: "  ++ (show $ reply|>rCommit)
            ++ ", State: { " ++ (show $ reply|>rState) ++ "}"

-- A 'Pattern' is a parser that doesn't have an (interesting) result.
type Pattern = Parser ()
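-- A minimal sketch (ours, not part of the original module) of how a caller
-- can drive a 'Parser' to completion: keep following 'More' continuations and
-- collect the incrementally available tokens along the way. This mirrors what
-- 'patternTokenizer' below does, minus error handling.
runSketch :: Parser result -> State -> ([Token], Result result)
runSketch (Parser parser) state =
  let reply = parser state
      tokens = D.toList $ reply|>rTokens
  in case reply|>rResult of
       More parser' -> let (tokens', final) = runSketch parser' $ reply|>rState
                       in (tokens ++ tokens', final)
       final        -> (tokens, final)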
-- ** Parsing state

-- | The internal parser state. We don't bother with parameterising it with a
-- \"UserState\", we just bundle the generic and specific fields together (not
-- that it is that easy to draw the line - is @sLine@ generic or specific?).
data State = State {
    sName      :: !String,          -- ^ The input name for error messages.
    sEncoding  :: !Encoding,        -- ^ The input UTF encoding.
    sDecision  :: !String,          -- ^ Current decision name.
    sLimit     :: !Int,             -- ^ Lookahead characters limit.
    sForbidden :: !(Maybe Pattern), -- ^ Pattern we must not enter into.
    sIsPeek    :: !Bool,            -- ^ Disables token generation.
    sChars     :: ![Char],          -- ^ (Reversed) characters collected for a token.
    sOffset    :: !Int,             -- ^ Offset in characters in the input.
    sLine      :: !Int,             -- ^ Builds on YAML's line break definition.
    sColumn    :: !Int,             -- ^ Actually character number - we hate tabs.
    sCode      :: !Code,            -- ^ Of token we are collecting chars for.
    sLast      :: !Char,            -- ^ Last matched character.
    sInput     :: ![Char]           -- ^ The decoded input characters.
  }

-- Showing a 'State' is only used in debugging. Note that forcing dump of
-- @sInput@ will disable streaming it.
instance Show State where
  show state = "Name: "       ++ (show $ state|>sName)
            ++ ", Encoding: " ++ (show $ state|>sEncoding)
            ++ ", Decision: " ++ (show $ state|>sDecision)
            ++ ", Limit: "    ++ (show $ state|>sLimit)
            ++ ", IsPeek: "   ++ (show $ state|>sIsPeek)
            ++ ", Chars: >>>" ++ (reverse $ state|>sChars) ++ "<<<"
            ++ ", Offset: "   ++ (show $ state|>sOffset)
            ++ ", Line: "     ++ (show $ state|>sLine)
            ++ ", Column: "   ++ (show $ state|>sColumn)
            ++ ", Code: "     ++ (show $ state|>sCode)
            ++ ", Last: "     ++ (show $ state|>sLast)
            -- ++ ", Input: >>>" ++ (show $ state|>sInput) ++ "<<<"

-- | @initialState name input@ returns an initial 'State' for parsing the
-- /input/ (with /name/ for error messages).
initialState :: String -> C.ByteString -> State
initialState name input =
  let (encoding, decoded) = decode input
  in State { sName      = name,
             sEncoding  = encoding,
             sDecision  = "",
             sLimit     = -1,
             sForbidden = Nothing,
             sIsPeek    = False,
             sChars     = [],
             sOffset    = 0,
             sLine      = 1,
             sColumn    = 0,
             sCode      = Test,
             sLast      = ' ',
             sInput     = decoded }

-- *** Setters
--
-- We need four setter functions to pass them around as arguments. For some
-- reason, Haskell only generates getter functions.

-- | @setDecision decision state@ sets the @sDecision@ field to /decision/.
setDecision :: String -> State -> State
setDecision decision state = state { sDecision = decision }

-- | @setLimit limit state@ sets the @sLimit@ field to /limit/.
setLimit :: Int -> State -> State
setLimit limit state = state { sLimit = limit }

-- | @setForbidden forbidden state@ sets the @sForbidden@ field to /forbidden/.
setForbidden :: Maybe Pattern -> State -> State
setForbidden forbidden state = state { sForbidden = forbidden }

-- | @setCode code state@ sets the @sCode@ field to /code/.
setCode :: Code -> State -> State
setCode code state = state { sCode = code }

-- ** Implicit parsers
--
-- It is tedious to have to wrap each expected character (or character range)
-- in an explicit 'Parser' constructor. We let Haskell do that for us using a
-- 'Match' class.

-- | @Match parameter result@ specifies that we can convert the /parameter/ to
-- a 'Parser' returning the /result/.
class Match parameter result | parameter -> result where
  match :: parameter -> Parser result

-- | We don't need to convert a 'Parser', it already is one.
instance Match (Parser result) result where
  match = id

-- | We convert 'Char' to a parser for a character (that returns nothing).
instance Match Char () where
  match code = nextIf (== code)

-- | We convert a 'Char' tuple to a parser for a character range (that returns
-- nothing).
instance Match (Char, Char) () where
  match (low, high) = nextIf $ \ code -> low <= code && code <= high

-- | We convert 'String' to a parser for a sequence of characters (that returns
-- nothing).
instance Match String () where
  match = foldr (&) empty
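-- An illustration of the implicit conversions above (ours, not part of the
-- original module): character ranges and strings are usable wherever a parser
-- is expected, so BNF rules read naturally.
lowercaseSketch :: Pattern
lowercaseSketch = match ('a', 'z') -- any single lowercase ASCII letter

keywordSketch :: Pattern
keywordSketch = match "yaml"       -- the literal character sequence "yaml"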
-- ** Reply constructors

-- | @returnReply state result@ prepares a 'Reply' with the specified /state/
-- and /result/.
returnReply :: State -> result -> Reply result
returnReply state result = Reply { rResult = Result result,
                                   rTokens = D.empty,
                                   rCommit = Nothing,
                                   rState  = state }

-- | @tokenReply state token@ returns a 'Reply' containing the /state/ and
-- /token/. Any collected characters are cleared (either there are none, or we
-- put them in this token, or we don't want them).
tokenReply :: State -> Token -> Reply ()
tokenReply state token = Reply { rResult = Result (),
                                 rTokens = D.singleton token,
                                 rCommit = Nothing,
                                 rState  = state { sChars = [] } }

-- | @failReply state message@ prepares a 'Reply' with the specified /state/
-- and error /message/.
failReply :: State -> String -> Reply result
failReply state message = Reply { rResult = Failed $ state|>sName
                                         ++ ": line "    ++ (show $ state|>sLine)
                                         ++ ": column "  ++ (show $ state|>sColumn)
                                         ++ ": "         ++ message,
                                  rTokens = D.empty,
                                  rCommit = Nothing,
                                  rState  = state }

-- | @unexpectedReply state@ returns a @failReply@ for an unexpected character.
unexpectedReply :: State -> Reply result
unexpectedReply state = case state|>sInput of
                          (char:_) -> failReply state $ "Unexpected '" ++ [char] ++ "'"
                          []       -> failReply state "Unexpected end of input"

-- ** Parsing Monad

-- | Allow using the @do@ notation for our parsers, which makes for short and
-- sweet @do@ syntax when we want to examine the results (we typically don't).
instance Monad Parser where

  -- @return result@ does just that - return a /result/.
  return result = Parser $ \ state -> returnReply state result

  -- @left >>= right@ applies the /left/ parser, and if it didn't fail
  -- applies the /right/ one (well, the one /right/ returns).
  left >>= right = bindParser left right
    where bindParser (Parser left) right = Parser $ \ state ->
            let reply = left state
            in case reply|>rResult of
                 Failed message -> reply { rResult = Failed message }
                 Result value   -> reply { rResult = More $ right value }
                 More parser    -> reply { rResult = More $ bindParser parser right }

  -- @fail message@ does just that - fails with a /message/.
  fail message = Parser $ \ state -> failReply state message

-- ** Parsing operators
--
-- Here we reap the benefits of renaming the numerical operators. The operator
-- precedence, in decreasing strength:
--
-- @repeated % n@, @repeated <% n@, @match - rejected@, @match ! decision@,
-- @match ?! decision@, @choice ^ (first \/ second)@.
--
-- @match - first - second@ is @(match - first) - second@.
--
-- @first & second & third@ is @first & (second & third)@. Note that @first -
-- rejected & second@ is @(first - rejected) & second@, etc.
--
-- @match \/ alternative \/ otherwise@ is @match \/ (alternative \/
-- otherwise)@. Note that @first & second \/ third@ is @(first & second) \/
-- third@.
--
-- @( match *)@, @(match +)@, @(match ?)@ are the weakest and require the
-- surrounding @()@.

infix  3 ^
infix  3 %
infix  3 <%
infix  3 !
infix  3 ?!
infix  3 <?
infix  3 >?
infixl 3 -
infixr 2 &
infixr 1 /
infix  0 ?
infix  0 *
infix  0 +

-- | @parser % n@ repeats /parser/ exactly /n/ times.
(%) :: (Match match result) => match -> Int -> Pattern
parser % n
  | n <= 0 = empty
  | n > 0  = parser & parser % n .- 1

-- | @parser <% n@ matches fewer than /n/ occurrences of /parser/.
(<%) :: (Match match result) => match -> Int -> Pattern
parser <% n
  | n < 1  = fail "Fewer than 0 repetitions"
  | n == 1 = reject parser Nothing
  | n > 1  = "<%" ^ ( parser ! "<%" & parser <% n .- 1
                    / empty )

-- | @decision ^ (option \/ option \/ ...)@ provides a /decision/ name to the
-- choice about to be made, to allow @commit@ing to it.
(^) :: (Match match result) => String -> match -> Parser result
decision ^ parser = choice decision (match parser)

-- | @parser ! decision@ commits to /decision/ (in an option) after
-- successfully matching the /parser/.
(!) :: (Match match result) => match -> String -> Pattern
parser ! decision = parser & commit decision

-- | @parser ?! decision@ commits to /decision/ (in an option) if the current
-- position matches /parser/, without consuming any characters.
(?!) :: (Match match result) => match -> String -> Pattern
parser ?! decision = peek parser & commit decision

-- | @(lookbehind <?)@ matches the current point without consuming any
-- characters, if the previous character matches the /lookbehind/ parser
-- (single character positive lookbehind).
(<?) :: (Match match result) => match -> Parser result
(<?) lookbehind = prev lookbehind

-- | @(lookahead >?)@ matches the current point without consuming any
-- characters, if it matches the /lookahead/ parser (positive lookahead).
(>?) :: (Match match result) => match -> Parser result
(>?) lookahead = peek lookahead

-- | @parser - rejected@ matches /parser/, except if /rejected/ matches at this
-- point.
(-) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
parser - rejected = reject rejected Nothing & parser

-- | @before & after@ parses /before/ and, if it succeeds, parses /after/. This
-- basically invokes the monad's @>>=@ (bind) method.
(&) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result2
before & after = (match before) >> (match after)

-- | @first \/ second@ tries to parse /first/, and failing that parses
-- /second/, unless /first/ has committed in which case it fails immediately.
(/) :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result
first / second = Parser $ \ state ->
  let Parser parser = decide (match first) (match second)
  in parser state

-- | @(optional ?)@ tries to match /optional/, otherwise does nothing.
(?) :: (Match match result) => match -> Pattern
(?) optional = "?" ^ (optional & empty / empty)

-- | @(parser *)@ matches zero or more occurrences of /parser/, as long as each
-- one actually consumes input characters.
(*) :: (Match match result) => match -> Pattern
(*) parser = "*" ^ zomParser
  where zomParser = (nonEmpty parser ! "*" & zomParser) / empty

-- | @(parser +)@ matches one or more occurrences of /parser/, as long as each
-- one actually consumes input characters.
(+) :: (Match match result) => match -> Pattern
(+) parser = nonEmpty parser & (parser *)
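-- An illustrative BNF-style rule (ours, not part of the original module)
-- showing the renamed operators in action: a named choice that either matches
-- an @a@ followed by zero or more @b@s (committing to this option once the
-- @a@ is seen), or a single @c@.
ruleSketch :: Pattern
ruleSketch = "rule" ^ ( match 'a' ! "rule" & (match 'b' *)
                      / match 'c' )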
-- ** Basic parsers

-- | @decide first second@ tries to parse /first/, and failing that parses
-- /second/, unless /first/ has committed in which case it fails immediately.
decide :: Parser result -> Parser result -> Parser result
decide left right = Parser $ \ state ->
  let Parser parser = decideParser state D.empty left right
  in parser state
  where decideParser point tokens (Parser left) right = Parser $ \ state ->
          let reply = left state
              tokens' reply = D.append tokens $ reply|>rTokens
          in case (reply|>rResult, reply|>rCommit) of
               (Failed _,   _)       -> Reply { rState  = point,
                                                rTokens = D.empty,
                                                rResult = More right,
                                                rCommit = Nothing }
               (Result _,   _)       -> reply { rTokens = tokens' reply }
               (More left', Just _)  -> reply { rTokens = tokens' reply,
                                                rResult = More left' }
               (More left', Nothing) -> let Parser parser = decideParser point (tokens' reply) left' right
                                        in parser $ reply|>rState

-- | @choice decision parser@ provides a /decision/ name to the choice about to
-- be made in /parser/, to allow @commit@ing to it.
choice :: String -> Parser result -> Parser result
choice decision parser = Parser $ \ state ->
  let Parser parser' = choiceParser (state|>sDecision) decision parser
  in parser' state { sDecision = decision }
  where choiceParser parentDecision makingDecision (Parser parser) = Parser $ \ state ->
          let reply = parser state
              commit' = case reply|>rCommit of
                          Nothing -> Nothing
                          Just decision | decision == makingDecision -> Nothing
                                        | otherwise                  -> reply|>rCommit
              reply' = case reply|>rResult of
                         More parser' -> reply { rCommit = commit',
                                                 rResult = More $ choiceParser parentDecision makingDecision parser' }
                         _            -> reply { rCommit = commit',
                                                 rState  = (reply|>rState) { sDecision = parentDecision } }
          in reply'

-- | @prev parser@ succeeds if /parser/ matches at the previous character. It
-- does not consume any input.
prev :: (Match match result) => match -> Parser result
prev parser = Parser $ \ state ->
  prevParser state (match parser) state { sIsPeek = True,
                                          sChars  = state|>sLast : state|>sChars }
  where prevParser point (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> failReply point message
               Result value   -> returnReply point value
               More parser'   -> prevParser point parser' $ reply|>rState

-- | @peek parser@ succeeds if /parser/ matches at this point, but does not
-- consume any input.
peek :: (Match match result) => match -> Parser result
peek parser = Parser $ \ state ->
  peekParser state (match parser) state { sIsPeek = True }
  where peekParser point (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> failReply point message
               Result value   -> returnReply point value
               More parser'   -> peekParser point parser' $ reply|>rState

-- | @reject parser name@ fails if /parser/ matches at this point, and does
-- nothing otherwise. If /name/ is provided, it is used in the error message,
-- otherwise the message uses the current character.
reject :: (Match match result) => match -> Maybe String -> Pattern
reject parser name = Parser $ \ state ->
  rejectParser state name (match parser) state { sIsPeek = True }
  where rejectParser point name (Parser parser) state =
          let reply = parser state
          in case reply|>rResult of
               Failed message -> returnReply point ()
               Result value   -> case name of
                                   Nothing   -> unexpectedReply point
                                   Just text -> failReply point $ "Unexpected " ++ text
               More parser'   -> rejectParser point name parser' $ reply|>rState
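-- A small lookahead sketch (ours, not part of the original module): succeed
-- at a point that is followed by a @#@ without consuming it, while refusing
-- to match if a digit is there instead.
hashAheadSketch :: Pattern
hashAheadSketch = reject ('0', '9') (Just "digit") & peek (match '#')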
-- | @nonEmpty parser@ succeeds if /parser/ matches some non-empty input
-- characters at this point.
nonEmpty :: (Match match result) => match -> Parser result
nonEmpty parser = Parser $ \ state ->
  let Parser parser' = nonEmptyParser (state|>sOffset) (match parser)
  in parser' state
  where nonEmptyParser offset (Parser parser) = Parser $ \ state ->
          let reply = parser state
              state' = reply|>rState
          in case reply|>rResult of
               Failed message -> reply
               Result value   -> if state'|>sOffset > offset
                                    then reply
                                    else failReply state' "Matched empty pattern"
               More parser'   -> reply { rResult = More $ nonEmptyParser offset parser' }

-- | @empty@ always matches without consuming any input.
empty :: Pattern
empty = return ()

-- | @eof@ matches the end of the input.
eof :: Pattern
eof = Parser $ \ state ->
  if state|>sInput == []
     then returnReply state ()
     else failReply state "Expected end of input"

-- | @sol@ matches the start of a line.
sol :: Pattern
sol = Parser $ \ state ->
  if state|>sColumn == 0
     then returnReply state ()
     else failReply state "Expected start of line"

-- ** State manipulation pseudo-parsers

-- | @commit decision@ commits the parser to all the decisions up to the most
-- recent parent /decision/. This makes all tokens generated in this parsing
-- path immediately available to the caller.
commit :: String -> Pattern
commit decision = Parser $ \ state ->
  Reply { rState  = state,
          rTokens = D.empty,
          rResult = Result (),
          rCommit = Just decision }

-- | @nextLine@ increments the @sLine@ counter and resets @sColumn@.
nextLine :: Pattern
nextLine = Parser $ \ state ->
  returnReply state { sLine = state|>sLine .+ 1, sColumn = 0 } ()

-- | @with setField getField value parser@ invokes the specified /parser/ with
-- the value of the specified field set to /value/ for the duration of the
-- invocation, using the /setField/ and /getField/ functions to manipulate it.
with :: (value -> State -> State) -> (State -> value) -> value -> Parser result -> Parser result
with setField getField value parser = Parser $ \ state ->
  let value' = getField state
      Parser parser' = value' `seq` withParser value' parser
  in parser' $ setField value state
  where withParser parentValue (Parser parser) = Parser $ \ state ->
          let reply = parser state
          in case reply|>rResult of
               Failed _     -> reply { rState = setField parentValue $ reply|>rState }
               Result _     -> reply { rState = setField parentValue $ reply|>rState }
               More parser' -> reply { rResult = More $ withParser parentValue parser' }

-- | @parser ``forbidding`` forbidden@ parses the specified /parser/ ensuring
-- that it does not contain anything matching the /forbidden/ parser.
forbidding :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1
forbidding parser forbidden = with setForbidden sForbidden (Just $ forbidden & empty) (match parser)

-- | @parser ``limitedTo`` limit@ parses the specified /parser/
-- ensuring that it does not consume more than the /limit/ input chars.
limitedTo :: (Match match result) => match -> Int -> Parser result
limitedTo parser limit = with setLimit sLimit limit (match parser)
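-- A sketch (ours, not part of the original module) of the specialized
-- machinery above: match one or more lowercase letters while forbidding a
-- @#@ anywhere inside the match, within a lookahead budget of 10 characters.
-- The rule and the budget are ours, for illustration only.
boundedWordSketch :: Pattern
boundedWordSketch = ((match ('a', 'z') +) `forbidding` match '#') `limitedTo` 10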
-- ** Consuming input characters

-- | @nextIf test@ fails if the current position matches the 'State' forbidden
-- pattern or if the 'State' lookahead limit is reached. Otherwise it consumes
-- (and buffers) the next input char if it satisfies /test/.
nextIf :: (Char -> Bool) -> Pattern
nextIf test = Parser $ \ state ->
  case state|>sForbidden of
    Nothing     -> limitedNextIf state
    Just parser -> let Parser parser' = reject parser $ Just "forbidden pattern"
                       reply = parser' state { sForbidden = Nothing }
                   in case reply|>rResult of
                        Failed _ -> reply
                        Result _ -> limitedNextIf state
  where limitedNextIf state =
          case state|>sLimit of
            -1    -> consumeNextIf state
            0     -> failReply state "Lookahead limit reached"
            limit -> consumeNextIf state { sLimit = state|>sLimit .- 1 }
        consumeNextIf state =
          case state|>sInput of
            (char:rest) | test char -> let chars = if state|>sIsPeek
                                                      then []
                                                      else char:(state|>sChars)
                                           state' = state { sInput  = rest,
                                                            sLast   = char,
                                                            sChars  = chars,
                                                            sOffset = state|>sOffset .+ 1,
                                                            sColumn = state|>sColumn .+ 1 }
                                       in returnReply state' ()
                        | otherwise -> unexpectedReply state
            []                      -> unexpectedReply state

-- ** Producing tokens

-- | @finishToken@ places all collected text into a new token and begins a new
-- one, or does nothing if there are no collected characters.
finishToken :: Pattern
finishToken = Parser $ \ state ->
  if state|>sIsPeek
     then returnReply state ()
     else case state|>sChars of
            []          -> returnReply state ()
            chars@(_:_) -> tokenReply state Token { tCode = state|>sCode,
                                                    tText = reverse chars }

-- | @wrap parser@ invokes the /parser/, ensures any unclaimed input characters
-- are wrapped into a token (only happens when testing productions), ensures no
-- input is left unparsed, and returns the parser's result.
wrap :: (Match match result) => match -> Parser result
wrap parser = do result <- match parser
                 finishToken
                 eof
                 return result

-- | @consume parser@ invokes the /parser/ and then consumes all remaining
-- unparsed input characters.
consume :: (Match match result) => match -> Parser result
consume parser = do result <- match parser
                    finishToken
                    clearInput
                    return result
  where clearInput = Parser $ \ state -> returnReply state { sInput = [] } ()

-- | @token code parser@ places all text matched by /parser/ into a 'Token'
-- with the specified /code/ (unless it is empty). Note it collects the text
-- even if there is an error.
token :: (Match match result) => Code -> match -> Pattern
token code parser = finishToken & with setCode sCode code (parser & finishToken)

-- | @fake code text@ creates a token with the specified /code/ and \"fake\"
-- /text/ characters, instead of whatever characters are collected so far.
fake :: Code -> String -> Pattern
fake code text = Parser $ \ state ->
  if state|>sIsPeek
     then returnReply state ()
     else tokenReply state Token { tCode = code, tText = text }

-- | @meta parser@ collects the text matched by the specified /parser/ into a
-- @Meta@ token.
meta :: (Match match result) => match -> Pattern
meta parser = token Meta parser

-- | @indicator parser@ collects the text matched by the specified /parser/
-- into an @Indicator@ token.
indicator :: (Match match result) => match -> Pattern
indicator parser = token Indicator parser

-- | @text parser@ collects the text matched by the specified /parser/ into a
-- @Text@ token.
text :: (Match match result) => match -> Pattern
text parser = token Text parser

-- | @nest code@ returns an empty token with the specified begin\/end /code/ to
-- signal nesting.
nest :: Code -> Pattern
nest code = finishToken & nestParser code
  where nestParser code = Parser $ \ state ->
          if state|>sIsPeek
             then returnReply state ()
             else tokenReply state Token { tCode = code, tText = "" }
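-- A sketch (ours, not part of the original module) of token emission: wrap
-- one or more decimal digits into a single @Text@ token. The rule is ours,
-- for illustration only.
digitsTextSketch :: Pattern
digitsTextSketch = token Text (match ('0', '9') +)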
-- * Production parameters

-- | Production context.
data Context = BlockOut -- ^ Outside block sequence.
             | BlockIn  -- ^ Inside block sequence.
             | FlowOut  -- ^ Outside flow collection.
             | FlowIn   -- ^ Inside flow collection.
             | FlowKey  -- ^ Inside flow key.

-- | @show context@ converts a 'Context' to a 'String'.
instance Show Context where
  show context = case context of
                   BlockOut -> "block-out"
                   BlockIn  -> "block-in"
                   FlowOut  -> "flow-out"
                   FlowIn   -> "flow-in"
                   FlowKey  -> "flow-key"

-- | @read context@ converts a 'String' to a 'Context'. We trust our callers to
-- convert any @-@ characters into @_@ to allow the built-in @lex@ function to
-- handle the names as single identifiers.
instance Read Context where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                     "block_out" -> BlockOut
                     "block_in"  -> BlockIn
                     "flow_out"  -> FlowOut
                     "flow_in"   -> FlowIn
                     "flow_key"  -> FlowKey
                     _           -> error $ "unknown context: " ++ word

-- | Chomp method.
data Chomp = Strip -- ^ Remove all trailing line breaks.
           | Clip  -- ^ Keep first trailing line break.
           | Keep  -- ^ Keep all trailing line breaks.

-- | @show chomp@ converts a 'Chomp' to a 'String'.
instance Show Chomp where
  show chomp = case chomp of
                 Strip -> "strip"
                 Clip  -> "clip"
                 Keep  -> "keep"

-- | @read chomp@ converts a 'String' to a 'Chomp'.
instance Read Chomp where
  readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ]
    where r word = case word of
                     "strip" -> Strip
                     "clip"  -> Clip
                     "keep"  -> Keep
                     _       -> error $ "unknown chomp: " ++ word

-- * Tokenizers
--
-- We encapsulate the 'Parser' inside a 'Tokenizer'. This allows us to hide the
-- implementation details from our callers.

-- | 'Tokenizer' converts a (named) input text into a list of 'Token'. Errors
-- are reported as tokens with the @Error@ 'Code', and the unparsed text
-- following an error may be attached as a final token.
type Tokenizer = String -> C.ByteString -> Bool -> [Token]

-- | @patternTokenizer pattern@ converts the /pattern/ to a simple 'Tokenizer'.
patternTokenizer :: Pattern -> Tokenizer
patternTokenizer pattern name input withFollowing =
  D.toList $ patternParser (wrap pattern) (initialState name input)
  where patternParser (Parser parser) state =
          let reply = parser state
              tokens = commitBugs reply
          in case reply|>rResult of
               Failed message -> errorTokens tokens message withFollowing $ reply|>rState|>sInput
               Result _       -> tokens
               More parser'   -> D.append tokens $ patternParser parser' $ reply|>rState

-- | @parserTokenizer what parser@ converts the /parser/ returning /what/ to a
-- simple 'Tokenizer' (only used for tests). The result is reported as a token
-- with the @Detected@ 'Code'.
parserTokenizer :: (Show result, Match match result) => String -> match -> Tokenizer
parserTokenizer what parser name input withFollowing =
  D.toList $ parserParser (wrap parser) (initialState name input)
  where parserParser (Parser parser) state =
          let reply = parser state
              tokens = commitBugs reply
          in case reply|>rResult of
               Failed message -> errorTokens tokens message withFollowing $ reply|>rState|>sInput
               Result value   -> D.append tokens $ D.singleton Token { tCode = Detected,
                                                                       tText = what ++ "=" ++ (show value) }
               More parser'   -> D.append tokens $ parserParser parser' $ reply|>rState
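-- A sketch (ours, not part of the original module): 'patternTokenizer' turns
-- any 'Pattern' into a 'Tokenizer'. The digit pattern and the "<test>" input
-- name are ours, for illustration only; running
--
--   digitsTokenizerSketch "<test>" (C.pack "123") False
--
-- yields a single @Text@ token containing "123".
digitsTokenizerSketch :: Tokenizer
digitsTokenizerSketch = patternTokenizer $ token Text (match ('0', '9') +)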
-- | @errorTokens tokens message withFollowing following@ appends an @Error@
-- token with the specified /message/ at the end of /tokens/, and if
-- /withFollowing/ also appends the unparsed text /following/ the error as a
-- final @Unparsed@ token.
errorTokens tokens message withFollowing following =
  let tokens' = D.append tokens $ D.singleton Token { tCode = Error, tText = message }
  in if withFollowing && following /= ""
        then D.append tokens' $ D.singleton Token { tCode = Unparsed, tText = following }
        else tokens'

-- | @commitBugs reply@ inserts an error token if a commit was made outside a
-- named choice. This should never happen outside tests.
commitBugs :: Reply result -> D.DList Token
commitBugs reply =
  let tokens = reply|>rTokens
  in case reply|>rCommit of
       Nothing     -> tokens
       Just commit -> D.append tokens $ D.singleton Token { tCode = Error,
                                                            tText = "Commit to '" ++ commit ++ "' was made outside it" }

-- | @yaml name input@ converts the Unicode /input/ (called /name/ in error
-- messages) to a list of 'Token' according to the YAML spec. This is it!
yaml :: Tokenizer
yaml = patternTokenizer l_yaml_stream

#ifdef REAL_CPP
-- This is how non-ancient C pre-processors do it.
#define STR(X) #X
#else
-- This only works with GHC's simplistic pre-processor (and ancient C
-- pre-processors).
#define STR(X) "X"
#endif

-- These allow us to avoid repeating the parser names.
#define PAT(PATTERN) pat STR(PATTERN) PATTERN
#define PAR(PARSER)  par STR(PARSER) PARSER
#define PAC(PARSER)  pac STR(PARSER) PARSER

-- | @pName name@ converts a parser name to the \"proper\" spec name.
pName :: String -> String
pName name = regexSub questionRegex "?" $ regexSub minusRegex "-" $ regexSub plusRegex "+" name
  where regexSub regex value text = subRegex regex text value
        questionRegex = mkRegex "'"
        minusRegex    = mkRegex "_"
        plusRegex     = mkRegex "__"
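-- A worked example (ours, not part of the original module) of the name
-- conversion above: @__@ becomes @+@ first, remaining @_@ become @-@, and
-- @'@ would become @?@.
--
-- pNameExample evaluates to "s-l+block-node".
pNameExample :: String
pNameExample = pName "s_l__block_node"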
-- | @tokenizers@ returns a mapping from a production name to a production
-- tokenizer.
tokenizers :: Map.Map String Tokenizer
tokenizers =
  PAR(c_chomping_indicator) "t" $ PAC(detect_inline_indentation) "m" $
  PAT(b_as_line_feed) $ PAT(b_carriage_return) $ PAT(b_char) $ PAT(b_generic) $
  PAT(b_l_folded_as_space) $ PAT(b_line_feed) $ PAT(b_line_separator) $
  PAT(b_next_line) $ PAT(b_non_content_any) $ PAT(b_non_content_generic) $
  PAT(b_normalized) $ PAT(b_paragraph_separator) $ PAT(b_specific) $
  PAT(c_alias) $ PAT(c_anchor) $ PAT(c_byte_order_mark) $ PAT(c_collect_entry) $
  PAT(c_comment) $ PAT(c_directive) $ PAT(c_document_end) $
  PAT(c_document_start) $ PAT(c_double_quote) $ PAT(c_escape) $ PAT(c_folded) $
  PAT(c_indicator) $ PAT(c_literal) $ PAT(c_mapping_end) $ PAT(c_mapping_key) $
  PAT(c_mapping_start) $ PAT(c_mapping_value) $ PAT(c_named_tag_handle) $
  PAT(c_nb_comment_text) $ PAT(c_non_specific_tag) $ PAT(c_ns_alias_node) $
  PAT(c_ns_anchor_property) $ PAT(c_ns_esc_char) $ PAT(c_ns_local_tag_prefix) $
  PAT(c_ns_shorthand_tag) $ PAT(c_ns_tag_property) $ PAT(c_primary_tag_handle) $
  PAT(c_printable) $ PAT(c_quoted_quote) $ PAT(c_reserved) $
  PAT(c_secondary_tag_handle) $ PAT(c_sequence_end) $ PAT(c_sequence_entry) $
  PAT(c_sequence_start) $ PAT(c_s_implicit_json_key) $ PAT(c_single_quote) $
  PAT(c_tag) $ PAT(c_tag_handle) $ PAT(c_verbatim_tag) $
  PAT(e_node) $ PAT(e_scalar) $
  PAT(l_comment) $ PAT(l_directive) $ PAT(l_document_prefix) $
  PAT(l_documents) $ PAT(l_document_suffix) $ PAT(l_explicit_document) $
  PAT(l_following_document) $ PAT(l_forbidden) $ PAT(l_implicit_document) $
  PAT(l_leading_document) $ PAT(l_yaml_stream) $
  PAT(nb_char) $ PAT(nb_double_char) $ PAT(nb_double_one_line) $
  PAT(nb_ns_double_in_line) $ PAT(nb_ns_single_in_line) $ PAT(nb_single_char) $
  PAT(nb_single_one_line) $
  PAT(ns_anchor_char) $ PAT(ns_anchor_name) $ PAT(ns_ascii_letter) $
  PAT(ns_char) $ PAT(ns_dec_digit) $ PAT(ns_directive_name) $
  PAT(ns_directive_parameter) $ PAT(ns_double_char) $
  PAT(ns_esc_16_bit) $ PAT(ns_esc_32_bit) $ PAT(ns_esc_8_bit) $
  PAT(ns_esc_backslash) $ PAT(ns_esc_backspace) $ PAT(ns_esc_bell) $
  PAT(ns_esc_carriage_return) $ PAT(ns_esc_double_quote) $ PAT(ns_esc_escape) $
  PAT(ns_esc_form_feed) $ PAT(ns_esc_horizontal_tab) $ PAT(ns_esc_line_feed) $
  PAT(ns_esc_line_separator) $ PAT(ns_esc_next_line) $
  PAT(ns_esc_non_breaking_space) $ PAT(ns_esc_null) $
  PAT(ns_esc_paragraph_separator) $ PAT(ns_esc_space) $
  PAT(ns_esc_vertical_tab) $ PAT(ns_global_tag_prefix) $ PAT(ns_hex_digit) $
  PAT(ns_plain_safe_in) $ PAT(ns_plain_safe_out) $ PAT(ns_reserved_directive) $
  PAT(ns_s_block_map_implicit_key) $ PAT(ns_s_implicit_yaml_key) $
  PAT(ns_single_char) $ PAT(ns_tag_char) $ PAT(ns_tag_directive) $
  PAT(ns_tag_prefix) $ PAT(ns_uri_char) $ PAT(ns_word_char) $
  PAT(ns_yaml_directive) $ PAT(ns_yaml_version) $
  PAT(s_b_comment) $ PAT(s_l_comments) $ PAT(s_separate_in_line) $
  PAT(s_space) $ PAT(s_tab) $ PAT(s_white) $
  Map.empty
  where pat name pattern     = Map.insert (pName name) $ patternTokenizer (match pattern)
        par name parser what = Map.insert (pName name) $ parserTokenizer what (match parser)
        pac name parser what = Map.insert (pName name) $ parserTokenizer what (consume parser)

-- | @tokenizer name@ converts the production with the specified /name/ to a
-- simple 'Tokenizer', or @Nothing@ if it isn't known.
tokenizer :: String -> (Maybe Tokenizer)
tokenizer name = Map.lookup name tokenizers

-- | @tokenizersWithN@ returns a mapping from a production name to a production
-- tokenizer (that takes an /n/ argument).
tokenizersWithN :: Map.Map String (Int -> Tokenizer)
tokenizersWithN =
  PAR(c_b_block_header) "(m,t)" $ PAC(detect_collection_indentation) "m" $
  PAC(detect_scalar_indentation) "m" $ PAR(c_indentation_indicator) "m" $
  PAR(count_spaces) "m" $
  PAT(b_l_spaced) $ PAT(b_nb_literal_next) $
  PAT(c_l_block_map_explicit_entry) $ PAT(c_l_block_map_explicit_key) $
  PAT(c_l_block_map_implicit_value) $ PAT(c_l_block_seq_entry) $
  PAT(c_l__folded) $ PAT(c_l__literal) $
  PAT(l_block_map_explicit_value) $ PAT(l__block_mapping) $
  PAT(l__block_sequence) $ PAT(l_keep_empty) $ PAT(l_nb_diff_lines) $
  PAT(l_nb_folded_lines) $ PAT(l_nb_literal_text) $ PAT(l_nb_same_lines) $
  PAT(l_nb_spaced_lines) $ PAT(l_strip_empty) $ PAT(l_trail_comments) $
  PAT(nb_double_multi_line) $ PAT(nb_single_multi_line) $
  PAT(ns_l_block_map_entry) $ PAT(ns_l_block_map_implicit_entry) $
  PAT(ns_l_in_line_mapping) $ PAT(ns_l_in_line_sequence) $
  PAT(s_block_line_prefix) $ PAT(s_flow_line_prefix) $ PAT(s_indent) $
  PAT(s_indent_le) $ PAT(s_indent_lt) $ PAT(s_l_double_any) $
  PAT(s_l_double_escaped) $ PAT(s_l_flow_folded) $ PAT(s_l__flow_in_block) $
  PAT(s_nb_folded_text) $ PAT(s_nb_spaced_text) $ PAT(s_ns_double_next_line) $
  PAT(s_ns_single_next_line) $ PAT(s_separate_lines) $
  Map.empty
  where pat name pattern     = Map.insert (pName name) (\ n -> patternTokenizer (match $ pattern n))
        par name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (match $ parser n))
        pac name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (consume $ parser n))
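-- A lookup sketch (ours, not part of the original module): fetch the
-- tokenizer for the @s-indent@ production directly from the map above and run
-- it at indentation level 2 over a two-space input. The "<test>" name and the
-- input are ours, for illustration only.
indentTokensSketch :: Maybe [Token]
indentTokensSketch =
  case Map.lookup "s-indent" tokenizersWithN of
    Just tokenizer -> Just $ tokenizer 2 "<test>" (C.pack "  ") False
    Nothing        -> Nothing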
-- | @tokenizerWithN name n@ converts the production (that requires an /n/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithN :: String -> Int -> Maybe Tokenizer
tokenizerWithN name n =
  case Map.lookup name tokenizersWithN of
    Just tokenizer -> Just $ tokenizer n
    Nothing        -> Nothing

-- | @tokenizersWithC@ returns a mapping from a production name to a production
-- tokenizer (that takes a /c/ argument).
tokenizersWithC :: Map.Map String (Context -> Tokenizer)
tokenizersWithC =
  PAT(nb_ns_plain_in_line) $ PAT(nb_plain_char) $ PAT(ns_plain_char) $
  PAT(ns_plain_first) $ PAT(ns_plain_one_line) $ PAT(ns_plain_safe) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ c -> patternTokenizer (match $ pattern c))

-- | @tokenizerWithC name c@ converts the production (that requires a /c/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithC :: String -> Context -> Maybe Tokenizer
tokenizerWithC name c =
  case Map.lookup name tokenizersWithC of
    Just tokenizer -> Just $ tokenizer c
    Nothing        -> Nothing

-- | @tokenizersWithT@ returns a mapping from a production name to a production
-- tokenizer (that takes a /t/ argument).
tokenizersWithT :: Map.Map String (Chomp -> Tokenizer)
tokenizersWithT =
  PAT(b_chomped_last) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ t -> patternTokenizer (match $ pattern t))

-- | @tokenizerWithT name t@ converts the production (that requires a /t/
-- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if
-- it isn't known.
tokenizerWithT :: String -> Chomp -> Maybe Tokenizer
tokenizerWithT name t =
  case Map.lookup name tokenizersWithT of
    Just tokenizer -> Just $ tokenizer t
    Nothing        -> Nothing

-- | @tokenizersWithNC@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /c/ arguments).
tokenizersWithNC :: Map.Map String (Int -> Context -> Tokenizer)
tokenizersWithNC =
  PAT(b_l_folded_any) $ PAT(b_l_folded_specific) $ PAT(b_l_folded_trimmed) $
  PAT(c_double_quoted) $ PAT(c_flow_json_content) $ PAT(c_flow_json_node) $
  PAT(c_flow_mapping) $ PAT(c_flow_sequence) $
  PAT(c_ns_flow_map_adjacent_value) $ PAT(c_ns_flow_map_empty_key_entry) $
  PAT(c_ns_flow_map_json_key_entry) $ PAT(c_ns_flow_map_separate_value) $
  PAT(c_ns_flow_pair_json_key_entry) $ PAT(c_ns_properties) $
  PAT(c_single_quoted) $ PAT(l_empty) $
  PAT(nb_double_text) $ PAT(nb_single_text) $
  PAT(ns_flow_content) $ PAT(ns_flow_map_entry) $
  PAT(ns_flow_map_explicit_entry) $ PAT(ns_flow_map_implicit_entry) $
  PAT(ns_flow_map_yaml_key_entry) $ PAT(ns_flow_node) $ PAT(ns_flow_pair) $
  PAT(ns_flow_pair_entry) $ PAT(ns_flow_pair_yaml_key_entry) $
  PAT(ns_flow_seq_entry) $ PAT(ns_flow_yaml_content) $ PAT(ns_flow_yaml_node) $
  PAT(ns_plain) $ PAT(ns_plain_multi_line) $ PAT(ns_s_flow_map_entries) $
  PAT(ns_s_flow_seq_entries) $
  PAT(s_l__block_collection) $ PAT(s_l__block_in_block) $
  PAT(s_l__block_indented) $ PAT(s_l__block_node) $ PAT(s_l__block_scalar) $
  PAT(s_line_prefix) $ PAT(s_ns_plain_next_line) $ PAT(s_separate) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ n c -> patternTokenizer (match $ pattern n c))
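-- A context-dependent lookup sketch (ours, not part of the original module):
-- the same plain-scalar production tokenizes differently inside a flow
-- collection. Note the underscore convention expected by the 'Read' 'Context'
-- instance.
plainFlowInSketch :: Maybe Tokenizer
plainFlowInSketch = tokenizerWithC "ns-plain-one-line" (read "flow_in")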
-- | @tokenizerWithNC name n c@ converts the production (that requires /n/ and
-- /c/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNC :: String -> Int -> Context -> Maybe Tokenizer
tokenizerWithNC name n c =
  case Map.lookup name tokenizersWithNC of
    Just tokenizer -> Just $ tokenizer n c
    Nothing        -> Nothing

-- | @tokenizersWithNT@ returns a mapping from a production name to a
-- production tokenizer (that requires /n/ and /t/ arguments).
tokenizersWithNT :: Map.Map String (Int -> Chomp -> Tokenizer)
tokenizersWithNT =
  PAT(l_chomped_empty) $ PAT(l_folded_content) $ PAT(l_literal_content) $
  Map.empty
  where pat name pattern = Map.insert (pName name) (\ n t -> patternTokenizer (match $ pattern n t))

-- | @tokenizerWithNT name n t@ converts the production (that requires /n/ and
-- /t/ arguments) with the specified /name/ to a simple 'Tokenizer', or
-- @Nothing@ if it isn't known.
tokenizerWithNT :: String -> Int -> Chomp -> Maybe Tokenizer
tokenizerWithNT name n t =
  case Map.lookup name tokenizersWithNT of
    Just tokenizer -> Just $ tokenizer n t
    Nothing        -> Nothing

-- | @tokenizerNames@ returns the list of all productions (tokenizers).
tokenizerNames :: [String]
tokenizerNames = (Map.keys tokenizers)
              ++ (Map.keys tokenizersWithN)
              ++ (Map.keys tokenizersWithC)
              ++ (Map.keys tokenizersWithT)
              ++ (Map.keys tokenizersWithNC)
              ++ (Map.keys tokenizersWithNT)

-- * Productions

-- ** BNF compatibility helpers

-- | @detect_utf_encoding@ doesn't actually detect the encoding; we just call
-- it this way to make the productions compatible with the spec. It simply
-- reports the encoding (which was already detected when we started parsing).
detect_utf_encoding = Parser $ \ state ->
  let text = case state|>sEncoding of
               UTF8    -> "TF8"
               UTF16LE -> "TF16LE"
               UTF16BE -> "TF16BE"
      Parser parser = fake Bom text
  in parser state { sColumn = state|>sColumn .- 1 }

-- | @na@ is the \"non-applicable\" indentation value. We use Haskell's
-- laziness to verify it really is never used.
na :: Int
na = error "Accessing non-applicable indentation"

-- | @asInteger@ returns the last consumed character, which is assumed to be a
-- decimal digit, as an integer.
asInteger :: Parser Int
asInteger = Parser $ \ state -> returnReply state $ ord (state|>sLast) .- 48

-- | @result value@ is the same as @return value@ except that we give the
-- Haskell type deduction the additional boost it needs to figure out this is
-- wrapped in a 'Parser'.
result :: result -> Parser result
result = return

#include "Reference.bnf"
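-- A minimal end-to-end sketch (ours, not part of the original module):
-- tokenize a small YAML document with the top-level 'yaml' tokenizer and
-- print the resulting YEAST lines. The "<example>" name and the input
-- document are ours, for illustration only.
exampleMain :: IO ()
exampleMain = putStr $ showTokens $ yaml "<example>" (C.pack "- one\n- two\n") False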