-- #ignore-exports ------------------------------------------------------------------------------- -- | -- Module : Text.Yaml.Reference -- Copyright : (c) Oren Ben-Kiki 2007 -- License : LGPL -- -- Maintainer : yaml-oren@ben-kiki.org -- Stability : alpha -- Portability : portable -- -- Implementation of the YAML syntax as defined in . -- Actually this file contains the parsing framework and includes (using CPP) -- the actual productions from @Reference.bnf@. -- -- The parsing framework is fully streaming (generates output tokens -- \"immediately\"), but has a memory leak (actually retention) which causes it -- to blow up on \"large\" files. To debug this with minimal syntax productions -- use the @debug_leak@ production (@yes "#" | yaml2yeast -p debug-leak@). ------------------------------------------------------------------------------- module Text.Yaml.Reference ( -- Basic parsing: Code, Token, Tokenizer, yaml, -- For testing: Context, Style, Chomp, tokenizer, tokenizerWithN, tokenizerWithC, tokenizerWithS, tokenizerWithT, tokenizerWithNC, tokenizerWithNS, tokenizerWithNT, tokenizerNames, showTokens ) where import Control.Monad import qualified Data.ByteString.Lazy.Char8 as C import Data.Char import qualified Data.DList as D import qualified Data.Map as Map import Text.Regex import Debug.Trace import qualified Prelude import Prelude hiding ((/), (*), (+), (-), (^)) -- * Generic operators -- -- ** Numeric operators -- -- We rename the four numerical operators @+@ @-@ @*@ @\/@ to start with @.@ -- (@.+@, @.-@, @.*@, @.\/@). This allows us to use the originals for BNF -- notation (we also hijack the @^@ operator). This is not a generally -- recommended practice. It is justified in this case since we have very little -- arithmetic operations, and a lot of BNF rules which this makes extremely -- readable. infixl 6 .+ -- | \".+\" is the numeric addition (we use \"+\" for postfix \"one or more\"). (.+) = (Prelude.+) infixl 6 .- -- | \".-\" is the numeric subtraction (we use \"-\" for infix \"and not\"). (.-) = (Prelude.-) infixl 7 .* -- | \".*\" is the numeric multiplication (we use \"*\" for postfix \"zero or -- more\"). (.*) = (Prelude.*) infixl 7 ./ -- | \"./\" is the numeric division (we use \"/\" for infix \"or\"). (./) = (Prelude./) -- ** Record field access -- -- We also define @|>@ for record access for increased readability. infixl 9 |> -- | @record |> field@ is the same as @field record@, but is more readable. (|>) :: record -> (record -> value) -> value record |> field = field record -- * UTF decoding -- -- This really should be factored out to the standard libraries. Since it isn't -- there, we get to tailor it exactly to our needs. We use lazy byte strings as -- input, which should give reasonable I\/O performance when reading large -- files. The output is a normal 'Char' list which is easy to work with and -- should be efficient enough as long as the 'Parser' does its job right. -- | Recognized Unicode encodings. UTF-32 isn't required by YAML parsers. data Encoding = UTF8 -- ^ UTF-8 encoding (or ASCII) | UTF16LE -- ^ UTF-16 little endian | UTF16BE -- ^ UTF-16 big endian deriving Show -- | @decode bytes@ automatically detects the 'Encoding' used and converts the -- /bytes/ to Unicode characters. decode :: C.ByteString -> (Encoding, [Char]) decode text = (encoding, undoEncoding encoding text) where encoding = detectEncoding $ C.unpack $ C.take 2 text -- | @detectEncoding text@ examines the first few chars (bytes) of the /text/ -- to deduce the Unicode encoding used according to the YAML spec. detectEncoding :: [Char] -> Encoding detectEncoding text = case text of '\xFF':'\xFE':_ -> UTF16LE '\xFE':'\xFF':_ -> UTF16BE _ -> UTF8 -- | @undoEncoding encoding bytes@ converts a /bytes/ stream to Unicode -- characters according to the /encoding/. undoEncoding :: Encoding -> C.ByteString -> [Char] undoEncoding encoding bytes = case encoding of UTF8 -> undoUTF8 bytes UTF16LE -> combinePairs $ undoUTF16LE bytes UTF16BE -> combinePairs $ undoUTF16BE bytes -- ** UTF-16 decoding -- | @combinePairs chars@ converts each pair of UTF-16 surrogate characters to a -- single Unicode character. combinePairs :: [Char] -> [Char] combinePairs [] = [] combinePairs (lead:rest) | '\xD800' <= lead && lead <= '\xDBFF' = combineLead lead rest | '\xDC00' <= lead && lead <= '\xDFFF' = error "UTF-16 contains trail surrogate without lead surrogate" | otherwise = lead:(combinePairs rest) -- | @combineLead lead rest@ combines the /lead/ surrogate with the head of the -- /rest/ of the input chars, assumed to be a /trail/ surrogate, and continues -- combining surrogate pairs. combineLead :: Char -> [Char] -> [Char] combineLead lead [] = error "UTF-16 contains lead surrogate as final character" combineLead lead (trail:rest) | '\xDC00' <= trail && trail <= '\xDFFF' = (combineSurrogates lead trail):combinePairs rest | otherwise = error "UTF-16 contains lead surrogate without trail surrogate" -- | @surrogateOffset@ is copied from the Unicode FAQs. surrogateOffset :: Int surrogateOffset = 0x10000 .- (0xD800 .* 1024) .- 0xDC00 -- | @combineSurrogates lead trail@ combines two UTF-16 surrogates into a single -- Unicode character. combineSurrogates :: Char -> Char -> Char combineSurrogates lead trail = chr $ (ord lead) .* 1024 .+ (ord trail) .+ surrogateOffset -- | @hasFewerThan bytes n@ checks whether there are fewer than /n/ /bytes/ -- left to read. hasFewerThan :: Int -> C.ByteString -> Bool hasFewerThan n bytes | n == 1 = C.null bytes | n > 1 = C.null bytes || hasFewerThan (n .- 1) (C.tail bytes) -- | @undoUTF18LE bytes@ decoded a UTF-16-LE /bytes/ stream to Unicode chars. undoUTF16LE :: C.ByteString -> [Char] undoUTF16LE bytes | C.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16-LE input contains odd number of bytes" | otherwise = let low = C.head bytes bytes' = C.tail bytes high = C.head bytes' rest = C.tail bytes' in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16LE rest) -- | @undoUTF18BE bytes@ decoded a UTF-16-BE /bytes/ stream to Unicode chars. undoUTF16BE :: C.ByteString -> [Char] undoUTF16BE bytes | C.null bytes = [] | hasFewerThan 2 bytes = error "UTF-16-BE input contains odd number of bytes" | otherwise = let high = C.head bytes bytes' = C.tail bytes low = C.head bytes' rest = C.tail bytes' in (chr $ (ord high) .* 256 .+ (ord low)):(undoUTF16BE rest) -- ** UTF-8 decoding -- | @undoUTF8 bytes@ decoded a UTF-8 /bytes/ stream to Unicode chars. undoUTF8 :: C.ByteString -> [Char] undoUTF8 bytes | C.null bytes = [] | otherwise = let first = C.head bytes rest = C.tail bytes in case () of _ | first < '\x80' -> first:(undoUTF8 rest) | first < '\xC0' -> error "UTF-8 input contains invalid first byte" | first < '\xE0' -> decodeTwoUTF8 first rest | first < '\xF0' -> decodeThreeUTF8 first rest | first < '\xF8' -> decodeFourUTF8 first rest | otherwise -> error "UTF-8 input contains invalid first byte" -- | @decodeTwoUTF8 first bytes@ decodes a two-byte UTF-8 character, where the -- /first/ byte is already available and the second is the head of the /bytes/, -- and then continues to undo the UTF-8 encoding. decodeTwoUTF8 :: Char -> C.ByteString -> [Char] decodeTwoUTF8 first bytes | C.null bytes = error "UTF-8 double byte char is missing second byte at eof" | otherwise = let second = C.head bytes rest = C.tail bytes in case () of _ | second < '\x80' || '\xBF' < second -> error "UTF-8 double byte char has invalid second byte" | otherwise -> (combineTwoUTF8 first second):(undoUTF8 rest) -- | @combineTwoUTF8 first second@ combines the /first/ and /second/ bytes of a -- two-byte UTF-8 char into a single Unicode char. combineTwoUTF8 :: Char -> Char -> Char combineTwoUTF8 first second = chr(((ord first) .- 0xC0) .* 64 .+ ((ord second) .- 0x80)) -- | @decodeThreeUTF8 first bytes@ decodes a three-byte UTF-8 character, where -- the /first/ byte is already available and the second and third are the head -- of the /bytes/, and then continues to undo the UTF-8 encoding. decodeThreeUTF8 :: Char -> C.ByteString -> [Char] decodeThreeUTF8 first bytes | hasFewerThan 2 bytes = error "UTF-8 triple byte char is missing bytes at eof" | otherwise = let second = C.head bytes bytes' = C.tail bytes third = C.head bytes' rest = C.tail bytes' in case () of _ | second < '\x80' || '\xBF' < second -> error "UTF-8 triple byte char has invalid second byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 triple byte char has invalid third byte" | otherwise -> (combineThreeUTF8 first second third):(undoUTF8 rest) -- | @combineThreeUTF8 first second@ combines the /first/, /second/ and /third/ -- bytes of a three-byte UTF-8 char into a single Unicode char. combineThreeUTF8 :: Char -> Char -> Char -> Char combineThreeUTF8 first second third = chr(((ord first) .- 0xE0) .* 4096 .+ ((ord second) .- 0x80) .* 64 .+ ((ord third) .- 0x80)) -- | @decodeFourUTF8 first bytes@ decodes a four-byte UTF-8 character, where the -- /first/ byte is already available and the second, third and fourth are the -- head of the /bytes/, and then continues to undo the UTF-8 encoding. decodeFourUTF8 :: Char -> C.ByteString -> [Char] decodeFourUTF8 first bytes | hasFewerThan 3 bytes = error "UTF-8 quad byte char is missing bytes at eof" | otherwise = let second = C.head bytes bytes' = C.tail bytes third = C.head bytes' bytes'' = C.tail bytes' fourth = C.head bytes'' rest = C.tail bytes'' in case () of _ | second < '\x80' || '\xBF' < second -> error "UTF-8 quad byte char has invalid second byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 quad byte char has invalid third byte" | third < '\x80' || '\xBF' < third -> error "UTF-8 quad byte char has invalid fourth byte" | otherwise -> (combineFourUTF8 first second third fourth):(undoUTF8 rest) -- | @combineFourUTF8 first second@ combines the /first/, /second/ and /third/ -- bytes of a three-byte UTF-8 char into a single Unicode char. combineFourUTF8 :: Char -> Char -> Char -> Char -> Char combineFourUTF8 first second third fourth = chr(((ord first) .- 0xF0) .* 262144 .+ ((ord second) .- 0x80) .* 4096 .+ ((ord third) .- 0x80) .* 64 .+ ((ord fourth) .- 0x80)) -- * Result tokens -- -- The parsing result is a stream of tokens rather than a parse tree. The idea -- is to convert the YAML input into \"byte codes\". These byte codes are -- intended to be written into a byte codes file (or more likely a UNIX pipe) -- for further processing. -- | 'Token' codes. data Code = Bom -- ^ BOM, contains \"@TF8@\", \"@TF16LE@\" or \"@TF16BE@\". | Text -- ^ Content text characters. | Meta -- ^ Non-content (meta) text characters. | Break -- ^ Line break preserved in content. | Continue -- ^ Separation line break. | LineFeed -- ^ Line break normalized to content line feed. | LineFold -- ^ Line break folded to content space. | Indicator -- ^ Character indicating structure. | White -- ^ Separation white space. | Indent -- ^ Indentation spaces. | DocumentStart -- ^ Document start marker. | DocumentEnd -- ^ Document end marker. | BeginEscape -- ^ Begins escape sequence. | EndEscape -- ^ Ends escape sequence. | BeginComment -- ^ Begins comment. | EndComment -- ^ Ends comment. | BeginDirective -- ^ Begins directive. | EndDirective -- ^ Ends directive. | BeginTag -- ^ Begins tag. | EndTag -- ^ Ends tag. | BeginHandle -- ^ Begins tag handle. | EndHandle -- ^ Ends tag handle. | BeginAnchor -- ^ Begins anchor. | EndAnchor -- ^ Ends anchor. | BeginProperties -- ^ Begins node properties. | EndProperties -- ^ Ends node properties. | BeginAlias -- ^ Begins alias. | EndAlias -- ^ Ends alias. | BeginScalar -- ^ Begins scalar content. | EndScalar -- ^ Ends scalar content. | BeginSequence -- ^ Begins sequence content. | EndSequence -- ^ Ends sequence content. | BeginMapping -- ^ Begins mapping content. | EndMapping -- ^ Ends mapping content. | BeginPair -- ^ Begins mapping key:value pair. | EndPair -- ^ Ends mapping key:value pair. | BeginNode -- ^ Begins complete node. | EndNode -- ^ Ends complete node. | BeginDocument -- ^ Begins document. | EndDocument -- ^ Ends document. | BeginStream -- ^ Begins YAML stream. | EndStream -- ^ Ends YAML stream. | Error -- ^ Parsing error at this point. -- For testing:. | Test -- ^ Test characters otherwise unassigned. | Detected -- ^ Detected parameter. deriving Eq -- | @show code@ converts a 'Code' to the one-character YEAST token code char. -- The list of byte codes is also documented in the @yaml2yeast@ program. instance Show Code where show code = case code of Bom -> "U" Text -> "T" Meta -> "t" Break -> "B" Continue -> "b" LineFeed -> "L" LineFold -> "l" Indicator -> "I" White -> "w" Indent -> "i" DocumentStart -> "K" DocumentEnd -> "k" BeginEscape -> "E" EndEscape -> "e" BeginComment -> "C" EndComment -> "c" BeginDirective -> "D" EndDirective -> "d" BeginTag -> "G" EndTag -> "g" BeginHandle -> "H" EndHandle -> "h" BeginAnchor -> "A" EndAnchor -> "a" BeginProperties -> "P" EndProperties -> "p" BeginAlias -> "R" EndAlias -> "r" BeginScalar -> "S" EndScalar -> "s" BeginSequence -> "Q" EndSequence -> "q" BeginMapping -> "M" EndMapping -> "m" BeginNode -> "N" EndNode -> "n" BeginPair -> "X" EndPair -> "x" BeginDocument -> "O" EndDocument -> "o" BeginStream -> "Y" EndStream -> "y" Error -> "!" Test -> "?" Detected -> "$" -- | Parsed token. data Token = Token { tCode :: Code, -- ^ Specific token 'Code'. tText :: Maybe String -- ^ Contained input chars, if any. } -- | @show token@ converts a 'Token' to a single YEAST line. instance Show Token where show token = case token|>tText of Nothing -> (show $ token|>tCode) ++ "\n" Just string -> (show $ token|>tCode) ++ (escapeString string) ++ "\n" -- | @escapeString string@ escapes all the non-ASCII characters in the -- /string/, as well as escaping the \"@\\@\" character, using the \"@\\xXX@\", -- \"@\\uXXXX@\" and \"@\\UXXXXXXXX@\" escape sequences. escapeString :: String -> String escapeString [] = [] escapeString (first:rest) | ' ' <= first && first /= '\\' && first <= '~' = first:(escapeString rest) | first <= '\xFF' = "\\x" ++ (toHex 2 $ ord first) ++ (escapeString rest) | '\xFF' < first && first <= '\xFFFF' = "\\u" ++ (toHex 4 $ ord first) ++ (escapeString rest) | otherwise = "\\U" ++ (toHex 8 $ ord first) ++ (escapeString rest) -- | @toHex digits int@ converts the /int/ to the specified number of -- hexadecimal /digits/. toHex :: Int -> Int -> String toHex digits int | digits > 1 = (toHex (digits .- 1) (int `div` 16)) ++ [intToDigit $ int `mod` 16] | digits == 1 = [intToDigit int] -- | @showTokens tokens@ converts a list of /tokens/ to a multi-line YEAST -- text. showTokens :: [Token] -> String showTokens tokens = foldr (\ token text -> (show token) ++ text) "" tokens -- * Parsing framework -- -- Haskell has no shortage of parsing frameworks. We use our own because: -- -- * Most available frameworks are inappropriate because of their focus on -- building a parse tree, and completing all of it before any of it is -- accessible to the caller. We return a stream of tokens, and would like -- its head to be accessible as soon as possible to allow for streaming. To -- do this we use a difference list (not a 'DList' - we use a \"real\" -- difference list building on Haskell's lazy evaluation nature). -- -- * Haskell makes it so easy to roll your own parsing framework. We need some -- specialized machinery (limited lookahead, forbidden patterns). It is -- possible to build these on top of existing frameworks but the end result -- isn't much shorter than rolling our own. -- -- Since we roll our own framework we don't bother with making it generalized, -- so we maintain a single 'State' type rather than having a generic one that -- contains a polymorphic \"UserState\" field etc. -- | A 'Parser' is basically a function computing a /result/, while at the same -- time accumulating a list of 'Token'. A result of @Nothing@ indicates -- failure. data Parser result = Parser (State -> (State, Maybe result)) -- A 'Pattern' is a parser that doesn't have an (interesting) result. type Pattern = Parser () -- ** Parsing state -- | The internal parser state. We don't bother with parameterising it with a -- \"UserState\", we just bundle the generic and specific fields together (not -- that it is that easy to draw the line - is @sLine@ generic or specific?). -- Note that using 'DList' for @sTokens@ allows us to consume tokens as they -- are generated, instead of waiting for the whole parsing process to complete. -- Likewise for @sCommits@, which allows us to commit to a partially generated -- tokens stream before it is fully generated. data State = State { -- Fields that never change, or only change for nested parsers: sName :: String, -- ^ The input name for error messages. sEncoding :: Encoding, -- ^ The input UTF encoding. sDecision :: String, -- ^ Current decision name. sLimit :: Int, -- ^ Lookahead characters limit. sForbidden :: Maybe Pattern, -- ^ Pattern we must not enter into. sIsPeek :: Bool, -- ^ Disables token generation. -- Fields that get collected by consecutive parsers, then merged: sTokens :: D.DList Token, -- ^ Tokens collected from input. sCommits :: D.DList String, -- ^ Committments collected while parsing. -- Fields that are incrementally modified by consecutive parsers: sConsumed :: Bool, -- ^ Consumed character markers. sChars :: D.DList Char, -- ^ Characters collected for a token. sMessage :: Maybe String, -- ^ If an error occurred. sLine :: Int, -- ^ Builds on YAML's line break definition. sColumn :: Int, -- ^ Actually character number - we hate tabs. sCode :: Code, -- ^ Of token we are collecting chars for. sLast :: Char, -- ^ Last matched character. sInput :: [Char] -- ^ The input decoded characters. } -- Showing a 'State' is only used in debugging. Note that forcing certain -- members (@sTokens@, @sCommits@) may disable streaming. instance Show State where show state = "Name: " ++ (show $ state|>sName) -- ++ ", Encoding: " ++ (show / state|>sEncoding) ++ ", Decision: " ++ (show $ state|>sDecision) -- ++ ", Limit: " ++ (show / state|>sLimit) -- ++ ", IsPeek: " ++ (show / state|>sIsPeek) -- ++ ", Tokens:\n" ++ (showTokens / D.toList / state|>sTokens) ++ "...\n" -- ++ ", Commits: " ++ (show / D.toList / state|>sCommits) -- ++ ", Consumed: >>>" ++ (show / state|>sConsumed) ++ "<<<" -- ++ ", Chars: >>>" ++ (D.toList / state|>sChars) ++ "<<<" ++ ", Message: " ++ (show $ state|>sMessage) ++ ", Line: " ++ (show $ state|>sLine) ++ ", Column: " ++ (show $ state|>sColumn) ++ ", Code: " ++ (show $ state|>sCode) ++ ", Last: >>>" ++ (show $ state|>sLast) ++ "<<<" -- ++ ", Input: >>>" ++ (show / state|>sInput) ++ "<<<" -- | @initialState name input@ returns an initial 'State' for parsing the -- /input/ (with /name/ for error messages). initialState :: String -> C.ByteString -> State initialState name input = let (encoding, decoded) = decode input in State { sName = name, sEncoding = encoding, sDecision = "n/a", sLimit = -1, sForbidden = Nothing, sIsPeek = False, sTokens = D.empty, sCommits = D.empty, sConsumed = False, sChars = D.empty, sMessage = Nothing, sLine = 1, sColumn = 0, sCode = Test, sLast = ' ', sInput = decoded } -- *** Setters -- -- We need setter functions instead of using @{ sXxx = value }@ because the -- nice syntax causes the old state to be fully evaluated, killing the -- streaming (lazy) functionality we need. Generating the setters by hand was -- tedious, there's surely a way to do this using Template Haskell or DrIFT or -- some such. Life is too short. -- | @setDecision name state@ sets the @sDecision@ field to /decision/. setDecision :: String -> State -> State setDecision decision state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = decision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setLimit limit state@ sets the @sLimit@ field to /limit/. setLimit :: Int -> State -> State setLimit limit state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = limit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setForbidden forbidden state@ sets the @sForbidden@ field to /forbidden/. setForbidden :: Maybe Pattern -> State -> State setForbidden forbidden state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = forbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setIsPeek isPeek state@ sets the @sIsPeek@ field to /isPeek/. setIsPeek :: Bool -> State -> State setIsPeek isPeek state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = isPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setTokens tokens state@ sets the @sTokens@ field to /tokens/. setTokens :: D.DList Token -> State -> State setTokens tokens state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = tokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setCommits commits state@ sets the @sCommits@ field to /commits/. setCommits :: D.DList String -> State -> State setCommits commits state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = commits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setChars chars state@ sets the @sChars@ field to /chars/. setChars :: D.DList Char -> State -> State setChars chars state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = chars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setConsumed consumed state@ sets the @sConsumed@ field to /consumed/. setConsumed :: Bool -> State -> State setConsumed consumed state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = consumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setMessage message state@ sets the @sMessage@ field to /message/. setMessage :: Maybe String -> State -> State setMessage message state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = message, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setLine line state@ sets the @sLine@ field to /line/. setLine :: Int -> State -> State setLine line state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = line, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setColumn line state@ sets the @sColumn@ field to /line/. setColumn :: Int -> State -> State setColumn column state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = column, sCode = state|>sCode, sLast = state|>sLast, sInput = state|>sInput } -- | @setCode code state@ sets the @sCode@ field to /code/. setCode :: Code -> State -> State setCode code state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = code, sLast = state|>sLast, sInput = state|>sInput } -- | @setLast last state@ sets the @sLast@ field to /last/. setLast :: Char -> State -> State setLast last state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = last, sInput = state|>sInput } -- | @setInput input state@ sets the @sInput@ field to /input/. setInput :: [Char] -> State -> State setInput input state = State { sName = state|>sName, sEncoding = state|>sEncoding, sDecision = state|>sDecision, sLimit = state|>sLimit, sForbidden = state|>sForbidden, sIsPeek = state|>sIsPeek, sTokens = state|>sTokens, sCommits = state|>sCommits, sConsumed = state|>sConsumed, sChars = state|>sChars, sMessage = state|>sMessage, sLine = state|>sLine, sColumn = state|>sColumn, sCode = state|>sCode, sLast = state|>sLast, sInput = input } -- ** Implicit parsers -- -- It is tedious to have to wrap each expected character (or character range) -- in an explicit 'Parse' constructor. We let Haskell do that for us using a -- 'Match' class. -- | @Match parameter result@ specifies that we can convert the /parameter/ to -- a 'Parser' returning the /result/. class Match parameter result | parameter -> result where match :: parameter -> Parser result -- | @parse parser state@ applies the actual /parser/ match function to a -- /state/. parse :: (Match match result) => match -> State -> (State, Maybe result) parse parser state = let Parser parser' = match parser in parser' state -- | We don't need to convert a 'Parser', it already is one. instance Match (Parser result) result where match = id -- | We convert 'Char' to a parser for a character (that returns nothing). instance Match Char () where match code = nextIf (== code) -- | We convert a 'Char' tuple to a parser for a character range (that returns -- nothing). instance Match (Char, Char) () where match (low, high) = nextIf (\ code -> low <= code && code <= high) -- | We convert 'String' to a parser for a sequence of characters (that returns -- nothing). instance Match String () where match "" = empty match (first:rest) = match first & match rest -- ** Parsing Monad -- | Allow using the @do@ notation for our parsers, which makes for short and -- sweet @do@ syntax when we want to examine the results (we typically don't). -- -- We don't use the 'Monad' @fail@ method because we need access to the 'State' -- on failure. -- -- @return result@ does just that - return a /result/. -- -- @left >>= right@ applies the /left/ parser, and if it didn't fail -- applies the /right/ one (well, the one /right/ returns). To achieve -- streaming, we need to construct the final state manually, \"regardless\" of -- the result of the right parser, and append the collected tokens and -- commitments from the left and right parser. This allows our caller to -- start consuming tokens from the left parser before the right parser is -- done. -- -- This code seems to hang on to the old states for too long a time, causing -- memory usage to grow up when creating long parser chains (e.g. using @*@). instance Monad Parser where return result = Parser (\ state -> (state, Just result)) left >>= right = Parser (\ originalState -> let (leftState, leftResult) = parse left originalState leftTokens = leftState|>sTokens leftCommits = leftState|>sCommits leftConsumed = leftState|>sConsumed leftState' = setTokens D.empty $ setCommits D.empty $ setConsumed False $ leftState (rightState, rightResult) = case leftResult of Nothing -> (leftState', Nothing) Just value -> parse (right value) leftState' rightTokens = rightState|>sTokens rightCommits = rightState|>sCommits rightConsumed = rightState|>sConsumed finalState = setTokens (D.append leftTokens rightTokens) $ setCommits (D.append leftCommits rightCommits) $ setConsumed (leftConsumed || rightConsumed) $ rightState in (finalState, rightResult)) -- ** Parsing operators -- -- Here we reap the benefits of renaming the numerical operators. Note that in -- our specific case, we use the 'MonadPlus' to combine the results of -- different parsers. Typically, parsing frameworks simply use the last result, -- or leave the task of combining them to the rules themselves (using @do@ -- notation). -- -- Operator precedence, in decreasing strength: -- -- @repeated % n@, @repeated <% n@, @match - rejected@, @match ! decision@, -- @match ?! decision@ are the strongest binding, and don't mix. -- -- @match - first - second@ is @(match - first) - second@. -- -- @first & second & third@ is @first & (second & third)@. Note that @first - -- rejected & second@ is @(first - rejected) & second@, etc. d@ is @a & (b - c) -- & d@. -- -- @match \/ alternative \/ otherwise@ is @match \/ (alternative \/ -- otherwise)@. Note that @first & second \/ third@ is @(first & second) \/ -- third@. -- -- @( match *)@, @(match +)@, @(match ?)@ are the weakest and require the -- @()@. infix 3 % infix 3 <% infix 3 ^ infix 3 ! infix 3 ?! infixl 3 - infixr 2 & infixr 1 / infix 0 ? infix 0 * infix 0 + -- | @parser % n@ repeats /parser/ exactly /n/ times. (%) :: (Match match result) => match -> Int -> Pattern parser % n | n <= 0 = empty | n > 0 = parser & parser % n .- 1 -- | @decision ^ (option \/ option \/ ...)@ provides a /decision/ name to the -- choice about to be made, to allow to @commit@ to it. (^) :: (Match match result) => String -> match -> Parser result decision ^ parser = decide decision parser -- | @parser ! decision@ commits to /decision/ after successfully matching the -- /parser/. (!) :: (Match match result) => match -> String -> Pattern parser ! decision = parser & commit decision -- | @parser ?! decision@ commits to /decision/ if the current position matches -- /parser/, without consuming any characters. (?!) :: (Match match result) => match -> String -> Pattern parser ?! decision = peek parser & commit decision -- | @parser <% n@ matches fewer than /n/ occurrences of /parser/. (<%) :: (Match match result) => match -> Int -> Pattern parser <% n | n < 1 = failed "Fewer than 0 repetitions" | n == 1 = reject parser Nothing | n > 1 = parser & parser <% n .- 1 / empty -- | @parser - rejected@ matches /parser/, except if /rejected/ matches at this -- point. (-) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1 parser - rejected = reject rejected Nothing & parser -- | @before & after@ parses /before/ and, if it succeeds, parses /after/. This -- basically invokes the monad's @>>=@ method. (&) :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result2 before & after = (match before) >> (match after) -- | @first \/ second@ tries to parse /first/, and failing that parses /second/, -- unless /first/ has committed in which case is fails immediately. To achieve -- streaming, we need to construct the final state manually, passing the -- commitments through \"regardless\" of the results of either parsers. This -- allows our caller to start consuming tokens from a committed option before -- it is done parsing. -- -- This code seems to hang on to the old states for too long a time, causing -- memory usage to grow up when creating long parser chains (e.g. using @*@). (/) :: (Match match1 result, Match match2 result) => match1 -> match2 -> Parser result first / second = Parser (\ originalState -> let decision = originalState|>sDecision originalTokens = originalState|>sTokens originalCommits = originalState|>sCommits originalConsumed = originalState|>sConsumed nestState = setDecision "n/a" $ setTokens D.empty $ setCommits D.empty $ setConsumed False $ originalState (firstState, firstResult) = parse first nestState (firstIsCommitted, firstCommits) = commitStatus "n/a" decision $ D.toList $ firstState|>sCommits (chosenState, chosenResult, chosenCommits) = if firstIsCommitted then (firstState, firstResult, firstCommits) else case firstResult of Just _ -> (firstState, firstResult, firstCommits) Nothing -> let (secondState, secondResult) = parse second nestState secondCommits = snd $ commitStatus "n/a" decision $ D.toList $ secondState|>sCommits in (secondState, secondResult, secondCommits) chosenTokens = chosenState|>sTokens chosenConsumed = chosenState|>sConsumed finalState = setTokens (D.append originalTokens chosenTokens) $ setCommits (D.append originalCommits chosenCommits) $ setConsumed (originalConsumed || chosenConsumed) $ setDecision decision $ chosenState in (finalState, chosenResult)) where commitStatus :: String -> String -> [String] -> (Bool, D.DList String) commitStatus prev decision [] = (False, D.empty) commitStatus prev decision (name:rest) | name == decision = (True, snd $ commitStatus prev decision rest) | name == prev = (True, snd $ commitStatus prev decision rest) | otherwise = (True, D.cons name $ snd $ commitStatus name decision rest) -- | @(parser ?)@ (optional) tries to match /parser/, otherwise does nothing. (?) :: (Match match result) => match -> Pattern (?) parser = non_empty parser & empty / empty -- | @(parser *)@ matches zero or more occurrences of /parser/. (*) :: (Match match result) => match -> Pattern (*) parser = "*" ^ ( non_empty parser ! "*" & (parser *) / empty ) -- | @(parser +)@ matches one or more occurrences of /parser/. (+) :: (Match match result) => match -> Pattern (+) parser = non_empty parser & (parser *) -- ** Basic parsers -- | @traced name parser@ traces all invocations to the parser. Is only used when -- debugging. traced :: (Match match result, Show result) => String -> match -> Parser result traced name parser = Parser (\ state -> trace_reply name (parse parser (trace_call name state))) -- | @trace_call name state@ traces the /state/ at the start of the call to /name/. trace_call :: String -> State -> State trace_call name state = trace ("Call " ++ name ++ " with " ++ (show state)) state -- | @trace_call name reply@ traces the /reply/ from calling /name/. trace_reply :: Show result => String -> (State, Maybe result) -> (State, Maybe result) trace_reply name reply@(state, result) = case result of Nothing -> trace ("Fail " ++ name ++ " with " ++ (show state)) reply Just value -> trace ("Done " ++ name ++ " with " ++ (show state)) reply -- | @reject rejected name@ fails if /rejected/ matches at this point, and does -- nothing otherwise. If /name/ is provided, it is used in the error message, -- otherwise the messages uses the current character. reject :: (Match match result) => match -> Maybe String -> Pattern reject rejected name = Parser (\ state -> let (_, result) = parse rejected $ setIsPeek True state parser' = case (result, name) of (Nothing, _) -> empty (Just _, Nothing) -> consumeNextIf (const False) (Just _, Just text) -> failed $ "Unexpected " ++ text in parse parser' state) -- | @peek parser@ succeeds if /parser/ matches at this point, but does not -- consume any input. peek :: (Match match result) => match -> Parser result peek parser = Parser (\ state -> let (_, result) = parse parser $ setIsPeek True state in case result of Nothing -> (setFailed "Peek failed" state, Nothing) Just _ -> (state, result)) -- | @non_empty parser@ matches the same as /parser/ as long as it consumes some -- characters. non_empty :: (Match match result) => match -> Parser result non_empty parser = Parser (\ originalState -> let nestState = setConsumed False originalState (parseState, parseResult) = parse parser nestState in case (parseState|>sConsumed, parseResult) of (True, _) -> (parseState, parseResult) (False, Nothing) -> (parseState, parseResult) (False, Just _) -> (setFailed "Parser does not consume characters" originalState, Nothing)) -- | @empty@ always matches without consuming any input. empty :: Pattern empty = return () -- | @setFailed state message@ sets the @sMessage@ field to the error /message/. setFailed :: String -> State -> State setFailed message state = setMessage (Just $ state|>sName ++ ": line " ++ (show $ state|>sLine) ++ ": column " ++ (show $ state|>sColumn) ++ ": " ++ message) state -- | @failed message@ fails parsing with the specified /message/. Note this is -- not the 'Monad' @fail@ method, which we do not use. failed :: String -> Parser result failed message = Parser (\ state -> (setFailed message state, Nothing)) -- | @eof@ matches the end of the input. eof :: Pattern eof = Parser (\ state -> if state|>sInput == [] then (state, Just ()) else (setFailed "Expected end of input" state, Nothing)) -- | @sol@ matches the start of a line. sol :: Pattern sol = Parser (\ state -> if state|>sColumn == 0 then (state, Just ()) else (setFailed "Expected start of line" state, Nothing)) -- ** State manipulation pseudo-parsers -- | @incrLine@ increments @sLine@ counter resets @sColumn@. nextLine :: Pattern nextLine = Parser (\ state -> (setLine (state|>sLine .+ 1) $ setColumn 0 $ state, Just ())) -- | @with setField getField value parser@ invokes the specified /parser/ with -- the value of the specified field set to /value/ for the duration of the -- invocation, using the /setField/ and /getField/ functions to manipulate it. with :: (value -> State -> State) -> (State -> value) -> value -> Parser result -> Parser result with setField getField value parser = Parser (\ originalState -> let originalValue = getField originalState withState = setField value originalState (parseState, parseResult) = parse parser withState finalState = setField originalValue parseState in (finalState, parseResult)) -- | @decide name (a / b / ...)@ names the contained decision point so it can be -- addressed by later @commit@ calls. decide :: (Match match result) => String -> match -> Parser result decide decision parser = with setDecision sDecision decision $ match parser -- | @commit name@ commits the parser to all the decisions up to the most recent -- containing decision with the specified /name/. This makes all tokens -- generated in this parsing path immediately available to the caller. commit :: String -> Pattern commit name = Parser (\ state -> (setCommits (D.snoc (state|>sCommits) name) state, Just ())) -- | @parser ``forbidding`` pattern@ parses the specified /parser/ ensuring -- that it does not contain anything matching the /forbidden/ parser. forbidding :: (Match match1 result1, Match match2 result2) => match1 -> match2 -> Parser result1 forbidding parser forbidden = with setForbidden sForbidden (Just $ forbidden & empty) (match parser) -- | @parser ``limitedTo`` limit@ parses the specified /parser/ -- ensuring that it does not consume more than the /limit/ input chars. limitedTo :: (Match match result) => match -> Int -> Parser result limitedTo parser limit = with setLimit sLimit limit (match parser) -- ** Consuming input characters -- | @consumeNextIf test@ consumes and returns the next character if it satisfies -- the /test/. consumeNextIf :: (Char -> Bool) -> Pattern consumeNextIf test = Parser (\ state -> case state|>sInput of (char:rest) | test char -> (setInput rest $ setLast char $ setColumn (state|>sColumn .+ 1) $ setChars (D.snoc (state|>sChars) char) $ setConsumed True $ state, Just ()) | otherwise -> (setFailed ("Unexpected '" ++ [char] ++ "'") state, Nothing) [] -> (setFailed "Unexpected end of input" state, Nothing)) -- | @limitedNextIf test@ fails if the 'State' lookahead limit is reached. -- Otherwise it consumes and returns the specified input char if it satisfies -- /test/. limitedNextIf :: (Char -> Bool) -> Pattern limitedNextIf test = Parser (\ state -> case state|>sLimit of -1 -> parse (consumeNextIf test) state 0 -> (setFailed "Lookahead limit reached" state, Nothing) limit -> parse (consumeNextIf test) $ setLimit (limit .- 1) state) -- | @nextIf test@ fails if the current position matches the 'State' forbidden -- pattern or if the 'State' lookahead limit is reached. Otherwise it consumes -- (and buffers) the next input char if it satisfies /test/. nextIf :: (Char -> Bool) -> Pattern nextIf test = Parser (\ state -> case state|>sForbidden of Nothing -> parse (limitedNextIf test) state Just parser -> let (_, result) = parse parser $ setIsPeek True $ setForbidden Nothing $ state in case result of Nothing -> parse (limitedNextIf test) state Just _ -> (setFailed "Forbidden pattern detected" state, Nothing)) -- ** Producing tokens -- | @finishToken@ places all collected text into a new token and begins a new -- one, or does nothing if there are no collected characters. finishToken :: Pattern finishToken = Parser (\ state -> let chars = D.toList $ state|>sChars token = Token { tCode = state|>sCode, tText = Just chars } in if state|>sIsPeek || chars == "" then (state, Just ()) else (setTokens (D.snoc (state|>sTokens) token) $ setChars D.empty $ state, Just ())) -- | @wrap parser@ invokes the /parser/, ensures any unclaimed input characters -- are wrapped into a token (only happens when testing productions), ensures no -- input is left unparsed, and returns the parser's result. wrap :: (Match match result) => match -> Parser result wrap parser = do result <- match parser finishToken & eof return result -- | @consume parser@ invokes the /parser/ and then consumes all remaining -- unparsed input characters. consume :: (Match match result) => match -> Parser result consume parser = do result <- match parser finishToken & clear_input return result where clear_input = Parser (\ state -> (setInput [] state, Just ())) -- | @token code parser@ places all text matched by /parser/ into a 'Token' with -- the specified /code/ (unless it is empty). Note it collects the text even if -- there is an error. token :: (Match match result) => Code -> match -> Pattern token code parser = finishToken & with setCode sCode code (parser & finishToken) -- | @fake code text@ creates a token with the specified /code/ and \"fake\" -- /text/ characters. fake :: Code -> String -> Pattern fake code text = Parser (\ state -> if state|>sIsPeek then (state, Just()) else let token = Token { tCode = code, tText = Just text } in (setTokens (D.snoc (state|>sTokens) token) state, Just ())) -- | @meta parser@ collects the text matched by the specified /parser/ into a -- | @Meta@ token. meta :: (Match match result) => match -> Pattern meta parser = token Meta parser -- | @indicator code@ collects the text matched by the specified /parser/ into an -- @Indicator@ token. indicator :: (Match match result) => match -> Pattern indicator parser = token Indicator $ parser -- | @text parser@ collects the text matched by the specified /parser/ into a -- @Text@ token. text :: (Match match result) => match -> Pattern text parser = token Text parser -- | @nest code@ returns an empty token with the specified begin\/end /code/ to -- signal nesting. nest :: Code -> Pattern nest code = Parser (\ state -> if state|>sIsPeek then (state, Just()) else let (state', _) = parse finishToken state token = Token { tCode = code, tText = Nothing } in (setTokens (D.snoc (state'|>sTokens) token) state', Just ())) -- * Production parameters -- | Production context. data Context = BlockOut -- ^ Outside block mapping. | BlockIn -- ^ Inside block mapping. | FlowOut -- ^ Outside flow collection. | FlowIn -- ^ Inside flow collection. | FlowKey -- ^ Inside flow key. -- | @show context@ converts a 'Context' to a 'String'. instance Show Context where show context = case context of BlockOut -> "block-out" BlockIn -> "block-in" FlowOut -> "flow-out" FlowIn -> "flow-in" FlowKey -> "flow-key" -- | @read context@ converts a 'String' to a 'Context'. We trust our callers to -- convert any @-@ characters into @_@ to allow the built-in @lex@ function to -- handle the names as single identifiers. instance Read Context where readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ] where r word = case word of "block_out" -> BlockOut "block_in" -> BlockIn "flow_out" -> FlowOut "flow_in" -> FlowIn "flow_key" -> FlowKey _ -> error $ "unknown context: " ++ word -- | Scalar style. data Style = Plain -- ^ Plain scalar. | Double -- ^ Double quoted. | Single -- ^ Single quoted. | Literal -- ^ Literal block. | Folded -- ^ Folded block. -- | @show style@ converts a 'Style' to a 'String'. instance Show Style where show style = case style of Plain -> "plain" Double -> "double" Single -> "single" Literal -> "literal" Folded -> "folded" -- | @read style@ converts a 'String' to a 'Style'. instance Read Style where readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ] where r word = case word of "plain" -> Plain "double" -> Double "single" -> Single "literal" -> Literal "folded" -> Folded _ -> error $ "unknown style: " ++ word -- | Chomp method. data Chomp = Strip -- ^ Remove all trailing line breaks. | Clip -- ^ Keep first trailing line break. | Keep -- ^ Keep all trailing line breaks. -- | @show chomp@ converts a 'Chomp' to a 'String'. instance Show Chomp where show chomp = case chomp of Strip -> "strip" Clip -> "clip" Keep -> "keep" -- | @read chomp@ converts a 'String' to a 'Chomp'. instance Read Chomp where readsPrec _ text = [ ((r word), tail) | (word, tail) <- lex text ] where r word = case word of "strip" -> Strip "clip" -> Clip "keep" -> Keep _ -> error $ "unknown chomp: " ++ word -- * Tokenizers -- -- We encapsulate the 'Parser' inside a 'Tokenizer'. This allows us to hide the -- implementation details from our callers. -- | 'Tokenizer' converts a (named) input text into a list of 'Token'. Errors -- are reported as tokens. type Tokenizer = String -> C.ByteString -> [Token] -- | @patternTokenizer pattern@ converts the /pattern/ to a simple 'Tokenizer'. -- Note how using difference lists we can tuck the error token at the end of -- the result without preventing the streaming. patternTokenizer :: Pattern -> Tokenizer patternTokenizer pattern name input = let (state, result) = parse (wrap pattern) (initialState name input) last = case result of Just _ -> D.empty Nothing -> D.singleton Token { tCode = Error, tText = state|>sMessage } bugs = commitBugs $ D.toList $ state|>sCommits errors = D.append bugs last in D.toList $ D.append (state|>sTokens) errors -- | @parserTokenizer what parser@ converts the /parser/ returning /what/ to a -- simple 'Tokenizer' (only used for tests). Note how using difference lists we -- can tuck the result or error token at the end of the result without -- preventing the streaming. parserTokenizer :: (Show result, Match match result) => String -> match -> Tokenizer parserTokenizer what parser name input = let (state, result) = parse (wrap parser) (initialState name input) last = case result of Just value -> Token { tCode = Detected, tText = Just $ what ++ "=" ++ (show value) } Nothing -> Token { tCode = Error, tText = state|>sMessage } bugs = commitBugs $ D.toList $ state|>sCommits errors = D.snoc bugs last in D.toList $ D.append (state|>sTokens) errors -- | @commitBugs commits@ converts any @commit@ calls made outside the decision -- they refer to into an error token. No such calls should exists outside -- tests. commitBugs :: [String] -> D.DList Token commitBugs [] = D.empty commitBugs (decision:rest) = D.cons Token { tCode = Error, tText = Just $ "Commit to '" ++ decision ++ "' was made outside it" } $ commitBugs rest -- | @yaml name input@ converts the Unicode /input/ (called /name/ in error -- messages) to a list of 'Token' according to the YAML spec. This is it! yaml :: Tokenizer yaml = patternTokenizer l_yaml_stream -- CPP LINES CAUSE HADDOCK TO BARF -- #ifdef REAL_CPP -- This is how non-ancient C pre-processor do it. #define STR(X) #X #else -- This only works in GHC's simplistic GHC (and ancient C pre-processors). #define STR(X) "X" #endif -- These allow us to avoid repeating the parser names. #define PAT(PATTERN) pat STR(PATTERN) PATTERN #define PAR(PARSER) par STR(PARSER) PARSER #define PAC(PARSER) pac STR(PARSER) PARSER -- CPP LINES CAUSE HADDOCK TO BARF -- -- | @pName name@ converts a parser name to the \"proper\" spec name. pName :: String -> String pName name = regexSub questionRegex "?" $ regexSub minusRegex "-" $ regexSub plusRegex "+" name where regexSub regex value text = subRegex regex text value questionRegex = mkRegex "'" minusRegex = mkRegex "_" plusRegex = mkRegex "__" -- | @tokenizers@ returns a mapping from a production name to a production -- tokenizer. tokenizers :: Map.Map String Tokenizer tokenizers = PAR(c_chomping_indicator) "t" $ PAC(detect_inline_indentation) "m" $ PAT(debug_leak) $ PAT(b_as_line_feed) $ PAT(b_carriage_return) $ PAT(b_char) $ PAT(b_generic) $ PAT(b_ignored_any) $ PAT(b_ignored_generic) $ PAT(b_l_folded_as_space) $ PAT(b_line_feed) $ PAT(b_line_separator) $ PAT(b_next_line) $ PAT(b_normalized) $ PAT(b_paragraph_separator) $ PAT(b_specific) $ PAT(c_alias) $ PAT(c_anchor) $ PAT(c_byte_order_mark) $ PAT(c_collect_entry) $ PAT(c_comment) $ PAT(c_directive) $ PAT(c_document_end) $ PAT(c_document_start) $ PAT(c_double_quote) $ PAT(c_escape) $ PAT(c_flow_indicator) $ PAT(c_folded) $ PAT(c_indicator) $ PAT(c_literal) $ PAT(c_mapping_end) $ PAT(c_mapping_key) $ PAT(c_mapping_start) $ PAT(c_mapping_value) $ PAT(c_named_tag_handle) $ PAT(c_nb_comment_text) $ PAT(c_non_specific_tag) $ PAT(c_ns_alias) $ PAT(c_ns_anchor_property) $ PAT(c_ns_local_tag_prefix) $ PAT(c_ns_properties) $ PAT(c_ns_property) $ PAT(c_ns_shorthand_tag) $ PAT(c_ns_tag_property) $ PAT(c_primary_tag_handle) $ PAT(c_printable) $ PAT(c_quoted_quote) $ PAT(c_reserved) $ PAT(c_secondary_tag_handle) $ PAT(c_sequence_end) $ PAT(c_sequence_entry) $ PAT(c_sequence_start) $ PAT(c_single_quote) $ PAT(c_s_simple_json_key) $ PAT(c_tag) $ PAT(c_tag_handle) $ PAT(c_verbatim_tag) $ PAT(e_node) $ PAT(e_no_document) $ PAT(e_scalar) $ PAT(l_comment) $ PAT(l_directive) $ PAT(l_document_prefix) $ PAT(l_documents) $ PAT(l_document_suffix) $ PAT(l_explicit_document) $ PAT(l_following_document) $ PAT(l_forbidden) $ PAT(l_implicit_document) $ PAT(l_leading_document) $ PAT(l_yaml_stream) $ PAT(nb_char) $ PAT(nb_double_char) $ PAT(nb_single_char) $ PAT(ns_anchor_char) $ PAT(ns_anchor_name) $ PAT(ns_ascii_letter) $ PAT(ns_char) $ PAT(ns_dec_digit) $ PAT(ns_directive_name) $ PAT(ns_directive_parameter) $ PAT(ns_double_char) $ PAT(ns_esc_16_bit) $ PAT(ns_esc_32_bit) $ PAT(ns_esc_8_bit) $ PAT(ns_esc_backslash) $ PAT(ns_esc_backspace) $ PAT(ns_esc_bell) $ PAT(ns_esc_carriage_return) $ PAT(ns_esc_char) $ PAT(ns_esc_double_quote) $ PAT(ns_esc_escape) $ PAT(ns_esc_form_feed) $ PAT(ns_esc_horizontal_tab) $ PAT(ns_esc_line_feed) $ PAT(ns_esc_line_separator) $ PAT(ns_esc_next_line) $ PAT(ns_esc_non_breaking_space) $ PAT(ns_esc_null) $ PAT(ns_esc_paragraph_separator) $ PAT(ns_esc_space) $ PAT(ns_esc_vertical_tab) $ PAT(ns_global_tag_prefix) $ PAT(ns_hex_digit) $ PAT(ns_plain_safe_in) $ PAT(ns_plain_safe_out) $ PAT(ns_reserved_directive) $ PAT(ns_s_block_map_implicit_key) $ PAT(ns_single_char) $ PAT(ns_s_simple_yaml_key) $ PAT(ns_tag_char) $ PAT(ns_tag_directive) $ PAT(ns_tag_prefix) $ PAT(ns_uri_char) $ PAT(ns_word_char) $ PAT(ns_yaml_directive) $ PAT(ns_yaml_version) $ PAT(s_b_comment) $ PAT(s_b_double_escaped) $ PAT(s_l_comments) $ PAT(s_ns_double_chars) $ PAT(s_ns_single_chars) $ PAT(s_separate_in_line) $ PAT(s_space) $ PAT(s_tab) $ PAT(s_white) $ Map.empty where pat name pattern = Map.insert (pName name) $ patternTokenizer (match pattern) par name parser what = Map.insert (pName name) $ parserTokenizer what (match parser) pac name parser what = Map.insert (pName name) $ parserTokenizer what (consume parser) -- | @tokenizer name@ converts the production with the specified /name/ to a -- simple 'Tokenizer', or @Nothing@ if it isn't known. tokenizer :: String -> (Maybe Tokenizer) tokenizer name = Map.lookup name tokenizers -- | @tokenizersWithN@ returns a mapping from a production name to a production -- tokenizer (that takes an /n/ argument). tokenizersWithN :: Map.Map String (Int -> Tokenizer) tokenizersWithN = PAR(c_indentation_indicator) "m" $ PAR(count_spaces) "m" $ PAC(detect_collection_indentation) "m" $ PAC(detect_scalar_indentation) "m" $ PAT(b_l_literal_next) $ PAT(b_l_spaced) $ PAT(c_l_block_map_explicit_entry) $ PAT(c_l_block_map_explicit_key) $ PAT(c_l_block_map_implicit_value) $ PAT(c_l_block_seq_entry) $ PAT(c_l__folded) $ PAT(c_l__literal) $ PAT(l_block_map_explicit_value) $ PAT(l__block_mapping) $ PAT(l__block_sequence) $ PAT(l_keep_empty) $ PAT(l_nb_folded_lines) $ PAT(l_nb_literal_chars) $ PAT(l_nb_spaced_lines) $ PAT(l_nb_start_with_any) $ PAT(l_nb_start_with_folded) $ PAT(l_nb_start_with_spaced) $ PAT(l_strip_empty) $ PAT(l_trail_comments) $ PAT(ns_l_block_map_entry) $ PAT(ns_l_block_map_implicit_entry) $ PAT(ns_l_in_line_mapping) $ PAT(ns_l_in_line_sequence) $ PAT(s_double_multi) $ PAT(s_ignored_prefix_block) $ PAT(s_ignored_prefix_flow) $ PAT(s_indent) $ PAT(s_indent_le) $ PAT(s_indent_lt) $ PAT(s_l_double_any) $ PAT(s_l_flow_folded) $ PAT(s_l__flow_in_block) $ PAT(s_nb_folded_text) $ PAT(s_nb_spaced_text) $ PAT(s_ns_double_next) $ PAT(s_ns_single_next) $ PAT(s_separate_lines) $ PAT(s_single_multi) $ Map.empty where pat name pattern = Map.insert (pName name) (\ n -> patternTokenizer (match $ pattern n)) par name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (match $ parser n)) pac name parser what = Map.insert (pName name) (\ n -> parserTokenizer what (consume $ parser n)) -- | @tokenizerWithN name n@ converts the production (that requires an /n/ -- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if -- it isn't known. tokenizerWithN :: String -> Int -> Maybe Tokenizer tokenizerWithN name n = case Map.lookup name tokenizersWithN of Just tokenizer -> Just $ tokenizer n Nothing -> Nothing -- | @tokenizersWithC@ returns a mapping from a production name to a production -- tokenizer (that takes a /c/ argument). tokenizersWithC :: Map.Map String (Context -> Tokenizer) tokenizersWithC = PAT(nb_plain_char) $ PAT(ns_plain_char) $ PAT(ns_plain_first) $ PAT(ns_plain_safe) $ PAT(ns_plain_single) $ PAT(s_ns_plain_chars) $ Map.empty where pat name pattern = Map.insert (pName name) (\ c -> patternTokenizer (match $ pattern c)) -- | @tokenizerWithC name c@ converts the production (that requires a /c/ -- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if -- it isn't known. tokenizerWithC :: String -> Context -> Maybe Tokenizer tokenizerWithC name c = case Map.lookup name tokenizersWithC of Just tokenizer -> Just $ tokenizer c Nothing -> Nothing -- | @tokenizersWithS@ returns a mapping from a production name to a production -- tokenizer (that takes a /s/ argument). tokenizersWithS :: Map.Map String (Style -> Tokenizer) tokenizersWithS = PAT(c_style_indicator) $ Map.empty where pat name pattern = Map.insert (pName name) (\ s -> patternTokenizer (match $ pattern s)) -- | @tokenizerWithS name s@ converts the production (that requires an /s/ -- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if -- it isn't known. tokenizerWithS :: String -> Style -> Maybe Tokenizer tokenizerWithS name s = case Map.lookup name tokenizersWithS of Just tokenizer -> Just $ tokenizer s Nothing -> Nothing -- | @tokenizersWithT@ returns a mapping from a production name to a production -- tokenizer (that takes a /t/ argument). tokenizersWithT :: Map.Map String (Chomp -> Tokenizer) tokenizersWithT = PAT(b_chomped_last) $ Map.empty where pat name pattern = Map.insert (pName name) (\ t -> patternTokenizer (match $ pattern t)) -- | @tokenizerWithT name t@ converts the production (that requires an /t/ -- argument) with the specified /name/ to a simple 'Tokenizer', or @Nothing@ if -- it isn't known. tokenizerWithT :: String -> Chomp -> Maybe Tokenizer tokenizerWithT name t = case Map.lookup name tokenizersWithT of Just tokenizer -> Just $ tokenizer t Nothing -> Nothing -- | @tokenizersWithNC@ returns a mapping from a production name to a -- production tokenizer (that requires /n/ and /c/ arguments). tokenizersWithNC :: Map.Map String (Int -> Context -> Tokenizer) tokenizersWithNC = PAT(c_double_quoted) $ PAT(c_flow_json_content) $ PAT(c_flow_json_node) $ PAT(c_flow_mapping) $ PAT(c_flow_sequence) $ PAT(c_ns_flow_map_adjacent_value) $ PAT(c_ns_flow_map_implicit_json) $ PAT(c_ns_flow_map_separate_value) $ PAT(c_ns_flow_map_single_json) $ PAT(c_single_quoted) $ PAT(nb_double_text) $ PAT(nb_single_text) $ PAT(ns_flow_content) $ PAT(ns_flow_map_entry) $ PAT(ns_flow_map_implicit_empty) $ PAT(ns_flow_map_implicit_entry) $ PAT(ns_flow_map_implicit_yaml) $ PAT(ns_flow_map_single_entry) $ PAT(ns_flow_map_single_pair) $ PAT(ns_flow_map_single_yaml) $ PAT(ns_flow_node) $ PAT(ns_flow_seq_entry) $ PAT(ns_flow_yaml_content) $ PAT(ns_flow_yaml_node) $ PAT(ns_plain) $ PAT(ns_plain_multi) $ PAT(ns_s_flow_map_entries) $ PAT(ns_s_flow_seq_entries) $ PAT(s_l__block_content) $ PAT(s_l__block_in_block) $ PAT(s_l__block_indented) $ PAT(s_l__block_node) $ PAT(s_ns_flow_map_explicit_empty) $ PAT(s_ns_flow_map_explicit_entry) $ PAT(s_ns_flow_map_explicit_json) $ PAT(s_ns_flow_map_explicit_yaml) $ PAT(s_ns_plain_next) $ PAT(s_separate_in_flow) $ Map.empty where pat name pattern = Map.insert (pName name) (\ n c -> patternTokenizer (match $ pattern n c)) -- | @tokenizerWithNC name n c@ converts the production (that requires /n/ and -- /c/ arguments) with the specified /name/ to a simple 'Tokenizer', or -- @Nothing@ if it isn't known. tokenizerWithNC :: String -> Int -> Context -> Maybe Tokenizer tokenizerWithNC name n c = case Map.lookup name tokenizersWithNC of Just tokenizer -> Just $ tokenizer n c Nothing -> Nothing -- | @tokenizersWithNS@ returns a mapping from a production name to a production -- tokenizer (that requires /n/ and /s/ arguments). tokenizersWithNS :: Map.Map String (Int -> Style -> Tokenizer) tokenizersWithNS = PAR(c_b__block_header) "(m,t)" $ PAT(b_l_folded_any) $ PAT(b_l_folded_specific) $ PAT(b_l_folded_trimmed) $ PAT(l_empty) $ PAT(s_ignored_prefix) $ Map.empty where pat name pattern = Map.insert (pName name) (\ n s -> patternTokenizer (match $ pattern n s)) par name parser what = Map.insert (pName name) (\ n s -> parserTokenizer what (match $ parser n s)) -- | @tokenizerWithNS name n s@ converts the production (that requires /n/ and -- /s/ arguments) with the specified /name/ to a simple 'Tokenizer', or -- @Nothing@ if it isn't known. tokenizerWithNS :: String -> Int -> Style -> Maybe Tokenizer tokenizerWithNS name n s = case Map.lookup name tokenizersWithNS of Just tokenizer -> Just $ tokenizer n s Nothing -> Nothing -- | @tokenizersWithNT@ returns a mapping from a production name to a -- production tokenizer (that requires /n/ and /t/ arguments). tokenizersWithNT :: Map.Map String (Int -> Chomp -> Tokenizer) tokenizersWithNT = PAT(l_chomped_empty) $ PAT(l_folded_content) $ PAT(l_literal_content) $ Map.empty where pat name pattern = Map.insert (pName name) (\ n t -> patternTokenizer (match $ pattern n t)) -- | @tokenizerWithNT name n t@ converts the production (that requires /n/ and -- /t/ arguments) with the specified /name/ to a simple 'Tokenizer', or -- @Nothing@ if it isn't known. tokenizerWithNT :: String -> Int -> Chomp -> Maybe Tokenizer tokenizerWithNT name n t = case Map.lookup name tokenizersWithNT of Just tokenizer -> Just $ tokenizer n t Nothing -> Nothing -- | @tokenizerNames@ returns the list of all productions (tokenizers). tokenizerNames :: [String] tokenizerNames = (Map.keys tokenizers) ++ (Map.keys tokenizersWithN) ++ (Map.keys tokenizersWithC) ++ (Map.keys tokenizersWithS) ++ (Map.keys tokenizersWithT) ++ (Map.keys tokenizersWithNC) ++ (Map.keys tokenizersWithNS) ++ (Map.keys tokenizersWithNT) -- * Productions -- ** BNF compatibility helpers -- | @detect_utf_encoding@ doesn't actually detect the encoding, we just call it -- this way to make the productions compatible with the spec. Instead it simply -- reports the encoding (which was already detected when we started parsing). detect_utf_encoding = Parser (\ state -> let text = case state|>sEncoding of UTF8 -> "TF8" UTF16LE -> "TF16LE" UTF16BE -> "TF16BE" in parse (fake Bom text) $ setChars D.empty $ setColumn (state|>sColumn .- 1) $ state) -- | @na@ is the \"non-applicable\" indentation value. We use Haskell's laziness -- to verify it really is never used. na :: Int na = error "Accessing non-applicable indentation" -- | @asInteger@ returns the last consumed character, which is assumed to be a -- decimal digit, as an integer. asInteger :: Parser Int asInteger = Parser (\ state -> (state, Just $ ord (state|>sLast) .- 48)) -- | @result value@ is the same as /return value/ except that we give the -- Haskell type deduction the additional boost it needs to figure out this is -- wrapped in a 'Parser'. result :: result -> Parser result result = return -- CPP LINES CAUSE HADDOCK TO BARF -- #include "Reference.bnf" -- CPP LINES CAUSE HADDOCK TO BARF -- debug_leak = ( c_comment & b_line_feed *)