{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE Safe #-} -- | -- Copyright: © Herbert Valerio Riedel 2015-2018 -- SPDX-License-Identifier: GPL-2.0-or-later -- -- Event-stream oriented YAML parsing API module Data.YAML.Event ( parseEvents , EvStream , Event(..) , Style(..) , Tag, untagged, isUntagged, tagToText, mkTag , Anchor , Pos(..) ) where import qualified Data.ByteString.Lazy as BS.L import Data.Char import qualified Data.Map as Map import qualified Data.Text as T import qualified Data.YAML.Token as Y import Numeric (readHex) import Util -- TODO: consider also non-essential attributes -- | YAML Event Types -- -- The events correspond to the ones from [LibYAML](http://pyyaml.org/wiki/LibYAML) -- -- The grammar below defines well-formed streams of 'Event's: -- -- @ -- stream ::= 'StreamStart' document* 'StreamEnd' -- document ::= 'DocumentStart' node 'DocumentEnd' -- node ::= 'Alias' -- | 'Scalar' -- | sequence -- | mapping -- sequence ::= 'SequenceStart' node* 'SequenceEnd' -- mapping ::= 'MappingStart' (node node)* 'MappingEnd' -- @ data Event = StreamStart | StreamEnd | DocumentStart !Bool | DocumentEnd !Bool | Alias !Anchor | Scalar !(Maybe Anchor) !Tag !Style !Text | SequenceStart !(Maybe Anchor) !Tag | SequenceEnd | MappingStart !(Maybe Anchor) !Tag | MappingEnd deriving (Show, Eq) -- | YAML Anchor identifiers type Anchor = Text -- | YAML Tags newtype Tag = Tag (Maybe Text) deriving (Eq,Ord) instance Show Tag where show (Tag x) = show x -- | Convert 'Tag' to its string representation -- -- Returns 'Nothing' for 'untagged' tagToText :: Tag -> Maybe T.Text tagToText (Tag x) = x -- | An \"untagged\" YAML tag untagged :: Tag untagged = Tag Nothing -- | Equivalent to @(== 'untagged')@ isUntagged :: Tag -> Bool isUntagged (Tag Nothing) = True isUntagged _ = False -- | Construct YAML tag mkTag :: String -> Tag mkTag "" = error "mkTag" mkTag "!" = Tag (Just $! T.pack "!") mkTag s = Tag (Just $! tagUnescape s) where tagUnescape = T.pack . go where go [] = [] go ('%':h:l:cs) | Just c <- decodeL1 [h,l] = c : go cs go (c:cs) = c : go cs mkTag' :: String -> Tag mkTag' "" = error "mkTag'" mkTag' s = Tag (Just $! T.pack s) mkTag'' :: String -> Tag mkTag'' "" = error "mkTag''" mkTag'' s = Tag (Just $! T.pack ("tag:yaml.org,2002:" ++ s)) -- | Event stream produced by 'parseEvents' -- -- A 'Left' value denotes parsing errors. The event stream ends -- immediately once a 'Left' value is returned. type EvStream = [Either (Pos,String) Event] -- | Position in parsed YAML source data Pos = Pos { posByteOffset :: !Int -- ^ 0-based byte offset , posCharOffset :: !Int -- ^ 0-based character (Unicode code-point) offset , posLine :: !Int -- ^ 1-based line number , posColumn :: !Int -- ^ 0-based character (Unicode code-point) column number } deriving Show tok2pos :: Y.Token -> Pos tok2pos Y.Token { Y.tByteOffset = posByteOffset, Y.tCharOffset = posCharOffset, Y.tLine = posLine, Y.tLineChar = posColumn } = Pos {..} -- | 'Scalar' node style data Style = Plain | SingleQuoted | DoubleQuoted | Literal | Folded deriving (Eq,Ord,Show) -- internal type TagHandle = Text type Props = (Maybe Text,Tag) getHandle :: [Y.Token] -> Maybe (TagHandle,[Y.Token]) getHandle toks0 = do Y.Token { Y.tCode = Y.BeginHandle } : toks1 <- Just toks0 (hs,Y.Token { Y.tCode = Y.EndHandle } : toks2) <- Just $ span (\Y.Token { Y.tCode = c } -> c `elem` [Y.Indicator,Y.Meta]) toks1 pure (T.pack $ concatMap Y.tText hs, toks2) getUriTag :: [Y.Token] -> Maybe (Text,[Y.Token]) getUriTag toks0 = do Y.Token { Y.tCode = Y.BeginTag } : toks1 <- Just toks0 (hs,Y.Token { Y.tCode = Y.EndTag } : toks2) <- Just $ span (\Y.Token { Y.tCode = c } -> c `elem` [Y.Indicator,Y.Meta]) toks1 pure (T.pack $ concatMap Y.tText hs, toks2) {- WARNING: the code that follows will make you cry; a safety pig is provided below for your benefit. _ _._ _..._ .-', _.._(`)) '-. ` ' /-._.-' ',/ ) \ '. / _ _ | \ | a a / | \ .-. ; '-('' ).-' ,' ; '-; | .' \ \ / | 7 .__ _.-\ \ | | | ``/ /` / /,_| | /,_/ / /,_/ '`-' -} -- | Parse YAML 'Event's from a lazy 'BS.L.ByteString'. -- -- The input 'BS.L.ByteString' is expected to have a YAML 1.2 stream -- using the UTF-8, UTF-16 (LE or BE), or UTF-32 (LE or BE) encodings -- (which will be auto-detected). parseEvents :: BS.L.ByteString -> EvStream parseEvents = \bs0 -> Right StreamStart : (go0 mempty $ stripComments $ filter (not . isWhite) $ Y.tokenize bs0 False) where isTCode tc = (== tc) . Y.tCode skipPast tc (t : ts) | isTCode tc t = ts | otherwise = skipPast tc ts skipPast _ [] = error "the impossible happened" -- non-content whitespace isWhite :: Y.Token -> Bool isWhite (Y.Token { Y.tCode = Y.Bom }) = True -- BOMs can occur at each doc-start! isWhite (Y.Token { Y.tCode = Y.White }) = True isWhite (Y.Token { Y.tCode = Y.Indent }) = True isWhite (Y.Token { Y.tCode = Y.Break }) = True isWhite _ = False goDir :: Map TagHandle Text -> [Y.Token] -> EvStream goDir m (Y.Token { Y.tCode = Y.Indicator, Y.tText = "%" } : Y.Token { Y.tCode = Y.Meta, Y.tText = "YAML" } : Y.Token { Y.tCode = Y.Meta } : Y.Token { Y.tCode = Y.EndDirective } : rest) = go0 m rest goDir m (Y.Token { Y.tCode = Y.Indicator, Y.tText = "%" } : Y.Token { Y.tCode = Y.Meta, Y.tText = "TAG" } : rest) | Just (h, rest') <- getHandle rest , Just (t, rest'') <- getUriTag rest' = go0 (Map.insert h t m) (skipPast Y.EndDirective rest'') goDir m (Y.Token { Y.tCode = Y.Indicator, Y.tText = "%" } : Y.Token { Y.tCode = Y.Meta, Y.tText = l } : rest) | l `notElem` ["TAG","YAML"] = go0 m (skipPast Y.EndDirective rest) goDir _ xs = err xs go0 :: Map.Map TagHandle Text -> Tok2EvStream go0 _ [] = [Right StreamEnd] go0 _ (Y.Token { Y.tCode = Y.White } : _) = error "the impossible happened" go0 m (Y.Token { Y.tCode = Y.Indicator } : rest) = go0 m rest -- ignore indicators here go0 m (Y.Token { Y.tCode = Y.DirectivesEnd } : rest) = go0 m rest go0 m (Y.Token { Y.tCode = Y.BeginDocument } : Y.Token { Y.tCode = Y.DirectivesEnd } : rest) = Right (DocumentStart True) : go0 m rest -- hack go0 m (Y.Token { Y.tCode = Y.BeginDocument } : rest@(Y.Token { Y.tCode = Y.BeginDirective } : _)) = Right (DocumentStart True) : go0 m rest -- hack go0 m (Y.Token { Y.tCode = Y.BeginDocument } : rest) = Right (DocumentStart False) : go0 m rest go0 _ (Y.Token { Y.tCode = Y.EndDocument } : Y.Token { Y.tCode = Y.DocumentEnd } : rest) = Right (DocumentEnd True) : go0 mempty rest go0 _ (Y.Token { Y.tCode = Y.EndDocument } : rest) = Right (DocumentEnd False) : go0 mempty rest go0 m (Y.Token { Y.tCode = Y.DocumentEnd } : rest) = go0 m rest -- should not occur go0 m (Y.Token { Y.tCode = Y.BeginNode } : rest) = goNode0 m rest (go0 m) go0 m (Y.Token { Y.tCode = Y.BeginDirective } : rest) = goDir m rest go0 _ xs = err xs err :: Tok2EvStream err (tok@Y.Token { Y.tCode = Y.Error, Y.tText = msg } : _) = [Left (tok2pos tok, msg)] err (tok@Y.Token { Y.tCode = Y.Unparsed, Y.tText = txt } : _) = [Left (tok2pos tok, ("Lexical error near " ++ show txt))] err (tok@Y.Token { Y.tCode = code } : _) = [Left (tok2pos tok, ("Parse failure near " ++ show code ++ " token"))] err [] = [Left ((Pos (-1) (-1) (-1) (-1)), "Unexpected end of token stream")] goNode0 :: Map TagHandle Text -> Tok2EvStreamCont goNode0 tagmap = goNode where goNode :: Tok2EvStreamCont goNode (Y.Token { Y.tCode = Y.BeginScalar } : rest) cont = goScalar (mempty,untagged) rest (flip goNodeEnd cont) goNode (Y.Token { Y.tCode = Y.BeginSequence } : rest) cont = Right (SequenceStart Nothing untagged) : goSeq rest (flip goNodeEnd cont) goNode (Y.Token { Y.tCode = Y.BeginMapping } : rest) cont = Right (MappingStart Nothing untagged) : goMap rest (flip goNodeEnd cont) goNode (Y.Token { Y.tCode = Y.BeginProperties } : rest) cont = goProp (mempty,untagged) rest (\p rest' -> goNode' p rest' cont) goNode (Y.Token { Y.tCode = Y.BeginAlias } : Y.Token { Y.tCode = Y.Indicator } : Y.Token { Y.tCode = Y.Meta, Y.tText = anchor } : Y.Token { Y.tCode = Y.EndAlias } : Y.Token { Y.tCode = Y.EndNode } : rest) cont = Right (Alias (T.pack anchor)) : cont rest goNode xs _cont = err xs goNode' :: Props -> Tok2EvStreamCont goNode' props (Y.Token { Y.tCode = Y.BeginScalar } : rest) cont = goScalar props rest (flip goNodeEnd cont) goNode' (manchor,mtag) (Y.Token { Y.tCode = Y.BeginSequence } : rest) cont = Right (SequenceStart manchor mtag) : goSeq rest (flip goNodeEnd cont) goNode' (manchor,mtag) (Y.Token { Y.tCode = Y.BeginMapping } : rest) cont = Right (MappingStart manchor mtag) : goMap rest (flip goNodeEnd cont) goNode' _ xs _cont = err xs goNodeEnd :: Tok2EvStreamCont goNodeEnd (Y.Token { Y.tCode = Y.EndNode } : rest) cont = cont rest goNodeEnd xs _cont = err xs goProp :: Props -> [Y.Token] -> (Props -> [Y.Token] -> EvStream) -> EvStream goProp props (Y.Token { Y.tCode = Y.EndProperties } : rest) cont = cont props rest goProp props (Y.Token { Y.tCode = Y.BeginAnchor } : rest) cont = goAnchor props rest (\x y -> goProp x y cont) goProp props (Y.Token { Y.tCode = Y.BeginTag } : rest) cont = goTag props rest (\x y -> goProp x y cont) goProp _props xs _cont = err xs goAnchor :: Props -> [Y.Token] -> (Props -> [Y.Token] -> EvStream) -> EvStream goAnchor props (Y.Token { Y.tCode = Y.Indicator } : rest) cont = goAnchor props rest cont goAnchor (_,tag) (Y.Token { Y.tCode = Y.Meta, Y.tText = anchor } : rest) cont = goAnchor (Just $! T.pack anchor,tag) rest cont goAnchor props (Y.Token { Y.tCode = Y.EndAnchor } : rest) cont = cont props rest goAnchor _ xs _ = err xs goTag :: Props -> [Y.Token] -> (Props -> [Y.Token] -> EvStream) -> EvStream goTag (anchor,_) (Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.EndTag } : rest) cont = cont (anchor,mkTag' "!") rest goTag (anchor,_) (Y.Token { Y.tCode = Y.BeginHandle } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.EndHandle } : Y.Token { Y.tCode = Y.Meta, Y.tText = tag } : Y.Token { Y.tCode = Y.EndTag } : rest) cont | Just t' <- Map.lookup (T.pack ("!!")) tagmap = cont (anchor,mkTag (T.unpack t' ++ tag)) rest | otherwise = cont (anchor,mkTag'' tag) rest goTag (anchor,_) (Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "<" } : Y.Token { Y.tCode = Y.Meta, Y.tText = tag } : Y.Token { Y.tCode = Y.Indicator, Y.tText = ">" } : Y.Token { Y.tCode = Y.EndTag } : rest) cont = cont (anchor,mkTag tag) rest goTag (anchor,_) xs@(Y.Token { Y.tCode = Y.BeginHandle } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.Meta, Y.tText = h } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.EndHandle } : Y.Token { Y.tCode = Y.Meta, Y.tText = tag } : Y.Token { Y.tCode = Y.EndTag } : rest) cont | Just t' <- Map.lookup (T.pack ("!" ++ h ++ "!")) tagmap = cont (anchor,mkTag (T.unpack t' ++ tag)) rest | otherwise = err xs goTag (anchor,_) (Y.Token { Y.tCode = Y.BeginHandle } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "!" } : Y.Token { Y.tCode = Y.EndHandle } : Y.Token { Y.tCode = Y.Meta, Y.tText = tag } : Y.Token { Y.tCode = Y.EndTag } : rest) cont | Just t' <- Map.lookup (T.pack ("!")) tagmap = cont (anchor,mkTag (T.unpack t' ++ tag)) rest | otherwise = cont (anchor,mkTag' ('!' : tag)) rest -- unresolved goTag _ xs _ = err xs goScalar :: Props -> Tok2EvStreamCont goScalar (manchor,tag) toks0 cont = go0 False Plain toks0 where go0 ii sty (Y.Token { Y.tCode = Y.Indicator, Y.tText = ind } : rest) | "'" <- ind = go' ii "" SingleQuoted rest | "\"" <- ind = go' ii "" DoubleQuoted rest | "|" <- ind = go0 True Literal rest | ">" <- ind = go0 True Folded rest | "+" <- ind = go0 ii sty rest | "-" <- ind = go0 ii sty rest | [c] <- ind, '1' <= c, c <= '9' = go0 False sty rest go0 ii sty (Y.Token { Y.tCode = Y.Text, Y.tText = t } : rest) = go' ii t sty rest go0 ii sty (Y.Token { Y.tCode = Y.LineFold } : rest) = go' ii " " sty rest go0 ii sty (Y.Token { Y.tCode = Y.LineFeed } : rest) = go' ii "\n" sty rest go0 _ sty (Y.Token { Y.tCode = Y.EndScalar } : rest) = Right (Scalar manchor tag sty mempty) : cont rest go0 _ _ xs = err xs ---------------------------------------------------------------------------- go' ii acc sty (Y.Token { Y.tCode = Y.Text, Y.tText = t } : rest) = go' ii (acc ++ t) sty rest go' ii acc sty (Y.Token { Y.tCode = Y.LineFold } : rest) = go' ii (acc ++ " ") sty rest go' ii acc sty (Y.Token { Y.tCode = Y.LineFeed } : rest) = go' ii (acc ++ "\n") sty rest go' ii acc sty@SingleQuoted (Y.Token { Y.tCode = Y.BeginEscape } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "'" } : Y.Token { Y.tCode = Y.Meta, Y.tText = "'" } : Y.Token { Y.tCode = Y.EndEscape } : rest) = go' ii (acc ++ "'") sty rest go' ii acc sty@SingleQuoted (Y.Token { Y.tCode = Y.Indicator, Y.tText = "'" } : rest) = go' ii acc sty rest go' ii acc sty@DoubleQuoted (Y.Token { Y.tCode = Y.BeginEscape } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "\\" } : -- Y.Token { Y.tCode = Y.Break } : Y.Token { Y.tCode = Y.EndEscape } : rest) = go' ii acc sty rest -- line continuation escape go' ii acc sty@DoubleQuoted (Y.Token { Y.tCode = Y.BeginEscape } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "\\" } : Y.Token { Y.tCode = Y.Meta, Y.tText = t } : Y.Token { Y.tCode = Y.EndEscape } : rest) | Just t' <- unescape t = go' ii (acc ++ t') sty rest go' ii acc sty@DoubleQuoted (Y.Token { Y.tCode = Y.BeginEscape } : Y.Token { Y.tCode = Y.Indicator, Y.tText = "\\" } : Y.Token { Y.tCode = Y.Indicator, Y.tText = pfx } : Y.Token { Y.tCode = Y.Meta, Y.tText = ucode } : Y.Token { Y.tCode = Y.EndEscape } : rest) | pfx == "U", Just c <- decodeCP2 ucode = go' ii (acc ++ [c]) sty rest | pfx == "u", Just c <- decodeCP ucode = go' ii (acc ++ [c]) sty rest | pfx == "x", Just c <- decodeL1 ucode = go' ii (acc ++ [c]) sty rest go' ii acc sty@DoubleQuoted (Y.Token { Y.tCode = Y.Indicator, Y.tText = "\"" } : rest) = go' ii acc sty rest go' ii acc sty (t@Y.Token { Y.tCode = Y.EndScalar } : rest) | ii, hasLeadingSpace acc = [Left (tok2pos t, "leading empty lines contain more spaces than the first non-empty line in scalar")] | otherwise = Right (Scalar manchor tag sty (T.pack acc)) : cont rest go' _ _ _ xs | False = error (show xs) go' _ _ _ xs = err xs hasLeadingSpace (' ':_) = True hasLeadingSpace ('\n':cs) = hasLeadingSpace cs hasLeadingSpace _ = False goSeq :: Tok2EvStreamCont goSeq (Y.Token { Y.tCode = Y.EndSequence } : rest) cont = Right SequenceEnd : cont rest goSeq (Y.Token { Y.tCode = Y.BeginNode } : rest) cont = goNode rest (flip goSeq cont) goSeq (Y.Token { Y.tCode = Y.BeginMapping } : rest) cont = Right (MappingStart Nothing untagged) : goMap rest (flip goSeq cont) goSeq (Y.Token { Y.tCode = Y.Indicator } : rest) cont = goSeq rest cont -- goSeq xs _cont = error (show xs) goSeq xs _cont = err xs goMap :: Tok2EvStreamCont goMap (Y.Token { Y.tCode = Y.EndMapping } : rest) cont = Right MappingEnd : cont rest goMap (Y.Token { Y.tCode = Y.BeginPair } : rest) cont = goPair1 rest (flip goMap cont) goMap (Y.Token { Y.tCode = Y.Indicator } : rest) cont = goMap rest cont goMap xs _cont = err xs goPair1 (Y.Token { Y.tCode = Y.BeginNode } : rest) cont = goNode rest (flip goPair2 cont) goPair1 (Y.Token { Y.tCode = Y.Indicator } : rest) cont = goPair1 rest cont goPair1 xs _cont = err xs goPair2 (Y.Token { Y.tCode = Y.BeginNode } : rest) cont = goNode rest (flip goPairEnd cont) goPair2 (Y.Token { Y.tCode = Y.Indicator } : rest) cont = goPair2 rest cont goPair2 xs _cont = err xs goPairEnd (Y.Token { Y.tCode = Y.EndPair } : rest) cont = cont rest goPairEnd xs _cont = err xs stripComments :: [Y.Token] -> [Y.Token] stripComments (Y.Token { Y.tCode = Y.BeginComment } : rest) = skip rest where skip (Y.Token { Y.tCode = Y.EndComment } : rest') = stripComments rest' skip (_ : rest') = skip rest' skip [] = error "the impossible happened" stripComments (t : rest) = t : stripComments rest stripComments [] = [] type Tok2EvStream = [Y.Token] -> EvStream type Tok2EvStreamCont = [Y.Token] -> Cont EvStream [Y.Token] type Cont r a = (a -> r) -> r -- decode 8-hex-digit unicode code-point decodeCP2 :: String -> Maybe Char decodeCP2 s = case s of [_,_,_,_,_,_,_,_] | all isHexDigit s , [(j, "")] <- readHex s -> Just (chr (fromInteger j)) _ -> Nothing -- decode 4-hex-digit unicode code-point decodeCP :: String -> Maybe Char decodeCP s = case s of [_,_,_,_] | all isHexDigit s , [(j, "")] <- readHex s -> Just (chr (fromInteger j)) _ -> Nothing -- decode 2-hex-digit latin1 code-point decodeL1 :: String -> Maybe Char decodeL1 s = case s of [_,_] | all isHexDigit s , [(j, "")] <- readHex s -> Just (chr (fromInteger j)) _ -> Nothing -- decode C-style escapes unescape :: String -> Maybe String unescape [c] = Map.lookup c m where m = Map.fromList [ (k,[v]) | (k,v) <- escapes ] escapes :: [(Char,Char)] escapes = [ ('0', '\0') , ('a', '\x7') , ('b', '\x8') , ('\x9', '\x9') , ('t', '\x9') , ('n', '\xa') , ('v', '\xb') , ('f', '\xc') , ('r', '\xd') , ('e', '\x1b') , (' ', ' ') , ('"', '"') , ('/', '/') , ('\\', '\\') , ('N', '\x85') , ('_', '\xa0') , ('L', '\x2028') , ('P', '\x2029') ] unescape _ = Nothing