{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | XML back and forth!
--
-- @xmlbf@ doesn't do any parsing of raw XML on its own. Instead, one should
-- rely on libraries like @xmlbf-xeno@ or @xmlbf-xmlhtml@ for this.
--
-- @xmlbf@ provides a 'FromXml' class intended to be used as the familiar
-- 'Data.Aeson.FromJSON' from the @aeson@ package. This relies on the
-- 'Parser' type and the related tools.
--
-- @xmlbf@ provides a 'ToXml' class intended to be used as the familiar
-- 'Data.Aeson.ToJSON' from the @aeson@ package.
--
-- @xmlbf@ provides tools like 'df' and 'dfM' for finding a fixpoint
-- of an XML structure.
module Xmlbf
 ( -- * Parsing
   FromXml(fromXml)
 , Parser
 , runParser
 , pElement
 , pAttr
 , pAttrs
 , pText
 , pRead
 , pEndOfInput

   -- * Rendering
 , ToXml(toXml)
 , encode

 , Node
 , pattern Element
 , element
 , pattern Text
 , text

   -- * Fixpoints
 , df
 , dfM
 ) where

import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Builder.Prim as BBP
import Data.Foldable (for_, toList)
import Data.Functor.Identity (Identity(Identity), runIdentity)
import qualified Data.HashMap.Strict as HM
import Data.Monoid ((<>))
import Data.Sequence (Seq)
import qualified Data.Sequence as Seq
import Data.String (IsString(fromString))
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Typeable (Typeable, typeRep, tyConName, typeRepTyCon)
import Data.Word (Word8)
import Control.Applicative (Alternative(empty, (<|>)))
import Control.Monad (MonadPlus(mplus, mzero), join, guard)
import Control.Monad.Fail (MonadFail(fail))
import qualified Text.Read

--------------------------------------------------------------------------------

-- | Either a text or an element node in an XML fragment.
--
-- Construct with 'text' or 'element'. Destruct with 'Text' or 'Element'.
data Node
  = Element' !T.Text !(HM.HashMap T.Text T.Text) ![Node]
  | Text' !T.Text
  deriving (Eq, Show)

-- | Destruct an element 'Node' into its name, attributes and children.
pattern Element :: T.Text -> (HM.HashMap T.Text T.Text) -> [Node] -> Node
pattern Element name attrs kids <- Element' name attrs kids

-- | Destruct a text 'Node' into its contents.
pattern Text :: T.Text -> Node
pattern Text contents <- Text' contents

-- | A string literal is turned into a text node, as if by 'text'.
instance IsString Node where
  fromString s = text (T.pack s)
  {-# INLINABLE fromString #-}

-- | Merges every run of adjacent text nodes into a single text node.
--
-- Only the given list is inspected; children of element nodes in the list
-- are left as they are.
normalize :: [Node] -> [Node]
{-# INLINE normalize #-}
normalize (Text a : Text b : rest) = normalize (text (a <> b) : rest)
normalize (n : rest) = n : normalize rest
normalize [] = []

-- | Construct a text 'Node'.
text :: T.Text -> Node
text t = Text' t
{-# INLINE text #-}

-- | Construct an element 'Node'. The children are passed through
-- 'normalize' before being stored.
element
  :: T.Text
  -- ^ Element name.
  -> HM.HashMap T.Text T.Text
  -- ^ Attributes.
  -> [Node]
  -- ^ Children.
  -> Either String Node
  -- ^ Returns 'Left' with an error message if the element name, or any
  -- attribute name, is empty or has leading or trailing whitespace.
  --
  -- TODO: Only emptiness and surrounding whitespace are checked currently;
  -- attribute values are not validated at all.
element name attrs children = do
  guarde (T.strip name == name)
    ("Element name has surrounding whitespace: " ++ show name)
  guarde (not (T.null name)) ("Element name is blank: " ++ show name)
  mapM_ checkAttrName (HM.keys attrs)
  Right (Element' name attrs (normalize children))
  where
    -- The same two checks as for the element name, applied to one attribute
    -- name.
    checkAttrName :: T.Text -> Either String ()
    checkAttrName k = do
      guarde (T.strip k == k)
        ("Attribute name has surrounding whitespace: " ++ show k)
      guarde (not (T.null k)) ("Attribute name is blank: " ++ show k)

-- | @'guarde' ok msg@ is @'Right' ()@ when @ok@ holds, and @'Left' msg@
-- otherwise.
guarde :: Bool -> String -> Either String ()
{-# INLINE guarde #-}
guarde ok msg = if ok then Right () else Left msg

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Parsing

class FromXml a where
  -- | Parses an XML fragment into a value of type @a@.
  --
  -- If a 'ToXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  fromXml :: Parser a

-- | XML parser monad. To be run with 'runParser'.
--
-- A 'Parser' can be assembled from 'pElement', 'pAttr', 'pAttrs', 'pText',
-- 'pRead', or any of the 'Applicative', 'Alternative' or 'Monad' combinators.
newtype Parser a = Parser { unParser :: S -> Either String (a, S) }
  deriving (Functor)

-- | Run a parser on an XML fragment. If the parser fails, then a 'String' with
-- an error message is returned.
--
-- The top-level nodes are passed through 'normalize' first. Whatever input the
-- parser leaves unconsumed is silently dropped.
runParser :: Parser a -> [Node] -> Either String a
runParser parser nodes = fst <$> unParser parser (STop (normalize nodes))

-- | Parser state: what is left to be consumed at the current position.
data S
  = STop ![Node]
    -- ^ Parsing the top-level nodes.
  | SReg !T.Text !(HM.HashMap T.Text T.Text) ![Node]
    -- ^ Parsing a particular root element.

instance Applicative Parser where
  {-# INLINE pure #-}
  pure a = Parser (\s -> Right (a, s))
  {-# INLINE (<*>) #-}
  pf <*> pa = Parser $ \s0 ->
    case unParser pf s0 of
      Left e -> Left e
      Right (f, s1) ->
        case unParser pa s1 of
          Left e -> Left e
          Right (a, s2) -> Right (f a, s2)

instance Monad Parser where
  {-# INLINE return #-}
  return = pure
  {-# INLINE (>>=) #-}
  pa >>= k = Parser $ \s0 ->
    case unParser pa s0 of
      Left e -> Left e
      Right (a, s1) -> unParser (k a) s1
  -- NOTE(review): 'fail' stopped being a method of 'Monad' in base-4.13, so
  -- this line only compiles against older versions of base. It is kept so
  -- that behaviour on those versions is unchanged.
  fail msg = Parser (const (Left msg))

instance MonadFail Parser where
  fail msg = Parser (const (Left msg))

-- | Backtracks: if the left parser fails, the right one is run on the very
-- same input state, and the left parser's error message is discarded.
instance Alternative Parser where
  {-# INLINE empty #-}
  empty = Parser (const (Left "empty"))
  {-# INLINE (<|>) #-}
  pa <|> pb = Parser $ \s ->
    case unParser pa s of
      Left _ -> unParser pb s
      ok@(Right _) -> ok

-- | Backtracks, exactly like the 'Alternative' instance.
instance MonadPlus Parser where
  {-# INLINE mzero #-}
  mzero = empty
  {-# INLINE mplus #-}
  mplus = (<|>)

--------------------------------------------------------------------------------
-- Some parsers

-- | @'pElement' "foo" p@ runs a 'Parser' @p@ inside an element node named
-- @"foo"@. This fails if such element does not exist at the current position.
--
-- Consumes the element from the parser state.
pElement :: T.Text -> Parser a -> Parser a
pElement t0 p0 = Parser $ \case
  -- Inside an element: the candidate is the first remaining child, so it is
  -- the child's name @t'@ (not the enclosing element's name @t@) that has to
  -- match the requested name.
  SReg t as (Element t' as' cs' : cs) | t' == t0 -> do
    -- Whatever the inner parser leaves unconsumed inside the child is dropped.
    (a, _) <- unParser p0 (SReg t' as' cs')
    Right (a, SReg t as cs)
  -- At the top level: the candidate is the first remaining top-level node.
  STop (Element t as cs : ns) | t == t0 -> do
    (a, _) <- unParser p0 (SReg t as cs)
    Right (a, STop ns)
  _ -> Left ("Missing element " ++ show t0)

-- | Return the value of the requested attribute, if defined. May return an
-- empty string in case the attribute is defined but no value was given to it.
--
-- Fails if no element has been selected yet, or if the selected element does
-- not have the requested attribute.
--
-- Consumes the attribute from the parser state.
pAttr :: T.Text -> Parser T.Text
{-# INLINABLE pAttr #-}
pAttr n = Parser $ \case
  STop _ -> Left "Before selecting an attribute, you must select an element"
  SReg t as cs -> case HM.lookup n as of
    Just x -> Right (x, SReg t (HM.delete n as) cs)
    Nothing -> Left ("Missing attribute " ++ show n ++ " in element " ++ show t)

-- | Returns all of the available element attributes. May return empty strings
-- as values in case an attribute is defined but no value was given to it.
--
-- Fails if no element has been selected yet.
--
-- Consumes all of the remaining attributes for this element from the parser
-- state.
pAttrs :: Parser (HM.HashMap T.Text T.Text)
{-# INLINABLE pAttrs #-}
pAttrs = Parser $ \case
  STop _ -> Left "Before selecting an attribute, you must select an element"
  SReg t as cs -> Right (as, SReg t mempty cs)

-- | Return a text node value (including CDATA).
--
-- Consumes the text node from the parser state.
--
-- Law: When two consecutive calls to 'pText' are made, the first call returns
-- all of the available consecutive text, and the second call always fails.
pText :: Parser T.Text
{-# INLINABLE pText #-}
pText = Parser $ \case
  -- Note: this works only because we assume 'normalize' has been used, so
  -- that there are never two text nodes next to each other.
  STop (Text x : ns) -> Right (x, STop ns)
  SReg t as (Text x : cs) -> Right (x, SReg t as cs)
  _ -> Left "Missing text node"

-- | Parses the given text as a value that can be 'read'.
--
-- The parser state is neither inspected nor consumed: only the text given as
-- argument is looked at.
pRead :: (Typeable a, Read a) => T.Text -> Parser a
pRead = \t -> case Text.Read.readMaybe (T.unpack t) of
  Just a -> pure a
  -- The 'Nothing' is only bound so that its type can serve as a proxy for
  -- looking up the name of the type we failed to read.
  ya@Nothing -> do
    let ty = tyConName (typeRepTyCon (typeRep ya))
    Parser (\_ -> Left ("Can't read as " ++ ty ++ ": " ++ show t))

-- | Succeeds if all of the elements, attributes and text nodes have
-- been consumed.
pEndOfInput :: Parser ()
pEndOfInput = Parser $ \s ->
  if isEof s then Right ((), s) else Left "Not end of input yet"

-- | Whether there is nothing left to consume: no nodes at the top level, or
-- neither attributes nor children inside an element.
isEof :: S -> Bool
{-# INLINE isEof #-}
isEof = \case
  SReg _ as cs -> HM.null as && null cs
  STop ns -> null ns

--------------------------------------------------------------------------------
-- Rendering

class ToXml a where
  -- | Renders a value of type @a@ into an XML fragment.
  --
  -- If a 'FromXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  toXml :: a -> [Node]

-- | Encodes a list of XML 'Node's to an UTF8-encoded and XML-escaped
-- bytestring.
--
-- Element and attribute names are written as they are; text nodes and
-- attribute values are XML-escaped. Attributes with an empty name are skipped.
-- Elements without children are rendered as self-closing tags.
encode :: [Node] -> BB.Builder
encode xs = mconcat $ xs >>= \case
  Text x -> [encodeXmlUtf8 x]
  Element t as cs ->
    [ "<"
    , encodeUtf8 t
    , mconcat $ do
        (k, v) <- HM.toList as
        guard (not (T.null k))
        [ " ", encodeUtf8 k, "=\"", encodeXmlUtf8 v, "\"" ]
    , if null cs then "/" else ""
    , ">"
    , encode cs
      -- A closing tag is only needed when the opening tag was not rendered as
      -- self-closing.
    , if null cs then "" else "</" <> encodeUtf8 t <> ">"
    ]

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Node fixpoint

-- | Post-order depth-first replacement of 'Node' and all of its children.
--
-- This function works like 'fix', but the given function is trying to find a
-- fixpoint for the individual children nodes, not for the root node.
--
-- For example, the following function renames every node named @"w"@ to @"y"@,
-- and every node named @"y"@ to @"z"@.
-- It accomplishes this by first renaming
-- @"w"@ nodes to @"x"@, and then, by using @k@ recursively to further rename
-- all @"x"@ nodes (including the ones that were just created) to @"y"@ in a
-- post-order depth-first manner. After renaming an @"x"@ node to @"y"@, the
-- recursion stops (i.e., @k@ is not used), so our new @"y"@ nodes won't be
-- further renamed to @"z"@. However, nodes that were named @"y"@ initially will
-- be renamed to @"z"@.
--
-- In our example we only replace one node with another, but a node can be
-- replaced with zero or more nodes, depending on the length of the resulting
-- list.
--
-- @
-- foo :: 'Node' -> ['Node']
-- foo = 'df' $ \\k -> \\case
--     'Element' "w" as cs -> let 'Right' e = 'element' "x" as cs in k e
--     'Element' "x" as cs -> let 'Right' e = 'element' "y" as cs in [e]
--     'Element' "y" as cs -> let 'Right' e = 'element' "z" as cs in k e
--     n -> [n]
-- @
--
-- /WARNING/ If you call @k@ in every branch, then 'df' will never terminate.
-- Make sure the recursion stops at some point by simply returning a list of
-- nodes instead of calling @k@.
df :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node]
df f = runIdentity . dfM (\k -> Identity . f (runIdentity . k))

-- | Monadic version of 'df'.
--
-- The children of the given node are rewritten first, and only then is the
-- given function applied to the node itself. Adjacent text nodes in the
-- result are merged with 'normalize'.
dfM
  :: Monad m
  => ((Node -> m [Node]) -> Node -> m [Node])
  -> Node
  -> m [Node]
dfM f = \n0 -> do
  let c0 = cursorFromNode n0
  c1 <- traverschildren (dfM f) c0
  c2 <- traverseRightSiblings (dfM f) c1
  fmap (normalize . join) (traverse (f (dfM f)) (cursorSiblings c2))

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- INTERNAL: Cursor
--
-- Most of this comes from Chris Smith's xmlhtml, BSD3 licensed
-- https://hackage.haskell.org/package/xmlhtml

-- | Zipper into a 'Node' tree.
data Cursor = Cursor
  { _cursorCurrent :: !Node
    -- ^ Retrieves the current node of a 'Cursor'.
  , _cursorLefts :: !(Seq Node)
    -- ^ Nodes to the left (ordered right to left).
  , _cursorRights :: !(Seq Node)
    -- ^ Nodes to the right (ordered left to right).
  , _cursorParents :: !(Seq (Seq Node, T.Text, HM.HashMap T.Text T.Text, Seq Node))
    -- ^ Parents' name, attributes, and siblings.
  }

------------------------------------------------------------------------------

-- | Replaces the children of the current node with the concatenated and
-- 'normalize'd results of applying the given function to each of them. A
-- current text node is left untouched.
--
-- The cursor is left where it starts.
traverschildren :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverschildren #-}
traverschildren f c0 = case _cursorCurrent c0 of
  Text _ -> pure c0
  Element t as cs -> do
    n1s <- fmap (normalize . join) (traverse f cs)
    -- The raw constructor is used on purpose: the name and the attributes
    -- come from an already existing element.
    pure (c0 {_cursorCurrent = Element' t as n1s})

-- | Applies the given function to the node immediately to the right of the
-- cursor, for as long as there is one.
--
-- NOTE(review): the replacement nodes are inserted back to the right of the
-- cursor, so they are the next ones to be removed and processed again; this
-- only stops once the right side becomes empty. Within this module the
-- function is only called from 'dfM' on a cursor that has no right siblings,
-- in which case it returns the cursor unchanged.
traverseRightSiblings :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverseRightSiblings #-}
traverseRightSiblings f c0 = case cursorRemoveRight c0 of
  Nothing -> pure c0
  Just (n1, c1) -> do
    n2s <- fmap normalize (f n1)
    traverseRightSiblings f (cursorInsertManyRight n2s c1)

-- | Builds a 'Cursor' for navigating a tree. That is, a forest with a single
-- root 'Node'.
cursorFromNode :: Node -> Cursor
{-# INLINE cursorFromNode #-}
cursorFromNode n = Cursor n mempty mempty mempty

-- | Retrieves a list of the 'Node's at the same level as the current position
-- of a cursor, including the current node.
cursorSiblings :: Cursor -> [Node]
{-# INLINE cursorSiblings #-}
cursorSiblings (Cursor cur ls rs _) =
  -- The left siblings are stored right to left, hence the 'Seq.reverse'.
  toList (Seq.reverse ls <> (cur Seq.<| rs))

-- | Removes the node immediately to the right of the current position and
-- returns it together with the updated cursor, or 'Nothing' if there is no
-- node to the right.
cursorRemoveRight :: Cursor -> Maybe (Node, Cursor)
{-# INLINE cursorRemoveRight #-}
cursorRemoveRight (Cursor n ls rs0 ps) =
  -- Matching on the view covers the empty case directly, so no emptiness
  -- guard and no unreachable branch are needed.
  case Seq.viewl rs0 of
    r Seq.:< rs -> Just (r, Cursor n ls rs ps)
    Seq.EmptyL -> Nothing

-- | Inserts a list of new 'Node's to the right of the current position.
cursorInsertManyRight :: [Node] -> Cursor -> Cursor
{-# INLINE cursorInsertManyRight #-}
cursorInsertManyRight new cur =
  -- The new nodes go in front of the existing right siblings, in list order.
  cur { _cursorRights = Seq.fromList new <> _cursorRights cur }

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Miscellaneous

-- | UTF-8 encodes the given text as it is, without any escaping.
encodeUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeUtf8 #-}
encodeUtf8 = T.encodeUtf8Builder

-- | UTF-8 encodes the given text, escaping it with 'xmlEscaped'.
encodeXmlUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeXmlUtf8 #-}
encodeXmlUtf8 = T.encodeUtf8BuilderEscaped xmlEscaped

-- | Byte-level XML escaping: the bytes for @&@, @<@, @>@ and @"@ are replaced
-- by the entities @&amp;@, @&lt;@, @&gt;@ and @&#34;@ respectively. Every
-- other byte is emitted unchanged.
xmlEscaped :: BBP.BoundedPrim Word8
{-# INLINE xmlEscaped #-}
xmlEscaped =
  BBP.condB (== 38) ampEntity $ -- '&'
  BBP.condB (== 60) ltEntity $ -- '<'
  BBP.condB (== 62) gtEntity $ -- '>'
  BBP.condB (== 34) quotEntity $ -- '"'
  BBP.liftFixedToBounded BBP.word8
  where
    ampEntity, ltEntity, gtEntity, quotEntity :: BBP.BoundedPrim Word8
    ampEntity = emit5 (38, (97, (109, (112, 59)))) -- "&amp;"
    ltEntity = emit4 (38, (108, (116, 59))) -- "&lt;"
    gtEntity = emit4 (38, (103, (116, 59))) -- "&gt;"
    quotEntity = emit5 (38, (35, (51, (52, 59)))) -- "&#34;"

    -- Ignores the input byte and always writes the four given bytes.
    {-# INLINE emit4 #-}
    emit4 :: (Word8, (Word8, (Word8, Word8))) -> BBP.BoundedPrim Word8
    emit4 bytes = BBP.liftFixedToBounded (const bytes BBP.>$< four)
      where
        four = BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8

    -- Ignores the input byte and always writes the five given bytes.
    {-# INLINE emit5 #-}
    emit5 :: (Word8, (Word8, (Word8, (Word8, Word8)))) -> BBP.BoundedPrim Word8
    emit5 bytes = BBP.liftFixedToBounded (const bytes BBP.>$< five)
      where
        five =
          BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8 BBP.>*< BBP.word8
            BBP.>*< BBP.word8