lib/Xmlbf.hs

{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | XML back and forth!
--
-- @xmlbf@ doesn't do any parsing of raw XML on its own. Instead, one should
-- rely on libraries like @xmlbf-xeno@ or @xmlbf-xmlhtml@ for this.
--
-- @xmlbf@ provides a 'FromXml' class intended to be used as the familiar
-- 'Data.Aeson.FromJSON' from the @aeson@ package. This relies on the
-- 'Parser' type and the related tools.
--
-- @xmlbf@ provides a 'ToXml' class intended to be used as the familiar
-- 'Data.Aeson.ToJSON' from the @aeson@ package.
--
-- @xmlbf@ provides tools like 'dfpos' and 'dfposM' for finding a fixpoint
-- of an XML structure.
module Xmlbf
 ( -- * Parsing
   FromXml(fromXml)
 , Parser
 , runParser
 , pElement
 , pAttr
 , pAttrs
 , pText
 , pRead
 , pEndOfInput

   -- * Rendering
 , ToXml(toXml)
 , encode

 , Node

 , pattern Element
 , element
 , element'

 , pattern Text
 , text

   -- * Fixpoints
 , dfpos
 , dfposM
 , dfpre
 , dfpreM
 ) where

import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Builder.Prim as BBP
import Data.Foldable (for_, toList)
import Data.Functor.Identity (Identity(Identity), runIdentity)
import qualified Data.HashMap.Strict as HM
import Data.Monoid ((<>))
import Data.Sequence (Seq)
import qualified Data.Sequence as Seq
import Data.String (IsString(fromString))
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Typeable (Typeable, typeRep, tyConName, typeRepTyCon)
import Data.Traversable (for)
import Data.Word (Word8)
import Control.Applicative (Alternative(empty, (<|>)))
import Control.Monad (MonadPlus(mplus, mzero), join, guard)
import Control.Monad.Fail (MonadFail(fail))
import qualified Text.Read

--------------------------------------------------------------------------------

-- | Either a text or an element node in an XML fragment.
--
-- Construct with 'text' or 'element'. Destruct with 'Text' or 'Element'.
--
-- The primed constructors below are not part of this module's export list;
-- outside code goes through the smart constructors and the pattern synonyms.
data Node
  = Element' !T.Text !(HM.HashMap T.Text T.Text) ![Node]
    -- ^ Element name, attributes (name to value), and child nodes.
  | Text' !T.Text
    -- ^ Text content.
  deriving (Eq)

-- | Renders a 'Node' using the names of the public pattern synonyms
-- ('Text', 'Element') rather than the primed constructors. Attributes are
-- shown as the association list produced by 'HM.toList'.
instance Show Node where
  showsPrec d node = showParen (d > 10) body
    where
      body = case node of
        Text' t ->
          showString "Text " . shows t
        Element' t as cs ->
          showString "Element "
            . shows t
            . showChar ' '
            . shows (HM.toList as)
            . showChar ' '
            . shows cs

-- | Destruct an element 'Node' into its name, attributes and children.
--
-- This is a match-only pattern: use 'element' to construct.
pattern Element :: T.Text -> (HM.HashMap T.Text T.Text) -> [Node] -> Node
pattern Element t as cs <- Element' t as cs

-- | Destruct a text 'Node' into its content.
--
-- This is a match-only pattern: use 'text' to construct.
pattern Text :: T.Text -> Node
pattern Text t <- Text' t

-- A 'Node' is either an element or a text node, so a match is exhaustive only
-- when it covers /both/ patterns. Declaring each pattern complete on its own
-- would silence the incomplete-pattern warning for matches that handle only
-- one of the two cases.
{-# COMPLETE Element, Text #-}

-- | String literals become text nodes, built with 'text'.
instance IsString Node where
  fromString s = text (T.pack s)
  {-# INLINABLE fromString #-}

-- | Merges every run of adjacent text nodes into a single text node.
-- Element nodes are passed through untouched (their children are not
-- visited here).
normalize :: [Node] -> [Node]
{-# INLINE normalize #-}
normalize (Text a : Text b : rest) = normalize (text (a <> b) : rest)
normalize (n : rest) = n : normalize rest
normalize [] = []

-- | Wraps the given content in a text 'Node'. No validation is performed.
text :: T.Text -> Node
text t = Text' t
{-# INLINE text #-}

-- | Construct an element 'Node'.
--
-- The element name and every attribute name must be non-empty and must not
-- start or end with whitespace. Attribute values are not inspected. Adjacent
-- text nodes among the children are merged.
element
  :: T.Text
  -- ^ Element name.
  -> HM.HashMap T.Text T.Text
  -- ^ Attributes.
  -> [Node]
  -- ^ Children.
  -> Either String Node
  -- ^ Returns 'Left' with a description of the problem if the element name
  -- or any attribute name is rejected.
element t0 hm0 ns0 =
  checkName "Element" t0
    >> mapM_ (checkName "Attribute") (HM.keys hm0)
    >> Right (Element' t0 hm0 (normalize ns0))
  where
    -- Shared validation for element and attribute names. The whitespace
    -- check runs first, so a whitespace-only name is reported as having
    -- surrounding whitespace rather than as blank.
    checkName :: String -> T.Text -> Either String ()
    checkName what name = do
      guarde (name == T.strip name) $
        what ++ " name has surrounding whitespace: " ++ show name
      guarde (not (T.null name)) (what ++ " name is blank: " ++ show name)

-- | Partial variant of 'element': wherever 'element' would return 'Left',
-- this calls 'error' with that message instead. Only use it when the names
-- are known to be acceptable.
element'
  :: T.Text -- ^ Element name.
  -> HM.HashMap T.Text T.Text -- ^ Attributes.
  -> [Node] -- ^ Children.
  -> Node
{-# INLINE element' #-}
element' t hm ns = either crash id (element t hm ns)
  where
    crash e = error ("element': " ++ e)


-- | @'guarde' ok msg@ succeeds when @ok@ holds and otherwise fails with
-- @msg@ as the 'Left' value.
guarde :: Bool -> String -> Either String ()
{-# INLINE guarde #-}
guarde ok msg
  | ok = Right ()
  | otherwise = Left msg

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Parsing

-- | Types that can be parsed out of an XML fragment by a 'Parser'.
class FromXml a where
  -- | Parses an XML fragment into a value of type @a@.
  --
  -- If a 'ToXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  fromXml :: Parser a

-- | XML parser monad. To be run with 'runParser'.
--
-- You can build a 'Parser' using 'pElement', 'pAttr', 'pAttrs', 'pText',
-- 'pRead', 'pEndOfInput', or any of the 'Applicative', 'Alternative' or
-- 'Monad' combinators.
--
-- Internally a parser is a function from the current state to either an
-- error message or a result paired with the state left after parsing.
newtype Parser a = Parser { unParser :: S -> Either String (a, S) }
  deriving (Functor)

-- | Run a parser over the top-level nodes of an XML fragment, returning
-- either the parsed value or a 'String' describing the failure.
--
-- Adjacent text nodes in the input are merged before parsing starts.
--
-- Leftover input is not an error: the final parser state is simply dropped.
-- Use 'pEndOfInput' inside the given 'Parser' to require full consumption.
runParser :: Parser a -> [Node] -> Either String a
runParser p0 ns = fst <$> unParser p0 (STop (normalize ns))

-- | Internal parser state: what is still available to be consumed.
data S
  = STop ![Node]
    -- ^ Parsing the top-level nodes. Carries the nodes not yet consumed.
  | SReg !T.Text !(HM.HashMap T.Text T.Text) ![Node]
    -- ^ Parsing inside a particular element. Carries that element's name,
    -- its attributes not yet consumed, and its children not yet consumed.

-- | Runs the left parser, then the right parser on the state the left one
-- produced. The first failure aborts the whole computation.
instance Applicative Parser where
  {-# INLINE pure #-}
  pure a = Parser (\s -> Right (a, s))
  {-# INLINE (<*>) #-}
  pf <*> pa = Parser $ \s0 ->
    case unParser pf s0 of
      Left e -> Left e
      Right (f, s1) ->
        case unParser pa s1 of
          Left e -> Left e
          Right (a, s2) -> Right (f a, s2)

-- | Sequencing threads the parser state from left to right and stops at the
-- first failure.
instance Monad Parser where
  {-# INLINE return #-}
  return = pure
  {-# INLINE (>>=) #-}
  m >>= k = Parser $ \s0 ->
    case unParser m s0 of
      Left e -> Left e
      Right (a, s1) -> unParser (k a) s1
  -- NOTE(review): 'fail' stopped being a method of 'Monad' in newer versions
  -- of base; this definition presumably targets an older base -- confirm the
  -- supported compiler range before touching it.
  fail e = Parser (const (Left e))

-- | 'fail' aborts parsing with the given message, ignoring the state.
instance MonadFail Parser where
  fail = Parser . const . Left

-- | Backtracks: when the left parser fails, the right parser is run on the
-- same state the left one started from, and the left error is discarded.
instance Alternative Parser where
  {-# INLINE empty #-}
  empty = Parser (const (Left "empty"))
  {-# INLINE (<|>) #-}
  pa <|> pb = Parser $ \s ->
    case unParser pa s of
      Left _ -> unParser pb s
      Right r -> Right r

-- | Backtracks. Both methods are the corresponding 'Alternative' ones.
instance MonadPlus Parser where
  {-# INLINE mzero #-}
  mzero = empty
  {-# INLINE mplus #-}
  mplus l r = l <|> r

--------------------------------------------------------------------------------
-- Some parsers

-- | @'pElement' "foo" p@ runs a 'Parser' @p@ inside a element node named
-- @"foo"@. This fails if the next unconsumed node at the current position is
-- not an element with that name.
--
-- Consumes the element from the parser state. Whatever @p@ leaves unconsumed
-- inside the element is discarded; use 'pEndOfInput' in @p@ to require that
-- the element is fully consumed.
pElement :: T.Text -> Parser a -> Parser a
{-# INLINABLE pElement #-}
pElement t0 p0 = Parser $ \case
  -- Inside an element: the name to compare is that of the /child/ element
  -- (@t'@), not the name of the element we are currently inside (@t@).
  SReg t as (Element t' as' cs' : cs) | t' == t0 ->  do
    (a,_) <- unParser p0 (SReg t' as' cs')
    Right (a, SReg t as cs)
  STop (Element t as cs : ns) | t == t0 -> do
    (a,_) <- unParser p0 (SReg t as cs)
    Right (a, STop ns)
  _ -> Left ("Missing element " ++ show t0)

-- | Looks up the named attribute on the element currently being parsed and
-- returns its value, which may be an empty string if the attribute is
-- defined without a value. Fails if the attribute is absent, or if no
-- element has been entered yet.
--
-- Consumes the attribute from the parser state.
pAttr :: T.Text -> Parser T.Text
{-# INLINABLE pAttr #-}
pAttr n = Parser step
  where
    step (STop _) =
      Left "Before selecting an attribute, you must select an element"
    step (SReg t as cs) =
      maybe
        (Left ("Missing attribute " ++ show n ++ " in element " ++ show t))
        (\x -> Right (x, SReg t (HM.delete n as) cs))
        (HM.lookup n as)

-- | Returns every attribute still available on the element currently being
-- parsed. Values may be empty strings for attributes defined without a
-- value. Fails if no element has been entered yet.
--
-- Consumes all of the remaining attributes for this element from the parser
-- state.
pAttrs :: Parser (HM.HashMap T.Text T.Text)
{-# INLINABLE pAttrs #-}
pAttrs = Parser takeAll
  where
    takeAll (SReg t as cs) = Right (as, SReg t mempty cs)
    takeAll (STop _) =
      Left "Before selecting an attribute, you must select an element"

-- | Return a text node value (including CDATA). Fails if the next
-- unconsumed node is not a text node.
--
-- Consumes the text node from the parser state.
--
-- Law: When two consecutive calls to 'pText' are made, the first call returns
-- all of the available consecutive text, and the second call always fails.
pText :: Parser T.Text
{-# INLINABLE pText #-}
pText = Parser nextText
  where
    -- Taking a single node is enough only because the node lists are assumed
    -- to have gone through 'normalize', which merges adjacent text nodes.
    nextText (SReg t as (Text x : cs)) = Right (x, SReg t as cs)
    nextText (STop (Text x : ns)) = Right (x, STop ns)
    nextText _ = Left "Missing text node"

-- | Parses the given text with the target type's 'Read' instance. On failure
-- the error message names the outermost type constructor of the target type
-- and shows the offending text.
--
-- The parser state is left untouched; the input is the argument itself.
pRead :: (Typeable a, Read a) => T.Text -> Parser a
{-# INLINABLE pRead #-}
pRead t =
  case Text.Read.readMaybe (T.unpack t) of
    Just a -> pure a
    -- The failed result is bound only so it can serve as a proxy carrying
    -- the target type for 'typeRep'.
    unread@Nothing ->
      let tyName = tyConName (typeRepTyCon (typeRep unread))
      in Parser (const (Left ("Can't read as " ++ tyName ++ ": " ++ show t)))

-- | Succeeds, leaving the state unchanged, only when nothing is left to
-- consume at the current position: no nodes, and (inside an element) no
-- attributes either.
pEndOfInput :: Parser ()
{-# INLINABLE pEndOfInput #-}
pEndOfInput = Parser check
  where
    check s
      | isEof s = Right ((), s)
      | otherwise = Left "Not end of input yet"

-- | Whether the state has nothing left to consume: no remaining nodes and,
-- when inside an element, no remaining attributes.
isEof :: S -> Bool
{-# INLINE isEof #-}
isEof (STop ns) = null ns
isEof (SReg _ as cs) = HM.null as && null cs

--------------------------------------------------------------------------------
-- Rendering

-- | Types that can be rendered as an XML fragment, i.e. a list of 'Node's.
class ToXml a where
  -- | Renders a value of type @a@ into an XML fragment.
  --
  -- If a 'FromXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  toXml :: a -> [Node]

-- | Encodes a list of XML 'Node's to an UTF8-encoded and XML-escaped
-- bytestring.
--
-- Text content and attribute values are escaped; element and attribute names
-- are written as they are. Attributes whose name is empty are skipped, and an
-- element without children is written in self-closing form.
encode :: [Node] -> BB.Builder
encode = foldMap encodeNode
  where
    encodeNode :: Node -> BB.Builder
    encodeNode = \case
      Text x -> encodeXmlUtf8 x
      Element t as cs ->
        let name = encodeUtf8 t
            attrs = mconcat
              [ " " <> encodeUtf8 k <> "=\"" <> encodeXmlUtf8 v <> "\""
              | (k, v) <- HM.toList as
              , not (T.null k)
              ]
        in "<" <> name <> attrs <>
             ( if null cs
                 then "/>"
                 else ">" <> encode cs <> "</" <> name <> ">" )

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Node fixpoint

-- | Post-order depth-first replacement of a 'Node' and all of its children:
-- the children of a node are rewritten before the node itself is handed to
-- the given function.
--
-- This function works like 'fix', but the given function is trying to find a
-- fixpoint for the individual children nodes, not for the root node.
--
-- For example, the following function renames every node named @"w"@ to @"y"@,
-- and every node named @"y"@ to @"z"@. A @"w"@ node is first renamed to @"x"@
-- and then passed to @k@, which rewrites it again, so it ends up as @"y"@.
-- The @"x"@ branch does not call @k@, so the recursion stops there and the
-- freshly made @"y"@ nodes are not renamed any further. Nodes that were
-- named @"y"@ to begin with are renamed to @"z"@.
--
-- Each branch here returns a single node, but a node can be replaced with
-- zero or more nodes, depending on the length of the resulting list.
--
-- @
-- foo :: 'Node' -> ['Node']
-- foo = 'dfpos' $ \\k -> \\case
--     'Element' "w" as cs -> let 'Right' e = 'element' "x" as cs in k e
--     'Element' "x" as cs -> let 'Right' e = 'element' "y" as cs in [e]
--     'Element' "y" as cs -> let 'Right' e = 'element' "z" as cs in k e
-- @
--
-- See 'dfpre' for pre-order depth-first replacement.
--
-- /WARNING/ If you call @k@ in every branch, then 'dfpos' will never terminate.
-- Make sure the recursion stops at some point by simply returning a list of
-- nodes instead of calling @k@.
dfpos :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node]
dfpos f n = runIdentity (dfposM lifted n)
  where
    -- The pure function, wrapped so it can be run through the monadic
    -- traversal in the 'Identity' monad.
    lifted k x = Identity (f (runIdentity . k) x)

-- | Monadic version of 'dfpos'.
--
-- The children of the node are rewritten first; only then is the given
-- function applied to the node itself. Adjacent text nodes in the result are
-- merged.
dfposM :: Monad m => ((Node -> m [Node]) -> Node -> m [Node]) -> Node -> m [Node]
dfposM f = go
  where
    go n0 = do
      withChildren <- traverseChildren go (cursorFromNode n0)
      settled <- traverseRightSiblings go withChildren
      replaced <- traverse (f go) (cursorSiblings settled)
      pure (normalize (join replaced))


-- | Pre-order depth-first replacement of a 'Node' and all of its children:
-- the given function is applied to a node before its children are visited.
--
-- This is just like 'dfpos' but the search proceeds in a different order.
dfpre :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node]
dfpre f n = runIdentity (dfpreM lifted n)
  where
    -- The pure function, wrapped so it can be run through the monadic
    -- traversal in the 'Identity' monad.
    lifted k x = Identity (f (runIdentity . k) x)

-- | Monadic version of 'dfpre'.
--
-- The given function is applied to the node first; the children of every
-- node it returns are then rewritten in turn. Adjacent text nodes in the
-- result are merged.
dfpreM :: Monad m => ((Node -> m [Node]) -> Node -> m [Node]) -> Node -> m [Node]
dfpreM f = go
  where
    go n0 = do
      ns <- f go n0
      expanded <- traverse descend ns
      pure (normalize (join expanded))
    -- Rewrites the children of one replacement node.
    descend n = do
      c1 <- traverseChildren go (cursorFromNode n)
      c2 <- traverseRightSiblings go c1
      pure (cursorSiblings c2)


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- INTERNAL: Cursor
--
-- Most of this comes from Chris Smith's xmlhtml, BSD3 licensed
-- https://hackage.haskell.org/package/xmlhtml

-- | Zipper into a 'Node' tree: a focused node together with its siblings on
-- either side and the chain of enclosing parents.
data Cursor = Cursor
  { _cursorCurrent :: !Node
    -- ^ The node the cursor is currently focused on.
  , _cursorLefts :: !(Seq Node)
    -- ^ Nodes to the left (ordered right to left).
  , _cursorRights :: !(Seq Node)
    -- ^ Nodes to the right (ordered left to right).
  , _cursorParents :: !(Seq (Seq Node, T.Text, HM.HashMap T.Text T.Text, Seq Node))
    -- ^ Parents' name, attributes, and siblings.
    -- NOTE(review): presumably the two sequences in each entry are the
    -- parent's left and right siblings; nothing in this section populates
    -- this field beyond the empty value -- confirm.
  }

------------------------------------------------------------------------------

-- | Replaces every child of the current node with the nodes the given
-- function returns for it, merging adjacent text nodes in the result. A text
-- node has no children, so the cursor is returned unchanged.
--
-- The cursor is left where it starts.
traverseChildren :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverseChildren #-}
traverseChildren f c0
  | Element t as cs <- _cursorCurrent c0 = do
      replaced <- traverse f cs
      pure (c0 {_cursorCurrent = Element' t as (normalize (join replaced))})
  | otherwise = pure c0

-- | Replaces each sibling to the right of the cursor, in order, with the
-- nodes the given function returns for it. Every original right sibling is
-- passed to the function exactly once.
--
-- The cursor is left in the rightmost sibling. If there is nothing to the
-- right, the cursor is returned unchanged.
traverseRightSiblings :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverseRightSiblings #-}
traverseRightSiblings f c0 = case cursorRemoveRight c0 of
   Nothing -> pure c0
   Just (n1, c1) -> do
      n2s <- fmap normalize (f n1)
      traverseRightSiblings f (stepOver n2s c1)
  where
    -- Places the replacement nodes immediately to the right of the cursor and
    -- moves the cursor onto the last of them. Moving is what guarantees
    -- progress: if the replacements were merely put back on the right, they
    -- would be picked up and rewritten again on the next iteration.
    stepOver :: [Node] -> Cursor -> Cursor
    stepOver ns c@(Cursor cur ls rs ps) =
      case Seq.viewr (Seq.fromList ns) of
        Seq.EmptyR -> c
        before Seq.:> new ->
          -- Lefts are stored nearest-first, hence the reversal.
          Cursor new (Seq.reverse before <> (cur Seq.<| ls)) rs ps

-- | Builds a 'Cursor' focused on the given node, with no siblings on either
-- side and no parents. That is, a forest with a single root 'Node'.
cursorFromNode :: Node -> Cursor
{-# INLINE cursorFromNode #-}
cursorFromNode n = Cursor
  { _cursorCurrent = n
  , _cursorLefts = Seq.empty
  , _cursorRights = Seq.empty
  , _cursorParents = Seq.empty
  }

-- | Lists, from left to right, the 'Node's at the same level as the current
-- position of a cursor, including the current node.
cursorSiblings :: Cursor -> [Node]
{-# INLINE cursorSiblings #-}
cursorSiblings c = toList (before <> here <> after)
  where
    -- Lefts are stored nearest-first, so they are reversed for output.
    before = Seq.reverse (_cursorLefts c)
    here = Seq.singleton (_cursorCurrent c)
    after = _cursorRights c

-- | Removes the node immediately to the right of the cursor and returns it
-- together with the updated cursor. Returns 'Nothing' when there is no node
-- to the right.
cursorRemoveRight :: Cursor -> Maybe (Node, Cursor)
{-# INLINE cursorRemoveRight #-}
cursorRemoveRight (Cursor n ls rs0 ps) =
  -- Matching on the view covers the empty case directly, so no emptiness
  -- guard and no unreachable branch are needed.
  case Seq.viewl rs0 of
    r Seq.:< rs -> Just (r, Cursor n ls rs ps)
    Seq.EmptyL -> Nothing

-- | Inserts the given 'Node's, in order, immediately to the right of the
-- current position, ahead of the nodes already there. The cursor does not
-- move.
cursorInsertManyRight :: [Node] -> Cursor -> Cursor
{-# INLINE cursorInsertManyRight #-}
cursorInsertManyRight ns c =
  c { _cursorRights = Seq.fromList ns <> _cursorRights c }

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Miscellaneous

-- | UTF-8 encodes the text as is, without any XML escaping.
encodeUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeUtf8 #-}
encodeUtf8 t = T.encodeUtf8Builder t

-- | UTF-8 encodes the text, applying 'xmlEscaped' as the escaping rule.
encodeXmlUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeXmlUtf8 #-}
encodeXmlUtf8 t = T.encodeUtf8BuilderEscaped xmlEscaped t

-- | Byte-level XML escaping rule: the bytes for @&@, @<@, @>@ and @"@ are
-- replaced by an entity or character reference; every other byte is written
-- unchanged.
xmlEscaped :: BBP.BoundedPrim Word8
{-# INLINE xmlEscaped #-}
xmlEscaped =
   BBP.condB (== 38) (emit5 (38,(97,(109,(112,59))))) $  -- '&'  ->  "&amp;"
   BBP.condB (== 60) (emit4 (38,(108,(116,59)))) $       -- '<'  ->  "&lt;"
   BBP.condB (== 62) (emit4 (38,(103,(116,59)))) $       -- '>'  ->  "&gt;"
   BBP.condB (== 34) (emit5 (38,(35,(51,(52,59))))) $    -- '"'  ->  "&#34;"
   BBP.liftFixedToBounded BBP.word8
 where
   w8 :: BBP.FixedPrim Word8
   w8 = BBP.word8
   -- Writes the four given bytes, ignoring the byte being escaped.
   {-# INLINE emit4 #-}
   emit4 :: (Word8, (Word8, (Word8, Word8))) -> BBP.BoundedPrim Word8
   emit4 bytes = BBP.liftFixedToBounded
     (const bytes BBP.>$< (w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8))
   -- Writes the five given bytes, ignoring the byte being escaped.
   {-# INLINE emit5 #-}
   emit5 :: (Word8, (Word8, (Word8, (Word8, Word8)))) -> BBP.BoundedPrim Word8
   emit5 bytes = BBP.liftFixedToBounded
     (const bytes BBP.>$< (w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8))