lib/Xmlbf.hs

{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternSynonyms #-}

{-# LANGUAGE ScopedTypeVariables #-}

-- | XML back and forth!
--
-- @xmlbf@ doesn't do any parsing of raw XML on its own. Instead, one should
-- rely on libraries like @xmlbf-xeno@ or @xmlbf-xmlhtml@ for this.
--
-- @xmlbf@ provides a 'FromXml' class intended to be used as the familiar
-- 'Data.Aeson.FromJSON' from the @aeson@ package. This relies on the
-- 'Parser' type and the related tools.
--
-- @xmlbf@ provides a 'ToXml' class intended to be used as the familiar
-- 'Data.Aeson.ToJSON' from the @aeson@ package. This relies on the
-- 'Node' type, whose values are built with 'Text' and 'element'.
--
-- @xmlbf@ provides tools like 'df' and 'dfM' for finding a fixpoint
-- of an XML structure.
module Xmlbf
 ( -- * Parsing
   FromXml(fromXml)
 , Parser
 , runParser
 , pElement
 , pAttr
 , pAttrs
 , pText
 , pRead
 , pEndOfInput

   -- * Rendering
 , ToXml(toXml)
 , encode

 , Node(Text, Element)
 , element

   -- * Fixpoints
 , df
 , dfM
 ) where

import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Builder.Prim as BBP
import Data.Foldable (for_, toList)
import Data.Functor.Identity (Identity(Identity), runIdentity)
import qualified Data.HashMap.Strict as HM
import Data.Monoid ((<>))
import Data.Sequence (Seq)
import qualified Data.Sequence as Seq
import Data.String (IsString(fromString))
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Typeable (Typeable, typeRep, tyConName, typeRepTyCon)
import Data.Word (Word8)
import Control.Applicative (Alternative(empty, (<|>)))
import Control.Monad (MonadPlus(mplus, mzero), join, guard)
import Control.Monad.Fail (MonadFail(fail))
import qualified Text.Read

--------------------------------------------------------------------------------

-- | Either a text or an element node in an XML fragment. Construct with either
-- 'Text' or 'element', respectively.
data Node
  = Element' !T.Text !(HM.HashMap T.Text T.Text) ![Node]
    -- ^ Element node: name, attributes and children, in that order. This raw
    -- constructor performs no checks on its own; 'element' is the validating
    -- way of building one.
  | Text !T.Text
    -- ^ Text node carrying its textual content.
  deriving (Eq, Show)

-- | Match-only view of an element node, exposing its name, attributes and
-- children. The synonym is unidirectional, so it cannot be used as an
-- expression; use 'element' to construct element nodes.
pattern Element :: T.Text -> HM.HashMap T.Text T.Text -> [Node] -> Node
pattern Element name attrs kids <- Element' name attrs kids

-- | A string literal denotes a 'Text' node holding that string.
instance IsString Node where
  fromString str = Text (T.pack str)
  {-# INLINABLE fromString #-}

-- | Merges every run of adjacent 'Text' nodes into a single 'Text' node,
-- leaving element nodes untouched. Only the given list is processed; the
-- children of element nodes are not visited.
normalize :: [Node] -> [Node]
{-# INLINE normalize #-}
normalize (Text a : Text b : rest) = normalize (Text (a <> b) : rest)
normalize (node : rest) = node : normalize rest
normalize [] = []

-- | Validating constructor for element nodes. The children are passed through
-- 'normalize' before being stored.
element
  :: T.Text
  -- ^ Element name.
  -> HM.HashMap T.Text T.Text
  -- ^ Attributes.
  -> [Node]
  -- ^ Children.
  -> Either String Node
  -- ^ Returns 'Left' with a description of the problem if the element name or
  -- any attribute name is blank or has surrounding whitespace. Attribute
  -- values are not inspected.
  --
  -- TODO: Only blankness and surrounding whitespace are checked currently.
element t0 hm0 ns0 =
  checkElementName *> checkAttributeNames *> Right built
  where
    built :: Node
    built = Element' t0 hm0 (normalize ns0)

    -- The whitespace check runs before the blankness check, so a name made
    -- only of whitespace is reported as having surrounding whitespace.
    checkElementName :: Either String ()
    checkElementName = do
      guarde (T.strip t0 == t0)
        ("Element' name has surrounding whitespace: " ++ show t0)
      guarde (not (T.null t0))
        ("Element' name is blank: " ++ show t0)

    checkAttributeNames :: Either String ()
    checkAttributeNames = for_ (HM.keys hm0) checkAttributeName

    checkAttributeName :: T.Text -> Either String ()
    checkAttributeName k = do
      guarde (T.strip k == k)
        ("Attribute name has surrounding whitespace: " ++ show k)
      guarde (not (T.null k))
        ("Attribute name is blank: " ++ show k)

-- | @'guarde' ok msg@ is @'Right' ()@ when @ok@ holds and @'Left' msg@
-- otherwise.
guarde :: Bool -> String -> Either String ()
{-# INLINE guarde #-}
guarde ok msg
  | ok = Right ()
  | otherwise = Left msg

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Parsing

-- | Types that can be parsed from an XML fragment by means of a 'Parser'.
class FromXml a where
  -- | Parses an XML fragment into a value of type @a@.
  --
  -- If a 'ToXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  fromXml :: Parser a

-- | XML parser monad. To be run with 'runParser'.
--
-- You can build a 'Parser' using 'pElement', 'pAttr', 'pAttrs', 'pText',
-- 'pRead', 'pEndOfInput', or any of the 'Applicative', 'Alternative' or
-- 'Monad' combinators.
--
-- Internally, a parser is a function from the current parsing state to either
-- an error message or a result paired with the remaining state.
newtype Parser a = Parser { unParser :: S -> Either String (a, S) }
  deriving (Functor)

-- | Runs a parser on an XML fragment. If the parser fails, a 'String' with an
-- error message is returned.
--
-- The top-level nodes are passed through 'normalize' before parsing starts.
-- Whatever state remains once the parser succeeds is discarded, so unconsumed
-- input is not an error unless the parser itself checks for it (see
-- 'pEndOfInput').
runParser :: Parser a -> [Node] -> Either String a
runParser p0 ns = fst <$> unParser p0 (STop (normalize ns))

-- | Internal parser state: what is left to be consumed at the current
-- position.
data S
  = STop ![Node]
    -- ^ Parsing the top-level nodes. Holds the nodes not yet consumed.
  | SReg !T.Text !(HM.HashMap T.Text T.Text) ![Node]
    -- ^ Parsing a particular root element. Holds the element name, the
    -- attributes not yet consumed, and the children not yet consumed.

-- | 'pure' leaves the state untouched; '<*>' runs the left parser first and
-- feeds the state it leaves behind to the right parser. The first failure
-- aborts the whole computation.
instance Applicative Parser where
  {-# INLINE pure #-}
  pure x = Parser $ \st -> Right (x, st)
  {-# INLINE (<*>) #-}
  pf <*> px = Parser $ \st0 ->
    case unParser pf st0 of
      Left err -> Left err
      Right (g, st1) ->
        case unParser px st1 of
          Left err -> Left err
          Right (x, st2) -> Right (g x, st2)

-- | Sequencing threads the parser state from left to right and stops at the
-- first failure.
instance Monad Parser where
  {-# INLINE return #-}
  return = pure
  {-# INLINE (>>=) #-}
  pa >>= k = Parser $ \st0 ->
    case unParser pa st0 of
      Left err -> Left err
      Right (a, st1) -> unParser (k a) st1
  -- NOTE(review): 'fail' is only a method of 'Monad' on older @base@
  -- releases; confirm the supported GHC range before changing this.
  fail msg = Parser (const (Left msg))

-- | 'fail' produces a parser that always fails with the given message,
-- regardless of the state.
instance MonadFail Parser where
  fail msg = Parser (const (Left msg))

-- | Backtracks: when the left parser fails, the right parser is run from the
-- very same state the left one started with, and the left error message is
-- dropped. 'empty' always fails with the message @"empty"@.
instance Alternative Parser where
  {-# INLINE empty #-}
  empty = Parser (const (Left "empty"))
  {-# INLINE (<|>) #-}
  pa <|> pb = Parser $ \st ->
    case unParser pa st of
      Left _ -> unParser pb st
      ok -> ok

-- | Backtracks, exactly like the 'Alternative' instance it delegates to.
instance MonadPlus Parser where
  {-# INLINE mzero #-}
  mzero = empty
  {-# INLINE mplus #-}
  mplus l r = l <|> r

--------------------------------------------------------------------------------
-- Some parsers

-- | @'pElement' "foo" p@ runs a 'Parser' @p@ inside an element node named
-- @"foo"@. This fails with a @Missing element@ message if the node at the
-- current position is not an element with that name.
--
-- Consumes the element from the parser state. Anything @p@ leaves unconsumed
-- inside the element is silently dropped; use 'pEndOfInput' at the end of @p@
-- if leftovers should be an error.
pElement :: T.Text -> Parser a -> Parser a
pElement t0 p0 = Parser $ \case
  -- Inside an element: the name to compare is the one of the first remaining
  -- child (@t'@), not the one of the enclosing element (@t@).
  SReg t as (Element' t' as' cs' : cs) | t' == t0 -> do
    (a, _) <- unParser p0 (SReg t' as' cs')
    Right (a, SReg t as cs)
  -- At the top level: look at the first remaining top-level node.
  STop (Element' t as cs : ns) | t == t0 -> do
    (a, _) <- unParser p0 (SReg t as cs)
    Right (a, STop ns)
  _ -> Left ("Missing element " ++ show t0)

-- | Returns the value of the requested attribute of the element currently
-- being parsed. An attribute that is present with an empty value yields an
-- empty text.
--
-- Fails if the attribute is missing, or if no element has been selected yet.
--
-- Consumes the attribute from the parser state.
pAttr :: T.Text -> Parser T.Text
{-# INLINABLE pAttr #-}
pAttr n = Parser $ \case
  SReg t as cs ->
    maybe
      (Left ("Missing attribute " ++ show n ++ " in element " ++ show t))
      (\v -> Right (v, SReg t (HM.delete n as) cs))
      (HM.lookup n as)
  STop _ -> Left "Before selecting an attribute, you must select an element"

-- | Returns every attribute of the current element that has not been consumed
-- yet. Attributes present with an empty value show up with an empty text as
-- their value.
--
-- Fails if no element has been selected yet.
--
-- Consumes all of the remaining attributes for this element from the parser
-- state.
pAttrs :: Parser (HM.HashMap T.Text T.Text)
{-# INLINABLE pAttrs #-}
pAttrs = Parser grab
  where
    grab :: S -> Either String (HM.HashMap T.Text T.Text, S)
    grab (SReg t as cs) = Right (as, SReg t HM.empty cs)
    grab (STop _) =
      Left "Before selecting an attribute, you must select an element"

-- | Returns the content of the text node at the current position (including
-- CDATA). Fails if the node at the current position is not a text node.
--
-- Consumes the text node from the parser state.
--
-- Law: When two consecutive calls to 'pText' are made, the first call returns
-- all of the available consecutive text, and the second call always fails.
pText :: Parser T.Text
{-# INLINABLE pText #-}
pText = Parser step
  where
    -- The law above holds only because the node lists are assumed to have
    -- gone through 'normalize', which leaves no two adjacent text nodes.
    step :: S -> Either String (T.Text, S)
    step (STop (Text x : ns)) = Right (x, STop ns)
    step (SReg t as (Text x : cs)) = Right (x, SReg t as cs)
    step _ = Left "Missing text node"

-- | Parses the given text as a value that can be 'read'.
--
-- This does not look at nor modify the parser state: the text to read is the
-- argument. On failure, the error message mentions the name of the outermost
-- type constructor of the target type, and the offending text.
pRead :: (Typeable a, Read a) => T.Text -> Parser a
pRead raw = fromAttempt (Text.Read.readMaybe (T.unpack raw))
  where
    -- The failed attempt doubles as a proxy for the target type, so that its
    -- type constructor name can be recovered for the error message.
    fromAttempt attempt = case attempt of
      Just a -> pure a
      Nothing ->
        let ty = tyConName (typeRepTyCon (typeRep attempt))
        in Parser (const (Left ("Can't read as " ++ ty ++ ": " ++ show raw)))

-- | Succeeds only if nothing is left to consume at the current position: no
-- attributes and no child nodes when inside an element, or no nodes at all
-- when at the top level. The state is left unchanged.
pEndOfInput :: Parser ()
pEndOfInput = Parser check
  where
    check :: S -> Either String ((), S)
    check s
      | isEof s = Right ((), s)
      | otherwise = Left "Not end of input yet"

-- | Whether the given state has no attributes and no nodes left to consume.
isEof :: S -> Bool
{-# INLINE isEof #-}
isEof (SReg _ as cs) = HM.null as && null cs
isEof (STop ns) = null ns

--------------------------------------------------------------------------------
-- Rendering

-- | Types that can be rendered as an XML fragment, that is, as a list of
-- 'Node's.
class ToXml a where
  -- | Renders a value of type @a@ into an XML fragment.
  --
  -- If a 'FromXml' instance for @a@ exists, then:
  --
  -- @
  -- 'runParser' 'fromXml' ('toXml' a) == 'Right' a
  -- @
  toXml :: a -> [Node]

-- | Encodes a list of XML 'Node's to an UTF8-encoded and XML-escaped
-- bytestring.
--
-- Text content and attribute values are escaped; element and attribute names
-- are written as they are. Elements without children are rendered in their
-- self-closing form, and attributes whose name is empty are left out.
encode :: [Node] -> BB.Builder
encode = foldMap encodeNode
  where
    encodeNode :: Node -> BB.Builder
    encodeNode (Text x) = encodeXmlUtf8 x
    encodeNode (Element' t as cs)
      | null cs = openTag <> "/" <> ">"
      | otherwise = openTag <> ">" <> encode cs <> "</" <> encodeUtf8 t <> ">"
      where
        -- Everything of the opening tag up to, but excluding, its terminator.
        openTag :: BB.Builder
        openTag = "<" <> encodeUtf8 t <> foldMap encodeAttr (HM.toList as)

    encodeAttr :: (T.Text, T.Text) -> BB.Builder
    encodeAttr (k, v)
      | T.null k = mempty
      | otherwise = " " <> encodeUtf8 k <> "=\"" <> encodeXmlUtf8 v <> "\""

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Node fixpoint

-- | Post-order depth-first replacement of a 'Node' and all of its children.
--
-- The children of a node are rewritten before the node itself is handed to
-- the given function. That function also receives a continuation @k@ which
-- runs the whole replacement again on whatever node it is given. In that
-- sense this works like 'fix', except the fixpoint being sought is that of
-- the individual nodes, not that of the root.
--
-- For example, the following function renames every node named @"w"@ to @"y"@,
-- and every node named @"y"@ to @"z"@. It does so by first renaming @"w"@
-- nodes to @"x"@ and then using @k@ to further rename all @"x"@ nodes
-- (including the ones just created) to @"y"@. After renaming an @"x"@ node to
-- @"y"@ the recursion stops (i.e., @k@ is not used), so the new @"y"@ nodes
-- are not renamed to @"z"@. Nodes that were named @"y"@ to begin with are
-- renamed to @"z"@, though.
--
-- The example replaces one node with exactly one node, but a node can be
-- replaced with zero or more nodes, depending on the length of the resulting
-- list.
--
-- @
-- foo :: 'Node' -> ['Node']
-- foo = 'df' $ \\k -> \\case
--     'Element'' "w" as cs -> k ('Element'' "x" as cs)
--     'Element'' "x" as cs -> ['Element'' "y" as cs]
--     'Element'' "y" as cs -> k ('Element'' "z" as cs)
-- @
--
-- /WARNING/ If you call @k@ in every branch, then 'df' will never terminate.
-- Make sure the recursion stops at some point by simply returning a list of
-- nodes instead of calling @k@.
df :: ((Node -> [Node]) -> Node -> [Node]) -> Node -> [Node]
df f n = runIdentity (dfM step n)
  where
    -- Lifts the pure step function into 'Identity' so 'dfM' can drive it.
    step :: (Node -> Identity [Node]) -> Node -> Identity [Node]
    step k node = Identity (f (runIdentity . k) node)

-- | Monadic version of 'df'.
--
-- The effects of rewriting the children of a node happen before the effects
-- of rewriting the node itself. The resulting list is passed through
-- 'normalize'.
dfM :: Monad m => ((Node -> m [Node]) -> Node -> m [Node]) -> Node -> m [Node]
dfM f = go
  where
    go n0 = do
      withChildren <- traverschildren go (cursorFromNode n0)
      settled <- traverseRightSiblings go withChildren
      replaced <- traverse (f go) (cursorSiblings settled)
      pure (normalize (join replaced))

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- INTERNAL: Cursor
--
-- Most of this comes from Chris Smith's xmlhtml, BSD3 licensed
-- https://hackage.haskell.org/package/xmlhtml

-- | Zipper into a 'Node' tree: a focused node together with the siblings
-- around it and the path back to the root.
data Cursor = Cursor
  { _cursorCurrent :: !Node
    -- ^ The node the 'Cursor' is currently focused on.
  , _cursorLefts :: !(Seq Node)
    -- ^ Nodes to the left (ordered right to left).
  , _cursorRights :: !(Seq Node)
    -- ^ Nodes to the right (ordered left to right).
  , _cursorParents :: !(Seq (Seq Node, T.Text, HM.HashMap T.Text T.Text, Seq Node))
    -- ^ Parents' name, attributes, and siblings.
    -- NOTE(review): presumably the two 'Seq's are the parent's left and
    -- right siblings, respectively. The cursor functions in this module only
    -- carry this field along unchanged, so confirm before relying on it.
  }

------------------------------------------------------------------------------

-- | Replaces each child of the current node with the nodes the given function
-- returns for it, normalizing the resulting list of children. A cursor on a
-- text node is returned unchanged.
--
-- The cursor is left where it starts.
traverschildren :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverschildren #-}
traverschildren f c0
  | Element' t as cs <- _cursorCurrent c0 = do
      replaced <- traverse f cs
      pure (c0 {_cursorCurrent = Element' t as (normalize (join replaced))})
  | otherwise = pure c0

-- | Runs the given function once on every sibling to the right of the cursor,
-- replacing each of them with the (normalized) nodes returned for it.
--
-- The cursor is left in the rightmost sibling. If there was nothing to the
-- right, or if every replacement turned out empty, that is where it started.
traverseRightSiblings :: Monad m => (Node -> m [Node]) -> Cursor -> m Cursor
{-# INLINE traverseRightSiblings #-}
traverseRightSiblings f c0 = case cursorRemoveRight c0 of
   Nothing -> pure c0
   Just (n1, c1@(Cursor cur ls rs ps)) -> do
      n2s <- fmap normalize (f n1)
      -- Step onto the last replacement node instead of leaving the
      -- replacements to the right of the cursor: there they would be picked
      -- up again by the next iteration, which never terminates as long as
      -- @f@ keeps returning non-empty lists.
      traverseRightSiblings f $ case Seq.viewr (Seq.fromList n2s) of
         Seq.EmptyR -> c1
         done Seq.:> new ->
            -- '_cursorLefts' is ordered right to left, hence the reversal.
            Cursor new (Seq.reverse done <> (cur Seq.<| ls)) rs ps

-- | Builds a 'Cursor' focused on the given node, with no siblings and no
-- parents. That is, a cursor into a forest with a single root 'Node'.
cursorFromNode :: Node -> Cursor
{-# INLINE cursorFromNode #-}
cursorFromNode n = Cursor
  { _cursorCurrent = n
  , _cursorLefts = Seq.empty
  , _cursorRights = Seq.empty
  , _cursorParents = Seq.empty
  }

-- | Lists, from left to right, the 'Node's at the same level as the current
-- position of a cursor, including the current node.
cursorSiblings :: Cursor -> [Node]
{-# INLINE cursorSiblings #-}
cursorSiblings c = toList (before <> Seq.singleton (_cursorCurrent c) <> after)
  where
    -- The left siblings are stored right to left, so they are flipped back.
    before = Seq.reverse (_cursorLefts c)
    after = _cursorRights c

-- | Removes the node immediately to the right of the cursor and returns it
-- together with the updated cursor. Returns 'Nothing' if there is no node to
-- the right.
cursorRemoveRight :: Cursor -> Maybe (Node, Cursor)
{-# INLINE cursorRemoveRight #-}
cursorRemoveRight (Cursor n ls rs0 ps) =
  -- Matching on the view directly covers the empty case, so no separate
  -- emptiness check nor unreachable fallback is needed.
  case Seq.viewl rs0 of
    r Seq.:< rs -> Just (r, Cursor n ls rs ps)
    Seq.EmptyL -> Nothing

-- | Inserts the given 'Node's immediately to the right of the current
-- position, keeping their order and ahead of the siblings already there.
cursorInsertManyRight :: [Node] -> Cursor -> Cursor
{-# INLINE cursorInsertManyRight #-}
cursorInsertManyRight ns c =
  c {_cursorRights = Seq.fromList ns <> _cursorRights c}

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Miscellaneous

-- | UTF-8 encodes a text as it is, without any XML escaping.
encodeUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeUtf8 #-}
encodeUtf8 txt = T.encodeUtf8Builder txt

-- | UTF-8 encodes a text, escaping it according to 'xmlEscaped'.
encodeXmlUtf8 :: T.Text -> BB.Builder
{-# INLINE encodeXmlUtf8 #-}
encodeXmlUtf8 txt = T.encodeUtf8BuilderEscaped xmlEscaped txt

-- | Byte-level XML escaping rule: the bytes for @&@, @<@, @>@ and @"@ are
-- replaced with an entity or character reference, and every other byte is
-- written as it is.
xmlEscaped :: BBP.BoundedPrim Word8
{-# INLINE xmlEscaped #-}
xmlEscaped =
   replacing 38 (quint (38,(97,(109,(112,59))))) $   -- '&'  ->  "&amp;"
   replacing 60 (quad (38,(108,(116,59)))) $         -- '<'  ->  "&lt;"
   replacing 62 (quad (38,(103,(116,59)))) $         -- '>'  ->  "&gt;"
   replacing 34 (quint (38,(35,(51,(52,59))))) $     -- '"'  ->  "&#34;"
   BBP.liftFixedToBounded w8
 where
   -- Writes the given replacement for the given byte, and defers to the
   -- fallback rule for any other byte.
   replacing
     :: Word8
     -> BBP.BoundedPrim Word8
     -> BBP.BoundedPrim Word8
     -> BBP.BoundedPrim Word8
   replacing byte = BBP.condB (== byte)
   w8 :: BBP.FixedPrim Word8
   w8 = BBP.word8
   -- Writes the four given bytes, whatever the input byte is.
   {-# INLINE quad #-}
   quad :: (Word8, (Word8, (Word8, Word8))) -> BBP.BoundedPrim Word8
   quad bytes = BBP.liftFixedToBounded
     (const bytes BBP.>$< (w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8))
   -- Writes the five given bytes, whatever the input byte is.
   {-# INLINE quint #-}
   quint :: (Word8, (Word8, (Word8, (Word8, Word8)))) -> BBP.BoundedPrim Word8
   quint bytes = BBP.liftFixedToBounded
     (const bytes BBP.>$< (w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8 BBP.>*< w8))