-- Text/XML/Cursor.hs

-- | This module provides for simple DOM traversal. It is inspired by XPath. There are two central concepts here:
--
-- * A 'Cursor' represents a node in the DOM. It also contains information on the node's /location/. While the 'Node' datatype will only know of its children, a @Cursor@ knows about its parent and siblings as well. (The underlying mechanism allowing this is called a zipper, see <http://www.haskell.org/haskellwiki/Zipper> and <http://www.haskell.org/haskellwiki/Tying_the_Knot>.)
--
-- * An 'Axis', in its simplest form, takes a @Cursor@ and returns a list of @Cursor@s. It is used for selections, such as finding children, ancestors, etc. Axes can be chained together to express complex rules, such as all children named /foo/.
--
-- The terminology used in this module is taken directly from the XPath
-- specification: <http://www.w3.org/TR/xpath/>. For those familiar with XPath,
-- the one major difference is that attributes are not considered nodes in this
-- module.
module Text.XML.Cursor
    (
    -- * Data types
      Cursor
    , Axis
    -- * Production
    , fromDocument
    , fromNode
    , cut
    -- * Axes
    , parent
    , CG.precedingSibling
    , CG.followingSibling
    , child
    , node
    , CG.preceding
    , CG.following
    , CG.ancestor
    , descendant
    , orSelf
      -- ** Filters
    , check
    , checkNode
    , checkElement
    , checkName
    , anyElement
    , element
    , laxElement
    , content
    , attribute
    , laxAttribute
    , hasAttribute
    , attributeIs
    -- * Operators
    , (CG.&|)
    , (CG.&/)
    , (CG.&//)
    , (CG.&.//)
    , (CG.$|)
    , (CG.$/)
    , (CG.$//)
    , (CG.$.//)
    , (CG.>=>)
    -- * Type classes
    , Boolean(..)
    -- * Error handling
    , force
    , forceM
    ) where

import           Control.Monad
import           Data.Function                (on)
import           Text.XML
import qualified Control.Failure              as F
import qualified Data.Text                    as T
import qualified Text.XML.Cursor.Generic      as CG
import           Text.XML.Cursor.Generic      (node, child, parent, descendant, orSelf)
import           Data.Maybe                   (maybeToList)

-- TODO: Consider [Cursor] -> [Cursor]?
-- | An axis: a function taking a 'Cursor' and returning a list of 'Cursor's.
-- Axes are roughly modeled after <http://www.w3.org/TR/xpath/#axes>.
--
-- Axes can be composed with '>=>': @f >=> g@ applies the @g@ axis to every
-- result of the @f@ axis and joins all the results together.
-- Because 'Axis' is just a type synonym for @Cursor -> [Cursor]@, other
-- standard functions such as '>>=' or 'concatMap' can be used in a similar way.
--
-- The operators '&|', '&/', '&//' and '&.//' combine two axes so that the
-- second axis works on, respectively, the context nodes, the children, the
-- descendants, or the context nodes together with their descendants, of the
-- results of the first axis.
--
-- The operators '$|', '$/', '$//' and '$.//' apply an axis (the right-hand
-- side) to a cursor so that it is applied to, respectively, the cursor itself,
-- its children, its descendants, or the cursor itself together with its
-- descendants.
--
-- Note that many of these operators also work on /generalised axes/ that
-- return lists of something other than 'Cursor's, for example the text values
-- returned by 'content'.
type Axis = Cursor -> [Cursor]

-- XPath axes as in http://www.w3.org/TR/xpath/#axes

-- TODO: Decide whether to use an existing package for this
-- | Values that can be read as a truth value in a predicate check.
--
-- A 'Bool' stands for itself, a list is true when it is non-empty, a 'Maybe'
-- is true when it is 'Just', and an 'Either' is true when it is 'Right'.
class Boolean a where
    -- | Interpret the value as a 'Bool'.
    bool :: a -> Bool

instance Boolean Bool where
    bool b = b

instance Boolean [a] where
    bool xs = case xs of
        [] -> False
        _  -> True

instance Boolean (Maybe a) where
    bool = maybe False (const True)

instance Boolean (Either a b) where
    bool = either (const False) (const True)

-- | A cursor over XML: the generic 'CG.Cursor' specialised to 'Node'.
-- It contains an XML 'Node' and pointers to its children, ancestors and siblings.
type Cursor = CG.Cursor Node

-- | Detach a cursor from its parent by rebuilding a fresh cursor from the
-- current node alone (see 'fromNode'). This makes it possible to restrict the
-- scope of later queries to that node.
cut :: Cursor -> Cursor
cut cursor = fromNode (CG.node cursor)

-- | Build a 'Cursor' from a 'Document'. The resulting cursor points to the
-- document's root element.
fromDocument :: Document -> Cursor
fromDocument doc = fromNode (NodeElement (documentRoot doc))

-- | Build a 'Cursor' (without parents) from a 'Node'.
fromNode :: Node -> Cursor
fromNode = CG.toCursor childNodes
  where
    -- Only elements carry child nodes; every other kind of node has none.
    childNodes n = case n of
        NodeElement (Element _ _ kids) -> kids
        _                              -> []

-- | Keep the cursor only if the given check holds for it; otherwise return
-- nothing. The result of the check is interpreted through 'bool'.
check :: Boolean b => (Cursor -> b) -> Axis
check f c = [c | bool (f c)]

-- | Keep the cursor only if the given check holds for the 'Node' it points to.
checkNode :: Boolean b => (Node -> b) -> Axis
checkNode f = check (f . node)

-- | Keep the cursor only if it points to an element for which the given check
-- holds. Cursors pointing to non-element nodes are always dropped.
checkElement :: Boolean b => (Element -> b) -> Axis
checkElement f c
    | NodeElement e <- node c, bool (f e) = [c]
    | otherwise                           = []

-- | Keep the cursor only if it points to an element whose 'Name' passes the
-- given check. Cursors pointing to non-element nodes are always dropped.
checkName :: Boolean b => (Name -> b) -> Axis
checkName f = checkElement (f . elementName)

-- | Keep only cursors that point to elements. Compare roughly to XPath:
-- /A node test * is true for any node of the principal node type. For example, child::* will select all element children of the context node [...]/.
anyElement :: Axis
anyElement = checkElement (\_ -> True)

-- | Keep only elements whose tag name equals the given 'Name'. XPath:
-- /A node test that is a QName is true if and only if the type of the node (see [5 Data Model]) is the principal node type and has an expanded-name equal to the expanded-name specified by the QName./
element :: Name -> Axis
element n = checkName (\name -> name == n)

-- | Select only those elements with a loosely matching tag name: only the
-- local part of the element name is compared with the given text, and both
-- sides are case-folded first, so namespace and case are ignored.
--
-- This is a relaxed variant of the XPath QName node test implemented by
-- 'element', which requires the full names to be equal.
laxElement :: T.Text -> Axis
laxElement n = checkName matches
  where
    -- Fold the requested name once so the work is shared by every node the
    -- resulting axis is applied to, instead of being redone per node.
    folded = T.toCaseFold n
    matches name = folded == T.toCaseFold (nameLocalName name)

-- | Select only text nodes, returning their text directly: a singleton list
-- holding the text if the cursor points to a 'NodeContent', and an empty list
-- otherwise. XPath:
-- /The node test text() is true for any text node./
--
-- Note that this is not strictly an 'Axis', but will work with most combinators.
content :: Cursor -> [T.Text]
content c
    | NodeContent txt <- node c = [txt]
    | otherwise                 = []

-- | Select the attribute with the given name on the current element (or nothing if it is not an element). XPath:
-- /the attribute axis contains the attributes of the context node; the axis will be empty unless the context node is an element/
--
-- Note that this is not strictly an 'Axis', but will work with most combinators.
--
-- The result holds at most one value: the value of the first attribute whose
-- name equals the given 'Name', or nothing if there is no such attribute.
attribute :: Name -> Cursor -> [T.Text]
attribute n c
    | NodeElement e <- node c = maybeToList (lookup n (elementAttributes e))
    | otherwise               = []

-- | Select attributes on the current element (or nothing if it is not an element). Namespace and case are ignored: only the
-- local part of each attribute name is compared with the given text, after
-- case-folding both sides. XPath:
-- /the attribute axis contains the attributes of the context node; the axis will be empty unless the context node is an element/
--
-- Note that this is not strictly an 'Axis', but will work with most combinators.
--
-- The result contains the value of every matching attribute, in the order in
-- which they appear in 'elementAttributes'.
laxAttribute :: T.Text -> Cursor -> [T.Text]
laxAttribute n = \c ->
    case node c of
        NodeElement e ->
            [ v
            | (key, v) <- elementAttributes e
            , folded == T.toCaseFold (nameLocalName key)
            ]
        _ -> []
  where
    -- Fold the requested name once instead of once per attribute inspected;
    -- binding it outside the lambda also shares it across cursors when the
    -- partially applied function is reused.
    folded = T.toCaseFold n

-- | Keep only element nodes that carry an attribute with the given name,
-- whatever its value. Non-element nodes are dropped.
hasAttribute :: Name -> Axis
hasAttribute n = checkElement (lookup n . elementAttributes)

-- | Keep only element nodes on which the given attribute name maps to exactly
-- the given value. Non-element nodes are dropped.
attributeIs :: Name -> T.Text -> Axis
attributeIs n v = checkElement (\e -> Just v == lookup n (elementAttributes e))

-- | Take the first result of a list, or fail with the supplied error value
-- (via 'F.failure') when the list is empty. Any further results are ignored.
force :: F.Failure e f => e -> [a] -> f a
force e results = case results of
    []      -> F.failure e
    (x : _) -> return x

-- | Like 'force', but the list holds actions: return the first action as is,
-- or fail with the supplied error value (via 'F.failure') when the list is
-- empty. Any further actions are ignored.
forceM :: F.Failure e f => e -> [f a] -> f a
forceM e actions = case actions of
    []           -> F.failure e
    (action : _) -> action