src/Text/XML/DOM/Parser/Combinators.hs

module Text.XML.DOM.Parser.Combinators
  ( -- * Generic combinators to traverse descendants
    traverseElems
  , inFilteredTrav
    -- * Using 'DomTraversable'
  , inElemTrav
  , inElem
  , inElemAll
  , inElemMay
  , inElemNe
    -- * Dive combinators
  , divePath
  , diveElem
    -- * Explicit ignoring elements
  , ignoreElem
  , ignoreEmpty
  , ignoreBlank
    -- * Checking current element properties
  , checkCurrentName
    -- * Parsing arbitrary content
  , parseContent
  , readContent
  ) where

import           Control.Lens
import           Control.Monad.Except
import           Control.Monad.Reader
import           Data.List.NonEmpty (NonEmpty(..))
import           Data.Monoid
import           Data.Text (Text)
import qualified Data.Text as T
import           Data.Traversable
import           Data.Typeable
import           Text.Read
import           Text.Shakespeare.Text (st)
import           Text.XML
import           Text.XML.DOM.Parser.Types
import           Text.XML.Lens


-- | Generic function to traverse arbitrary inner cursors.
traverseElems
  :: (Monad m, Foldable g, Traversable f)
  => ([Element] -> DomParserT g m (f ([Text], Element)))
     -- ^ Takes set of current elements and
  -> DomParserT Identity m a
     -- ^ Parser will be runned for each element found in traversable
  -> DomParserT g m (f a)
traverseElems trav parser = do
  pd <- ask
  inner <- trav $ pd ^.. pdElements . folded
  for inner $ \(subpath, e) -> do
    let newpd = ParserData
          { _pdElements = Identity e
          , _pdPath     = pd ^. pdPath <> subpath }
    magnify (to $ const newpd) parser

-- | Takes function filtering
inFilteredTrav
  :: (Monad m, Foldable g, DomTraversable f)
  => ([Element] -> ([Text], [Element]))
   -- ^ Function returning some filtered elements with path suffixes which will
   -- be appended to parser's state
  -> DomParserT Identity m a
  -> DomParserT g m (f a)
inFilteredTrav deeper = traverseElems trav
  where
    trav e = do
      let (path, elems) = deeper e
      case buildDomTraversable elems of
        Nothing -> throwParserError $ PENotFound . (<> path)
        Just tr -> return $ fmap (path,) tr

inElemTrav
  :: (Monad m, Foldable g, DomTraversable f)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (f a)
inElemTrav n = inFilteredTrav deeper
  where
    deeper = ([n],) . toListOf (folded . nodes . folded . _Element . ell n)

-- | Runs parser inside first children element with given name
inElem
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m a
inElem n = fmap runIdentity . inElemTrav n

inElemAll
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m [a]
inElemAll = inElemTrav

inElemMay
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (Maybe a)
inElemMay = inElemTrav

inElemNe
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT Identity m a
  -> DomParserT g m (NonEmpty a)
inElemNe = inElemTrav

{- | Dive given parser's current tags set into the given path. The @divePath
["a", "b"]@ differs from @inElem "a" . inElem "b"@. Namely the first variant
will not fail if occured tag "a" which does not contains tag "b". This
behaviour is desireable when you dont want to parse whole XML and just want
to pull tags in some path. The other difference is in traversing inner
elements. Consider this code

@
inElem "a" $ inElem "b" $ inElemAll "c" fromDom
@

which translates to pseudo-CSS query like: @a:nth(1) > b:nth(1) > c > fromDom@

@
divePath ["a", "b"] $ inElemAll "c" fromDom
@

which translates like: @a > b > c > fromDom@

As you can see, inElem always takes first element and runs inner parser in this
single element, unlike 'divePath' which runs inner parser @in all@ descendants
in given path.
-}

divePath
  :: forall m g a
   . (Monad m, Foldable g)
  => [Text]
  -> DomParserT [] m a
  -> DomParserT g m a
divePath path = magnify $ to modElems
  where
    modElems
      = over pdElements (toListOf $ folded . diver)
      . over pdPath (<> path)
    diver :: Fold Element Element
    diver    = foldr (.) id $ map toDive path
    toDive n = nodes . folded . _Element . ell n

diveElem
  :: (Monad m, Foldable g)
  => Text
  -> DomParserT [] m a
  -> DomParserT g m a
diveElem p = divePath [p]

-- | Ignore arbitrary current element if it conforms to predicate.
ignoreElem
  :: (Monad m)
  => (Element -> Bool)
     -- ^ Predicate checking that we must ignore some current tag. If returns
     -- true then parser will not be runned and combinator just returns Nothing.
  -> DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreElem test parser = do
  ign <- view $ pdElements . to (test . runIdentity)
  if ign then pure Nothing else Just <$> parser

-- | If current element has no children nodes does not run parser and returns
-- Nothing. Otherwise runs parser inside current element. Usefull when you got
-- XML with strange empty elements which must be just ignored, but `inElem` runs
-- parser inside of this elements which causes to parser error.
ignoreEmpty
  :: (Monad m)
  => DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreEmpty = ignoreElem test
  where
    test e = null $ e ^. nodes

-- | If all current elements contains blank content, or contains nothing at all
-- , then returns Nothing, else runs parser.
ignoreBlank
  :: (Monad m)
  => DomParserT Identity m a
  -> DomParserT Identity m (Maybe a)
ignoreBlank = ignoreElem test
  where
    test e =
      let
        elems = e ^.. nodes . folded . _Element
        cont = mconcat $ e ^.. nodes . folded . _Content
      in if | not $ null elems      -> False
            | T.null $ T.strip cont -> True
            | otherwise             -> False

-- | If name of current tag differs from first argument throws 'PENotFound' with
-- tag name replaced in last path's segment. Usefull for checking root
-- document's element name.
checkCurrentName
  :: (Monad m)
  => Text
  -> DomParserT Identity m ()
checkCurrentName n = do
  cn <- view $ pdElements . to runIdentity . localName
  unless (cn == n) $ do
    p <- view pdPath
    let pinit = if null p then [] else init p
    throwError $ ParserErrors [PENotFound $ pinit ++ [n]]
  return ()

-- | Parses content inside current tag. It expects current element set consists
-- of exactly ONE element. Throws error if current elements set contains
-- multiple of them.
parseContent
  :: (Monad m)
  => (Text -> DomParserT Identity m a)
  -> DomParserT Identity m a
parseContent parse = do
  e <- view $ pdElements . to runIdentity
  let
    nds = e ^. nodes
    els = nds ^.. folded . _Element
    conts = nds ^.. folded . _Content
  when (not $ null els) $ throwParserError PEContentNotFound
  when (null conts) $ throwParserError PEContentNotFound
  parse $ mconcat conts

readContent
  :: forall m g a
   . (Read a, Typeable a, Monad m)
  => Text
  -> DomParserT g m a
readContent t = case readMaybe $ T.unpack t of
  Nothing -> throwParserError $ PEWrongFormat [st|Not readable #{n}: #{t}|]
  Just a  -> pure a
  where
    n = show $ typeRep (Proxy :: Proxy a)