module Text.XML.DOM.Parser ( -- * Parser internals ParserData(..) , pdCursor, pdAxis, pdPath , ParserError(..) , pePath, peDetails , ParserErrors(..) , throwParserError , renderPath -- * Parser itself , DomParser , runDomParser , ContentParser -- * Common parsers , unitFromDom , voidFromDom , textFromContent , stringFromContent , charFromContent , intFromContent , integerFromContent , doubleFromContent , fixedFromContent , boolFromContent -- * Parser classes , FromDom(..) , FromContent(..) -- * Combinators , inElem, inElems, nonEmptyInElems, maybeInElem, inElemsPred , inAxis, inDescendants, inTags -- * Content getters , tryCurrentContent, tryCurrentContentText , currentContent , tryCurrentName , currentName , tryCurrentAttr , currentAttr , elemContent, nonEmptyElemsContent, elemsContent, maybeElemContent -- * Helpers , fromContentR , CurrentContent(..) -- * Raw node getters , currentNodes -- * Checkers , checkCurrentLaxName, checkCurrentName ) where import Control.Applicative import Control.Exception import Control.Lens import Control.Monad import Control.Monad.Except import Control.Monad.Reader.Class import Control.Monad.Trans.Except import Control.Monad.Trans.Reader (Reader, runReader) import Data.Fixed import Data.Functor.Compose import qualified Data.List as L import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.List.NonEmpty as NE import Data.Maybe import Data.Text (Text) import qualified Data.Text as T import Data.Traversable import Data.Typeable import Data.Void import GHC.Generics (Generic) import Text.Read (readMaybe) import Text.Shakespeare.Text (st) import Text.XML import Text.XML.Cursor -- | DOM parser error description. data ParserError -- | Tag not found which should be. = PENotFound { _pePath :: ![Text] } -- | Tag contents has wrong format, (could not read text to value) | PEWrongFormat { _pePath :: ![Text] -- ^ path of element , _peDetails :: Text } -- | Such tag name is not expected in this place | PEWrongTagName { _pePath :: ![Text] , _peDetails :: !Text } -- | Node is not an element but should be | PENotElement { _pePath :: ![Text] } -- | Node should have text content, but it does not. | PEContentNotFound { _pePath :: ![Text] } -- | Any other error | PEOther { _pePath :: ![Text] , _peDetails :: !Text } deriving (Eq, Ord, Show, Generic) makeLenses ''ParserError makePrisms ''ParserError instance Exception ParserError data ParserData = ParserData { _pdCursor :: !Cursor -- ^ Cursor to current parser's environment , _pdAxis :: !Axis -- ^ Context axis to follow deeper , _pdPath :: ![Text] -- ^ Path for errors } deriving (Generic) makeLenses ''ParserData newtype ParserErrors = ParserErrors { unParserErrors :: [ParserError] } deriving (Ord, Eq, Show, Generic) makeWrapped ''ParserErrors instance Exception ParserErrors -- | Parser monad where all parsing actions live type DomParser = ExceptT [ParserError] (Reader ParserData) -- | Content parser type. Parser is just a function taking Text and -- returning either error description or successfully parsed value. type ContentParser a = Text -> Either Text a -- | Render path for showing error renderPath :: [Text] -> String renderPath [] = "document" renderPath path = T.unpack $ mconcat $ L.intersperse ">" path throwParserError :: ([Text] -> ParserError) -> DomParser a throwParserError mkerr = do pd <- ask let err = mkerr $ pd ^. pdPath throwE [err] nodeName :: Node -> Maybe Name nodeName (NodeElement el) = Just $ elementName el nodeName _ = Nothing -- | Run parser on root element of Document. runDomParser :: Document -> DomParser a -> Either [ParserError] a runDomParser doc par = let cur = fromDocument doc Just root = nameLocalName <$> nodeName (node cur) pd = ParserData { _pdCursor = cur , _pdAxis = pure , _pdPath = [root] } in runReader (runExceptT par) pd -- | Helper function, throws 'PENotFound' if second argument is empty -- list takeFirstElem :: [Text] -> [a] -> DomParser a takeFirstElem path [] = throwE $ [PENotFound path] takeFirstElem _ (a:_) = return a -- | Find first element with given name in current element and run -- parser inside of found element. Throws PENotFound error if element -- not found. inElem :: Text -> DomParser a -> DomParser a inElem name p = do pd <- ask let newpath = (pd ^. pdPath) ++ [name] newcur <- takeFirstElem newpath $ (pd ^. pdCursor) $/ (pd ^. pdAxis) >=> (laxElement name) let newdata = ParserData { _pdCursor = newcur , _pdAxis = pure , _pdPath = newpath } local (const newdata) p -- | Generic elements combinator. Takes predicate filtering/converting -- list of cursors to some traversable (with posible filtering and/or -- reordering) inElemsPred :: (Traversable f) => ([Cursor] -> f Cursor) -- ^ Some predicate like 'listToMaybe' -> Text -- ^ Name of tags to find in current tag -> DomParser a -- ^ Parser to run inside found cursors -> DomParser (f a) inElemsPred cpred name p = do pd <- ask let newpath = (pd ^. pdPath) ++ [name] curs = cpred $ (pd ^. pdCursor) $/ (pd ^. pdAxis) >=> (laxElement name) for curs $ \cur -> do let newdata = ParserData { _pdCursor = cur , _pdAxis = pure , _pdPath = newpath } local (const newdata) p -- | Find all elements with gievn name in current element and run -- parser inside of this elements. inElems :: Text -> DomParser a -> DomParser [a] inElems = inElemsPred id nonEmptyInElems :: Text -> DomParser a -> DomParser (NonEmpty a) nonEmptyInElems name parser = (getCompose <$> inElemsPred (Compose . NE.nonEmpty) name parser) >>= maybe (throwParserError PENotFound) return -- | Try to find element with given name and run parser inside of -- it. If not found return Nothing maybeInElem :: Text -> DomParser a -> DomParser (Maybe a) maybeInElem = inElemsPred listToMaybe -- | Run parser within axis context. Expected to not use directly. inAxis :: [Text] -- ^ Path suffix to append to path before run parser -> Axis -- ^ Axis to append to context -> DomParser a -- ^ Parser to run -> DomParser a inAxis pathsuff axis parser = do local ( over pdAxis (>=> axis) . over pdPath (++ pathsuff) ) parser -- | Given parser will match inside specific inTags :: [Text] -- ^ Sequence of tag names parser must -- match inside -> DomParser a -- ^ Parser to run -> DomParser a inTags names p = let axis = foldr (&/) pure $ map laxElement names in inAxis names axis p -- | Given parser will match tag in arbitrary deepness inDescendants :: DomParser a -> DomParser a inDescendants = inAxis ["*"] descendant -- | Return the name of current cursor we staying in. Return 'Nothing' -- if we are not staying on element node tryCurrentName :: DomParser (Maybe Name) tryCurrentName = do pd <- ask return $ nodeName $ node $ pd ^. pdCursor -- | Return name of current element the parser in. currentName :: DomParser Name currentName = tryCurrentName >>= \case Nothing -> throwParserError PENotElement Just name -> return name -- | Run predicate with current tag name. Parser fails if predicate -- returned (Just msg) or node is not an element. checkCurrentName :: (Name -> Maybe Text) -- ^ name checking predicate -> DomParser () checkCurrentName predicate = do n <- currentName case predicate n of Nothing -> return () Just msg -> throwParserError $ \p -> PEWrongTagName p msg -- | Throw 'PEWrongTagName' if name of current element does not match -- with given. checkCurrentLaxName :: Text -> DomParser () checkCurrentLaxName name = let msg = [st|Expected tag name: #{name}|] predicate n = if (nameLocalName n == name) then Nothing else Just msg in checkCurrentName predicate -- | Get concatenated text from current parser's node(s). If current -- context have no @Content@ nodes then return Nothing. tryCurrentContentText :: DomParser (Maybe Text) tryCurrentContentText = do pd <- ask let txt = (pd ^. pdCursor) $/ (pd ^. pdAxis) >=> content return $ case txt of [] -> Nothing x -> Just $ mconcat x tryCurrentContent :: ContentParser a -> DomParser (Maybe a) tryCurrentContent cparse = do tmay <- tryCurrentContentText for tmay $ \t -> case cparse t of Left err -> throwParserError $ \p -> PEWrongFormat p err Right a -> pure a currentContent :: ContentParser a -> DomParser a currentContent cparse = tryCurrentContent cparse >>= maybe (throwParserError PEContentNotFound) return elemContent :: Text -> ContentParser a -> DomParser a elemContent name cparse = inElem name $ currentContent cparse nonEmptyElemsContent :: Text -> ContentParser a -> DomParser (NonEmpty a) nonEmptyElemsContent name cparse = (NE.nonEmpty <$> elemsContent name cparse) >>= maybe (throwParserError PENotFound) return elemsContent :: Text -> ContentParser a -> DomParser [a] elemsContent name cparse = fmap catMaybes $ inElems name $ tryCurrentContent cparse maybeElemContent :: Text -> ContentParser a -> DomParser (Maybe a) maybeElemContent name cparse = fmap join $ maybeInElem name $ tryCurrentContent cparse -- | Take attribute from current node (if it is an element). Throws -- 'PENotFound' or 'PENotElement' currentAttr :: Text -> DomParser Text currentAttr aname = do pd <- ask let newpath = (pd ^. pdPath) ++ [mappend "attribute " aname] takeFirstElem newpath $ laxAttribute aname (pd ^. pdCursor) tryCurrentAttr :: Text -> DomParser (Maybe Text) tryCurrentAttr aname = do pd <- ask return $ listToMaybe $ laxAttribute aname $ pd ^. pdCursor -- | Always successfully parses any DOM to @()@ unitFromDom :: DomParser () unitFromDom = pure () -- | Never parses successfully. It is just 'mzero' voidFromDom :: DomParser Void voidFromDom = mzero -- | Does not strip content. Returns content unmodified. textFromContent :: ContentParser Text textFromContent = Right -- | Does not strip content. Returns content unmodified. stringFromContent :: ContentParser String stringFromContent = Right . T.unpack -- | Expects content to be a singe non-blank character. Blank characters -- are stripped to parse pretty-printed XML files. charFromContent :: ContentParser Char charFromContent t = case T.unpack $ T.strip t of [a] -> Right a x -> let msg = [st|Tag sould contain exactly one char, but it contains: #{x}|] in Left msg intFromContent :: ContentParser Int intFromContent = fromContentR integerFromContent :: ContentParser Integer integerFromContent = fromContentR doubleFromContent :: ContentParser Double doubleFromContent = fromContentR fixedFromContent :: (HasResolution a, Typeable a) => ContentParser (Fixed a) fixedFromContent = fromContentR -- | Expects content to be y, yes, t, true or 1 for True value. n, no, -- f, false or 0 for False value. Case is not significant, blank -- characters are striped. boolFromContent :: ContentParser Bool boolFromContent t = let lowt = T.toLower $ T.strip t tvals = ["y", "yes", "t", "true", "1"] fvals = ["n", "no", "f", "false", "0"] in if | lowt `elem` tvals -> return True | lowt `elem` fvals -> return False | otherwise -> let msg = [st|Could not read "#{t}" as Bool|] in Left msg -- | Typeclass for structures which may be parsed from XML -- DOM. Usually you should pass parsing function explicitly to -- combinators like 'inElem', 'maybeInElem' or 'inTags' , but -- sometimes you need term search. Especially when you try to parse -- polymorphic types. Or you maybe generate parser with TH for your -- types, so typeclass would be convenient also. class FromDom a where fromDom :: DomParser a instance FromDom () where fromDom = unitFromDom -- | Usually you should pass 'ContentParser' to combinators like -- 'elemContent' or 'maybeElemContent' explicitly. But sometimes you -- need term search. Especially for code generated with TH. class FromContent a where -- | Should return either error message (what was wrong) or parsed -- value fromContent :: ContentParser a instance FromContent Text where fromContent t = Right t instance FromContent String where fromContent t = pure $ T.unpack t instance FromContent Char where fromContent = charFromContent fromContentR :: forall a. (Read a, Typeable a) => Text -> Either Text a fromContentR t = case readMaybe $ T.unpack $ T.strip t of Nothing -> let name = typeRep (Proxy :: Proxy a) in Left [st|Unreadable #{show name}: #{t}|] Just x -> Right x instance FromContent Int where fromContent = intFromContent instance FromContent Integer where fromContent = integerFromContent instance FromContent Double where fromContent = doubleFromContent instance (HasResolution a, Typeable a) => FromContent (Fixed a) where fromContent = fixedFromContent -- | This isntance might be not very obvious but anyway instance FromContent Bool where fromContent = boolFromContent -- | Helper newtype returning 'currentContent' for any type with -- instance 'FromContent' newtype CurrentContent a = CurrentContent { unCurrentContent :: a } deriving (Ord, Eq, Show, Generic) instance (FromContent a) => FromDom (CurrentContent a) where fromDom = CurrentContent <$> currentContent fromContent -- | Get children nodes from current parser's node. currentNodes :: DomParser [Node] currentNodes = do pd <- ask let curs = (pd ^. pdCursor) $/ (pd ^. pdAxis) return $ fmap node curs