First paragraph. \
-- \Second paragraph. \
-- \Third paragraph. \
-- \{-# OPTIONS_HADDOCK hide #-} module Text.HTML.Scalpel.Internal.Scrape ( Scraper (..) , scrape , attr , attrs , html , htmls , innerHTML , innerHTMLs , text , texts , chroot , chroots , matches , position ) where import Text.HTML.Scalpel.Internal.Select import Text.HTML.Scalpel.Internal.Select.Types import Control.Applicative import Control.Monad import Data.Maybe import qualified Control.Monad.Fail as Fail import qualified Data.Vector as Vector import qualified Text.HTML.TagSoup as TagSoup import qualified Text.StringLike as TagSoup -- | A value of 'Scraper' @a@ defines a web scraper that is capable of consuming -- a list of 'TagSoup.Tag's and optionally producing a value of type @a@. newtype Scraper str a = MkScraper { scrapeTagSpec :: TagSpec str -> Maybe a } instance Functor (Scraper str) where fmap f (MkScraper a) = MkScraper $ fmap (fmap f) a instance Applicative (Scraper str) where pure = MkScraper . const . Just (MkScraper f) <*> (MkScraper a) = MkScraper applied where applied tags | (Just aVal) <- a tags = ($ aVal) <$> f tags | otherwise = Nothing instance Alternative (Scraper str) where empty = MkScraper $ const Nothing (MkScraper a) <|> (MkScraper b) = MkScraper choice where choice tags | (Just aVal) <- a tags = Just aVal | otherwise = b tags instance Monad (Scraper str) where fail = Fail.fail return = pure (MkScraper a) >>= f = MkScraper combined where combined tags | (Just aVal) <- a tags = let (MkScraper b) = f aVal in b tags | otherwise = Nothing instance MonadPlus (Scraper str) where mzero = empty mplus = (<|>) instance Fail.MonadFail (Scraper str) where fail _ = mzero -- | The 'scrape' function executes a 'Scraper' on a list of -- 'TagSoup.Tag's and produces an optional value. scrape :: (TagSoup.StringLike str) => Scraper str a -> [TagSoup.Tag str] -> Maybe a scrape s = scrapeTagSpec s . tagsToSpec . TagSoup.canonicalizeTags -- | The 'chroot' function takes a selector and an inner scraper and executes -- the inner scraper as if it were scraping a document that consists solely of -- the tags corresponding to the selector. -- -- This function will match only the first set of tags matching the selector, to -- match every set of tags, use 'chroots'. chroot :: (TagSoup.StringLike str) => Selector -> Scraper str a -> Scraper str a chroot selector inner = do maybeResult <- listToMaybe <$> chroots selector inner guard (isJust maybeResult) return $ fromJust maybeResult -- | The 'chroots' function takes a selector and an inner scraper and executes -- the inner scraper as if it were scraping a document that consists solely of -- the tags corresponding to the selector. The inner scraper is executed for -- each set of tags (possibly nested) matching the given selector. -- -- > s = "
First paragraph. \
-- \Second paragraph. \
-- \Third paragraph. \
-- \@ tag -- within the @article@ tag by doing the following. -- -- @ -- chroots "article" // "p" $ do -- index <- position -- content <- text "p" -- return (index, content) -- @ -- -- Which will evaluate to the list: -- -- @ -- [ -- (0, "First paragraph.") -- , (1, "Second paragraph.") -- , (2, "Third paragraph.") -- ] -- @ position :: (TagSoup.StringLike str) => Scraper str Int position = MkScraper $ Just . tagsToPosition withHead :: (a -> b) -> [a] -> Maybe b withHead _ [] = Nothing withHead f (x:_) = Just $ f x withAll :: (a -> b) -> [a] -> Maybe [b] withAll f xs = Just $ map f xs foldSpec :: TagSoup.StringLike str => (TagSoup.Tag str -> str -> str) -> TagSpec str -> str foldSpec f = Vector.foldr' (f . infoTag) TagSoup.empty . (\(a, _, _) -> a) tagsToText :: TagSoup.StringLike str => TagSpec str -> str tagsToText = foldSpec f where f (TagSoup.TagText str) s = str `TagSoup.append` s f _ s = s tagsToHTML :: TagSoup.StringLike str => TagSpec str -> str tagsToHTML = foldSpec (\tag s -> TagSoup.renderTags [tag] `TagSoup.append` s) tagsToInnerHTML :: TagSoup.StringLike str => TagSpec str -> str tagsToInnerHTML (tags, tree, ctx) | len < 2 = TagSoup.empty | otherwise = tagsToHTML (Vector.slice 1 (len - 2) tags, tree, ctx) where len = Vector.length tags tagsToAttr :: (Show str, TagSoup.StringLike str) => str -> TagSpec str -> Maybe str tagsToAttr attr (tags, _, _) = do guard $ 0 < Vector.length tags let tag = infoTag $ tags Vector.! 0 guard $ TagSoup.isTagOpen tag return $ TagSoup.fromAttrib attr tag tagsToPosition :: TagSpec str -> Int tagsToPosition (_, _, ctx) = ctxPosition ctx