{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FunctionalDependencies #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_HADDOCK hide #-}
module Text.HTML.Scalpel.Internal.Scrape (
Scraper
, ScraperT (..)
, scrape
, scrapeT
, attr
, attrs
, html
, htmls
, innerHTML
, innerHTMLs
, text
, texts
, chroot
, chroots
, matches
, position
) where
import Text.HTML.Scalpel.Internal.Select
import Text.HTML.Scalpel.Internal.Select.Types
import Control.Applicative
import Control.Monad
import Control.Monad.Except (MonadError)
import Control.Monad.Cont (MonadCont)
import Control.Monad.Reader
import Control.Monad.State (MonadState)
import Control.Monad.Trans.Maybe
import Control.Monad.Writer (MonadWriter)
import Data.Functor.Identity
import Data.Maybe
import qualified Control.Monad.Fail as Fail
import qualified Data.Vector as Vector
import qualified Text.HTML.TagSoup as TagSoup
import qualified Text.StringLike as TagSoup
newtype ScraperT str m a = MkScraper (ReaderT (TagSpec str) (MaybeT m) a)
deriving (Functor, Applicative, Alternative, Monad, MonadPlus, MonadFix,
MonadIO, MonadCont, MonadError e, MonadState s, MonadWriter w)
#if MIN_VERSION_base(4,9,0)
deriving instance Monad m => Fail.MonadFail (ScraperT str m)
#else
instance Fail.MonadFail m => Fail.MonadFail (ScraperT str m) where
fail = lift . Fail.fail
#endif
instance MonadTrans (ScraperT str) where
lift op = MkScraper . lift . lift $ op
instance MonadReader s m => MonadReader s (ScraperT str m) where
ask = MkScraper (lift . lift $ ask)
local f (MkScraper op) = (fmap MkScraper . mapReaderT . local) f op
type Scraper str = ScraperT str Identity
scrapeTagSpec :: ScraperT str m a -> TagSpec str -> m (Maybe a)
scrapeTagSpec (MkScraper r) = runMaybeT . runReaderT r
scrapeT :: (TagSoup.StringLike str)
=> ScraperT str m a -> [TagSoup.Tag str] -> m (Maybe a)
scrapeT s = scrapeTagSpec s . tagsToSpec . TagSoup.canonicalizeTags
scrape :: (TagSoup.StringLike str)
=> Scraper str a -> [TagSoup.Tag str] -> Maybe a
scrape = fmap runIdentity . scrapeT
chroot :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m a -> ScraperT str m a
chroot selector inner = do
maybeResult <- listToMaybe <$> chroots selector inner
guard (isJust maybeResult)
return $ fromJust maybeResult
chroots :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m a -> ScraperT str m [a]
chroots selector (MkScraper (ReaderT inner)) =
MkScraper $ ReaderT $ \tags -> MaybeT $ do
mvalues <- forM (select selector tags) (runMaybeT . inner)
return $ Just $ catMaybes mvalues
matches :: (TagSoup.StringLike str, Monad m) => Selector -> ScraperT str m ()
matches s = MkScraper $ (guard . not . null) =<< reader (select s)
text :: (TagSoup.StringLike str, Monad m) => Selector -> ScraperT str m str
text s = MkScraper $ withHead tagsToText =<< reader (select s)
texts :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m [str]
texts s = MkScraper $ withAll tagsToText =<< reader (select s)
html :: (TagSoup.StringLike str, Monad m) => Selector -> ScraperT str m str
html s = MkScraper $ withHead tagsToHTML =<< reader (select s)
htmls :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m [str]
htmls s = MkScraper $ withAll tagsToHTML =<< reader (select s)
innerHTML :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m str
innerHTML s = MkScraper $ withHead tagsToInnerHTML =<< reader (select s)
innerHTMLs :: (TagSoup.StringLike str, Monad m)
=> Selector -> ScraperT str m [str]
innerHTMLs s = MkScraper $ withAll tagsToInnerHTML =<< reader (select s)
attr :: (Show str, TagSoup.StringLike str, Monad m)
=> String -> Selector -> ScraperT str m str
attr name s = MkScraper $ ReaderT $ MaybeT
. return . listToMaybe . catMaybes
. fmap (tagsToAttr $ TagSoup.castString name) . select s
attrs :: (Show str, TagSoup.StringLike str, Monad m)
=> String -> Selector -> ScraperT str m [str]
attrs name s = MkScraper $ ReaderT $ MaybeT
. return . Just . catMaybes
. fmap (tagsToAttr nameStr) . select s
where nameStr = TagSoup.castString name
position :: (TagSoup.StringLike str, Monad m) => ScraperT str m Int
position = MkScraper $ reader tagsToPosition
withHead :: Monad m => (a -> b) -> [a] -> ReaderT (TagSpec str) (MaybeT m) b
withHead _ [] = empty
withHead f (x:_) = return $ f x
withAll :: Monad m => (a -> b) -> [a] -> ReaderT (TagSpec str) (MaybeT m) [b]
withAll f xs = return $ map f xs
foldSpec :: TagSoup.StringLike str
=> (TagSoup.Tag str -> str -> str) -> TagSpec str -> str
foldSpec f = Vector.foldr' (f . infoTag) TagSoup.empty . (\(a, _, _) -> a)
tagsToText :: TagSoup.StringLike str => TagSpec str -> str
tagsToText = foldSpec f
where
f (TagSoup.TagText str) s = str `TagSoup.append` s
f _ s = s
tagsToHTML :: TagSoup.StringLike str => TagSpec str -> str
tagsToHTML = foldSpec (\tag s -> TagSoup.renderTags [tag] `TagSoup.append` s)
tagsToInnerHTML :: TagSoup.StringLike str => TagSpec str -> str
tagsToInnerHTML (tags, tree, ctx)
| len < 2 = TagSoup.empty
| otherwise = tagsToHTML (Vector.slice 1 (len - 2) tags, tree, ctx)
where len = Vector.length tags
tagsToAttr :: (Show str, TagSoup.StringLike str)
=> str -> TagSpec str -> Maybe str
tagsToAttr tagName (tags, _, _) = do
guard $ 0 < Vector.length tags
let tag = infoTag $ tags Vector.! 0
guard $ TagSoup.isTagOpen tag
return $ TagSoup.fromAttrib tagName tag
tagsToPosition :: TagSpec str -> Int
tagsToPosition (_, _, ctx) = ctxPosition ctx