{-# LANGUAGE OverloadedStrings #-} module Network.Scraper.State ( get, runScraper, InpFilter (..), FormAttr(..), Scraper, getCurrentCursor, toCursor, isDisplayed, hasDisplayNone, getFormBy, getCurrentHtml, fillForm, hasHide, getInputs, postToForm, printFormNames ) where import Control.Applicative import Control.Arrow ((***)) import Control.Lens ((^.)) import Control.Monad import Control.Monad.IO.Class (liftIO) import Control.Monad.Trans.Class import qualified Control.Monad.Trans.State.Strict as ST import qualified Data.ByteString.Lazy as LBS import qualified Data.Map as M import Data.Maybe (fromJust, fromMaybe, isJust, listToMaybe) import Data.Monoid import qualified Data.Text as T import Data.Text.Encoding (encodeUtf8) import qualified Data.Text.IO as TIO import Network.URL import Network.Wreq (FormParam (..)) import qualified Network.Wreq as Wreq import Network.Wreq.Session (Session (..), withSession) import qualified Network.Wreq.Session as Sesh import Network.Wreq.Types import Safe import Text.HTML.DOM (parseLBS) import Text.XML.Cursor import qualified Text.XML.Cursor.Generic as CG data ScraperState = PS { currentOptions :: Wreq.Options , currentHtml :: LBS.ByteString , currentCursor :: Maybe Cursor , currentSession :: Session , currentURL :: Maybe URL } deriving (Show) type Scraper = ST.StateT ScraperState IO toCursor = fromDocument . parseLBS withInitialState :: (ScraperState -> IO a) -> IO a withInitialState callback = withSession $ \s -> do let initialState = PS { currentOptions = Wreq.defaults , currentHtml = ("" :: LBS.ByteString) , currentCursor = Nothing , currentSession = s , currentURL = Nothing } callback initialState runScraper :: Scraper a -> IO a runScraper k = withInitialState (evalScraperWith k) evalScraperWith :: Scraper a -> ScraperState -> IO a evalScraperWith k s = ST.evalStateT k s -- TODO: Move somewhere else??? setCurrentOptions :: Wreq.Options -> Scraper () setCurrentOptions o = do scraper <- ST.get ST.put $ scraper { currentOptions = o } setCurrentURL :: Maybe URL -> Scraper () setCurrentURL u = ST.get >>= (\s -> ST.put $ s { currentURL = u }) getCurrentURL :: Scraper(Maybe URL) getCurrentURL = ST.get >>= return . currentURL getCurrentHtml :: Scraper(LBS.ByteString) getCurrentHtml = ST.get >>= return . currentHtml -- TODO: Move somewhere else??? -- getCurrentPage :: Shpider Page getCurrentCursor :: Scraper (Maybe Cursor) getCurrentCursor = do scraper <- ST.get return $ currentCursor scraper -- TODO: Move somewhere else??? getCurrentSession :: Scraper (Session) getCurrentSession = do scraper <- ST.get return $ currentSession scraper -- TODO: Move somewhere else??? setCurrentSession :: Session -> Scraper () setCurrentSession s = do scraper <- ST.get ST.put $ scraper { currentSession = s} -- TODO: Move somewhere else??? setCurrentCursor :: Cursor -> Scraper ( ) setCurrentCursor c = do scraper <- ST.get ST.put $ scraper { currentCursor = Just c } -- TODO: Move somewhere else??? setCurrentHtml :: LBS.ByteString -> Scraper () setCurrentHtml html = do scraper <- ST.get ST.put $ scraper { currentHtml = html } -- TODO: Move somewhere else??? formShortInfo' f = formInfo' where go Nothing = "N/A" go (Just x) = x formInfo = (headMay . attribute "name" $ f, headMay . attribute "action" $ f) formInfo' = (\(x,y) -> (go x, go y)) formInfo -- TODO: Move somewhere else... ppTuple :: (T.Text, T.Text) -> T.Text ppTuple = \(x,y) -> "[" <> x <> "]" <> ": " <> y -- TODO: Move somewhere else... printFormNames :: Scraper () printFormNames = do c <- getCurrentCursor let c' = fromMaybe (error "No cursor set") c forms = c' $// element "form" formInfo = map (ppTuple . formShortInfo') forms liftIO $ mapM_ (TIO.putStrLn) formInfo -- TODO: Move somewhere else??? get :: String -> Scraper (LBS.ByteString) get urlStr = do let url = fromMaybe (error ("invalid urlStr: " ++ urlStr)) (importURL urlStr) urlStr' = exportURL url -- TODO: Display under debug mode -- liftIO . putStrLn $ "GET: " ++ url ++ "\n" opts <- ST.gets currentOptions sesh <- ST.gets currentSession r <- liftIO $ Sesh.getWith opts sesh urlStr' let html = r ^. Wreq.responseBody setCurrentURL (Just url) setCurrentHtml html setCurrentCursor (toCursor html) return html -- TODO: Move somewhere else??? post :: Postable a => String -> a -> Scraper (LBS.ByteString) post urlStr params = do opts <- ST.gets currentOptions sesh <- ST.gets currentSession -- TODO: should take an actual url.. makes api more difficult... -- TODO: Make url absolute by storing host of current site -- TODO: Display under debug mode -- liftIO . print $ params -- TODO: Make minimal repro and ask question... not sure how to print something so polymorphic -- liftIO . putStrLn $ "POST: " ++ url ++ "\n" let url = fromMaybe (error ("invalid urlStr: " ++ urlStr)) (importURL urlStr) absURL <- toAbsUrl url let url' = exportURL absURL r <- liftIO $ Sesh.postWith opts sesh url' params let html = r ^. Wreq.responseBody setCurrentHtml html setCurrentCursor (toCursor html) return html toAbsUrl :: URL -> Scraper(URL) toAbsUrl u@(URL (Absolute _) _ _) = return u toAbsUrl u@(URL _ _ _) = do hostUrl <- getCurrentURL let hostUrl' = fromMaybe (error errMsg) hostUrl absUrl = u { url_type = url_type hostUrl' } return absUrl where errMsg = "You must 'get' or 'post' to something before making urls absolute" -- toAbsUrl u@(URL _ _ _) = return u -- TODO: Move to tests testToAbsUrl :: Scraper() testToAbsUrl = do setCurrentURL (importURL "http://www.google.com") aUrl <- toAbsUrl (fromJust . importURL $ "blah.php") liftIO . print . exportURL $ aUrl hasDisplayNone el = fromMaybe False . fmap (== "display: none;") . headMay $ (attribute "style" el) hasHide el = fromMaybe False . fmap (T.isInfixOf "hide") . headMay $ (attribute "class" el) anyParentIsHidden el = isJust . listToMaybe . join $ map (\c -> (c $/ check (hasHide))) cs where cs = ancestor el -- checks to see if an eleemnt is displayed -- isDisplayed el = any (== True) $ [displayNone el, hide el] isDisplayed :: Cursor -> Bool isDisplayed el = all (== False) $ [ hasDisplayNone el , hasHide el , anyParentIsHidden el -- this was hiding inputs on the paypal billing form ] -- TODO: Move somewhere else??? -- let aForm = toCursor "
" getVisibleInputs :: Cursor -> M.Map T.Text T.Text getVisibleInputs c = do let inputs' = filter isDisplayed inputs mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs' pairs = map (fromMaybe "" *** fromMaybe "") mayPairs M.fromList $ filter ((/= "") . fst) pairs where inputs = c $// element "input" data InpFilter a = Custom ([T.Text]) | AllVisible | AllInps deriving (Show) getInputs :: InpFilter a -> Cursor -> M.Map T.Text T.Text getInputs (Custom paramFilterList) c = do let mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs pairs = map (fromMaybe "" *** fromMaybe "") mayPairs m = M.fromList $ filter ((/= "") . fst) pairs -- filter out keys user didn't want M.filterWithKey (\k _ -> not . any (== k) $ paramFilterList) m where inputs = c $// element "input" getInputs AllVisible c = getVisibleInputs c getInputs AllInps c = getAllInputs c getAllInputs :: Cursor -> M.Map T.Text T.Text getAllInputs c = do let mayPairs = map (\e -> (listToMaybe $ attribute "name" e, listToMaybe $ attribute "value" e)) inputs pairs = map (fromMaybe "" *** fromMaybe "") mayPairs M.fromList $ filter ((/= "") . fst) pairs where inputs = c $// element "input" -- test get inputs -- todo: Move to tests tgi = do LBS.readFile "mismatchedinputkeyvalsform.html" >>= return . getAllInputs . toCursor -- TODO: Move somewhere else??? getLoginForm url = get url >>= return . getAllInputs . toCursor -- TODO: Move somewhere else??? toWreqFormParams :: [(T.Text, T.Text)] -> [FormParam] toWreqFormParams params = map (\(k,v) -> k := v) (map (encodeUtf8 *** encodeUtf8) params) -- TODO: Move somewhere else??? linkWithText :: T.Text -> Cursor -> Maybe Cursor linkWithText t cursor = listToMaybe $ filter (\c -> (any (T.isInfixOf t)) (c $/ content)) (cursor $// element "a") -- TODO: Move somewhere else??? addToMap pairs m = foldl (\m ->(\(k,v) -> M.insert k v m)) m pairs -- TODO: Move somewhere else??? getFormByName :: T.Text -> Scraper (Maybe Cursor) getFormByName name = do c <- getCurrentCursor let c' = fromMaybe (error "Nocursor set") c formList = c' $// element "form" >=> attributeIs "name" name return . listToMaybe $ formList -- TODO: Move somewhere else??? data FormAttr = Name T.Text | ActionUrl T.Text | FormId T.Text deriving Show -- TODO: Move somewhere else??? getFormBy :: FormAttr -> Scraper (Maybe Cursor) getFormBy formAttr = do c <- getCurrentCursor let c' = fromMaybe (error "Nocursor set") c return . listToMaybe $ formList formAttr c' where formList (Name val) c = c $// element "form" >=> attributeIs "name" val formList (ActionUrl val) c = c $// element "form" >=> attributeIs "action" val formList (FormId val) c = c $// element "form" >=> attributeIs "id" val fillForm :: Maybe Cursor -> Maybe [(T.Text, T.Text)] -> InpFilter a -> [FormParam] fillForm form Nothing paramFilterList = do -- liftIO . putStrLn $ "old inputs: " ++ show (map (attribute "class") $ filter (not . isDisplayed) form) let formParams = fromMaybe (error "no params in form") (getInputs paramFilterList <$> form) toWreqFormParams . M.toList $ formParams fillForm form (Just params) paramFilterList = do -- putStrLn $ "filling form: " ++ show form let formParams = fromMaybe (error "no params in form") (getInputs paramFilterList <$> form) formParams' = addToMap params formParams toWreqFormParams . M.toList $ formParams' -- TODO: Move somewhere else??? -- Takes a form name, fields to fill out in the form, then submits the form -- TODO: Change all[ (T.Text,T.Text)] to just be wreq formvalues... neater api anyway postToForm :: FormAttr -> Maybe [(T.Text,T.Text)] -> InpFilter a -> Scraper (LBS.ByteString) postToForm formAttr params paramFilterList = do form <- getFormBy formAttr c <- getCurrentCursor case form of Just _ -> do -- liftIO $ putStrLn $ "Found form: " ++ show formAttr return () Nothing -> do -- liftIO $ do -- putStrLn "forms found: " -- mapM_ TIO.putStrLn $ join $ (map (attribute "name") $ (fromJust c) $// element "form") error ("Couldn't find form: " ++ show formAttr) let formParams = fillForm form params paramFilterList -- todo, this is duplicated above... delete one or the other if possible mActionUrl = T.strip <$> (join $ listToMaybe <$> attribute "action" <$> form) actionUrl = fromMaybe (error "Couldn't find action url in form") mActionUrl -- liftIO $ do -- TIO.putStrLn $ "POST " <> actionUrl -- print formParams -- getCurrentHtml >>= liftIO . LBS.writeFile "last.html" html <- post (T.unpack actionUrl) formParams return html