{-
 -
 - Copyright (c) 2009-2010 Johnny Morrice
 -
 - Permission is hereby granted, free of charge, to any person
 - obtaining a copy of this software and associated documentation 
 - files (the "Software"), to deal in the Software without 
 - restriction, including without limitation the rights to use, copy, 
 - modify, merge, publish, distribute, sublicense, and/or sell copies 
 - of the Software, and to permit persons to whom the Software is 
 - furnished to do so, subject to the following conditions:
 -
 - The above copyright notice and this permission notice shall be 
 - included in all copies or substantial portions of the Software.
 -
 - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 - SOFTWARE.
 -
-}

{-# LANGUAGE ScopedTypeVariables #-}
-- | This module exposes the main functionality of shpider.
-- It allows you to quickly write crawlers, in simple cases without even reading the page source, e.g.
--
-- @
-- `runShpider` $ do
--    `download` \"http:\/\/hackage.haskell.org\/packages\/archive\/pkg-list.html\"
--    l : _ <- `getLinksByText` \"shpider\"
--    `download` $ linkAddress l
-- @
module Network.Shpider 
   ( module Network.Shpider.Code
   , module Network.Shpider.State
   , module Network.Shpider.URL
   , module Network.Shpider.Options
   , module Network.Shpider.Forms
   , module Network.Shpider.Links
   -- * Crawl Functions
   , download
   , sendForm
   -- * Basic Parsing/Decision Support
   , getLinksByText
   , getLinksByTextRegex
   , getLinksByAddressRegex
   , getFormsByAction
   , getFormsHasAction
   , currentLinks
   , currentForms
   -- * Utilities
   , parsePage
   , isAuthorizedDomain
   , withAuthorizedDomain
   , haveVisited
   ) 
   where

import           Control.Concurrent
import qualified Data.Map                as M
import           Data.Maybe
import           Data.Time
import           Network.Curl
import           Network.Shpider.Code
import           Network.Shpider.Forms
import           Network.Shpider.Links
import           Network.Shpider.Options
import           Network.Shpider.State
import           Network.Shpider.URL
import           Text.HTML.TagSoup
import           Text.Regex.Posix
import           Web.Encodings


-- | If `keepTrack` has been set, then `haveVisited` returns `True` if the given URL has been visited.
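--
-- A sketch (the address is a placeholder):
--
-- @
-- seen <- `haveVisited` \"http:\/\/example.com\/\"
-- @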
haveVisited :: String -> Shpider Bool
haveVisited uncleanUrl = do
   murl <- mkAbsoluteUrl uncleanUrl
   maybe ( return False )
         ( \ url -> do
              shpider <- get
              return $ maybe False 
                             ( \ vs ->
                                  elem url vs
                             )
                             ( visited shpider )
         )
         murl

-- | Parse a given URL and source html into the `Page` datatype.
-- This will set the current page.
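--
-- A sketch, for html you have already fetched by some other means
-- (the address and @sourceHtml@ are placeholders):
--
-- @
-- page <- `parsePage` \"http:\/\/example.com\/\" sourceHtml
-- @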
parsePage :: String -> String -> Shpider Page
parsePage paddr html = do
   let ts =
          parseTags html
       ls =
          gatherLinks ts
       fs =
          gatherForms ts
       nPge = emptyPage { addr = paddr }
   -- Setting the current page now lets mkAbsoluteUrl below resolve relative urls against it.
   setCurrentPage nPge
   maybeAbsFormActions <- mapM mkAbsoluteUrl $ map action fs 
   maybeAbsLinkAddrs <- mapM mkAbsoluteUrl $ map linkAddress ls
   let absLinkAddrs = catMaybes maybeAbsLinkAddrs
       absFormActions = catMaybes maybeAbsFormActions
       absFs = zipWith ( \ form a -> form { action = a }) fs absFormActions 
       absLinks = zipWith ( \ laddr l -> l { linkAddress = laddr }) absLinkAddrs ls
       newP =
          nPge { links = absLinks
               , forms = absFs
               , source = html
               , tags = ts
               }
   setCurrentPage newP
   return newP


-------------------------------------------------------------------------------
-- | Run the given download action, first sleeping if the time since the
-- last download is shorter than the configured throttle.
withThrottle :: Shpider a -> Shpider a
withThrottle f = do
  let perform = do
        res <- f
        sh <- get
        now <- liftIO getCurrentTime
        put $ sh { lastDownloadTime = Just now }
        return res
  thOpt <- gets downloadThrottle
  lastD <- gets lastDownloadTime
  case ( thOpt , lastD ) of
    ( Just n , Just ld ) -> do
      th <- liftIO $ shouldThrottle n ld
      maybe perform ( \ x -> liftIO ( threadDelay x ) >> perform ) th
    _ -> perform

-------------------------------------------------------------------------------
-- | Return the number of microseconds we still have to wait before the
-- next download, if any.
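-- e.g. a throttle of 500000 microseconds with 200000 microseconds elapsed
-- since the last download gives @Just 300000@.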
shouldThrottle :: Int -> UTCTime -> IO (Maybe Int)
shouldThrottle n lastTime = do
  now <- getCurrentTime
  let n' = fromIntegral n / 1000000
      diff = diffUTCTime now lastTime
      delta = round . (* 1000000) $ n' - diff
  return $ if delta > 0 then Just delta else Nothing


-- Download a page, respecting the throttle setting.
curlDownload :: String -> Shpider ( ShpiderCode , Page )
curlDownload url = withThrottle $ do
  shpider <- get
  res <- liftIO $ curlGetString url $ curlOpts shpider
  mkRes url res

-- Turn a curl result into a ( ShpiderCode , Page ), parsing the body
-- only when the download succeeded.
mkRes :: String -> ( CurlCode , String ) -> Shpider ( ShpiderCode , Page )
mkRes url ( curlCode , html ) = do
   p <- if curlCode == CurlOK
           then
              parsePage url html
           else
              return emptyPage
   return ( ccToSh curlCode , p )


-- POST the given fields to a url, respecting the throttle setting.
-- curlGetString performs the request with whatever options it is given;
-- CurlPost and CurlPostFields turn it into a POST.
curlDownloadPost :: String -> [ ( String , String ) ] -> Shpider ( ShpiderCode , Page )
curlDownloadPost url fields = withThrottle $ do
   shpider <- get
   res <- liftIO $ curlGetString url $ opts shpider
   mkRes url res
   where
    opts sh =
      [ CurlPostFields ( map toPostField fields )
      , CurlPost True
      ] ++ curlOpts sh


-- Perform a HEAD request, respecting the throttle setting.
curlDownloadHead urlStr = withThrottle $ do
   shpider <- get
   liftIO $ curlHead urlStr $ curlOpts shpider

-- Does this Content-Type header value denote html?
validContentType :: String -> Bool
validContentType ct =
   any ( ct =~ ) htmlContentTypes

-- The content types shpider treats as html.
htmlContentTypes :: [ String ]
htmlContentTypes =
   [ "text/html"
   , "application/xhtml+xml"
   ]

-- | Fetch whatever is at this address, and attempt to parse the content into a Page. 
-- Return the status code with the parsed content.
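--
-- A sketch (the address is a placeholder):
--
-- @
-- ( code , page ) <- `download` \"http:\/\/example.com\/\"
-- @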
download :: String -> Shpider ( ShpiderCode , Page )
download messyUrl = do
   -- maybeWrite re-reads the state at the moment of writing, so that it
   -- doesn't clobber changes made during the download, such as the current page.
   let maybeWrite u = do
          shpider <- get
          maybe ( return ( ) )
                ( \ vs ->
                     put $ shpider { visited = Just $ u : vs }
                )
                ( visited shpider )
   if not $ isMailto messyUrl
      then do
         murl <- mkAbsoluteUrl messyUrl
         maybe ( return ( InvalidURL , emptyPage ) )
               ( \ url -> withAuthorizedDomain url $ do
                    res@( _ , page ) <- downloadAPage url
                    maybeWrite $ addr page
                    return res
               )
               murl
      else do
         -- If it's a mailto link, record it anyway so we don't try it again.
         maybeWrite messyUrl
         return ( UnsupportedProtocol , emptyPage )


downloadAPage :: String -> Shpider ( ShpiderCode , Page )
downloadAPage url = do
   shpider <- get
   if htmlOnlyDownloads shpider && isHttp url
      then do
         -- Check the Content-Type with a HEAD request before committing
         -- to a full download.
         ( _ , headers ) <- curlDownloadHead url
         case lookup "Content-Type" headers of
            Just ct | not ( validContentType ct ) ->
               return ( WrongData , emptyPage )
            _ ->
               curlDownload url
      else
         curlDownload url

-- | withAuthorizedDomain runs the given action if the url is in an authorized domain,
-- otherwise it returns ( `OffSite` , `emptyPage` ).
-- See `isAuthorizedDomain`.
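--
-- A sketch, guarding a `download` behind the domain check (@url@ is a placeholder):
--
-- @
-- `withAuthorizedDomain` url ( `download` url )
-- @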
withAuthorizedDomain :: String -> Shpider ( ShpiderCode , Page ) -> Shpider ( ShpiderCode , Page )
withAuthorizedDomain url f = do
   shpider <- get
   if dontLeaveDomain shpider
      then do
         let d = startPage shpider 
         if isSameDomain d url 
            then
               f
            else
               return ( OffSite , emptyPage )
      else
         f

-- | Send a form to the URL specified in its action attribute.
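--
-- A sketch, filling out the form with `fillOutForm` and `pairs` from
-- "Network.Shpider.Forms" (the address and field are placeholders):
--
-- @
-- f : _ <- `getFormsByAction` \"http:\/\/example.com\/login.php\"
-- `sendForm` $ `fillOutForm` f $ `pairs` $ do
--    \"name\" =: \"shpider\"
-- @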
sendForm :: Form -> Shpider ( ShpiderCode , Page )
sendForm form = do
   mabsAddr <- mkAbsoluteUrl $ action form
   maybe ( return ( InvalidURL , emptyPage ) )
         ( \ absAddr -> withAuthorizedDomain absAddr $
              case method form of
                 GET -> do
                    -- This irrefutable pattern match can't fail, because
                    -- mkAbsoluteUrl has already run the address through importURL.
                    let Just u = importURL absAddr
                        getAddr = exportURL $ foldl add_param u ( M.toList $ inputs form )
                    curlDownload getAddr
                 POST ->
                    curlDownloadPost absAddr $ M.toList $ inputs form
         )
         mabsAddr

-- Render one input as a url-encoded name=value pair for a POST body.
toPostField :: ( String , String ) -> String
toPostField ( name , value ) =
   encodeUrl name ++ "=" ++ encodeUrl value

-- | Return the links on the current page.
currentLinks :: Shpider [ Link ]
currentLinks = do
   p <- getCurrentPage
   return $ links p

-- | Return the forms on the current page.
currentForms :: Shpider [ Form ]
currentForms = do
   p <- getCurrentPage
   return $ forms p

-- | Get all links which match this text.
getLinksByText :: String -> Shpider [ Link ]
getLinksByText t = do
   cls <- currentLinks
   return $ filter ( (==) t . linkText )
                   cls

-- | If `stayOnDomain` has been set, then isAuthorizedDomain returns `True` only when
-- the given URL is on the same domain as the start page.
-- Otherwise it always returns `True`.
isAuthorizedDomain :: String -> Shpider Bool
isAuthorizedDomain url = do
   shpider <- get
   return $ if dontLeaveDomain shpider
               then
                  isSameDomain ( startPage shpider ) url
               else
                  True 

-- | Get all links whose text matches this regex.
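-- e.g. @`getLinksByTextRegex` \"[Ss]hpider\"@ matches links whose text
-- mentions shpider in either case.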
getLinksByTextRegex :: String -> Shpider [ Link ]
getLinksByTextRegex r = do
   cls <- currentLinks
   return $ filter ( flip (=~) r . linkText )
                   cls

-- | Get all forms whose action matches the given URL, after making it absolute.
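-- e.g. @`getFormsByAction` \"http:\/\/example.com\/login.php\"@ (a placeholder address).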
getFormsByAction :: String -> Shpider [ Form ]
getFormsByAction a = do
   murl <- mkAbsoluteUrl a
   maybe ( return [ ] )
         ( \ url -> fmap (filter $ (==) url . action) currentForms )
         murl

-- | Get all forms whose action satisfies the given predicate.
getFormsHasAction :: ( String -> Bool ) -> Shpider [ Form ]
getFormsHasAction f = fmap ( filter $ f . action ) currentForms

-- | Get all links whose address matches this regex.
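-- e.g. @`getLinksByAddressRegex` \"^https?:\"@ matches links whose address
-- begins with http or https.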
getLinksByAddressRegex :: String -> Shpider [ Link ]
getLinksByAddressRegex r = do
   cls <- currentLinks
   return $ filter ( flip (=~) r . linkAddress )
                   cls