module Network.Shpider
( module Network.Shpider.Code
, module Network.Shpider.State
, module Network.Shpider.URL
, module Network.Shpider.Options
, module Network.Shpider.Forms
, module Network.Shpider.Links
, download
, sendForm
, getLinksByText
, getLinksByTextRegex
, getLinksByAddressRegex
, getFormsByAction
, currentLinks
, currentForms
, parsePage
, isAuthorizedDomain
, withAuthorizedDomain
, haveVisited
)
where
import Network.Shpider.Curl.Curl
import Text.HTML.TagSoup
import Text.Regex.Posix
import qualified Data.Map as M
import Data.Maybe
import Text.HTML.TagSoup
import Network.Shpider.State
import Network.Shpider.URL
import Network.Shpider.Code
import Network.Shpider.Options
import Network.Shpider.Forms
import Network.Shpider.Links
haveVisited :: String -> Shpider Bool
haveVisited uncleanUrl = do
murl <- mkAbsoluteUrl uncleanUrl
maybe ( return False )
( \ url -> do
shpider <- get
return $ maybe False
( \ vs ->
elem url vs
)
( visited shpider )
)
murl
parsePage :: String -> String -> Shpider Page
parsePage paddr html = do
let ts =
parseTags html
ls =
gatherLinks ts
fs =
gatherForms ts
nPge = emptyPage { addr = paddr }
setCurrentPage nPge
maybeAbsFormActions <- mapM mkAbsoluteUrl $ map action fs
maybeAbsLinkAddrs <- mapM mkAbsoluteUrl $ map linkAddress ls
let absLinkAddrs = catMaybes maybeAbsLinkAddrs
absFormActions = catMaybes maybeAbsFormActions
absFs = zipWith ( \ form a -> form { action = a }) fs absFormActions
absLinks = zipWith ( \ laddr l -> l { linkAddress = laddr }) absLinkAddrs ls
newP =
nPge { links = absLinks
, forms = absFs
, source = html
, tags = ts
, addr = paddr
}
setCurrentPage newP
return newP
curlDownload url = do
shpider <- get
res <- liftIO $ curlGetString url $ curlOpts shpider
r <- mkRes url res
return r
mkRes url ( curlCode , html ) = do
p <- if curlCode == CurlOK
then
parsePage url html
else
return emptyPage
return ( ccToSh curlCode , p )
curlDownloadPost url fields = do
shpider <- get
res <- liftIO $ curlGetString url $ CurlPostFields ( map toPostField fields ) : curlOpts shpider
mkRes url res
curlDownloadHead urlStr = do
shpider <- get
liftIO $ curlHead urlStr $ curlOpts shpider
validContentType ct =
or $ map ( \ htmlct ->
ct =~ htmlct
)
htmlContentTypes
htmlContentTypes =
[ "text/html"
, "application/xhtml+xml"
]
download :: String -> Shpider ( ShpiderCode , Page )
download messyUrl = do
shpider <- get
let maybeWrite u =
maybe ( return ( ) )
( \ vs ->
put $ shpider { visited = Just $ u : vs }
)
( visited shpider )
if not $ isMailto messyUrl
then do
murl <- mkAbsoluteUrl messyUrl
maybe ( return ( InvalidURL , emptyPage ) )
( \ url -> withAuthorizedDomain url $ do
res@( c , page ) <- downloadAPage url
maybeWrite $ addr page
return res
)
murl
else do
maybeWrite messyUrl
return ( UnsupportedProtocol , emptyPage )
downloadAPage url = do
shpider <- get
if htmlOnlyDownloads shpider
then do
if isHttp url
then do
( _ , headers ) <- curlDownloadHead url
let maybeContentType =
lookup "Content-Type" headers
maybe ( curlDownload url )
( \ ct -> do
if validContentType ct
then
curlDownload url
else
return ( WrongData , emptyPage )
)
maybeContentType
else
curlDownload url
else
curlDownload url
withAuthorizedDomain :: String -> Shpider ( ShpiderCode , Page ) -> Shpider ( ShpiderCode , Page )
withAuthorizedDomain url f = do
shpider <- get
if dontLeaveDomain shpider
then do
let d = startPage shpider
if isSameDomain d url
then
f
else
return ( OffSite , emptyPage )
else
f
sendForm :: Form -> Shpider ( ShpiderCode , Page )
sendForm form = do
mabsAddr <- mkAbsoluteUrl $ action form
maybe ( return (InvalidURL , emptyPage ) )
( \ absAddr -> withAuthorizedDomain absAddr $ do
case method form of
GET -> do
let Just u = importURL addr
addr = exportURL $ foldl ( \ a i -> add_param a i
)
u
( M.toList $ inputs form )
curlDownload addr
POST ->
curlDownloadPost absAddr $ M.toList $ inputs form
)
mabsAddr
toPostField ( name , value ) =
name ++ "=" ++ value
currentLinks :: Shpider [ Link ]
currentLinks = do
p <- getCurrentPage
return $ links p
currentForms :: Shpider [ Form ]
currentForms = do
p <- getCurrentPage
return $ forms p
getLinksByText :: String -> Shpider [ Link ]
getLinksByText t = do
cls <- currentLinks
return $ filter ( (==) t . linkText )
cls
isAuthorizedDomain :: String -> Shpider Bool
isAuthorizedDomain url = do
shpider <- get
return $ if dontLeaveDomain shpider
then
isSameDomain ( startPage shpider ) url
else
True
getLinksByTextRegex :: String -> Shpider [ Link ]
getLinksByTextRegex r = do
cls <- currentLinks
return $ filter ( flip (=~) r . linkText )
cls
getFormsByAction :: String -> Shpider [ Form ]
getFormsByAction a = do
murl <- mkAbsoluteUrl a
maybe ( return [ ] )
( \ url -> fmap (filter $ (==) url . action) currentForms )
murl
getLinksByAddressRegex :: String -> Shpider [ Link ]
getLinksByAddressRegex r = do
cls <- currentLinks
return $ filter ( flip (=~) r . linkAddress )
cls