{-# LANGUAGE ScopedTypeVariables #-}
-- | This module exposes the main functionality of shpider.
-- It allows you to write crawlers quickly, and for simple cases even without reading the page source, e.g.
--
-- @
--  `runShpider` $ do
--     `download` \"http:\/\/hackage.haskell.org\/packages\/archive\/pkg-list.html\"
--     l : _ <- `getLinksByText` \"shpider\"
--     `download` $ linkAddress l
-- @
module Network.Shpider
   ( module Network.Shpider.Code
   , module Network.Shpider.State
   , module Network.Shpider.URL
   , module Network.Shpider.Options
   , module Network.Shpider.Forms
   , module Network.Shpider.Links
   , download
   , sendForm
   , getLinksByText
   , getLinksByTextRegex
   , getLinksByAddressRegex
   , getFormsByAction
   , currentLinks
   , currentForms
   , parsePage
   , isAuthorizedDomain
   , withAuthorizedDomain
   , haveVisited
   )
   where

import Network.Shpider.Curl.Curl
import Text.HTML.TagSoup
import Text.Regex.Posix
import qualified Data.Map as M
import Data.Maybe

import Network.Shpider.State
import Network.Shpider.URL
import Network.Shpider.Code
import Network.Shpider.Options
import Network.Shpider.Forms
import Network.Shpider.Links

-- | If `keepTrack` has been set, then haveVisited returns `True` if the given URL has been visited.
haveVisited :: String -> Shpider Bool
haveVisited uncleanUrl = do
   murl <- mkAbsoluteUrl uncleanUrl
   maybe ( return False )
         ( \ url -> do
              shpider <- get
              return $ maybe False
                             ( elem url )
                             ( visited shpider )
         )
         murl

-- | Parse a given URL and source html into the `Page` datatype.
-- This will set the current page.
parsePage :: String -> String -> Shpider Page
parsePage paddr html = do
   let ts = parseTags html
       ls = gatherLinks ts
       fs = gatherForms ts
   setCurrentPage $ emptyPage { addr = paddr }
   maybeAbsLinkAddrs <- mapM ( mkAbsoluteUrl . linkAddress ) ls
   let absLinkAddrs = catMaybes maybeAbsLinkAddrs
       -- pair each resolved absolute address with the original link's text
       absLinks = zipWith ( \ laddr l -> Link { linkAddress = laddr
                                              , linkText = linkText l
                                              }
                          )
                          absLinkAddrs
                          ls
       newP = Page { links = absLinks
                   , forms = fs
                   , source = html
                   , tags = ts
                   , addr = paddr
                   }
   setCurrentPage newP
   return newP

-- Perform a GET request and parse the result.
curlDownload url = do
   shpider <- get
   res <- liftIO $ curlGetString url $ curlOpts shpider
   mkRes url res

-- Turn a curl result into a ( ShpiderCode , Page ) pair, parsing the body
-- only if the transfer succeeded.
mkRes url ( curlCode , html ) = do
   p <- if curlCode == CurlOK
           then parsePage url html
           else return emptyPage
   return ( ccToSh curlCode , p )

-- Perform a POST request with the given fields and parse the result.
curlDownloadPost url fields = do
   shpider <- get
   res <- liftIO $ curlGetString url $ CurlPostFields ( map toPostField fields ) : curlOpts shpider
   mkRes url res

-- Perform a HEAD request, returning the status line and headers.
curlDownloadHead urlStr = do
   shpider <- get
   liftIO $ curlHead urlStr $ curlOpts shpider

-- True if the given Content-Type header value matches one of the
-- recognised HTML content types.
validContentType ct = any ( ct =~ ) htmlContentTypes

htmlContentTypes = [ "text/html" , "application/xhtml+xml" ]
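-- A minimal crawling sketch combining `download` and `haveVisited`;
-- crawlFrom is a hypothetical helper, not part of the library, and it
-- assumes visited-page tracking has already been switched on with
-- `keepTrack`. A failed `download` yields `emptyPage`, whose link list is
-- empty, so the recursion also bottoms out on errors.
--
-- @
--  crawlFrom url = do
--     seen <- `haveVisited` url
--     if seen
--        then return ( )
--        else do
--           ( _ , page ) <- `download` url
--           mapM_ ( crawlFrom . linkAddress ) ( links page )
-- @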
-- | Fetch whatever is at this address, and attempt to parse the content into a Page.
-- Returns the status code together with the parsed content.
download :: String -> Shpider ( ShpiderCode , Page )
download messyUrl = do
   -- record a visited URL, re-reading the state so we don't clobber the
   -- current page set by `parsePage` during the download
   let maybeWrite u = do
          shpider <- get
          maybe ( return ( ) )
                ( \ vs -> put $ shpider { visited = Just $ u : vs } )
                ( visited shpider )
   if not $ isMailto messyUrl
      then do
         murl <- mkAbsoluteUrl messyUrl
         maybe ( return ( InvalidURL , emptyPage ) )
               ( \ url -> withAuthorizedDomain url $ do
                    res@( _ , page ) <- downloadAPage url
                    maybeWrite $ addr page
                    return res
               )
               murl
      else do
         -- record mailto addresses too, so we don't try them again
         maybeWrite messyUrl
         return ( UnsupportedProtocol , emptyPage )

-- Download one page, honouring `htmlOnlyDownloads`: for http URLs a HEAD
-- request is made first, and the body is only fetched if the Content-Type
-- looks like HTML.
downloadAPage url = do
   shpider <- get
   if htmlOnlyDownloads shpider && isHttp url
      then do
         ( _ , headers ) <- curlDownloadHead url
         let maybeContentType = lookup "Content-Type" headers
         maybe ( curlDownload url )
               ( \ ct -> if validContentType ct
                            then curlDownload url
                            else return ( WrongData , emptyPage )
               )
               maybeContentType
      else curlDownload url

-- | withAuthorizedDomain runs the given action if the URL is in an
-- authorized domain, and returns `OffSite` otherwise.
-- See `isAuthorizedDomain`.
withAuthorizedDomain :: String -> Shpider ( ShpiderCode , Page ) -> Shpider ( ShpiderCode , Page )
withAuthorizedDomain url f = do
   shpider <- get
   if dontLeaveDomain shpider
      then do
         let d = startPage shpider
         if isSameDomain d url
            then f
            else return ( OffSite , emptyPage )
      else f

-- | Send a form to the URL specified in its action attribute.
sendForm :: Form -> Shpider ( ShpiderCode , Page )
sendForm form = do
   mabsAddr <- mkAbsoluteUrl $ action form
   maybe ( return ( InvalidURL , emptyPage ) )
         ( \ absAddr -> withAuthorizedDomain absAddr $
              case method form of
                 GET -> do
                    -- the pattern match here is irrefutable, because
                    -- mkAbsoluteUrl has already run the address through
                    -- importURL
                    let Just u = importURL absAddr
                        getAddr = exportURL $ foldl add_param u ( M.toList $ inputs form )
                    curlDownload getAddr
                 POST ->
                    curlDownloadPost absAddr $ M.toList $ inputs form
         )
         mabsAddr

-- Render one input as a name=value POST field.
toPostField ( name , value ) =
   name ++ "=" ++ value

-- | Return the links on the current page.
currentLinks :: Shpider [ Link ]
currentLinks = do
   p <- getCurrentPage
   return $ links p

-- | Return the forms on the current page.
currentForms :: Shpider [ Form ]
currentForms = do
   p <- getCurrentPage
   return $ forms p

-- | Get all links whose text equals the given string.
getLinksByText :: String -> Shpider [ Link ]
getLinksByText t = do
   cls <- currentLinks
   return $ filter ( (==) t . linkText ) cls

-- | If `stayOnDomain` has been set to `True`, then isAuthorizedDomain returns
-- `True` if the given URL is on the start domain and `False` otherwise.
-- If `stayOnDomain` has not been set, it always returns `True`.
isAuthorizedDomain :: String -> Shpider Bool
isAuthorizedDomain url = do
   shpider <- get
   return $ if dontLeaveDomain shpider
               then isSameDomain ( startPage shpider ) url
               else True

-- | Get all links whose text matches this regex.
getLinksByTextRegex :: String -> Shpider [ Link ]
getLinksByTextRegex r = do
   cls <- currentLinks
   return $ filter ( ( =~ r ) . linkText ) cls

-- | Get all forms whose action matches the given action.
getFormsByAction :: String -> Shpider [ Form ]
getFormsByAction a = do
   murl <- mkAbsoluteUrl a
   maybe ( return [ ] )
         ( \ url -> do
              fs <- currentForms
              return $ filter ( (==) url . action ) fs
         )
         murl

-- | Get all links whose address matches this regex.
getLinksByAddressRegex :: String -> Shpider [ Link ]
getLinksByAddressRegex r = do
   cls <- currentLinks
   return $ filter ( ( =~ r ) . linkAddress ) cls
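-- A usage sketch combining the form helpers above: fetch a page, pick out a
-- form by its action, fill it in and submit it. The site address and field
-- names are hypothetical; `fillOutForm`, `pairs` and `(=:)` come from
-- Network.Shpider.Forms.
--
-- @
--  `runShpider` $ do
--     `download` \"http:\/\/www.example.com\/\"
--     theForm : _ <- `getFormsByAction` \"http:\/\/www.example.com\/login\"
--     `sendForm` $ fillOutForm theForm $ pairs $ do
--        \"username\" =: \"johnny\"
--        \"password\" =: \"secret\"
-- @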