module Network.CrawlChain.CrawlChains (
executeCrawlChain,
CrawlDirective(..),
DirectiveChainResult, lastResult, resultHistory, showResultPath,
(>>+),
combineAbsoluteUrl)
where
import Data.List (intersperse)
import Data.List.Split (splitOn)
import System.IO.Unsafe as Unsafe (unsafeInterleaveIO)
import Network.CrawlChain.CrawlAction
import Network.CrawlChain.CrawlResult
import Network.CrawlChain.CrawlingContext
import Network.CrawlChain.CrawlDirective
import Network.CrawlChain.DirectiveChainResult
import Network.CrawlChain.Report
import Network.CrawlChain.Util
executeCrawlChain :: CrawlingContext a => a -> CrawlAction -> CrawlDirective -> IO [DirectiveChainResult]
executeCrawlChain = followDirective [] []
followDirective :: CrawlingContext a =>
[DirectiveChainResult] -> [Report] -> a -> CrawlAction -> CrawlDirective
-> IO [DirectiveChainResult]
followDirective collectedResults reportPath context crawlAction = followDirective'
where
followDirective' :: CrawlDirective -> IO [DirectiveChainResult]
followDirective' (DirectiveSequence sequence') =
followDirectiveSequence collectedResults reportPath context crawlAction sequence'
followDirective' (SimpleDirective logic) = crawlAndSearch (logic . crawlingContent)
followDirective' (RelativeDirective logic) = crawlAndSearch (makeAbsoluteLogicMapper logic)
followDirective' (FollowUpDirective logic) = crawlAndSearch logic
followDirective' (DelayDirective sec d) = delaySeconds sec >> followDirective' d
followDirective' (RetryDirective num d) = do
results <- followDirective' d
if num > 0 && all (null . lastResult) results
then do
putStrLn $ "retrying "++(show num)++" more times"
followDirective' $ RetryDirective (num1) d
else return results
followDirective' (AlternativeDirective a1 a2) = do
a1Results <- followDirective' a1
if all (null . lastResult) a1Results
then followDirective' a2
else return a1Results
followDirective' (RestartChainDirective restart) =
uncurry (followDirective collectedResults reportPath context) restart
followDirective' (GuardDirective guard) =
if guard crawlAction
then putStrLn (" guard: accepting "++(crawlUrl crawlAction)) >> return [DirectiveChainResult reportPath [crawlAction]]
else putStrLn (" guard: rejecting "++(crawlUrl crawlAction)) >> return []
crawlAndSearch :: (CrawlResult -> [CrawlAction]) -> IO [DirectiveChainResult]
crawlAndSearch searchLogic = crawler context crawlAction >>= processCrawlingResult
where
processCrawlingResult crawlingResult =
return searchCrawlingResult >>= logSearchResults >>= return . wrapActions >>= appendResult where
searchCrawlingResult :: [CrawlAction]
searchCrawlingResult = if crawlWasNoSuccess crawlingResult then [] else searchLogic crawlingResult
logSearchResults res =
putStrLn (" found " ++ (show $ length res) ++" follow-up actions:" ++ (show $ map crawlUrl $ res))
>> return res
wrapActions :: [CrawlAction] -> DirectiveChainResult
wrapActions res
| null res = DirectiveChainResult (reportPath ++ [errReport crawlingResult]) []
| otherwise = DirectiveChainResult updateReportPath res
where
updateReportPath :: [Report]
updateReportPath = okReport crawlingResult : reportPath
appendResult :: DirectiveChainResult -> IO [DirectiveChainResult]
appendResult res = return $ collectedResults ++ [res]
followDirectiveSequence :: CrawlingContext a => [DirectiveChainResult] -> [Report] -> a -> CrawlAction -> [CrawlDirective] -> IO [DirectiveChainResult]
followDirectiveSequence collectedResults reportPath context crawlingAction' = followDirectiveSequence'
where
followDirectiveSequence' :: [CrawlDirective] -> IO [DirectiveChainResult]
followDirectiveSequence' [] = return [DirectiveChainResult (Report "unsupported: empty sequence" "":reportPath) []]
followDirectiveSequence' (single:[]) = followDirective collectedResults reportPath context crawlingAction' single
followDirectiveSequence' (nextDirective:remainingDirectives) = chainDirectives
where
nextStepActions :: IO [DirectiveChainResult]
nextStepActions = followDirective [] reportPath context crawlingAction' nextDirective
remainingActions :: [DirectiveChainResult] -> [Report] -> CrawlAction -> IO [DirectiveChainResult]
remainingActions nextStepResults nextStepReportPath nextStepCrawlAction =
followDirectiveSequence nextStepResults nextStepReportPath context nextStepCrawlAction remainingDirectives
chainDirectives :: IO [DirectiveChainResult]
chainDirectives = do
nextStepResults <- nextStepActions
followNextSteps nextStepResults
where
followNextSteps :: [DirectiveChainResult] -> IO [DirectiveChainResult]
followNextSteps nextStepResults = nextStepsResults
where
nextStepsResults :: IO [DirectiveChainResult]
nextStepsResults = wrapResults $ nextStepsResults' allInputActionsForFollowingSteps
where
nextStepsResults' :: [CrawlAction] -> [IO [DirectiveChainResult]]
nextStepsResults' = map (remainingActions [] reportPath)
allInputActionsForFollowingSteps :: [CrawlAction]
allInputActionsForFollowingSteps = concat $ map lastResult nextStepResults
wrapResults :: [IO [a]] -> IO [a]
wrapResults = lazyIOsequence
lazyIOsequence :: [IO [a]] -> IO [a]
lazyIOsequence (mx:mxs) = do
x <- mx
xs <- Unsafe.unsafeInterleaveIO (Prelude.sequence mxs)
return $ concat (x : xs)
lazyIOsequence [] = return []
errReport :: CrawlResult -> Report
errReport crawlingResult = report "crawling unsuccessful: " crawlingResult
okReport :: CrawlResult -> Report
okReport crawlingResult = report "ok: " crawlingResult
report :: String -> CrawlResult -> Report
report prefix crawlingResult = Report (prefix ++ (show $ crawlingAction crawlingResult)) (crawlingContent crawlingResult)
crawlWasNoSuccess :: CrawlResult -> Bool
crawlWasNoSuccess = (/= CrawlingOk) . crawlingResultStatus
(>>+) :: (CrawlAction, CrawlDirective) -> CrawlDirective -> (CrawlAction, CrawlDirective)
(>>+) (initialAction, first) second = (initialAction, DirectiveSequence [first, second])
makeAbsoluteLogicMapper :: (String -> [CrawlAction]) -> CrawlResult -> [CrawlAction]
makeAbsoluteLogicMapper logic crawlResult = combineAbsoluteUrls (crawlingAction crawlResult) $ logic $ crawlingContent crawlResult
combineAbsoluteUrls :: CrawlAction -> [CrawlAction] -> [CrawlAction]
combineAbsoluteUrls previousAction = map $ combineAbsoluteUrl previousAction
combineAbsoluteUrl :: CrawlAction -> CrawlAction -> CrawlAction
combineAbsoluteUrl previousAction = addUrlPrefix baseUrl
where
baseUrl = ((++"/") . concat . intersperse "/" . dropLast . splitOn "/") (crawlUrl previousAction)
dropLast es = take (length es 1) es