module Text.HTML.CrawlChain.HtmlFiltering ( extractTagsContent, findAttributes, extractLinks, extractLinksMatching, extractLinksWithAttributes, extractLinksFilteringUrlAttrs, extractLinksFilteringAll, unevaluated, findAllUrlsEndingWith, findFirstLinkAfter, extractFirstForm, Method(..), noUrlFilter, AttrFilter, noAttrFilter, ContainedTextFilter, noTextFilter ) where import Data.Char (toLower) import Data.List (isSuffixOf, isPrefixOf, isInfixOf, nub, sort) import Data.List.Split (splitOneOf) import Data.Maybe (fromMaybe) import Text.HTML.TagSoup import Network.CrawlChain.CrawlAction type TagS = Tag String type AttrFilter = [(String, String)] -> Bool type ContainedTextFilter = [String] -> Bool {-| (name, content, attributes)-} type TagContent = (String, String, [(String, String)]) {-| Content of the tags up to the first child tag (as simplification) - with attributes. -} extractTagsContent :: String -> [TagContent] extractTagsContent = extractContentFromTagStream ("", "", []) . canonicalizeTags . parseTags where extractContentFromTagStream :: TagContent -> [TagS] -> [(String, String, [(String, String)])] extractContentFromTagStream tagContent@(name, _, _) [] | null name = [] | otherwise = [tagContent] extractContentFromTagStream tagContent@(name,content,attributes) (t:ts) = handle t where handle (TagText c) = extractContentFromTagStream (name, content ++ c, attributes) ts handle (TagOpen n as) = (if null name then [] else [tagContent]) ++ extractContentFromTagStream (n, "", as) ts handle _ = extractContentFromTagStream tagContent ts findAttributes :: String -> [TagContent] -> [String] findAttributes name = foldr ((++) . findAttr') [] where findAttr' :: TagContent -> [String] findAttr' (_, _, as) = map snd $ filter (\(n,_) -> n==name) as noUrlFilter :: String -> Bool noUrlFilter = unevaluated noAttrFilter :: AttrFilter noAttrFilter = unevaluated noTextFilter :: ContainedTextFilter noTextFilter = unevaluated unevaluated :: a -> Bool unevaluated _ = True extractLinks :: String -> [CrawlAction] extractLinks = extractLinksFilteringUrlAttrs unevaluated unevaluated extractLinksMatching :: (String -> Bool) -> String -> [CrawlAction] extractLinksMatching = flip extractLinksFilteringUrlAttrs unevaluated extractLinksWithAttributes :: AttrFilter -> String -> [CrawlAction] extractLinksWithAttributes = extractLinksFilteringUrlAttrs unevaluated extractLinksFilteringUrlAttrs :: (String -> Bool) -> AttrFilter -> String -> [CrawlAction] extractLinksFilteringUrlAttrs urlFilter attrFilter = extractLinksFilteringAll urlFilter attrFilter noTextFilter extractLinksFilteringAll :: (String -> Bool) -> AttrFilter -> ContainedTextFilter -> String -> [CrawlAction] extractLinksFilteringAll urlFilter attrFilter containsTextFilter = extractLinksFiltering combinedFilter where combinedFilter ts = (urlFilter $ getSrc linkTag) && (attrFilter $ getTagAttrs linkTag) && (containsTextFilter $ getTexts ts) where linkTag = if null ts then error "empty link tag group" else head ts getTexts :: [TagS] -> [String] getTexts = filter (not . null) . map (maybe "" id . maybeTagText) extractLinksFiltering :: ([TagS] -> Bool) -> String -> [CrawlAction] extractLinksFiltering linkFilter = map GetRequest . nub . filter (not . null) . map getSrc . map head . filter linkFilter . filter (not . null) . filterAndGroupLinks . canonicalizeTags . parseTags where filterAndGroupLinks :: [TagS] -> [[TagS]] filterAndGroupLinks = map cleanupLinkGroup . splitWhen' (\t -> isTagOpenName "a" t || isTagOpenName "iframe" t) where splitWhen' :: (a -> Bool) -> [a] -> [[a]] splitWhen' f = splitWhen'' [] where splitWhen'' col [] = [col] splitWhen'' col (x:rest) = if f x then col:(splitWhen'' [x] rest) else splitWhen'' (col++[x]) rest cleanupLinkGroup :: [TagS] -> [TagS] cleanupLinkGroup = takeWhile notEndSrcTag . dropWhile notStartSrcTag where notStartSrcTag t = not $ any (flip isTagOpenName t) supportedTags -- (not . (isTagOpenName "a")) notEndSrcTag t = not $ any (flip isTagCloseName t) supportedTags supportedTags = ["a", "iframe"] findFirstLinkAfter :: String -> [(String, String)] -> String -> [CrawlAction] findFirstLinkAfter tagName tagAttrs = take 1 . map GetRequest . map getSrc . filter isLink . dropWhile (not . isMarker) . canonicalizeTags . parseTags where isMarker (TagOpen tagName' as) = tagName' == tagName && sort tagAttrs == sort as isMarker _ = False isLink (TagOpen "a" as) = maybe False (not . null) $ lookup "href" as isLink _ = False getSrc :: Tag String -> String getSrc (TagOpen _ attributes) = fromMaybe (fromMaybe "" (lookup "src" attributes)) (lookup "href" attributes) getSrc _ = [] getTagAttrs :: Tag String -> [(String, String)] getTagAttrs (TagOpen _ as) = as getTagAttrs _ = [] findAllUrlsEndingWith :: String -> String -> [CrawlAction] findAllUrlsEndingWith endMarker = map GetRequest . filter (endMarker `isSuffixOf`) . filter ("http" `isPrefixOf`) . splitOneOfRetainingNonEmpty " \t\r\n\"\'" data Method = POST | GET extractFirstForm :: [String] -> String -> Maybe Method -> String -> String -> Maybe CrawlAction extractFirstForm extraParams previousUrl method formName content = (buildAction . extractForm method formName [] . parseTags) content where initialParams = findExtraParams extraParams content buildAction :: Maybe [Tag String] -> Maybe CrawlAction buildAction = maybe Nothing buildAction' where buildAction' :: [Tag String] -> Maybe CrawlAction buildAction' tags | null tags = Nothing | not (isFormStart (head tags)) = error $ show tags | otherwise = maybe Nothing (buildAction'' (tail tags) initialParams) determineUrl where determineUrl :: Maybe String determineUrl = lookup "action" (tagAttributes (head tags)) >>= (\u -> return (if null u then previousUrl else u)) buildAction'' :: [Tag String] -> [(String, String)] -> String -> Maybe CrawlAction buildAction'' [] params url = Just $ PostRequest url (reverse params) PostForm buildAction'' (t:tags') params url = buildAction'' tags' addToParams url where addToParams :: [(String, String)] addToParams = maybe params (:params) (extractFormParam t) isFormStart :: Tag String -> Bool isFormStart (TagOpen tagName _) = isFormTag tagName isFormStart _ = False isFormClose :: Tag String -> Bool isFormClose (TagClose tagName) = isFormTag tagName isFormClose _ = False isFormTag :: String -> Bool isFormTag = (=="form") . map toLower isFormStartOf :: Maybe Method -> String -> Tag String -> Bool isFormStartOf method formName t = isFormStart t && (null formName || formNameMatches) && maybe True methodMatches method where formNameMatches = maybe False (==formName) (getAttributeValue "name") methodMatches m = maybe False (methodMatches' m) (getAttributeValue "method") where methodMatches' POST = (=="POST") methodMatches' GET = (=="GET") getAttributeValue :: String -> Maybe String getAttributeValue = flip lookup (tagAttributes t) tagAttributes :: Tag String -> [(String, String)] tagAttributes (TagOpen _ as) = as tagAttributes _ = [] extractForm :: Maybe Method -> String -> [Tag String] -> [Tag String] -> Maybe [Tag String] extractForm _ _ _ [] = Nothing -- no complete form extractForm m n [] (t:tags) | isFormStartOf m n t = extractForm m n [t] tags -- start form element collecting | otherwise = extractForm m n [] tags -- ... or skip until it starts extractForm m n collected (t:tags) | isFormClose t = Just (reverse collected) -- first form close, return form (nested forms unsupported) | otherwise = extractForm m n (t:collected) tags -- continue collecting form elements extractFormParam :: Tag String -> Maybe (String, String) extractFormParam (TagOpen "input" as) = maybe Nothing (Just . findValueFor) (lookup "name" as) where findValueFor :: String -> (String, String) findValueFor key = (key, fromMaybe [] (lookup "value" as)) extractFormParam _ = Nothing findExtraParams :: [String] -> String -> [(String, String)] findExtraParams keys = if null keys then \_ -> [] else findKeyValues where findKeyValues :: String -> [(String, String)] findKeyValues = extractValues . filter containsKey . lines where containsKey :: String -> Bool containsKey line = any (`isInfixOf` line) keys extractValues :: [String] -> [(String, String)] extractValues = foldr addIfIsValue [] addIfIsValue :: String -> [(String, String)] -> [(String, String)] addIfIsValue line = (values ++) where splittedLine = splitOneOfRetainingNonEmpty " ,':" line values = foldr extractKey [] keys extractKey :: String -> [(String, String)] -> [(String, String)] extractKey k = (++ (findValue splittedLine)) where findValue :: [String] -> [(String, String)] findValue (n':k':v':v'':rest) = if n'=="name" && k'==k && v'=="value" && not (null v'') then [(k, v'')] else findValue (k':v':v'':rest) findValue _ = [] splitOneOfRetainingNonEmpty :: [Char] -> String -> [String] splitOneOfRetainingNonEmpty splitters = filter (not . null) . splitOneOf splitters