module Text.HTML.CrawlChain.HtmlFiltering (
extractTagsContent, findAttributes,
extractLinks, extractLinksMatching, extractLinksWithAttributes, extractLinksFilteringUrlAttrs, extractLinksFilteringAll, unevaluated,
findAllUrlsEndingWith, findFirstLinkAfter,
extractFirstForm,
Method(..),
noUrlFilter, AttrFilter, noAttrFilter, ContainedTextFilter, noTextFilter
) where
import Data.Char (toLower)
import Data.List (isSuffixOf, isPrefixOf, isInfixOf, nub, sort)
import Data.List.Split (splitOneOf)
import Data.Maybe (fromMaybe)
import Text.HTML.TagSoup
import Network.CrawlChain.CrawlAction
type TagS = Tag String
type AttrFilter = [(String, String)] -> Bool
type ContainedTextFilter = [String] -> Bool
type TagContent = (String, String, [(String, String)])
extractTagsContent :: String -> [TagContent]
extractTagsContent =
extractContentFromTagStream ("", "", []) .
canonicalizeTags .
parseTags
where
extractContentFromTagStream :: TagContent -> [TagS] -> [(String, String, [(String, String)])]
extractContentFromTagStream tagContent@(name, _, _) []
| null name = []
| otherwise = [tagContent]
extractContentFromTagStream tagContent@(name,content,attributes) (t:ts) = handle t where
handle (TagText c) = extractContentFromTagStream (name, content ++ c, attributes) ts
handle (TagOpen n as) = (if null name then [] else [tagContent]) ++ extractContentFromTagStream (n, "", as) ts
handle _ = extractContentFromTagStream tagContent ts
findAttributes :: String -> [TagContent] -> [String]
findAttributes name = foldr ((++) . findAttr') []
where
findAttr' :: TagContent -> [String]
findAttr' (_, _, as) = map snd $ filter (\(n,_) -> n==name) as
noUrlFilter :: String -> Bool
noUrlFilter = unevaluated
noAttrFilter :: AttrFilter
noAttrFilter = unevaluated
noTextFilter :: ContainedTextFilter
noTextFilter = unevaluated
unevaluated :: a -> Bool
unevaluated _ = True
extractLinks :: String -> [CrawlAction]
extractLinks = extractLinksFilteringUrlAttrs unevaluated unevaluated
extractLinksMatching :: (String -> Bool) -> String -> [CrawlAction]
extractLinksMatching = flip extractLinksFilteringUrlAttrs unevaluated
extractLinksWithAttributes :: AttrFilter -> String -> [CrawlAction]
extractLinksWithAttributes = extractLinksFilteringUrlAttrs unevaluated
extractLinksFilteringUrlAttrs :: (String -> Bool) -> AttrFilter -> String -> [CrawlAction]
extractLinksFilteringUrlAttrs urlFilter attrFilter = extractLinksFilteringAll urlFilter attrFilter noTextFilter
extractLinksFilteringAll :: (String -> Bool) -> AttrFilter -> ContainedTextFilter -> String -> [CrawlAction]
extractLinksFilteringAll urlFilter attrFilter containsTextFilter = extractLinksFiltering combinedFilter
where
combinedFilter ts =
(urlFilter $ getSrc linkTag)
&& (attrFilter $ getTagAttrs linkTag)
&& (containsTextFilter $ getTexts ts)
where
linkTag = if null ts then error "empty link tag group" else head ts
getTexts :: [TagS] -> [String]
getTexts = filter (not . null) . map (maybe "" id . maybeTagText)
extractLinksFiltering :: ([TagS] -> Bool) -> String -> [CrawlAction]
extractLinksFiltering linkFilter =
map GetRequest .
nub .
filter (not . null) .
map getSrc .
map head . filter linkFilter . filter (not . null) .
filterAndGroupLinks .
canonicalizeTags .
parseTags
where
filterAndGroupLinks :: [TagS] -> [[TagS]]
filterAndGroupLinks =
map cleanupLinkGroup . splitWhen' (\t -> isTagOpenName "a" t || isTagOpenName "iframe" t)
where
splitWhen' :: (a -> Bool) -> [a] -> [[a]]
splitWhen' f = splitWhen'' []
where
splitWhen'' col [] = [col]
splitWhen'' col (x:rest) = if f x then col:(splitWhen'' [x] rest) else splitWhen'' (col++[x]) rest
cleanupLinkGroup :: [TagS] -> [TagS]
cleanupLinkGroup = takeWhile notEndSrcTag . dropWhile notStartSrcTag where
notStartSrcTag t = not $ any (flip isTagOpenName t) supportedTags
notEndSrcTag t = not $ any (flip isTagCloseName t) supportedTags
supportedTags = ["a", "iframe"]
findFirstLinkAfter :: String -> [(String, String)] -> String -> [CrawlAction]
findFirstLinkAfter tagName tagAttrs =
take 1 .
map GetRequest .
map getSrc .
filter isLink .
dropWhile (not . isMarker) .
canonicalizeTags .
parseTags
where
isMarker (TagOpen tagName' as) = tagName' == tagName && sort tagAttrs == sort as
isMarker _ = False
isLink (TagOpen "a" as) = maybe False (not . null) $ lookup "href" as
isLink _ = False
getSrc :: Tag String -> String
getSrc (TagOpen _ attributes) = fromMaybe (fromMaybe "" (lookup "src" attributes)) (lookup "href" attributes)
getSrc _ = []
getTagAttrs :: Tag String -> [(String, String)]
getTagAttrs (TagOpen _ as) = as
getTagAttrs _ = []
findAllUrlsEndingWith :: String -> String -> [CrawlAction]
findAllUrlsEndingWith endMarker =
map GetRequest . filter (endMarker `isSuffixOf`) . filter ("http" `isPrefixOf`) . splitOneOfRetainingNonEmpty " \t\r\n\"\'"
data Method = POST | GET
extractFirstForm :: [String] -> String -> Maybe Method -> String -> String -> Maybe CrawlAction
extractFirstForm extraParams previousUrl method formName content = (buildAction . extractForm method formName [] . parseTags) content
where
initialParams = findExtraParams extraParams content
buildAction :: Maybe [Tag String] -> Maybe CrawlAction
buildAction = maybe Nothing buildAction'
where
buildAction' :: [Tag String] -> Maybe CrawlAction
buildAction' tags
| null tags = Nothing
| not (isFormStart (head tags)) = error $ show tags
| otherwise = maybe Nothing (buildAction'' (tail tags) initialParams) determineUrl
where
determineUrl :: Maybe String
determineUrl = lookup "action" (tagAttributes (head tags)) >>= (\u -> return (if null u then previousUrl else u))
buildAction'' :: [Tag String] -> [(String, String)] -> String -> Maybe CrawlAction
buildAction'' [] params url = Just $ PostRequest url (reverse params) PostForm
buildAction'' (t:tags') params url = buildAction'' tags' addToParams url
where
addToParams :: [(String, String)]
addToParams = maybe params (:params) (extractFormParam t)
isFormStart :: Tag String -> Bool
isFormStart (TagOpen tagName _) = isFormTag tagName
isFormStart _ = False
isFormClose :: Tag String -> Bool
isFormClose (TagClose tagName) = isFormTag tagName
isFormClose _ = False
isFormTag :: String -> Bool
isFormTag = (=="form") . map toLower
isFormStartOf :: Maybe Method -> String -> Tag String -> Bool
isFormStartOf method formName t = isFormStart t
&& (null formName || formNameMatches)
&& maybe True methodMatches method
where
formNameMatches = maybe False (==formName) (getAttributeValue "name")
methodMatches m = maybe False (methodMatches' m) (getAttributeValue "method")
where
methodMatches' POST = (=="POST")
methodMatches' GET = (=="GET")
getAttributeValue :: String -> Maybe String
getAttributeValue = flip lookup (tagAttributes t)
tagAttributes :: Tag String -> [(String, String)]
tagAttributes (TagOpen _ as) = as
tagAttributes _ = []
extractForm :: Maybe Method -> String -> [Tag String] -> [Tag String] -> Maybe [Tag String]
extractForm _ _ _ [] = Nothing
extractForm m n [] (t:tags)
| isFormStartOf m n t = extractForm m n [t] tags
| otherwise = extractForm m n [] tags
extractForm m n collected (t:tags)
| isFormClose t = Just (reverse collected)
| otherwise = extractForm m n (t:collected) tags
extractFormParam :: Tag String -> Maybe (String, String)
extractFormParam (TagOpen "input" as) = maybe Nothing (Just . findValueFor) (lookup "name" as)
where
findValueFor :: String -> (String, String)
findValueFor key = (key, fromMaybe [] (lookup "value" as))
extractFormParam _ = Nothing
findExtraParams :: [String] -> String -> [(String, String)]
findExtraParams keys = if null keys then \_ -> [] else findKeyValues
where
findKeyValues :: String -> [(String, String)]
findKeyValues = extractValues . filter containsKey . lines
where
containsKey :: String -> Bool
containsKey line = any (`isInfixOf` line) keys
extractValues :: [String] -> [(String, String)]
extractValues = foldr addIfIsValue []
addIfIsValue :: String -> [(String, String)] -> [(String, String)]
addIfIsValue line = (values ++)
where
splittedLine = splitOneOfRetainingNonEmpty " ,':" line
values = foldr extractKey [] keys
extractKey :: String -> [(String, String)] -> [(String, String)]
extractKey k = (++ (findValue splittedLine))
where
findValue :: [String] -> [(String, String)]
findValue (n':k':v':v'':rest) = if n'=="name" && k'==k && v'=="value" && not (null v'')
then [(k, v'')] else findValue (k':v':v'':rest)
findValue _ = []
splitOneOfRetainingNonEmpty :: [Char] -> String -> [String]
splitOneOfRetainingNonEmpty splitters = filter (not . null) . splitOneOf splitters