module Holumbus.Crawler.Html
where
import Data.Function.Selector
import Data.List
import Data.Maybe
import Holumbus.Crawler.Types
import Holumbus.Crawler.URIs
import System.FilePath
import Text.XML.HXT.Core hiding ( when
, getState
)
defaultHtmlCrawlerConfig :: AccumulateDocResult a r ->
MergeDocResults r -> CrawlerConfig a r
defaultHtmlCrawlerConfig op op2 = ( setS theSysConfig
( withValidate no
>>>
withParseHTML yes
>>>
withInputEncoding isoLatin1
>>>
withWarnings no
>>>
withIgnoreNoneXmlContents yes
)
>>>
setS thePreRefsFilter this
>>>
setS theProcessRefs getHtmlReferences
$
defaultCrawlerConfig op op2
)
getHtmlReferences :: ArrowXml a => a XmlTree URI
getHtmlReferences = fromLA (getRefs $< computeDocBase)
where
getRefs base = deep (hasNameWith
( (`elem` ["a","frame","iframe"]) . localPart )
)
>>>
( getAttrValue0 "href"
<+>
getAttrValue0 "src"
)
>>^ toAbsRef base
getDocReferences :: ArrowXml a => a XmlTree URI
getDocReferences = fromLA (getRefs $< computeDocBase)
where
getRefs base = multi selRefs >>^ toAbsRef base
where
hasLocName n = hasNameWith ((== n) . localPart)
selRef en an = hasLocName en :-> getAttrValue0 an
selRefs = choiceA $
map (uncurry selRef) names
++
[ appletRefs
, objectRefs
, this :-> none
]
names = [ ("img", "src")
, ("input", "src")
, ("link", "href")
, ("script", "src")
]
appletRefs = hasLocName "applet" :-> (getAppRef $< getAppBase)
where
getAppBase = (getAttrValue0 "codebase" `withDefault` ".") >>^ toAbsRef base
getAppRef ab = getAttrValue0 "code" >>^ toAbsRef ab
objectRefs = hasLocName "object" :-> none
toAbsRef :: URI -> URI -> URI
toAbsRef base ref = ( expandURIString ref
>>>
fromMaybe ref
>>>
removeFragment
) base
where
removeFragment r
| "#" `isPrefixOf` path = reverse . tail $ path
| otherwise = r
where
path = dropWhile (/='#') . reverse $ r
computeDocBase :: ArrowXml a => a XmlTree String
computeDocBase = ( ( ( getByPath ["html", "head", "base"]
>>>
getAttrValue "href"
)
&&&
getAttrValue transferURI
)
>>> expandURI
)
`orElse`
getAttrValue transferURI
getByPath :: ArrowXml a => [String] -> a XmlTree XmlTree
getByPath = seqA . map (\ n -> getChildren >>> hasName n)
getHtmlTitle :: ArrowXml a => a XmlTree String
getHtmlTitle = getAllText $
getByPath ["html", "head", "title"]
getHtmlPlainText :: ArrowXml a => a XmlTree String
getHtmlPlainText = getAllText $
getByPath ["html", "body"]
getAllText :: ArrowXml a => a XmlTree XmlTree -> a XmlTree String
getAllText getText' = ( getText'
>>>
( fromLA $ deep getText )
>>^
(" " ++)
)
>. (concat >>> normalizeWS)
isHtmlContents :: ArrowXml a => a XmlTree XmlTree
isHtmlContents = ( getAttrValue transferMimeType
>>>
isA ( `elem` [text_html, application_xhtml] )
) `guards` this
isPdfContents :: ArrowXml a => a XmlTree XmlTree
isPdfContents = ( getAttrValue transferMimeType
>>>
isA ( == application_pdf )
) `guards` this
getTitleOrDocName :: ArrowXml a => a XmlTree String
getTitleOrDocName = ( getHtmlTitle >>> isA (not . null) )
`orElse`
( getAttrValue transferURI >>^ takeFileName )
isElemWithAttr :: ArrowXml a => String -> String -> (String -> Bool) -> a XmlTree XmlTree
isElemWithAttr en an av = isElem
>>>
hasName en
>>>
hasAttrValue an av
application_pdf :: String
application_pdf = "application/pdf"
normalizeWS :: String -> String
normalizeWS = words >>> unwords
limitLength :: Int -> String -> String
limitLength n s
| length s' <= n = s
| otherwise = take (n 3) s' ++ "..."
where
s' = take (n + 1) s