| Safe Haskell | None |
|---|---|
- defaultHtmlCrawlerConfig :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
- getHtmlReferences :: ArrowXml a => a XmlTree URI
- getDocReferences :: ArrowXml a => a XmlTree URI
- toAbsRef :: URI -> URI -> URI
- computeDocBase :: ArrowXml a => a XmlTree String
- getByPath :: ArrowXml a => [String] -> a XmlTree XmlTree
- getHtmlTitle :: ArrowXml a => a XmlTree String
- getHtmlPlainText :: ArrowXml a => a XmlTree String
- getAllText :: ArrowXml a => a XmlTree XmlTree -> a XmlTree String
- isHtmlContents :: ArrowXml a => a XmlTree XmlTree
- isPdfContents :: ArrowXml a => a XmlTree XmlTree
- getTitleOrDocName :: ArrowXml a => a XmlTree String
- isElemWithAttr :: ArrowXml a => String -> String -> (String -> Bool) -> a XmlTree XmlTree
- application_pdf :: String
- normalizeWS :: String -> String
- limitLength :: Int -> String -> String
Documentation
defaultHtmlCrawlerConfig :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r [Source]
getHtmlReferences :: ArrowXml a => a XmlTree URI [Source]
Collect all HTML references to other documents within `a`, `frame` and `iframe` elements.
getDocReferences :: ArrowXml a => a XmlTree URI [Source]
toAbsRef :: URI -> URI -> URI [Source]
Construct an absolute URI from a base URI and a possibly relative URI.
computeDocBase :: ArrowXml a => a XmlTree String [Source]
Compute the base URI of an HTML page with respect to a possibly given base element in the head element of the page.
Stolen from Uwe Schmidt, http://www.haskell.org/haskellwiki/HXT and then stolen back again by Uwe from Holumbus.Utility
getHtmlTitle :: ArrowXml a => a XmlTree String [Source]
getHtmlPlainText :: ArrowXml a => a XmlTree String [Source]
isHtmlContents :: ArrowXml a => a XmlTree XmlTree [Source]
isPdfContents :: ArrowXml a => a XmlTree XmlTree [Source]
getTitleOrDocName :: ArrowXml a => a XmlTree String [Source]
normalizeWS :: String -> String [Source]
Normalize whitespace by splitting a text into words and joining them back together with `unwords`.
limitLength :: Int -> String -> String [Source]
Take the first n chars of a string; if the input is too long, the cut-off is indicated by "..." at the end.