module Text.Html.IsLink
    ( isLinkAttr
    , allLinkAttrs
      -- * Example with HXT
      -- $example
    ) where

import Data.HashSet (HashSet)
import qualified Data.HashSet as HS

-- | Decide whether an attribute of an HTML element refers to an external
-- resource.
--
-- @isLinkAttr tag attr@ is 'True' exactly when the pair @(tag, attr)@ is a
-- member of 'allLinkAttrs'. For instance, @isLinkAttr \"a\" \"href\"@ is
-- 'True', while @isLinkAttr \"a\" \"class\"@ is 'False'.
--
-- The lookup is case-sensitive and every entry of 'allLinkAttrs' is written
-- in lowercase, so callers must lowercase both arguments first:
-- @isLinkAttr \"A\" \"HREF\"@ is 'False'.
isLinkAttr :: String -> String -> Bool
isLinkAttr tag attr = (tag, attr) `HS.member` allLinkAttrs

-- Sources used to compile the table below:
--   * The HTML 4.01 transitional DTD
--   * The HTML 4.01 strict DTD
--   * The HTML 3.2 DTD
--   * The HTML 3.0 DTD
--   * The XMLmind XML Editor 6.0.0 Evaluation Edition, which contains
--     a BSD licensed W3C XML Schema file for HTML 5
--   * The HTML::Tagset Perl module (version 3.20)
-- See the scripts/ directory for more details.

-- | A 'HashSet' that contains all combinations of tag names and attributes
-- that correspond to links.
allLinkAttrs :: HashSet (String, String)
allLinkAttrs =
    HS.fromList
        [ (tag, attr)
        | (tag, attrs) <- linkAttrsByTag
        , attr <- attrs
        ]
  where
    -- One row per tag name, listing every attribute of that tag that is
    -- treated as a link. The comprehension above flattens the rows into
    -- individual (tag, attribute) pairs. All names are lowercase.
    linkAttrsByTag :: [(String, [String])]
    linkAttrsByTag =
        [ ("a",          ["href"])
        , ("applet",     ["archive", "code", "codebase"])
        , ("area",       ["href"])
        , ("audio",      ["src"])
        , ("base",       ["href"])
        , ("bgsound",    ["src"])
        , ("blockquote", ["cite"])
        , ("body",       ["background"])
        , ("button",     ["formaction"])
        , ("command",    ["icon"])
        , ("del",        ["cite"])
        , ("embed",      ["pluginspage", "src"])
        , ("fig",        ["src"])
        , ("form",       ["action"])
        , ("frame",      ["longdesc", "src"])
        , ("head",       ["profile"])
        , ("hr",         ["src"])
        , ("html",       ["manifest"])
        , ("iframe",     ["longdesc", "src"])
        , ("ilayer",     ["background"])
        , ("img",        ["longdesc", "lowsrc", "src", "usemap"])
        , ("input",      ["formaction", "src", "usemap"])
        , ("ins",        ["cite"])
        , ("isindex",    ["action"])
        , ("layer",      ["background", "src"])
        , ("link",       ["href"])
        , ("note",       ["src"])
        , ("object",     ["archive", "classid", "codebase", "data", "usemap"])
        , ("overlay",    ["src"])
        , ("q",          ["cite"])
        , ("script",     ["for", "src"])
        , ("source",     ["src"])
        , ("table",      ["background"])
        , ("td",         ["background"])
        , ("th",         ["background"])
        , ("track",      ["src"])
        , ("tr",         ["background"])
        , ("video",      ["poster", "src"])
        , ("xmp",        ["href"])
        ]

-- $example
-- The following example shows how 'isLinkAttr' can be combined with @hxt@
-- to extract all links from an HTML document:
--
-- > {-# LANGUAGE Arrows #-}
-- >
-- > import Text.Html.IsLink
-- > import Text.XML.HXT.Core
-- >
-- > -- returns a list of tuples containing the tag name, attribute name,
-- > -- and attribute value of all links
-- > getAllLinks :: FilePath -> IO [(String, String, String)]
-- > getAllLinks path = runX $ doc >>> multi getLink
-- >   where
-- >     doc = readDocument [withParseHTML yes, withWarnings no] path
-- >
-- > getLink :: ArrowXml a => a XmlTree (String, String, String)
-- > getLink = proc node -> do
-- >     tag       <- getName -< node
-- >     attrbNode <- getAttrl -< node
-- >     attrb     <- getName -< attrbNode
-- >     val       <- xshow getChildren -< attrbNode
-- >     isLinkA -< (tag, attrb, val)
-- >   where
-- >     isLinkA = isLink `guardsP` this
-- >     isLink (tag, attrb, _) = isLinkAttr tag attrb