{- | We do not define a tag data type here, since this is too much bound to the particular use (e.g. list or tree structure). However we define a tag name and several -} module Text.HTML.Basic.Tag ( Tag.Name(..), Tag.doctype, Tag.doctypeName, Tag.doctypeString, Tag.cdata, Tag.cdataName, Tag.cdataString, isEmpty, isSloppy, isInnerOf, closes, maybeMetaHTTPHeader, maybeMetaEncoding, ) where import Text.XML.Basic.Tag (Name, ) import qualified Text.XML.Basic.Tag as Tag import qualified Text.XML.Basic.Attribute as Attr import qualified Text.XML.Basic.Name as Name import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.Char as Char import Data.List.HT (takeWhileRev, ) import Data.Tuple.HT (mapFst, ) import Control.Monad (guard, liftM2, ) {- | Check whether a HTML tag is empty. -} isEmpty :: (Name.Tag name) => Name name -> Bool isEmpty = flip Set.member emptySet {- | Set of empty HTML tags. -} emptySet :: (Name.Tag name) => Set.Set (Name name) emptySet = nameSet $ "area" : "base" : "br" : "col" : "frame" : "hr" : "img" : "input" : "link" : "meta" : "param" : [] {- | Some tags, namely those for text styles like FONT, B, I, are used quite sloppily. That is, they are not terminated or not terminated in the right order. We close them implicitly, if another tag closes and ignore non-matching closing tags. -} isSloppy :: (Name.Tag name) => Name name -> Bool isSloppy = flip Set.member sloppySet {- Example page: http://extremetracking.com/open;unique?login=crsucks -} sloppySet :: (Name.Tag name) => Set.Set (Name name) sloppySet = nameSet $ "font" : "b" : "i" : "tt" : "u" : "strike" : "s" : "big" : "small" : [] isInnerOf :: (Name.Tag name) => Name name -> Name name -> Bool isInnerOf outer inner = maybe False (Set.member inner) $ Map.lookup outer innerMap innerMap :: (Name.Tag name) => Map.Map (Name name) (Set.Set (Name name)) innerMap = nameMap $ ("body", pSet) : ("caption", pSet) : ("dd", pSet) : ("div", pSet) : ("dl", dtdSet) : ("dt", pSet) : ("li", pSet) : ("map", pSet) : ("object", pSet) : ("ol", liSet) : ("table", nameSet ["th","tr","td","thead","tfoot","tbody"]) : ("tbody", thdrSet) : ("td", pSet) : ("tfoot", thdrSet) : ("th", pSet) : ("thead", thdrSet) : ("tr", thdSet) : ("ul", liSet) : [] closes :: (Name.Tag name) => Name name -> Name name -> Bool closes closing opening = (not (Name.match "option" closing) && Name.match "select" opening) || (Name.matchAny ["option", "script", "style","textarea","title"] opening) || (maybe False (Set.member opening) $ Map.lookup closing closesMap) closesMap :: (Name.Tag name) => Map.Map (Name name) (Set.Set (Name name)) closesMap = nameMap $ ("a" , nameSingle "a") : ("li" , liSet) : ("th" , thdSet) : ("td" , thdSet) : ("tr" , thdrSet) : ("dt" , dtdSet) : ("dd" , dtdSet) : ("hr" , pSet) : ("colgroup" , nameSingle "colgroup") : ("form" , nameSingle "form") : ("label" , nameSingle "label") : ("map" , nameSingle "map") : ("object" , nameSingle "object") : ("thead" , nameSet ["colgroup"]) : ("tfoot" , nameSet ["thead", "colgroup"]) : ("tbody" , nameSet ["tbody", "tfoot", "thead", "colgroup"]) : ("h1" , headingSet) : ("h2" , headingSet) : ("h3" , headingSet) : ("h4" , headingSet) : ("h5" , headingSet) : ("h6" , headingSet) : ("dl" , headingSet) : ("ol" , headingSet) : ("ul" , headingSet) : ("table" , headingSet) : ("div" , headingSet) : ("p" , headingSet) : [] nameMap :: (Name.Tag name) => [(String,a)] -> Map.Map (Name name) a nameMap = Map.fromList . map (mapFst Name.fromString) nameSet :: (Name.Tag name) => [String] -> Set.Set (Name name) nameSet = Set.fromList . map Name.fromString nameSingle :: (Name.Tag name) => String -> Set.Set (Name name) nameSingle = Set.singleton . Name.fromString pSet, dtdSet, thdSet, thdrSet, liSet, headingSet :: (Name.Tag name) => Set.Set (Name name) pSet = nameSet ["p"] dtdSet = nameSet ["dt","dd"] thdSet = nameSet ["th","td"] thdrSet = nameSet ["th","td","tr"] liSet = nameSet ["li"] headingSet = nameSet ["h1","h2","h3","h4","h5","h6","p" {- not "div" -}] maybeMetaHTTPHeader :: (Name.Tag name, Name.Attribute name) => Tag.Name name -> [Attr.T name string] -> Maybe (string, string) maybeMetaHTTPHeader name attrs = do guard (Name.match "meta" name) liftM2 (,) (Attr.lookupLit "http-equiv" attrs) (Attr.lookupLit "content" attrs) {- | A simple routine that does not check for valid syntax of the Content-Type specification. In future we might use a distinct @Encoding@ type instead of plain String. -} maybeMetaEncoding :: (Name.Tag name, Name.Attribute name) => Tag.Name name -> [Attr.T name String] -> Maybe String maybeMetaEncoding name attrs = do (headerName, content) <- maybeMetaHTTPHeader name attrs guard (("content-type"==) . map Char.toLower $ headerName) return (map Char.toLower . takeWhileRev ('='/=) $ content)