{-| Module : Text.HTML.TagSoup Copyright : (c) Neil Mitchell 2006-2007 License : BSD-style Maintainer : http://www.cs.york.ac.uk/~ndm/ Stability : unstable Portability : portable This module is for extracting information out of unstructured HTML code, sometimes known as tag-soup. This is for situations where the author of the HTML is not cooperating with the person trying to extract the information, but is also not trying to hide the information. The standard practice is to parse a String to 'Tag's using 'parseTags', then operate upon it to extract the necessary information. -} module Text.HTML.TagSoup( -- * Data structures and parsing Tag(..), Attribute, module Text.HTML.TagSoup.Parser, canonicalizeTags, -- * Tag identification isTagOpen, isTagClose, isTagText, isTagWarning, isTagOpenName, isTagCloseName, -- * Extraction fromTagText, fromAttrib, maybeTagText, maybeTagWarning, innerText, -- * Utility sections, partitions, -- * Combinators TagRep, IsChar, (~==),(~/=) ) where import Text.HTML.TagSoup.Parser import Text.HTML.TagSoup.Type import Data.Char import Data.List {- | Turns all tag names to lower case and converts DOCTYPE to upper case. -} canonicalizeTags :: [Tag] -> [Tag] canonicalizeTags = map f where f (TagOpen name attrs) | "!" `isPrefixOf` name = TagOpen (map toUpper name) attrs f (TagOpen name attrs) | otherwise = TagOpen (map toLower name) attrs f (TagClose name) = TagClose (map toLower name) f a = a -- | Define a class to allow String's or Tag's to be used as matches class TagRep a where toTagRep :: a -> Tag instance TagRep Tag where toTagRep = id class IsChar a where toChar :: a -> Char instance IsChar Char where toChar = id instance IsChar c => TagRep [c] where toTagRep x = case parseTags s of [a] -> a _ -> error $ "When using a TagRep it must be exactly one tag, you gave: " ++ s where s = map toChar x -- | Performs an inexact match, the first item should be the thing to match. -- If the second item is a blank string, that is considered to match anything. -- For example: -- -- > (TagText "test" ~== TagText "" ) == True -- > (TagText "test" ~== TagText "test") == True -- > (TagText "test" ~== TagText "soup") == False -- -- For 'TagOpen' missing attributes on the right are allowed. (~==) :: TagRep t => Tag -> t -> Bool (~==) a b = f a (toTagRep b) where f (TagText y) (TagText x) = null x || x == y f (TagClose y) (TagClose x) = null x || x == y f (TagOpen y ys) (TagOpen x xs) = (null x || x == y) && all g xs where g (name,val) | null name = val `elem` map snd ys | null val = name `elem` map fst ys g nameval = nameval `elem` ys f _ _ = False -- | Negation of '~==' (~/=) :: TagRep t => Tag -> t -> Bool (~/=) a b = not (a ~== b) -- | This function takes a list, and returns all suffixes whose -- first item matches the predicate. sections :: (a -> Bool) -> [a] -> [[a]] sections p = filter (p . head) . init . tails -- | This function is similar to 'sections', but splits the list -- so no element appears in any two partitions. partitions :: (a -> Bool) -> [a] -> [[a]] partitions p = let notp = not . p in groupBy (const notp) . dropWhile notp