{- | Module : Text.HTML.Parser.Util Description : Utility functions for the html-parse library Copyright : (c) Neil Mitchell 2006–2019 (TagSoup), Tony Zorman 2020–2022 (port to html-parse) License : BSD-3 Maintainer : Tony Zorman Stability : experimental Portability : non-portable Utility functions to make working with @html-parse@ as easy as working with TagSoup! Most functions are one-to-one replacements for their respective TagSoup analogues and work the same way. -} module Text.HTML.Parser.Util ( -- * Conversion toToken -- :: Text -> Token , toTokenDefault -- :: Token -> Text -> Token -- * Tag identification , isTagOpen -- :: Token -> Bool , isTagClose -- :: Token -> Bool , isTagSelfClose -- :: Token -> Bool , isContentText -- :: Token -> Bool , isContentChar -- :: Token -> Bool , isComment -- :: Token -> Bool , isDoctype -- :: Token -> Bool , isTagOpenName -- :: Text -> Token -> Bool , isTagCloseName -- :: Text -> Token -> Bool -- * Extraction , fromContentText -- :: Token -> Text , maybeContentText -- :: Token -> Maybe Text , fromAttrib -- :: Attr -> Token -> Attr , maybeAttrib -- :: Attr -> Token -> Maybe Attr , innerText -- :: [Token] -> Text , toHeadContentText -- :: [Token] -> Text , between -- :: Token -> Token -> [Token] -> [Token] , dropHeader -- :: [Attr] -> [Token] -> [Token] , allContentText -- :: [Token] -> [Text] -- * Utility , sections -- :: (a -> Bool) -> [a] -> [[a]] , section -- :: (a -> Bool) -> [a] -> [a] , partitions -- :: (a -> Bool) -> [a] -> [[a]] -- * Combinators , (~==) -- :: Token -> Token -> Bool , (~/=) -- :: Token -> Token -> Bool ) where import qualified Data.Attoparsec.Text as A import qualified Data.List.NonEmpty as NE import qualified Data.Text as T import Data.Either (fromRight) import Data.List (groupBy, tails) import Data.Maybe (fromMaybe, mapMaybe) import Data.Text (Text) import Text.HTML.Parser (Attr (Attr), Token (Comment, ContentChar, ContentText, Doctype, TagClose, TagOpen, TagSelfClose), token) -- | Like 'toTokenDefault', but with a supplied default value. -- -- >>> toToken "text" -- ContentText "text" toToken :: Text -> Token toToken = toTokenDefault (Doctype "Could not parse string into token.") -- | Convert 'Text' to 'Token', with a default in case of a parse failure. toTokenDefault :: Token -> Text -> Token toTokenDefault d = fromRight d . A.parseOnly token -- | This function takes a list, and returns all suffixes whose first item -- matches the predicate. -- -- >>> sections (== 'c') "abc cba ccb" -- ["c cba ccb","cba ccb","ccb","cb"] sections :: (a -> Bool) -> [a] -> [[a]] sections p = filter (p . head) . init . tails -- | Like 'sections', but return the head element. Returns an empty list if no -- head element is present. -- -- >>> section (== 'c') "abc cba ccb" -- "c cba ccb" section :: (a -> Bool) -> [a] -> [a] section f = \case [] -> [] xs -> maybe [] NE.head (NE.nonEmpty (sections f xs)) -- | This function is similar to 'sections', but splits the list so no element -- appears in any two partitions. -- -- >>> partitions (== 'c') "abc cba ccb" -- ["c ","cba ","c","cb"] partitions :: (a -> Bool) -> [a] -> [[a]] partitions p = groupBy (const notp) . dropWhile notp where notp = not . p -- | Get the first 'ContentText' element from a list of 'Token's. If no tag -- could be found, return an empty string. toHeadContentText :: [Token] -> Text toHeadContentText = maybe "" NE.head . NE.nonEmpty . allContentText -- | Get all 'Token's between @start@ and @end@. between :: Token -> Token -> [Token] -> [Token] between start end = takeWhile (~/= end ) . drop 1 -- drop the tag . dropWhile (~/= start) -- | Drop an HTML header (i.e. the header tags and everything in between), as -- well as everything before it, from a list of 'Token's. dropHeader :: [Attr] -> [Token] -> [Token] dropHeader attr = drop 1 -- drop . dropWhile (~/= TagClose "header" ) . dropWhile (~/= TagOpen "header" attr) -- | Get all 'ContentText' entries from a list of 'Token's and extract their -- content. allContentText :: [Token] -> [Text] allContentText = mapMaybe maybeContentText -- | Test if a 'Token' is a 'TagOpen'. isTagOpen :: Token -> Bool isTagOpen = \case TagOpen{} -> True _ -> False -- | Test if a 'Token' is a 'TagClose'. isTagClose :: Token -> Bool isTagClose = \case TagClose{} -> True _ -> False -- | Test if a 'Token' is a 'ContentText'. isContentText :: Token -> Bool isContentText = \case ContentText{} -> True _ -> False -- | Extract the string from within 'ContentText', otherwise return 'Nothing'. maybeContentText :: Token -> Maybe Text maybeContentText = \case ContentText t -> Just t _ -> Nothing -- | Extract the string from within 'ContentText', crashes if not a -- 'ContentText'. fromContentText :: Token -> Text fromContentText = \case ContentText t -> t t -> error $ "(" ++ show t ++ ") is not a ContentText" -- | Extract all text content from a list of Tokens (similar to Verbatim found -- in HaXml). innerText :: [Token] -> Text innerText = mconcat . mapMaybe maybeContentText -- | Test if a 'Token' is a 'TagSelfClose'. isTagSelfClose :: Token -> Bool isTagSelfClose = \case TagSelfClose{} -> True _ -> False -- | Test if a 'Token' is a 'ContentChar'. isContentChar :: Token -> Bool isContentChar = \case ContentChar{} -> True _ -> False -- | Test if a 'Token' is a 'Comment'. isComment :: Token -> Bool isComment = \case Comment{} -> True _ -> False -- | Test if a 'Token' is a 'Doctype'. isDoctype :: Token -> Bool isDoctype = \case Doctype{} -> True _ -> False -- | Returns True if the 'Token' is 'TagOpen' and matches the given name. isTagOpenName :: Text -> Token -> Bool isTagOpenName name (TagOpen n _) = n == name isTagOpenName _ _ = False -- | Returns True if the 'Token' is 'TagClose' and matches the given name. isTagCloseName :: Text -> Token -> Bool isTagCloseName name (TagClose n) = n == name isTagCloseName _ _ = False -- | Extract an attribute; crashes if not a 'TagOpen'. Returns @Attr \"\" \"\"@ -- if no attribute present. -- -- Warning: does not distinguish between missing attribute and present -- attribute with values @\"\"@. fromAttrib :: Attr -> Token -> Attr fromAttrib att tag = fromMaybe (Attr "" "") $ maybeAttrib att tag -- | Extract an attribute; crashes if not a 'TagOpen'. Returns -- 'Nothing' if no attribute present. maybeAttrib :: Attr -> Token -> Maybe Attr maybeAttrib att (TagOpen _ atts) | att `elem` atts = Just att | otherwise = Nothing maybeAttrib _ t = error ("(" ++ show t ++ ") is not a TagOpen") infixl 9 ~== -- | Performs an inexact match, the first item should be the thing to -- match. -- -- >>> ContentText "test" ~== ContentText "" -- True -- -- >>> TagOpen "div" [Attr "class" "division ", Attr "id" "dd"] ~== TagOpen "div" [Attr "class" "division "] -- True (~==) :: Token -> Token -> Bool (~==) = f where f (ContentText y) (ContentText x) = T.null x || x == y f (TagClose y) (TagClose x) = T.null x || x == y f (Comment x) (Comment y) = x == mempty || x == y f (TagOpen y ys) (TagOpen x xs) = (T.null x || x == y) && all g xs where g :: Attr -> Bool g nv@(Attr name val) | T.null name = val `elem` map (\(Attr o _) -> o) ys | T.null val = name `elem` map (\(Attr _ t) -> t) ys | otherwise = nv `elem` ys f _ _ = False infixl 9 ~/= -- | Negation of '(~==)'. (~/=) :: Token -> Token -> Bool (~/=) a b = not (a ~== b)