{- |
Module : Text.HTML.Parser.Util
Description : Utility functions for the html-parse library
Copyright : (c) Neil Mitchell 2006–2019 (TagSoup),
Tony Zorman 2020–2022 (port to html-parse)
License : BSD-3
Maintainer : Tony Zorman
Stability : experimental
Portability : non-portable
Utility functions to make working with @html-parse@ as easy as working
with TagSoup! Most functions are one-to-one replacements for their
respective TagSoup analogues and work the same way.
-}
module Text.HTML.Parser.Util
( -- * Conversion
toToken -- :: Text -> Token
, toTokenDefault -- :: Token -> Text -> Token
-- * Tag identification
, isTagOpen -- :: Token -> Bool
, isTagClose -- :: Token -> Bool
, isTagSelfClose -- :: Token -> Bool
, isContentText -- :: Token -> Bool
, isContentChar -- :: Token -> Bool
, isComment -- :: Token -> Bool
, isDoctype -- :: Token -> Bool
, isTagOpenName -- :: Text -> Token -> Bool
, isTagCloseName -- :: Text -> Token -> Bool
-- * Extraction
, fromContentText -- :: Token -> Text
, maybeContentText -- :: Token -> Maybe Text
, fromAttrib -- :: Attr -> Token -> Attr
, maybeAttrib -- :: Attr -> Token -> Maybe Attr
, innerText -- :: [Token] -> Text
, toHeadContentText -- :: [Token] -> Text
, between -- :: Token -> Token -> [Token] -> [Token]
, dropHeader -- :: [Attr] -> [Token] -> [Token]
, allContentText -- :: [Token] -> [Text]
-- * Utility
, sections -- :: (a -> Bool) -> [a] -> [[a]]
, section -- :: (a -> Bool) -> [a] -> [a]
, partitions -- :: (a -> Bool) -> [a] -> [[a]]
-- * Combinators
, (~==) -- :: Token -> Token -> Bool
, (~/=) -- :: Token -> Token -> Bool
) where
import qualified Data.Attoparsec.Text as A
import qualified Data.List.NonEmpty as NE
import qualified Data.Text as T
import Data.Either (fromRight)
import Data.List (groupBy, tails)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Text (Text)
import Text.HTML.Parser (Attr (Attr), Token (Comment, ContentChar, ContentText, Doctype, TagClose, TagOpen, TagSelfClose), token)
-- | Like 'toTokenDefault', but with a supplied default value.
--
-- >>> toToken "text"
-- ContentText "text"
toToken :: Text -> Token
toToken = toTokenDefault (Doctype "Could not parse string into token.")
-- | Convert 'Text' to 'Token', with a default in case of a parse failure.
toTokenDefault :: Token -> Text -> Token
toTokenDefault d = fromRight d . A.parseOnly token
-- | This function takes a list, and returns all suffixes whose first item
-- matches the predicate.
--
-- >>> sections (== 'c') "abc cba ccb"
-- ["c cba ccb","cba ccb","ccb","cb"]
sections :: (a -> Bool) -> [a] -> [[a]]
sections p = filter (p . head) . init . tails
-- | Like 'sections', but return the head element. Returns an empty list if no
-- head element is present.
--
-- >>> section (== 'c') "abc cba ccb"
-- "c cba ccb"
section :: (a -> Bool) -> [a] -> [a]
section f = \case
[] -> []
xs -> maybe [] NE.head (NE.nonEmpty (sections f xs))
-- | This function is similar to 'sections', but splits the list so no element
-- appears in any two partitions.
--
-- >>> partitions (== 'c') "abc cba ccb"
-- ["c ","cba ","c","cb"]
partitions :: (a -> Bool) -> [a] -> [[a]]
partitions p = groupBy (const notp) . dropWhile notp
where notp = not . p
-- | Get the first 'ContentText' element from a list of 'Token's. If no tag
-- could be found, return an empty string.
toHeadContentText :: [Token] -> Text
toHeadContentText = maybe "" NE.head . NE.nonEmpty . allContentText
-- | Get all 'Token's between @start@ and @end@.
between :: Token -> Token -> [Token] -> [Token]
between start end = takeWhile (~/= end )
. drop 1 -- drop the tag
. dropWhile (~/= start)
-- | Drop an HTML header (i.e. the header tags and everything in between), as
-- well as everything before it, from a list of 'Token's.
dropHeader :: [Attr] -> [Token] -> [Token]
dropHeader attr = drop 1 -- drop
. dropWhile (~/= TagClose "header" )
. dropWhile (~/= TagOpen "header" attr)
-- | Get all 'ContentText' entries from a list of 'Token's and extract their
-- content.
allContentText :: [Token] -> [Text]
allContentText = mapMaybe maybeContentText
-- | Test if a 'Token' is a 'TagOpen'.
isTagOpen :: Token -> Bool
isTagOpen = \case
TagOpen{} -> True
_ -> False
-- | Test if a 'Token' is a 'TagClose'.
isTagClose :: Token -> Bool
isTagClose = \case
TagClose{} -> True
_ -> False
-- | Test if a 'Token' is a 'ContentText'.
isContentText :: Token -> Bool
isContentText = \case
ContentText{} -> True
_ -> False
-- | Extract the string from within 'ContentText', otherwise return 'Nothing'.
maybeContentText :: Token -> Maybe Text
maybeContentText = \case
ContentText t -> Just t
_ -> Nothing
-- | Extract the string from within 'ContentText', crashes if not a
-- 'ContentText'.
fromContentText :: Token -> Text
fromContentText = \case
ContentText t -> t
t -> error $ "(" ++ show t ++ ") is not a ContentText"
-- | Extract all text content from a list of Tokens (similar to Verbatim found
-- in HaXml).
innerText :: [Token] -> Text
innerText = mconcat . mapMaybe maybeContentText
-- | Test if a 'Token' is a 'TagSelfClose'.
isTagSelfClose :: Token -> Bool
isTagSelfClose = \case
TagSelfClose{} -> True
_ -> False
-- | Test if a 'Token' is a 'ContentChar'.
isContentChar :: Token -> Bool
isContentChar = \case
ContentChar{} -> True
_ -> False
-- | Test if a 'Token' is a 'Comment'.
isComment :: Token -> Bool
isComment = \case
Comment{} -> True
_ -> False
-- | Test if a 'Token' is a 'Doctype'.
isDoctype :: Token -> Bool
isDoctype = \case
Doctype{} -> True
_ -> False
-- | Returns True if the 'Token' is 'TagOpen' and matches the given name.
isTagOpenName :: Text -> Token -> Bool
isTagOpenName name (TagOpen n _) = n == name
isTagOpenName _ _ = False
-- | Returns True if the 'Token' is 'TagClose' and matches the given name.
isTagCloseName :: Text -> Token -> Bool
isTagCloseName name (TagClose n) = n == name
isTagCloseName _ _ = False
-- | Extract an attribute; crashes if not a 'TagOpen'. Returns @Attr \"\" \"\"@
-- if no attribute present.
--
-- Warning: does not distinguish between missing attribute and present
-- attribute with values @\"\"@.
fromAttrib :: Attr -> Token -> Attr
fromAttrib att tag = fromMaybe (Attr "" "") $ maybeAttrib att tag
-- | Extract an attribute; crashes if not a 'TagOpen'. Returns
-- 'Nothing' if no attribute present.
maybeAttrib :: Attr -> Token -> Maybe Attr
maybeAttrib att (TagOpen _ atts)
| att `elem` atts = Just att
| otherwise = Nothing
maybeAttrib _ t = error ("(" ++ show t ++ ") is not a TagOpen")
infixl 9 ~==
-- | Performs an inexact match, the first item should be the thing to
-- match.
--
-- >>> ContentText "test" ~== ContentText ""
-- True
--
-- >>> TagOpen "div" [Attr "class" "division ", Attr "id" "dd"] ~== TagOpen "div" [Attr "class" "division "]
-- True
(~==) :: Token -> Token -> Bool
(~==) = f
where
f (ContentText y) (ContentText x) = T.null x || x == y
f (TagClose y) (TagClose x) = T.null x || x == y
f (Comment x) (Comment y) = x == mempty || x == y
f (TagOpen y ys) (TagOpen x xs) = (T.null x || x == y) && all g xs
where
g :: Attr -> Bool
g nv@(Attr name val)
| T.null name = val `elem` map (\(Attr o _) -> o) ys
| T.null val = name `elem` map (\(Attr _ t) -> t) ys
| otherwise = nv `elem` ys
f _ _ = False
infixl 9 ~/=
-- | Negation of '(~==)'.
(~/=) :: Token -> Token -> Bool
(~/=) a b = not (a ~== b)