{-# LANGUAGE LambdaCase #-} module Text.Seonbi.Html.Clipper ( clipPrefixText , clipSuffixText , clipText ) where import Control.Monad import Data.List (dropWhileEnd) import Data.Text import Text.Seonbi.Html -- | Clip the given prefix text and suffix text from the HTML fragments. -- It simply is composed of 'clipPrefixText' and 'clipSuffixText' functions. -- It returns 'Nothing' if any of a prefix and a suffix does not match. clipText :: Text -> Text -> [HtmlEntity] -> Maybe [HtmlEntity] clipText prefix suffix = clipSuffixText suffix <=< clipPrefixText prefix -- | Clip the given prefix text from the HTML fragments. If its first -- text element does not have the same prefix, or the first element is not -- an 'HtmlText' node, or the list of HTML fragments have nothing at all, -- it returns 'Nothing'. -- -- >>> :set -XOverloadedLists -- >>> :set -XOverloadedStrings -- >>> clipPrefixText "foo" [HtmlText [] "bar", HtmlStartTag [] P ""] -- Nothing -- >>> clipPrefixText "foo" [HtmlStartTag [] P "", HtmlText [] "foo"] -- Nothing -- >>> clipPrefixText "foo" [] -- Nothing -- -- If the first element is an 'HtmlText' node, and its 'rawText' contains -- the common prefix text, it returns a 'Just' value holding a list of -- HTML fragments with the common prefix removed. -- -- >>> clipPrefixText "foo" [HtmlText [] "foobar", HtmlStartTag [] P ""] -- Just [HtmlText {... "bar"},HtmlStartTag {...}] -- >>> clipPrefixText "foo" [HtmlText [] "foo", HtmlStartTag [] P ""] -- Just [HtmlStartTag {..., tag = P, ...}] -- -- A given text is treated as a raw text, which means even if some HTML -- entities refer to the same characters it may fails to match unless -- they share the exactly same representation, e.g.: -- -- >>> clipPrefixText "&" [HtmlText [] "&"] -- Nothing -- -- In the same manner, it doesn't find a prefix from 'HtmlCdata', e.g.: -- -- >>> clipPrefixText "foo" [HtmlCdata [] "foo", HtmlStartTag [] P ""] -- Nothing -- -- In order to remove a prefix from both 'HtmlText' and 'HtmlCdata', -- apply 'normalizeText' first so that all 'HtmlCdata' entities are transformed -- to equivalent 'HtmlText' entities: -- -- >>> import Text.Seonbi.Html.TextNormalizer (normalizeText) -- >>> let normalized = normalizeText [HtmlCdata [] "foo", HtmlStartTag [] P ""] -- >>> clipPrefixText "foo" normalized -- Just [HtmlStartTag {..., tag = P, ...}] -- -- Plus, it works even if HTML fragments contain some 'HtmlComment' entities, -- but these are not touched at all, e.g.: -- -- >>> clipPrefixText "bar" [HtmlComment [] "foo", HtmlText [] "barbaz"] -- Just [HtmlComment {... "foo"},HtmlText {... "baz"}] clipPrefixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity] clipPrefixText prefix [] | Data.Text.null prefix = Just [] | otherwise = Nothing clipPrefixText prefix (x@HtmlComment {} : xs) = (x :) <$> clipPrefixText prefix xs clipPrefixText prefix (x@HtmlText { rawText = rawText' } : xs) | prefix == rawText' = Just xs | prefix `isPrefixOf` rawText' = Just $ x { rawText = Data.Text.drop (Data.Text.length prefix) rawText' } : xs | otherwise = Nothing clipPrefixText _ _ = Nothing -- | Clip the given suffix text from the HTML fragments, in the same manner -- to 'clipPrefixText'. clipSuffixText :: Text -> [HtmlEntity] -> Maybe [HtmlEntity] clipSuffixText suffix [] | Data.Text.null suffix = Just [] | otherwise = Nothing clipSuffixText suffix entities = case Prelude.last entities' of e@HtmlText { rawText = rawText' } | suffix == rawText' -> Just (init' ++ comments) | suffix `isSuffixOf` rawText' -> let sLen = Data.Text.length suffix rtLen = Data.Text.length rawText' clipped = Data.Text.take (rtLen - sLen) rawText' in Just (init' ++ e { rawText = clipped } : comments) | otherwise -> Nothing _ -> Nothing where entities' :: [HtmlEntity] entities' = (`Data.List.dropWhileEnd` entities) $ \ case HtmlComment {} -> True _ -> False init' :: [HtmlEntity] init' = Prelude.init entities' comments :: [HtmlEntity] comments = Prelude.drop (Prelude.length entities') entities