{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-| This implements a common DSL for regular expression replacement text. This is represented with the 'Replace' data type. It also implements the 'IsString' interface, so if 'OverloadedStrings' is on, you can use a raw string to build the replacement. -} module Data.Text.ICU.Replace ( -- * @OverloadedStrings@ Syntax -- $doc -- * Types Replace -- * High-level interface , replace , replace' , replaceAll , replaceAll' -- * Low-level interface , rgroup , rtext , rstring , rfn , rtfn , rbuilder ) where import Control.Applicative import Control.Arrow import Data.Attoparsec.Text import Data.Foldable import Data.Monoid import Data.String import qualified Data.Text as T import Data.Text.ICU import qualified Data.Text.ICU as ICU import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TB -- | A 'Replace' instance is a function from a regular expression match to -- a 'Data.Text.Lazy.Builder.Builder'. This naturally forms a 'Monoid', so -- they're easy to combine. -- -- 'Replace' also implements 'IsString', so raw strings can be used to -- construct them. newtype Replace = Replace { unReplace :: Match -> TB.Builder } deriving (Monoid) instance IsString Replace where fromString = parseReplace . T.pack -- | Execute a regular expression on a 'Data.Text.Text' and replace the -- first match. replace :: Regex -- ^ The regular expression to match. -> Replace -- ^ The specification to replace it with. -> T.Text -- ^ The text to operate on. -> T.Text -- ^ The text with the first match replaced. replace re r t = maybe t (replace' r) $ ICU.find re t -- | Replace one regular expression match with the 'Replace'. replace' :: Replace -- ^ The specification to replace it with. -> Match -- ^ The match to replace. -> T.Text -- ^ The text with the match replaced. replace' r m = finish (Last (Just m)) $ unReplace r m -- | Execute a regular expression on a 'Data.Text.Text' and replace all -- matches. replaceAll :: Regex -- ^ The regular expression to match. -> Replace -- ^ The specification to replace it with. -> T.Text -- ^ The text to operate on. -> T.Text -- ^ The text with all matches replaced. replaceAll re r t = case ICU.findAll re t of [] -> t ms -> replaceAll' r ms -- | Replace all regular expression matches with the 'Replace'. replaceAll' :: Replace -- ^ The specification to replace it with. -> [Match] -- ^ The matches to replace. -> T.Text -- ^ The text with all matches replaced. replaceAll' r ms = uncurry finish $ foldMap (Last . Just &&& build r) ms where build :: Replace -> Match -> TB.Builder build repl m = TB.fromText (ICU.span m) <> unReplace repl m -- This handles the last match by including not only the match's prefix and -- the replacement text, but also the suffix trailing the match. finish :: Last Match -> TB.Builder -> T.Text finish m b = TL.toStrict . TB.toLazyText . mappend b . TB.fromText . fold $ suffix 0 =<< getLast m -- | Create a 'Replace' that inserts a regular expression group. rgroup :: Int -- ^ The number of the group in a regular expression. -> Replace -- ^ The 'Replace' that inserts a group's match. rgroup g = Replace $ fold . fmap TB.fromText . group g -- | Create a 'Replace' that inserts static 'Data.Text.Text'. rtext :: T.Text -- ^ The static 'Data.Text.Text' to insert. -> Replace -- ^ The 'Replace' that inserts the static 'Data.Text.Text'. rtext = rbuilder . TB.fromText -- | Create a 'Replace' that inserts a static 'String'. rstring :: String -- ^ The static 'String' to insert. -> Replace -- ^ The 'Replace' that inserts the static 'String'. rstring = rbuilder . TB.fromString -- | Create a 'Replace' from a function that transforms a 'Match' into -- a 'Data.Text.Lazy.Builder.Builder'. rfn :: (Match -> TB.Builder) -- ^ The function that creates the replacement text. -> Replace -- ^ The 'Replace' based off that function. rfn = Replace -- | Create a 'Replace' From a function that transforms a 'Match' into -- a 'Data.Text.Text'. rtfn :: (Match -> T.Text) -- ^ The function that creates the replacement text. -> Replace -- ^ The 'Replace' based off that function. rtfn = Replace . (TB.fromText .) -- | Create a 'Replace' that inserts a static 'Data.Text.Lazy.Builder.Builder'. rbuilder :: TB.Builder -- ^ The 'Data.Text.Lazy.Builder.Builder' to insert. -> Replace -- ^ The 'Replace' that inserts the static -- 'Data.Text.Lazy.Builder.Builder'. rbuilder = Replace . const -- | This parses a 'Data.Text.Text' into a 'Replace' structure. -- -- Generally, input text is considered to be static. -- -- However, groups from the regular expression's matches can be insert -- using @$1@ (to insert the first group) or @${7}@ (to insert the seventh -- group). -- -- Dollar signs can be included in the output by doubling them (@$$@). parseReplace :: T.Text -> Replace parseReplace t = either (const $ rtext t) id $ parseOnly (replacement <* endOfInput) t -- A replacement. replacement :: Parser Replace replacement = mconcat <$> many (dollarGroup <|> raw) -- A @\$\d+@ or @\$\{\d+\}@ group. This could also be an escaped dollar -- sign (@$$@). dollarGroup :: Parser Replace dollarGroup = char '$' *> (grp <|> escaped) where curly = char '{' *> decimal <* char '}' grp = rgroup <$> (decimal <|> curly) escaped = rtext . T.singleton <$> char '$' -- A raw input string. It must contain no dollar signs (@$@). raw :: Parser Replace raw = rtext <$> takeWhile1 (/= '$') -- $doc -- -- The syntax used with 'OverloadedStrings' is meant to be similar to that -- used in other regular expression libraries in other programming -- languages. -- -- Generally, input text is considered to be static. -- -- >>> replaceAll "a" "b" "aaa" -- "bbb" -- >>> replaceAll "ab" "ba" "cdababcd" -- "cdbabacd" -- -- However, groups from the regular expression's matches can be insert -- using @$1@ (to insert the first group) or @${7}@ (to insert the seventh -- group). -- -- >>> replaceAll "(.*), (.*)" "$2 $1" "Beeblebrox, Zaphod" -- "Zaphod Beeblebrox" -- >>> replaceAll "4(\\d)" "${1}4" "7458" -- "7548" -- -- Dollar signs can be included in the output by doubling them (@$$@). -- -- >>> replaceAll "(\\d+\\.\\d+)" "$$$1" "9.99" -- "$9.99"