-- | Program to replace HTML tags by whitespace -- -- This program was originally contributed by Petr Prokhorenkov. -- -- Tested in this benchmark: -- -- * Reading the file -- -- * Replacing text between HTML tags (<>) with whitespace -- -- * Writing back to a handle -- {-# OPTIONS_GHC -fspec-constr-count=5 #-} module Benchmarks.Programs.StripTags ( benchmark ) where import Criterion (Benchmark, bgroup, bench, whnfIO) import Data.List (mapAccumL) import System.IO (Handle, hPutStr) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T benchmark :: FilePath -> Handle -> IO Benchmark benchmark i o = return $ bgroup "StripTags" [ bench "String" $ whnfIO $ readFile i >>= hPutStr o . string , bench "ByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . byteString , bench "Text" $ whnfIO $ T.readFile i >>= T.hPutStr o . text , bench "TextByteString" $ whnfIO $ B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8 ] string :: String -> String string = snd . mapAccumL step 0 text :: T.Text -> T.Text text = snd . T.mapAccumL step 0 byteString :: B.ByteString -> B.ByteString byteString = snd . BC.mapAccumL step 0 step :: Int -> Char -> (Int, Char) step d c | d > 0 || d' > 0 = (d', ' ') | otherwise = (d', c) where d' = d + depth c depth '>' = 1 depth '<' = -1 depth _ = 0