module Heuristics.BanWords where import Control.Applicative import Data.Attoparsec.Text import Data.Char import Data.Foldable import Data.Monoid import Data.Text (Text) import Data.Vector (Vector) import qualified Data.Vector as V newtype MatchPolicy = MatchPolicy { _unMatchPolicy :: Vector Text -> Parser Text } instance Monoid MatchPolicy where mempty = MatchPolicy . const $ empty mappend f g = MatchPolicy $ \x -> _unMatchPolicy f x <|> _unMatchPolicy g x passBlacklist :: MatchPolicy -> Vector Text -> Text -> Maybe Text passBlacklist f blacklist t = case parseOnly (_unMatchPolicy f blacklist) t of Right _ -> Nothing Left _ -> Just t -- | Parses sucessfully (which means the word gets blocked) if any of the Texts -- in the blacklist are an exact match. banExact :: MatchPolicy banExact = MatchPolicy $ \blacklist -> asum $ (\x -> string x *> endOfInput *> return x) <$> blacklist -- | Parses sucessfully (which means the word gets blocked) if any of the Texts -- in the blacklist are a match. -- -- Case is ignored for ASCII characters. Non-alphabetic Unicode characters at -- either the start of the end of the word can be skipped. -- -- E.g. " Admin" is banned against ["admin"], but "a admin" isn't. banAlmostExact :: MatchPolicy banAlmostExact = MatchPolicy $ \blacklist -> asum $ (\x -> skipNonAlphabetical *> asciiCI x *> skipNonAlphabetical *> endOfInput *> return x) <$> blacklist where skipNonAlphabetical :: Parser () skipNonAlphabetical = skipWhile $ not . isAlpha exampleReserved :: Vector Text exampleReserved = V.fromList [ "accounts" , "admin" , "beta" , "billing" , "help" , "jobs" , "mail" , "registration" , "root" , "security" , "support" ]