module Data.Text.Stemming.English
(
stem
) where
import Control.Monad.Reader (MonadReader)
import Control.Applicative ((<|>))
import Data.Maybe (fromMaybe, listToMaybe, catMaybes)
import Data.Monoid ((<>))
import Data.Foldable (asum)
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Proxy
import qualified Data.Text as T
import qualified Data.Text.Stemming.Constants.English as Constants
import Debug.Trace
stem :: S.Set T.Text -> T.Text -> T.Text
stem stopwords input =
fromMaybe stemmed $ checkStop stopwords word <|>
checkLength word <|>
checkSpecialWords word
where
word = T.toLower input
stemmed = doStem word
data WordRegion = WR {
word :: T.Text,
r1 :: T.Text,
r2 :: T.Text
} deriving Show
mapWR :: (T.Text -> T.Text) -> WordRegion -> WordRegion
mapWR f (WR word r1 r2) = WR (f word) (f r1) (f r2)
vowls :: [Char]
vowls = ['a', 'e', 'i', 'o', 'u', 'y']
doStem :: T.Text -> T.Text
doStem =
extractResult . stemmingSteps . regions
where
extractResult = T.toLower
stemmingSteps = step5 . step4 . word . step3 . step2 . step1 . step0
regions = computeRegions . scrubWord
scrubWord :: T.Text -> T.Text
scrubWord =
markConsonantY . stripLeadingApostrophe . normApostrophe
normApostrophe :: T.Text -> T.Text
normApostrophe =
T.replace "\x2018" "\x27" .
T.replace "\x2019" "\x27" .
T.replace "\x201B" "\x27"
stripLeadingApostrophe :: T.Text -> T.Text
stripLeadingApostrophe word
| T.isPrefixOf "\x27" word = T.tail word
| otherwise = word
markConsonantY :: T.Text -> T.Text
markConsonantY = go . replaceInit
where
replaceInit :: T.Text -> T.Text
replaceInit w
| T.head w == 'y' = 'Y' `T.cons` T.tail w
| otherwise = w
go w = T.head w `T.cons` T.zipWith f w (T.tail w)
f c 'y'
| c `elem` vowls = 'Y'
| otherwise = 'y'
f _ c = c
computeRegions :: T.Text -> WordRegion
computeRegions word = let
r1 = fromMaybe (region word) (specialRegion word)
r2 = region r1
in WR word r1 r2
where
region txt
| T.null txt || T.null (T.tail txt) = ""
| otherwise =
case (T.head txt, T.head (T.tail txt)) of
(a,b) | a `elem` vowls && (b `notElem` vowls) -> T.tail $ T.tail txt
(a,b) -> region $ T.tail txt
specialRegion txt = listToMaybe . catMaybes $ [T.stripPrefix x txt | x <- ["gener", "commun", "arsen"]]
step0Suffixes :: [T.Text]
step0Suffixes = ["'s'", "'s", "'"]
step0 ::
WordRegion ->
WordRegion
step0 wr = fromMaybe wr . asum $ dropSuffix wr <$> step0Suffixes
where
dropSuffix wr s
| T.isSuffixOf s (word wr) = let
len = T.length s
wr' = mapWR (T.dropEnd len) wr
in Just wr'
| otherwise = Nothing
step1 ::
WordRegion ->
WordRegion
step1 = step1c . step1b . step1a
step1a ::
WordRegion ->
WordRegion
step1a wr =
let w = word wr
in fromMaybe wr . asum $ [(f . const wr) <$> T.stripSuffix s w | (s, f) <- suffixes]
where
suffixes = [
("sses", ssesF),
("ied", ieF),
("ies", ieF),
("s", sF)]
ssesF = mapWR (T.dropEnd 2)
ieF wr
| T.length (word wr) > 4 = mapWR (T.dropEnd 2) wr
| otherwise = mapWR (T.dropEnd 1) wr
sF wr
| T.any (`elem` vowls) (T.dropEnd 2 (word wr)) &&
T.takeEnd 2 (word wr) `notElem` ["ss", "us"]
= mapWR (T.dropEnd 1) wr
| otherwise = wr
step1b ::
WordRegion ->
WordRegion
step1b wr =
let w' = word wr
in fromMaybe wr . asum $ [(f . const wr) <$> T.stripSuffix s w' | (s,f) <- suffixes]
where
suffixes = [
("eedly", eeF "eedly"),
("eed", eeF "eed"),
("edly", edF "edly"),
("ingly", edF "ingly"),
("ed", edF "ed"),
("ing", edF "ing")
]
eeF s wr = case mapWR (fromMaybe "" . T.stripSuffix s) wr of
(WR word r1 r2) | not (T.null r1) -> let
r2' = if T.null r2
then ""
else T.append r2 "ee"
in WR (T.append word "ee") (T.append r1 "ee") r2'
_ -> mapWR (T.dropEnd $ T.length s) wr
edF s wr =
case T.stripSuffix s (word wr) of
Just w' | T.any (`elem` vowls) w' ->
if
| T.takeEnd 2 w' `elem` doubleConsonants -> endsInDoubleConsonant wr'
| T.null (r1 wr') && (shortV w' || twoLetterWord w') -> addAnE wr'
| T.length w' <= 3 -> addAnE wr'
| T.takeEnd 2 w' `elem` specialEndings -> addAnE wr'
| otherwise -> wr'
_ -> wr
where
twoLetterWord w = T.length w == 2 && T.head w `elem` vowls && T.last w `notElem` vowls
wr' = mapWR (T.dropEnd (T.length s)) wr
doubleConsonants = ["bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt"]
endsInDoubleConsonant = mapWR (T.dropEnd 1)
addAnE = mapWR (\w -> if T.null w then w else w `T.snoc` 'e')
specialEndings = ["at", "bl", "iz"]
step1c ::
WordRegion ->
WordRegion
step1c = swapY
where
endsWith w c = T.takeEnd 1 w == c
safeNonVowl = fromMaybe '_' . safeHead
swapY wr
| T.length (word wr) <= 2 = wr
| word wr `endsWith` "y" && notElem (safeNonVowl . T.takeEnd 1 . T.init $ word wr) vowls =
mapWR (maybe "" (`T.snoc` 'i') . T.stripSuffix "y") wr
| word wr `endsWith` "Y" && notElem (safeNonVowl . T.takeEnd 1 . T.init $ word wr) vowls =
mapWR (maybe "" (`T.snoc` 'i') . T.stripSuffix "Y") wr
| otherwise = wr
safeHead ::
T.Text ->
Maybe Char
safeHead t
| T.length t >= 1 = Just $ T.head t
| otherwise = Nothing
swapLastWithE ::
T.Text ->
T.Text
swapLastWithE = (`T.snoc` 'e') . T.dropEnd 1
step2 ::
WordRegion ->
WordRegion
step2 wr =
let w = word wr
in fromMaybe wr . asum $
[const (suffixInR1 s f wr) =<< T.stripSuffix s w | (s, f) <- suffixes]
where
suffixInR1 :: T.Text -> (T.Text -> T.Text) -> WordRegion -> Maybe WordRegion
suffixInR1 s f wr = fmap (const $ mapWR f wr) . T.stripSuffix s $ r1 wr
suffixes = [
("ational", swapLastWithE . T.dropEnd 4),
("fulness", T.dropEnd 4),
("ousness", T.dropEnd 4),
("iveness", T.dropEnd 4),
("tional", T.dropEnd 2),
("biliti", (`T.append` "le") . T.dropEnd 6),
("lessli", T.dropEnd 2),
("iviti", swapLastWithE . T.dropEnd 2),
("entli", T.dropEnd 2),
("ation", swapLastWithE . T.dropEnd 2),
("alism", T.dropEnd 3),
("aliti", T.dropEnd 3),
("ousli", T.dropEnd 2),
("fulli", T.dropEnd 2),
("enci", swapLastWithE),
("alli", T.dropEnd 2),
("anci", swapLastWithE),
("abli", swapLastWithE),
("izer", T.dropEnd 1),
("ator", swapLastWithE . T.dropEnd 1),
("logi", T.dropEnd 1),
("bli", swapLastWithE),
("li", \w -> if (fromMaybe ' ' . safeHead $ T.takeEnd 3 w) `elem` liEnding
then T.dropEnd 2 w
else w)
]
liEnding :: String
liEnding = "cdeghkmnrt"
suffixInR1 :: T.Text -> (T.Text -> T.Text) -> WordRegion -> Maybe WordRegion
suffixInR1 s f wr = fmap (const $ mapWR f wr) . T.stripSuffix s $ r1 wr
suffixInR2 ::
T.Text ->
(T.Text -> T.Text) ->
WordRegion ->
Maybe WordRegion
suffixInR2 s f wr
| T.isSuffixOf s (r2 wr)
= Just $ mapWR f wr
| otherwise
= Nothing
step3 ::
WordRegion ->
WordRegion
step3 wr =
let w = word wr
in fromMaybe wr . asum $
specialCase wr:[const (suffixInR1 s f wr) =<< T.stripSuffix s w | (s, f) <- suffixes]
where
suffixes = [
("ational", swapLastWithE . T.dropEnd 4),
("tional", T.dropEnd 2),
("alize", T.dropEnd 3),
("icate", T.dropEnd 3),
("iciti", T.dropEnd 3),
("ical", T.dropEnd 2),
("ness", T.dropEnd 4),
("ful", T.dropEnd 3)
]
specialCase wr = const (suffixInR2 "ative" id wr) =<< T.stripSuffix "ative" (word wr)
step4 ::
T.Text ->
T.Text
step4 w =
maybe w word . asum $
[const (suffixInR2 s f wr) =<< T.stripSuffix s w | (s, f) <- suffixes]
where
wr = computeRegions w
ionHandler wr = let
w = word wr
in if T.isSuffixOf "sion" w || T.isSuffixOf "tion" w
then T.dropEnd 3
else id
suffixes = [
("ement", T.dropEnd 5),
("ment", T.dropEnd 4),
("ance", T.dropEnd 4),
("ence", T.dropEnd 4),
("able", T.dropEnd 4),
("ible", T.dropEnd 4),
("ion", ionHandler wr),
("ant", T.dropEnd 3),
("ent", T.dropEnd 3),
("ism", T.dropEnd 3),
("ate", T.dropEnd 3),
("iti", T.dropEnd 3),
("ous", T.dropEnd 3),
("ive", T.dropEnd 3),
("ize", T.dropEnd 3),
("al", T.dropEnd 2),
("er", T.dropEnd 2),
("ic", T.dropEnd 2)
]
step5 ::
T.Text ->
T.Text
step5 w = maybe w word . asum $ [stripL, stripE] <*> [wr]
where
wr = computeRegions w
stripL wr
| T.isSuffixOf "l" (r2 wr) && T.isSuffixOf "ll" (word wr)
= Just $ mapWR (T.dropEnd 1) wr
| otherwise = Nothing
stripE wr
| T.isSuffixOf "e" (r2 wr)
= Just $ mapWR (T.dropEnd 1) wr
| T.isSuffixOf "e" (r1 wr) && not (shortV (word wr))
= Just $ mapWR (T.dropEnd 1) wr
| otherwise
= Just wr
vWXY = vowls <> ['w', 'x', 'Y']
shortV ::
T.Text ->
Bool
shortV w = T.length w >= 4 &&
(
T.head (T.takeEnd 4 w) `notElem` vWXY &&
T.head (T.takeEnd 3 w) `elem` vowls &&
T.head (T.takeEnd 2 w) `notElem` vowls
)
checkStop :: S.Set T.Text -> T.Text -> Maybe T.Text
checkStop sw word
| word `S.member` sw = Just word
| otherwise = Nothing
checkLength :: T.Text -> Maybe T.Text
checkLength word
| T.length word <= 2 = Just word
| otherwise = Nothing
checkSpecialWords :: T.Text -> Maybe T.Text
checkSpecialWords word = M.lookup word Constants.specialStems