{-# LANGUAGE OverloadedStrings #-} module NLP.PStemmer.Ru where import qualified Data.Text as T import NLP.PStemmer.Internal.Function type ListG = [(T.Text, [T.Text])] runPorter' :: [T.Text] -> [(T.Text, T.Text)] runPorter' = map (id &&& runPorter) runPorter :: T.Text -> T.Text runPorter txt = let rv :: RV rv = mkRV txt in fst rv `T.append` applySteps (snd rv) [step1, step2, step3, step4] step4 :: Step step4 txt | T.null txt = txt | otherwise = getFirstSR [step4_1, step4_2, step4_3] txt where step4_1 :: Substep step4_1 txt' = if "нн" `T.isSuffixOf` txt' then (T.init txt', True) else (txt', False) step4_2 :: Substep step4_2 txt' = case cutList mapSuperl txt' of (t, True) -> (fst $ step4_1 t, True) _ -> (txt', False) step4_3 :: Substep step4_3 txt' = if T.last txt' == 'ь' then (T.init txt', True) else (txt', False) step3 :: Step step3 txt | T.null txt = txt | otherwise = let r2 :: Maybe T.Text r2 = snd . mkR1R2 $ txt in case r2 of Just t -> if any (`T.isSuffixOf` t) mapDerivat then fst . cutList mapDerivat $ txt else txt _ -> txt step2 :: Step step2 txt | T.null txt = txt | otherwise = if T.last txt == 'и' then T.init txt else txt step1 :: Step step1 txt | T.null txt = txt | otherwise = getFirstSR [step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7, step1_8, step1_9] txt where step1_1 :: Substep step1_1 = cutList mapPG_G2 step1_2 :: Substep step1_2 = cutListG mapPG_G1 step1_3 :: Substep step1_3 t = second (const False) $ cutList mapRefl t step1_4 :: Substep step1_4 = cutList mapPartic_G2 step1_5 :: Substep step1_5 = cutListG mapPartic_G1 step1_6 :: Substep step1_6 = cutList mapADJ step1_7 :: Substep step1_7 = cutList mapVerb_G2 step1_8 :: Substep step1_8 = cutListG mapVerb_G1 step1_9 :: Substep step1_9 = cutList mapNoun cutListG :: ListG -> Substep cutListG ((w, ws):ls) txt = if any (`T.isSuffixOf` txt) ws then fromMaybe (txt, False) (flip (,) True <$> T.stripSuffix w txt) else cutListG ls txt cutListG _ txt = (txt, False) prepareList :: List -> List prepareList = sortByLengthDown prepareListWith :: List -> List -> List prepareListWith sl al = sortByLengthDown . concatMap (\w -> map (w `T.append`) al) $ sl prepareListG :: List -> ListG prepareListG = map (\w -> (w, map (`T.cons` w) neccSuff)) . sortByLengthDown prepareListGWith :: List -> List -> ListG prepareListGWith sl al = map (\w -> (w, map (`T.cons` w) neccSuff)) . sortByLengthDown . concatMap (\w -> map (w `T.append`) al) $ sl mkRV :: T.Text -> RV mkRV txt = let (p1,p2) = T.break (`elem` vowels) txt in case T.uncons p2 of Just (ch, p2') -> (p1 `T.snoc` ch, p2') _ -> (p1,p2) mkR1R2 :: T.Text -> R1R2 mkR1R2 txt | T.null txt = r1r2null | otherwise = mkR' (T.tails txt) r1r2null where mkR' :: [T.Text] -> R1R2 -> R1R2 mkR' _ r1r2@(Just _, Just _) = r1r2 mkR' (t1:t2:xs) r1r2@(Nothing, r2@Nothing) | T.head t1 `notElem` vowels = mkR' (t2:xs) (tNull t2, r2) | otherwise = mkR' (t2:xs) r1r2 mkR' (t1:t2:t3:xs) r1r2@(r1@(Just _), Nothing) | chkV t1 t2 = mkR' (t3:xs) (r1, tNull t3) | otherwise = mkR' (t2:t3:xs) r1r2 mkR' _ r1r2 = r1r2 r1r2null :: R1R2 r1r2null = (Nothing, Nothing) tNull :: T.Text -> Maybe T.Text tNull t | T.null t = Nothing | otherwise = Just t chkV :: T.Text -> T.Text -> Bool chkV t1 t2 = (T.head t1 `elem` vowels) && (T.head t2 `notElem` vowels) -- ----------------------------------------------------------------------------- -- * Data of Poter stemmer for Russian language neccSuff :: String neccSuff = "ая" mapPG_G1 :: ListG mapPG_G1 = prepareListG ["в", "вши", "вшись"] mapPG_G2 :: List mapPG_G2 = prepareList ["ив", "ивши", "ившись", "ыв", "ывши", "ывшись"] mapADJ :: List mapADJ = prepareList ["ее", "её", "ёе", "ёё", "ие", "иё", "ые", "ыё", "ое", "оё", "ими", "ыми", "ей", "ёй", "ий", "ый", "ой", "ем", "ём", "им", "ым", "ом", "его", "ёго", "ого", "ему", "ёму", "ому", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею", "ёю"] mapPartic_G1 :: ListG mapPartic_G1 = prepareListGWith ["ем", "ём", "нн", "вш", "ющ", "щ"] mapADJ mapPartic_G2 :: List mapPartic_G2 = prepareListWith ["ивш", "ывш", "ующ"] mapADJ mapRefl :: List mapRefl = prepareList ["ся", "сь"] mapVerb_G1 :: ListG mapVerb_G1 = prepareListG ["ла", "на", "ете", "ёте", "етё", "ётё", "йте", "йтё", "ли", "й", "л", "ем", "ём", "н", "ло", "но", "ет", "ёт", "ют", "ны", "ть", "ешь", "ёшь", "нно"] mapVerb_G2 :: List mapVerb_G2 = prepareList ["ила", "ыла", "ена", "ёна", "ейте", "ёйте", "ейтё", "ёйтё", "уйте", "уйтё", "ите", "итё", "или", "ыли", "ей", "ёй", "уй", "ил", "ыл", "им", "ым", "ен", "ён", "ило", "ыло", "ено", "ёно", "ят", "ует", "уёт", "уют", "ит", "ыт", "ены", "ёны", "ить", "ыть", "ишь", "ую", "ю"] mapNoun :: List mapNoun = prepareList ["а", "ев", "ёв", "ов", "ие", "иё", "ье", "е", "ё", "иями", "ями", "ами", "еи", "ёи", "ии", "и", "ией", "иёй", "ей", "ёй", "ой", "ий", "й", "иям", "ям", "ием", "иём", "ем", "ём", "ам", "ом", "о", "у", "ах", "иях", "ях", "ы", "ь", "ию", "ью", "ю", "ия", "ья", "я"] mapSuperl :: List mapSuperl = prepareList ["ейш", "ёйш", "ейше", "ёйше", "ейшё", "ёйшё"] mapDerivat :: List mapDerivat = prepareList ["ост", "ость"] vowels :: String vowels = "аеёиоуыэюя"