module Data.PoliMorf
(
Form
, Base
, Tag
, Cat
, Entry (..)
, atomic
, readPoliMorf
, parsePoliMorf
, Rule (..)
, apply
, toBase
, mkRuleMap
, BaseMap
, mkBaseMap
, FormMap
, mkFormMap
, RelCode (..)
, mergeWith
, merge
) where
import Control.Applicative ((<$>), (<*>))
import Data.Maybe (catMaybes)
import Data.Monoid (mappend)
import Data.Binary (Binary, get, put)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Data.DAWG as D
import Debug.Trace (trace)
type Form = T.Text
type Base = T.Text
type Tag = T.Text
type Cat = T.Text
data Entry = Entry
{ form :: !Form
, base :: !Base
, tag :: !Tag
, cat :: !Cat }
deriving (Eq, Ord, Show, Read)
atomic :: Entry -> Bool
atomic x
| "sup" `T.isInfixOf` tag x && "naj" `T.isPrefixOf` form x = False
| "neg" `T.isInfixOf` tag x && "nie" `T.isPrefixOf` form x = False
| otherwise = True
readPoliMorf :: FilePath -> IO [Entry]
readPoliMorf path = parsePoliMorf <$> L.readFile path
parsePoliMorf :: L.Text -> [Entry]
parsePoliMorf = map parsePoliRow . L.lines
parsePoliRow :: L.Text -> Entry
parsePoliRow row = case map L.toStrict (L.split (=='\t') row) of
[_form, _base, _tag, _cat] -> Entry _form _base _tag _cat
_ -> error $ "parsePoliRow: invalid row \"" ++ L.unpack row ++ "\""
data Rule = Rule {
cut :: !Int
, suffix :: !T.Text }
deriving (Show, Eq, Ord)
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x cut r) x `T.append` suffix r
toBase :: Entry -> Maybe Rule
toBase x
| "sup" `T.isInfixOf` tag x && "naj" `T.isPrefixOf` form x = Nothing
| "neg" `T.isInfixOf` tag x && "nie" `T.isPrefixOf` form x = Nothing
| otherwise =
let k = lcp (form x) (base x)
in Just $ Rule (T.length (form x) k) (T.drop k (base x))
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> trace (show (form x, base x)) 0
between :: T.Text -> T.Text -> Rule
between source dest =
let k = lcp source dest
in Rule (T.length source k) (T.drop k dest)
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> 0
type BaseMap = D.DAWG (S.Set Rule)
type FormMap = D.DAWG (S.Set Rule)
mkRuleMap :: [(T.Text, T.Text)] -> D.DAWG (S.Set Rule)
mkRuleMap xs = D.fromListWith S.union $
[ ( T.unpack x
, S.singleton (between x y) )
| (x, y) <- xs ]
mkBaseMap :: [Entry] -> BaseMap
mkBaseMap = mkRuleMap . map ((,) <$> form <*> base)
mkFormMap :: [Entry] -> FormMap
mkFormMap = mkRuleMap . map ((,) <$> base <*> form)
data RelCode
= ByForm
| ByBase
| Exact
deriving (Eq, Ord, Show, Read)
instance Binary RelCode where
put Exact = put '1'
put ByBase = put '2'
put ByForm = put '3'
get = get >>= \x -> return $ case x of
'1' -> Exact
'2' -> ByBase
'3' -> ByForm
c -> error $ "get: invalid RelCode code '" ++ [c] ++ "'"
mergeWith
:: Ord a
=> (String -> String -> a -> a)
-> BaseMap
-> D.DAWG (S.Set a)
-> D.DAWG (M.Map a RelCode)
mergeWith f poli dict0 = D.fromList
[ (x, combine x)
| x <- keys ]
where
keys = join (D.keys poli) (D.keys dict0)
combine x = (M.unionsWith max . catMaybes)
[ label Exact <$> D.lookup x dict0
, label ByBase <$> D.lookup x dict1
, label ByForm <$> D.lookup x dict2 ]
label :: Ord a => RelCode -> S.Set a -> M.Map a RelCode
label code s = M.fromList [(x, code) | x <- S.toList s]
dict1 = D.fromListWith mappend
[ (lemma, f'Set lemma _form x)
| (_form, x) <- D.assocs dict0
, lemma <- elemsOn poli _form ]
dict2 = D.fromListWith mappend
[ (form', f'Set form' _form x)
| (_form, x) <- D.assocs dict0
, lemma <- elemsOn poli _form
, form' <- elemsOn ilop lemma ]
ilop = mkRuleMap
[ (base'Text, form'Text)
| (form'String, rules) <- D.assocs poli
, rule <- S.toList rules
, let form'Text = T.pack form'String
, let base'Text = apply rule form'Text ]
join (x:xs) (y:ys)
| x < y = x : join xs (y:ys)
| x > y = y : join (x:xs) ys
| otherwise = x : join xs ys
join xs [] = xs
join [] ys = ys
f'Set v w = S.fromList . map (f v w) . S.toList
merge
:: Ord a => BaseMap
-> D.DAWG (S.Set a)
-> D.DAWG (M.Map a RelCode)
merge = mergeWith $ \_ _ x -> x
elemsOn :: D.DAWG (S.Set Rule) -> String -> [String]
elemsOn m x = case x `D.lookup` m of
Just s ->
[ T.unpack . apply rule . T.pack $ x
| rule <- S.toList s ]
Nothing -> []