module Dictionary where import General import List (intersperse,sortBy, group,sort) import Char import IO import Util import System.IO.Unsafe (unsafePerformIO) import qualified Data.Set as Set import SharedString import Dict.ErrM import Maybe import qualified Data.Set as Set -- | An instance of the Dict class provides information on how -- | to construct an entry for a given dictionary type. In -- | particular, it associated a word class identifier to the -- | dictionary type. class Param a => Dict a where dictword :: (a -> Str) -> String dictword f = getDictWord f category :: (a -> Str) -> String category = const "Undefined" defaultAttr :: (a -> Str) -> Attr defaultAttr = const noComp attrException :: (a -> Str) -> [(a,Attr)] attrException = const [] getDictWord f = case [x | (x:_) <- map (unStr . f) values] of (w:_) -> w _ -> "ET" -- | The Dictionary ADT data Dictionary = Dict [Entry] -- | Dictionary_Word = citation form type Dictionary_Word = String -- | Part of speech type Category = String -- | Untyped inherent type Inherent = String -- | Untyped inflection parameter configuration type Untyped = String -- | Untyped inflection table type Inflection_Table = [(Untyped,(Attr,Str))] -- | It is possible to add information about every word form, -- | such as pronounciation. type Extra = [Str] -- | Paradigm identifier. type Paradigm = String type LemmaID = String -- | An Entry contains all information about a word. type Entry = (LemmaID, Dictionary_Word, Paradigm, Category, [Inherent], Inflection_Table,Extra) -- | An Entry without the compound information. type EntryN = (Dictionary_Word, Category, [Inherent], [(Untyped,Str)]) get_id :: Entry -> String get_id (i,_,_,_,_,_,_) = i is_equal_entry :: Entry -> Entry -> Bool is_equal_entry e1 e2 = f e1 == f e2 where f (_,d,_,c,inhs,infl,e) = (d,c,inhs,sort_table infl,e) sort_table table = [(p,(a,strings (sort (unStr s)))) | (p,(a,s)) <- table] -- | Empty Dictionary emptyDict :: Dictionary emptyDict = Dict [] emptyEntry :: Entry emptyEntry = ([],[],[],[],[],[],[]) is_empty_entry :: Entry -> Bool is_empty_entry e = case e of (_,[],_,[],[],[],[]) -> True _ -> False is_empty_dictionary :: Dictionary -> Bool is_empty_dictionary (Dict xs) = and $ map is_empty_entry xs -- | Perform sharing on the strings in the Dictionary. shareDictionary :: Dictionary -> Dictionary shareDictionary (Dict xs) = Dict $ map shareEntry xs -- | set paradigm id. set_paradigm_id :: String -> Entry -> Entry set_paradigm_id p (i,x@(_:_),_,c,y,z,w) = (i,x,p,c,y,z,w) set_paradigm_id _ e = e -- | set pos. set_pos :: String -> Entry -> Entry set_pos pos (i,x,p,_,y,z,w) = (i,x,p,pos,y,z,w) get_pos :: Entry -> Category get_pos (i,x,p,pos,y,z,w) = pos set_inhs :: [String] -> Entry -> Entry set_inhs inhs (i,x,p,pos,_,z,w) = (i,x,p,pos,inhs,z,w) -- | get head. get_head :: Entry -> String get_head (i,h,p,x,y,z,w) = h -- | set head. set_head :: String -> Entry -> Entry set_head h (i,_,p,x,y,z,w) = (i,h,p,x,y,z,w) -- | set lemma id. set_lemma_id :: String -> Entry -> Entry set_lemma_id i (_,x,p,pos,y,z,w) = (i,x,p,pos,y,z,w) get_lemma_id :: Entry -> String get_lemma_id (i,x,p,pos,y,z,w) = i replace_attr :: Attr -> Attr -> Entry -> Entry replace_attr a b (i,x,p,pos,y,table,w) = (i,x,p,pos,y,table',w) where table' = [(u,(if n == a then b else n,str)) | (u,(n,str)) <- table] replace_param :: [(String,String)] -> Entry -> Entry replace_param xs (i,x,p,pos,y,table,w) = (i,x,p,pos,y,table',w) where table' = [(param_map u,s) | (u,s) <- table] param_map u = case lookup u xs of Nothing -> u Just p_new -> p_new -- | share an Entry shareEntry :: Entry -> Entry shareEntry (i,dict_word, p, cat, inhs, infl, extra) = (i, shareString dict_word, shareString p, shareString cat, map shareString inhs, shareTable infl, extra) where shareTable xs = map (\(a,(p,b)) -> (shareString a,(p,shareStr b))) xs -- [(Untyped,(Attr,Str))] multi_words :: ([String], Entry ,[String]) -> Entry multi_words (xs,e,ys) = e' where f = (ps ++) . (++ ss) ps = unwords xs ++ (if (null xs) then "" else " ") ss = (if (null ys) then "" else " ") ++ unwords ys e' = case e of (id,dict_word, p, cat, inhs, infl, extra) -> (id,f dict_word, p, cat, inhs, [ (p,(a,mapStr f str)) | (p,(a,str)) <- infl], extra) first_mw :: Category -> (String -> Entry) -> (String -> Entry) first_mw pos f s = case words s of (x:xs) -> set_pos pos $ multi_words ([],f x,xs) _ -> error $ "first, invalid multiword: " ++ s last_mw :: Category -> (String -> Entry) -> (String -> Entry) last_mw pos f s = set_pos pos $ multi_words (xs, f l, []) where ws = words s l = last ws xs = init ws -- | create an inflection table with the compound information. infTable :: Dict a => (a -> Str) -> Inflection_Table infTable f = prTableAttr f (defaultAttr f) (attrException f) -- | Translate the function encoding the extra information about the -- | word forms to a list of strings. extraTable :: Dict a => (a -> Str) -> Extra extraTable f = [s | (_,s) <- table f] -- | Translate an inflection function to an Entry. entry :: Dict a => (a -> Str) -> Entry entry f = entryI f [] -- | Translate an inflection function with inherent information to an Entry. entryI :: Dict a => (a -> Str) -> [Inherent] -> Entry entryI f ihs = (entry_id f ihs [], dictword f, "", category f, ihs, infTable f,[]) entry_id :: Dict a => (a -> Str) -> [Inherent] -> Paradigm -> LemmaID entry_id f inhs p = construct_name (dictword f) (category f) (concat (intersperse "_" inhs)) p -- | Translate an inflection function with paradigm identifier to an Entry. entryP :: Dict a => (a -> Str) -> Paradigm -> Entry entryP f p = entryIP f [] p -- | Inflection function + inherent + paradigm identifier -> Entry entryIP :: Dict a => (a -> Str) -> [Inherent] -> Paradigm -> Entry entryIP f ihs p = (entry_id f ihs p, dictword f, p , category f, ihs, infTable f,[]) -- | Inflection function with extra information entryWithInfo :: Dict a => (a -> (Str,Str)) -> Entry entryWithInfo f = entryWithInfoI f [] -- | inflection function with extra information and inherent information. entryWithInfoI :: Dict a => (a -> (Str,Str)) -> [Inherent] -> Entry entryWithInfoI fun ihs = (entry_id f ihs [], dictword f, "", category f, ihs, infTable f,extraTable g) where f = \a -> fst (fun a) g = \a -> snd (fun a) -- entryI (\a -> unionStr (f a) (unionStr (mkStr "|") (g a))) ihs -- | inflection function with extra information and inherent information. entryWithInfoP :: Dict a => (a -> (Str,Str)) -> Paradigm -> Entry entryWithInfoP f = entryWithInfoIP f [] -- | inflection function with extra information and inherent information -- | and paradigm identifier. entryWithInfoIP :: Dict a => (a -> (Str,Str)) -> [Inherent] -> Paradigm -> Entry entryWithInfoIP fun ihs p = (entry_id f ihs p, dictword f, p, category f, ihs, infTable f,extraTable g) where f = \a -> fst (fun a) g = \a -> snd (fun a) -- | Create a table with compound attributes. prTableAttr :: Param a => (a -> Str) -> Attr -> [(a,Attr)] -> [(String,(Attr,Str))] prTableAttr t da ts = [(prValue a,(maybe da id (lookup a ts),s)) | (a,s) <- table t] -- | Create a table with compound attributes when compound analysis -- | is not used. --prTableW :: Param a => Table a -> [(String,(Attr,Str))] --prTableW t = [ (a,(noComp,s)) | (a,s) <- prTable t] -- | Transform typed table to untyped. prTable :: Param a => Table a -> Table String prTable = map (\ (a,b) -> (prValue a, b)) unDict :: Dictionary -> [Entry] unDict (Dict xs) = xs -- | Number of Entry:s in Dictionary size :: Dictionary -> Int size = length . unDict -- | Number of word forms. sizeW :: Dictionary -> Int sizeW = sum . map sizeEntry . unDict -- | Number of word forms in Entry. sizeEntry :: Entry -> Int sizeEntry (_,_,_,_,_,t,_) = length t -- | Create a Dictionary dictionary :: [Entry] -> Dictionary dictionary = Dict . filter (not . is_empty_entry) is_empty :: Dictionary -> Bool is_empty (Dict []) = True is_empty _ = False -- | Concatenate two dictionaries. unionDictionary :: Dictionary -> Dictionary -> Dictionary unionDictionary (Dict xs) (Dict ys) = Dict $ xs ++ ys -- | Concatenate a list of Dictionaries. unionDictionaries :: [Dictionary] -> Dictionary unionDictionaries = foldr unionDictionary emptyDict -- | Remove attributes from a dictionary. removeAttr :: Dictionary -> [EntryN] removeAttr = map noAttr . unDict -- | Remove attributes from Entry. Also remove extra information. noAttr :: Entry -> EntryN noAttr (_,d,_,c,inh,tab,_) = (d,c,inh,[(i, s) | (i,(_,s)) <- tab]) -- | Group a dictionary into categories; reverses the entries... classifyDict :: Dictionary -> [(Category,[Entry])] classifyDict = foldr addNext [] . unDict where addNext entry@(_,_,_,cat,_,_,_) dict = case dict of (c,es) : dict' | cat == c -> (c, entry:es) : dict' ces : dict' -> ces : addNext entry dict' [] -> [(cat,[entry])] -- | A list of the word form together with the analyses and compound attributes. type FullFormLex = [(String,[(Attr,String)])] -- | A fullform lexicon structured around the word identifier dict2idlex :: Dictionary -> FullFormLex dict2idlex (Dict es) = map entry2id es -- | Translate Entry to a word identifier and its associated word forms. entry2id :: Entry -> (String,[(Attr,String)]) entry2id (name, stem, para, typ, inhs, infl,_) = (name,[(a,concat $ "{":(jword w):jk:(jhead stem):jk :(jpos typ): (jparam par):jk:(jinhs inhs):jk:(jid name):jk:(jp para):jk:(jattr (show a)) : ["}"]) | (par,(a,str)) <- infl, w <- unStr str]) -- name = construct_name stem typ (concat (intersperse "_" inhs)) para testdata :: Dictionary -> [(String,String,String,String,[String],[String],String)] testdata (Dict es) = concat $ map createTest es where createTest (id,stem,para,typ,inhs,infl,_) = [(s,stem,typ,para,words u, inhs,id) | (u,(_,str)) <- infl, s <- f (unStr str)] f [] = [""] f xs = xs entrywords :: Entry -> (String,[String]) entrywords (_,stem, para, typ, inhs, infl,extra) = (stem, [ x | (_,(_,str)) <- infl, x <- unStr str ]) --- | Create a fullform lexicon. dict2fullform :: Dictionary -> FullFormLex dict2fullform (Dict es) = concat $ map entry2full es entry2full :: Entry -> [(String,[(Attr,String)])] entry2full (name, stem, para, typ, inhs, infl,_) = concatMap mkForm infl where mkForm (par,(a,str)) = [(s1, [(a,concat ["{",jword s1,jk,jhead stem,jk,jpos typ,jk,jparam par,jk,jinhs inhs,jk,jid name,jk,jp para,jk,jattr (show a),"}"])]) | s1 <- unStr str] jword :: String -> String jword s1 = "\"word\":" ++ pr_v s1 jhead :: String -> String jhead s1 = "\"head\":" ++ pr_v s1 jpos :: String -> String jpos s1 = "\"pos\":" ++ pr_v s1 jparam :: String -> String jparam s1 = "\"param\":" ++ pr_v s1 jinhs :: [String] -> String jinhs s1 = "\"inhs\":" ++ "[" ++ (concat (intersperse ", " (map quote s1))) ++ "]" jid :: String -> String jid s1 = "\"id\":" ++ pr_v s1 jp :: String -> String jp s1 = "\"p\":" ++ pr_v s1 jattr :: String -> String jattr s1 = "\"attr\":" ++ pr_v s1 jk :: String jk = "," pr_v :: String -> String pr_v [] = show "*" pr_v v = quote v --json_list :: String -> [String] -> String --json_list field values = quote field ++ ":[" ++ (concat (intersperse ", " (map quote values))) ++ "]" -- | Create word identifier. construct_name :: String -> String -> String -> String -> String construct_name stem typ inhs para = case para of [] -> nospace $ stem++"_"++typ++ inhs' para -> nospace $ stem++"_"++typ++ inhs' ++ "_" ++para where inhs' = if null inhs then "" else "_" ++ inhs nospace = map (\c -> if (c==' ') then '_' else c) type Unknowns = Set.Set String type WrongArguments = Set.Set (String,Int) type ParadigmErrors = (Unknowns, WrongArguments) emptyParadigmErrors :: ParadigmErrors emptyParadigmErrors = (Set.empty,Set.empty) insertParadigmError :: Either String (String,Int) -> ParadigmErrors -> ParadigmErrors insertParadigmError (Left p) (unknowns, wa) = (Set.insert p unknowns, wa) insertParadigmError (Right a) (us, wrong_arguments) = (us, Set.insert a wrong_arguments) prErrorTable :: [String] -> String prErrorTable ts = unlines [unwords [" ",pad s1 c1,pad s2 c2, pad s3 c3] | (s1,s2,s3) <- sp] where f1 (x,_,_) = length x f2 (_,x,_) = length x f3 (_,_,x) = length x c1 = maximum (map f1 sp) c2 = maximum (map f2 sp) c3 = maximum (map f3 sp) sp = splits ts pad s c = s ++ take (c - length s) (repeat ' ') splits [] = [] splits [x] = [(x,[],[])] splits [x,y] = [(x,y,[])] splits xs = case splitAt 3 xs of ([x,y,z],rs) -> (x,y,z):splits rs equal_entry :: [(String,String)] -> Entry -> Entry -> Maybe (String,String) equal_entry tab e1 e2 | is_equal_entry e1 e2 = if (except tab e1 e2) then Nothing else Just (pr e1, pr e2) | otherwise = Nothing where pr (name, stem, para, typ, inhs, infl,extra) = para except xs (n1, _, _, _,_,_,_) (n2, _, _, _,_,_,_) = elem (n1,n2) xs || elem (n2,n1) xs check_duplication :: [(Entry,[Entry])] -> [(String,String)] -> [(String,String)] check_duplication entries tab = catMaybes $ run_check entries where run_check [] = [] run_check ((e,es):ess) = map (equal_entry tab e) es ++ run_check ess print_duplicates :: [(Entry,[Entry])] -> [(String,String)] -> IO () print_duplicates ds tab = case clean (check_duplication ds tab) of (x:xs) -> do prStd $ "\nPotentially identical paradigms found:" prStd $ unlines [ " " ++ p1 ++ " ~ " ++ p2 | (p1,p2) <- (x:xs)] _ -> prStd "No identical paradigms found!" where clean [] = [] clean ((x@(p1,p2)):xs) | elem (p1,p2) xs || elem (p2,p1) xs = x:clean xs | otherwise = clean xs transform_dictionary (pos,inhs,param) (Dict xs) = Dict (map tr xs) where tr (name, stem, para, typ, is, infl,extra) = let infl' = [(param_f (typ,a,is),b) | (a,b) <- infl] in (name,stem,para,pos_f typ, inhs_f is,infl',extra) pos_f = f pos inhs_f = f inhs param_f i@(p,a,is)= case param of Nothing -> a Just g -> g i f Nothing = id f (Just g) = g map_wordforms :: (String -> String) -> Entry -> Entry map_wordforms f (name, stem, para, typ, inhs, infl,extra) = (name,stem,para,typ,inhs,infl',extra) where infl' = [(u,(a,strings (map f (unStr ss)))) | (u,(a,ss)) <- infl] duplicated_lemma_id :: Dictionary -> [String] duplicated_lemma_id (Dict es) = check (map get_lemma_id es) (Set.empty,Set.empty) where check [] (_,b) = Set.toList b check (x:xs) (s,b) | Set.member x s = check xs (s,Set.insert x b) | otherwise = check xs (Set.insert x s, b) remove_param :: String -> Entry -> Entry remove_param p (name, stem, para, typ, inhs, infl,extra) = (name, stem, para, typ, inhs, infl',extra) where infl' = [f (u,(a,ss)) | (u,(a,ss)) <- infl] f (u,(a,ss)) | elem p (words u) = (u,(a,nonExist)) | otherwise = (u,(a,ss)) type Positive = [String] type Negative = [String] expand_multiword :: String -> [(Positive,Negative,String)] -> Entry -> Entry expand_multiword p xs (name, stem, para, typ, inhs, infl,extra) = (name, stem, para, typ, inhs, infl',extra) where infl' = [(u,(a,strings (concat (map (f (words u) . words) (unStr ss))))) | (u,(a,ss)) <- infl] f us = map unwords . flatten . map (g us) g us x = if x == p then [ s | (pos,neg,s) <- xs, all_in pos us && (null neg || not (all_in neg us))] else [x] all_in [] ys = True all_in (x:xs) ys = elem x ys && all_in xs ys flatten [] = [[]] flatten (x:xs) = [x':xs' | x' <- x, xs' <- flatten xs] combine_tables :: Entry -> Entry -> Entry combine_tables (name, stem, para, typ, inhs, infl,extra) (_, _, _, _, _, infl1, _) = (name, stem, para, typ, inhs, infl',extra) where infl' = [(u,(a,unionStr ss ss2)) | ((u,(a,ss)),(_,(_,ss2))) <- zip infl infl1]