-- lookup the Kotus list. AR 17/3/2013 -- lookup a word: -- - if found with declension, return this -- - if found without declension, look up as compound (INCOMPOUND), with the latter part's declension -- - if not found, look up as compound (OUTCOMPOUND), with the latter part's declension -- This is performed line by line on standard input and written into standard output -- 25/3: 93,696 words, 2628 not found, 6964 unsure -- 26/3: new heuristics for adjectives of type "älyperäinen", 1258 not found import Data.List import Data.Char import qualified Data.Map as M main = do --- kotus <- readFile "kotus-sanalista_v1.xml" >>= return . mkKotus . lines -- do lookup --- interact $ unlines . map (look kotus) . lines --- just print all words --- mapM_ (putStrLn . fst) $ M.toList kotus --- debug --- mapM_ print $ take 60 $ M.toList kotus -- debug --- analyse fin wordnet senses --- wordnet to kotus: runghc Kotus.hs wn-kotus.txt -- interact $ unlines . map (annotateWordnet kotus) . lines --- find Finnish words for DictEng: runghc Kotus.hs DictEngFin.senses wnkotus <- readFile "wn-kotus.txt" >>= return . mkWNKotus . lines interact $ unlines . map (annotateSenseInflection wnkotus) . lines annotateSenseInflection wnkotus line = case tabs line of w:s:_ -> untabs (w : s : lookupSenseInflection wnkotus s) _ -> "-- " ++ line type WNKotus = M.Map String [[String]] mkWNKotus :: [String] -> WNKotus mkWNKotus = M.fromListWith (++) . map mkOne where mkOne s = case tabs s of w:ws -> (drop 3 w,[ws]) -- fi: dropped _ -> (s,[]) lookupSenseInflection :: WNKotus -> String -> [String] lookupSenseInflection wnkotus synset = case M.lookup synset wnkotus of Just fws@(_:_) -> minimumBy (\x y -> compare (last x) (last y)) fws -- choose one with the best status _ -> ["NOT_IN_KOTUS"] annotateWordnet :: Kotus -> String -> String annotateWordnet kotus line = untabs $ [tline !! 0, cat, tline !! 1] ++ case cat of "V" -> lookk ((iline !! 1) !! 0) -- fi:v02758977 sataa lunta 0 -> sataa _ -> lookk (last (iline !! 1)) -- fi:n02769075 näytön taustakuva 0 -> taustakuva where iline = map words (init tline) tline = tabs line cat = case line !! 3 of -- fi:v0276... -> v 'v' -> "V" 'a' -> "A" 'n' -> "N" 'r' -> "Adv" lookk = look kotus tabs :: String -> [String] tabs s = case break (=='\t') s of (s1,_:s2) -> s1 : tabs s2 _ -> [s] untabs :: [String] -> String untabs = concat . intersperse "\t" look :: Kotus -> String -> [String] look kotus w = case M.lookup w kotus of Just ["NOPAR"] -> lookCompound "INCOMPOUND" kotus w -- compound in Kotus Just descr -> descr ++ ["FOUND"] _ -> lookCompound "OUTCOMPOUND" kotus w -- compound not in Kotus lookCompound :: String -> Kotus -> String -> [String] lookCompound pref kotus w = case sort (concatMap looks (splits w)) of -- sort: 1's first (_,descr):_ -> descr _ -> ["MK", "XX5_NOTFOUND_"++pref] -- NOUTFOUND, worst - use smart paradigm where splits s = reverse [splitAt n s | n <- [3 .. length s - 3]] looks (x,y) = case M.lookup y kotus of Just descr | makesCompPrefix x -> comp 1 descr pref x y -- preferred: 1 Just descr -> comp 2 descr ("XX3_UNSURE_" ++ pref) x y -- secondary: 2, UNSURE _ | drop (length y - 3) y == "nen" && makesCompPrefix x -> comp 3 ["38"] ("XX4_INVENTED_" ++ pref) x y _ | drop (length y - 3) y == "ttu" && makesCompPrefix x -> comp 3 ["1"] ("XX4_INVENTED_" ++ pref) x y _ | drop (length y - 3) y == "tty" && makesCompPrefix x -> comp 3 ["1"] ("XX4_INVENTED_" ++ pref) x y _ | drop (length y - 3) y == "nut" && makesCompPrefix x -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y _ | drop (length y - 3) y == "nyt" && makesCompPrefix x -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y _ -> [] comp status descr pref x y = [(status, [unwords (descr ++ ["C", x ++ "_" ++ y]), pref])] makesCompPrefix x = elem x compPrefixes || any isCompPrefix (compForms x) isCompPrefix x = case M.lookup x kotus of Just _ -> True _ -> case M.lookup (x ++ "-") kotus of Just _ -> True _ -> False compForms x = let (initx,lastx) = (init x,last x) in x : [initx ++ "nen" | lastx == 's'] ++ [initx | elem lastx "n-" ] -- pakkas-, pakkanen compPrefixes = ["epä","ylä","yö"] -- yö is the only 2-letter word type Kotus = M.Map String [String] mkKotus :: [String] -> Kotus mkKotus = M.fromList . groupHomonyms . map oneKotusLine . filter isWord isWord = (=="") . take 4 oneKotusLine :: String -> (String,[String]) oneKotusLine s = case untag s of w:ws -> (w,ws) -- yhdesti99 -- yhdesti 99 untag s = case break (=='<') (drop 7 s) of (w,d) -> case drop 4 d of '<':'t':'>':_ -> [w, takeWhile isDigit (drop 11 d) ++ rest (drop 11 d)] -- 99, '<':'h':'n':'>':h:'<':'/':'h':'n':'>':'<':'t':'>':_ -> [w, ['H',h], takeWhile isDigit (drop 21 d) ++ rest (drop 21 d)] -- homonym _ -> [w, "NOPAR"] -- no paradigm given where rest t = case t of '<':'a':'v':'>':_ -> "A" _:cs -> rest cs _ -> [] groupHomonyms :: [(String,[String])] -> [(String,[String])] groupHomonyms ws = [(w,concat (intersperse [";"] (map snd wds))) | wds@((w,_):_) <- groupBy (\x y -> fst x == fst y) ws]