module CommonMain where

import DictToDictionary
import Print
import List(intersperse,isPrefixOf,sortBy,nub)
import System(getArgs, getEnv)
import GeneralIO
import General
import IO
import qualified Data.Set as Set
import System(getProgName)
import Dictionary
import Frontend
import Char
import Dict.ErrM
import Monad
import Command
import Util
import Combine
--import Net
import qualified Data.Map as Map
-- import Tokenize
import qualified CTrie

-- Analysis output modes; selected from the mode string in 'run' below.
data AnaType = Normal | NoAnalysis | AnaEval | FilterLexiconNo | FilterLexiconNoComp | FilterLexiconComp

gfTypes :: Language a => a -> String
gfTypes l = "types." ++ name l ++ ".gf"

readDicts :: Language a => a -> [FilePath] -> (Bool,Bool,Bool) -> IO (Dictionary,Int)
readDicts l fs (undefcheck,argccheck,unusedcheck) =
  do output
     readDicts' l fs
 where
  output
    | length fs > 1 = prErr $ "\nprocessing dictionaries in files: " ++ (unwords fs)
    | null fs       = prErr $ "\nno dictionary loaded"
    | otherwise     = prErr $ "\nprocessing dictionary in file " ++ (unwords fs)
  readDicts' l []     = return $ (internDict l, 0)
  readDicts' l (f:fs) =
    do (d,n) <- readDicts' l fs
       res   <- parseDict l f (undefcheck,argccheck,unusedcheck)
       case res of
         Ok (d1,n1) -> return $ (unionDictionary d d1, n+n1)
         Bad s      -> do prErr s
                          return (d,n)

prStatistics :: Language a => a -> Int -> IO()
prStatistics l n =
  do let is = size (internDict l)
     prErr $ print_lang l ++ print_paradigms l ++ print_size (n+is,n,is) ++ "\n"

print_lang l = "language id: " ++ (name l) ++ "\n"

print_paradigms l =
  case paradigmCount l of
    0 -> "no paradigms\n"
    1 -> "1 paradigm\n"
    n -> show n ++ " paradigms\n"

print_size (0,n,isz)  = "no/empty dictionary"
print_size (1,_,_)    = "1 entry"
print_size (sz,n,isz) = show nsz ++ "k entries [e: " ++ show n ++ ", i: " ++ show isz ++ "]"
  where nsz = fromInteger $ round (((fromInteger (toInteger sz))::Double) / 1000)

uName :: Language a => a -> String
uName l = case name l of
            []     -> []
            (x:xs) -> toUpper x : xs

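-- commonMain: read the command-line arguments, load and post-process the
-- dictionaries, and dispatch on the selected mode (quality checks, inflection,
-- synthesis, paradigm lookup, tagging, reduce, printing, or plain analysis).
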
commonMain :: Language a => a -> IO ()
commonMain l = do
  xx  <- getArgs
  res <- retrieve l xx
  case res of
    Left err -> do prErr $ err
    Right (flags, []) | dictionary_needed flags ->
      do prg <- getProgName
         prErr $ welcome l
         prErr $ "dictionary file required"
         prErr $ "Usage: " ++ prg ++ " [OPTION...] dictionary_file(s)...\n"
    Right (flags, files) ->
      if is_help flags
        then do s <- help l ; prText (welcome l) ; prText s
        else if is_version flags
          then prText (welcome l)
          else do
            prErr $ welcome l
            let undefcheck      = is_undef flags      || is_all flags
                unusedcheck     = is_unused flags     || is_all flags
                argccheck       = is_argc flags       || is_all flags
                dupcheck        = is_duplicated flags || is_all flags
                (compInf,cmode) = (is_compound flags, get_compound flags)
                tokS = if (is_reduce flags && not (elem (get_tokenizer_name flags) ["lines","norm"]))
                         then norm_tokenizer
                         else (get_tokenizer (tokenizer l) flags)
            when (is_reduce flags && not (elem (get_tokenizer_name flags) ["lines","norm"])) $
              prErr "Invalid tokenizer in reduce mode, changed to 'norm'"
            (d,n) <- if (dictionary_needed flags)
                       then do (dict,n1) <- readDicts l files (undefcheck,argccheck,unusedcheck)
                               return (apply_encoding l flags (dictionary_postprocessing l dict), n1)
                       else return (emptyDict,0)
            case flags of
              fs | invalid_tokenizer fs -> prErr $ "unknown tokenizer: " ++ (get_tokenizer_name fs)
              fs | invalid_quality fs   -> prErr $ "unknown quality argument: " ++ (get_quality fs)
              fs | is_quality fs ->
                do when dupcheck $ check_lemma_duplication d
                   when (is_test fs || is_all fs) $ tester (testBench l) (testdata d)
                   when (is_paradigm_test fs || is_all fs) $
                     do prErr "Running paradigm duplication check..."
                        let lenw = length . concat . map words
                        print_duplicates
                          [(f xs, [fun xs | (p1,(ls,fun)) <- Map.toList (paradigms l),
                                            p1 /= p, lenw ls == lenw xs])
                             | (p,(xs,f)) <- Map.toList (paradigms l),
                               not (Set.member p (paradigm_dup_exception l))]
                          (dup_id_exceptions l)
--            fs | is_net fs ->
--              case get_port fs of
--                Nothing -> prErr "Invalid port"
--                Just n  -> do build_trie l fs d
--                              prStatistics l n
--                              server n (\s -> prAnalysis l Normal s (analysis cmode (composition l) (sandhi_rules l) s))
              fs | elem Infl fs ->
                do prErr $ "[ FM inflection mode ]"
                   s <- imode l
                   output_write fs s
              fs | elem Synth fs ->
                do prErr $ "[ FM synthesiser mode ]\n"
                   CTrie.buildTrieDictSynt d False
                   prStatistics l n
                   s <- synthesiser l
                   output_write fs s
              fs | is_paradigm fs -> lookup_paradigm l fs
              fs | elem Tag fs ->
                do --CTrie.buildTrieDict (isComp l) d False dupcheck
                   build_trie l fs d
                   prStatistics l n
                   s <- posify l tokS (analysis l cmode (composition l) (sandhi_rules l))
                   output_write fs s
              fs | is_reduce fs ->
                do build_trie l fs d
                   prStatistics l n
                   s  <- posify l tokS (analysis l cmode (composition l) (sandhi_rules l))
                   s2 <- readFile (get_reduce_file fs)
                   output_write fs $ Combine.combine s s2
              fs | is_mode fs ->
                do --CTrie.buildTrieDict (isComp l) d False dupcheck
                   build_trie l fs d
                   prStatistics l n
                   run l tokS (analysis l cmode (composition l) (sandhi_rules l)) (get_mode fs)
              fs | is_printer fs ->
                output_write fs $
                  case printer fs of
                    (Just p) | elem p (map fst (lprinter l)) ->
                      case lookup p (lprinter l) of
                        Just p -> p d
                    (Just "core")      -> unlines (paradigmNames l)
                    (Just "paradigms") ->
                      prDictionary $ apply_encoding l flags (dictionary [f xs | (p,(xs,f)) <- Map.toList (paradigms l)])
                    (Just "paradigms_compact") ->
                      prParadigmsCompact $ apply_encoding l flags (dictionary [f xs | (p,(xs,f)) <- Map.toList (paradigms l)])
                    (Just "paradigms_latex") ->
                      prLatex $ apply_encoding l flags (dictionary [f xs | (p,(xs,f)) <- Map.toList (paradigms l)])
                    (Just "tagset") ->
                      prTagset l $ apply_encoding l flags (dictionary [f xs | (p,(xs,f)) <- Map.toList (paradigms l)])
                    (Just "paradigms_list") ->
                      "[" ++ (concat (intersperse ", " ['\"' : p ++ "\"" | (p,_) <- Map.toList (paradigms l)])) ++ "]"
                    (Just "paradigms_plist") ->
                      unlines ["# -*- coding: utf-8 -*-",
                               "paradigms = '[" ++ (concat (intersperse "," ['\"' : p ++ "\"" | (p,_) <- Map.toList (paradigms l)])) ++ "]'"]
                    (Just "pos_plist") ->
                      unlines ["# -*- coding: utf-8 -*-",
                               "pos = '[" ++ (concat (intersperse "," (nub ['\"' : (get_pos (f xs)) ++ "\"" | (p,(xs,f)) <- Map.toList (paradigms l)]))) ++ "]'"]
                    (Just "extract")    -> prExtract l
                    (Just "compound")   -> prCompound l
                    (Just "newlex")     -> prNewDictionary d
                 -- (Just "json")       -> prJSON d
                    (Just "lex")        -> prJSON d --prFullFormLex (dict2fullform d False)
                    (Just "tabbedlex")  -> prTabbedLex d
                    (Just "webservice") -> prWebService d
                    (Just "tagtab")     -> prWordTaglist d
                    (Just "words")      -> prWordlist (dict2fullform d)
                    (Just "tables")     -> prDictionary d
                    (Just "gf")         -> "-- machine-generated GF file\n\n" ++
                                           "include " ++ (gfTypes l) ++ " ;\n\n" ++ (prGF d)
                 -- (Just "gfr")        -> "-- machine-generated GF file\n\n" ++
                 --                        "include " ++ (gfTypes l) ++ " ;\n\n" ++
                 --                        prGFRes d
                 -- (Just "latex")      -> prLatex d
                    (Just "xml")        -> prXML d
                    (Just "clex")       -> prCLEX d
                    (Just "sfst")       -> prSFST d
                    (Just "sfstheader") -> prSFSTHEAD d
                    (Just "sfstlex")    -> prSFSTLEX d
                    (Just "lexc")       -> prLEXC d
                    (Just "xfst")       -> prXFST d
                    (Just "hundict")    -> prHunDict d
                    (Just "hunaffix")   -> prHunAffix d
                    (Just "sql")        -> prSQL d
                    (Just "lmf")        -> prLMF (name l) d
                    (Just "rdf")        -> prRDF (name l) d
                    (Just x)            -> error $ "unknown printer: " ++ x
                    Nothing             -> error $ "Internal error. This is a bug."
              fs ->
                do build_trie l fs d
                   prStatistics l n
                   run l tokS (analysis l cmode (composition l) (sandhi_rules l)) []

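-- Usage sketch (illustration only, not part of this module): a language-specific
-- executable is expected to supply a Language instance and delegate to commonMain.
-- The module and value names below are hypothetical.
--
--   module Main where
--
--   import CommonMain
--   import MyLanguage (myLang)   -- hypothetical Language instance
--
--   main :: IO ()
--   main = commonMain myLang
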
build_trie :: Language l => l -> [Flag] -> Dictionary -> IO ()
build_trie l fs d = CTrie.buildTrieDict (isComp l) d False

data Stats = Stats {
    totalWords   :: Int,
    coveredWords :: Int
  }

initStats :: Stats
initStats = Stats { totalWords = 0, coveredWords = 0 }

posify :: Language a => a -> (String -> [Tok]) -> (String -> [[String]]) -> IO String
posify l lexer f =
  do s' <- hGetContents stdin
     let ts = lexer s'
     -- tss = get_sentences ts
     return $ (unlines (map anapos ts)) ++ "\n"
 where
  -- printResult [] = []
  -- printResult xs = unlines xs -- "{" ++ (unwords xs) ++ "}\n"
  anapos t =
    case t of
      BL     -> []
      (P s)  -> s ++ "\t" ++ s ++ ":spec"     -- "(\"" ++ esc s ++ "\",spec)"
      (PD s) -> s ++ "\t" ++ s ++ ":num/spec" -- "(\"" ++ esc s ++ "\",num/spec)"
      (D s)  -> s ++ "\t" ++ s ++ ":num"      -- "(\"" ++ esc s ++ "\",num)"
      (W s)  -> case f s of
                  [] -> s                        -- "(\"" ++ esc s ++ "\",)"
                  xs -> s ++ "\t" ++ prResult xs -- "(\"" ++ esc s ++ "\"," ++ prResult xs ++ ")"
      (A (u,l)) ->
        case (f u) ++ (f l) of
          [] -> u                        -- "(\"" ++ esc u ++ "\",)"
          xs -> u ++ "\t" ++ prResult xs -- "(\"" ++ esc u ++ "\"," ++ prResult xs ++ ")"
      (AA (u,m,l)) ->
        case (f u) ++ (f m) ++ (f l) of
          [] -> u                        -- "(\"" ++ esc u ++ "\",)"
          xs -> u ++ "\t" ++ prResult xs -- "(\"" ++ esc u ++ "\"," ++ prResult xs ++ ")"
  prResult :: [[String]] -> String
  prResult xs = concat $ intersperse "\t" $ filter (not.null) (map filter_analysis xs)
  -- esc []        = []
  -- esc ('\"':xs) = '\\':'\"':esc xs
  -- esc (x:xs)    = x:esc xs
  filter_analysis :: [String] -> String
  filter_analysis [s] = get_head s ++ ":" ++
                        (unwords $ filter (not.null) $ (get_pos s):(get_inhs s):[get_param s])
  filter_analysis _   = []
  get_head s@(x:xs)
    | isPrefixOf "id\":" s = case span (/= '\"') (drop 5 s) of
                               (r,_) -> r
    | otherwise            = get_head xs
  get_inhs s@(x:xs)
    | isPrefixOf "inhs\":" s = case span (/= ']') (drop 7 s) of
                                 (r,_) -> case filter (\c -> c /= '"' && c /= ',') r of
                                            []             -> []
                                            s | elem '*' s -> []
                                            s              -> s
    | otherwise              = get_inhs xs
  get_pos s@(x:xs)
    | isPrefixOf "pos\":" s = case span (/= '\"') (drop 6 s) of
                                (r,_) | elem '*' r -> []
                                (r,_)              -> r
    | otherwise             = get_pos xs
  get_param s@(x:xs)
    | isPrefixOf "param\":" s = case span (/= '\"') (drop 8 s) of
                                  (r,_) | elem '*' r -> []
                                  (r,_)              -> r
    | otherwise               = get_param xs

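-- Note on posify's output (illustration with hypothetical values): each token
-- becomes one line of the form "token<TAB>analysis[<TAB>analysis...]", where
-- filter_analysis renders an analysis as "id:pos inhs param", dropping empty
-- fields and fields containing '*'. A covered word could thus come out roughly
-- as:
--
--   walked<TAB>walk:V Past
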
get_sentences :: [Tok] -> [[Tok]]
get_sentences xs = gets xs []
 where
  gets []     s = [reverse s]
  gets (c:cs) s
    | isMajor c = (reverse (c:s)) : gets cs []
    | otherwise = gets cs (c:s)
  isMajor (P [c]) = elem c ".?!"
  isMajor _       = False

run :: Language a => a -> (String -> [Tok]) -> (String -> [[String]]) -> String -> IO ()
run l t f "fail"    = run' l t f NoAnalysis          initStats >> return ()
run l t f "eval"    = run' l t f AnaEval             initStats >> return ()
run l t f "lexfail" = run' l t f FilterLexiconNo     initStats >> return ()
run l t f "nocomp"  = run' l t f FilterLexiconNoComp initStats >> return ()
run l t f "lexcomp" = run' l t f FilterLexiconComp   initStats >> return ()
run l t f _         =
  do st <- run' l t f Normal initStats
     prErr $ "Total words: "   ++ show (totalWords st)
     prErr $ "Covered words: " ++ show (coveredWords st)

run' :: Language a => a -> (String -> [Tok]) -> (String -> [[String]]) -> AnaType -> Stats -> IO Stats
run' l tokenizer f a st =
  do b <- hIsEOF stdin
     if b
       then return st
       else do s <- hGetLine stdin
               analyze l a f (tokenizer s) st >>= run' l tokenizer f a

word_tokens :: [Tok] -> [String]
word_tokens xs = [s | (W s) <- xs]

analyze :: Language a => a -> AnaType -> (String -> [[String]]) -> [Tok] -> Stats -> IO Stats
analyze _ _ _ []     st = return st
analyze l t f (s:ss) st =
  case s of
    BL -> analyze l t f ss st
    (P s) ->
      do case t of
           Normal -> do prText $ "{\"" ++ s ++ "\":\"-Symb-\"}"
                        analyze l t f ss st
           _      -> analyze l t f ss st
    (PD s) ->
      do case t of
           Normal -> do prText $ "{\"" ++ s ++ "\":\"-Num/Symb-\"}"
                        analyze l t f ss st
           _      -> analyze l t f ss st
    (D s) ->
      do case t of
           Normal -> do prText $ "{\"" ++ s ++ "\":\"-Num-\"}"
                        analyze l t f ss st
           _      -> analyze l t f ss st
    (W s) ->
      case f s of
        [] -> do prText $ prAnalysis l t s []
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st })
        xs -> do prText $ prAnalysis l t s xs
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st + 1 })
    (A (s,ls)) ->
      case (f s) ++ (f ls) of
        [] -> do prText $ prAnalysis l t s []
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st })
        xs -> do prText $ prAnalysis l t s xs
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st + 1 })
    (AA (s,m,ls)) ->
      case (f s) ++ (f m) ++ (f ls) of
        [] -> do prText $ prAnalysis l t s []
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st })
        xs -> do prText $ prAnalysis l t s xs
                 analyze l t f ss (st { totalWords = totalWords st + 1, coveredWords = coveredWords st + 1 })

prText [] = return ()
prText s  = hPutStrLn stdout s

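-- Shape of prAnalysis output in Normal mode (illustration with hypothetical
-- analyses a1 and a2; assumes sort_length leaves them in this order):
--
--   {"word":{
--   "s_1":a1,
--   "s_2":a2
--   }}
--
-- Unknown words print as {"word":"-Unknown-"}, unless wordGuesser proposes
-- candidates, in which case a "-Guess-" list is printed instead.
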
prAnalysis :: Language a => a -> AnaType -> String -> [[String]] -> String
prAnalysis l Normal s [] =
  case wordGuesser l s of
    (x:xs) -> concat [concat ["{\"", s, "\":[\"-Guess-\",\n"], prA l (map (:[]) (x:xs)), "\n}"]
    []     -> concat ["{\"", s, "\":\"-Unknown-\"}"]
prAnalysis l Normal s xs = concat [concat ["{\"", s, "\":{\n"], prA l xs, "\n}}"]
prAnalysis _ NoAnalysis s [] = s
prAnalysis _ NoAnalysis s _  = []
-- force evaluation without printing.
prAnalysis _ AnaEval s xs
  | length (["" | x <- xs, "" <- x]) == 1 = s
  | otherwise                             = []
prAnalysis _ FilterLexiconNo s xs =
  case [x | [x] <- xs] of
    (_:_) -> []
    _     -> s
prAnalysis _ FilterLexiconNoComp s xs =
  case [x | [x] <- xs] of
    ys | length xs == length ys -> s
    _                           -> []
prAnalysis _ FilterLexiconComp s xs =
  case [x | (x:y:_) <- xs] of
    ys | length xs == length ys -> s
    _                           -> []

prA l xs = concat $ intersperse ",\n" (map pr (annotate 1 1 (sort_length (affixes l) xs)))

annotate sn cn []       = []
annotate sn cn ([]:xs)  = (0,[])   : annotate sn cn xs
annotate sn cn ([x]:xs) = (sn,[x]) : annotate (sn+1) cn xs
annotate sn cn (xs:ys)  = (cn,xs)  : annotate sn (cn+1) ys

pr (_,[])  = []
pr (n,[x]) = "\"s_" ++ (show n) ++ "\":" ++ x
pr (n,xs)  = str ++ (concat (intersperse (",\n" ++ pad) xs)) ++ "]"
  where str = "\"c_" ++ (show n) ++ "\":["
        pad = take (length str) (repeat ' ')

welcome :: Language a => a -> String
welcome l = (morphology_header l) ++
            "\n includes FM 3.0 (M. Forsberg & A. Ranta, 2010, under GNU GPL)"