module GramLab.Morfette.Utils ( train , predict , Flag(..) , Input (..) , Output (..) , ROW , morfette ) where import Prelude hiding (print,getContents,putStrLn,putStr ,writeFile,readFile) import System.IO (stderr,stdout,stdin,hSetBinaryMode) import System.IO.UTF8 hiding (getContents,print,putStr,putStrLn) import qualified System.IO.UTF8 as UTF8 import GramLab.Commands import qualified GramLab.Morfette.Models2 as Models import GramLab.Morfette.Models2 (Row(..)) import qualified Data.Set as Set import qualified Data.Map as Map import qualified Data.IntMap as IntMap import Control.Monad hiding (join) import GramLab.Utils (padRight,splitWith,splitOn ,splitInto,join,tokenize,lowercase) import qualified GramLab.Perceptron.Model as M import GramLab.Morfette.Token import GramLab.Morfette.LZipper import GramLab.Morfette.MWE import Data.Maybe import System.Directory import System.FilePath import Text.Printf import qualified Data.List as List import Data.Char import GramLab.Morfette.Lang.Conf import Data.Binary import qualified Data.ByteString.Lazy as B import qualified Data.Text.Lazy as Text import qualified Data.Text.Lazy.IO as Text import qualified GramLab.Morfette.Config as C import GramLab.Morfette.Evaluation import GramLab.Morfette.Settings.Defaults import GramLab.Intern (Table(..),intern,initial,runState,evalState) import GramLab.FeatureSet (toFeatureSet,FeatureSet) import GramLab.Data.Diff.EditTree (EditTree) import qualified Paths_morfette import Data.Version (Version(..)) import Debug.Trace data Flag = ModelPrefix String | Eval | BeamSize Int | Multi Int | IgnoreCase | DictFile FilePath | ClusterFile FilePath | BaselineFile FilePath | Lang Lang | Gaussian Double | Tokenize | IgnorePunct | IgnorePOS String | Pipeline | EntropyTh Double | IterPOS Int | IterLemma Int | ModelId String | PruneUniqLabels deriving Eq data Input = Input { inputForm :: !String, inputEmb :: !(Maybe Emb) } deriving (Eq, Ord, Show) data Output = ET !(EditTree String Char) | POS !String deriving (Eq, Ord, Show) instance Binary Output where put (ET et) = put False >> put et put (POS pos) = put True >> put pos get = do tag <- get case tag of False -> liftM ET get True -> liftM POS get type ROW = Row Input Output type FEATSPEC = Models.FeatureSpec Input Output morfette fs fspecs = defaultMain (commands fs fspecs) "Usage: morfette command [OPTION...] [ARG...]" commands fs fspecs = [ ("train" , CommandSpec (train fs fspecs) "train models" [ Option [] ["dict-file"] (ReqArg DictFile "PATH") "path to optional dictionary" , Option [] ["language-configuration"] (ReqArg Lang "es|pl|tr|..") "language configuration" , Option [] ["iter-pos"] (ReqArg (IterPOS . read) "NUM") "iterations for POS model" , Option [] ["iter-lemma"] (ReqArg (IterLemma . read) "NUM") "iterations for Lemma model" , Option [] ["prune-unique-labels"] (NoArg PruneUniqLabels) "discard labels occurring only once" {- , Option [] ["cluster-file"] (ReqArg ClusterFile "PATH") "path to optional cluster file" -} ] [ "TRAIN-FILE", "MODEL-DIR" ]) , ("extract-features", CommandSpec (extractFeatures fs fspecs) "extract features" [ Option [] ["dict-file"] (ReqArg DictFile "PATH") "path to optional dictionary" , Option [] ["model-id"] (ReqArg ModelId "pos|lemma") "model id (`pos' or `lemma')" ] [ "MODEL-DIR"]) , ("predict" , CommandSpec (predict fs fspecs) "predict postags and lemmas using saved model data" [ Option [] ["beam"] (ReqArg (BeamSize . read) "+INT") "beam size to use" , Option [] ["tokenize"] (NoArg Tokenize) "tokenize input" , Option [] ["multi"] (ReqArg (Multi . read) "+INT") "n-best output" ] [ "MODEL-DIR" ] ) , ("eval" , CommandSpec eval "evaluate morpho-tagging and lemmatization results" [ Option [] ["ignore-case"] (NoArg IgnoreCase) "ignore case for evaluation" , Option [] ["baseline-file"] (ReqArg BaselineFile "PATH") "path to baseline results" , Option [] ["dict-file"] (ReqArg DictFile "PATH") "path to optional dictionary" , Option [] ["ignore-punctuation"] (NoArg IgnorePunct) "ignore punctuation for evaluation" , Option [] ["ignore-pos"] (ReqArg IgnorePOS "POS-prefix") "ignore POS starting with POS-prefix for evaluation" ] ["TRAIN-FILE","GOLD-FILE","TEST-FILE"]) , ("version", CommandSpec version "show version" [] [] ) ] version _ _ = do let showVersion (Version { versionBranch = is }) = "morfette-" ++ (List.concat . List.intersperse "." . map show $ is) putStrLn . showVersion $ Paths_morfette.version predict (_,format) fspecs flags [modelprefix] = do hPutStrLn stderr $ "Loading models from " ++ (modelFile modelprefix) ms <- fmap decode (B.readFile (modelFile modelprefix)) ms == ms `seq` return () when True $ do mwes <- loadMwes (mweFile modelprefix) lex <- readConf (confFile modelprefix) txt <- Text.getContents let models = zipWith Models.toModelFun (map ($lex) fspecs) ms defaultBeamSize = 3 n = case [f | BeamSize f <- flags ] of { [f] -> f ; _ -> defaultBeamSize } k = case [f | Multi f <- flags ] of { [f] -> f ; _ -> 1 } f = if Pipeline `elem` flags then if k /= 1 then error $ "GramLab.Morfette.Utils.predict: " ++ "pipeline mode doesn't work with n-best output" else \ n ms -> map (map return) . Models.predictPipeline n ms else Models.predict k putStr . unlines . map format . f n models . toksToForms . getToks flags mwes $ txt predict _fs _fspecs _flags _args = do error $ "GramLab.Morfette.Utils.predict: " ++ "Incorrect arguments to command predict." confFile dir = dir "conf.model" mweFile dir = dir "mwe.model" modelFile dir = dir "models.model" classMapFile dir = dir "classmap.model" featMapFile dir = dir "featmap.model" defaultGaussianPrior = 1 extractFeatures :: (Token -> ROW, t) -> [Conf -> FEATSPEC] -> [Flag] -> [FilePath] -> IO () extractFeatures (prepr,fmt) [fspos,fslem] flags [modeldir] = do dict <- getDict flags Nothing -- dict == dict `seq` return () clusters <- getClusters flags let langConf = case [f | Lang f <- flags ] of { [] -> "xx" ; [f] -> f } lex = Conf { dictLex = dict , clusterDict = clusters , lang = langConf } fs = case [f | ModelId f <- flags ] of ["pos"] -> fspos lex ["lemma"] -> fslem lex [] -> fspos lex other -> error $ "GramLab.Morfette.Utils.extractFeatures: " ++ "invalid option value: " ++ show other toks <- fmap (map parseToken . Text.lines) Text.getContents let ws = filter (not . null) . map tokenForm $ toks (xm,ym,xys) = convertFeatures . map swap . concatMap (Models.sentToExamples fs) . toksToSentences prepr $ toks putStr . unlines . map format . zip ws $ xys B.writeFile (classMapFile modeldir) . encode $ ym B.writeFile (featMapFile modeldir) . encode $ xm extractFeatures _fs _fspecs _flags _args = do error $ "GramLab.Morfette.Utils.extractFeatures: " ++ "Incorrect arguments to command extract-features." format :: (String,(IntMap.IntMap Double,Int)) -> String format (w,(x,y)) = unwords (w:show y : [ show i ++ ":" ++ show n | (i,n) <- IntMap.toList x ]) parse :: String -> (String,(IntMap.IntMap Double,Int)) parse s = case words s of (w:y:x) -> (w,( IntMap.fromList [ (read i,read xi) | ixi <- x , let [i,xi] = splitOn ':' ixi ] , read y )) swap (y,x) = (x,y) convertFeatures xys = let (xs,ys) = unzip xys (xs',xm) = flip runState initial . mapM toFeatureSet $ xs (ys',ym) = flip runState initial . mapM intern $ ys in (xm,ym,zip xs' ys') train :: (Token -> ROW, t) -> [Conf -> FEATSPEC] -> [Flag] -> [FilePath] -> IO () train (prepr,_) fspecs flags [dat,modeldir] = do toks <- fmap (map parseToken . Text.lines) (Text.readFile dat) let tokSet = Set.fromList [ tokenForm t | t <- toks ] dict <- getDict flags Nothing clusters <- getClusters flags let langConf = case [f | Lang f <- flags ] of { [] -> "xx" ; [f] -> f } lex = Conf { dictLex = dict , clusterDict = clusters , lang = langConf } mwes = mweSet toks g = case [f | EntropyTh f <- flags ] of [] -> M.entropyTh posTrainSettings [f] -> f i_p = case [f | IterPOS f <- flags ] of [] -> M.iter posTrainSettings [f] -> f i_l = case [f | IterLemma f <- flags ] of [] -> M.iter lemmaTrainSettings [f] -> f sentences = toksToSentences prepr toks createDirectoryIfMissing True modeldir let models = Models.train (map (\(i,fs) -> let fs' = fs lex ts = Models.trainSettings fs' in fs' { Models.trainSettings = ts { M.entropyTh = g , M.iter = i } , Models.pruneUniqLabels = PruneUniqLabels `elem` flags }) $ zip [i_p,i_l] fspecs) $ sentences B.writeFile (modelFile modeldir) (encode models) saveConf (confFile modeldir) lex saveMwes (mweFile modeldir) mwes train _fs _fspecs _flags _args = do error $ "GramLab.Morfette.Utils.train: " ++ "Incorrect arguments to command train." toksToSentences :: (Token -> ROW) -> [Token] -> [[ROW]] toksToSentences f toks = map (map f) $ splitWith isNullToken toks -- Transform list of tokens to list of sentences, where each sentence is a list of ROWs -- (with empty output fields) toksToForms :: [Token] -> [[ROW]] toksToForms toks = [ [ Row { input = Input { inputForm = tokenForm tok , inputEmb = tokenEmb tok } , output = [] } | tok <- sent ] | sent <- splitWith isNullToken toks ] getDict :: [Flag] -> Maybe (Set.Set String) -> IO Lexicon getDict flags tokSet = do case [f | DictFile f <- flags ] of [f] -> fmap (parseLexicon tokSet) $ readFile f [] -> return emptyLexicon getClusters :: [Flag] -> IO ClusterDict getClusters flags = do case [f | ClusterFile f <- flags ] of [f] -> fmap parseClusterDict $ readFile f [] -> return Map.empty getToks :: [Flag] -> [[String]] -> Text.Text -> [Token] getToks flags mwes text = let f = if Tokenize `elem` flags then map Text.pack . concatMap (detectMwes mwes) . List.intersperse [""] . map tokenize . map Text.unpack else id in map parseToken . f . Text.lines $ text -- TODO remove these if unnecessary formatTriple (form,lemma,pos) = unwords . map (padRight ' ' 12) $ [form,lemma,pos] formatToken (f,ml,mp) = unwords [f,fromMaybe "" ml,fromMaybe "" mp] getEval flags trainf goldf testf = do let uncase = if IgnoreCase `elem` flags then map lowercaseToken else id ignore = case [f | IgnorePOS f <- flags ] of [] -> const False xs -> (\t -> case tokenPOS t of Nothing -> False Just pos -> any (`List.isPrefixOf` pos) xs) isPunct = if IgnorePunct `elem` flags then (\t -> (not . isNullToken) t && all isPunctuation (tokenForm t)) else const False keep tok = (not . ignore) tok && (not . isPunct) tok train <- fmap uncase (getTokens trainf) gold <- fmap uncase (getTokens goldf) test <- fmap uncase (getTokens testf) baseline <- case [f | BaselineFile f <- flags ] of [] -> return Nothing [f] -> fmap (Just . uncase) (getTokens f) let keeps = map keep gold return ( train , filterZip keeps gold , filterZip keeps test , fmap (filterZip keeps) baseline ) where getTokens f = fmap (map parseToken . Text.lines) (Text.readFile f) -- FIXME its breaks sentence accuracy somehow... eval flags [trainf,goldf,testf] = do (train,gold,test,baseline) <- fmap (\ (tr, g, t, b) -> (tr,toks g, toks t, fmap toks b)) (getEval flags trainf goldf testf) let seen = Set.fromList (map tokenForm train) dict <- getDict flags . Just $ seen let isUnseen t = not (tokenForm t `Set.member` seen) isUnseenInDict t = not (lowercase (tokenForm t) `Map.member` dict) isUnseenBoth x = isUnseen x && isUnseenInDict x all_acc = tokenAccuracy gold test baseline unseen_acc = tokenAccuracy (filter isUnseen gold) (filter isUnseen test) (fmap (filter isUnseen) baseline) seen_acc = tokenAccuracy (filter (not . isUnseen) gold) (filter (not . isUnseen) test) (fmap (filter (not . isUnseen)) baseline) unseen_ratio = 100 * fromIntegral (length (filter isUnseen test)) / fromIntegral (length test) unseen_train_and_dict_acc = tokenAccuracy (filter isUnseenBoth gold) (filter isUnseenBoth test) (fmap (filter isUnseenBoth) baseline) sent_acc = sentenceAccuracy (sents gold) (sents test) (fmap sents baseline) goldlemma g= map (fmap lowercase . tokenLemma) g uniquePOS = fromIntegral $ Set.size $ Set.fromList $ map tokenPOS gold putStrLn $ "Unseen word ratio: " ++ printf "%4.2f" (unseen_ratio::Double) putStrLn $ "Token accuracy all:\n" ++ showAccuracy all_acc putStrLn $ "Token accuracy seen:\n" ++ showAccuracy seen_acc putStrLn $ "Token accuracy unseen:\n" ++ showAccuracy unseen_acc when (Map.size dict > 0) (putStrLn $ "Token accuracy unseen train+dict:\n" ++ showAccuracy unseen_train_and_dict_acc) -- FIXME Sentence accuracy is broken -- putStrLn $ "Sentence accuracy:\n" ++ showAccuracy sent_acc where toks xs = filter (not . isNullToken) xs sents xs = splitWith isNullToken xs eval _flags _args = do error $ "GramLab.Morfette.Utils.eval: " ++ "Incorrect arguments to command eval." filterZip :: [Bool] -> [a] -> [a] filterZip xs ys = catMaybes $ zipWith (\b x -> if b then Just x else Nothing) xs ys getContents :: IO String getContents = hSetBinaryMode stdin True >> UTF8.getContents putStr :: String -> IO () putStr s = hSetBinaryMode stdout True >> UTF8.putStr s putStrLn :: String -> IO () putStrLn s = hSetBinaryMode stdout True >> UTF8.putStrLn s