{-# LANGUAGE ParallelListComp, TemplateHaskell, ScopedTypeVariables #-} {- Command line interface for phonotactic learner Copyright © 2016-2017 George Steel and Peter Jurgec This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. -} import Text.PhonotacticLearner import Text.PhonotacticLearner.PhonotacticConstraints import Text.PhonotacticLearner.PhonotacticConstraints.Generators import Text.PhonotacticLearner.DFST import Text.PhonotacticLearner.Util.Ring import Text.PhonotacticLearner.Util.Probability import Text.PhonotacticLearner.MaxentGrammar import Options.Applicative import Options.Applicative.Extra import Control.Monad import Control.Monad.State import Control.Applicative import qualified Data.Text as T import qualified Data.Map.Lazy as M import Data.Array.IArray import Data.Maybe import Data.FileEmbed import Data.Char import Text.Read import Numeric import Control.Arrow import System.Exit import Control.DeepSeq import Control.Exception import Control.Parallel.Strategies import System.Random data SegmentType = Chars | Words | Fiero deriving (Enum, Eq, Ord, Read, Show) data Command = Learn { lexicon :: FilePath, thresholds :: [Double], hasFreqs :: Bool, useEdges :: Bool, useTrigrams :: Maybe String, useBroken :: Maybe String } | GenSalad { grammarfile :: FilePath } deriving Show data ParsedArgs = ParsedArgs { cmd :: Command, ftable :: Maybe FilePath, segtype :: SegmentType, samplesize :: Int, outfile :: Maybe FilePath } deriving (Show) parseOpts :: Parser ParsedArgs parseOpts = ParsedArgs <$> hsubparser (command "learn" (info (Learn <$> strArgument (metavar "LEXICON") <*> option auto (long "thresholds" <> metavar "THRESHOLDS" <> value [0.01, 0.1, 0.2, 0.3] <> help "thresholds to use for candidate selection (default is [0.01, 0.1, 0.2, 0.3]).") <*> switch (long "freqs" <> short 'f' <> help "Lexicon file contains word frequencies.") <*> switch (long "edges" <> short 'e' <> help "Allow constraints involving word boundaries.") <*> optional (strOption $ long "trigrams" <> short '3' <> metavar "COREFEATURES" <> help "Allow trigram constraints where at least one class uses a single one of the following features (comma-separated).") <*> optional (strOption $ long "longdistance" <> short 'l' <> metavar "SKIPFEATURES" <> help "Allow constraints with two classes separated by a run of characters possibly restricted to all having one of the following features.") ) (fullDesc <> progDesc "Learn a phonotactic grammar from a given lexicon")) <> command "gensalad" (info (GenSalad <$> strArgument (metavar "GRAMMAR")) (fullDesc <> progDesc "Generate random words from an already-calculated grammar"))) <*> optional (option str $ long "featuretable" <> short 't' <> metavar "CSVFILE" <> help "Use the features and segment list from a feature table in CSV format (a table for IPA is used by default).") <*> (flag' Chars (long "charsegs" <> short 'c' <> help "Use characters as segments (default).") <|> flag' Words (long "wordsegs" <> short 'w' <> help "Separate segments by spaces.") <|> flag' Fiero (long "fierosegs" <> help "Parse segments by repeatedly taking the longest possible match and use ' to break up unintended digraphs (used for Fiero orthography).") <|> pure Chars) <*> option auto (long "samples" <> short 'n' <> value 3000 <> help "Number of samples to use for salad generation.") <*> optional (strOption $ long "output" <> short 'o' <> metavar "OUTFILE" <> help "Record final output to OUTFILE as well as stdout.") opts = info (helper <*> parseOpts) (fullDesc <> progDesc "Automatically infer phonotactic grammars from text and apply them as probability distributions.") ipaft :: FeatureTable String ipaft = fromJust (csvToFeatureTable id $(embedStringFile "./app/ft-ipa.csv")) freqreader :: FeatureTable String -> (String -> [String]) -> String -> [([SegRef],Int)] freqreader ft seg text = do line <- lines text let (wt@(_:_),wf') = break (== '\t') line [wf] <- return (words wf') Just n <- return $ readMaybe wf return (segsToRefs ft (seg wt), n) nofreqreader :: FeatureTable String -> (String -> [String]) -> String -> [([SegRef],Int)] nofreqreader ft seg text = do line <- lines text return (segsToRefs ft (seg line), 1) prettyprintGrammar :: (Show clabel) => [clabel] -> Vec -> String prettyprintGrammar grammar weights = (unlines . reverse) [showFFloat (Just 2) w " " ++ show c | c <- grammar | w <- coords weights] isNonComment :: String -> Bool isNonComment [] = False isNonComment "\n" = False isNonComment ('#':_) = False isNonComment _ = True restrictedClasses :: FeatureTable String -> String -> [(NaturalClass, SegSet SegRef)] restrictedClasses ft arg = fmap ((id &&& classToSeglist ft) . NClass False) $ [] : do feat <- fmap T.pack (words arg) Just _ <- return $ M.lookup feat (featLookup ft) [[(FPlus, feat)], [(FMinus, feat)]] main = do args <- execParser opts putStrLn (show args) ft <- case ftable args of Just fname -> do ftcsv <- readFile fname case csvToFeatureTable id ftcsv of Just ft -> return ft Nothing -> die "Invalid feature table." Nothing -> do putStrLn "Using default IPA feature table." return ipaft case cmd args of Learn lexfile thresh lfreqs gedges gtris gbroken -> do let segmenter = case segtype args of Words -> words Chars -> fmap return Fiero -> segmentFiero (elems (segNames ft)) cls = force $ classesByGenerality ft 3 lexdata <- readFile lexfile let lexlist = (if lfreqs then freqreader else nofreqreader) ft segmenter lexdata when (null lexlist) (die "Invalid lexicon file") let wfs = sortLexicon lexlist singles = ugSingleClasses cls edges = if gedges then (ugEdgeClasses cls) else [] doubles = ugBigrams cls edoubles = if gedges then (ugEdgeBigrams cls) else [] triples = case gtris of Just rcls -> ugLimitedTrigrams cls (restrictedClasses ft rcls) Nothing -> [] longdistance = case gbroken of Just rcls -> ugLongDistance cls (restrictedClasses ft rcls) Nothing -> [] globs <- evaluate . force $ join [singles,edges,doubles,edoubles,triples,longdistance] putStrLn $ "Generated candidates with " ++ show (length cls) ++ " classes and " ++ show (length globs) ++ " globs, running DFA generation in parallel." let candidates = fmap (force . (id *** matchCounter)) globs `using` (parListChunk 1000 rdeepseq) (grammar, dfa, weights) <- generateGrammarIO (samplesize args) thresh candidates lexlist let output = "# Length Distribution:\n" ++ (show . assocs . lengthFreqs $ wfs) ++ "\n\n# Rules:\n" ++ prettyprintGrammar grammar weights putStrLn "\n\n\n\n" putStrLn output case outfile args of Just outf -> writeFile outf output Nothing -> return () GenSalad gfile -> do rawgrammar <- readFile gfile (fline:glines) <- evaluate $ filter isNonComment (lines rawgrammar) let lendist :: [(Int,Int)] = read fline grammar :: [(Double,ClassGlob)] = fmap ((read *** read) . break isSpace) glines lencdf = massToCdf (fmap (second fromIntegral) lendist) (weightlist,rulelist) = unzip (reverse grammar) weights = vec weightlist blankdfa = nildfa (srBounds ft) dfa = foldr (\g t -> force $ dfaProduct consMC (unpackDFA . cgMatchCounter ft $ g) (force t)) blankdfa rulelist unsegmenter = case segtype args of Words -> unwords Chars -> join Fiero -> joinFiero (elems (segNames ft)) evaluate . force $ grammar evaluate . force $ dfa salad <- getStdRandom . runState $ sampleWordSalad (fmap (maxentProb weights) dfa) lencdf (samplesize args) let output = unlines . fmap (unsegmenter . refsToSegs ft) $ salad putStrLn "\n\n\n\n" putStrLn output case outfile args of Just outf -> writeFile outf output Nothing -> return () return ()