{-# LANGUAGE RecordWildCards, NamedFieldPuns #-} {-# LANGUAGE DeriveDataTypeable #-} import Control.Applicative import Control.Arrow import Control.Monad ( forM, when ) import Data.Binary import Data.Data import Data.Ratio import Data.Text (Text) import Data.Version ( showVersion ) import System.Console.CmdArgs import System.Directory import System.FilePath import System.IO import qualified Data.Map as Map import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.IO as T import Prelude hiding (floor) import qualified NLP.Brillig as Br import NLP.Brillig hiding ( Tag ) import NLP.Brillig.Brill import NLP.Brillig.Unigram import NLP.Brillig.Util import NLP.Brillig.Wrong import Paths_brillig import Data.List data Tagger = Unigram { tdict :: FilePath } | Brill { tdict :: FilePath, trules :: FilePath } data Config = Count { trainingD :: FilePath , dict :: FilePath } | TrainBrill { trainingD :: FilePath , dict :: FilePath , rules :: FilePath , floor :: Int } | TagBrill { dict :: FilePath , input :: Maybe FilePath , test :: Bool , rules :: FilePath } | Tag { dict :: FilePath , input :: Maybe FilePath , test :: Bool } deriving (Show, Data, Typeable) configs = modes [ Count { trainingD = def &= argPos 0 &= typ "DIR" , dict = def &= argPos 1 &= typ "FILE" } , TrainBrill { trainingD = def &= argPos 0 &= typ "DIR" , dict = def &= argPos 1 &= typ "FILE" , rules = def &= argPos 2 &= typ "FILE" , floor = 0 &= typ "INT" &= help "Stop at score..." } &= help "training dict rules" , Tag { dict = def &= argPos 0 &= typ "FILE" , input = Nothing &= typ "FILE" &= help "Input file (stdin otherwise)" , test = False &= help "Input is in word/tag format" } , TagBrill { dict = def &= argPos 0 &= typ "FILE" , rules = def &= argPos 1 &= typ "FILE" , input = Nothing &= typ "FILE" &= help "Input file (stdin otherwise)" , test = False &= help "Input is in word/tag format" } &= help "dict rules" ] &= summary ("brillig " ++ showVersion version) main = do conf <- cmdArgs configs case conf of (Count _ _) -> counter conf (TrainBrill _ _ _ _) -> brillTrainer conf (Tag _ _ _) -> tagger conf (TagBrill _ _ _ _) -> tagger conf -- ---------------------------------------------------------------------- -- build dictionary -- ---------------------------------------------------------------------- counter conf = do let dir = trainingD conf files <- getDirectoryContentsNoJunk dir counts <- forM files $ \f -> do hPutStrLn stderr $ "Reading " ++ f ++ "..." process <$> T.readFile (dir f) hPutStrLn stderr "Merging..." encodeFile (dict conf) (mergeCounts counts) process :: Text -> Count process = count . map readTag . T.words . T.toLower count :: [(Text, Br.Tag)] -> Count count = Map.map histogram . Map.fromListWith (++) . map (second pure) mergeCounts :: [Count] -> Count mergeCounts = Map.unionsWith (Map.unionWith (+)) -- ---------------------------------------------------------------------- -- train Brill -- ---------------------------------------------------------------------- brillTrainer conf = do let dir = trainingD conf c <- decodeFile (dict conf) files <- getDirectoryContentsNoJunk dir tags <- forM files $ \f -> do hPutStrLn stderr $ "Reading " ++ f ++ "..." map readTag . T.words . T.toLower <$> T.readFile (dir f) let actual = concat tags guessed = tag c (map fst actual) rulez = learnConverge (floor conf) (map snd actual) (map snd guessed) ruleStr = unlines (map show rulez) hPutStrLn stderr ruleStr -- should trickle out due to lazy evaluation writeFile (rules conf) ruleStr -- ---------------------------------------------------------------------- -- do tagging -- ---------------------------------------------------------------------- toTagger :: Config -> Tagger toTagger TagBrill {..} = Brill dict rules toTagger Tag {..} = Unigram dict toTagger _ = error "Unknown tagger config!" tagger conf = do c <- decodeFile (dict conf) text <- case input conf of Nothing -> T.getContents Just inpF -> do isD <- doesDirectoryExist inpF if isD then do fs <- getDirectoryContentsNoJunk inpF T.concat <$> mapM (\f -> T.readFile (inpF f)) fs else T.readFile inpF let sents = map T.words (T.lines text) original = map (map readTag) sents unitagged = map (tag c . map fst) original tagged <- case toTagger conf of Brill {..} -> do rs <- (map read . lines) `fmap` readFile trules return (map (brilltag rs) unitagged) Unigram {..} -> return $ unitagged T.putStrLn $ showTagging tagged when (test conf) $ do let correct = length [ w | ((w,t1),(w2,t2)) <- zip (concat original) (concat tagged), t1 == t2 ] total = length (concat original) putStrLn $ "Accuracy: " ++ show (fromRational (100 * fromIntegral correct % fromIntegral total)) showTagging = T.unlines . map (T.unwords . map showTag) -- ---------------------------------------------------------------------- -- odds and ends -- ---------------------------------------------------------------------- getDirectoryContentsNoJunk :: FilePath -> IO [FilePath] getDirectoryContentsNoJunk dir = filter (not . (`elem` [".",".."])) <$> getDirectoryContents dir