{-# LANGUAGE BangPatterns , OverloadedStrings #-} import NLP.Scores (recipRank,mean) import Entropy.Algorithm import Text.Printf (printf) import System.IO import Reader (Token,readcorpus,format) import Debug.Trace import System.Environment import Data.Binary (encode,decode,put,get,Binary) import qualified Data.ByteString.Lazy as B (readFile,writeFile) import Control.Monad (when) import qualified Data.Map as Map import SparseVector (plus) import qualified Control.Monad.Atom as Atom import qualified Data.IntMap as IntMap import Data.Foldable (foldlM) import Utils (groupsOf,splitWith) import Data.List (sortBy,foldl') import Data.Ord (comparing) import Counts (counts,vi,ari) import EvalSem (evalSem) import qualified Data.Text.Lazy as Text type Txt = Text.Text main = do (command:args) <- getArgs case command of "learn" -> learn cluster args "learn-beam" -> do let (k:args') = args learn (clusterBeam (read k)) args' "learn-seeded" -> do let (n:m:seedf:trainf:_) = args seed' <- fmap decode $ B.readFile seedf :: IO (ClusterSet (Int,String)) let seed = makeClusterSet . prune (read m) . countXY $ seed' hPutStrLn stderr . show . Map.size . countXY $ seed train <- fmap readcorpus $ readFile trainf let xss = groupsOf (read n) train fids = featIDs seed step z (i,xs) = do let cs = cluster False z . concat . examples fids $ xs tf = (trainf ++ ".learn-seeded." ++ show i ++ ".model") hPutStrLn stderr $ "Writing model " ++ tf hFlush stdout B.writeFile tf . encode $ cs return cs cs <- foldlM step seed . zip [1..] $ xss hPutStrLn stderr . show . Map.size . countXY $ cs B.writeFile (trainf ++ ".learn-seeded.model") . encode $ cs "learn-intermed" -> do let (n:fids:trainf:_) = args train <- fmap readcorpus $ readFile trainf let xss = groupsOf (read n) $ train fs = read fids let step :: ClusterSet (Int,String) -> (Int,[[Token]]) -> IO (ClusterSet (Int,String)) step z (i,xs) = do let cs = cluster False z . concat . examples fs $ xs printf "%.6f %.6f\n" (weightedhXY cs) (hY cs) hFlush stdout B.writeFile (trainf ++ "." ++ fids ++ ".learn." ++ show i ++ ".model") . encode $ cs let ys = [ clusterWords fs cs . map fst $ x | x <- xs ] xs' = zipWith (zipWith (\(w,_) y -> (w,y))) xs ys writeFile (trainf ++ "." ++ fids ++ ".learn." ++ show i ++ ".labeled") . format $ xs' return cs cs <- foldlM step empty . zip [1..] $ xss B.writeFile (trainf ++ "." ++ fids ++ ".learn.model") . encode $ cs "teach" -> do let (fids:labelf:_) = args train <- fmap readcorpus $ readFile labelf let (cs,as) = teach (read fids) train hPutStrLn stderr . show . Map.size . countXY $ cs B.writeFile (labelf ++ "." ++ fids ++ ".teach.model") . encode $ cs {- writeFile (labelf ++ "." ++ fids ++ ".teach.mapping") . unlines . map (\(i,s) -> unwords [show i,s]) . IntMap.toList . Atom.from $ as -} "display" -> do let (modelf:_) = args cs <- fmap decode $ B.readFile modelf putStr . unlines . map display . Map.toList . countXY $ cs "distribution" -> do let (modelf:_) = args cs <- fmap decode $ B.readFile modelf :: IO (ClusterSet (Int,String)) putStr . unlines . map (\(k,v) -> unwords [show k,show v]) . sortBy (comparing snd) . Map.toList . Map.fromListWith (+) . map (\n -> (n,1)) . Map.elems . Map.map (Map.fold (+) 0) . countXY $ cs "label" -> do let (foc:backoff:modelf:_) = args cs <- fmap decode $ B.readFile modelf ws <- fmap readcorpus $ getContents let xs = map (if read foc then id else map defocus) . examples (featIDs cs) $ ws label = if read backoff then labelToken cs else fst . head . clusterToken True cs ys = map (map (show . label)) $ xs xyss = zipWith zip (map (map fst) ws) ys::[[Token]] putStr . format $ xyss "eval-mrr"-> do let (full:details:modelf:_) = args cs <- fmap decode $ B.readFile modelf :: IO (ClusterSet (Int,String)) xs <- fmap (concat . examples (featIDs cs) . readcorpus) $ getContents let yss = map ((if read full then predictX0Full else predictX0) cs) xs yys = zip (map getX0 xs) $ yss rrs = map (uncurry recipRank) yys when (read details) $ do putStr . unlines . map (take 120) . map (\(r,(x,xs)) -> printf "%-4.5f %-10s %s" r x (unwords xs)) . zip rrs $ yys printf "MRR: %.4f\n" . avg $ rrs "eval-mrr-gold" -> do let [trainf] = args train <- fmap readcorpus $ readFile trainf let (cs,as) = teach [0] train xys <- fmap (concat . readcorpus) $ getContents let yys = zip (map fst xys) . map (\k -> case fst . Atom.runAtom (Atom.maybeToAtom (snd k)) $ as of Just y -> clusterLabelToX0 cs y Nothing -> []) $ xys let rrs = recipRanks yys printf "MMR: %.4f\n" . avg $ rrs "eval-goldpos" -> do let (testf:goldf:_) = args test <- fmap (map snd . concat . readcorpus) $ readFile testf gold <- fmap (map snd . concat . readcorpus) $ readFile goldf let cs = counts . zip gold $ test printf "VI: %.4f\n" . vi $ cs printf "ARI: %.4f\n" . ari $ cs "eval-sem" -> evalSem args learn f args = do let (fids:trainf:_) = args train <- fmap readcorpus $ readFile trainf let xss = concat . examples (read fids) $ train cs = f False empty xss hPutStrLn stderr . show . Map.size . countXY $ cs B.writeFile (trainf ++ "." ++ fids ++ ".learn.model") . encode $ cs teach :: [Int] -> [[Token]] -> (ClusterSet (Int,String),Atom.AtomTable String) teach fids train = flip Atom.runAtom Atom.empty $ do fmap (makeClusterSet . foldl' (\ z (!k,!x) -> Map.insertWith' plus k x z) Map.empty . concat) . flip mapM train $ \s -> do ys' <- mapM (\(x,y) -> Atom.toAtom y) s let xs' = ys' == ys' `seq` concat $ examples fids [s] return $ zipWith (\y x -> (x == x `seq` y,x)) ys' xs' prune :: Int -> Map.Map Y (Map.Map (Int,String) Count) -> Map.Map Y (Map.Map (Int,String) Count) prune m = Map.fromList . take m . sortBy (flip $ comparing (foldl' (+) 0 . Map.elems . snd)) . Map.toList mrr :: (Eq y) => [(y,[y])] -> Double mrr = mean . recipRanks recipRanks :: (Eq y) => [(y,[y])] -> [Double] recipRanks = map (uncurry recipRank) avg :: [Double] -> Double avg = mean