module EvalSem
(evalSem)
where
import qualified Data.Map as Map
import qualified Data.Set as Set
import Reader (readcorpus,Token)
import Data.List (foldl',inits,isPrefixOf,sortBy)
import Data.Ord (comparing)
import SparseVector (plus,scale)
import Utils (splitOn)
import Data.Char (toLower)
import System.Environment
import System.IO (stderr,hPutStr)
import Control.Exception (assert)
import Text.Printf
import NLP.Scores (avgPrecision, mean)
import Debug.Trace
type Word = String
type POS = String
type ClustID = String
type Feat = String
type Count = Double
type SemLex = Map.Map (Word,POS) (Map.Map Feat Count)
type SemClust = Map.Map ClustID [(Feat,Count)]
parseEntry :: String -> ((Word,POS),Map.Map Feat Count)
parseEntry ln = case words ln of
(wp:fs) ->
let [w,p] = splitOn ':' wp
in ((w,map toLower p),Map.fromList . map (\f -> (f,1))
. splitOn ','
. unwords
$ fs)
parseLexicon :: String -> SemLex
parseLexicon = foldl' f Map.empty
. map parseEntry
. filter (not . null)
. lines
where f z (k,v) = Map.insertWith' (Map.unionWith (+)) (v == v `seq` k) v z
semClusters :: SemLex -> [((Word,ClustID,POS),Count)] -> SemClust
semClusters dict =
Map.map (sortBy (flip $ comparing snd)
. Map.toList)
. Map.fromListWith (plus)
. map (\((w,cid,p),c) ->
(cid,Map.findWithDefault Map.empty (w,p) dict `scale` c))
evalSem args = do
let [details
,lexf
,trainposf
,trainf
,posf
,clustf
] = args
lex <- fmap parseLexicon $ readFile lexf
css <- fmap readcorpus $ readFile trainf
cpos <- fmap readcorpus $ readFile trainposf
pss <- fmap readcorpus $ readFile posf
xss <- fmap readcorpus $ readFile clustf
let toks yss zss = Map.toList
. Map.fromListWith (+)
. map (\k -> (k,1))
. zipWith (\(w,p) (w',cid) -> assert (w == w') (w,cid,p))
(concat yss)
. concat
$ zss
its = filter (\((w,_,p),_) ->
(take 1 p `elem` ["n","v"] && Map.member (w,p) lex))
. toks pss
$ xss
cs = semClusters lex . toks cpos $ css
ap ((w,cid,p),c) | read details &&
trace (show $ Map.findWithDefault [] cid $ cs) False =
undefined
ap ((w,cid,p),c) = c * (avgPrecision (Map.keysSet
. Map.findWithDefault Map.empty (w,p)
$ lex)
. map fst
. Map.findWithDefault [] cid
$ cs)
aps = map ap $ its :: [Double]
hPutStr stderr . unlines . map (\(t,a) -> printf "%-40s %2.3f" (show t) a)
. zip its $ aps
printf "%2.3f\n" . (/ sum (map snd its)) . sum $ aps