{-# LANGUAGE OverloadedStrings
           , FlexibleInstances
           , DeriveGeneric
           , NoMonomorphismRestriction
           , DeriveDataTypeable
           , TemplateHaskell
           , BangPatterns #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
-- | Word Class induction with LDA
--
-- This module provides functions which implement word class induction
-- using the generic algorithm implemented in Colada.LDA.
--
-- You can access and set options in the @Options@ record using lenses.
-- Example:
--
-- > import Data.Label
-- > let options = set passes 5
-- >             . set beta 0.01
-- >             . set topicNum 100
-- >             $ defaultOptions
-- > in learnIO options handler sentences

module Colada.WordClass
  ( -- * Running the sampler
    learnIO
  , defaultOptions
    -- * Extracting information
  , summary
  , summarize
  , wordTypeClasses
    -- * Class and word prediction
  , label
  , predict
    -- * Data types and associated lenses
  , WordClass
  , ldaModel       -- ^ LDA model
  , wordTypeTable  -- ^ Word type string to atom and vice versa conversion tables
  , featureTable   -- ^ Feature string to atom and vice versa conversion tables
  , options        -- ^ Options for Gibbs sampling
  , LDA.Finalized
  , LDA.docTopics
  , LDA.wordTopics
  , LDA.topics
  , LDA.topicDocs
  , LDA.topicWords
  , Options
  , featIds        -- ^ Feature ids
  , topicNum       -- ^ Number of topics K
  , alphasum       -- ^ Dirichlet parameter alpha*K which controls topic sparseness
  , beta           -- ^ Dirichlet parameter beta which controls word sparseness
  , passes         -- ^ Number of sampling passes per batch
  , batchSize      -- ^ Number of sentences per batch
  , seed           -- ^ Random seed for the sampler
  , topn           -- ^ Number of most probable words to return
  , exponent
  , progressive
  , lambda
  )
where

-- Standard libraries
import qualified Data.Text.Lazy.IO          as Text
import qualified Data.Text.Lazy             as Text
import qualified Data.Text.Lazy.Builder     as Text
import qualified Data.Text.Lazy.Builder.Int as Text
import qualified Data.Text.Lazy.Encoding    as Text
import qualified Data.Vector                as V
import qualified Data.Vector.Generic        as G
import qualified Data.Vector.Unboxed        as U
import qualified Data.IntMap                as IntMap
import qualified Data.Map                   as Map
import qualified Data.Serialize             as Serialize
import qualified Control.Monad              as M
import qualified Data.List                  as List
import qualified Data.List.Split            as Split
import qualified Data.Ord                   as Ord
import qualified Data.Foldable              as Fold
import qualified Data.Traversable           as Trav
import qualified Control.Monad.ST           as ST
import qualified Control.Monad.ST.Lazy      as LST
import           Data.Function (on)
import           Control.Monad.Writer
import           Data.Word (Word32)
import           Data.Typeable (Typeable)
import           Data.Data (Data)
import           Prelude hiding ((.), exponent)
import           Control.Category ((.))
import           Control.Applicative ((<$>))
import qualified System.IO.Unsafe           as Unsafe

-- Third party modules
import qualified Control.Monad.Atom         as Atom
import qualified NLP.CoNLL                  as CoNLL
import qualified Data.List.Zipper           as Zipper
import           GHC.Generics (Generic)
import qualified Data.Label                 as L
import           Data.Label (get)
import qualified NLP.SwiftLDA               as LDA

-- Package modules
import qualified Colada.Features            as F
import qualified NLP.Symbols                as Symbols

import Debug.Trace

-- | Container for the Word Class model
data WordClass =
  WordClass { _ldaModel      :: LDA.Finalized                 -- ^ LDA model
            , _wordTypeTable :: Atom.AtomTable (U.Vector Char)
            , _featureTable  :: Atom.AtomTable (U.Vector Char)
            , _options       :: Options
            }
  deriving (Generic)

data Options =
  Options { _featIds     :: [Int]
          , _topicNum    :: !Int
          , _alphasum    :: !Double
          , _beta        :: !Double
          , _passes      :: !Int
          , _batchSize   :: !Int
          , _seed        :: !Word32
          , _topn        :: !Int
          , _exponent    :: !(Maybe Double)
          , _progressive :: !Bool
          , _lambda      :: !Double
          }
  deriving (Eq, Show, Typeable, Data, Generic)
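-- A sketch of reading and updating single options through the lenses
-- generated by @mkLabels@ below; as in the module header example,
-- @get@ and @set@ come from "Data.Label", and the values shown assume
-- the defaults from 'defaultOptions':
--
-- > get topicNum defaultOptions              -- 10
-- > get beta (set beta 0.05 defaultOptions)  -- 5.0e-2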
instance Serialize.Serialize Options

$(L.mkLabels [''WordClass, ''Options])

defaultOptions :: Options
defaultOptions = Options
  { _featIds     = [-1,1]
  , _topicNum    = 10
  , _alphasum    = 10
  , _beta        = 0.1
  , _passes      = 1
  , _batchSize   = 1
  , _seed        = 0
  , _topn        = maxBound
  , _exponent    = Nothing
  , _progressive = False
  , _lambda      = 1.0
  }

-- | @learnIO options f xs@ runs the LDA Gibbs sampler for word classes
-- with @options@ on sentences @xs@, and returns the resulting
-- model. The progressive class assignments are passed to the handler
-- function @f@.
learnIO :: Options
        -> (V.Vector (U.Vector Double) -> IO ())
        -> [CoNLL.Sentence]
        -> IO WordClass
learnIO opts f xs = do
  let (bs, atomTabD, atomTabW) =
        Symbols.runSymbols (prepareData (get batchSize opts) (get featIds opts) xs)
                           Symbols.empty
                           Symbols.empty
      sampler :: IO LDA.Finalized
      sampler = do
        m <- ST.stToIO $ LDA.initial (U.singleton (get seed opts))
                                     (get topicNum opts)
                                     (get alphasum opts)
                                     (get beta opts)
        Fold.forM_ bs $ \b -> do
          _ <- ST.stToIO . LDA.run m (get passes opts) . Fold.msum $ b
          when (get progressive opts) $ do
            Fold.forM_ b $ \sent -> do
              ls <- ST.stToIO $ V.mapM (interpWordClasses m (get lambda opts)) sent
              f ls
          {-
          Fold.forM_ [1..get passes opts] $ \i -> do
            Fold.forM_ b $ \sent -> do
              _ <- ST.stToIO $ LDA.pass 1 m sent
              when (get progressive opts && i == get passes opts) $ do
                ls <- ST.stToIO $ V.mapM (interpWordClasses m (get lambda opts)) sent
                f ls
          -}
        ST.stToIO $ LDA.finalize m
  lda <- sampler
  return (WordClass lda atomTabD atomTabW opts)

type Symb = Symbols.Symbols (U.Vector Char) (U.Vector Char)
type Sent = V.Vector LDA.Doc

prepareData :: Int -> [Int] -> [CoNLL.Sentence] -> Symb [V.Vector Sent]
prepareData bsz is ss = do
  ss' <- mapM symbolize . map (featurize is) $ ss
  return $! map V.fromList . Split.chunksOf bsz $ ss'

-- | Extract features from a sentence
featurize :: [Int] -> CoNLL.Sentence -> [(Text.Text, [Text.Text])]
featurize is s =
  let mk fs =
        let d  = IntMap.findWithDefault (error "featurize: focus feature missing")
                                        0 fs
            ws = [ Text.concat [f, "^", Text.pack . show $ i]
                 | i <- is
                 , Just f <- [IntMap.lookup i fs]
                 ]
        in (d, ws)
  in map mk . extractFeatures $ s

-- | Convert text strings into symbols (ints)
symbolize :: [(Text.Text, [Text.Text])] -> Symb Sent
symbolize s = V.fromList <$> mapM doc s
  where doc (d, ws) = do
          da  <- Symbols.toAtomA . compress $ d
          was <- mapM (Symbols.toAtomB . compress) ws
          -- return (da, U.fromList $ zip was (repeat Nothing))
          return (da, U.fromList was)
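-- A minimal end-to-end sketch, assuming @sents :: [CoNLL.Sentence]@
-- has been parsed elsewhere; the handler here simply discards the
-- progressive class assignments:
--
-- > model <- learnIO defaultOptions (\_ -> return ()) sents
-- > Text.putStr (summary model)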
-- | @summary m@ returns a textual summary of word classes found in
-- model @m@
summary :: WordClass -> Text.Text
summary = summarize False

-- | @summarize harden m@ is like 'summary', but when @harden@ is set,
-- each word type's full count mass is assigned to its most probable class.
summarize :: Bool -> WordClass -> Text.Text
summarize harden m =
  let format (z, cs) = do
        cs' <- mapM (Atom.fromAtom . fst)
               . takeWhile ((>0) . snd)
               . take 10
               . List.sortBy (flip $ Ord.comparing snd)
               . IntMap.toList
               $ cs
        return . Text.unwords $ Text.pack (show z) : map (Text.pack . U.toList) cs'
  in fst
     . flip Atom.runAtom (get wordTypeTable m)
     . M.liftM Text.unlines
     . mapM format
     . IntMap.toList
     . IntMap.fromListWith (IntMap.unionWith (+))
     . concatMap (\(d, zs) -> [ (z, IntMap.singleton d c) | (z, c) <- zs ])
     -- Maybe harden
     . (if harden
          then map (\(d, zs) ->
                      let s       = sum . map snd $ zs
                          (z', _) = List.maximumBy (Ord.compare `on` snd) zs
                      in (d, [ (z, if z == z' then s else 0) | (z, _) <- zs ]))
          else id)
     --
     . IntMap.toList
     . IntMap.map IntMap.toList
     . LDA.docTopics
     . get ldaModel
     $ m

-- | @interpWordClasses m lambda doc@ gives the class probabilities for
-- the word type in context @doc@ according to the evolving model @m@. It
-- interpolates the prior word type probability with the
-- context-conditioned probabilities using @lambda@:
-- P'(z|d,w) = lambda * P(z|d) + (1-lambda) * P(z|d,w).
-- This function does not mutate any weights of the model.
interpWordClasses :: LDA.LDA s -> Double -> LDA.Doc -> ST.ST s (U.Vector Double)
interpWordClasses m lambda doc@(d, _) = do
  pzd  <- normalize <$> LDA.priorDocTopicWeights_ m d
  pzdw <- normalize <$> LDA.docTopicWeights_ m doc
  return $! normalize $ U.zipWith (\p q -> lambda * p + (1-lambda) * q) pzd pzdw
  where normalize x =
          let uniform = U.replicate (U.length x) (1 / fromIntegral (U.length x))
          in case U.sum x of
               0            -> uniform
               s | s >= 1/0 -> uniform   -- guard against infinite mass
               s            -> U.map (/s) x

-- | @wordTypeClasses m@ returns a Map from word types to unnormalized
-- distributions over word classes
wordTypeClasses :: WordClass -> Map.Map Text.Text (IntMap.IntMap Double)
wordTypeClasses m =
  fst
  . flip Atom.runAtom (get wordTypeTable m)
  . fmap Map.fromList
  . mapM (\(k, v) -> do k' <- Atom.fromAtom k
                        return (decompress k', v))
  . IntMap.toList
  . LDA.docTopics
  . get ldaModel
  $ m

-- | @label noctx m s@ returns, for each word in sentence @s@, the
-- unnormalized probabilities of word classes; when @noctx@ is set,
-- context features are ignored.
label :: Bool -> WordClass -> CoNLL.Sentence -> V.Vector (U.Vector Double)
label noctx m s =
  fst3 $ Symbols.runSymbols label' (L.get wordTypeTable m) (L.get featureTable m)
  where dectx doc@(d, _) =
          if noctx
             then (d, U.singleton (-1)) -- FIXME: ugly hack
             else doc
        label' = do
          let fm = L.get ldaModel m
          s' <- prepareSent m s
          return $! V.map (LDA.docTopicWeights fm . dectx) $ s'

-- | @predict m s@ returns, for each word in sentence @s@, unnormalized
-- probabilities of words given the predicted word class.
predict :: WordClass -> CoNLL.Sentence -> [V.Vector (Double, Text.Text)]
predict m s =
  fst3 $ Symbols.runSymbols predict' (L.get wordTypeTable m) (L.get featureTable m)
  where predict' = do
          let fm = L.get ldaModel m
          s' <- prepareSent m s
          let ws = map ( G.convert
                       . predictDoc (get (topn . options) m) fm
                       . docToWs )
                   . V.toList
                   $ s'
          mapM (V.mapM fromAtom) ws
        docToWs = snd
        fromAtom (n, w) = do
          w' <- Symbols.fromAtomA w
          return (n, decompress w')

prepareSent :: WordClass -> CoNLL.Sentence -> Symb Sent
prepareSent m = symbolize . featurize (L.get (featIds . options) m)

-- | @predictDoc n m ws@ returns unnormalized probabilities of the top @n@
-- most probable document ids given the model @m@ and words @ws@. The
-- candidate document ids are taken from the model @m@. The weights
-- are computed according to the following formula, where n(z) is the
-- total count of topic z summed over the vocabulary V:
--
-- > P(d|{w}) ∝ Σ_z [ (n(d,z)+a) · Σ_{w∈ws} (n(w,z)+b) / (n(z)+b·|V|) ]
predictDoc :: Int -> LDA.Finalized -> U.Vector LDA.W -> U.Vector (Double, LDA.D)
predictDoc n m ws =
  let k  = LDA.topicNum m
      a  = LDA.alphasum m / fromIntegral k
      b  = LDA.beta m
      v  = fromIntegral . LDA.wSize $ m
      zt = LDA.topics m
      wsums = IntMap.fromList
                [ (z, U.sum . U.map (\w -> (count w wt_z + b) / denom) $ ws)
                | z <- IntMap.keys zt
                , let wt_z  = IntMap.findWithDefault IntMap.empty z
                              . LDA.topicWords $ m
                      denom = count z zt + b * v
                ]
      wsum z = IntMap.findWithDefault
                 (error $ "Colada.WordClass.predictDoc: key not found: " ++ show z)
                 z wsums
  in U.fromList
     . take n
     . List.sortBy (flip compare)
     $ [ ( sum [ (c + a) * wsum z | (z, c) <- IntMap.toList zt_d ]
         , d )
       | (d, zt_d) <- IntMap.toList . LDA.docTopics $ m
       ]
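-- A sketch of querying a trained model (hypothetical @model@ and
-- @sent :: CoNLL.Sentence@); 'label' yields per-word class weights,
-- 'predict' the most probable words per predicted class:
--
-- > let classWeights = label False model sent
-- > let wordGuesses  = predict model sent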
extractFeatures :: CoNLL.Sentence -> [IntMap.IntMap Text.Text]
extractFeatures =
  F.featureSeq combine . Zipper.fromList . map (V.! 0) . V.toList

combine :: Maybe Text.Text -> Maybe Text.Text -> Maybe Text.Text
combine (Just a) (Just b) = Just $ Text.concat [a, "|", b]
combine (Just a) Nothing  = Just a
combine Nothing  (Just b) = Just b
combine Nothing  Nothing  = Nothing

compress :: Text.Text -> U.Vector Char
compress = U.fromList . Text.unpack

decompress :: U.Vector Char -> Text.Text
decompress = Text.pack . U.toList

fst3 :: (a, b, c) -> a
fst3 (a, _, _) = a

count :: Int -> IntMap.IntMap Double -> Double
count z t =
  case IntMap.findWithDefault 0 z t of
    n | n < 0 -> error "Colada.WordClass.count: negative count"
    n         -> n
{-# INLINE count #-}

st :: Monoid w => ST.ST s a -> WriterT w (LST.ST s) a
st = lift . LST.strictToLazyST

-- Instances for serialization
instance Serialize.Serialize LDA.Finalized

instance Serialize.Serialize Text.Text where
  put = Serialize.put . Text.encodeUtf8
  get = Text.decodeUtf8 `fmap` Serialize.get

instance Serialize.Serialize (U.Vector Char) where
  put v = Serialize.put (Text.pack . U.toList $ v)
  get   = do t <- Serialize.get
             return $! U.fromList . Text.unpack $ t

instance Serialize.Serialize (Atom.AtomTable (U.Vector Char))
instance Serialize.Serialize WordClass
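-- A sketch of persisting and reloading a trained model via the
-- Serialize instances above (hypothetical file name; @B@ stands for
-- "Data.ByteString", which this module does not itself import):
--
-- > B.writeFile "model.bin" (Serialize.encode model)
-- > model' <- either error return . Serialize.decode =<< B.readFile "model.bin"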