src/Entropy/Algorithm.hs

{-# LANGUAGE NoMonomorphismRestriction , BangPatterns #-}
module Entropy.Algorithm ( cluster
                         , clusterBeam
                         , clusterToken
                         , labelToken
                         , clusterWords
                         , ClusterSet (..)
                         , weightedhXY
                         , empty
                         , makeClusterSet
                         , X
                         , Y
                         , Count
                         , featIDs
                         , predictX0
                         , predictX0Full
                         , clusterLabelToX0
                         , defocus
                         , getX0
                         , examples
                         , display
                         , getLabeler
                         )
where
  
import qualified Entropy.Features as F
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.Map ((!))
import ListZipper 
import Data.List (sort,sortBy,foldl')
import Data.Ord (comparing)
import Reader (Token,readcorpus)
import Debug.Trace
import Prelude hiding (sum)
import SparseVector (plus,dot)
import Data.Binary (encode,decode,put,get,Binary)
import Control.Monad (ap)
import qualified Data.ByteString.Lazy as BS
import Counts (rankNormalize)

type Count = Double
type H = Double

nextID c = lastID c + 1
ids = Map.keys . countY

flatten :: (Ord k, Ord k1) => Map.Map k (Map.Map k1 a) -> Map.Map (k,k1) a
flatten m = Map.fromList [ ((k,k1),x) | (k,m1) <- Map.toList m 
                                      , (k1,x) <- Map.toList m1 ]


features :: ListZipper String -> X (Int,String)
features = flatten . F.features

data ClusterSet x = CS { countY   :: Map.Map Y Count 
                       , hY       :: !H
                       , countXY  :: Map.Map Y (Map.Map x Count) 
                       , hXY      :: Map.Map Y H
                       , countN   :: !Count 
                       , lastID   :: !Y }
                    deriving (Eq,Ord,Show)  

instance (Ord x,Binary x) => Binary (ClusterSet x) where
    put cs =      put (countY cs) 
               >> put (hY cs)
               >> put (countXY cs)
               >> put (hXY cs)
               >> put (countN cs)
               >> put (lastID cs)

    get = return CS `ap` get `ap` get `ap` get `ap` get `ap` get `ap` get
             
    
empty = CS { countY  = Map.empty
           , hY      = 0
           , countXY = Map.empty
           , hXY     = Map.empty
           , countN  = 0
           , lastID  = 0 } 

featIDs :: (Ord a, Ord b) => ClusterSet (a,b) -> [a]
featIDs = uniq . map fst . concat . Map.elems . Map.map Map.keys . countXY

makeClusterSet ::  Map.Map Y (Map.Map x Count) -> ClusterSet x
makeClusterSet cxy =
    let cy = Map.map (Map.fold (+) 0) $ cxy
        hy = entropy cy
        hxy = Map.map entropy $ cxy
        n = Map.fold (+) 0 cy
        last = maximum . Map.keys $ cy
    in CS { countY = cy , hY = hy 
          , countXY = cxy
          , hXY = hxy 
          , countN = n 
          , lastID = last + 1 }

type X k = Map.Map k Count
type Y = Int

sum :: [Double] -> Double
sum = foldl' (+) 0

entropy cs = 
    let xs = Map.elems cs
        n  = sum xs
        logn = logBase 2 n
        loop !s [] = s
        loop !s (!x:xs) = loop (s - x/n * (logBase 2 x - logn)) xs
    in loop 0 xs

pseudoC = 0.00001
pseudoN = 1
entropy' pc pn m = let ms = Map.elems m 
                       n = sum ms + pn
                       logn = logBase 2 n
                   in - pn * (if pc == 0 then 0 else pc/n * logBase 2 (pc/n))
                      - sum [ m / n * logBase 2 (m/n) | m <- ms ]

score cs = weightedhXY cs + hY cs
weightedhXY cs = sum 
              . Map.elems
              . Map.mapWithKey (\k c -> c/countN cs * hXY cs ! k) 
              . countY
              $ cs
{- incremental update:

 H^t(X|Y) - H^{t-1}(X|Y)
        =     \sum_{y \neq y^n} [ count(y)/N(N+1) H^{t-1}(X|Y=y) ]
              + [ p^t(y^n)H^t(X|Y=y^n) - p^{t-1}(y^n)H^{t-1}(X|Y=y^n) ]


where N is the count of features at time t-1
-}

update :: (Ord k) => ClusterSet k -> X k -> Y -> ClusterSet k
update cs x y = 
    CS { countY  = countY'
       , hY      = entropy countY'
       , countXY = countXY'
       , hXY     = hXY'
       , countN  = countN'
       , lastID  = if Map.member y (countY cs) 
                   then lastID cs
                   else lastID cs + 1
       }
    where countN'  = countN cs + nx
          countXY' = countXY cs `plus` Map.singleton y x
          countY' = Map.insertWith' (+) y nx . countY $ cs
          hy_new  = entropy (countXY' ! y)
          hXY'    = Map.insert y hy_new . hXY $ cs
          nx      = sum . Map.elems $ x
          
clusterToken :: Bool 
             -> ClusterSet (Int,String) 
             -> X (Int,String)
             -> [(Y,ClusterSet (Int,String))]
clusterToken freeze cs x = map snd . clusterTok freeze cs $ x

clusterTok :: Bool 
             -> ClusterSet (Int,String) 
             -> X (Int,String)
             -> [((Double,Y), (Y,ClusterSet (Int,String)))]
clusterTok freeze cs x =
    let rs = rank 
             $ [let cs' = update cs x y 
                in ((score cs',y),(y,cs'))
                | y <- nextID cs : ids cs
               ,  y == nextID cs 
                      || Map.size (countXY cs ! y `Map.intersection` x) > 0
                     ]
    in rs

rank :: [((Double,Y),a)] -> [((Double,Y),a)]
rank = sortBy (comparing (\((s,y),_) -> (realToFrac s :: Float, negate y)))

-- | labelToken: output a single label (from a closed set) 
labelToken :: ClusterSet (Int,String)
           -> X (Int,String)
           -> Y
labelToken cs x = 
    let ((y,cs'):rs) = clusterToken True cs x
    in if y == nextID cs 
       then fst . head . sortBy (flip $ comparing snd) . Map.toList . countY 
                $ cs
       else y

getLabeler :: Bool -> Maybe FilePath -> FilePath -> IO ([String] -> [String])
getLabeler nonew md file = do
  cs <- fmap decode $ BS.readFile file
  conv <- case md of
         Nothing -> return show
         Just fp -> do ls <- fmap lines $ readFile fp
                       let dict = Map.fromList 
                                  . map (\ln -> case words ln of
                                                  [k,v] -> (read k,v))
                                  $ ls
                       return $ 
                        (\k -> Map.findWithDefault (error 
                                              $ "getLabeler:Not found: " ++ 
                                                show k) k dict)
  let fids = featIDs cs
      label = if nonew 
              then labelToken cs 
              else fst . head . clusterToken True cs
  return $ \ws -> let toks = zip ws ws
                      xs = examples fids [toks]
                  in map (conv . label) . head $ xs

clusterWords :: [Int] -> ClusterSet (Int,String) -> [String] -> [String]
clusterWords fids cs ws =
    let toks = zip ws ws
        xs = examples fids [toks]
    in map (show . fst . head . clusterToken True cs) . head $ xs

cluster :: Bool 
        -> ClusterSet (Int,String) 
        -> [X (Int, String)] 
        -> ClusterSet (Int, String)
cluster freeze = foldl' (\cs x -> snd . head . clusterToken freeze cs $ x) 
                        
clusterBeam :: Int 
            -> Bool 
            -> ClusterSet (Int,String)
            -> [X (Int, String)] 
            -> ClusterSet (Int,String)
clusterBeam sz freeze cs = 
    let step css x = map (snd . snd)
                     . take sz
                     . rank 
                     . concat 
                     $ [ take sz . clusterTok freeze cs $ x | cs <- css ]
    in head . foldl' step (return cs)

normalize :: (Ord k) => Map.Map k Double -> Map.Map k Double
normalize x = let s = Map.fold (+) 0 x in Map.map (/s) x

rescale xs = let a = (negate $ minimum xs)
                 xs' = map (+ a) xs
                 s = sum xs'
             in map (/s) xs'


predictX0Full :: ClusterSet (Int,String)
              -> X (Int,String)
              -> [String]
predictX0Full cs' = 
    let xy = Map.map normalize . countXY $ cs'
        unigram = normalize . Map.unionsWith (+) 
                  . map (Map.filterWithKey (\(i,_) _ -> i == 0))
                  . Map.elems 
                  . countXY 
                  $ cs'
        uniform = (1/) . fromIntegral . succ . Map.size . countY $ cs'
        ws = uniq 
             . map snd
             . filter ((==0) . fst)
             . concat 
             . Map.elems 
             . Map.map Map.keys 
             . countXY 
             $ cs'
        score0 = score cs'
    in \x ->
        let ycs = clusterToken True cs' . defocus $ x
            cs  = map snd ycs
            es = map ((score0 -) . score) cs
            ys' = map fst ycs
            (ys,ps) = unzip . rankNormalize . zip ys' $ es
            pyxs = Map.fromList . zip ys $ ps
        in map fst
               . sortBy (flip $ comparing snd)
                     $ [ (w, sum [ let px0y =   Map.findWithDefault 0 (0,w) 
                                             . Map.findWithDefault Map.empty y
                                             $ xy
                                       pyx =  Map.findWithDefault 0 y pyxs
                                   in  if y == nextID cs' then
                                           (unigram ! (0,w)) * uniform
                                       else 
                                       px0y * pyx 
                                   | y <- ys ]) 
                         | w <- ws ]

predictX0 :: ClusterSet (Int,String) 
          -> X (Int,String) 
          -> [String]
predictX0 cs = 
    let unigram =  Map.filterWithKey (\(i,w) _ -> i == 0) 
                  . Map.unionsWith (+) 
                  . Map.elems 
                  . countXY 
                  $ cs
    in \x ->     let ((y,CS { countXY = xy} ):_) = 
                         clusterToken True cs . defocus $ x
                     pred = Map.filterWithKey (\(i,w) _ -> i == 0) $ xy ! y 
                 in map fst 
                            . sortBy (flip $ comparing snd)
                            . map (\((0,w),c) -> (w,c))
                            . Map.toList
                            $ if Map.size pred < 1 then unigram else pred

clusterLabelToX0 :: ClusterSet (Int,String)
                 -> Y
                 -> [String]
clusterLabelToX0 cs = 
    let m = Map.map (map fst 
                     . sortBy (flip $ comparing snd)
                     . map (\((0,w),c) -> (w,c))
                     . Map.toList
                     . Map.filterWithKey (\(i,w) _ -> i == 0))
            . countXY
            $ cs
    in \y -> m ! y

examples :: [Int] -> [[Token]] -> [[X (Int, String)]]
examples fids train = [ [ Map.filterWithKey (\(fid,_) _ -> fid `elem` fids)
                         . features 
                         $ t 
                       | t <- take (length s) 
                                . iterate next 
                                . fromList 
                                . map fst
                                $ s ] | s <- train ]

display :: (Y,Map.Map (Int,String) Double) -> String
display (y,c) = 
    let cs = Map.toList c
        fis = uniq . map (fst . fst) $ cs
        grps = [ (fi,
                    sortBy (flip $ comparing snd) 
                    . map (\((_,fv),c) -> (fv,c))
                    . filter (\((fi',fv),c) -> fi' == fi) $ cs) | fi <- fis ]
    in show y ++ "\n" ++ 
       unlines [unwords $ [ take 3 $  show fi ++ (repeat ' ') ] 
                               ++ [ fi ++ ":" ++ show (ceiling fv) 
                                      | (fi,fv) <- take 20 fs ] 
               | (fi,fs) <- grps ] 

uniq = Set.toList . Set.fromList

defocus :: X (Int,String) -> X (Int,String) 
defocus = Map.filterWithKey (\(i,_) _ -> i /= 0)

getX0 :: X (Int,String) -> String
getX0 = maybe (error "Entropy.Algorithm.getX0: not found") 
              id      
        . lookup 0
        . Map.keys