{-# LANGUAGE ScopedTypeVariables, ExplicitForAll, MultiParamTypeClasses, FlexibleInstances, FlexibleContexts, UndecidableInstances, BangPatterns #-}

{-|
Module: Text.PhonotacticLearner.MaxentGrammar
Description: Functions to model maxent grammars using DFSTs.
Copyright: © 2016-2017 George Steel and Peter Jurgec
License: GPL-2+
Maintainer: george.steel@gmail.com

Library for using DFAs to represent maxent grammars. A maxent grammar consists of a set of constraints, each of which is given a weight; together these define a probability distribution over the set of strings of each given length. The relative probability (maxent score) of each string is the negative exponential of the total weight of the constraints it violates. In this module, such a grammar is represented by a 'DFST' which counts violations and a 'Vec' of weights.

This module is mainly concerned with calculating the probabilities of samples of text and finding the optimal weights to maximize those probabilities. There are also functions to randomly generate text using the distribution implied by a maxent grammar.
-}

module Text.PhonotacticLearner.MaxentGrammar (
    Length, Lexicon(..), sortLexicon, lengthCdf, lengthPdf,
    maxentProb,
    lexLogProbTotalDeriv, lexLogProbPartialDeriv,
    llpOptimizeWeights,
    sampleWord, sampleWordSalad
) where

import Text.PhonotacticLearner.Util.Ring
import Text.PhonotacticLearner.DFST
import Text.PhonotacticLearner.Util.Probability
import Text.PhonotacticLearner.Util.ConjugateGradient

import Data.Array.IArray
import Data.Array.Unboxed
import Control.Monad
import Control.Monad.State
--import Control.Monad.Trans.Class
import Control.Arrow ((&&&),(***))
import System.Random
import Data.List as L
import Data.Monoid
import Data.Maybe
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as V

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

-- | Apply weights to violation counts to get a relative probability (maxent score).
maxentProb :: Vec -> Multicount -> Double
maxentProb !weights !counts = exp . negate $ innerProd weights (fromMC counts)
{-# INLINE maxentProb #-}

type Length = Int

-- | A lexicon to train a grammar on, stored as a total word count, a
-- distribution of word lengths, and a sorted list of words with their
-- frequencies.
data Lexicon sigma = Lex { totalWords :: Int
                         , lengthFreqs :: Array Length Int
                         , wordFreqs :: [([sigma], Int)]
                         } deriving Show

-- | Convert a jumbled list of words and frequencies to a sorted lexicon.
sortLexicon :: (Ord sigma) => [([sigma],Int)] -> Lexicon sigma
sortLexicon wfs = Lex twords alengths awf
    where
        mwf = M.fromListWith (+) wfs
        awf = M.assocs mwf
        mlengths = M.mapKeysWith (+) length mwf
        maxlen = fst (M.findMax mlengths)
        alengths = accumArray (+) 0 (0,maxlen) (M.assocs mlengths)
        twords = sum (M.elems mlengths)

-- | Retrieve the length distribution as a 'Cdf' for sampling.
lengthCdf :: Lexicon sigma -> Cdf Length
lengthCdf = massToCdf . assocs . fmap fromIntegral . lengthFreqs

-- | Retrieve the length distribution as a normalized probability mass function. Probabilities add up to 1.
lengthPdf :: Lexicon sigma -> [(Length, Double)]
lengthPdf wfs = assocs . fmap fracOfTotal . lengthFreqs $ wfs
    where fracOfTotal k = fromIntegral k / fromIntegral (totalWords wfs)
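-- A minimal usage sketch for the lexicon accessors above (the words and
-- frequencies are invented for illustration). Duplicate entries are merged by
-- 'sortLexicon', so @exampleLex@ contains six words of length 2 and one of
-- length 3:
--
-- > exampleLex :: Lexicon Char
-- > exampleLex = sortLexicon [("ab",3), ("ba",2), ("aba",1), ("ab",1)]
-- >
-- > -- lengthPdf exampleLex == [(0,0.0),(1,0.0),(2,6/7),(3,1/7)]
-- > -- lengthCdf exampleLex gives the same distribution packaged for sampling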
-- Adds the gradient of the exponential prior (one per constraint) to the
-- log-probability gradient, clamping the derivative of near-zero weights so
-- the search does not push them negative.
priorDeriv :: Vec -> Vec -> Vec
priorDeriv (Vec !weights) (Vec !dlp) = Vec $ V.zipWith (\w d -> if w < 0.01 then min (d+1) 0 else d+1) weights dlp
{-# INLINE priorDeriv #-}

-- | For a given set of constraints (represented by a DFST counting violations),
-- lexicon (represented as a length distribution and total violation count, which should be precomputed),
-- and weight vector, returns the absolute probability (as a negative logarithm) and its derivative with respect to the weight vector.
--
-- Minimize this to find the optimal weights.
-- To prevent overfitting, this function includes an exponential (L₁) prior equivalent to each constraint being violated once for existing.
-- This intentionally differs from Hayes and Wilson, whose gaussian (L₂²) prior strongly prefers many similar constraints over a single equivalent one. The exponential prior was chosen since it is invariant under splitting a constraint into duplicates with the weight distributed between them.
lexLogProbTotalDeriv :: (Ix sigma) => MulticountDFST sigma -- ^ DFST counting constraint violations
                                   -> Array Length Int     -- ^ Length distribution of lexicon
                                   -> Vec                  -- ^ Observed violations in lexicon
                                   -> Vec                  -- ^ Weights to give constraints
                                   -> (Double, Vec)        -- ^ Probability and its derivative w.r.t. the weights
lexLogProbTotalDeriv !ctr !lengths !oviols !weights = (totalViolWeight + totalNormalizer + prior, priorDeriv weights (oviols ⊖ expviols))
    where
        --prior = innerProd weights weights / 2
        prior = l1Vec weights
        edfa = weightExpVec ctr weights
        (_,maxlen) = bounds lengths
        exps = expsByLengthVec edfa maxlen
        totalViolWeight = innerProd oviols weights
        totalNormalizer = sum . fmap (\(l,n) -> n ⊙ log (prob (exps ! l))) . assocs $ lengths
        expviols = sumR . fmap (\(l,n) -> n ⊙ normalizeExp (exps ! l)) . assocs $ lengths

-- | Compute a partial derivative of the lexicon probability. Much faster equivalent of
--
-- > lexLogProbPartialDeriv ctr lengths oviols weights dir = dir `innerProd` snd (lexLogProbTotalDeriv ctr lengths oviols weights)
lexLogProbPartialDeriv :: (Ix sigma) => MulticountDFST sigma -> Array Length Int -> Vec -> Vec -> Vec -> Double
lexLogProbPartialDeriv !ctr !lengths !oviols !weights !dir = innerProd (dl1Vec weights) dir + innerProd dir oviols - expviols
    where
        edfa = weightExpPartial ctr weights dir
        (_,maxlen) = bounds lengths
        exps = expsByLengthDouble edfa maxlen
        expviols = sumR . fmap (\(l,n) -> n ⊙ normalizeExp (exps ! l)) . assocs $ lengths

-- Clamp small weights to exactly zero; the flag reports whether any weight
-- had gone significantly negative.
zeroNeg :: Vec -> (Vec, Bool)
zeroNeg (Vec v) = (Vec (V.map (\x -> if x < 0.01 then 0 else x) v), V.any (\x -> x /= 0 && x < -0.01) v)

-- | Calculate the weights which maximize the probability of the lexicon.
-- Takes the starting position of the search, which MUST have the correct dimensionality (do not use 'zero').
llpOptimizeWeights :: (Ix sigma) => Array Length Int     -- ^ Length distribution
                                 -> PackedText sigma     -- ^ Packed lexicon
                                 -> MulticountDFST sigma -- ^ Constraint violation counter
                                 -> Vec                  -- ^ Initial guess at weights
                                 -> Vec                  -- ^ Optimal weights
llpOptimizeWeights lengths pwfs dfa initweights =
    let oviols = fromMC (transducePackedMulti dfa pwfs)
    in conjugateGradientSearch True
                               (0.01, 0.005)
                               zeroNeg
                               (lexLogProbTotalDeriv dfa lengths oviols)
                               (lexLogProbPartialDeriv dfa lengths oviols)
                               initweights
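-- A training sketch, assuming the packed lexicon @pwfs@ and the violation
-- counter @ctr@ were built beforehand (the packing and counting machinery
-- lives in "Text.PhonotacticLearner.DFST") and that @nCons@ is the number of
-- constraints. The flat initial guess wraps an unboxed vector in the 'Vec'
-- constructor, matching its use in 'priorDeriv' above:
--
-- > trainedWeights :: Int -> Lexicon Char -> PackedText Char -> MulticountDFST Char -> Vec
-- > trainedWeights nCons lex pwfs ctr =
-- >     llpOptimizeWeights (lengthFreqs lex) pwfs ctr (Vec (V.replicate nCons 1))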
--------------------------------------------------------------------------------

-- Used for statistical calculations over an entire DFA with ring weights
-- (e.g. probabilities).

-- Given an array mapping states to weights (e.g. a maxent distribution),
-- gives the new distribution after transducing one additional character.
stepweights :: (Ix q, Ix sigma, Semiring k) => DFST q sigma k -> Array q k -> Array q k
stepweights dfa@(DFST _ tm _) prev = accumArray (⊕) zero sbound (fmap pathweight (range (bounds tm)))
    where
        sbound = stateBounds dfa
        pathweight (s,c) = let (ns,w) = tm!(s,c) in (ns, (prev!s) ⊗ w)

-- Gives an array from states to weights with one at the initial state and zero elsewhere.
initialWeightArray :: (Ix l, Ix sigma, Semiring w) => DFST l sigma w -> Array l w
initialWeightArray dfa = fnArray (stateBounds dfa) (\x -> if x == initialState dfa then one else zero)

-- Converts to an NFA with all the arrows reversed.
reverseTM :: (Ix q, Ix sigma) => DFST q sigma k -> Array (q,sigma) [(q,k)]
reverseTM (DFST _ arr _) = accumArray (flip (:)) [] (bounds arr) (fmap (\((s,c),(s',w)) -> ((s',c),(s,w))) (assocs arr))

-- | Returns a monadic action to sample random words from a probability transducer,
-- which may be generated from a violation counter with @('fmap' ('maxentProb' weights) ctr)@.
-- For efficiency, evaluate this once and then sequence the resulting action repeatedly, as intermediate values will be memoized.
sampleWord :: forall g sigma m . (RandomGen g, Ix sigma, MonadState g m)
           => DFST Int sigma Double -- ^ Probability DFST
           -> Length                -- ^ Maximum length to create the generator for
           -> (Length -> m [sigma]) -- ^ Random generator taking a length and returning an action
sampleWord dfa maxn = backDists `seq` \n -> do
        fs <- sampleCdf (finalStates ! n)
        rcs <- flip evalStateT fs . forM (reverse . range $ (1, min n maxn)) $ \k -> do
            s <- get
            (c,s') <- lift . sampleCdf $ backDists!(k,s)
            put s'
            return c
        return (reverse rcs)
    where
        backnfa = reverseTM dfa
        qbound = stateBounds dfa

        maxentPrefixes = take (maxn + 1) (iterate (stepweights dfa) (initialWeightArray dfa))
        maxentArray :: UArray (Int,Int) Double
        maxentArray = array ((0,maxn) `xbd` qbound) . join . snd . mapAccumL (\k a -> (k+1, fmap (\(x,p)->((k,x),p)) (assocs a))) 0 $ maxentPrefixes

        backDist :: (Int, Int) -> Cdf (sigma, Int)
        backDist (k, s) = massToCdf $ do
            c <- range (segBounds dfa)
            (s', w) <- backnfa!(s,c)
            return ((c,s'), w * (maxentArray!(k-1,s')))
        -- memoized version of backDist
        backDists :: Array (Int, Int) (Cdf (sigma, Int))
        backDists = fnArray ((1,maxn) `xbd` qbound) backDist

        finalStates :: Array Int (Cdf Int)
        finalStates = array (1,maxn) $ do
            n <- range (1,maxn)
            let cdf = massToCdf $ do
                    s <- range qbound
                    return (s, maxentArray!(n,s) * finalWeights dfa!s)
            return (n,cdf)

-- | Like 'sampleWord' but generates multiple words. Takes the length distribution as a 'Cdf' and the number of words to generate.
sampleWordSalad :: (RandomGen g, Ix sigma, MonadState g m) => DFST Int sigma Double -> Cdf Length -> Int -> m [[sigma]]
sampleWordSalad dfa lengthdist samples = mapM sampler wordlenlist
    where
        wordlen = uniformSample lengthdist samples
        maxn = maximum (fmap fst wordlen)
        sampler = sampleWord dfa maxn
        wordlenlist = wordlen >>= uncurry (flip replicate)
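-- A sampling sketch (hypothetical bindings: @probDfa@ is a probability
-- transducer obtained as described in the 'sampleWord' documentation, and
-- @lex@ is the training lexicon): draw 100 words whose lengths follow the
-- lexicon's length distribution, threading a random generator through the
-- State monad.
--
-- > genSalad :: RandomGen g => DFST Int Char Double -> Lexicon Char -> g -> [[Char]]
-- > genSalad probDfa lex = evalState (sampleWordSalad probDfa (lengthCdf lex) 100)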