{-# LANGUAGE RecordWildCards #-}

-- | Passive-aggressive optimization. Mainly based on:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- and
--
-- Crammer, Koby et al.
-- "Online Passive-Aggressive Algorithms"
-- Journal of Machine Learning Research (2006)
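--
-- The PA-I step of Crammer et al. sets the update size to
-- @tau = min C (loss / ||dPhi||^2)@, where @dPhi@ is the difference of the
-- two feature vectors and @C@ bounds the step. Below, 'aggressiveness'
-- plays the role of @C@, the loss additionally carries a @sqrt (1 - F_1)@
-- cost term, and 'changes' is the sparse representation of the update.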
--
-- TODO as always: move out of here and put in its own library

module BioInf.PassiveAggressive where

import qualified Data.Vector.Unboxed as VU
import Data.List as L
import qualified Data.Set as S
import qualified Data.Map as M
import Text.Printf

import Biobase.TrainingData
import BioInf.Keys

import qualified BioInf.Params as P
import qualified BioInf.Params.Import as P
import qualified BioInf.Params.Export as P

import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics

import Data.PrimitiveArray as PA
import Data.PrimitiveArray.Ix



-- | Default implementation of P/A. Returns the updated parameters, the step
-- size @tau@, the similarity 'sty' (currently the F_1 measure), and the
-- sparse list of weight changes that was applied.

defaultPA :: Double -> P.Params -> TrainingData -> (P.Params,Double,Double,[(Int,Double)])
defaultPA aggressiveness params td@TrainingData{..}
  | L.null $ pOnly++kOnly = (params,0,1,[])
  | sty >= 0.999 = (params,0,1,[])
  | otherwise = ( new
                , tau
                , sty
                , changes
                )
  where
    -- apply the sparse weight changes to the current parameter vector
    new = P.fromList . VU.toList $ VU.accum (+) cur changes
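    -- e.g. VU.accum (+) (VU.fromList [0,0,0]) [(1,0.5),(1,0.25)] yields
    -- fromList [0.0,0.75,0.0]; duplicate indices accumulate.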
    pFeatures = featureVector primary predicted
    kFeatures = featureVector primary secondary
    pOnly = pFeatures L.\\ kFeatures
    kOnly = kFeatures L.\\ pFeatures
    numChanges = genericLength $ pOnly ++ kOnly
    changes = zip kOnly (repeat $ negate tau) ++ zip pOnly (repeat tau)
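    -- e.g. pFeatures = [1,2,5] and kFeatures = [2,3] give pOnly = [1,5],
    -- kOnly = [3], and changes = [(3,-tau),(1,tau),(5,tau)]
    -- (illustrative indices only).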
    cur = VU.fromList . P.toList $ params
    pScore = sum . L.map (cur VU.!) $ pFeatures
    kScore = sum . L.map (cur VU.!) $ kFeatures
    -- weight (step size) calculation. The prediction is score-optimal under
    -- the current parameters, so the known structure scoring below it by
    -- more than 'epsilon' indicates a broken prediction and is a hard error.
    tau
      | kScore + epsilon < pScore
          = error $ "S(known) < S(predicted)\n" ++ errorKnownTooGood td cur kFeatures pFeatures
      | sty >= 0.999
          = 0
      | otherwise
          = val
      where
        -- NOTE Crammer et al. divide by the squared norm of the feature
        -- difference, which for the +-1 entries here equals 'numChanges';
        -- squaring again yields smaller, more conservative steps.
        val = min aggressiveness $ (kScore - pScore + sqrt (1-sty)) / (numChanges ^ 2)
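        -- Illustrative numbers: kScore - pScore = 0.8, sty = 0.75 (so
        -- sqrt (1-sty) = 0.5) and numChanges = 4 give
        -- val = min aggressiveness (1.3 / 16) = min aggressiveness 0.08125.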
    -- similarity of known and predicted structure; currently the F_1 measure.
    -- An undefined F-measure (Left) is treated as a perfect prediction.
    sty = case fmeasure (mkConfusionMatrix td) of
            Left  _ -> 1
            Right v -> v
    -- special constants: 'epsilon' is the slack allowed between S(known) and
    -- S(predicted) before the guard in 'tau' raises an error
    epsilon = 0.1
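
-- | A minimal sketch, not part of the original API, of how 'defaultPA' could
-- drive one online training pass. The prediction step is passed in as a
-- function, since producing 'predicted' under the current parameters happens
-- outside of this module.

trainEpoch
  :: (P.Params -> TrainingData -> TrainingData) -- ^ re-predict under current parameters
  -> Double                                     -- ^ aggressiveness @C@
  -> P.Params                                   -- ^ initial parameters
  -> [TrainingData]
  -> P.Params
trainEpoch predict aggressiveness = L.foldl' step where
  step ps td = case defaultPA aggressiveness ps (predict ps td) of
                 (ps',_tau,_sty,_changes) -> ps'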

-- | If the known structure scores more than 'epsilon' better than the
-- predicted one, we have an error condition: the prediction is supposed to
-- be optimal under the current parameters, so this should never happen.

errorKnownTooGood :: TrainingData -> VU.Vector Double -> [Int] -> [Int] -> String
errorKnownTooGood TrainingData{..} curPs kFeatures pFeatures = z where
  z =  printf "S(known) = %7.4f, S(pred) = %7.4f, S(known) - S(pred) = %7.4f\n"
        kScore pScore (kScore - pScore)
    ++ printf "%s\n%s\n" primary (intercalate "\n" comments)
  kScore = sum . L.map (curPs VU.!) $ kFeatures
  pScore = sum . L.map (curPs VU.!) $ pFeatures

-- | Pull in the statistical interface. From the confusion matrix, we
-- automagically get everything we need.
--
-- NOTE Unfortunately, StatisticalMethods has heavy dependencies.
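--
-- As a worked example with illustrative numbers: a primary sequence of
-- length 10 admits 10*9/2 = 45 candidate pairs; with known pairs
-- [(1,8),(2,7)] and predicted pairs [(1,8),(3,6)] we get tp = 1, fn = 1,
-- fp = 1, and tn = 45 - 3 = 42.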

instance MkConfusionMatrix TrainingData where
  mkConfusionMatrix TrainingData{..} = ConfusionMatrix
    { fn = Right . fromIntegral . S.size $ k `S.difference` p
    , fp = Right . fromIntegral . S.size $ p `S.difference` k
    , tn = Right . fromIntegral $ allPs - S.size (k `S.union` p)
    , tp = Right . fromIntegral . S.size $ k `S.intersection` p
    } where
        k = S.fromList secondary
        p = S.fromList predicted
        -- all possible pairs over the primary sequence: n choose 2
        allPs = let n = length primary in n * (n-1) `div` 2