{-# LANGUAGE RecordWildCards #-}

-- | Passive-aggressive optimization. Mainly based on:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- and
--
-- Crammer, Koby et al.
-- "Online Passive-Aggressive Algorithms"
-- Journal of Machine Learning Research (2006)
--
-- TODO as always: move out of here and put into its own library

module BioInf.PassiveAggressive where

import Control.Arrow
import Control.DeepSeq
import Control.Parallel (pseq)
import Data.List as L
import Data.Map as M
import Data.Set as S
import qualified Data.Vector.Unboxed as VU
import Text.Printf

import Biobase.TrainingData

import BioInf.Keys
import qualified BioInf.Params as P
import qualified BioInf.Params.Export as P
import qualified BioInf.Params.Import as P

import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics

-- | Default implementation of P/A. We return a data structure that contains
-- all 'changes' required from this run, the 'enerDif' or energy difference
-- between the known and the predicted structure, and a structural difference
-- score. Furthermore, any errors are reported in 'errors'.
--
-- The energy difference can be (i) negative; in that case, a wrongly
-- predicted structure has better (lower) energy than the known one. (ii) It
-- can be zero; then we have either found a co-optimal structure or the
-- correct structure. (iii) In some cases, it can be positive; this is a
-- formal error, but will not abort the program. (The calling program may opt
-- to abort on @not . null $ errors@.)
--
-- The structural difference is in [0..1], with "0" for structurally
-- identical predicted and known structures, growing toward "1" for bad
-- predictions where nothing is correct.

defaultPA :: Double -> P.Params -> TrainingData -> PA
defaultPA aggressiveness params td@TrainingData{..}
  | L.null $ pOnly ++ kOnly = PA
      { changes = []
      , enerDif = edif
      , accMeas = struc
      , errors  = []
      }
  | struc >= 0.999 = PA
      { changes = []
      , enerDif = edif
      , accMeas = struc
      , errors  = []
      }
  | otherwise = PA
      { changes = changes
      , enerDif = edif
      , accMeas = struc
      , errors  = eError
      }
  where
    -- calculate changes
    pFeatures = featureVector primary predicted
    kFeatures = featureVector primary secondary
    pOnly = pFeatures L.\\ kFeatures
    kOnly = kFeatures L.\\ pFeatures
    numChanges = genericLength $ pOnly ++ kOnly
    changes = zip kOnly (repeat $ negate tau) ++ zip pOnly (repeat tau)
    cur = VU.fromList . P.toList $ params
    pScore = sum . L.map (cur VU.!) $ pFeatures
    kScore = sum . L.map (cur VU.!) $ kFeatures
    edif = kScore - pScore
    eError = if edif <= 0
               then []
               else [ "S(known) > S(predicted)\n"
                      ++ errorKnownTooGood td cur kFeatures pFeatures ]
    struc = case fmeasure (mkConfusionMatrix td) of -- currently optimizing using F_1
              Left  _ -> 1
              Right v -> v
    -- weight calculation
    tau = min aggressiveness
        $ ((min 0 $ kScore - pScore) + sqrt (1 - struc)) / (numChanges ^ 2)

-- | In case the known structure has a score 'epsilon' better than the
-- predicted one, we have an error condition, as this should never be the
-- case.

errorKnownTooGood TrainingData{..} curPs kFeatures pFeatures = z where
  z = printf "S(known) = %7.4f, S(pred) = %7.4f, S(known) - S(pred) = %7.4f\n"
        kScore pScore (kScore - pScore)
      ++ printf "%s\n%s\n" primary (concat $ intersperse "\n" comments)
  kScore = sum . L.map (curPs VU.!) $ kFeatures
  pScore = sum . L.map (curPs VU.!) $ pFeatures
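-- | NOTE The following helper is an illustrative sketch, not part of the
-- original module: it shows how the @(index, change)@ pairs collected in
-- 'changes' could be folded back into a flat parameter vector, assuming the
-- same unboxed-vector representation used for 'cur' in 'defaultPA'. The name
-- 'applyChanges' is hypothetical.

applyChanges :: VU.Vector Double -> [(Int,Double)] -> VU.Vector Double
applyChanges ps cs = VU.accum (+) ps cs  -- add each change at its index

-- usage (sketch): applyChanges (VU.fromList . P.toList $ params) (changes pa)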
-- | Return a lot of information from each P/A call. We do not return the new
-- 'Params' anymore, only a list of changes. This allows us to do some
-- things: if the implementation of 'Params' is switched, we can update in
-- place; or we can perform calculations in parallel.

data PA = PA
  { changes :: [(Int,Double)]  -- (index of change, change)
  , enerDif :: Double          -- how much pressure from a wrong energy difference
  , accMeas :: Double          -- accuracy measure
  , errors  :: [String]        -- if something strange happens
  } deriving (Show)

instance NFData PA where
  rnf PA{..} = rnf changes `pseq` rnf enerDif `pseq` rnf accMeas `pseq` rnf errors

-- * Instances

-- | Pull in the statistical interface. From the confusion matrix, we
-- automagically get everything we need.
--
-- NOTE Unfortunately, StatisticalMethods has heavy dependencies.

instance MkConfusionMatrix TrainingData where
  mkConfusionMatrix TrainingData{..} = ConfusionMatrix
    { fn = Right . fromIntegral . S.size $ k `S.difference`   p
    , fp = Right . fromIntegral . S.size $ p `S.difference`   k
    , tn = Right . fromIntegral $ allPs - S.size (k `S.union` p)
    , tp = Right . fromIntegral . S.size $ k `S.intersection` p
    } where
        k = S.fromList secondary
        p = S.fromList predicted
        allPs = (length primary * (length primary - 1)) `div` 2
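-- | NOTE An illustrative sketch, not part of the original module: one
-- sequential pass of P/A training over a list of 'TrainingData', folding the
-- 'changes' of each call back into the parameters via 'applyChanges' above.
-- The name 'trainEpoch' is hypothetical, and 'P.fromList' is assumed to
-- exist in BioInf.Params.Import as the inverse of 'P.toList'.

trainEpoch :: Double -> P.Params -> [TrainingData] -> P.Params
trainEpoch aggressiveness = L.foldl' step where
  step ps td = P.fromList                -- assumed inverse of 'P.toList'
             . VU.toList
             . flip applyChanges (changes $ defaultPA aggressiveness ps td)
             . VU.fromList
             . P.toList
             $ ps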