{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | This program trains a parameter database for RNAwolf. The user has to
-- take care to give only appropriate training data to the optimizer. The most
-- important rule is to not give any pseudoknotted data. The small helper
-- program "MkTrainingData" should be able to take care of this.
-- "MkTrainingData" is part of BiobaseTrainingData.
--
-- We currently train using an optimization scheme described in:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- NOTE It is likely that this will again be extended with other methods in
-- the (near) future. Especially a convex-optimization-based system seems
-- promising (even though the Zakov et al. scheme is itself derived from
-- cvx methods). Right now, this version simply is faster.
--
-- TODO update the DB within IO to save creation / destruction of Params in
-- each iteration
--
-- TODO re-allow co-folding

module Main where

import Control.Monad
import System.Console.CmdArgs
import Text.Printf
import Data.List
import Data.Function (on)
import System.Random
import Control.Applicative
import Data.Ord
import Control.Arrow
import qualified Data.Vector.Unboxed as VU
import qualified Data.Map as M

import Biobase.Primary
import Biobase.TrainingData
import Biobase.TrainingData.Import
import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics
import Biobase.Secondary.Diagrams

import BioInf.Params as P
import BioInf.Params.Export as P
import BioInf.Params.Import as P
import BioInf.RNAwolf
import BioInf.PassiveAggressive
import BioInf.Keys



-- | Entry function

main :: IO ()
main = do
  o@Options{..} <- cmdArgs options
  when (null outDB) $ error "please set --outdb"
  when (null trainingData) $ error "please give at least one training data file with --trainingdata"
  -- read and filter the training data
  xs <- id
      . fmap (filter (\TrainingData{..} -> True
                     -- length primary > 20 &&
                     && all (/='&') primary -- no co-folding right now
                     -- length secondary > 5 -- at least 5 basepairs
                     ))
      . fmap (filter (lengthFlt maxLength))
      . fmap concat
      $ mapM fromFile trainingData
  -- read the input database or start from (near-)zero parameters
  dbIn <- maybe (return . P.fromList . map (+0.01) . P.toList $ P.zeroParams)
                (fmap read . readFile)
                inDB
  -- dbOut <- foldM (foldTD $ length xs) dbIn $ zip xs [1..]
  (dbOut,_) <- foldM (doIteration o xs) (dbIn,[]) [1..numIterations]
  writeFile outDB $ show dbOut

-- | length filter for training data: keep only elements whose primary
-- sequence is shorter than the given bound (if any)

lengthFlt :: Maybe Int -> TrainingData -> Bool
lengthFlt l TrainingData{..} = maybe True (length primary <) l
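-- A typical invocation might look like the line below. Only --outdb and
-- --trainingdata are checked explicitly in 'main'; the remaining flags are
-- derived by cmdArgs from the 'Options' record further down and are
-- optional. Binary and file names here are placeholders:
--
-- > RNAwolfTrain --outdb fresh.db --trainingdata sample.td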
-- | One full optimizer iteration over all training elements, in random order.

doIteration :: Options -> [TrainingData] -> (P.Params,[Double]) -> Int -> IO (P.Params,[Double])
doIteration o@Options{..} xs' (!p,rhos) !k = do
  xs <- shuffle xs'
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d starting\n" k numIterations
    putStrLn "======================================\n"
  (newp,totalchange,rhosum,cooptimality) <- foldM (foldTD o $ length xs) (p,0,0,0) $ zip xs [1..]
  let drctch = sum $ zipWith (\x y -> abs $ x-y) (P.toList p) (P.toList newp)
  let rho = rhosum / genericLength xs
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d ended\n" k numIterations
    printf "# INFO sum tau: %7.2f, total change: %7.2f, avg.rho: %4.2f, avg.coopt: %5d\n"
      totalchange drctch rho (cooptimality `div` length xs)
    putStr "# INFO history:"
    zipWithM_ (printf " %4d %4.2f") [1::Int ..] $ rhos++[rho]
    putStrLn ""
    print $ sum $ map abs $ P.toList newp
    print $ minimum $ P.toList newp
    print $ maximum $ P.toList newp
    putStrLn "======================================\n"
  -- checkpoint the parameters of this iteration
  writeFile (printf "%04d.db" k) . show $ newp
  return (newp,rhos++[rho])

-- | Fold one 'TrainingData': predict with the current parameters, then
-- perform one passive-aggressive update towards the known structure.

foldTD :: Options -> Int -> (P.Params,Double,Double,Int) -> (TrainingData,Int) -> IO (P.Params,Double,Double,Int)
foldTD o@Options{..} total (!p,accChange,rhosum,cooptimality) (td@TrainingData{},k) = do
  print $ length $ primary td
  let pri = mkPrimary $ primary td
  let tables = rnaWolf p pri
  let bs' = let f x = td{predicted = x}
            in  map (first f) . take (maybe 1 id maxLoss)
              $ rnaWolfBacktrack p pri 0.00001 tables
  -- of the (up to maxLoss) co-optimal backtracking results, continue with the
  -- one of worst sensitivity; with the default of a single result this is
  -- simply the prediction
  let bs = pure $ minimumBy (comparing (sensitivity . mkConfusionMatrix . fst)) bs'
  case bs of
    [(x,ddd)] -> do
      let fV  = featureVector (primary x) (predicted x)
      let pVU = VU.fromList . P.toList $ p
      let sss = map (pVU VU.!) fV
      -- consistency check: the backtracking score has to equal the sum of the
      -- parameter entries selected by the feature vector
      when (abs (ddd - sum sss) > 0.0001) $ do
        printf "SCORE DIFFERENCE, backtracking score: %f, sum features: %f\n" ddd (sum sss)
        mapM_ print $ zip (map (vks M.!) fV) sss
        print "You have found a bug, now write choener to have him fix it!"
      let (newp,tau,rho,votes) = defaultPA aggressiveness p $ x
            { comments = [ show ddd
                         , simpleViewer (primary x) $ secondary x
                         , simpleViewer (primary x) $ predicted x
                         , show $ predicted x
                         ]
            }
      when (Single `elem` verbose) $ do
        printf "# INFO currently at: %4d / %4d (s-tau: %7.4f, rho: %5.2f, changes: %4d)\n"
          k total (tau * genericLength votes) rho (length votes)
      when (Detailed `elem` verbose) $ do
        putStrLn $ take (length $ primary x) . concatMap show . concat . repeat $ [0..9]
        putStrLn $ primary x
        putStrLn $ simpleViewer (primary x) $ secondary x
        putStrLn $ simpleViewer (primary x) $ predicted x
      when (AllPairs `elem` verbose) $ do
        mapM_ print $ predicted x
      when (errorOnError && abs (ddd - sum sss) > 0.0001) $ error "error-ing out"
      return (newp, accChange + tau * genericLength votes, rhosum+rho, cooptimality + length bs')
    _ -> error $ "no prediction for: " ++ show td
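-- | NOTE Illustrative sketch only, not called anywhere above: the actual
-- update is performed by 'defaultPA' from BioInf.PassiveAggressive, whose
-- internals are not shown here. This demonstrates the passive-aggressive
-- (PA-I style) rule from Zakov et al. on plain weight lists: with gold
-- features g, predicted features q, and loss l, set w <- w + tau * (g - q),
-- where tau = min C (l / ||g-q||^2) and C is the aggressiveness parameter
-- (cf. the --aggressiveness flag).

paSketch
  :: Double             -- ^ aggressiveness C, caps tau
  -> Double             -- ^ loss suffered by the prediction
  -> [Double]           -- ^ current weights w
  -> [Double]           -- ^ feature counts of the known structure
  -> [Double]           -- ^ feature counts of the predicted structure
  -> ([Double],Double)  -- ^ updated weights and the tau that was used
paSketch c loss w g q = (zipWith3 upd w g q, tau)
  where
    upd wi gi qi = wi + tau * (gi-qi)
    d   = zipWith (-) g q
    nrm = sum $ map (^2) d
    tau | nrm == 0  = 0  -- prediction equals gold: stay passive, no change
        | otherwise = min c (loss / nrm)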
= pre ++ "X" ++ post | otherwise = pre ++ [c] ++ post where pre = take k str l = head $ drop k str post = drop (k+1) str -- ** program options data Options = Options { inDB :: Maybe FilePath , outDB :: FilePath , trainingData :: [FilePath] , maxLength :: Maybe Int , numIterations :: Int , verbose :: [Verbose] , maxLoss :: Maybe Int , aggressiveness :: Double , errorOnError :: Bool } deriving (Show,Data,Typeable) data Verbose = Iteration | Single | Detailed | AllPairs deriving (Show,Data,Typeable,Eq) options = Options { inDB = Nothing &= help "database from which to continue optimizing; if none is given, start from scratch" , outDB = "" &= help "new database to write out" , trainingData = [] &= help "training data elements to read" , maxLength = Nothing &= help "[dev] only train using elements of length or less" , numIterations = 50 &= help "how many optimizer iterations" , verbose = [Iteration] &= help "select verbosity options: single, iteration, detailed (all switch on different verbosity options)" , maxLoss = Nothing &= help "use maxLoss optimization instead of prediction-based, requires maximal number of instances to search for maxLoss (default: not used)" , aggressiveness = 1 &= help "maximal tau for each round" , errorOnError = False &= help "error out if an error is detected (default: false)" } -- ** helper functions -- | simple shuffling of a list shuffle :: [a] -> IO [a] shuffle [] = return [] shuffle xs = do r <- getStdRandom (randomR (0,length xs -1)) let (hs,ts) = splitAt r xs let y = head ts ys <- shuffle $ hs ++ tail ts return $ y : ys