{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | This program trains a parameter database for RNAwolf. The user has to
-- take care to give only appropriate training data to the optimizer. The most
-- important rule is to not give any pseudoknotted data. The small helper
-- program "MkTrainingData" should be able to take care of this.
-- "MkTrainingData" is part of BiobaseTrainingData.
--
-- We currently train using the optimization scheme described in:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- NOTE It is likely that this will again be extended with other methods in
-- the (near) future. Especially a convex-optimization-based system seems
-- promising (even though the Zakov et al. scheme is itself derived from
-- cvx methods). Right now, this version simply is faster...
--
-- TODO update the DB within IO to save creation / destruction of Params in
-- each iteration
--
-- TODO re-allow co-folding

module Main where

import Control.Applicative
import Control.Arrow
import Control.Monad
import Control.Parallel (pseq)
import Control.Parallel.Strategies
import Data.Function (on)
import Data.List
import Data.List.Split (splitEvery)
import Data.Ord
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as VU
import System.Console.CmdArgs
import System.Random
import Text.Printf
import System.IO (hFlush,stdout)

import Biobase.Primary
import Biobase.Secondary.Constraint
import Biobase.Secondary.Diagrams
import Biobase.TrainingData
import Biobase.TrainingData.Import
import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics

import BioInf.Keys
import BioInf.Params as P
import BioInf.Params.Export as P
import BioInf.Params.Import as P
import BioInf.PassiveAggressive
import BioInf.RNAwolf

-- | Entry function

main :: IO ()
main = do
  o@Options{..} <- cmdArgs options
  when (null outDB) $ error "please set --outdb"
  when (null trainingData) $ error "please give at least one training data file with --trainingdata"
  -- read training data, keeping only elements that pass the filters
  xs <- id
      . fmap (filter (\TrainingData{..} -> True
                        -- length primary > 20 &&
                        && all (/='&') primary -- no co-folding right now
                        -- length secondary > 5 -- at least 5 basepairs
                     ))
      . fmap (filter (lengthFlt maxLength))
      . fmap concat
      $ mapM fromFile trainingData
  -- read an existing database, or start from (slightly offset) zero parameters
  dbIn <- maybe (return . P.fromList . map (+0.01) . P.toList $ P.zeroParams)
                (fmap read . readFile)
                inDB
--  dbOut <- foldM (foldTD $ length xs) dbIn $ zip xs [1..]
  (dbOut,_) <- foldM (doIteration o xs) (dbIn,[]) [1..numIterations]
  writeFile outDB $ show dbOut

-- | length filter for training data

lengthFlt l TrainingData{..} = maybe True (length primary <) l

-- | one optimizer iteration: shuffle the training data, fold it in chunks of
-- size 'parallelism', and extend the rho history

doIteration :: Options -> [TrainingData] -> (P.Params,[Double]) -> Int -> IO (P.Params,[Double])
doIteration o@Options{..} xs' (!p,rhos) !k = do
  xs <- fmap (splitEvery parallelism) $ shuffle xs'
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d starting\n" k numIterations
    printf "# INFO folding %d elements, maximal length: %d\n" (length xs') (maximum $ map (length . primary) xs')
    putStrLn "======================================\n"
    hFlush stdout
  -- 1-based (from,to) index ranges of the chunks, used for progress output
  let indices = mapAccumL (\acc x -> (acc+x,(acc+1,acc+x))) 0 $ map length xs
  (newp,rs) <- foldM (foldTD o $ length xs) (p,[]) . zip xs . snd $ indices
  -- total absolute change of the parameter vector (currently unused diagnostic)
  let drctch = sum $ zipWith (\x y -> abs $ x-y) (P.toList p) (P.toList newp)
  let rhosum = sum $ map accMeas rs
  let rho = rhosum / (sum . map genericLength $ xs)
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d ended, rho: %4.2f (%4.2f, %4.2f)\n" k numIterations rho (minimum $ map accMeas rs) (maximum $ map accMeas rs)
    putStr "# INFO rho history:"
    zipWithM_ (printf " %4d %4.2f") [1::Int ..] $ rhos++[rho]
    putStrLn ""
    putStrLn "======================================\n"
    hFlush stdout
  writeFile (printf "%04d.db" k) . show $ newp
  return (newp,rhos++[rho])
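-- The actual parameter update lives in 'defaultPA' (BioInf.PassiveAggressive)
-- and is not shown here. For orientation, the sketch below computes the step
-- size of a standard PA-I passive-aggressive update (Crammer et al. 2006),
-- the family of updates the Zakov et al. scheme builds on. It is a minimal
-- illustration under that assumption only: 'paTauSketch' and its arguments
-- are hypothetical names, and nothing in BioInf.PassiveAggressive is required
-- to look like this.

paTauSketch
  :: Double           -- ^ aggressiveness C, cf. the "maximal tau" help text of 'aggressiveness'
  -> Double           -- ^ loss suffered by the current prediction
  -> VU.Vector Double -- ^ feature difference between known and predicted structure
  -> Double           -- ^ step size tau: correct the mistake, but never move more than C
paTauSketch c loss phi = min c (loss / normSq) where
  -- squared norm of the feature difference; guard against an all-zero vector
  normSq = max 1e-9 . VU.sum $ VU.map (^(2::Int)) phi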
-- | Fold one 'TrainingData' element and return the suggested changes and
-- additional information.

foldOne :: Options -> P.Params -> TrainingData -> (TrainingData,PA)
foldOne o@Options{..} p td
  | null bs   = (td, PA [] 0 0 ["no prediction for: " ++ primary td])
  | otherwise = (fst worst, ret)
  where
    pri    = mkPrimary $ primary td
    cst    = mkConstraint $ replicate (length $ primary td) '.'
    tables = rnaWolf p cst pri
    -- backtrack one candidate, or up to 'maxLoss' many for max-loss training
    bs = let f x = td{predicted = x}
         in  map (first f) . take (maybe 1 id maxLoss) $ rnaWolfBacktrack p cst pri 0.001 tables
    -- among the candidates, pick the one with the worst F-measure
    worst = minimumBy (comparing (fmeasure . mkConfusionMatrix . fst)) bs
    runPA (x,score) = defaultPA aggressiveness p $ x
      { comments = [ show score
                   , simpleViewer (primary x) $ secondary x
                   , simpleViewer (primary x) $ predicted x
                   , show $ predicted x
                   ]
      }
    ret = runPA worst

-- | Folding of 'TrainingData' elements.

foldTD :: Options -> Int -> (P.Params,[PA]) -> ([TrainingData],(Int,Int)) -> IO (P.Params,[PA])
foldTD o@Options{..} total (!p,oldresults) (ts,(f,t)) = do
  -- At this point, we trade optimization efficiency for increased parallelism
  -- whenever 'parallelism' is >1: all elements of a chunk are folded against
  -- the same parameter set, and their changes are applied at once.
  let parfolds = map (foldOne o p) ts
  let !results = let xs = map snd parfolds in xs `using` (parList rdeepseq)
  let cs       = concatMap changes results
  let cur      = VU.fromList . P.toList $ p
  let new      = P.fromList . VU.toList $ VU.accum (\v pm -> v+pm) cur cs
  let rhosum   = sum $ map accMeas results
  let rho      = rhosum / genericLength ts
  let rhosumR  = sum . map accMeas $ oldresults ++ results
  let rhoR     = rhosumR / genericLength (oldresults ++ results)
  when (Single `elem` verbose) $ do
    printf "# INFO parallel: %4d - %4d, avg.rho: %4.2f, running rho: %4.2f\n" f t rho rhoR
    -- detailed information on each folded structure
    mapM_ (printDetailed o . fst) parfolds
    hFlush stdout
  return $ pseq (rdeepseq results) (new, oldresults ++ results)

-- | simple viewer: render a list of pairs as a dot-bracket string over the
-- input sequence; positions taking part in more than one pair are marked
-- with '<', '>', or 'X'

simpleViewer s xs = foldl f (replicate (length s) '.') xs where
  f str ((i,j),_) = upd ')' j $ upd '(' i str
  upd c k str
    | l=='(' && c=='(' = pre ++ "<" ++ post
    | l==')' && c==')' = pre ++ ">" ++ post
    | l/='.'           = pre ++ "X" ++ post
    | otherwise        = pre ++ [c] ++ post
    where
      pre  = take k str
      l    = head $ drop k str
      post = drop (k+1) str
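-- A quick sanity check for 'simpleViewer'. The pair list below is made up for
-- illustration; '()' merely stands in for the pair-type annotation that the
-- real 'secondary' / 'predicted' lists carry. Three nested pairs come out as
-- plain dot-bracket notation:
--
-- >>> simpleViewer "GGGAAACCC" [((0,8),()),((1,7),()),((2,6),())]
-- "(((...)))"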
-- | print out detailed information on a folded candidate

printDetailed :: Options -> TrainingData -> IO ()
printDetailed Options{..} x = do
  when (Detailed `elem` verbose) $ do
    -- position ruler: "0123456789" repeated to sequence length
    putStrLn $ take (length $ primary x) . concatMap show . concat . repeat $ [0..9]
    putStrLn $ primary x
    putStrLn $ simpleViewer (primary x) $ secondary x
    putStrLn $ simpleViewer (primary x) $ predicted x
  when (AllPairs `elem` verbose) $ do
    mapM_ print $ predicted x

-- ** program options

data Options = Options
  { inDB           :: Maybe FilePath
  , outDB          :: FilePath
  , trainingData   :: [FilePath]
  , maxLength      :: Maybe Int
  , numIterations  :: Int
  , verbose        :: [Verbose]
  , maxLoss        :: Maybe Int
  , aggressiveness :: Double
  , errorOnError   :: Bool
  , parallelism    :: Int
  } deriving (Show,Data,Typeable)

data Verbose = Iteration | Single | Detailed | AllPairs
  deriving (Show,Data,Typeable,Eq)

options = Options
  { inDB = Nothing &= help "database from which to continue optimizing; if none is given, start from scratch"
  , outDB = "" &= help "new database to write out"
  , trainingData = [] &= help "training data elements to read"
  , maxLength = Nothing &= help "[dev] only train on elements of this length or shorter"
  , numIterations = 50 &= help "how many optimizer iterations"
  , verbose = [] &= help "select verbosity options: single, iteration, detailed, allpairs (each switches on different output)"
  , maxLoss = Nothing &= help "use max-loss optimization instead of prediction-based; requires the maximal number of candidates to search for the max-loss structure (default: not used)"
  , aggressiveness = 1 &= help "maximal tau for each round"
  , errorOnError = False &= help "error out if an error is detected (default: false)"
  , parallelism = 1 &= help "perform more than one prediction concurrently. Will probably reduce the effectiveness of the algorithm, but allows using more than one core; call with +RTS -N -RTS"
  }

-- ** helper functions

-- | simple shuffling of a list: pick a uniformly random element as the new
-- head, then shuffle the rest (uniform, but O(n^2))

shuffle :: [a] -> IO [a]
shuffle [] = return []
shuffle xs = do
  r <- getStdRandom (randomR (0,length xs - 1))
  let (hs,ts) = splitAt r xs
  let y = head ts
  ys <- shuffle $ hs ++ tail ts
  return $ y : ys
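-- A typical invocation could look like the following. The binary and file
-- names are placeholders; --outdb and --trainingdata are the two flags
-- checked in 'main', while the remaining flags are assumed to be derived by
-- cmdargs from the 'Options' field names:
--
-- > RNAwolfTrain --outdb params.db --trainingdata train.td
-- > RNAwolfTrain --indb params.db --outdb refined.db --trainingdata train.td
-- >              --parallelism 4 +RTS -N4 -RTS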