{-# LANGUAGE TupleSections #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE OverlappingInstances #-} module Biobase.TrainingData where import Data.ByteString.Char8 as BS import Data.List as L import Prelude as P hiding (sequence) import Biobase.FR3D import Biobase.RNAstrand import Biobase.Secondary import Biobase.Secondary.Diagrams (dotBracket) -- | One training data element. We can store sequence and known structure -- (primary, secondary) as well as a predicted structure (stored, if -- "not.null"). The 'weight' is how strongly this element should influence a -- training system. 'extendedKnowledge' is True, if the data element comes from -- a source which knows about extended secondary structures, like the PDB. -- Otherwise it is False. -- -- NOTE During training, one should not penalize non-canonical predictions in -- interior loops and multibranch loops, unless they hinder formation of true -- pairs. -- -- TODO at some point we will move toward pseudoknots and other fun data TrainingData = TrainingData { primary :: String -- "CCCAAAGGG" , secondary :: [ExtPairIdx] -- "(((...)))" , predicted :: [ExtPairIdx] -- ".(..()..)" , weight :: Double -- e.g. 1.0 , extendedKnowledge :: Bool , comments :: [String] } deriving (Read,Show) -- ** Instance creation -- | Create 'TrainingData' from various sources. class MkTrainingData a where mkTrainingData :: a -> TrainingData -- | Import from linearized FR3D data instance MkTrainingData LinFR3D where mkTrainingData LinFR3D{..} = TrainingData { primary = BS.unpack sequence , secondary = nub . P.map (\((i,j),cww) -> if i ((i,j),threeChar cww)) $ pairs , predicted = [] , weight = 1.0 , extendedKnowledge = True -- since fr3d knows non-canoncal pairs , comments = [] } where swp (c,x,y) = (c,y,x) -- | Import from RNAstrand data. Each annotated pair is assumed to be of type -- "cWW". instance MkTrainingData RNAstrand where mkTrainingData RNAstrand{..} = TrainingData { primary = BS.unpack sequence , secondary = P.map (,(cis,wc,wc)) . dotBracket ["()","[]","<>"] . BS.unpack $ structure , predicted = [] , weight = 1.0 , extendedKnowledge = False , comments = [] }