{-# LANGUAGE TupleSections #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverlappingInstances #-}

module Biobase.TrainingData where

import Data.ByteString.Char8 as BS
import Data.List as L
import Prelude as P hiding (sequence)

import Biobase.FR3D
import Biobase.RNAstrand
import Biobase.Secondary
import Biobase.Secondary.Diagrams (dotBracket)



-- | One training data element. We can store sequence and known structure
-- (primary, secondary) as well as a predicted structure (stored, if
-- "not.null"). The 'weight' is how strongly this element should influence a
-- training system. 'extendedKnowledge' is True, if the data element comes from
-- a source which knows about extended secondary structures, like the PDB.
-- Otherwise it is False.
--
-- NOTE During training, one should not penalize non-canonical predictions in
-- interior loops and multibranch loops, unless they hinder formation of true
-- pairs.
--
-- TODO at some point we will move toward pseudoknots and other fun

data TrainingData = TrainingData
  { primary :: String     -- "CCCAAAGGG"
  , secondary :: [ExtPairIdx] -- "(((...)))"
  , predicted :: [ExtPairIdx] -- ".(..()..)"
  , weight :: Double      -- e.g. 1.0
  , extendedKnowledge :: Bool
  , comments :: [String]
  } deriving (Read,Show)



-- ** Instance creation

-- | Create 'TrainingData' from various sources.

class MkTrainingData a where
  mkTrainingData :: a -> TrainingData

-- | Import from linearized FR3D data

instance MkTrainingData LinFR3D where
  mkTrainingData LinFR3D{..} = TrainingData
    { primary = BS.unpack sequence
    , secondary = nub
                . P.map (\((i,j),cww) -> if i<j then ((i-1,j-1),cww) else ((j-1,i-1),swp cww))
                . P.map (\(i,j,cww) -> ((i,j),threeChar cww))
                $ pairs
    , predicted = []
    , weight = 1.0
    , extendedKnowledge = True -- since fr3d knows non-canoncal pairs
    , comments = []
    } where swp (c,x,y) = (c,y,x)

-- | Import from RNAstrand data. Each annotated pair is assumed to be of type
-- "cWW".

instance MkTrainingData RNAstrand where
  mkTrainingData RNAstrand{..} = TrainingData
    { primary = BS.unpack sequence
    , secondary = P.map (,(cis,wc,wc)) . dotBracket ["()","[]","<>"] . BS.unpack $ structure
    , predicted = []
    , weight = 1.0
    , extendedKnowledge = False
    , comments = []
    }