{-# LANGUAGE GeneralizedNewtypeDeriving #-} -- | The next iteration of MC-Fold-DP which is, hopefully, a bit faster. It can -- be seen that the dsConnect and ddConnect data structures do not care about -- the type of connection, only which kind of NCM is involved and the pair. -- This follows the way MC-Fold integrates away the specific type of pair -- family involved. module Biobase.DataSource.MCFold where import Control.DeepSeq import Data.Ix import Data.Ix.Tuple import Data.PrimitiveArray import Data.PrimitiveArray.Ix -- only needed for the ghci part? import qualified Data.Vector.Unboxed as VU import Biobase.RNA import Biobase.RNA.Hashes -- | This version of the motif database accepts ACGU+E as nucleotides. -- Occurances of E lead to a maximization over all strings where the E are -- replaced by ACGU -- if the corresponding function was used, otherwise all E -- map to infinity (actually only a large number). -- -- The dsConnect and ddConnect database stores the combined information for -- connections between different cycles. data MotifDB = MotifDB -- The most important data, NCMs and connections between two of those { sCycles :: [(Int,CycleArray)] -- ^ store length of cycles, cycle array for this length , dCycles :: [((Int,Int),CycleArray)] -- ^ store (left,right) lengths, cycle array , dsConnect :: [(((Int,Int),Int),ConnectArray)] , ddConnect :: [(((Int,Int),(Int,Int)),ConnectArray)] -- Following is other data, mostly from the importer , rawPairScores :: PairScoreArray , rawJunctionScores :: JunctionArray , rawSHingeCounts :: [(Int,[RawHinge])] -- ^ length of cycle, hinge data , rawDHingeCounts :: [((Int,Int),[RawHinge])] -- ^ length of cycle, length of cycle, hinge data } deriving (Show) -- * Newtypes for different annotations newtype TransCis = TransCis Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum) (trans:cis:_) = map TransCis [0..] instance Bounded TransCis where minBound = trans maxBound = cis newtype AntiPara = AntiPara Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum) (anti:para:_) = map AntiPara [0..] instance Bounded AntiPara where minBound = anti maxBound = para newtype Slash = Slash Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum) (wsp:hsp:csp:ssp:bsp:_) = map Slash [0..] instance Bounded Slash where minBound = wsp maxBound = bsp newtype Prime = Prime Int deriving (Eq,Ord,Ix,NFData,Read,Show,Enum) (p5:p3:_) = map Prime [0..] instance Bounded Prime where minBound = p5 maxBound = p3 -- | log-scores as derived from probabilities. -- -- TODO apply (min eInf)? logScore :: Double -> Double logScore x = (-0.606) * log x type CycleArray = PrimArray HashedPrimary Double type ConnectArray = PrimArray (Nucleotide,Nucleotide) Double -- ,Slash,Slash,AntiPara,TransCis) Double type RawHinge = (((Prime,Nucleotide,Nucleotide),(Slash,Slash,AntiPara,TransCis)),Double) type PairScoreArray = PrimArray (Nucleotide,Nucleotide) Double type JunctionArray = PrimArray (Int,Int) Double -- * A lot of static information -- | Known single-pair types. (Length of nucleotide string, 0-based index). -- 4 -> (5,2) -- 5 -> (5,3) -- 6 -> (5,4) -- -- found to be wrong -- 5 -> (0,5) knownSingleNCM :: VU.Vector (Int,Int) knownSingleNCM = VU.fromList [ (4,2) , (5,3) , (6,4) ] -- | Known double-pair types. ((Length of left...,Length of right nucleotide -- string), 0-based index -- found to be right through hacking! -- all are (..) to next type -- NCM type -> junctions IV.! (a,b) -- (a+1,b+1) for the junction.db -- (2,2) -> (5,5) -- (2,3) -> (8,2) -- (3,2) -> (9,2) knownDoubleNCM :: VU.Vector ((Int,Int),Int) knownDoubleNCM = VU.fromList [ ((2,2), 5) , ((3,3), 6) , ((2,3), 7) , ((3,2), 8) , ((2,4), 9) , ((4,2),10) ] -- | This is a list of known hinge types. If it has to be regenerated at some -- point, run the following command inside the MCfold DB: -- -- cat *hinge | grep -v "^\[" | awk '{print $1 " " $2 " " $3}' | sort | uniq | awk '{printf " , \"%s\"\n", $0}' knownHinges = [ "B/B anti trans" , "B/H anti trans" , "B/H para cis" , "B/S anti cis" , "B/S anti trans" , "B/S para trans" , "B/W anti cis" , "B/W anti trans" , "B/W para cis" , "B/W para trans" , "C/H para trans" , "H/B anti trans" , "H/C para trans" , "H/H anti cis" , "H/H para trans" , "H/S anti trans" , "H/S para trans" , "H/W anti cis" , "H/W anti trans" , "H/W para cis" , "H/W para trans" , "S/B anti cis" , "S/B para trans" , "S/C para cis" , "S/H anti cis" , "S/H anti trans" , "S/H para cis" , "S/H para trans" , "S/S anti cis" , "S/S para trans" , "S/W anti cis" , "S/W anti trans" , "S/W para cis" , "S/W para trans" , "W/B anti cis" , "W/B anti trans" , "W/B para cis" , "W/C anti trans" , "W/H anti cis" , "W/H anti trans" , "W/H para cis" , "W/H para trans" , "W/S anti cis" , "W/S anti trans" , "W/S para cis" , "W/S para trans" , "W/W anti cis" , "W/W anti trans" , "W/W para cis" , "W/W para trans" ]