-- | Types for RNA secondary structure. Types vary from the simplest array -- (D1Secondary) to rather complex ones. module Biobase.Secondary.Diagrams where import Control.Applicative import Control.Arrow import Control.Lens import Data.Aeson import Data.Binary import Data.List ((\\)) import Data.List (sort,groupBy,sortBy,intersperse) import Data.List.Split (splitOn) import Data.Serialize import Data.Tuple.Select (sel1,sel2) import Data.Tuple (swap) import Data.Vector.Binary import Data.Vector.Serialize import GHC.Generics import qualified Data.Vector.Unboxed as VU import Text.Printf import Control.DeepSeq import Biobase.Primary.Nuc import Biobase.Secondary.Basepair -- | RNA secondary structure with 1-diagrams. Each nucleotide is paired with at -- most one other nucleotide. A nucleotide with index @k@ in @[0..len-1]@ is -- paired if @unD1S VU.! k >=0 0@ Unpaired status is @-1@. -- -- TODO Provide @iso@ between @D1Secondary@ and @RNAss@. newtype D1Secondary = D1S {unD1S :: VU.Vector Int} deriving (Read,Show,Eq,Generic,NFData) instance Binary D1Secondary instance Serialize D1Secondary instance FromJSON D1Secondary instance ToJSON D1Secondary -- RNA secondary structure with 2-diagrams. Each nucleotide is paired with up -- to two other nucleotides. newtype D2Secondary = D2S {unD2S :: VU.Vector ( (Int,Edge,CTisomerism), (Int,Edge,CTisomerism) )} deriving (Read,Show,Eq,Generic) instance Binary D2Secondary instance Serialize D2Secondary instance FromJSON D2Secondary instance ToJSON D2Secondary -- | Conversion to and from 1-diagrams. class MkD1Secondary a where mkD1S :: a -> D1Secondary fromD1S :: D1Secondary -> a -- | Conversion to and from 2-diagrams. class MkD2Secondary a where mkD2S :: a -> D2Secondary fromD2S :: D2Secondary -> a -- * Tree-based representation -- -- TODO Tree -> d1/2Secondary ? -- | A secondary-structure tree. Has no notion of pseudoknots. data SSTree idx a = SSTree idx a [SSTree idx a] | SSExtern Int a [SSTree idx a] deriving (Read,Show,Eq,Generic) -- | Create a tree from (pseudoknot-free [not checked]) 1-diagrams. d1sTree :: D1Secondary -> SSTree PairIdx () d1sTree s = ext $ sort ps where (len,ps) = fromD1S s ext [] = SSExtern len () [] ext xs = SSExtern len () . map tree $ groupBy (\l r -> snd l > fst r) xs -- ">=" would be partial allowance for 2-diagrams tree [ij] = SSTree ij () [] tree (ij:xs) = SSTree ij () . map tree $ groupBy (\l r -> snd l > fst r) xs -- | Create a tree from (pseudoknot-free [not checked]) 2-diagrams. d2sTree :: D2Secondary -> SSTree ExtPairIdx () d2sTree s = ext $ sortBy d2Compare ps where (len,ps) = fromD2S s ext [] = SSExtern len () [] ext xs = SSExtern len () . map tree . groupBy d2Grouping $ xs tree [ij] = SSTree ij () [] tree (ij:xs) = SSTree ij () . map tree . groupBy d2Grouping $ xs d2Compare ((i,j),_) ((k,l),_) | i==k = compare l j | j==l = compare i k | otherwise = compare (i,j) (k,l) d2Grouping ((i,j),_) ((k,l),_) = i<=k && j>=l -- * Instances for D1S -- | Conversion between D1S and D2S is lossy in D2S -> D1S instance MkD1Secondary D2Secondary where mkD1S = fromD2S fromD1S = mkD2S -- | (Length,List of Pairs) instance MkD1Secondary (Int,[PairIdx]) where mkD1S (len,ps) = let xs = concatMap (\ij -> [ij,swap ij]) ps in D1S $ VU.replicate len (-1) VU.// xs fromD1S (D1S s) = (VU.length s, filter (\(i,j) -> i=0) . zip [0..] . VU.toList $ s) -- * Instances for D2S -- | Conversion between D1S and D2S is lossy in D2S -> D1S -- -- TODO 'fromD2S' makes me wanna rewrite everything... instance MkD2Secondary D1Secondary where mkD2S = D2S . VU.map (\k -> ((k,W,Cis),(-1,W,Cis))) . unD1S fromD2S (D2S xs) = D1S . VU.map (sel1 . sel1) $ xs instance MkD2Secondary (Int,[ExtPairIdx]) where mkD2S (len,ps) = let xs = concatMap (\((i,j),(ct,e1,e2)) -> [ (i, (j,e1,ct)) , (j, (i,e2,ct)) ]) ps f (x,y) z = if sel1 x == -1 then (z,y) else (x,z) in D2S $ VU.accum f (VU.replicate len ((-1,W,Cis),(-1,W,Cis))) xs fromD2S (D2S s) = ( VU.length s , let (xs,ys) = unzip . VU.toList $ s g i j = let z = s VU.! i in if sel1 (sel1 z) == j then sel2 (sel1 z) else sel2 (sel2 z) f (i,(j,eI,ct)) = ((i,j),(ct,eI,g j i)) in map f . filter (\(i,(j,_,_)) -> i=0) $ zip [0..] xs ++ zip [0..] ys ) -- * Older instances (should still work) -- | A second primitive generator, requiring dictionary and String. This one -- generates pairs that are then used by the above instance. The dict is a list -- of possible brackets: ["()"] being the minimal set. instance MkD1Secondary ([String],String) where mkD1S (dict,xs) = mkD1S (length xs,ps) where ps :: [(Int,Int)] ps = unsafeDotBracket2pairlist dict xs fromD1S (D1S s) = (["()"], zipWith f [0..] $ VU.toList s) where f k (-1) = '.' f k p | k>p = ')' | otherwise = '(' -- | Generate Secondary given that we have an unboxed vector of characters instance MkD1Secondary ([String],VU.Vector Char) where mkD1S (dict,xs) = mkD1S (dict, VU.toList xs) fromD1S s = let (dict,res) = fromD1S s in (dict,VU.fromList res) -- | A "fast" instance for getting the pair list of vienna-structures. instance MkD1Secondary String where mkD1S xs = mkD1S (["()" ::String],xs) fromD1S s = let (_::[String],res) = fromD1S s in res instance MkD1Secondary (VU.Vector Char) where mkD1S xs = mkD1S (["()" ::String],xs) fromD1S s = let (_::[String],res::VU.Vector Char) = fromD1S s in res -- * High-level parsing functionality for secondary structures -- | Completely canonical structure. -- -- TODO Check size of hairpins and interior loops? isCanonicalStructure :: String -> Bool isCanonicalStructure = all (`elem` "().") -- | Is constraint type structure, i.e. there can also be symbols present -- that denote up- or downstream pairing. isConstraintStructure :: String -> Bool isConstraintStructure = all (`elem` "().<>{}|") -- | Take a structural string and split it into its constituents. -- -- If we decide to /NOT/ depend on @lens@ explicitly, another way to write -- this is: -- -- @ -- structures :: forall p f . (Profunctor p, Functor f) => p [String] (f [String]) -> p String (f String) -- structures = dimap (splitOn "&") (fmap (concat . intersperse "&")) -- @ structures :: Iso' String [String] structures = iso (splitOn "&") (concat . intersperse "&") -- | A @fold@ structure is a single structure foldStructure :: Prism' String String foldStructure = prism id to where to s = case s^.structures of [t] -> Right t _ -> Left s -- | A @cofold@ structure has exactly two structures split by @&@ (which the -- prism removes). cofoldStructure :: Prism' String (String,String) cofoldStructure = prism from to where from (l,r) = l ++ '&' : r to s = case s^.structures of [l,r] -> Right (l,r) _ -> Left s -- * Helper functions -- | Secondary structure parser which allows pseudoknots, if they use different -- kinds of brackets. unsafeDotBracket2pairlist :: [String] -> String -> [(Int,Int)] unsafeDotBracket2pairlist dict xs = sort . concatMap (f xs) $ dict where f xs [l,r] = g 0 [] . map (\x -> if x `elem` [l,r] then x else '.') $ xs where g :: Int -> [Int] -> String -> [(Int,Int)] g _ st [] = [] g k st ('.':xs) = g (k+1) st xs g k sst (x:xs) | l==x = g (k+1) (k:sst) xs g k (s:st) (x:xs) | r==x = (s,k) : g (k+1) st xs g a b c = error $ show (a,b,c) -- | Secondary structure parser with a notion of errors. We either return a -- @Right@ structure, including flags, or a @Left@ error. dotBracket2pairlist :: [String] -> String -> Either String ( [(Int,Int)] ) dotBracket2pairlist dict str = fmap (sort . concat) . sequence . map (f str) $ dict where f ys [l,r] = g 0 [] . map (\x -> if x `elem` [l,r] then x else '.') $ ys where g :: Int -> [Int] -> String -> Either String ( [(Int,Int)] ) g _ [] [] = pure [] g k st ('.':xs) = g (k+1) st xs g k st (x:xs) | l==x = g (k+1) (k:st) xs g k (s:st) (x:xs) | r==x = ((s,k):) <$> g (k+1) st xs g k [] xs = fail $ printf "too many closing brackets at position %d: '%s' (dot-bracket: %s)" k xs str g k st [] = fail $ printf "too many opening brackets, opening bracket(s) at: %s (dot-bracket: %s)" (show $ reverse st) str g a b c = fail $ printf "unspecified error: %s (dot-bracket: %s)" (show (a,b,c)) str f xs lr@(_:_:_:_) = fail $ printf "unsound dictionary: %s (dot-bracket: %s)" lr str f xs lr = fail $ printf "unspecified error: dict: %s, input: %s (dot-bracket: %s)" lr xs str -- | Calculates the distance between two vienna strings. viennaStringDistance :: Bool -> Bool -> String -> String -> (String,Int) viennaStringDistance sPairs tPairs s t = (t,length $ ss++tt) where s' = either error id . dotBracket2pairlist ["()"] $ s t' = either error id . dotBracket2pairlist ["()"] $ t ss = if sPairs then s' \\ t' else [] tt = if tPairs then t' \\ s' else [] -- | Calculate the distance between two 'D1Secondary' structures, that live -- in the same underlying space. In particular, this probably only works -- for structures on the same primary sequence. -- -- This function assumes somewhat dense structures, as it is @O(2n)@ with -- @n@ the length of the underlying vectors. -- -- @(i,k)@ vs @(j,l)@ -- -- TODO error out on weird inputs! d1Distance :: D1Secondary -> D1Secondary -> Int d1Distance (D1S x) (D1S y) -- | VU.length x /= VU.length y = error "d1Distance called on vectors with differing lengths!" | otherwise = (`div` 2) . VU.sum $ VU.zipWith chk (x VU.++ xx) (y VU.++ yy) where xx = VU.replicate (VU.length y - VU.length x) (-2) yy = VU.replicate (VU.length x - VU.length y) (-2) chk i j | i==j = 0 | i < 0 && j < 0 = 0 | i >= 0 && j >= 0 = 2 | otherwise = 1 {-# Inline chk #-} {-# NoInline d1Distance #-}