-- | Here we collect the necessary data structures for the RNAs to be -- compared. This data is serialized to disk once calculated, since it is -- most likely the part that takes longest. -- -- TODO if the vienna wrapper allows, we should parallelize the -- calculations. -- -- TODO nice interface counting up? module BioInf.MutationOrder.RNA where import Data.Char (toUpper) import Codec.Compression.GZip (compress,decompress) import Control.Arrow (second) import Control.DeepSeq import Control.Error import Control.Monad (unless) import Control.Parallel.Strategies import Data.Aeson as DA import Data.Bits import Data.ByteString (ByteString) import Data.Char (isDigit) import Data.List (sort,nub,(\\)) import Data.Maybe (catMaybes) import Data.Monoid import Data.Serialize import Data.Serialize.Instances import Data.Text.Encoding (decodeUtf8, encodeUtf8) import Data.Tuple (swap) import Data.Vector.Serialize import Data.Vector.Strategies import Debug.Trace import GHC.Generics import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy as BSL import qualified Data.HashMap.Strict as HM import qualified Data.IntMap.Strict as IM import qualified Data.Vector as V import qualified Data.Vector.Unboxed as VU import System.IO.Unsafe (unsafePerformIO) import qualified Data.Bijection.HashMap as B import BioInf.ViennaRNA.Bindings import qualified Data.PrimitiveArray as PA import Biobase.Secondary.Diagrams (D1Secondary(..), mkD1S) -- | A single RNA with pre-calculated elements. -- -- All calculations are done at 37 C. -- -- TODO include the basepair probability matrix? Can we "compress" that -- one? -- -- We do not encode D1S into the json data RNA = RNA { mutationSet :: !(VU.Vector (Int,Char)) -- ^ we store just the mutation set, since this is more sparse and -- gives access to the mutational events. , primarySequence :: !ByteString -- ^ store RNA sequence too, for now , mfeStructure :: !ByteString -- ^ the mfe structure we get , mfeD1S :: !D1Secondary -- ^ efficient structure encoding , mfeEnergy :: !Double -- ^ mfe energy of the structure , centroidStructure :: !ByteString -- ^ the centroid structure , centroidD1S :: !D1Secondary -- ^ efficient centroid structure encoding , centroidEnergy :: !Double } deriving (Show,Eq,Generic) instance NFData RNA instance Serialize RNA instance ToJSON RNA where toJSON RNA{..} = object [ "mutationSet" .= mutationSet , "primarySequence" .= decodeUtf8 primarySequence , "mfeStructure" .= decodeUtf8 mfeStructure , "mfeEnergy" .= mfeEnergy , "centroidStructure" .= decodeUtf8 centroidStructure , "centroidEnergy" .= centroidEnergy ] toEncoding RNA{..} = pairs ( "mutationSet" .= mutationSet <> "primarySequence" .= decodeUtf8 primarySequence <> "mfeStructure" .= decodeUtf8 mfeStructure <> "mfeEnergy" .= mfeEnergy <> "centroidStructure" .= decodeUtf8 centroidStructure <> "centroidEnergy" .= centroidEnergy ) instance FromJSON RNA where parseJSON (Object v) = do mutationSet <- v .: "mutationSet" primarySequence <- encodeUtf8 <$> v .: "primarySequence" let (e,s) = second BS.pack . unsafePerformIO . mfeTemp 37 $ BS.unpack primarySequence mfeStructure <- (fmap encodeUtf8 <$> v .:? "mfeStructure") .!= s mfeEnergy <- v .:? "mfeEnergy" .!= e let (ce,cs) = second BS.pack . unsafePerformIO . centroidTemp 37 $ BS.unpack primarySequence centroidStructure <- (fmap encodeUtf8 <$> v .:? "centroidStructure") .!= cs centroidEnergy <- v .:? "centroidEnergy" .!= ce let mfeD1S = bldD1S mfeStructure let centroidD1S = bldD1S centroidStructure return RNA{..} bldD1S :: ByteString -> D1Secondary bldD1S x = mkD1S (["()"::String], BS.unpack x) -- | Given the primary sequence and the mutation set, fill the 'RNA' -- structure. -- -- NOTE This wraps some @ViennaRNA-bindings@ calls that are in @IO@. -- -- TODO check if these calls are *really* thread-safe! mkRNA :: Maybe (HM.HashMap ByteString QLine) -> ByteString -- ^ primary sequence of the *origin* RNA -> VU.Vector (Int,Char) -- ^ set of mutations compared to the origin -> RNA mkRNA lkup inp' ms = RNA { mutationSet = ms , primarySequence = inp , mfeStructure = mS , mfeEnergy = mE , centroidStructure = cS , centroidEnergy = cE , mfeD1S = bldD1S mS , centroidD1S = bldD1S cS } where inp = insertMutations ms inp' ((mE,mS),(cE,cS)) = maybe calculateHere lookup lkup calculateHere = ( second BS.pack . unsafePerformIO . mfeTemp 37 $ BS.unpack inp , second BS.pack . unsafePerformIO . centroidTemp 37 $ BS.unpack inp ) lookup lkup = case HM.lookup inp lkup of Nothing -> traceShow ("WARNING! have RNA lookup table but have to calculate!", inp) calculateHere Just QLine{..} -> (swap qlmfe,swap qlcentroid) -- | Insert a set of mutations in a @ByteString@. insertMutations :: VU.Vector (Int,Char) -> ByteString -> ByteString insertMutations ms s' = VU.foldl' go s' ms where go s (k,c) = let (h,t) = BS.splitAt k s in BS.concat [h, BS.singleton c, BS.drop 1 t] data Landscape = Landscape { rnas :: HM.HashMap (PA.BitSet PA.I) RNA -- ^ the individual RNA mutations. The index should be calculated from -- @linearIndex 0 high mutationSet@ , mutationCount :: !Int -- ^ how many nucleotides are mutated in total , landscapeOrigin :: !ByteString -- ^ the ancestral sequence , landscapeDestination :: !ByteString -- ^ the final sequence , mutationPositions :: !(B.BimapHashMap Int Int) } deriving (Show,Eq,Generic) instance NFData Landscape instance Serialize Landscape instance ToJSON Landscape where toJSON Landscape{..} = object [ "rnas" .= rnas , "mutationCount" .= mutationCount , "landscapeOrigin" .= decodeUtf8 landscapeOrigin , "landscapeDestination" .= decodeUtf8 landscapeDestination , "mutationPositions" .= mutationPositions ] toEncoding Landscape{..} = pairs ( "rnas" .= rnas <> "mutationCount" .= mutationCount <> "landscapeOrigin" .= decodeUtf8 landscapeOrigin <> "landscapeDestination" .= decodeUtf8 landscapeDestination <> "mutationPositions" .= mutationPositions ) instance FromJSON Landscape where parseJSON (Object v) = do rnas <- v .: "rnas" mutationCount <- v .: "mutationCount" landscapeOrigin <- encodeUtf8 <$> v .: "landscapeOrigin" landscapeDestination <- encodeUtf8 <$> v .: "landscapeDestination" mutationPositions <- v .: "mutationPositions" return Landscape{..} newtype Ancestral = Ancestral { getAncestral ∷ ByteString } newtype Extant = Extant { getExtant ∷ ByteString } newtype GlobalBackmutations = GlobalBackmutations { getGlobalBackmutations ∷ Int } newtype BackmutationCol = BackmutationCol Int -- | The new method for creating the RNA landscape. The outgoing stream of -- sequences should be directly to the 'BioInf.MutationOrder.SequenceDB' -- module, and 'writeSequenceFiles' in particular. -- -- TODO generalise over the alphabet -- -- TODO currently, the number of backmutations is either @0@ or @1@. For more, -- a number of things need to be fixed: (i) creation of all variants of -- @1,2,3...g@ backmutations. That is, consider all sequences that have at -- least one, but no more than @g@ global back mutations. -- -- TODO this will create duplicate sequences for those columns where the -- ancestral and extant have different nucleotides during global backmutation -- generation. createRNAlandscape2 ∷ (Monad m) ⇒ [Char] → Either GlobalBackmutations [Int] → [BackmutationCol] → Ancestral → Extant → ExceptT String m (Int,[ByteString], [(Int,Char,[ByteString])]) -- ^ Return A complex triple with (total sequence count, original problem -- sequences, list of global variants). The list of global variants holds for -- each (index,single nucleotide intermediate, list of variant sequences). createRNAlandscape2 alphabet gxorgs bs (Ancestral a) (Extant e) = do -- some sanity checks unless (BS.length a == BS.length e) $ throwE "different sequence lengths encountered" -- expand later! Right now, we either generate the unless (either ((<=1) . getGlobalBackmutations) ((<=1) . length) gxorgs) $ throwE "we currently allow *at most* one globally active backmutation" -- TODO need to fix up interesting columns unless (null bs) $ throwE "fix up interesting columns" -- collect the possible characters for each position. let ahm = IM.fromList . zip [0∷Int ..] . map ((:[]) . toUpper) $ BS.unpack a let ehm = IM.fromList . zip [0..] . map ( (:[]) . toUpper) $ BS.unpack e -- back mutation columns are active together with the above let bhm = IM.fromListWith (++) [ (k,alphabet) | BackmutationCol k ← bs ] let merge x y = sort . nub $ x++y let hm = IM.unionWith merge (IM.unionWith merge ahm ehm) bhm -- global back mutations, this will introduce only those characters not already present in each column let gbm = IM.fromList [ (k, alphabet \\ hm IM.! k) | k ← either (\_ → [0..BS.length a -1]) id gxorgs ] -- begin with the set of sequences without any global backmutations let localList = map BS.pack . sequence . map snd . IM.toAscList let localCount = product . map (length . snd) $ IM.toAscList hm -- do *not* count explicitly! let globalModifiers = concat . map (\(k,xs) → map (k,) xs) $ IM.toAscList gbm -- we now repeat the local generation, but "splice in" the globally modified characters -- TODO we currently allow exactly one global modifier in @[(k,z)]@ let globals = if True -- g == 1 -- TODO need to fix up then [ (k, z, [ BS.take k orig `BS.append` (BS.cons z (BS.drop (k+1) orig)) | orig ← localList hm -- produce a sequence only if this mutation is -- independent of the a/e mutations OR we look -- at a variant or the ancestral sequence. This -- prevents duplicate entries. , (a `BS.index` k == e `BS.index` k) || (a `BS.index` k == orig `BS.index` k) ] ) | (k,z) ← globalModifiers ] else [] let globalCount = localCount * length globalModifiers -- if g == 1 then localCount * length globalModifiers else 0 return $ (localCount + globalCount, localList hm, globals) -- | -- -- TODO prime candidate for parallelization. ViennaRNA-bindings currently -- does not allow parallel runs! It would be possible to consider -- externalizing this, but for now we just run single-threaded. createRNAlandscape :: Maybe (HM.HashMap ByteString QLine) -> Bool -> ByteString -> ByteString -> (Landscape, [(Int,ByteString)]) createRNAlandscape lkup verbose origin mutation = (ls, zipWith (\mm k -> (k,insertMutations mm origin)) mus [0..]) where ls = Landscape { rnas = rs -- `using` (parVector chunkSize) , mutationCount = length . filter (>1) . map length $ pms , landscapeOrigin = origin , landscapeDestination = mutation , mutationPositions = mutbit } rs = HM.fromList . map pairWithBitSet $ zipWith talk mus [0..] talk s c = (if (c `mod` 1000 == 0 && verbose) then traceShow c else id) mkRNA lkup origin s mus = map (VU.fromList . catMaybes) . sequence $ pms -- possible mutations pms = zipWith3 genM (BS.unpack origin) (BS.unpack mutation) [0..] genM a b k | a==b = [Nothing] | otherwise = [Nothing,Just (k,b)] -- pair each @RNA@ with the correct bitset pairWithBitSet r = (calcBitSet zeroBits . map fst . VU.toList $ mutationSet r, r) -- calculate the bitset pattern for this mutation calcBitSet bs [] = bs calcBitSet bs (x':xs) = let x = maybe (error $ "calcBitSet") id $ B.lookupL mutbit x' in calcBitSet (bs `setBit` x) xs -- bijection between mutation position and bit position -- @BitSet Bit <-> Mutated Bit@ mutbit = B.fromList . zipWith (flip (,)) [0 :: Int ..] . catMaybes $ zipWith3 genB (BS.unpack origin) (BS.unpack mutation) [0 :: Int ..] genB a b k | a == b = Nothing | otherwise = Just $ k -- | Write a generated landscape to disk. toFile :: FilePath -> Landscape -> IO () toFile fp = BSL.writeFile fp . compress . encodeLazy toFileJSON :: FilePath -> Landscape -> IO () toFileJSON fp = BSL.writeFile fp . compress . DA.encode fromFile :: FilePath -> IO Landscape fromFile fp = (decodeLazy . decompress) <$> BSL.readFile fp >>= \case Left err -> error $ "BioInf.MutationOrder.RNA.fromFile: " ++ err Right ls -> return ls fromFileJSON :: FilePath -> IO Landscape fromFileJSON fp = (DA.eitherDecode' . decompress) <$> BSL.readFile fp >>= \case Left err -> error $ "BioInf.MutationOrder.RNA.fromFile: " ++ err Right ls -> return ls -- stupid parsing for quintuple rnafold lines data QLine = QLine { qlSequence :: ByteString , qlmfe :: (ByteString,Double) , qlensemble :: (ByteString,Double) , qlcentroid :: (ByteString,Double) } deriving (Show) qlines f = do ls <- BS.lines <$> BS.readFile f return $ qlhm $ go ls where go [] = [] go ls = let (hs,ts) = splitAt 5 ls in parseql hs : go ts parseql [s,m,e,c,_] = QLine s (stupid m) (stupid e) (stupid c) stupid bs = let (h:ts) = BS.words bs r = BS.dropWhile (\c -> not $ isDigit c || c=='-') $ BS.unwords ts in (h, read $ BS.unpack $ BS.takeWhile (\c -> isDigit c || c =='-' || c=='.') r) qlhm xs = HM.fromList $ map (\q -> (qlSequence q, q)) xs