module BioInf.MutationOrder.RNA where
import Data.Aeson as DA
import Data.Bits
import Codec.Compression.GZip (compress,decompress)
import Control.Arrow (second)
import Control.DeepSeq
import Control.Parallel.Strategies
import Data.ByteString (ByteString)
import Data.Maybe (catMaybes)
import Data.Serialize
import Data.Vector.Serialize
import Data.Vector.Strategies
import Debug.Trace
import GHC.Generics
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.HashMap.Strict as HM
import Data.Serialize.Instances
import Data.Text.Encoding (decodeUtf8, encodeUtf8)
import Data.Monoid
import Data.Char (isDigit)
import Data.Tuple (swap)
import qualified Data.Bijection.HashMap as B
import BioInf.ViennaRNA.Bindings
import qualified Data.PrimitiveArray as PA
import Biobase.Secondary.Diagrams (D1Secondary(..), mkD1S)
data RNA = RNA
{ mutationSet :: !(VU.Vector (Int,Char))
, primarySequence :: !ByteString
, mfeStructure :: !ByteString
, mfeD1S :: !D1Secondary
, mfeEnergy :: !Double
, centroidStructure :: !ByteString
, centroidD1S :: !D1Secondary
, centroidEnergy :: !Double
}
deriving (Show,Eq,Generic)
instance NFData RNA
instance Serialize RNA
instance ToJSON RNA where
toJSON RNA{..} =
object [ "mutationSet" .= mutationSet
, "primarySequence" .= decodeUtf8 primarySequence
, "mfeStructure" .= decodeUtf8 mfeStructure
, "mfeEnergy" .= mfeEnergy
, "centroidStructure" .= decodeUtf8 centroidStructure
, "centroidEnergy" .= centroidEnergy
]
toEncoding RNA{..} =
pairs ( "mutationSet" .= mutationSet
<> "primarySequence" .= decodeUtf8 primarySequence
<> "mfeStructure" .= decodeUtf8 mfeStructure
<> "mfeEnergy" .= mfeEnergy
<> "centroidStructure" .= decodeUtf8 centroidStructure
<> "centroidEnergy" .= centroidEnergy
)
instance FromJSON RNA where
parseJSON (Object v) = do
mutationSet <- v .: "mutationSet"
primarySequence <- encodeUtf8 <$> v .: "primarySequence"
let (e,s) = second BS.pack . unsafePerformIO . mfeTemp 37 $ BS.unpack primarySequence
mfeStructure <- (fmap encodeUtf8 <$> v .:? "mfeStructure") .!= s
mfeEnergy <- v .:? "mfeEnergy" .!= e
let (ce,cs) = second BS.pack . unsafePerformIO . centroidTemp 37 $ BS.unpack primarySequence
centroidStructure <- (fmap encodeUtf8 <$> v .:? "centroidStructure") .!= cs
centroidEnergy <- v .:? "centroidEnergy" .!= ce
let mfeD1S = bldD1S mfeStructure
let centroidD1S = bldD1S centroidStructure
return RNA{..}
bldD1S :: ByteString -> D1Secondary
bldD1S x = mkD1S (["()"::String], BS.unpack x)
mkRNA
:: Maybe (HM.HashMap ByteString QLine)
-> ByteString
-> VU.Vector (Int,Char)
-> RNA
mkRNA lkup inp' ms = RNA
{ mutationSet = ms
, primarySequence = inp
, mfeStructure = mS
, mfeEnergy = mE
, centroidStructure = cS
, centroidEnergy = cE
, mfeD1S = bldD1S mS
, centroidD1S = bldD1S cS
}
where
inp = insertMutations ms inp'
((mE,mS),(cE,cS)) = maybe calculateHere lookup lkup
calculateHere = ( second BS.pack . unsafePerformIO . mfeTemp 37 $ BS.unpack inp
, second BS.pack . unsafePerformIO . centroidTemp 37 $ BS.unpack inp
)
lookup lkup =
case HM.lookup inp lkup of
Nothing -> traceShow ("WARNING! have RNA lookup table but have to calculate!", inp) calculateHere
Just QLine{..} -> (swap qlmfe,swap qlcentroid)
insertMutations :: VU.Vector (Int,Char) -> ByteString -> ByteString
insertMutations ms s' = VU.foldl' go s' ms
where go s (k,c) =
let (h,t) = BS.splitAt k s
in BS.concat [h, BS.singleton c, BS.drop 1 t]
data Landscape = Landscape
{ rnas :: HM.HashMap (PA.BitSet PA.I) RNA
, mutationCount :: !Int
, landscapeOrigin :: !ByteString
, landscapeDestination :: !ByteString
, mutationPositions :: !(B.BimapHashMap Int Int)
}
deriving (Show,Eq,Generic)
instance NFData Landscape
instance Serialize Landscape
instance ToJSON Landscape where
toJSON Landscape{..} =
object [ "rnas" .= rnas
, "mutationCount" .= mutationCount
, "landscapeOrigin" .= decodeUtf8 landscapeOrigin
, "landscapeDestination" .= decodeUtf8 landscapeDestination
, "mutationPositions" .= mutationPositions
]
toEncoding Landscape{..} =
pairs ( "rnas" .= rnas
<> "mutationCount" .= mutationCount
<> "landscapeOrigin" .= decodeUtf8 landscapeOrigin
<> "landscapeDestination" .= decodeUtf8 landscapeDestination
<> "mutationPositions" .= mutationPositions
)
instance FromJSON Landscape where
parseJSON (Object v) = do
rnas <- v .: "rnas"
mutationCount <- v .: "mutationCount"
landscapeOrigin <- encodeUtf8 <$> v .: "landscapeOrigin"
landscapeDestination <- encodeUtf8 <$> v .: "landscapeDestination"
mutationPositions <- v .: "mutationPositions"
return Landscape{..}
createRNAlandscape :: Maybe (HM.HashMap ByteString QLine) -> Bool -> ByteString -> ByteString -> (Landscape, [(Int,ByteString)])
createRNAlandscape lkup verbose origin mutation = (ls, zipWith (\mm k -> (k,insertMutations mm origin)) mus [0..])
where
ls = Landscape
{ rnas = rs
, mutationCount = length . filter (>1) . map length $ pms
, landscapeOrigin = origin
, landscapeDestination = mutation
, mutationPositions = mutbit
}
rs = HM.fromList . map pairWithBitSet $ zipWith talk mus [0..]
talk s c = (if (c `mod` 1000 == 0 && verbose) then traceShow c else id) mkRNA lkup origin s
mus = map (VU.fromList . catMaybes)
. sequence
$ pms
pms = zipWith3 genM (BS.unpack origin) (BS.unpack mutation) [0..]
genM a b k | a==b = [Nothing]
| otherwise = [Nothing,Just (k,b)]
pairWithBitSet r = (calcBitSet zeroBits . map fst . VU.toList $ mutationSet r, r)
calcBitSet bs [] = bs
calcBitSet bs (x':xs) =
let x = maybe (error $ "calcBitSet") id $ B.lookupL mutbit x'
in calcBitSet (bs `setBit` x) xs
mutbit = B.fromList
. zipWith (flip (,)) [0 :: Int ..]
. catMaybes $ zipWith3 genB (BS.unpack origin) (BS.unpack mutation) [0 :: Int ..]
genB a b k | a == b = Nothing
| otherwise = Just $ k
toFile :: FilePath -> Landscape -> IO ()
toFile fp = BSL.writeFile fp . compress . encodeLazy
toFileJSON :: FilePath -> Landscape -> IO ()
toFileJSON fp = BSL.writeFile fp . compress . DA.encode
fromFile :: FilePath -> IO Landscape
fromFile fp = (decodeLazy . decompress) <$> BSL.readFile fp >>= \case
Left err -> error $ "BioInf.MutationOrder.RNA.fromFile: " ++ err
Right ls -> return ls
fromFileJSON :: FilePath -> IO Landscape
fromFileJSON fp = (DA.eitherDecode' . decompress) <$> BSL.readFile fp >>= \case
Left err -> error $ "BioInf.MutationOrder.RNA.fromFile: " ++ err
Right ls -> return ls
data QLine = QLine
{ qlSequence :: ByteString
, qlmfe :: (ByteString,Double)
, qlensemble :: (ByteString,Double)
, qlcentroid :: (ByteString,Double)
}
deriving (Show)
qlines f = do
ls <- BS.lines <$> BS.readFile f
return $ qlhm $ go ls
where go [] = []
go ls = let (hs,ts) = splitAt 5 ls
in parseql hs : go ts
parseql [s,m,e,c,_] =
QLine s
(stupid m)
(stupid e)
(stupid c)
stupid bs = let (h:ts) = BS.words bs
r = BS.dropWhile (\c -> not $ isDigit c || c=='-') $ BS.unwords ts
in (h, read $ BS.unpack $ BS.takeWhile (\c -> isDigit c || c =='-' || c=='.') r)
qlhm xs = HM.fromList $ map (\q -> (qlSequence q, q)) xs