-- | Functions to deal with a (large) RNA sequence data base and corresponding -- RNAfold foldings. module BioInf.MutationOrder.SequenceDB where import Codec.Compression.GZip (compress,decompress) import Control.Error import Control.Monad.Except import Control.Monad.IO.Class (liftIO, MonadIO) import Data.ByteString.Char8 (ByteString) import Data.Char (isDigit) import Data.Hashable import Debug.Trace import GHC.Generics import qualified Data.Attoparsec.ByteString.Char8 as AC import qualified Data.Attoparsec.ByteString.Lazy as A import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as BSL import qualified Data.Char as C import System.Directory (doesFileExist, getDirectoryContents) import System.FilePath ((), (<.>)) import System.IO.Unsafe (unsafeInterleaveIO) import qualified System.FilePath.Find as FP type PrefixLen = Int type SeqsPerFile = Int -- * Plain sequences, with no folding yet -- | Writes sequence files out to disk. Does *not* check if duplicates are -- written. The user should do this externally via @sort@ and @uniq@. The next -- free prefix is chosen. writeSequenceFiles ∷ (MonadIO m) ⇒ FilePath -- ^ Directory to write to. → PrefixLen -- ^ Number of characters in the prefix to count up. Fail if the prefix space -- is too small. → SeqsPerFile -- ^ How many sequences to write per file → [ByteString] -- ^ Sequences to write out. → ExceptT String m () writeSequenceFiles fp pfx spf xs = do -- cull prefix list to unused prefixes only let unused [] = throwE "writeSequenceFiles: prefix space empty" unused (x:xs) = do dfe ← liftIO . doesFileExist $ fp x <.> "gz" if dfe then unused xs else return $ x:xs -- will write out file names to unused prefixes. let go ps [] = return () go ps ys = do let (here,there) = splitAt spf ys (p:qs) ← unused ps -- write to compressed file @p@ let fname = fp p <.> "gz" liftIO . BSL.writeFile fname . compress . BSL.unlines $ map BSL.fromStrict here go qs there let pfxs = sequence $ replicate pfx ['a'..'z'] go pfxs xs -- | Reads all sequences. -- -- TODO make sure to read lazily enough that collection happens without first -- reading in the whole list. readSequenceFiles ∷ FilePath → ExceptT () IO [ByteString] readSequenceFiles fp = do return undefined -- * Structural information from RNAfold -- | Space-efficient RNAfold structure data RNAfoldResult = RNAfoldResult { rnaFoldSequence ∷ !ByteString , rnaFoldMFEStruc ∷ !ByteString , rnaFoldMFEEner ∷ !Double , rnaFoldMfeFrequency ∷ !Double , rnaFoldEnsembleStruc ∷ !ByteString -- ^ uses special syntax with unpaired, weakly paired, somewhat paired, -- somewhat paired up or down, strongly paired up or down for the ensemble , rnaFoldEnsembleEner ∷ !Double -- ^ this *is* the ensemble free energy , rnaFoldCentroidStruc ∷ !ByteString , rnaFoldCentroidEner ∷ !Double , rnaFoldDiversity ∷ !Double } deriving (Read,Show,Eq,Ord,Generic) -- | Lazily read @RNAfold@ structures. -- -- TODO use @pipes/machines@! we need lazy reading of files and live in an exceptt transformer stack! -- TODO generalize transformer stack readRNAfoldFiles ∷ FilePath → IO [RNAfoldResult] readRNAfoldFiles fp = do let go [] = return [] go (f:fs) = do bs ← unsafeInterleaveIO (putStrLn ("#" ++ f) >> decompress <$> BSL.readFile f) let rs = either error id $ runExcept (bslToRNAfoldResult bs) rss ← go fs return $ rs ++ rss go ∷ [FilePath] → IO [RNAfoldResult] FP.find FP.always (FP.extension FP.==? ".gz") (fp "structures") >>= go {-# NoInline readRNAfoldFiles #-} -- | bslToRNAfoldResult ∷ (Monad m) ⇒ BSL.ByteString → ExceptT String m [RNAfoldResult] bslToRNAfoldResult bs = do case A.eitherResult $ A.parse pRNAfold bs of Left e → throwE e Right r → return r {-# Inline bslToRNAfoldResult #-} -- | -- -- @ -- echo "CCCAAAGGG\nCCCAAAGGG" | ./RNAfold -p -- CCCAAAGGG -- (((...))) ( -1.20) -- (((...))) [ -1.41] -- (((...))) { -1.20 d=1.06} -- frequency of mfe structure in ensemble 0.707288; ensemble diversity 1.67 -- CCCAAAGGG -- (((...))) ( -1.20) -- (((...))) [ -1.41] -- (((...))) { -1.20 d=1.06} -- frequency of mfe structure in ensemble 0.707288; ensemble diversity 1.67 -- @ pRNAfold ∷ A.Parser [RNAfoldResult] pRNAfold = A.many1' go <* A.endOfInput where go = do -- 1. sequence rnaFoldSequence ← BS.copy <$> AC.takeWhile AC.isAlpha_ascii <* AC.skipSpace A. "RNAfold sequence" -- 2. mfe rnaFoldMFEStruc ← BS.copy <$> AC.takeTill AC.isSpace <* AC.skipSpace A. "RNAfold MFE structure" rnaFoldMFEEner ← AC.char '(' *> AC.skipSpace *> AC.double <* AC.char ')' <* AC.skipSpace A. "RNAfold MFE energy" -- 3. ensemble rnaFoldEnsembleStruc ← BS.copy <$> AC.takeTill AC.isSpace <* AC.skipSpace rnaFoldEnsembleEner ← AC.char '[' *> AC.skipSpace *> AC.double <* AC.char ']' <* AC.skipSpace -- 4. centroid rnaFoldCentroidStruc ← BS.copy <$> AC.takeTill AC.isSpace <* AC.skipSpace rnaFoldCentroidEner ← AC.char '{' *> AC.skipSpace *> AC.double <* AC.skipSpace dequal ← AC.string "d=" *> AC.double <* AC.char '}' <* AC.skipSpace -- 5.mfe frequency and diversity AC.string "frequency of mfe structure in ensemble" *> AC.skipSpace A. "frequency" rnaFoldMfeFrequency ← AC.double AC.string "; ensemble diversity" *> AC.skipSpace rnaFoldDiversity ← AC.double AC.skipSpace return RNAfoldResult{..} {-# Inline pRNAfold #-}