{-# LANGUAGE ScopedTypeVariables #-} -- | Importer for the NCM database. -- -- NOTE This version reads only known single/double-stranded NCMs. -- -- NOTE "infinity" is a large constant, given by RNA.Constants.eInf module Biobase.DataSource.MCFold.Import ( parseDir ) where import Control.Applicative import Control.Arrow (first, second) import Control.Monad (liftM) import Data.Either.Unwrap (fromRight) import Data.Maybe (fromJust) import qualified Data.ByteString as B import qualified Data.Map as M import qualified Data.Vector.Unboxed as VU import System.FilePath (()) import Text.Parsec.ByteString import Text.Parsec hiding (many, (<|>)) import Text.Parsec.Prim (parse) import Text.Printf import Text.Parsec.Numbers import Data.PrimitiveArray import Biobase.Constants import Biobase.DataSource.MCFold import Biobase.RNA import Biobase.RNA.Hashes import Biobase.RNA.NucBounds parseDir :: FilePath -> IO MotifDB parseDir dir = do let sList = map fst $ VU.toList knownSingleNCM let dList = map fst $ VU.toList knownDoubleNCM -- basic filenames let (spFiles :: [FilePath]) = map ((dir ) . printf "1_strand_%d.db") sList let (dsFiles :: [FilePath]) = map ((dir ) . uncurry (printf "2_strand_%d_%d.db")) dList -- parse cycle DB let parseWith parser suffix lst = liftM (zip lst) . mapM (liftM fromRight . parseFromFile parser . (++suffix)) sCycles <- parseWith parseCycle ".cycle" sList spFiles dCycles <- parseWith parseCycle ".cycle" dList dsFiles -- parse hinge DB: parses the original counted data sHinges <- parseWith parseHinge ".hinge" sList spFiles dHinges <- parseWith parseHinge ".hinge" dList dsFiles -- parse the DB with individual pair contributions, dropping the 5 integer-based values first pairScores <- liftM (fromAssocs minExtended maxExtended eInf . zip [(n1,n2)|n1<-eacgu,n2<-eacgu] . drop 5 . fromRight) . parseFromFile parsePairScores $ (dir "hinge.db") -- strangely, the data is in hinge.db -- parse contributions by individual junctions junctionScores <- liftM (fromList (0,0) (21,21) . fromRight) . parseFromFile parseJunction $ (dir "junction.db") return $ MotifDB { sCycles = map (second (toNCMArray pairScores)) sCycles , dCycles = map (second (toNCMArray pairScores)) dCycles , dsConnect = [ mkDSConnect junctionScores pairScores d s | d <- dHinges, s <- sHinges ] , ddConnect = [ mkDDConnect junctionScores pairScores d d | d <- dHinges, d <- dHinges ] , rawPairScores = pairScores , rawJunctionScores = junctionScores , rawSHingeCounts = sHinges , rawDHingeCounts = dHinges } -- | Create the connections array for a given doubleNCM known by (l,r) length -- and singleNCM. Such a connection is then only defined by the connecting -- pair. -- -- TODO check if connect calculates correctly! mkDSConnect :: JunctionArray -> PairScoreArray -> ((Int,Int),[RawHinge]) -> (Int,[RawHinge]) -> (((Int,Int),Int),ConnectArray) mkDSConnect junction pair ((lenLeft,lenRight),ds) (len,ss) = (((lenLeft,lenRight),len), fromAssocs l u z xs) where l = minExtended u = maxExtended z = eInf xs = [ ((n1,n2), connect n1 n2) | n1<-acgu, n2<-acgu ] dsMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ ds ssMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ ss connect n1 n2 = let common = [ va*vb | (ka,va) <- M.findWithDefault [] (p3,n1,n2) dsMap, (kb,vb) <- M.findWithDefault [] (p5,n1,n2) ssMap, ka==kb ] c = if null common then eInf else junction ! (((lenLeft,lenRight) `lookup` knownDoubleNCM),(len `lookup` knownSingleNCM)) -- junction score between two NCMs + (logScore $ sum common) in c lookup k = snd . fromJust . VU.find ((==k).fst) normalize xs | null xs = xs | otherwise = map (second (/total)) xs where total = sum $ map snd xs -- | Same as above. Some annoying code duplication. -- -- TODO check if connect calculates correctly mkDDConnect :: JunctionArray -> PairScoreArray -> ((Int,Int),[RawHinge]) -> ((Int,Int),[RawHinge]) -> (((Int,Int),(Int,Int)),ConnectArray) mkDDConnect junction pair ((lenLA,lenRA),dsA) ((lenLB,lenRB),dsB) = (((lenLA,lenRA),(lenLB,lenRB)), fromAssocs l u z xs) where l = minExtended u = maxExtended z = eInf xs = [ ((n1,n2), connect n1 n2) | n1<-acgu, n2<-acgu ] dsAMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ dsA dsBMap = M.map normalize . M.fromListWith (++) $ map (\((pnn,others),v) -> (pnn,[(others,v)])) $ dsB connect n1 n2 = let common = [ va*vb | (ka,va) <- M.findWithDefault [] (p3,n1,n2) dsAMap, (kb,vb) <- M.findWithDefault [] (p5,n1,n2) dsBMap, ka==kb ] c = if null common then eInf else junction ! (((lenLA,lenRA) `lookup` knownDoubleNCM),((lenLB,lenRB) `lookup` knownDoubleNCM)) -- junction score between two NCMs + (logScore $ sum common) in c lookup k = snd . fromJust . VU.find ((==k).fst) normalize xs | null xs = xs | otherwise = map (second (/total)) xs where total = sum $ map snd xs -- | Generates the array of all NCMs. We add the score for the closing pair -- (hinge.db) directly here. toNCMArray :: PairScoreArray -> [Cycle] -> PrimArray HashedPrimary Double toNCMArray pairScores xs | null xs = error "toNCMArray expects non-empty list of Cycles" | otherwise = fromAssocs l u z $ map f xs where lp = replicate (length . fst $ head xs) (minExtended :: Nucleotide) up = replicate (length . fst $ head xs) (maxExtended :: Nucleotide) l = mkci $ mkPrimary lp u = mkci $ mkPrimary up z = eInf mkci s = mkHashedPrimary (minExtended,maxExtended) s f (s,v) = let p = mkPrimary s pS = pairScores ! (VU.head p, VU.last p) in (mkci p,v + pS) -- * Parsers for different files -- | parser for junction.db parseJunction :: GenParser B.ByteString st [Double] parseJunction = many1 (doubles <* spaces) <* eof where doubles = try (const eInf <$> string "+inf") <|> floatX -- | parser for hinge.db Returns scores for individual pairs. parsePairScores :: GenParser B.ByteString st [Double] parsePairScores = (\a b -> a ++ b) <$> fl <*> ls where fl = many1 (read <$> many1 digit <* spaces) -- the 5 integers in the top line ls = many1 (doubles <* spaces) <* eof -- the 5x5 matrix of doubles doubles = try (const eInf <$> string "+inf") <|> floatX -- | parser for NCM database (*.cycle) parseCycle :: GenParser B.ByteString st [Cycle] parseCycle = many1 (cycle <* newline) <* eof where cycle :: GenParser B.ByteString st Cycle cycle = (\a b -> (a,b)) <$> many1 letter <* ws <*> floatX -- many ws characters ws = many $ char ' ' -- | helper function for the strange double values -- -- TODO move floatX into HsTools! floatX :: GenParser B.ByteString st Double floatX = toDouble <$> prefix <*> many digit <*> char '.' <*> many digit where prefix = char '+' <|> char '-' toDouble '+' ds1 dot ds2 = read $ ds1 ++ [dot] ++ ds2 toDouble '-' ds1 dot ds2 = read $ "-" ++ ds1 ++ [dot] ++ ds2 -- | new hinge parser parseHinge :: GenParser B.ByteString st [RawHinge] parseHinge = concat <$> many hinge <* eof where mkap "anti" = anti mkap "para" = para mktc "trans" = trans mktc "cis" = cis mksp x = case x of 'S' -> ssp 'H' -> hsp 'W' -> wsp 'B' -> bsp 'C' -> csp _ -> error $ "parseHinge: " ++ [x] hinge :: GenParser B.ByteString st [RawHinge] -- use monad format to check opening / closing equality hinge = do string "[ " n53 <- digit let primeType = if n53 == '5' then p5 else p3 string "' " b1 <- oneOf "ACGU" string " " b2 <- oneOf "ACGU" string " ]" newline hts <- many (try hingetype) return $ map (\((s1,s2,ap,tr),count) -> (((primeType,mkNuc b1,mkNuc b2),(s1,s2,ap,tr)),count)) hts hingetype :: GenParser B.ByteString st ((Slash,Slash,AntiPara,TransCis),Double) hingetype = mkht <$> hw <* space <*> (try (string "anti") <|> string "para") <* space <*> (try (string "trans") <|> string "cis") <* space <*> many1 digit <* newline mkht (a:_:b:[]) ap tc cnt = ((mksp a, mksp b, mkap ap, mktc tc), read cnt) hw = many1 (char '/' <|> letter) -- * Types type Cycle = (String,Double)