-- -- A parser for Infernal's Covariance Models (CM). Should work with version 1.0 -- of Infernal. -- -- TODO use generics? -- -- TODO importing the cmcalibrate calibration data is currently broken! (data is parsed as separate lines) module Biobase.Infernal.CM.Import (fromFile,fromString) where import Data.Maybe (fromJust) import qualified Data.Array.IArray as A import qualified Data.List as L import Text.ParserCombinators.Parsec hiding (State) import Biobase.Infernal.CM import Biobase.RNA hiding (nucE) -- {{{ Parsec myChar = alphaNum <|> char '-' <|> char '_' <|> char '.' headerChar = myChar <|> char '[' <|> char ']' floating = many1 $ digit <|> char '.' <|> char 'e' <|> char '-' headerLine = do key <- many1 headerChar --spaces many1 $ char ' ' val <- many1 (headerChar <|> char ' ' <|> char '/' <|> char ':') newline return (key,val) unknownNumbers = do many1 $ char ' ' ns <- floating `sepEndBy` (many1 $ char ' ') newline return ("",concat ns) theHeader = do hs <- many $ (try headerLine <|> try unknownNumbers) string "MODEL:" newline return hs -- }}} -- {{{ Node node = do spaces string "[ " name <- many letter spaces num <- many digit >>= (return . read) spaces string "]" newline states <- many $ try (state num) return $ (Node { ntype = read name , nid = num , nstates = map sid states , nparents = [] , nchildren = [] , ntag = () } , states) -- }}} -- {{{ stuff -- TODO put some of this into HsTools/Parsec stuff aNum = many1 $ char '-' <|> digit addneginf :: String -> Double addneginf "*" = (-1)/0 --"-1000000000.0" addneginf x = read x -- }}} -- {{{ State state nodeID = do spaces name <- many1 myChar spaces sid <- aNum >>= (return . read) spaces plast <- aNum >>= (return . read) spaces pnum <- aNum >>= (return . read) spaces cfirst <- aNum >>= (return . read) spaces cnum <- aNum >>= (return . read) probs <- ( (many $ myChar <|> char '*') `sepBy` (many1 $ char ' ') >>= return . map addneginf . filter ((/=) "") ) newline let s = case name of "B" -> State { stype = read name , sid = sid , snode = nodeID , sparents = [plast-pnum+1 .. plast] , schildren = [Branch cfirst, Branch cnum] , semission = [] , stag = () } _ -> State { stype = read name , sid = sid , snode = nodeID , sparents = [plast-pnum+1 .. plast] , schildren = zipWith Transition [cfirst .. cfirst+cnum-1] probs , semission = let keep = drop cnum probs in case length keep of 0 -> [] 4 -> zipWith EmitS acgu keep 16 -> zipWith (\(k1,k2) v -> EmitP k1 k2 v) acguPairs keep _ -> error $ "strange number of probabilities" ++ show (keep) , stag = () } return s -- }}} -- {{{ Model models = do ms <- many model eof return ms model = do h <- theHeader ns <- many node let states = concatMap snd ns let nodes = map (addPCinfo states . fst) ns -- just add all the node parent / child info string "//" newline -- eof -- removed, we want to be able to read concatenated models! return $ CM { nodes = A.array (0, length nodes -1) $ zip (map nid nodes) nodes , states = A.array (0, length states -1) $ zip (map sid states) states , header = h , localBegin = A.array (0, length states -1) $ zip [0 .. length states -1] (0.0 : repeat (-1/0)) , localEnd = A.array (0, length states -1) $ zip [0 .. length states -1] (repeat (-1/0)) , cmType = CMScore , nullModel = A.array (nucA,nucU) $ zip acgu (map read . words . fromJust $ "NULL" `lookup` h) -- TODO circumvents the whole parsing stuff! } -- }}} -- {{{ Stuff addPCinfo states n = let s = nstates n sp = L.nub $ L.sort $ concatMap (sparents . (states !!)) s sc = L.nub $ L.sort $ concatMap (transitionTargets . schildren . (states !!)) s np = (L.nub $ L.sort $ map (snode . (states !!)) sp) L.\\ [nid n] nc = (L.nub $ L.sort $ map (snode . (states !!)) sc) L.\\ [nid n] in n {nparents = np, nchildren = nc} -- | Two types of parsing, once using a file and once by parsing a string. fromFile f = parseFromFile models f fromString s = parse models "(stdin)" s -- }}} -- | Helper function to remove impossible state transitions (those that have -- -infty score). -- TODO move to InfernalCM.hs and have it for Prob and Score both {- canonize cm = cm {states = A.amap f $ states cm} where f s = s {schildren = filter g $ schildren s} g (Branch _) = True g (Transition _ v) = h (_,s) | s == (-1)/0 = False | otherwise = True -}