{-# LANGUAGE NoMonomorphismRestriction #-} -- | Turner file parser. Returns a Turner2004 data structure. We store data in -- the same way it is stored in the ViennaRNA package. Pairs are tuples -- however. -- -- How is 'stack' data stored: -- -- AX -- UY -> ((A,U),(Y,X)) -- -- How 'iloop1x1' is stored: -- -- X -- A G -- U C -> ((A,U),(C,G),X,Y) -- Y -- -- Now 'iloop1x2' is stored: -- -- X -- A G -- U C -> ((A,U),(C,G),X,C,Y), single (X) first, then 5' to 3' -- YC -- -- 'iloop2x2' is stored: -- -- XY -- A G -- U C -> ((A,U),(C,G),X,Y,y,x), X-->Y then x<--y -- xy -- --TODO not sure if dangle3/dangle5 are correctly split or if they should switch module Biobase.DataSource.Turner.Import ( parseTurner ) where import Control.Applicative import Control.Arrow (first) import Control.Monad import Data.Either import Data.List.Split (splitEvery) import Data.List (transpose) import qualified Data.Map as M import System.FilePath.Find import Text.Parsec hiding ((<|>), many) import Text.Parsec.String import Biobase.RNA import Data.Ix.Tuple import Data.PrimitiveArray import Data.PrimitiveArray.Ix import Text.Parsec.Numbers import Biobase.DataSource.Turner.Tables -- * Associate every file with a parse. minp = (nucA,nucA) maxp = (nucU,nucU) minpb = (minp,nucA) maxpb = (maxp,nucU) minpp = (minp,minp) maxpp = (maxp,maxp) minppbb = (minp,minp,nucA,nucA) maxppbb = (maxp,maxp,nucU,nucU) minppbbb = (minp,minp,nucA,nucA,nucA) maxppbbb = (maxp,maxp,nucU,nucU,nucU) minppbbbb = (minp,minp,nucA,nucA,nucA,nucA) maxppbbbb = (maxp,maxp,nucU,nucU,nucU,nucU) minpbb = (minp,nucA,nucA) maxpbb = (maxp,nucU,nucU) -- (4,3) switched for vienna rna compatibility keyspp = [((k1,k2),(k4,k3)) | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu] keyspb = [((k1,k2),k3) | k1 <- acgu, k2 <- acgu, k3 <- acgu] keyspbb = [((k1,k2),k3,k4) | k1 <- acgu, k3 <- acgu, k2 <- acgu, k4 <- acgu] plist11 = [(nucA,nucU),(nucC,nucG),(nucG,nucC),(nucU,nucA),(nucG,nucU),(nucU,nucG)] plist22 = [(nucA,nucU),(nucC,nucG),(nucG,nucC),(nucG,nucU),(nucU,nucA),(nucU,nucG)] -- (4,3) switched for vienna rna compatibility keysppbb = [((k1,k2),(k4,k3),k5,k6) | (k1,k2) <- plist11, k5 <- acgu, (k3,k4) <- plist11, k6 <- acgu] -- (4,3) switched for vienna rna compatibility keysppbbb = [((k1,k2),(k4,k3),k5,k6,k7) | (k1,k2) <- plist11, k6 <- acgu, k5 <- acgu, (k3,k4) <- plist11, k7 <- acgu] -- (4,3) switched for vienna rna compatibility, 5786 is 5'3' order top, bottom! keysppbbbb = [((k1,k2),(k4,k3),k5,k6,k7,k8) | (k1,k2) <- plist22, (k3,k4) <- plist22, k5 <- acgu, k8 <- acgu, k6 <- acgu, k7 <- acgu] z = 999999 -- | Given the base dir and a suffix (.dat/.dh most likely), parse the relevant -- files. The prefix can be used for, eg., dna file parseTurner prefix basedir fsuffix = do vstack <- grabB basedir fsuffix $ prefix ++ "stack" vdangles <- grabB basedir fsuffix $ prefix ++ "dangle" vlengths <- grabB basedir fsuffix $ prefix ++ "loop" vhairmm <- grabB basedir fsuffix $ prefix ++ "tstackh" viloopmm <- grabB basedir fsuffix $ prefix ++ "tstacki" viloop23mm <- grabB basedir fsuffix $ prefix ++ "tstacki23" viloop1nmm <- grabB basedir fsuffix $ prefix ++ "tstacki1n" vmultimm <- grabB basedir fsuffix $ prefix ++ "tstackm" vextmm <- grabB basedir fsuffix $ prefix ++ "tstack" viloop11 <- grabB basedir fsuffix $ prefix ++ "int11" viloop12 <- grabB basedir fsuffix $ prefix ++ "int21" viloop22 <- grabB basedir fsuffix $ prefix ++ "int22" vlookups <- fmap (map (first (map mkNuc)) . concat) $ mapM (\f -> grabA basedir fsuffix $ prefix ++ f) ["triloop","tloop","hexaloop"] vmisc <- grabB basedir fsuffix $ prefix ++ "miscloop" let (vdangle3,vdangle5) = splitAt (length vdangles `div` 2) vdangles let (_:viloopl:vbulgel:vhairpinl:[]) = transpose $ splitEvery 4 vlengths return $ Turner2004 { stack = fromAssocs minpp maxpp z $ zip keyspp vstack , dangle3 = fromAssocs minpb maxpb z $ zip keyspb vdangle3 , dangle5 = fromAssocs minpb maxpb z $ zip keyspb vdangle5 , hairpinL = fromAssocs 0 30 z $ zip [1..] vhairpinl , bulgeL = fromAssocs 0 30 z $ zip [1..] vbulgel , iloopL = fromAssocs 0 30 z $ zip [1..] viloopl , hairpinMM = fromAssocs minpbb maxpbb z $ zip keyspbb vhairmm , iloopMM = fromAssocs minpbb maxpbb z $ zip keyspbb viloopmm , iloop2x3MM = fromAssocs minpbb maxpbb z $ zip keyspbb viloop23mm , iloop1xnMM = fromAssocs minpbb maxpbb z $ zip keyspbb viloop1nmm , multiMM = fromAssocs minpbb maxpbb z $ zip keyspbb vmultimm , extMM = fromAssocs minpbb maxpbb z $ zip keyspbb vextmm , hairpinLookup = M.fromList vlookups , iloop1x1 = fromAssocs minppbb maxppbb z $ zip keysppbb viloop11 , iloop1x2 = fromAssocs minppbbb maxppbbb z $ zip keysppbbb viloop12 , iloop2x2 = fromAssocs minppbbbb maxppbbbb z $ zip keysppbbbb viloop22 , ninio = vmisc !! 2 , maxNinio = vmisc !! 1 , multiOffset = vmisc !! 6 , multiNuc = vmisc !! 7 , multiHelix = vmisc !! 8 , largeLoop = vmisc !! 0 , termAU = vmisc !! 14 , intermolecularInit = vmisc !! 19 } grabB basedir fsuffix fname = do p <- parseFromFile pBlockFile (basedir ++ "/" ++ fname ++ fsuffix) case p of (Right ans) -> return ans (Left err) -> error $ show err grabA basedir fsuffix fname = do p <- parseFromFile pAssocFile (basedir ++ "/" ++ fname ++ fsuffix) case p of (Right ans) -> return ans (Left err) -> error $ show err -- * File parser: We do not want to understand the format, just extract all -- data. -- | Blocks of data. pBlockFile = concat <$> many (try numberRow <|> infoRow) <* eof where -- a line with a least one number numberRow = many (char ' ') *> (number <* notFollowedBy (char '\'')) `sepEndBy1` (many $ char ' ') <* newline -- a number is a dot or some extended floating point number number = (try dotFloat) <|> (999999 <$ char '.') <|> parseExtFloat "floating point number" -- | A File with associations. pAssocFile = concat <$> many1 (try assocRow <|> infoRow) where -- one assocs of string, value assocRow = (\a b -> [(a,b)]) <$ many (char ' ') <*> many1 (oneOf nuc) <* many (char ' ') <*> parseExtFloat <* many (char ' ') -- a line with crap infoRow = [] <$ anyChar `manyTill` newline -- a rather stupid float dotFloat = (\s a b -> read $ s:'0':a:b) <$> (char '-' <|> pure ' ') <*> char '.' <*> many1 digit nuc = "ACGUT"