{-# LANGUAGE RecordWildCards #-} -- | Imports a dot-parenthesis file. -- -- TODO This importer is not unicode-safe. It silently filters out all -- sequence/structure pairs with unequal lengths. This is work for later. module Biobase.DataSource.DotParen.Import ( fromFile , fromFileOrFail ) where import Control.Applicative import Control.Monad (when) import Data.Either.Unwrap (isLeft,fromLeft,fromRight) import qualified Data.ByteString.Lazy.Char8 as BS import System.Exit (exitFailure) import Text.Parsec.ByteString.Lazy import Text.Parsec hiding (many,(<|>)) import Data.List (findIndices) import Biobase.DataSource.DotParen -- | Import a set of sequence/structure pairs from a file. -- -- TODO improve error handling fromFile :: FilePath -> IO (Either String DotParens) fromFile fname = do res <- parseFromFile pFile fname check res where check res | Left err <- res = return $ Left (show err) {- | Right ls <- res , xs@(x:_) <- findIndices unEqualLengths ls = return . Left $ "unequal sequence / structure length for: " ++ show (map (ls!!) xs) -} | Right ls <- res = return . Right . filter (not . unEqualLengths) $ ls unEqualLengths DotParen{..} = BS.length sequence /= BS.length dotparen -- | Read RnaStrand data or fail fromFileOrFail fname = do xs' <- fromFile fname when (isLeft xs') $ do print "error reading source:" putStrLn $ fromLeft xs' exitFailure return $ fromRight xs' -- * The parsers -- | parse a comment line pComment :: GenParser Char st BS.ByteString pComment = BS.pack <$ char '#' <*> manyTill anyChar newline -- | parse all comments pComments :: GenParser Char st [BS.ByteString] pComments = many pComment data DS = D BS.ByteString | S BS.ByteString isS (S _) = True isS _ = False fromDS (S x) = x fromDS (D x) = x -- | parse a sequence line pSequence :: GenParser Char st DS pSequence = (\c cs -> S $ BS.pack $ c:cs) <$> noneOf "#" <*> manyTill anyChar newline where -- | parse one dot-paren line pDotParen :: GenParser Char st DS pDotParen = D . BS.pack <$> manyTill dps newline where dps = oneOf "()[]<>{}." -- | parse either a dot-paren line or a sequence line pDS :: GenParser Char st DS pDS = choice [try pDotParen, pSequence] -- | Returns a pair of (sequence,dot-paren) pDSs :: GenParser Char st (BS.ByteString,BS.ByteString) pDSs = f <$> many1 pDS where f xs = let (ss,ds) = span isS xs in (BS.concat $ map fromDS ss,BS.concat $ map fromDS ds) -- | Parse one block pBlock :: GenParser Char st DotParen pBlock = (\cs (ss,ds) -> DotParen cs ss ds) <$> pComments <* many newline <*> pDSs -- | Parse many blocks pBlocks :: GenParser Char st DotParens pBlocks = many pBlock -- | Parse file pFile :: GenParser Char st DotParens pFile = pBlocks <* eof