-- | Parse a CT file. module Biobase.DataSource.CT.Import where import Control.Applicative import Text.Parsec.Char import Text.Parsec.Combinator hiding (optional) import Text.Parsec.Error import Text.Parsec hiding (many) import Text.Parsec hiding ((<|>), many, optional) import Text.Parsec.String import qualified Data.Array.IArray as A import Biobase.RNA import Biobase.RNA.Pairs import Biobase.DataSource.CT -- | Parse a CT file. fromFile :: FilePath -> IO (Either String CT) fromFile fname = do x <- parseFromFile ctP fname return $ case x of Left err -> Left $ show err Right res -> res -- * Parser commentP :: GenParser Char st String commentP = char '#' *> anyChar `manyTill` newline commentsP = many commentP lenlineP :: GenParser Char st (Int,String) lenlineP = (,) <$> snumP <*> anyChar `manyTill` newline -- | We parse six fields: position, nucleotide, next nuc, previous nuc, pairing -- partner and (position again). CT8 adds two additional fields (unknown) and -- (unknown). Fields in brackets () are not returned, just parsed. nucLineP :: GenParser Char st (Int,Nuc) nucLineP = f <$> snumP <*> scharP <*> snumP <*> snumP <*> snumP <* snumP <* choice [newline, snumP *> snumP *> newline] where f pos chr prv nxt partner = (pos, Nuc chr prv nxt partner) nucLinesP :: GenParser Char st [(Int,Nuc)] nucLinesP = many (try nucLineP) -- | Parse a CT file with some error handling. ctP :: GenParser Char st (Either String CT) ctP = f <$> commentsP <*> lenlineP <*> nucLinesP <* spaces <* eof where f cs (l,ll) ns | l /= length ns = Left $ "CT inscribed length of: " ++ show l ++ " doesn't match read entries in: " ++ unlines cs | otherwise = Right . CT cs ll $ A.array (1,length ns) ns scharP = spaces *> anyChar snumP :: GenParser Char st Int snumP = read <$ try spaces <*> numberP numberP = many1 digit