-- | Imports a mafft file. The mafft file may be gzip-compressed, as long as -- the filename contains a ".gz" suffix. Files without ".gz" suffix are treated -- as uncrompressed. module Biobase.DataSource.Mafft.Import ( fromFile ) where import qualified Codec.Compression.GZip as GZip import System.FilePath (takeExtension) import qualified Data.ByteString.Lazy.Char8 as BS import Text.Parsec hiding (many,(<|>)) import Text.Parsec.ByteString.Lazy import Control.Applicative import Biobase.DataSource.Mafft test = fromFile "tba.maf.gz" -- | Import 'Mafft' from file. fromFile :: FilePath -> IO (Either String Mafft) fromFile fname = do content <- if (takeExtension fname == ".gz") then GZip.decompress `fmap` BS.readFile fname else BS.readFile fname res <- return $ runP pFile () fname content case res of Left err -> return . Left $ show err Right m -> return . Right $ m -- | Parses one comment line. pComment :: GenParser Char st BS.ByteString pComment = BS.pack <$ char '#' <*> manyTill anyChar newline -- | Parses many comment lines. pComments :: GenParser Char st [BS.ByteString] pComments = many pComment -- | Parses one complete 'Block'. pBlock :: GenParser Char st Block pBlock = Block <$> pAli <*> many1 pSequence where pAli = BS.pack <$ char 'a' <*> manyTill anyChar newline "expecting alignment (score)" -- | Parses many 'Block's. pBlocks :: GenParser Char st [Block] pBlocks = (try $ pBlock <* newline) `manyTill` (string "##eof maf" <* spaces <* eof) -- | The Mafft file parser. pFile :: GenParser Char st Mafft pFile = Mafft <$> pComments <*> pBlocks -- | Parses one 'Sequence' line. pSequence :: GenParser Char st Sequence pSequence = Sequence <$ string "s " <*> sname <*> sstart <*> slength <*> sstrand <*> sgenomesize <*> salisequence where sname = BS.pack <$> manyTill alphaNum (many1 space) sstart = read <$> number <* spaces slength = read <$> number <* spaces sstrand = PlusMinus <$> (char '+' <|> char '-') <* spaces sgenomesize = read <$> number <* spaces salisequence = CompressedBS . GZip.compress . BS.pack <$> manyTill anyChar newline -- | Parses an integral number. number = many1 digit