module Data.Fasta.ByteString.Lazy.Parse ( parsecFasta
, parsecCLIPFasta
, attoFasta
, attoCLIPFasta
, pipesFasta
, pipesCLIPFasta
, removeNs
, removeN
, removeCLIPNs ) where
import Data.Char
import Text.Parsec
import Text.Parsec.ByteString.Lazy
import qualified Data.Map.Strict as Map
import qualified Data.ByteString as BW
import qualified Data.ByteString.Char8 as SB
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Control.Applicative as CA
import Control.Monad (void)
import qualified Data.Attoparsec.ByteString.Char8 as A
import Pipes
import qualified Pipes.Prelude as P
import qualified Pipes.ByteString as PB
import qualified Pipes.Group as PG
import qualified Pipes.Attoparsec as PA
import Control.Lens (view)
import qualified Control.Foldl as FL
import Data.Fasta.ByteString.Lazy.Types
eol :: Parsec B.ByteString u String
eol = choice . map (try . string) $ ["\n\r", "\r\n", "\n", "\r"]
eoe :: Parsec B.ByteString u ()
eoe = do
lookAhead (void $ char '>') <|> eof
fasta :: Parsec B.ByteString u FastaSequence
fasta = do
spaces
char '>'
header <- manyTill (satisfy (/= '>')) eol
fseq <- manyTill anyChar eoe
return (FastaSequence { fastaHeader = B.pack header
, fastaSeq = B.pack . removeWhitespace $ fseq } )
where
removeWhitespace = filter (`notElem` ("\n\r " :: String))
fastaFile :: Parsec B.ByteString u [FastaSequence]
fastaFile = do
spaces
many fasta
fastaCLIP :: Parsec B.ByteString u (FastaSequence, [FastaSequence])
fastaCLIP = do
spaces
char '>'
germline <- fasta
clones <- many $ try fasta
return (germline, clones)
fastaCLIPFile :: Parsec B.ByteString u [(FastaSequence, [FastaSequence])]
fastaCLIPFile = do
spaces
many fastaCLIP
parsecFasta :: B.ByteString -> [FastaSequence]
parsecFasta = eToV . parse fastaFile "error"
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
parsecCLIPFasta :: B.ByteString -> CloneMap
parsecCLIPFasta = Map.fromList
. map (\(!x, (!y, !z)) -> ((x, y), z))
. zip [0..]
. eToV
. parse fastaCLIPFile "error"
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
anyButSpace :: A.Parser Char
anyButSpace = do
A.skipSpace
x <- A.anyChar
A.skipSpace
return x
fasta' :: A.Parser FastaSequence
fasta' = do
header <- A.takeWhile (\x -> x /= '\n' && x /= '\r')
A.endOfLine
fseq <- A.manyTill anyButSpace (void (A.char '>') CA.<|> A.endOfInput)
return FastaSequence { fastaHeader = B.fromStrict header
, fastaSeq = B.pack fseq }
fastaFile' :: A.Parser [FastaSequence]
fastaFile' = do
A.skipSpace
A.char '>'
A.many' fasta'
fastaCLIP' :: A.Parser FastaSequence
fastaCLIP' = do
header <- A.takeWhile (\x -> x /= '\n' && x /= '\r')
A.endOfLine
fseq <- A.manyTill anyButSpace (void (A.char '>') CA.<|> A.endOfInput)
return FastaSequence { fastaHeader = B.fromStrict header
, fastaSeq = B.pack fseq }
clone' :: A.Parser (Germline, [FastaSequence])
clone' = do
A.skipSpace
germline <- fastaCLIP'
fseqs <- A.manyTill fasta' (void (A.char '>') CA.<|> A.endOfInput)
return (germline, fseqs)
fastaCLIPFile' :: A.Parser [(Germline, [FastaSequence])]
fastaCLIPFile' = do
A.skipSpace
A.string ">>"
A.many' clone'
attoFasta :: BW.ByteString -> [FastaSequence]
attoFasta = eToV . A.parseOnly fastaFile'
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
attoCLIPFasta :: BW.ByteString -> [(Germline, [FastaSequence])]
attoCLIPFasta = eToV . A.parseOnly fastaCLIPFile'
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
pipesFasta :: (MonadIO m)
=> Producer SB.ByteString m ()
-> Producer FastaSequence m ()
pipesFasta p = FL.purely
PG.folds
FL.mconcat
( view (PB.splits (fromIntegral $ ord '>'))
. PB.drop (1 :: Int)
$ p )
>-> P.map toFasta
where
toFasta x = FastaSequence { fastaHeader = B.fromChunks
. take 1
. SB.lines
$ x
, fastaSeq = B.fromChunks
. tail
. SB.lines
$ x }
pipesCLIPFasta :: (MonadIO m)
=> Producer BW.ByteString m ()
-> Producer (Germline, [FastaSequence]) m (Either (PA.ParsingError, Producer BW.ByteString m ()) ())
pipesCLIPFasta = PA.parsed clone'
. PB.drop 2
. PB.dropWhile (`BW.elem` "\n\r\t ")
removeNs :: [FastaSequence] -> [FastaSequence]
removeNs = map (\x -> x { fastaSeq = noN . fastaSeq $ x })
where
noN = B.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')
removeN :: FastaSequence -> FastaSequence
removeN x = x { fastaSeq = noN . fastaSeq $ x }
where
noN = B.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')
removeCLIPNs :: CloneMap -> CloneMap
removeCLIPNs = Map.fromList . map remove . Map.toList
where
remove ((!x, !y), !z) = ((x, newSeq y), map newSeq z)
newSeq !x = x { fastaSeq = noN . fastaSeq $ x }
noN = B.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')