module Biobase.Fasta.Import where
import Control.Arrow (second)
import Control.Monad.IO.Class (liftIO, MonadIO (..))
import Control.Monad (unless)
import Data.ByteString (ByteString, breakByte, takeWhile, empty, null, uncons)
import Data.Char
import Data.Conduit as C
import Data.Conduit.Binary as C
import Data.Conduit.List as CL
import Prelude as P hiding (null)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import Bio.Core.Sequence (Offset(..))
import Biobase.Fasta
parseFastaWindows :: Monad m => Int -> Conduit ByteString m FastaWindow
parseFastaWindows wsize = parseEvents wsize =$= CL.concatMapAccum go Nothing where
go (Header i d) _ = (Just (0,i,d,""), [])
go (Data x) Nothing = (Just (0,"","",""), [FastaW "" "" 0 x ""])
go (Data x) (Just (k,i,d,p)) = (Just (k + (fromIntegral $ B.length x), i, d, x), [FastaW i d (Offset k) x p])
go Done _ = (Nothing, [])
renderFastaWindows :: Monad m => Int -> Conduit FastaWindow m ByteString
renderFastaWindows cols = CL.concatMapAccum go Nothing =$= renderEvents cols where
go fw Nothing = (Just (_identifier fw), [Header (_identifier fw) (_description fw), Data (_fasta fw)])
go fw (Just i) = if _identifier fw == i
then (Just i, [Data (_fasta fw)])
else go fw Nothing
data Event
= Header !ByteString !ByteString
| Data !ByteString
| Done
deriving (Eq,Show)
isHeader :: Event -> Bool
isHeader (Header _ _) = True
isHeader _ = False
parseEvents :: Monad m => Int -> GInfConduit ByteString m Event
parseEvents wsize = awaitE >>= either return goU where
loopU = awaitE >>= either finishU goU
loopH front = awaitE >>= either (finishH front) (goH front)
loopD k front = awaitE >>= either (finishD k front) (goD k front)
finishU r = yield Done >> return r
finishH front r = let final = front empty
in unless (null final) (yield . uncurry Header . second (B.drop 1) . breakByte 32 . B.drop 1 $ final) >> yield Done >> return r
finishD k front r = let final = front empty
in unless (null final) (yield $ Data final) >> yield Done >> return r
goU s = case BC.uncons s of
Just ('>', _) -> goH id s
Just _ -> goD 0 id s
Nothing -> loopU
goH sofar more = case uncons rpart of
Just (_, rpart') -> yield (uncurry Header . second (B.drop 1) . breakByte 32 . B.drop 1 $ sofar fpart) >> goU rpart'
Nothing -> loopH . B.append $ sofar more
where (fpart,rpart) = breakByte 10 more
goD k sofar more
| Just ('>',_) <- BC.uncons more = let final = sofar empty in unless (null final) (yield $ Data final) >> goU more
| otherwise = case uncons rpart of
Just (_, rpart') -> let k' = k + B.length fpart in case k' `compare` wsize of
LT -> goD k' (B.append $ sofar fpart) rpart'
EQ -> yield (Data $ sofar fpart) >> goU rpart'
GT -> let (lp,rp) = B.splitAt wsize $ sofar fpart in yield (Data lp) >> goD 0 id (B.append rp rpart)
Nothing -> let k' = k + B.length more in case k' `compare` wsize of
LT -> loopD k' . B.append $ sofar more
EQ -> yield (Data $ sofar more) >> loopU
GT -> let (lp,rp) = B.splitAt wsize $ sofar more in yield (Data lp) >> goD 0 id rp
where (fpart,rpart) = breakByte 10 more
renderEvents :: Monad m => Int -> Conduit Event m ByteString
renderEvents cols = CL.concatMap go =$= CL.map (`BC.snoc` '\n') where
go (Header i d) = [printHeader $ Header i d]
go (Data xs) = rows xs
go (Done) = []
rows xs = let (x,xs') = B.splitAt cols xs
in if B.length xs <= cols
then [xs]
else x : rows xs'
printHeader (Header i d) = BC.concat $ [">",i] ++ (if null d then [] else [" ", d])
test :: IO ()
test = do
let prnt (Header i d) = BC.putStr i >> BC.putStrLn d
prnt (Data d) = BC.putStrLn d
runResourceT $ sourceFile "big.fa" $= parseEvents 1000 $$ CL.foldM (\_ x -> liftIO $ prnt x) ()