{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NoMonomorphismRestriction #-} -- | The importer for MAF files. -- -- The importer is in enumerator form. In case of large alignments, move -- computations into an iteratee. module Biobase.MAF.Import where import Control.Arrow import Control.Monad import Data.ByteString.Char8 as BS import Data.Either as E import Data.Iteratee as I import Data.Iteratee.Char as C import Data.Iteratee.IO import Data.Iteratee.ListLike as LL import Data.List as L import Data.Map as M import Data.Maybe import Prelude as P import Biobase.MAF -- | Transforms a ByteString stream into a stream of "Either MAF Alignment". eneeMAF :: (Monad m) => Enumeratee BS.ByteString [Either MAF Alignment] m a eneeMAF = enumLinesBS ><> convStream f where f = icont step Nothing step (Chunk []) = f step (Chunk xs@(x:_)) | BS.isPrefixOf "#" x = icont (mh xs) Nothing | otherwise = icont (md xs) Nothing step str = idone [] str -- create the header mh xs (Chunk ys) | P.null ts = icont (mh hs) Nothing | otherwise = idone [mkMAF hs] (Chunk ts) where (hs,ts) = P.span (BS.isPrefixOf "#") $ xs++ys mh xs str = idone [] str -- TODO error if xs not null? -- create each data block md xs (Chunk ys) | P.length zs == 1 = icont (md (xs++ys)) Nothing | otherwise = idone (P.map mkAlignment $ P.init zs) (Chunk $ P.last zs) where zs = L.groupBy (\a b -> not $ BS.isPrefixOf "a" b) . L.filter (not . BS.isPrefixOf "#") . L.filter (not . BS.null) $ xs++ys md xs str = idone [mkAlignment xs] str mkMAF [] = error "eneeMAF: empty stream or no header" mkMAF xs = Left $ MAF (mkKVs $ P.head xs) (P.map BS.copy $ P.tail xs) [] [] mkAlignment xs = Right $ Alignment (mkKVs $ P.head xs) (P.map mkAligned $ P.tail xs) mkKVs = M.fromList . P.map mkKV . P.drop 1 . BS.words mkKV = (BS.copy *** BS.copy . BS.drop 1) . BS.span (/='=') mkAligned x | P.length ws == 7 = Aligned { key = BS.copy $ ws!!1 , start = read . BS.unpack $ ws!!2 , length = read . BS.unpack $ ws!!3 , strand = BS.head $ ws!!4 , genomesize = read . BS.unpack $ ws!!5 , value = BS.copy $ ws!!6 } | otherwise = error $ "couldn't mkAligned from string: " ++ BS.unpack x where ws = BS.words x -- | Very simple stream transformer that will remove the payload from alignment -- blocks. The positional information is keps, only the aligned sequence data -- is removed. For applications which do not requice the aligned sequences, -- this can yield enormous space improvements. eneeRemovePayload = mapStream f where f l@(Left _) = l f (Right x) = Right $ x{sequences = P.map g $ sequences x} g a@Aligned{..} = a{value = BS.empty} -- | Simple iteratee that builds a complete MAF out of a stream of "Either MAF -- Alignment" iMAF = do Left maf <- LL.head xs <- stream2list return maf{blocks = rights xs}