-- | Read a clustering in Unigene format.
--
-- Each cluster is introduced by a header line starting with @#@
-- (i.e. matching @^#.*$@); the lines that follow, up to the next header,
-- are handed to 'mkSeqs' as the members of that cluster.
module Unigene (ugRead) where

import Control.Monad
import System.IO
import qualified Data.ByteString.Lazy.Char8 as B
import Bio.Sequence (Sequence(..), Nuc, Unknown, castToNuc)
import Bio.Sequence.Fasta (mkSeqs)
import Data.List (groupBy)

-- | Read the clusters stored in the given file.
--
-- Leading blank lines are skipped. If the first remaining line does not
-- start with @#@ (or there is no such line at all, e.g. the file is empty),
-- a warning is written to 'stderr' and parsing continues anyway.
--
-- Note: the file is read with the lazy 'B.readFile', so its contents are
-- pulled in on demand as the result is consumed.
ugRead :: FilePath -> IO [[Sequence Nuc]]
ugRead f = do
  s <- B.readFile f
  let ls = dropWhile B.null $ B.lines s
  -- Pattern match instead of 'head' so that an empty file yields a warning
  -- and an empty result rather than a "Prelude.head: empty list" crash.
  unless (looksLikeUnigene ls) $
    hPutStrLn stderr ("'" ++ f ++ "' does not look like Unigene format")
  return $ map (map castToNuc) $ clusters ls
  where
    looksLikeUnigene (l:_) = isHeader l
    looksLikeUnigene []    = False

-- | Split the input lines into clusters.
--
-- If the first line is a header, empty lines are dropped, the remaining
-- lines are grouped so that every header starts a new group, and each
-- group (minus its header) is passed to 'mkSeqs'. Otherwise all lines are
-- passed to 'mkSeqs' unchanged as a single cluster.
clusters :: [B.ByteString] -> [[Sequence Unknown]]
clusters [] = []
clusters ls@(l:_)
  | isHeader l =
      -- Every group produced by 'groupBy' is non-empty and begins with its
      -- header line, which 'drop 1' removes.
      map (mkSeqs . drop 1)
        $ groupBy (\_ x -> not (isHeader x))
        $ filter (not . B.null) ls
  | otherwise = [mkSeqs ls]

-- | True when the line starts with @#@. Total: an empty line is not a header.
isHeader :: B.ByteString -> Bool
isHeader l = case B.uncons l of
  Just ('#', _) -> True
  _             -> False