{- |
   This is a meta-module importing and re-exporting sequence-related stuff.

   It encompasses the "Bio.Sequence.SeqData", "Bio.Sequence.Fasta",
   "Bio.Sequence.FastQ", "Bio.Sequence.Phd", "Bio.Sequence.TwoBit",
   "Bio.Sequence.HashWord" and "Bio.Sequence.Entropy" modules, and adds the
   generic, suffix-dispatching readers 'readNuc' and 'readProt'.
-}

module Bio.Sequence
    (
    -- * Data structures etc ("Bio.Sequence.SeqData")
      Sequence(..), Unknown, Offset, SeqData, Qual, QualData

    -- ** Accessor functions
    , seqlength, seqlabel, seqheader, seqdata, seqqual, (!)
    , appendHeader, setHeader

    -- ** Converting to and from String.
    , fromStr, toStr

    -- ** Nucleotide functionality.
    , compl, revcompl, revcompl', Nuc, castToNuc

    -- ** Protein sequence functionality
    , Amino(..), translate, fromIUPAC, toIUPAC, castToAmino

    -- ** Other utility functions
    , defragSeq, seqmap

    -- * File IO

    -- ** Generic sequence reading
    , readNuc, readProt

    -- ** The Fasta file format ("Bio.Sequence.Fasta")
    , readFasta, hReadFasta
    , writeFasta, hWriteFasta

    -- ** Quality data
    -- | Not part of the Fasta format, and treated separately.
    , readQual, writeQual, hWriteQual
    , readFastaQual
    , writeFastaQual, hWriteFastaQual

    -- ** The FastQ format ("Bio.Sequence.FastQ")
    -- | Combines sequence data and quality in one file.
    -- Warning: Solexa uses a different formula for the quality values!
    , readFastQ, writeFastQ, hReadFastQ, hWriteFastQ
    , readSangerQ, writeSangerQ, hReadSangerQ, hWriteSangerQ
    , readIllumina, writeIllumina, hReadIllumina, hWriteIllumina

    -- ** The phd file format ("Bio.Sequence.Phd")
    -- | These contain base (nucleotide) calling information,
    -- and are generated by @phred@.
    , readPhd, hReadPhd

    -- ** TwoBit file format support ("Bio.Sequence.TwoBit")
    -- | Used by @BLAT@ and related tools.
    , decode2Bit, read2Bit, hRead2Bit
    -- ,encode2Bit, write2Bit, hWrite2Bit

    -- * Hashing functionality ("Bio.Sequence.HashWord")
    -- | Packing words from sequences into integral data types
    , HashF (..)
    , contigous, rcontig, rcpacked

    -- * Entropy calculations
    , KWords(..), entropy
    ) where

-- basic sequence data structures
import Bio.Sequence.SeqData

-- file formats
import Bio.Sequence.Fasta
import Bio.Sequence.FastQ
import Bio.Sequence.Phd
import Bio.Sequence.TwoBit
import Bio.Sequence.SFF

-- sequence-oriented stuff
import Bio.Sequence.Entropy
import Bio.Sequence.HashWord

import Control.Monad (filterM)
import System.Directory (doesFileExist)

-- | Read nucleotide sequences in any format - Fasta, SFF, FastQ, 2bit, PHD...
--
--   The reader is chosen purely from the file name suffix (the text after the
--   last dot, compared case-sensitively); the file contents are not inspected:
--
--   * @fasta@, @fna@, @fa@, @fst@: read with 'readFasta'.  If exactly one of
--     the companion files @name.suffix.qual@ or @name.qual@ exists, the file
--     is read together with it using 'readFastaQual' instead.  If both exist,
--     an error naming both candidates is raised.
--
--   * @2bit@: read with 'read2Bit'.
--
--   * @sff@: read with @readSFF@ and converted with @sffToSequence@.
--
--   * @fq@, @fastq@: read with 'readSangerQ'.
--
--   * @txt@: read with 'readIllumina'.
--
--   * second-to-last suffix @phd@ (e.g. @read.phd.1@): read with 'readPhd';
--     the single resulting sequence is returned as a one-element list.
--
--   Any other suffix raises an error (via 'error') that names the file.
--
--   Todo: detect Illumina vs Sanger FastQ, transparent compression
readNuc :: FilePath -> IO [Sequence Nuc]
readNuc fp
  | ext `elem` ["fasta", "fna", "fa", "fst"] = do
      quals <- filterM doesFileExist qualCandidates
      ss <- case quals of
              []  -> readFasta fp
              [q] -> readFastaQual fp q
              qs  -> error ("Ambigous quality file for " ++ show fp ++ ": " ++ show qs)
      return (map castSeq ss)
  | ext == "2bit"               = read2Bit fp
  | ext == "sff"                = fmap sffToSequence (readSFF fp)
  | ext `elem` ["fq", "fastq"]  = readSangerQ fp
  | ext == "txt"                = readIllumina fp
  | ext2 == "phd"               = fmap (\s -> [s]) (readPhd fp) -- only a single sequence
  -- "ace" ?
  | otherwise = error ("readNuc: unknown file suffix for " ++ show fp)
  where
    -- Text after the last '.' (the whole string when it contains no '.').
    lastSuffix = reverse . takeWhile (/= '.') . reverse
    -- The string with its last suffix and the dot(s) preceding it removed
    -- (empty when the string contains no '.').
    stem = reverse . dropWhile (== '.') . dropWhile (/= '.') . reverse
    ext  = lastSuffix fp
    -- Second-to-last suffix; needed because the phd check looks at
    -- names of the form "name.phd.N".
    ext2 = lastSuffix (stem fp)
    -- Companion quality files that are probed for Fasta input.
    qualCandidates = [fp ++ ".qual", stem fp ++ ".qual"]

-- | Read protein sequences in any supported format (i.e. Fasta).
--
--   The file is read with 'readFasta' and every sequence is retagged as an
--   'Amino' sequence with @castSeq@.
readProt :: FilePath -> IO [Sequence Amino]
readProt xs = map castSeq `fmap` readFasta xs