-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A bioinformatics library -- -- This is a collection of data structures and algorithms I've found -- useful when building various bioinformatics-related tools and -- utilities. -- -- Current list of features includes: a Sequence data type supporting -- protein and nucleotide sequences and conversion between them, quality -- data, reading and writing Fasta formatted files, reading TwoBit and -- phd formats. Rudimentary support for doing alignments - including -- dynamic adjustment of scores based on sequence quality - and Blast -- output parsing. Partly implemented single linkage clustering, and -- multiple alignment. Reading Gene Ontology (GO) annotations (GOA) and -- definitions/hierarchy. -- -- The Darcs repository is at: -- http://malde.org/~ketil/biohaskell/biolib. @package bio @version 0.3.5 module Bio.GFF3.Escape unEscapeByteString :: (Error e, MonadError e m) => ByteString -> m ByteString escapeByteString :: (Char -> Bool) -> ByteString -> ByteString escapeAllBut :: String -> ByteString -> ByteString escapeAllOf :: String -> ByteString -> ByteString -- | Lazy "many" combinator for Parsec. Courtesy of Tomasz Zielonka. module Bio.Util.Parsex lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a] -- | Utility module, with various useful stuff. module Bio.Util lines :: ByteString -> [ByteString] -- | Break a list of bytestrings on a predicate. splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]] -- | Output (to stderr) progress while evaluating a lazy list. Useful for -- generating output while (conceptually, at least) in pure code countIO :: String -> String -> Int -> [a] -> IO [a] -- | A lazier version of Control.Monad.sequence in Control.Monad, -- needed by countIO above. sequence' :: [IO a] -> IO [a] -- | Workaround, the current Data.ByteString.Lazy.Char8 contains a -- bug in Data.ByteString.Lazy.Char8.lines. 
mylines :: ByteString -> [ByteString] -- | Implement clustering module Bio.Clustering -- | Data structure for storing hierarchical clusters data Clustered score datum Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum Leaf :: datum -> Clustered score datum -- | Single linkage agglomerative clustering. Cluster elements by slurping -- a sorted list of pairs with score (i.e. triples :-) Keeps a set of -- contained elements at each branch's root, so O(n log n), and requires -- elements to be in Ord. For this to work, the triples must be sorted on -- score. Earlier scores in the list will make up the lower nodes, so -- sort descending for similarity, ascending for distance. cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a] instance (Show score, Show datum) => Show (Clustered score datum) -- | This module implements a hierarchical data structure for BLAST -- results, there is an alternative flat structure in the -- Bio.Alignment.BlastFlat module. -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions of BLAST, so expect some -- incompatibilities if you're using a different BLAST version. -- -- For parsing BLAST results, the XML format (blastall -m 7) is by far -- the most robust choice, and is implemented in the -- Bio.Alignment.BlastXML module. -- -- The format is straightforward (and non-recursive). For more -- information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.BlastData -- | The sequence id, i.e. the first word of the header field. type SeqId = ByteString -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. 
data Strand Plus :: Strand Minus :: Strand -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | A BlastResult is the root of the hierarchy. data BlastResult BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult blastprogram :: BlastResult -> !ByteString blastversion :: BlastResult -> !ByteString blastdate :: BlastResult -> !ByteString blastreferences :: BlastResult -> !ByteString database :: BlastResult -> !ByteString dbsequences :: BlastResult -> !Integer dbchars :: BlastResult -> !Integer results :: BlastResult -> [BlastRecord] -- | Each query sequence generates a BlastRecord data BlastRecord BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord query :: BlastRecord -> !SeqId qlength :: BlastRecord -> !Int hits :: BlastRecord -> [BlastHit] -- | Each match between a query and a target sequence (or subject) is a -- BlastHit. data BlastHit BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit subject :: BlastHit -> !SeqId slength :: BlastHit -> !Int matches :: BlastHit -> [BlastMatch] -- | A BlastHit may contain multiple separate matches (typically -- when an indel causes a frameshift that blastx is unable to bridge). 
data BlastMatch BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch bits :: BlastMatch -> !Double e_val :: BlastMatch -> !Double identity :: BlastMatch -> (Int, Int) q_from :: BlastMatch -> !Int q_to :: BlastMatch -> !Int h_from :: BlastMatch -> !Int h_to :: BlastMatch -> !Int aux :: BlastMatch -> !Aux instance Show BlastMatch instance Show BlastHit instance Show BlastRecord instance Show BlastResult instance Show Aux instance Eq Aux instance Read Strand instance Show Strand instance Eq Strand -- | This module implements a parser for BLAST results. -- -- This module is DEPRECATED. It is *very* recommended that you run blast -- with XML output instead, and use the BlastXML module to parse it. -- Don't say I didn't warn you! -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions, so expect some -- incompatibilities if you're using a different BLAST version. -- -- The format is straightforward (and non-recursive), and this -- implementation uses a simple line-based, hierarchical parser. -- -- For more information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.Blast parse :: ByteString -> BlastResult -- | Parse blast XML output. -- -- If you use a recent version of NCBI BLAST and specify XML output -- (blastall -m 7), this module should be able to parse the result into a -- hierarchical BlastResult structure. -- -- While the process may consume a bit of memory, the parsing is lazy, -- and file sizes of several gigabytes can be parsed (see e.g. the xml2x -- tool for an example). To parse XML, we use Text.HTML.TagSoup. 
module Bio.Alignment.BlastXML -- | Parse BLAST results in XML format readXML :: FilePath -> IO [BlastResult] -- | This module implements a "flattened" data structure for Blast hits, as -- opposed to the hierarchical structure in -- Bio.Alignment.BlastData. -- -- The flat data type is useful in many cases where it is more natural to -- see the result as a set of rows (e.g. for insertion in a database). -- -- It would probably be more (memory-) efficient to go the other way -- (i.e. from flat to hierarchical), as passing the current, partially -- built BlastFlat object down the stream of results and stamping -- out a stream of completed ones. (See -- Bio.Alignment.BlastXML.breaks for this week's most cumbersome -- use of parallelism to avoid the memory issue.) module Bio.Alignment.BlastFlat -- | The BlastFlat data structure contains information about a single match data BlastFlat BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat query :: BlastFlat -> !SeqId qlength :: BlastFlat -> !Int subject :: BlastFlat -> !SeqId slength :: BlastFlat -> !Int bits :: BlastFlat -> !Double e_val :: BlastFlat -> !Double identity :: BlastFlat -> (Int, Int) q_from :: BlastFlat -> !Int q_to :: BlastFlat -> !Int h_from :: BlastFlat -> !Int h_to :: BlastFlat -> !Int aux :: BlastFlat -> !Aux readXML :: FilePath -> IO [BlastFlat] -- | Convert BlastRecords into BlastFlats (representing a depth-first -- traversal of the BlastRecord structure.) 
flatten :: [BlastRecord] -> [BlastFlat] -- | Each query sequence generates a BlastRecord data BlastRecord blastprogram :: BlastResult -> ByteString blastversion :: BlastResult -> ByteString blastdate :: BlastResult -> ByteString blastreferences :: BlastResult -> ByteString database :: BlastResult -> ByteString dbsequences :: BlastResult -> Integer dbchars :: BlastResult -> Integer results :: BlastResult -> [BlastRecord] -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | GeneOntology - parse and index Gene Ontology Annotations In -- particular, the file 'gene_association.goa_uniprot' that contains -- links between GO terms and UniProt accessions. -- -- module Bio.Sequence.GeneOntology -- | A GO term is a positive integer newtype GoTerm GO :: Int -> GoTerm -- | A GoDef maps a GoTerm to a description and a GoClass. data GoDef GoDef :: !GoTerm -> !ByteString -> !GoClass -> GoDef -- | A list of Go definitions, with pointers to parent nodes. Read from the -- .obo file. The user may construct the explicit hierarchy by storing -- these in a Map or similar type GoHierarchy = [(GoDef, [GoTerm])] -- | Read the GO hierarchy from the obo file. Note that this is not quite a -- tree structure. readObo :: FilePath -> IO GoHierarchy -- | Read GO term definitions, from the GO.terms_and_ids file readTerms :: FilePath -> IO [GoDef] -- | A GOA annotation, containing a UniProt identifier, a GoTerm and an -- evidence code. data Annotation Ann :: !UniProtAcc -> !GoTerm -> !EvidenceCode -> Annotation -- | A UniProt identifier (short string of capitals and numbers). 
type UniProtAcc = ByteString data GoClass Func :: GoClass Proc :: GoClass Comp :: GoClass -- | Evidence codes describe the type of support for an annotation -- http://www.geneontology.org/GO.evidence.shtml data EvidenceCode -- | Inferred by Curator IC :: EvidenceCode -- | Inferred from Direct Assay IDA :: EvidenceCode -- | Inferred from Electronic Annotation IEA :: EvidenceCode -- | Inferred from Expression Pattern IEP :: EvidenceCode -- | Inferred from Genomic Context IGC :: EvidenceCode -- | Inferred from Genetic Interaction IGI :: EvidenceCode -- | Inferred from Mutant Phenotype IMP :: EvidenceCode -- | Inferred from Physical Interaction IPI :: EvidenceCode -- | Inferred from Sequence or Structural Similarity ISS :: EvidenceCode -- | Non-traceable Author Statement NAS :: EvidenceCode -- | No biological Data available ND :: EvidenceCode -- | Inferred from Reviewed Computational Analysis RCA :: EvidenceCode -- | Traceable Author Statement TAS :: EvidenceCode -- | Not Recorded NR :: EvidenceCode -- | Read the goa_uniprot file (warning: this one is huge!) readGOA :: FilePath -> IO [Annotation] -- | The vast majority of GOA data is IEA, while the most reliable -- information is manually curated. Filtering on this is useful to keep -- data set sizes manageable, too. isCurated :: EvidenceCode -> Bool decomment :: ByteString -> [ByteString] instance Read EvidenceCode instance Show EvidenceCode instance Eq EvidenceCode instance Show Annotation instance Show GoDef instance Eq GoTerm instance Ord GoTerm instance Show GoClass instance Read GoClass instance Show GoTerm instance Read GoTerm -- | Functionality for manipulating KEGG annotations. 
-- -- KEGG is a bit hard to find, but there exist species-specific tables -- Available organisms are listed in the table at -- -- ftp://ftp.genome.jp/pub/kegg/genes/etc/all_species.tab -- -- Data for each organism is stored in its own subdirectory under -- -- ftp://ftp.genome.jp/pub/kegg/genes/organisms/ -- -- Containing tables linking everything -- including external resources -- like UniProt, PDB, or NCBI -- together. module Bio.Sequence.KEGG -- | Most KEGG files that contain associations, have one association per -- line, consisting of two items separated by whitespace. This is a -- generalized reader function. genReadKegg :: FilePath -> IO [(ByteString, ByteString)] newtype KO KO :: ByteString -> KO -- | Convert UniProt IDs (up:xxxxxx) to the UniProtAcc type. decodeUP :: ByteString -> UniProtAcc -- | Convert KO IDs (ko:xxxxx) to the KO data type. decodeKO :: ByteString -> KO -- | KEGG uses strings with an identifying prefix for IDs. This helper -- function checks and removes prefix to construct native values. removePrefix :: String -> String -> (ByteString -> a) -> ByteString -> a instance Show KO -- | Moved to GeneOntology - this is for backwards compatibility. module Bio.Sequence.GOA readGO :: FilePath -> IO [GoDef] module Bio.Sequence.Entropy class KWords s kwords :: (KWords s) => Int -> s -> [s] entropy :: (Ord str, KWords str) => Int -> str -> Double instance KWords [a] -- | Data structures for manipulating (biological) sequences. -- -- Generally supports both nucleotide and protein sequences, some -- functions, like revcompl, only makes sense for nucleotides. module Bio.Sequence.SeqData -- | A sequence consists of a header, the sequence data itself, and -- optional quality data. data Sequence -- | header and actual sequence Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence -- | An offset, index, or length of a SeqData type Offset = Int64 -- | The basic data type used in Sequences type SeqData = ByteString -- | Basic type for quality data. 
Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Read the character at the specified position in the sequence. (!) :: Sequence -> Offset -> Char -- | Return sequence length. seqlength :: Sequence -> Offset -- | Return sequence label (first word of header) seqlabel :: Sequence -> SeqData -- | Return full header. seqheader :: Sequence -> SeqData -- | Return the sequence data. seqdata :: Sequence -> SeqData (?) :: Sequence -> Offset -> Qual -- | Check whether the sequence has associated quality data. hasqual :: Sequence -> Bool -- | Return the quality data, or error if none exist. Use hasqual if in -- doubt. seqqual :: Sequence -> QualData appendHeader :: Sequence -> String -> Sequence -- | Modify the header by appending text, or by replacing all but the -- sequence label (i.e. first word). setHeader :: Sequence -> String -> Sequence -- | Convert a String to SeqData fromStr :: String -> SeqData -- | Convert a SeqData to a String toStr :: SeqData -> String -- | Complement a single character. I.e. identify the nucleotide it can -- hybridize with. Note that for multiple nucleotides, you usually want -- the reverse complement (see revcompl for that). compl :: Char -> Char -- | Calculate the reverse complement. This is only relevant for the -- nucleotide alphabet, and it leaves other characters unmodified. revcompl :: Sequence -> Sequence data Amino Ala :: Amino Arg :: Amino Asn :: Amino Asp :: Amino Cys :: Amino Gln :: Amino Glu :: Amino Gly :: Amino His :: Amino Ile :: Amino Leu :: Amino Lys :: Amino Met :: Amino Phe :: Amino Pro :: Amino Ser :: Amino Thr :: Amino Tyr :: Amino Trp :: Amino Val :: Amino STP :: Amino Asx :: Amino Glx :: Amino Xle :: Amino Xaa :: Amino -- | Translate a nucleotide sequence into the corresponding protein -- sequence. 
This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence -> Offset -> [Amino] -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData instance Show Amino instance Eq Amino instance Show Sequence instance Eq Sequence -- | This module incorporates functionality for reading and writing -- sequence data in the Fasta format. Each sequence consists of a header -- (with a > prefix) and a set of lines containing the sequence -- data. module Bio.Sequence.Fasta -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence] -- | Write sequences to a FASTA-formatted file. Line length is 60. writeFasta :: FilePath -> [Sequence] -> IO () -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence] -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence] -> IO () -- | Read quality data for sequences from a file. readQual :: FilePath -> IO [Sequence] -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence] -> IO () hWriteQual :: Handle -> [Sequence] -> IO () -- | Read sequence and associated quality. Will error if the sequences and -- qualities do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence] hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO () -- | Write sequence and quality data simultaneously. This may be more -- laziness-friendly. writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO () countSeqs :: FilePath -> IO Int -- | Convert a list of FASTA-formatted lines into a list of sequences. -- Blank lines are ignored. Comment lines starting with # are allowed -- between sequences (and ignored). Lines starting with > -- initiate a new sequence. mkSeqs :: [ByteString] -> [Sequence] -- | Basic type for quality data. Range 0..255. 
Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Support the FastQ format that combines sequence and quality. See: -- -- -- -- Of course, this is yet another vaguely defined pseudo-standard with -- conflicting definitions. Of course Solexa had to go and invent a -- different, but indistinguishably so, way to do it: -- -- -- -- Currently, we only support the non-Solexa FastQ, adding/subtracting 33 -- for the quality values. module Bio.Sequence.FastQ readFastQ :: FilePath -> IO [Sequence] hReadFastQ :: Handle -> IO [Sequence] -- | Parse one FastQ entry, suitable for using in unfoldr over -- lines from a file parse :: [ByteString] -> Maybe (Either String Sequence, [ByteString]) writeFastQ :: FilePath -> [Sequence] -> IO () hWriteFastQ :: Handle -> [Sequence] -> IO () unparse :: Sequence -> ByteString -- | This module implements the 2bit format for sequences. -- -- Based on: http://genome.ucsc.edu/FAQ/FAQformat#format7 Note! -- the description is not accurate, it is missing a reserved word in each -- sequence record. -- -- There are also other, completely different ideas of the 2bit format, -- e.g. http://jcomeau.freeshell.org/www/genome/2bitformat.html module Bio.Sequence.TwoBit -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: ByteString -> [Sequence] -- | Extract sequences from a file in 2bit format. read2Bit :: FilePath -> IO [Sequence] -- | Extract sequences in the 2bit format from a handle. hRead2Bit :: Handle -> IO [Sequence] instance Show SRLE instance Show SRBE instance Show SR instance Show Entry instance Binary SRLE instance Binary SRBE instance Binary Entries instance Show Entries instance Binary Entry instance Binary Header instance Show Header -- | Parse phd files (phred base calling output). 
module Bio.Sequence.Phd -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO Sequence -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO Sequence module Bio.Sequence.HashWord -- | This is a struct for containing a set of hashing functions data HashF k HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k -- | calculates the hash at a given offset in the sequence hash :: HashF k -> SeqData -> Offset -> Maybe k -- | calculate all hashes from a sequence, and their indices hashes :: HashF k -> SeqData -> [(k, Offset)] -- | for sorting hashes ksort :: HashF k -> [k] -> [k] -- | Adds a default hashes function to a HashF, when -- hash is defined. genkeys :: HashF k -> HashF k -- | Contigous constructs an int/eger from a contiguous k-word. contigous :: (Integral k) => Int -> HashF k -- | Like contigous, but returns the same hash for a word and its -- reverse complement. rcontig :: (Integral k) => Int -> HashF k compact :: SeqData -> [SeqData] -- | Like rcontig, but ignoring monomers (i.e. arbitrarily long -- runs of a single nucleotide are treated the same as a single nucleotide). rcpacked :: (Integral k) => Int -> HashF k type Shape = String gapped :: (Integral k) => Shape -> HashF k isN :: Char -> Bool n2k :: (Integral k) => Int -> SeqData -> k n2i' :: (Num a) => a -> SeqData -> a k2n :: (Integral k) => Int -> k -> SeqData val :: (Num t) => Char -> t unval :: (Num a) => a -> Char complement :: Char -> Char -- | Read (and write?) the SFF file format used by Roche/454 sequencing to -- store flowgram data. -- -- A flowgram is a series of values (intensities) representing -- homopolymer runs of A,G,C, and T in a fixed cycle, and usually -- displayed as a histogram. -- -- The Staden Package contains an io_lib, with a C routine for parsing -- this format. 
According to comments in the sources, the io_lib -- implementation is based on a file called getsff.c, which I've been -- unable to track down. -- -- It is believed that all values are stored big endian. module Bio.Sequence.SFF -- | The data structure storing the contents of an SFF file (modulo the -- index) data SFF SFF :: !CommonHeader -> [ReadBlock] -> SFF -- | SFF has a 31-byte common header Todo: remove items that are derivable -- (counters, magic, etc) cheader_lenght points to the first read header. -- Also, the format is open to having the index anywhere between reads, -- we should really keep count and check for each read. In practice, it -- seems to be placed after the reads. -- -- The following two fields are considered part of the header, but as -- they are static, they are not part of the data structure magic :: -- Word32 -- ^ 0x2e736666, i.e. the string .sff version :: Word32 -- -- ^ 0x00000001 data CommonHeader CommonHeader :: Int64 -> Int32 -> Int32 -> Int16 -> Int16 -> Word8 -> ByteString -> ByteString -> CommonHeader -- | Points to a text(?) section index_offset :: CommonHeader -> Int64 index_length :: CommonHeader -> Int32 num_reads :: CommonHeader -> Int32 key_length :: CommonHeader -> Int16 flow_length :: CommonHeader -> Int16 flowgram_fmt :: CommonHeader -> Word8 flow :: CommonHeader -> ByteString key :: CommonHeader -> ByteString -- | Each Read has a fixed read header data ReadHeader ReadHeader :: Int16 -> Int32 -> Int16 -> Int16 -> Int16 -> Int16 -> ByteString -> ReadHeader name_length :: ReadHeader -> Int16 num_bases :: ReadHeader -> Int32 clip_qual_left :: ReadHeader -> Int16 clip_qual_right :: ReadHeader -> Int16 clip_adapter_left :: ReadHeader -> Int16 clip_adapter_right :: ReadHeader -> Int16 read_name :: ReadHeader -> ByteString -- | This contains the actual flowgram for a single read. 
data ReadBlock ReadBlock :: ReadHeader -> [Flow] -> ByteString -> ByteString -> ByteString -> ReadBlock read_header :: ReadBlock -> ReadHeader flowgram :: ReadBlock -> [Flow] flow_index :: ReadBlock -> ByteString bases :: ReadBlock -> ByteString quality :: ReadBlock -> ByteString readSFF :: FilePath -> IO SFF writeSFF :: FilePath -> SFF -> IO () sffToSequence :: SFF -> [Sequence] -- | test serialization by output'ing the header and first two reads in an -- SFF, and the same after a decode + encode cycle. test :: FilePath -> IO () -- | Convert a file by decoding it and re-encoding it This will lose the -- index (which isn't really necessary) convert :: FilePath -> IO () -- | The type of flowgram value type Flow = Int16 -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 type Index = Word8 instance Show ReadBlock instance Binary ReadHeader instance Show ReadHeader instance Binary CommonHeader instance Show CommonHeader instance Binary SFF instance Show SFF -- | Data structures and helper functions for calculating alignments -- -- There are two ways to view an alignment: either as a list of edits -- (i.e., insertions, deletions, or substitutions), or as a set of -- sequences with inserted gaps. -- -- The edit list approach is perhaps more restrictive model but doesn't -- generalize to multiple alignments. -- -- The gap approach is more general, and probably more commonly used by -- other software (see e.g. the ACE file format). module Bio.Alignment.AlignData data Dir Fwd :: Dir Rev :: Dir type Gaps = [Offset] type Alignment = [(Offset, Dir, Sequence, Gaps)] -- | Gaps are coded as *s, this function removes them, and returns -- the sequence along with the list of gap positions. 
extractGaps :: SeqData -> (SeqData, Gaps) insertGaps :: Char -> (SeqData, Gaps) -> SeqData -- | An Edit is either the insertion, the deletion, or the replacement of a -- character. data Edit Ins :: Chr -> Edit Del :: Chr -> Edit Repl :: Chr -> Chr -> Edit -- | An alignment is a sequence of edits. type EditList = [Edit] -- | A substitution matrix gives scores for replacing a character with -- another. Typically, it will be symmetric. type SubstMx a = (Chr, Chr) -> a -- | A Selector consists of a zero element, and a function that chooses a -- possible Edit operation, and generates an updated result. type Selector a = [(a, Edit)] -> a -- | The sequence element type, used in alignments. type Chr = Word8 -- | Calculate a set of columns containing scores This represents the -- columns of the alignment matrix, but will only require linear space -- for score calculation. columns :: Selector a -> a -> Sequence -> Sequence -> [[a]] -- | Evaluate an Edit based on SubstMx and gap penalty eval :: SubstMx a -> a -> Edit -> a -- | True if the Edit is a Repl. isRepl :: Edit -> Bool -- | turn an alignment into sequences with - representing gaps (for -- checking, filtering out the - characters should return the -- original sequences, provided - isn't part of the sequence -- alphabet) toStrings :: EditList -> (String, String) instance Show Edit instance Eq Edit instance Eq Dir instance Show Dir -- | Common substitution matrices for alignments. -- -- When in doubt, use BLOSUM62. Consult -- http://www.ncbi.nlm.nih.gov/blast/blast_whatsnew.shtml#20051206 -- for some hints on good parameters for nucleotide alignments. -- -- See also http://en.wikipedia.org/wiki/Substitution_matrix for a -- summary about the difference between the different matrices. module Bio.Alignment.Matrices -- | BLOSUM45 matrix, suitable for distantly related sequences blosum45 :: (Char, Char) -> Int -- | The standard BLOSUM62 matrix. 
blosum62 :: (Char, Char) -> Int -- | BLOSUM80 matrix, suitable for closely related sequences. blosum80 :: (Char, Char) -> Int -- | The standard PAM30 matrix pam30 :: (Char, Char) -> Int -- | The standard PAM70 matrix. pam70 :: (Char, Char) -> Int -- | Blast defaults, use with gap_open = -5 gap_extend = -3 This should -- really check for valid nucleotides, and perhaps be more lenient in the -- case of Ns. Oh well. blastn_default :: (Num a) => (Chr, Chr) -> a -- | Construct a simple matrix from match score/mismatch penalty simpleMx :: (Num a) => a -> a -> (Chr, Chr) -> a -- | Simple alignment of sequences -- -- Standard alignment/edit distance module Bio.Alignment.SAlign -- | Calculate local edit distance (Smith-Waterman alignment score) local_score :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> a local_align :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> EditList -- | Calculate global edit distance (Needleman-Wunsch alignment score) global_score :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> a -- | Calculate alignments. global_align :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> EditList -- | Implement alignments/edit distance with affine gap penalties -- -- I've seen g = (-10,-1) as the suggested price to pay for a gaps using -- BLOSUM62. Good choice as any, I guess. 
module Bio.Alignment.AAlign -- | Calculate local edit distance (Smith-Waterman alignment score) local_score :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> a -- | Calculate local alignment (Smith-Waterman) local_align :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> (a, EditList) -- | Calculate global edit distance (Needleman-Wunsch alignment score) global_score :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> a -- | Calculate global alignment (Needleman-Wunsch) global_align :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> (a, EditList) -- | Quality-aware alignments -- -- Generally, quality data are ignored for alignment/pattern searching -- like Smith-Waterman, Needleman-Wunsch, or BLAST(p|n|x). I believe that -- accounting for quality will at the very least affect things like BLAST -- statistics, and e.g. is crucial for good EST annotation using Blastx. -- -- This module performs sequence alignments, taking quality values into -- account. -- -- See also -- http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btn052v1. module Bio.Alignment.QAlign -- | Calculate local edit distance (Smith-Waterman alignment score) local_score :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> Double -- | Calculate local alignment (Smith-Waterman) (can we replace uncurry -- max' with fst - a local alignment must always end on a subst, no?) 
local_align :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> (Double, EditList) -- | Calculate global edit distance (Needleman-Wunsch alignment score) global_score :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> Double -- | Calculate global alignment (Needleman-Wunsch) global_align :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> (Double, EditList) -- | Calculate best overlap score, where gaps at the edges are free. The -- starting point is like for local score (0 cost for initial indels), -- the result is the maximum anywhere in the last column or bottom row of -- the matrix. overlap_score :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> Double -- | Calculate best overlap score, where gaps at the edges are free. The -- starting point is like for local score (0 cost for initial indels), -- the result is the maximum anywhere in the last column or bottom row of -- the matrix. overlap_align :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> (Double, EditList) qualMx :: Qual -> Qual -> (Chr, Chr) -> Double test :: IO () -- | Read ACE format assembly files -- -- These are typically output by sequence assembly tools, like CAP3 or -- Phrap. -- -- Documented in the section labelled "ACE FILE FORMAT" at -- http://bozeman.mbt.washington.edu/consed/distributions/README.14.0.txt -- -- Briefly: each field is a line starting with a two letter code, in some -- cases followed by data lines terminated by a blank line. Here's a -- brief example of what an ACE file looks like: -- --
--   AS contigs reads
--   CO contig_name bases reads segments compl (CAP3: segments=0)
--   sequence
--   BQ base_qualities
--   AF read1 compl padded_start_consensus (negatives meaning?)
--   AF read2 ..
--   BS segments
--   RD read1 bases info_items info_tags (latter two set to 0 by CAP3)
--   sequence
--   QA read1 qual_start qual_end align_start align_end
--   DS (phred header? left empty by CAP3)
--   RD read2 ...
--   
module Bio.Alignment.ACE -- | Reading an ACE file. readACE :: FilePath -> IO [[Assembly]] writeACE :: FilePath -> [Assembly] -> IO () data Assembly Asm :: (Sequence, Gaps) -> Alignment -> Assembly contig :: Assembly -> (Sequence, Gaps) fragments :: Assembly -> Alignment -- | Test parser p on a list of ACE elements ptest :: (Show a) => String -> AceParser a -> [ACE] -> IO () reads :: Assembly -> Alignment instance Eq ACE instance Show Assembly instance Show ACE module Bio.Util.TestBase data Test T :: String -> t -> Test newtype Nucleotide N :: Char -> Nucleotide newtype Quality Q :: Word8 -> Quality fromN :: Nucleotide -> Char fromQ :: Quality -> Word8 -- | For testing, variable lengths newtype EST E :: Sequence -> EST newtype ESTq Eq :: Sequence -> ESTq newtype Protein P :: Sequence -> Protein -- | For benchmarking, fixed lengths newtype EST_short ES :: Sequence -> EST_short newtype EST_long EL :: Sequence -> EST_long newtype EST_set ESet :: [Sequence] -> EST_set -- | Take time (CPU and wall clock) and report it time :: String -> IO () -> IO () -- | Print a CPUTime difference showT :: (Integral a) => a -> String -- | Shamelessly stolen from FPS integralRandomR :: (Integral a, RandomGen g) => (a, a) -> g -> (a, g) -- | Constrained position generators genOffset :: Gen Offset genNonNegOffset :: Gen Offset genPositiveOffset :: Gen Offset instance Show EST_set instance Show EST_long instance Show EST_short instance Show Protein instance Show ESTq instance Show EST instance Show Quality instance Show Nucleotide instance Arbitrary EST_set instance Arbitrary EST_long instance Arbitrary EST_short instance Arbitrary Char instance Arbitrary EST instance Arbitrary ESTq instance Arbitrary Quality instance Arbitrary Nucleotide instance Arbitrary Word8 instance Random Word8 module Bio.Location.Strand -- | Sequence strand data Strand Fwd :: Strand RevCompl :: Strand -- | Anything, such as a location or a sequence, which lies on a strand and -- can thus be reverse complemented. 
class Stranded s revCompl :: (Stranded s) => s -> s stranded :: (Stranded s) => Strand -> s -> s instance Eq Strand instance Ord Strand instance Show Strand instance Read Strand instance Bounded Strand instance Enum Strand instance Ix Strand instance Stranded ByteString instance Stranded Char instance Stranded Strand -- | Positions on a sequence. Zero-based Int64 indices are used throughout, -- to facilitate direct use of indexing functions on SeqData. module Bio.Location.Position -- | Position in a sequence data Pos Pos :: !Offset -> !Strand -> Pos -- | 0-based index of the position offset :: Pos -> !Offset -- | Optional strand of the position strand :: Pos -> !Strand -- | Slide a position by an offset slide :: Pos -> Offset -> Pos seqNt :: (Error e, MonadError e m) => SeqData -> Pos -> m Char seqNtPadded :: SeqData -> Pos -> Char display :: Pos -> String instance Eq Pos instance Ord Pos instance Show Pos instance Read Pos instance Ix Pos instance Stranded Pos -- | Data types for working with locations in a sequence. Zero-based Int64 -- indices are used throughout, to facilitate direct use of indexing -- functions on SeqData. module Bio.Location.ContigLocation -- | Contiguous set of positions in a sequence data ContigLoc ContigLoc :: !Offset -> !Offset -> !Strand -> ContigLoc -- | 5' end of region on target sequence, 0-based index offset5 :: ContigLoc -> !Offset -- | length of region on target sequence length :: ContigLoc -> !Offset -- | strand of region strand :: ContigLoc -> !Strand -- | Create a ContigLoc from 0-based starting and ending positions. -- When start is less than end the position will be on the Fwd -- Strand, otherwise it will be on the RevCompl strand. fromStartEnd :: Offset -> Offset -> ContigLoc -- | Create a ContigLoc from a Pos.Pos defining the start -- (ContigLoc 5 prime end) position on the sequence and the -- length. 
fromPosLen :: Pos -> Offset -> ContigLoc -- | The bounds of a ContigLoc, a pair of the lowest and highest -- sequence indices covered by the region, which ignores the strand of -- the ContigLoc. The first element of the pair will always be -- lower than the second. bounds :: ContigLoc -> (Offset, Offset) -- | 0-based starting (5' in the region orientation) position startPos :: ContigLoc -> Pos -- | 0-based ending (3' in the region orientation) position endPos :: ContigLoc -> Pos -- | Move a ContigLoc region by a specified offset slide :: Offset -> ContigLoc -> ContigLoc -- | ContigLoc extended on the 5' and 3' ends. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc -- | For a Pos and a ContigLoc on the same sequence, find the -- corresponding Pos relative to the ContigLoc, provided it is -- within the ContigLoc. posInto :: Pos -> ContigLoc -> Maybe Pos -- | For a Pos specified relative to a ContigLoc, find the -- corresponding Pos relative to the outer sequence, provided that the -- Pos is within the bounds of the ContigLoc. posOutof :: Pos -> ContigLoc -> Maybe Pos -- | Subsequence SeqData for a ContigLoc, provided that the -- region is entirely within the sequence. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData -- | Subsequence SeqData for a ContigLoc, padded as needed -- with Ns seqDataPadded :: SeqData -> ContigLoc -> SeqData -- | For a Pos and a ContigLoc on the same sequence, is the Pos -- within the ContigLoc. isWithin :: Pos -> ContigLoc -> Bool -- | For a pair of ContigLoc regions on the same sequence, indicates -- if they overlap at all. overlaps :: ContigLoc -> ContigLoc -> Bool display :: ContigLoc -> String instance Eq ContigLoc instance Ord ContigLoc instance Show ContigLoc instance Stranded ContigLoc -- | Data types for working with locations in a sequence. Zero-based Int64 -- indices are used throughout, to facilitate direct use of indexing -- functions on SeqData. 
module Bio.Location.Location -- | General (disjoint) sequence region consisting of a concatenated set of -- contiguous regions newtype Loc Loc :: [ContigLoc] -> Loc -- | The bounds of a Loc, consisting of the lowest & highest -- sequence indices lying within the region. The first element of the -- pair will always be lower than the second. bounds :: Loc -> (Offset, Offset) -- | Length of the region length :: Loc -> Offset -- | 0-based starting (5' in the region orientation) offset of the region -- on its sequence. startPos :: Loc -> Pos -- | 0-based ending (3' in the region orientation) offset of the region on -- its sequence. endPos :: Loc -> Pos -- | Extend a Loc region by incorporating contiguous nucleotide -- regions of the specified lengths on the 5' and 3' ends extend :: (Offset, Offset) -> Loc -> Loc -- | For a Pos and a Loc region on the same sequence, find the -- corresponding Pos relative to the region, if the Pos is within the -- region. If the Loc region has redundant positions for a given -- sequence position, the first is returned. posInto :: Pos -> Loc -> Maybe Pos -- | For a Loc region on a sequence and a Pos relative to the -- region, find the corresponding Pos on the sequence, provided that the -- position is within the bounds of the region. posOutof :: Pos -> Loc -> Maybe Pos -- | For a Pos and a Loc on the same sequence, does the position -- fall within the Loc region? isWithin :: Pos -> Loc -> Bool -- | For a pair of Loc regions on the same sequence, do they overlap -- at all? overlaps :: Loc -> Loc -> Bool -- | Subsequence SeqData for a Loc, provided that the region -- is entirely within the sequence. seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData seqDataPadded :: SeqData -> Loc -> SeqData display :: Loc -> String instance Eq Loc instance Ord Loc instance Show Loc instance Stranded Loc -- | A structure to allow fast lookup of objects whose sequence location -- lines up with a given position. 
module Bio.Location.LocMap -- | Collection mapping a collection of Loc locations, possibly -- overlapping, binned for efficient lookup by position. data LocMap a -- | Create an empty LocMap with a specified position bin size mkLocMap :: Offset -> LocMap a defaultZonesize :: Offset -- | Create a LocMap from an associated list. fromList :: Offset -> [(Loc, a)] -> LocMap a -- | Find the (possibly empty) list of sequence regions and associated -- objects that contain a Pos position, in the sense of withinLoc lookupWithin :: Pos -> LocMap a -> [(Loc, a)] -- | Find the (possibly empty) list of sequence regions and associated -- objects that overlap a Loc region, in the sense of overlapsLoc lookupOverlaps :: Loc -> LocMap a -> [(Loc, a)] -- | Remove a region / object association from the map, if it is present. -- If it is present multiple times, only the first occurrence will be -- deleted. delete :: (Eq a) => (Loc, a) -> LocMap a -> LocMap a -- | Remove the first region / object association satisfying a predicate -- function. 
deleteBy :: ((Loc, a) -> Bool) -> LocMap a -> LocMap a -- | Add an object with an associated Loc sequence region insert :: Loc -> a -> LocMap a -> LocMap a checkInvariants :: LocMap a -> [String] instance Monoid (LocMap a) module Bio.Location.OnSeq type SeqName = SeqData data OnSeq a OnSeq :: !SeqName -> !a -> OnSeq a onSeqName :: OnSeq a -> !SeqName onSeqObj :: OnSeq a -> !a withSeqData :: (Error e, MonadError e m) => (SeqData -> a -> m b) -> (SeqName -> m SeqData) -> OnSeq a -> m b andSameSeq :: (a -> b -> Bool) -> OnSeq a -> OnSeq b -> Bool onSameSeq :: (Monad m) => (a -> b -> m c) -> OnSeq a -> OnSeq b -> m c type OnSeqs a = Map SeqName a perSeq :: (Monoid b) => (a -> b -> c) -> OnSeq a -> OnSeqs b -> c perSeqUpdate :: (Monoid b) => (a -> b -> b) -> OnSeq a -> OnSeqs b -> OnSeqs b withNameAndSeq :: (Monad m) => (SeqName -> a -> b -> m c) -> OnSeq a -> OnSeqs b -> m c instance (Eq a) => Eq (OnSeq a) instance (Ord a) => Ord (OnSeq a) instance (Show a) => Show (OnSeq a) instance Functor OnSeq module Bio.Location.SeqLocation type SeqPos = OnSeq Pos displaySeqPos :: SeqPos -> String type ContigSeqLoc = OnSeq ContigLoc withinContigSeqLoc :: SeqPos -> ContigSeqLoc -> Bool displayContigSeqLoc :: ContigSeqLoc -> String type SeqLoc = OnSeq Loc isWithin :: SeqPos -> SeqLoc -> Bool overlaps :: SeqLoc -> SeqLoc -> Bool seqData :: (Error e, MonadError e m) => (SeqName -> m SeqData) -> SeqLoc -> m SeqData display :: SeqLoc -> String module Bio.Alignment.Soap -- | Alignment output from SOAP data SoapAlign SA :: !SeqName -> !SeqData -> !QualData -> !Int -> !Char -> !Offset -> !Strand -> !SeqName -> !Offset -> !Int -> ![SoapAlignMismatch] -> SoapAlign name :: SoapAlign -> !SeqName -- | Reference strand orientation sequence sequ :: SoapAlign -> !SeqData -- | Reference strand orientation quality data qual :: SoapAlign -> !QualData nhit :: SoapAlign -> !Int pairend :: SoapAlign -> !Char length :: SoapAlign -> !Offset strand :: SoapAlign -> !Strand refname :: SoapAlign -> !SeqName 
-- | 1-based index, as output by SOAP, of reference strand 5' end refstart :: SoapAlign -> !Offset nmismatch :: SoapAlign -> !Int mismatches :: SoapAlign -> ![SoapAlignMismatch] data SoapAlignMismatch SAM :: !Char -> !Char -> !Offset -> !Qual -> SoapAlignMismatch -- | Read nt in reference strand orientation readnt :: SoapAlignMismatch -> !Char -- | Reference nt in reference strand orientation refnt :: SoapAlignMismatch -> !Char -- | Offset from reference strand 5' end in reference strand orientation offset :: SoapAlignMismatch -> !Offset -- | Quality score of read nt qualnt :: SoapAlignMismatch -> !Qual refSeqPos :: SoapAlign -> SeqPos refCSeqLoc :: SoapAlign -> ContigSeqLoc refSeqLoc :: SoapAlign -> SeqLoc mismatchSeqPos :: SoapAlign -> SoapAlignMismatch -> SeqPos parse :: (Error e, MonadError e m) => ByteString -> m SoapAlign unparse :: SoapAlign -> ByteString parseMismatch :: (Error e, MonadError e m) => ByteString -> m SoapAlignMismatch unparseMismatch :: SoapAlignMismatch -> ByteString group :: [SoapAlign] -> [[SoapAlign]] instance Read SoapAlignMismatch instance Show SoapAlignMismatch instance Eq SoapAlignMismatch instance Ord SoapAlignMismatch instance Read SoapAlign instance Show SoapAlign instance Eq SoapAlign instance Ord SoapAlign module Bio.Location.SeqLocMap type SeqLocMap a = OnSeqs (LocMap a) empty :: SeqLocMap a fromList :: [(SeqLoc, a)] -> SeqLocMap a insert :: SeqLoc -> a -> SeqLocMap a -> SeqLocMap a lookupWithin :: SeqPos -> SeqLocMap a -> [(SeqLoc, a)] lookupOverlaps :: SeqLoc -> SeqLocMap a -> [(SeqLoc, a)] module Bio.GFF3.Feature data GFFAttr GFFAttr :: !ByteString -> ![ByteString] -> GFFAttr attrTag :: GFFAttr -> !ByteString attrValues :: GFFAttr -> ![ByteString] data Feature Feature :: !ByteString -> !ByteString -> !ByteString -> !Offset -> !Offset -> !Maybe Double -> !Maybe Strand -> !Maybe Offset -> ![GFFAttr] -> Feature seqid :: Feature -> !ByteString source :: Feature -> !ByteString ftype :: Feature -> !ByteString start :: Feature -> 
!Offset end :: Feature -> !Offset score :: Feature -> !Maybe Double strand :: Feature -> !Maybe Strand phase :: Feature -> !Maybe Offset attributes :: Feature -> ![GFFAttr] length :: Feature -> Offset parse :: (Error e, MonadError e m) => ByteString -> m Feature unparse :: Feature -> ByteString parseWithFasta :: (Error e, MonadError e m) => ByteString -> m ([Feature], [ByteString]) attrByTag :: ByteString -> Feature -> [ByteString] ids :: Feature -> [ByteString] parentIds :: Feature -> [ByteString] contigLoc :: Feature -> ContigLoc loc :: Feature -> Loc seqLoc :: Feature -> SeqLoc name :: (Error e, MonadError e m) => Feature -> m SeqName instance Eq Feature instance Ord Feature instance Show Feature instance Eq GFFAttr instance Ord GFFAttr instance Show GFFAttr module Bio.GFF3.FeatureHier data FeatureHier features :: FeatureHier -> (Set Feature) lookupId :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m Feature lookupIdChildren :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m [Feature] fromList :: (Error e, MonadError e m) => [Feature] -> m FeatureHier insert :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier delete :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier parents :: FeatureHier -> Feature -> [Feature] children :: FeatureHier -> Feature -> [Feature] parentsM :: (MonadReader FeatureHier m) => Feature -> m [Feature] childrenM :: (MonadReader FeatureHier m) => Feature -> m [Feature] checkInvariants :: FeatureHier -> [String] instance Show FeatureHier module Bio.GFF3.FeatureHierSequences data FeatureHierSequences features :: FeatureHierSequences -> Set Feature sequences :: FeatureHierSequences -> [Sequence] fromLists :: (Error e, MonadError e m) => [Feature] -> [Sequence] -> m FeatureHierSequences parse :: (Error e, MonadError e m) => ByteString -> m FeatureHierSequences lookupId :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m Feature parents :: FeatureHierSequences -> 
Feature -> [Feature] children :: FeatureHierSequences -> Feature -> [Feature] seqData :: (Error e, MonadError e m) => FeatureHierSequences -> SeqLoc -> m SeqData getSequence :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m SeqData featureSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence runGFF :: FilePath -> (ErrorT String (Reader FeatureHierSequences) a) -> ErrorT String IO a runGFFIO :: FilePath -> (ErrorT String (ReaderT FeatureHierSequences IO) a) -> ErrorT String IO a asksGFF :: (Error e, MonadError e m, MonadReader FeatureHierSequences m) => (FeatureHierSequences -> a -> m b) -> a -> m b instance Show FeatureHierSequences module Bio.GFF3.SGD chromosomes :: FeatureHierSequences -> [Feature] genes :: FeatureHierSequences -> [Feature] rRNAs :: FeatureHierSequences -> [Feature] sortExons :: (Error e, MonadError e m) => [Feature] -> m [Feature] geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc geneCDSes :: FeatureHierSequences -> Feature -> [Feature] noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc noncodingExons :: FeatureHierSequences -> Feature -> [Feature] namedSLM :: FeatureHierSequences -> SeqLocMap Feature geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SeqLocMap Feature) -- | This is a meta-module importing and re-exporting sequence-related -- stuff. -- -- It encompasses the Bio.Sequence.SeqData, -- Bio.Sequence.Fasta, and Bio.Sequence.TwoBit modules. module Bio.Sequence -- | A sequence consists of a header, the sequence data itself, and -- optional quality data. 
data Sequence -- | header and actual sequence Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence -- | An offset, index, or length of a SeqData type Offset = Int64 -- | The basic data type used in Sequences type SeqData = ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Return sequence length. seqlength :: Sequence -> Offset -- | Return sequence label (first word of header) seqlabel :: Sequence -> SeqData -- | Return full header. seqheader :: Sequence -> SeqData -- | Return the sequence data. seqdata :: Sequence -> SeqData -- | Return the quality data, or error if none exist. Use hasqual if in -- doubt. seqqual :: Sequence -> QualData -- | Read the character at the specified position in the sequence. (!) :: Sequence -> Offset -> Char appendHeader :: Sequence -> String -> Sequence -- | Modify the header by appending text, or by replacing all but the -- sequence label (i.e. first word). setHeader :: Sequence -> String -> Sequence -- | Convert a String to SeqData fromStr :: String -> SeqData -- | Convert a SeqData to a String toStr :: SeqData -> String -- | Complement a single character. I.e. identify the nucleotide it can -- hybridize with. Note that for multiple nucleotides, you usually want -- the reverse complement (see revcompl for that). compl :: Char -> Char -- | Calculate the reverse complement. This is only relevant for the -- nucleotide alphabet, and it leaves other characters unmodified. 
revcompl :: Sequence -> Sequence data Amino Ala :: Amino Arg :: Amino Asn :: Amino Asp :: Amino Cys :: Amino Gln :: Amino Glu :: Amino Gly :: Amino His :: Amino Ile :: Amino Leu :: Amino Lys :: Amino Met :: Amino Phe :: Amino Pro :: Amino Ser :: Amino Thr :: Amino Tyr :: Amino Trp :: Amino Val :: Amino STP :: Amino Asx :: Amino Glx :: Amino Xle :: Amino Xaa :: Amino -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence -> Offset -> [Amino] -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence] -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence] -- | Write sequences to a FASTA-formatted file. Line length is 60. writeFasta :: FilePath -> [Sequence] -> IO () -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence] -> IO () -- | Read quality data for sequences from a file. readQual :: FilePath -> IO [Sequence] -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence] -> IO () hWriteQual :: Handle -> [Sequence] -> IO () -- | Read sequence and associated quality. Will error if the sequences and -- qualities do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence] -- | Write sequence and quality data simultaneously. This may be more -- laziness-friendly. 
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO () hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO () readFastQ :: FilePath -> IO [Sequence] writeFastQ :: FilePath -> [Sequence] -> IO () hReadFastQ :: Handle -> IO [Sequence] hWriteFastQ :: Handle -> [Sequence] -> IO () -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO Sequence -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO Sequence -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: ByteString -> [Sequence] -- | Extract sequences from a file in 2bit format. read2Bit :: FilePath -> IO [Sequence] -- | Extract sequences in the 2bit format from a handle. hRead2Bit :: Handle -> IO [Sequence] -- | This is a struct for containing a set of hashing functions data HashF k HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k -- | calculates the hash at a given offset in the sequence hash :: HashF k -> SeqData -> Offset -> Maybe k -- | calculate all hashes from a sequence, and their indices hashes :: HashF k -> SeqData -> [(k, Offset)] -- | for sorting hashes ksort :: HashF k -> [k] -> [k] -- | Contigous constructs an int/eger from a contiguous k-word. contigous :: (Integral k) => Int -> HashF k -- | Like contigous, but returns the same hash for a word and its -- reverse complement. rcontig :: (Integral k) => Int -> HashF k -- | Like rcontig, but ignoring monomers (i.e. arbitrarily long -- runs of a single nucleotide are treated the same as a single nucleotide). rcpacked :: (Integral k) => Int -> HashF k class KWords s kwords :: (KWords s) => Int -> s -> [s] entropy :: (Ord str, KWords str) => Int -> str -> Double -- | Multiple alignments. module Bio.Alignment.Multiple -- | Progressive multiple alignment. Calculate a tree from agglomerative -- clustering, then align at each branch going bottom up. Returns a list -- of columns (rows?). 
progressive :: (Sequence -> Sequence -> (Double, EditList)) -> [Sequence] -> [String] -- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B -- and B|C. This is central for Coffee evaluation of alignments, and -- T-Coffee construction of alignments. indirect :: EditList -> EditList -> EditList