-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A bioinformatics library -- -- This is a collection of data structures and algorithms useful for -- building bioinformatics-related tools and utilities. -- -- Current list of features includes: a Sequence data type supporting -- protein and nucleotide sequences and conversion between them. As of -- version 0.4, different kinds of sequence have different types. Support -- for quality data, reading and writing Fasta formatted files, reading -- TwoBit and phd formats, and Roche/454 SFF files. Rudimentary (i.e. -- unoptimized) support for doing alignments - including dynamic -- adjustment of scores based on sequence quality. Also Blast output -- parsing. Partly implemented single linkage clustering, and multiple -- alignment. Reading Gene Ontology (GO) annotations (GOA) and -- definitions/hierarchy. -- -- The Darcs repository is at: -- http://malde.org/~ketil/biohaskell/biolib. @package bio @version 0.4.7 module Bio.GFF3.Escape unEscapeByteString :: (Error e, MonadError e m) => ByteString -> m ByteString escapeByteString :: (Char -> Bool) -> ByteString -> ByteString escapeAllBut :: String -> ByteString -> ByteString escapeAllOf :: String -> ByteString -> ByteString -- | Lazy "many" combinator for Parsec. Courtesy of Tomasz Zielonka. module Bio.Util.Parsex lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a] -- | Implement clustering module Bio.Clustering -- | Data structure for storing hierarchical clusters data Clustered score datum Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum Leaf :: datum -> Clustered score datum -- | Single linkage agglomerative clustering. Cluster elements by slurping -- a sorted list of pairs with score (i.e. triples :-) Keeps a set of -- contained elements at each branch's root, so O(n log n), and requires -- elements to be in Ord. For this to work, the triples must be sorted on -- score. 
Earlier scores in the list will make up the lower nodes, so -- sort descending for similarity, ascending for distance. cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a] instance (Show score, Show datum) => Show (Clustered score datum) -- | This models the PSL format used by e.g. the alignment tool BLAT. It is -- a simple, textual representation of (spliced) alignments, with -- tab-separated fields. -- -- See http://genome.ucsc.edu/FAQ/FAQformat#format2 for -- details. module Bio.Alignment.PSL data PSL PSL :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> ByteString -> ByteString -> Int -> Int -> Int -> ByteString -> Int -> Int -> Int -> Int -> [Int] -> [Int] -> [Int] -> PSL match :: PSL -> Int mismatch :: PSL -> Int repmatch :: PSL -> Int ncount :: PSL -> Int qgapcount :: PSL -> Int qgaplength :: PSL -> Int tgapcount :: PSL -> Int tgaplength :: PSL -> Int strand :: PSL -> ByteString qname :: PSL -> ByteString qsize :: PSL -> Int qstart :: PSL -> Int qend :: PSL -> Int tname :: PSL -> ByteString tsize :: PSL -> Int tstart :: PSL -> Int tend :: PSL -> Int blockcount :: PSL -> Int blocksizes :: PSL -> [Int] qstarts :: PSL -> [Int] tstarts :: PSL -> [Int] readPSL :: FilePath -> IO [PSL] writePSL :: FilePath -> [PSL] -> IO () parsePSL :: ByteString -> [PSL] unparsePSL :: [PSL] -> ByteString pslHeader :: ByteString instance Eq PSL instance Show PSL -- | Utility module, with various useful stuff. module Bio.Util lines :: ByteString -> [ByteString] -- | Break a list of bytestrings on a predicate. splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]] -- | Output (to stderr) progress while evaluating a lazy list. Useful for -- generating output while (conceptually, at least) in pure code countIO :: String -> String -> Int -> [a] -> IO [a] -- | A lazier version of Control.Monad.sequence in -- Control.Monad, needed by countIO above.
sequence' :: [IO a] -> IO [a] -- | Workaround, the current Data.ByteString.Lazy.Char8 contains a -- bug in Data.ByteString.Lazy.Char8.lines. mylines :: ByteString -> [ByteString] -- | This module implements a hierarchical data structure for BLAST -- results, there is an alternative flat structure in the -- Bio.Alignment.BlastFlat module. -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions of BLAST, so expect some -- incompatibilities if you're using a different BLAST version. -- -- For parsing BLAST results, the XML format (blastall -m 7) is by far -- the most robust choice, and is implemented in the -- Bio.Alignment.BlastXML module. -- -- The format is straightforward (and non-recursive). For more -- information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.BlastData -- | The sequence id, i.e. the first word of the header field. type SeqId = ByteString -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | A BlastResult is the root of the hierarchy.
data BlastResult BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult blastprogram :: BlastResult -> !ByteString blastversion :: BlastResult -> !ByteString blastdate :: BlastResult -> !ByteString blastreferences :: BlastResult -> !ByteString database :: BlastResult -> !ByteString dbsequences :: BlastResult -> !Integer dbchars :: BlastResult -> !Integer results :: BlastResult -> [BlastRecord] -- | Each query sequence generates a BlastRecord data BlastRecord BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord query :: BlastRecord -> !SeqId qlength :: BlastRecord -> !Int hits :: BlastRecord -> [BlastHit] -- | Each match between a query and a target sequence (or subject) is a -- BlastHit. data BlastHit BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit subject :: BlastHit -> !SeqId slength :: BlastHit -> !Int matches :: BlastHit -> [BlastMatch] -- | A BlastHit may contain multiple separate matches (typically -- when an indel causes a frameshift that blastx is unable to bridge). data BlastMatch BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch bits :: BlastMatch -> !Double e_val :: BlastMatch -> !Double identity :: BlastMatch -> (Int, Int) q_from :: BlastMatch -> !Int q_to :: BlastMatch -> !Int h_from :: BlastMatch -> !Int h_to :: BlastMatch -> !Int aux :: BlastMatch -> !Aux instance Show BlastMatch instance Show BlastHit instance Show BlastRecord instance Show BlastResult instance Show Aux instance Eq Aux instance Read Strand instance Show Strand instance Eq Strand -- | Parse blast XML output. -- -- If you use a recent version of NCBI BLAST and specify XML output -- (blastall -m 7), this module should be able to parse the result into a -- hierarchical BlastResult structure. -- -- While the process may consume a bit of memory, the parsing is lazy, -- and file sizes of several gigabytes can be parsed (see e.g.
the xml2x -- tool for an example). To parse XML, we use Text.HTML.TagSoup. module Bio.Alignment.BlastXML -- | Parse BLAST results in XML format readXML :: FilePath -> IO [BlastResult] -- | This module implements a "flattened" data structure for Blast hits, as -- opposed to the hierarchical structure in -- Bio.Alignment.BlastData. -- -- The flat data type is useful in many cases where it is more natural to -- see the result as a set of rows (e.g. for insertion in a database). -- -- It would probably be more (memory-) efficient to go the other way -- (i.e. from flat to hierarchical), as passing the current, partially -- built BlastFlat object down the stream of results and stamping -- out a stream of completed ones. (See -- Bio.Alignment.BlastXML.breaks for this week's most cumbersome -- use of parallelism to avoid the memory issue.) module Bio.Alignment.BlastFlat -- | The BlastFlat data structure contains information about a single match data BlastFlat BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat query :: BlastFlat -> !SeqId qlength :: BlastFlat -> !Int subject :: BlastFlat -> !SeqId slength :: BlastFlat -> !Int bits :: BlastFlat -> !Double e_val :: BlastFlat -> !Double identity :: BlastFlat -> (Int, Int) q_from :: BlastFlat -> !Int q_to :: BlastFlat -> !Int h_from :: BlastFlat -> !Int h_to :: BlastFlat -> !Int aux :: BlastFlat -> !Aux readXML :: FilePath -> IO [BlastFlat] -- | Convert BlastRecords into BlastFlats (representing a depth-first -- traversal of the BlastRecord structure.)
flatten :: [BlastRecord] -> [BlastFlat] -- | Each query sequence generates a BlastRecord data BlastRecord blastprogram :: BlastResult -> ByteString blastversion :: BlastResult -> ByteString blastdate :: BlastResult -> ByteString blastreferences :: BlastResult -> ByteString database :: BlastResult -> ByteString dbsequences :: BlastResult -> Integer dbchars :: BlastResult -> Integer results :: BlastResult -> [BlastRecord] -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | This module implements a parser for BLAST results. -- -- This module is DEPRECATED. It is *very* recommended that you run blast -- with XML output instead, and use the BlastXML module to parse it. -- Don't say I didn't warn you! -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions, so expect some -- incompatibilities if you're using a different BLAST version. -- -- The format is straightforward (and non-recursive), and this -- implementation uses a simple line-based, hierarchical parser. -- -- For more information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.Blast parse :: ByteString -> BlastResult module Bio.Sequence.SFF_name -- | Read names encode various information, as per this struct.
data ReadName ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName date :: ReadName -> (Int, Int, Int) time :: ReadName -> (Int, Int, Int) region :: ReadName -> Int x_loc :: ReadName -> Int y_loc :: ReadName -> Int decodeReadName :: ByteString -> Maybe ReadName decodeLocation :: ByteString -> Maybe (Int, Int) decodeDate :: ByteString -> Maybe [Int] encodeReadName :: ReadName -> ByteString encodeLocation :: Int -> Int -> ByteString encodeRegion :: Int -> ByteString encodeDate :: (Int, Int, Int) -> (Int, Int, Int) -> ByteString divMods :: Int -> [Int] -> [Int] decode36 :: ByteString -> Maybe Int decCh :: Char -> Maybe Int encode36 :: Int -> ByteString b36 :: UArray Int Char instance Show ReadName -- | GeneOntology - parse and index Gene Ontology Annotations In -- particular, the file 'gene_association.goa_uniprot' that contains -- links between GO terms and UniProt accessions. -- --
-- AS contigs reads -- CO contig_name bases reads segments compl (CAP3: segments=0) -- sequence -- BQ base_qualities -- AF read1 compl padded_start_consensus (negatives meaning?) -- AF read2 .. -- BS segments -- RD read1 bases info_items info_tags (latter two set to 0 by CAP3) -- sequence -- QA read1 qual_start qual_end align_start align_end -- DS (phred header? left empty by CAP3) -- RD read2 ... ---- -- As far as I know, this is only used for nucleotide sequences. module Bio.Alignment.ACE -- | Reading an ACE file. readACE :: FilePath -> IO [[Assembly]] writeACE :: FilePath -> [Assembly] -> IO () data Assembly Asm :: (Sequence Nuc, Gaps) -> Alignment Nuc -> Assembly contig :: Assembly -> (Sequence Nuc, Gaps) fragments :: Assembly -> Alignment Nuc -- | Test parser p on a list of ACE elements ptest :: (Show a) => String -> AceParser a -> [ACE] -> IO () reads :: Assembly -> Alignment Nuc instance Eq ACE instance Show Assembly instance Show ACE -- | Data types for functorially lifting sequence positions and locations -- onto named sequences. These are useful for taking functions that work -- with sequence positions and locations and associating them with specific, -- named sequences. module Bio.Location.OnSeq -- | Sequence name, as in a Sequence type SeqName = SeqData -- | Data type for an object associated with a specific, named sequence data OnSeq a OnSeq :: !SeqName -> !a -> OnSeq a onSeqName :: OnSeq a -> !SeqName onSeqObj :: OnSeq a -> !a -- | Looks up a sequence by name and applies a function to it withSeqData :: (Monad m) => (SeqData -> a -> m b) -> (SeqName -> m SeqData) -> OnSeq a -> m b -- | Tests a predicate when two objects are on the same sequence, returning -- False if they are on different sequences. andSameSeq :: (a -> b -> Bool) -> OnSeq a -> OnSeq b -> Bool -- | Performs an action when two objects are on the same sequence and -- produces an error otherwise.
onSameSeq :: (Error e, MonadError e m) => (a -> b -> m c) -> OnSeq a -> OnSeq b -> m c -- | Data type for a collection of objects indexed by sequence name type OnSeqs a = Map SeqName a -- | Lifts a function on an underlying object to look up the sequence name -- in a name-indexed collection. perSeq :: (Monoid b) => (a -> b -> c) -> OnSeq a -> OnSeqs b -> c -- | Lifts a function that updates an underlying object to look up the -- named sequence and update a named-index collection. perSeqUpdate :: (Monoid b) => (a -> b -> b) -> OnSeq a -> OnSeqs b -> OnSeqs b -- | Lifts a function on underlying objects to look up a sequence in a -- name-indexed collection withNameAndSeq :: (Monad m) => (SeqName -> a -> b -> m c) -> OnSeq a -> OnSeqs b -> m c instance (Eq a) => Eq (OnSeq a) instance (Ord a) => Ord (OnSeq a) instance (Show a) => Show (OnSeq a) instance Functor OnSeq -- | Utilities for manipulating nucleotide sequences and locations on -- nucleotide sequences that occur on a forward or a reverse-complement -- strand. module Bio.Location.Strand -- | Sequence strand data Strand Fwd :: Strand RevCompl :: Strand -- | A nucleotide sequence or location on a nucleotide sequence that lies -- on a specific strand and has an orientation. class Stranded s revCompl :: (Stranded s) => s -> s -- | Convert the orientation of a Stranded thing based on a -- specified Strand stranded :: (Stranded s) => Strand -> s -> s instance Eq Strand instance Ord Strand instance Show Strand instance Read Strand instance Bounded Strand instance Enum Strand instance Ix Strand instance Stranded ByteString instance Stranded Char instance Stranded Strand -- | Data type for a sequence position. -- -- Zero-based Offset / Int64 indices are used throughout, -- to facilitate direct use of indexing functions on SeqData. 
module Bio.Location.Position -- | Position in a sequence data Pos Pos :: !Offset -> !Strand -> Pos -- | 0-based index of the position offset :: Pos -> !Offset -- | Strand of the position strand :: Pos -> !Strand -- | Returns a position resulting from sliding the original position along -- the sequence by a specified offset. A positive offset will move the -- position away from the 5' end of the forward strand of the sequence -- regardless of the strand of the position itself. Thus, -- --
-- slide (revCompl pos) off == revCompl (slide pos off) -- slide :: Pos -> Offset -> Pos -- | Extract the nucleotide at a specific sequence position. If the -- position lies outside the bounds of the sequence, an error results. seqNt :: (Error e, MonadError e m) => SeqData -> Pos -> m Char -- | As seqNt, extract the nucleotide at a specific sequence -- position, but return N when the position lies outside the -- bounds of the sequence. -- --
-- seqNtPadded sequ pos == (either (const 'N') id . seqNt sequ) pos -- seqNtPadded :: SeqData -> Pos -> Char -- | Display a human-friendly, zero-based representation of a sequence -- position. display :: Pos -> String instance Eq Pos instance Ord Pos instance Show Pos instance Read Pos instance Ix Pos instance Stranded Pos -- | Data type for a sequence location consisting of a contiguous range of -- positions on the sequence. -- -- Throughout, sequence position refers to a Pos which -- includes a strand. An index into a sequence is referred to as an -- offset, and is generally of type Offset. module Bio.Location.ContigLocation -- | Contiguous sequence location defined by a span of sequence positions, -- lying on a specific strand of the sequence. data ContigLoc ContigLoc :: !Offset -> !Offset -> !Strand -> ContigLoc -- | The offset of the 5' end of the location, as a 0-based index offset5 :: ContigLoc -> !Offset -- | The length of the location length :: ContigLoc -> !Offset -- | The strand of the location strand :: ContigLoc -> !Strand -- | Create a sequence location lying between 0-based starting and ending -- offsets. When start < end, the location will be on the forward -- strand, otherwise it will be on the reverse complement strand. fromStartEnd :: Offset -> Offset -> ContigLoc -- | Create a sequence location from the sequence position of the start of -- the location and the length of the location. The strand of the -- location, and the direction it extends from the starting position, are -- determined by the strand of the starting position. fromPosLen :: Pos -> Offset -> ContigLoc -- | The bounds of a sequence location. This is a pair consisting of the -- lowest and highest sequence offsets covered by the region. The bounds -- ignore the strand of the sequence location, and the first element of -- the pair will always be lower than the second. bounds :: ContigLoc -> (Offset, Offset) -- | Sequence position of the start of the location.
This is the 5' end on -- the location strand, which will have a higher offset than -- endPos if the location is on the RevCompl strand. startPos :: ContigLoc -> Pos -- | Sequence position of the end of the location, as described in -- startPos. endPos :: ContigLoc -> Pos -- | Given a sequence position and a sequence location relative to the same -- sequence, compute a new position representing the original position -- relative to the subsequence defined by the location. If the sequence -- position lies outside of the sequence location, Nothing is -- returned; thus, the offset of the new position will always be in the -- range [0, length cloc - 1]. posInto :: Pos -> ContigLoc -> Maybe Pos -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies outside -- the location, Nothing is returned. -- -- This function inverts posInto when the sequence position actually -- lies within the location. posOutof :: Pos -> ContigLoc -> Maybe Pos -- | Returns True when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos -> ContigLoc -> Bool -- | Returns True when two sequence locations overlap at any -- position. overlaps :: ContigLoc -> ContigLoc -> Bool -- | Extract the nucleotide SeqData for the sequence location. If -- any part of the location lies outside the bounds of the sequence, an -- error results. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData -- | As seqData, extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds of -- the sequence are returned as N rather than producing an -- error. seqDataPadded :: SeqData -> ContigLoc -> SeqData -- | Returns a location resulting from sliding the original location along -- the sequence by a specified offset.
A positive offset will move the -- location away from the 5' end of the forward strand of the sequence -- regardless of the strand of the location itself. Thus, -- --
-- slide (revCompl cloc) off == revCompl (slide cloc off) -- slide :: Offset -> ContigLoc -> ContigLoc -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (5' extension, 3' -- extension). The 5' extension is applied to the 5' end of the location -- on the location strand; if the location is on the RevCompl -- strand, the 5' end will have a higher offset than the 3' end and this -- offset will increase by the amount of the 5' extension. Similarly, the -- 3' extension is applied to the 3' end of the location. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc -- | Display a human-friendly, zero-based representation of a sequence -- location. display :: ContigLoc -> String instance Eq ContigLoc instance Ord ContigLoc instance Show ContigLoc instance Stranded ContigLoc -- | Data type for a more general sequence location consisting of -- potentially disjoint ranges of positions on the sequence. -- -- Throughout, sequence position refers to a Pos which -- includes a strand. An index into a sequence is referred to as an -- offset, and is generally of type Offset. module Bio.Location.Location -- | General (disjoint) sequence region consisting of a concatenated set of -- contiguous regions (see ContigLoc). newtype Loc Loc :: [ContigLoc] -> Loc -- | The bounds of a sequence location. This is a pair consisting of the -- lowest and highest sequence offsets covered by the region. The bounds -- ignore the strand of the sequence location, and the first element of -- the pair will always be lower than the second. Even if the positions -- in the location do not run monotonically through the location, the -- overall lowest and highest sequence offsets are returned. bounds :: Loc -> (Offset, Offset) -- | Returns the length of the region length :: Loc -> Offset -- | Sequence position of the start of the location.
This is the 5' end on -- the location strand, which will have a higher offset than -- endPos if the location is on the RevCompl strand. startPos :: Loc -> Pos -- | Sequence position of the end of the location, as described in -- startPos. endPos :: Loc -> Pos -- | Given a sequence position and a sequence location relative to the same -- sequence, compute a new position representing the original position -- relative to the subsequence defined by the location. If the sequence -- position lies outside of the sequence location, Nothing is -- returned; thus, the offset of the new position will always be in the -- range [0, length cloc - 1]. -- -- When the sequence positions in the location are not monotonic, there -- may be multiple possible posInto solutions. That is, if the same outer -- sequence position is covered by two different contiguous blocks of the -- location, then it would have two possible sequence positions relative -- to the location. In this case, the position 5'-most in the location -- orientation is returned. posInto :: Pos -> Loc -> Maybe Pos -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies outside -- the location, Nothing is returned. -- -- This function inverts posInto when the sequence position actually -- lies within the location. Due to the -- possibility of redundant location-relative positions for a given -- absolute position, posInto does not necessarily invert -- posOutof. posOutof :: Pos -> Loc -> Maybe Pos -- | Returns True when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos -> Loc -> Bool -- | Returns True when two sequence locations overlap at any -- position. overlaps :: Loc -> Loc -> Bool -- | Extract the nucleotide SeqData for the sequence location.
If -- any part of the location lies outside the bounds of the sequence, an -- error results. seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData -- | As seqData, extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds of -- the sequence are returned as N rather than producing an -- error. seqDataPadded :: SeqData -> Loc -> SeqData -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (5' extension, 3' -- extension). These add contiguous positions to the 5' and 3' ends of -- the original location. The 5' extension is applied to the 5' end of -- the location on the location strand; if the location is on the -- RevCompl strand, the 5' end will have a higher offset than the -- 3' end and this offset will increase by the amount of the 5' -- extension. Similarly, the 3' extension is applied to the 3' end of the -- location. extend :: (Offset, Offset) -> Loc -> Loc -- | Display a human-friendly, zero-based representation of a sequence -- location. display :: Loc -> String instance Eq Loc instance Ord Loc instance Show Loc instance Stranded Loc -- | Data types for sequence locations and sequence positions associated -- with specific, named sequences. module Bio.Location.SeqLocation -- | A position on a named sequence type SeqPos = OnSeq Pos -- | A location consisting of a contiguous span of positions on a named -- sequence. type ContigSeqLoc = OnSeq ContigLoc -- | Test whether a sequence position lies within a sequence location. This -- requires that the position lie within the location as per -- isWithin and have the same sequence name. withinContigSeqLoc :: SeqPos -> ContigSeqLoc -> Bool -- | A general location, consisting of spans of sequence positions on a -- specific, named sequence. type SeqLoc = OnSeq Loc -- | Test whether a sequence position lies within a sequence location.
This -- requires that the position lie within the location as per -- isWithin and have the same sequence name. isWithin :: SeqPos -> SeqLoc -> Bool -- | Test whether two sequence locations overlap in any position. This -- requires that the locations overlap as per overlaps and have -- the same sequence name. overlaps :: SeqLoc -> SeqLoc -> Bool -- | Extract the subsequence specified by a sequence location from a -- sequence database. The sequence name is used to retrieve the full -- sequence and the subsequence is extracted as by seqData seqData :: (Error e, MonadError e m) => (SeqName -> m SeqData) -> SeqLoc -> m SeqData -- | Display a human-friendly representation of a SeqPos displaySeqPos :: SeqPos -> String -- | Display a human-friendly representation of a ContigSeqLoc displayContigSeqLoc :: ContigSeqLoc -> String -- | Display a human-friendly representation of a SeqLoc display :: SeqLoc -> String -- | This module provides a data type to represent an alignment produced by -- the Bowtie short-read alignment tool (see -- http://bowtie-bio.sourceforge.net/index.shtml). -- -- The simple accessors recapitulate the details of the Bowtie alignment -- output. The position of the alignment is given by the "0-based offset -- into the reference sequence where leftmost character of the alignment -- occurs". Thus, for forward-strand alignments this is the 5' end of the -- query sequence while for reverse-complement alignments this is the 3' -- end of the query sequence. Similarly, the query sequence and query -- quality are shown in reference forward strand orientation, and thus -- may be reverse complemented. 
module Bio.Alignment.Bowtie data Align Align :: !SeqName -> !Strand -> !SeqName -> !Offset -> !SeqData -> !QualData -> ![Mismatch] -> Align -- | Name of the query sequence name :: Align -> !SeqName -- | Strand of the alignment on the reference sequence strand :: Align -> !Strand -- | Name of the reference sequence refname :: Align -> !SeqName -- | Zero-based offset of the left-most aligned position in the reference leftoffset :: Align -> !Offset -- | Query sequence, in the reference forward strand orientation sequ :: Align -> !SeqData -- | Query quality, in the reference forward strand orientation qual :: Align -> !QualData -- | Mismatches mismatches :: Align -> ![Mismatch] -- | Representation of a single mismatch in a bowtie alignment data Mismatch Mismatch :: !Offset -> !Char -> !Char -> Mismatch -- | Offset of the mismatch site from the 5' end of the query mmoffset :: Mismatch -> !Offset -- | Reference nucleotide refbase :: Mismatch -> !Char -- | Query nucleotide readbase :: Mismatch -> !Char -- | Returns the length of the query sequence length :: Align -> Offset -- | Returns the number of mismatches in the alignment nmismatch :: Align -> Int -- | Query sequence as given in the query file querySequ :: Align -> SeqData -- | Query quality as given in the query file queryQual :: Align -> QualData -- | As refCSeqLoc but without the reference sequence name. refCLoc :: Align -> ContigLoc -- | Returns the sequence location covered by the query in the alignment. -- This will be a sequence location on the reference sequence and may run -- on the forward or the reverse complement strand. refCSeqLoc :: Align -> ContigSeqLoc -- | Returns the sequence location covered by the query, as -- refCSeqLoc, as a SeqLoc location. refSeqLoc :: Align -> SeqLoc -- | Returns the sequence position of the start of the query sequence -- alignment. 
This will include the strand of the alignment and will not -- be the same as the position computed from leftoffset when the -- alignment is on the reverse complement strand. refSeqPos :: Align -> SeqPos -- | Sequence position of a mismatch on the reference sequence. mismatchSeqPos :: Align -> Mismatch -> SeqPos -- | Parses a line of Bowtie output to produce an Align. parse :: ByteString -> Either String Align -- | Returns true when two alignments were derived from the same sequencing -- read. As Bowtie writes alignments of query sequences in their order in -- the query file, all alignments of a given read are grouped together -- and the lists of all alignments for each read can be gathered with -- --
-- groupBy sameRead --sameRead :: Align -> Align -> Bool instance Read Mismatch instance Show Mismatch instance Eq Mismatch instance Ord Mismatch instance Read Align instance Show Align instance Eq Align instance Ord Align module Bio.Alignment.Soap -- | Alignment output from SOAP data SoapAlign SA :: !SeqName -> !SeqData -> !QualData -> !Int -> !Char -> !Offset -> !Strand -> !SeqName -> !Offset -> !Int -> ![SoapAlignMismatch] -> SoapAlign name :: SoapAlign -> !SeqName -- | Reference strand orientation sequence sequ :: SoapAlign -> !SeqData -- | Reference strand orientation quality data qual :: SoapAlign -> !QualData nhit :: SoapAlign -> !Int pairend :: SoapAlign -> !Char length :: SoapAlign -> !Offset strand :: SoapAlign -> !Strand refname :: SoapAlign -> !SeqName -- | 1-based index, as output by SOAP, of reference strand 5' end refstart :: SoapAlign -> !Offset nmismatch :: SoapAlign -> !Int mismatches :: SoapAlign -> ![SoapAlignMismatch] data SoapAlignMismatch SAM :: !Char -> !Char -> !Offset -> !Qual -> SoapAlignMismatch -- | Read nt in reference strand orientation readnt :: SoapAlignMismatch -> !Char -- | Reference nt in reference strand orientation refnt :: SoapAlignMismatch -> !Char -- | Offset from reference strand 5' end in reference strand orientation offset :: SoapAlignMismatch -> !Offset -- | Quality score of read nt qualnt :: SoapAlignMismatch -> !Qual refSeqPos :: SoapAlign -> SeqPos refCSeqLoc :: SoapAlign -> ContigSeqLoc refSeqLoc :: SoapAlign -> SeqLoc mismatchSeqPos :: SoapAlign -> SoapAlignMismatch -> SeqPos parse :: (Error e, MonadError e m) => ByteString -> m SoapAlign unparse :: SoapAlign -> ByteString parseMismatch :: (Error e, MonadError e m) => ByteString -> m SoapAlignMismatch unparseMismatch :: SoapAlignMismatch -> ByteString group :: [SoapAlign] -> [[SoapAlign]] instance Read SoapAlignMismatch instance Show SoapAlignMismatch instance Eq SoapAlignMismatch instance Ord SoapAlignMismatch instance Read SoapAlign instance Show SoapAlign 
instance Eq SoapAlign instance Ord SoapAlign -- | Model the BED format, according to the spec at -- http://genome.ucsc.edu/FAQ/FAQformat#format1 module Bio.Alignment.BED -- | The BED data type. Note that the specification allows a variable number -- of fields, with only the first three required. This definition -- requires all fields to be present. data BED BED :: ByteString -> Offset -> Offset -> ByteString -> Int -> Dir -> Offset -> Offset -> (Word8, Word8, Word8) -> [(Offset, Offset)] -> BED chrom :: BED -> ByteString chromStart :: BED -> Offset chromEnd :: BED -> Offset name :: BED -> ByteString -- | Range 0..1000 score :: BED -> Int strand :: BED -> Dir thickStart :: BED -> Offset thickEnd :: BED -> Offset -- | Available BED files appear to not support this format. RGB is -- therefore ignored (read and written as '0') itemRGB :: BED -> (Word8, Word8, Word8) -- | Lists of length blockCount; blockStarts are relative to chromStart blockSizeStart :: BED -> [(Offset, Offset)] -- | Yet another direction data structure. data Dir Fwd :: Dir Rev :: Dir readBED :: FilePath -> IO [BED] writeBED :: FilePath -> [BED] -> IO () instance Eq Dir instance Show BED instance Read Dir instance Show Dir -- | Efficient lookup of sequence positions and locations in a large map of -- target locations. For example, target locations might represent a -- collection of genes annotated on a chromosome. The LocMap would -- efficiently find which gene(s) overlapped a sequence position on that -- chromosome. -- -- Target locations are assigned to one or more zones based on -- bounds. Query locations are then tested only against the target -- locations in the relevant zones. module Bio.Location.LocMap -- | Data structure allowing efficient lookup of target sequence locations -- that overlap a query location. Target locations can be paired with an -- arbitrary object. data LocMap a -- | Create a LocMap from an association list of target locations.
fromList :: Offset -> [(Loc, a)] -> LocMap a -- | Find the (possibly empty) list of target locations and associated -- objects that contain a sequence position, in the sense of -- isWithin lookupWithin :: Pos -> LocMap a -> [(Loc, a)] -- | Find the (possibly empty) list of target locations and associated -- objects that overlap a sequence location, in the sense of -- overlaps lookupOverlaps :: Loc -> LocMap a -> [(Loc, a)] -- | Remove a target location and object association from the map, if it is -- present. If it is present multiple times, only the first occurrence -- will be deleted. delete :: (Eq a) => (Loc, a) -> LocMap a -> LocMap a -- | Generalized version of delete that removes the first target -- location / object association that satisfies a predicate function. deleteBy :: ((Loc, a) -> Bool) -> LocMap a -> LocMap a -- | Insert a new target association into a target location map. insert :: Loc -> a -> LocMap a -> LocMap a checkInvariants :: LocMap a -> [String] instance Monoid (LocMap a) -- | Efficient lookup of query positions in a collection of target sequence -- locations where positions and locations are associated with specific -- sequence names. This is an extension of LocMap to use -- locations and positions on named sequences as in SeqLocation. module Bio.Location.SeqLocMap -- | A data structure for efficiently finding target sequence locations -- (SeqLoc.Loc) that overlap query positions or locations. Each -- target location can be associated with an arbitrary additional value -- in the lookup map. type SeqLocMap a = OnSeqs (LocMap a) -- | Empty lookup map. empty :: SeqLocMap a -- | Creates a SeqLocMap from a list of target locations and their -- associated objects fromList :: [(SeqLoc, a)] -> SeqLocMap a -- | Inserts a new target location and associated object into the location -- lookup map. 
insert :: SeqLoc -> a -> SeqLocMap a -> SeqLocMap a -- | Find the (possibly empty) list of target locations and associated -- objects that contain a sequence position, in the sense of -- Loc.isWithin. lookupWithin :: SeqPos -> SeqLocMap a -> [(SeqLoc, a)] -- | Find the (possibly empty) list of target locations and associated -- objects that overlap a sequence location, in the sense of -- Loc.overlaps. lookupOverlaps :: SeqLoc -> SeqLocMap a -> [(SeqLoc, a)] module Bio.GFF3.Feature data GFFAttr GFFAttr :: !ByteString -> ![ByteString] -> GFFAttr attrTag :: GFFAttr -> !ByteString attrValues :: GFFAttr -> ![ByteString] data Feature Feature :: !ByteString -> !ByteString -> !ByteString -> !Offset -> !Offset -> !Maybe Double -> !Maybe Strand -> !Maybe Offset -> ![GFFAttr] -> Feature seqid :: Feature -> !ByteString source :: Feature -> !ByteString ftype :: Feature -> !ByteString start :: Feature -> !Offset end :: Feature -> !Offset score :: Feature -> !Maybe Double strand :: Feature -> !Maybe Strand phase :: Feature -> !Maybe Offset attributes :: Feature -> ![GFFAttr] length :: Feature -> Offset parse :: (Error e, MonadError e m) => ByteString -> m Feature unparse :: Feature -> ByteString parseWithFasta :: (Error e, MonadError e m) => ByteString -> m ([Feature], [ByteString]) attrByTag :: ByteString -> Feature -> [ByteString] ids :: Feature -> [ByteString] parentIds :: Feature -> [ByteString] contigLoc :: Feature -> ContigLoc loc :: Feature -> Loc seqLoc :: Feature -> SeqLoc name :: (Error e, MonadError e m) => Feature -> m SeqName instance Eq Feature instance Ord Feature instance Show Feature instance Eq GFFAttr instance Ord GFFAttr instance Show GFFAttr module Bio.GFF3.FeatureHier data FeatureHier features :: FeatureHier -> (Set Feature) lookupId :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m Feature lookupIdChildren :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m [Feature] fromList :: (Error e, MonadError e m) => [Feature] -> m FeatureHier 
insert :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier delete :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier parents :: FeatureHier -> Feature -> [Feature] children :: FeatureHier -> Feature -> [Feature] parentsM :: (MonadReader FeatureHier m) => Feature -> m [Feature] childrenM :: (MonadReader FeatureHier m) => Feature -> m [Feature] checkInvariants :: FeatureHier -> [String] instance Show FeatureHier module Bio.GFF3.FeatureHierSequences data FeatureHierSequences features :: FeatureHierSequences -> Set Feature sequences :: FeatureHierSequences -> [Sequence a] fromLists :: (Error e, MonadError e m) => [Feature] -> [Sequence a] -> m FeatureHierSequences parse :: (Error e, MonadError e m) => ByteString -> m FeatureHierSequences lookupId :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m Feature parents :: FeatureHierSequences -> Feature -> [Feature] children :: FeatureHierSequences -> Feature -> [Feature] seqData :: (Error e, MonadError e m) => FeatureHierSequences -> SeqLoc -> m SeqData getSequence :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m SeqData featureSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) runGFF :: FilePath -> (ErrorT String (Reader FeatureHierSequences) a) -> ErrorT String IO a runGFFIO :: FilePath -> (ErrorT String (ReaderT FeatureHierSequences IO) a) -> ErrorT String IO a asksGFF :: (Error e, MonadError e m, MonadReader FeatureHierSequences m) => (FeatureHierSequences -> a -> m b) -> a -> m b instance Show FeatureHierSequences module Bio.GFF3.SGD chromosomes :: FeatureHierSequences -> [Feature] genes :: FeatureHierSequences -> [Feature] rRNAs :: FeatureHierSequences -> [Feature] sortExons :: (Error e, MonadError e m) => [Feature] -> m [Feature] geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m 
SeqLoc geneCDSes :: FeatureHierSequences -> Feature -> [Feature] noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc noncodingExons :: FeatureHierSequences -> Feature -> [Feature] namedSLM :: FeatureHierSequences -> SeqLocMap Feature geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SeqLocMap Feature) -- | This is a meta-module importing and re-exporting sequence-related -- stuff. -- -- It encompasses the Bio.Sequence.SeqData, -- Bio.Sequence.Fasta, and Bio.Sequence.TwoBit modules. module Bio.Sequence -- | A sequence consists of a header, the sequence data itself, and -- optional quality data. The type parameter is a phantom type to -- separate nucleotide and amino acid sequences data Sequence t -- | header and actual sequence Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence t data Unknown -- | An offset, index, or length of a SeqData type Offset = Int64 -- | The basic data type used in Sequences type SeqData = ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Return sequence length. seqlength :: Sequence a -> Offset -- | Return sequence label (first word of header) seqlabel :: Sequence a -> SeqData -- | Return full header. seqheader :: Sequence a -> SeqData -- | Return the sequence data. seqdata :: Sequence a -> SeqData -- | Return the quality data, or error if none exist. Use hasqual if in -- doubt. seqqual :: Sequence a -> QualData -- | Read the character at the specified position in the sequence. (!) 
:: Sequence a -> Offset -> Char appendHeader :: Sequence a -> String -> Sequence a -- | Modify the header by appending text, or by replacing all but the -- sequence label (i.e. first word). setHeader :: Sequence a -> String -> Sequence a -- | Convert a String to SeqData fromStr :: String -> SeqData -- | Convert a SeqData to a String toStr :: SeqData -> String -- | Complement a single character. I.e. identify the nucleotide it can -- hybridize with. Note that for multiple nucleotides, you usually want -- the reverse complement (see revcompl for that). compl :: Char -> Char -- | Calculate the reverse complement. This is only relevant for the -- nucleotide alphabet, and it leaves other characters unmodified. revcompl :: Sequence Nuc -> Sequence Nuc -- | Calculate the reverse complement for SeqData only. revcompl' :: SeqData -> SeqData -- | For type tagging sequences (protein sequences use Amino below) data Nuc castToNuc :: Sequence a -> Sequence Nuc data Amino Ala :: Amino Arg :: Amino Asn :: Amino Asp :: Amino Cys :: Amino Gln :: Amino Glu :: Amino Gly :: Amino His :: Amino Ile :: Amino Leu :: Amino Lys :: Amino Met :: Amino Phe :: Amino Pro :: Amino Ser :: Amino Thr :: Amino Tyr :: Amino Trp :: Amino Val :: Amino STP :: Amino Asx :: Amino Glx :: Amino Xle :: Amino Xaa :: Amino -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence Nuc -> Offset -> [Amino] -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData castToAmino :: Sequence a -> Sequence Amino -- | Returns a sequence with all internal storage freshly copied and with -- sequence and quality data present as a single chunk.
-- -- By freshly copying internal storage, defragSeq allows garbage -- collection of the original data source whence the sequence was read; -- otherwise, use of just a short sequence name can cause an entire -- sequence file buffer to be retained. -- -- By compacting sequence data into a single chunk, defragSeq -- avoids linear-time traversal of sequence chunks during random access -- into sequence data. defragSeq :: Sequence t -> Sequence t -- | map over sequences, treating them as a sequence of (char,word8) pairs. -- This will work on sequences without quality, as long as the function -- doesn't try to examine it. The current implementation is not very -- efficient. seqmap :: ((Char, Qual) -> (Char, Qual)) -> Sequence t -> Sequence t -- | Read nucleotide sequences in any format - Fasta, SFF, FastQ, 2bit, -- PHD... readNuc :: FilePath -> IO [Sequence Nuc] -- | Read protein sequences in any supported format (i.e. Fasta) readProt :: FilePath -> IO [Sequence Amino] -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence Unknown] -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence Unknown] -- | Write sequences to a FASTA-formatted file. Line length is 60. writeFasta :: FilePath -> [Sequence a] -> IO () -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence a] -> IO () -- | Read quality data for sequences from a file. readQual :: FilePath -> IO [Sequence Unknown] -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence a] -> IO () hWriteQual :: Handle -> [Sequence a] -> IO () -- | Read sequence and associated quality. Will error if the sequences and -- qualities do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence Unknown] -- | Write sequence and quality data simultaneously. This may be more -- laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence a] -> IO () hWriteFastaQual :: Handle -> Handle -> [Sequence a] -> IO () readFastQ :: FilePath -> IO [Sequence Nuc] writeFastQ :: FilePath -> [Sequence a] -> IO () hReadFastQ :: Handle -> IO [Sequence Nuc] hWriteFastQ :: Handle -> [Sequence a] -> IO () -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO (Sequence Nuc) -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO (Sequence Nuc) -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: ByteString -> [Sequence Unknown] -- | Read sequences from a file in 2bit format and unmarshal/deserialize -- into Sequence format. read2Bit :: FilePath -> IO [Sequence Unknown] -- | Read sequences from a file handle in the 2bit format and -- unmarshal/deserialize into Sequence format. hRead2Bit :: Handle -> IO [Sequence Unknown] -- | This is a struct for containing a set of hashing functions data HashF k HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k -- | calculates the hash at a given offset in the sequence hash :: HashF k -> SeqData -> Offset -> Maybe k -- | calculate all hashes from a sequence, and their indices hashes :: HashF k -> SeqData -> [(k, Offset)] -- | for sorting hashes ksort :: HashF k -> [k] -> [k] -- | Contigous constructs an integer from a contiguous k-word. contigous :: (Integral k) => Int -> HashF k -- | Like contigous, but returns the same hash for a word and its -- reverse complement. rcontig :: (Integral k) => Int -> HashF k -- | Like rcontig, but ignoring monomers (i.e. arbitrarily long -- runs of a single nucleotide are treated the same as a single nucleotide). rcpacked :: (Integral k) => Int -> HashF k class KWords s kwords :: (KWords s) => Int -> s -> [s] entropy :: (Ord str, KWords str) => Int -> str -> Double -- | Multiple alignments. module Bio.Alignment.Multiple -- | Progressive multiple alignment.
Calculate a tree from agglomerative -- clustering, then align at each branch going bottom up. Returns a list -- of columns (rows?). progressive :: (Sequence a -> Sequence a -> (Double, EditList)) -> [Sequence a] -> [String] -- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B -- and B|C. This is central for Coffee evaluation of alignments, -- and T-Coffee construction of alignments. indirect :: EditList -> EditList -> EditList