-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A bioinformatics library -- -- This is a collection of data structures and algorithms useful for -- building bioinformatics-related tools and utilities. -- -- Current list of features includes: a Sequence data type supporting -- protein and nucleotide sequences and conversion between them. As of -- version 0.4, different kinds of sequence have different types. Support -- for quality data, reading and writing Fasta formatted files, reading -- TwoBit and phd formats, and Roche/454 SFF files. Rudimentary (i.e. -- unoptimized) support for doing alignments - including dynamic -- adjustment of scores based on sequence quality. Also Blast output -- parsing. Partly implemented single linkage clustering, and multiple -- alignment. Reading Gene Ontology (GO) annotations (GOA) and -- definitions/hierarchy. -- -- The Darcs repository is at: -- http://malde.org/~ketil/biohaskell/biolib. @package bio @version 0.5 module Bio.GFF3.Escape unEscapeByteString :: (Error e, MonadError e m) => ByteString -> m ByteString escapeByteString :: (Char -> Bool) -> ByteString -> ByteString escapeAllBut :: String -> ByteString -> ByteString escapeAllOf :: String -> ByteString -> ByteString -- | Lazy "many" combinator for Parsec. Courtesy of Tomasz Zielonka. module Bio.Util.Parsex lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a] -- | Implement clustering module Bio.Clustering -- | Data structure for storing hierarchical clusters data Clustered score datum Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum Leaf :: datum -> Clustered score datum -- | Single linkage agglomerative clustering. Cluster elements by slurping -- a sorted list of pairs with score (i.e. triples :-) Keeps a set of -- contained elements at each branch's root, so O(n log n), and requires -- elements to be in Ord. For this to work, the triples must be sorted on -- score. 
Earlier scores in the list will make up the lower nodes, so -- sort descending for similarity, ascending for distance. cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a] instance (Show score, Show datum) => Show (Clustered score datum) -- | This models the PSL format used by e.g. the alignment tool BLAT. It is -- a simple, textual representation of (spliced) alignments, with -- tab-separated fields. -- -- See http://genome.ucsc.edu/FAQ/FAQformat#format2 for -- details. module Bio.Alignment.PSL data PSL PSL :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> ByteString -> ByteString -> Int -> Int -> Int -> ByteString -> Int -> Int -> Int -> Int -> [Int] -> [Int] -> [Int] -> PSL match :: PSL -> Int mismatch :: PSL -> Int repmatch :: PSL -> Int ncount :: PSL -> Int qgapcount :: PSL -> Int qgaplength :: PSL -> Int tgapcount :: PSL -> Int tgaplength :: PSL -> Int strand :: PSL -> ByteString qname :: PSL -> ByteString qsize :: PSL -> Int qstart :: PSL -> Int qend :: PSL -> Int tname :: PSL -> ByteString tsize :: PSL -> Int tstart :: PSL -> Int tend :: PSL -> Int blockcount :: PSL -> Int blocksizes :: PSL -> [Int] qstarts :: PSL -> [Int] tstarts :: PSL -> [Int] readPSL :: FilePath -> IO [PSL] writePSL :: FilePath -> [PSL] -> IO () parsePSL :: ByteString -> [PSL] unparsePSL :: [PSL] -> ByteString pslHeader :: ByteString instance Eq PSL instance Show PSL -- | Utility module, with various useful stuff. module Bio.Util lines :: ByteString -> [ByteString] -- | Break a list of bytestrings on a predicate. splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]] -- | Output (to stderr) progress while evaluating a lazy list. Useful for -- generating output while (conceptually, at least) in pure code countIO :: String -> String -> Int -> [a] -> IO [a] -- | A lazier version of Control.Monad.sequence in -- Control.Monad, needed by countIO above. 
sequence' :: [IO a] -> IO [a] -- | Workaround, the current Data.ByteString.Lazy.Char8 contains a -- bug in Data.ByteString.Lazy.Char8.lines. mylines :: ByteString -> [ByteString] -- | This module implements a hierarchical data structure for BLAST -- results; there is an alternative flat structure in the -- Bio.Alignment.BlastFlat module. -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions of BLAST, so expect some -- incompatibilities if you're using a different BLAST version. -- -- For parsing BLAST results, the XML format (blastall -m 7) is by far -- the most robust choice, and is implemented in the -- Bio.Alignment.BlastXML module. -- -- The format is straightforward (and non-recursive). For more -- information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.BlastData -- | The sequence id, i.e. the first word of the header field. type SeqId = ByteString -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | A BlastResult is the root of the hierarchy. 
data BlastResult BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult blastprogram :: BlastResult -> !ByteString blastversion :: BlastResult -> !ByteString blastdate :: BlastResult -> !ByteString blastreferences :: BlastResult -> !ByteString database :: BlastResult -> !ByteString dbsequences :: BlastResult -> !Integer dbchars :: BlastResult -> !Integer results :: BlastResult -> [BlastRecord] -- | Each query sequence generates a BlastRecord data BlastRecord BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord query :: BlastRecord -> !SeqId qlength :: BlastRecord -> !Int hits :: BlastRecord -> [BlastHit] -- | Each match between a query and a target sequence (or subject) is a -- BlastHit. data BlastHit BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit subject :: BlastHit -> !SeqId slength :: BlastHit -> !Int matches :: BlastHit -> [BlastMatch] -- | A BlastHit may contain multiple separate matches (typically -- when an indel causes a frameshift that blastx is unable to bridge). data BlastMatch BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch bits :: BlastMatch -> !Double e_val :: BlastMatch -> !Double identity :: BlastMatch -> (Int, Int) q_from :: BlastMatch -> !Int q_to :: BlastMatch -> !Int h_from :: BlastMatch -> !Int h_to :: BlastMatch -> !Int aux :: BlastMatch -> !Aux instance Show BlastMatch instance Show BlastHit instance Show BlastRecord instance Show BlastResult instance Show Aux instance Eq Aux instance Read Strand instance Show Strand instance Eq Strand -- | Parse blast XML output. -- -- If you use a recent version of NCBI BLAST and specify XML output -- (blastall -m 7), this module should be able to parse the result into a -- hierarchical BlastResult structure. -- -- While the process may consume a bit of memory, the parsing is lazy, -- and file sizes of several gigabytes can be parsed (see e.g. 
the xml2x -- tool for an example). To parse XML, we use Text.HTML.TagSoup. module Bio.Alignment.BlastXML -- | Parse BLAST results in XML format readXML :: FilePath -> IO [BlastResult] -- | This module implements a "flattened" data structure for Blast hits, as -- opposed to the hierarchical structure in -- Bio.Alignment.BlastData. -- -- The flat data type is useful in many cases where it is more natural to -- see the result as a set of rows (e.g. for insertion in a database). -- -- It would probably be more (memory-) efficient to go the other way -- (i.e. from flat to hierarchical), as passing the current, partially -- built BlastFlat object down the stream of results and stamping -- out a stream of completed ones. (See -- Bio.Alignment.BlastXML.breaks for this week's most cumbersome -- use of parallelism to avoid the memory issue.) module Bio.Alignment.BlastFlat -- | The BlastFlat data structure contains information about a single match data BlastFlat BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat query :: BlastFlat -> !SeqId qlength :: BlastFlat -> !Int subject :: BlastFlat -> !SeqId slength :: BlastFlat -> !Int bits :: BlastFlat -> !Double e_val :: BlastFlat -> !Double identity :: BlastFlat -> (Int, Int) q_from :: BlastFlat -> !Int q_to :: BlastFlat -> !Int h_from :: BlastFlat -> !Int h_to :: BlastFlat -> !Int aux :: BlastFlat -> !Aux readXML :: FilePath -> IO [BlastFlat] -- | Convert BlastRecords into BlastFlats (representing a depth-first -- traversal of the BlastRecord structure.) 
flatten :: [BlastRecord] -> [BlastFlat] -- | Each query sequence generates a BlastRecord data BlastRecord blastprogram :: BlastResult -> ByteString blastversion :: BlastResult -> ByteString blastdate :: BlastResult -> ByteString blastreferences :: BlastResult -> ByteString database :: BlastResult -> ByteString dbsequences :: BlastResult -> Integer dbchars :: BlastResult -> Integer results :: BlastResult -> [BlastRecord] -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | This module implements a parser for BLAST results. -- -- This module is DEPRECATED. It is *very* recommended that you run blast -- with XML output instead, and use the BlastXML module to parse it. -- Don't say I didn't warn you! -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions, so expect some -- incompatibilities if you're using a different BLAST version. -- -- The format is straightforward (and non-recursive), and this -- implementation uses a simple line-based, hierarchical parser. -- -- For more information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.Blast parse :: ByteString -> BlastResult module Bio.Sequence.SFF_name -- | Read names encode various information, as per this struct. 
data ReadName ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName date :: ReadName -> (Int, Int, Int) time :: ReadName -> (Int, Int, Int) region :: ReadName -> Int x_loc :: ReadName -> Int y_loc :: ReadName -> Int decodeReadName :: ByteString -> Maybe ReadName decodeLocation :: ByteString -> Maybe (Int, Int) decodeDate :: ByteString -> Maybe [Int] encodeReadName :: ReadName -> ByteString encodeLocation :: Int -> Int -> ByteString encodeRegion :: Int -> ByteString encodeDate :: (Int, Int, Int) -> (Int, Int, Int) -> ByteString divMods :: Int -> [Int] -> [Int] decode36 :: ByteString -> Maybe Int decCh :: Char -> Maybe Int encode36 :: Int -> ByteString b36 :: UArray Int Char instance Show ReadName -- | GeneOntology - parse and index Gene Ontology Annotations In -- particular, the file 'gene_association.goa_uniprot' that contains -- links between GO terms and UniProt accessions. -- --
-- -- magic :: Word32 -- 0x2e736666, i.e. the string ".sff" -- version :: Word32 -- 0x00000001 --data CommonHeader CommonHeader :: Int64 -> Int32 -> Int32 -> Int16 -> Int16 -> Word8 -> ByteString -> ByteString -> CommonHeader -- | Points to a text(?) section index_offset :: CommonHeader -> Int64 index_length :: CommonHeader -> Int32 num_reads :: CommonHeader -> Int32 key_length :: CommonHeader -> Int16 flow_length :: CommonHeader -> Int16 flowgram_fmt :: CommonHeader -> Word8 flow :: CommonHeader -> ByteString key :: CommonHeader -> ByteString -- | Each Read has a fixed read header, containing various information. data ReadHeader ReadHeader :: Int16 -> Int32 -> Int16 -> Int16 -> Int16 -> Int16 -> ByteString -> ReadHeader name_length :: ReadHeader -> Int16 num_bases :: ReadHeader -> Int32 clip_qual_left :: ReadHeader -> Int16 clip_qual_right :: ReadHeader -> Int16 clip_adapter_left :: ReadHeader -> Int16 clip_adapter_right :: ReadHeader -> Int16 read_name :: ReadHeader -> ByteString -- | This contains the actual flowgram for a single read. data ReadBlock ReadBlock :: !ReadHeader -> !ByteString -> !ByteString -> !SeqData -> !QualData -> ReadBlock read_header :: ReadBlock -> !ReadHeader flow_data :: ReadBlock -> !ByteString flow_index :: ReadBlock -> !ByteString bases :: ReadBlock -> !SeqData quality :: ReadBlock -> !QualData -- | Read an SFF file. readSFF :: FilePath -> IO SFF -- | Write an SFF to the specified file name writeSFF :: FilePath -> SFF -> IO () -- | Write an SFF to the specified file name, but go back and update -- the read count. Useful if you want to output a lazy stream of -- ReadBlocks. Returns the number of reads written. writeSFF' :: FilePath -> SFF -> IO Int -- | Read an SFF file, but be resilient against errors. recoverSFF :: FilePath -> IO SFF -- | Extract the sequences from an SFF data structure. sffToSequence :: SFF -> [Sequence Nuc] -- | Extract the sequence information from a ReadBlock. 
rbToSequence :: ReadBlock -> Sequence Nuc -- | Trim a read according to clipping information trim :: ReadBlock -> ReadBlock -- | Trim a read to specific sequence position, inclusive bounds. The -- current implementation has the unintended side effect of always -- trimming the flowgram down to a basecalled position. Note that you -- can't (easily) write trimmed ReadBlocks to a file, since they -- need to have the same number of flows as given in the -- CommonHeader. trimFromTo :: Integral i => i -> i -> ReadBlock -> ReadBlock -- | Extract the read without the initial (TCAG) key. trimKey :: CommonHeader -> Sequence Nuc -> Maybe (Sequence Nuc) -- | Convert a sequence position to the corresponding flow position baseToFlowPos :: Integral i => ReadBlock -> i -> Int -- | Convert a flow position to the corresponding sequence position flowToBasePos :: Integral i => ReadBlock -> i -> Int -- | Trim a ReadBlock limiting the number of flows. If writing to an -- SFF file, make sure you update the CommonHeader accordingly. -- See examples/Flx.hs for how to use this. trimFlows :: Integral i => i -> ReadBlock -> ReadBlock -- | test serialization by output'ing the header and first two reads in an -- SFF, and the same after a decode + encode cycle. test :: FilePath -> IO () -- | Convert a file by decoding it and re-encoding it. This will lose the -- index (which isn't really necessary) convert :: FilePath -> IO () -- | Helper function to access the flowgram flowgram :: ReadBlock -> [Flow] -- | Extract the sequence with masked bases in lower case masked_bases :: ReadBlock -> SeqData -- | Extract the index as absolute coordinates, not relative. cumulative_index :: ReadBlock -> [Int] -- | Pack a list of flows into the corresponding binary structure (the -- flow_data field) packFlows :: [Flow] -> ByteString -- | Unpack the flow_data field into a list of flow values unpackFlows :: ByteString -> [Flow] -- | The type of flowgram value type Flow = Int16 -- | Basic type for quality data. 
Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 type Index = Word8 -- | The basic data type used in Sequences type SeqData = ByteString -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Read names encode various information, as per this struct. data ReadName ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName date :: ReadName -> (Int, Int, Int) time :: ReadName -> (Int, Int, Int) region :: ReadName -> Int x_loc :: ReadName -> Int y_loc :: ReadName -> Int decodeReadName :: ByteString -> Maybe ReadName encodeReadName :: ReadName -> ByteString instance Binary PartialReadHeader instance Binary RSFF instance Show ReadBlock instance Binary ReadHeader instance Show ReadHeader instance Binary CommonHeader instance Show CommonHeader instance Binary SFF instance Show SFF instance Binary RBI -- | This implements a number of filters used in the Titanium pipeline, -- based on published documentation. module Bio.Sequence.SFF_filters -- | DiscardFilters determine whether a read is to be retained or discarded type DiscardFilter = ReadBlock -> Bool -- | This filter discards empty sequences. discard_empty :: DiscardFilter -- | Discard sequences that don't have the given key tag (typically TCAG) -- at the start of the read. discard_key :: String -> DiscardFilter -- |
-- AS contigs reads -- CO contig_name bases reads segments compl (CAP3: segments=0) -- sequence -- BQ base_qualities -- AF read1 compl padded_start_consensus (negatives meaning?) -- AF read2 .. -- BS segments -- RD read1 bases info_items info_tags (latter two set to 0 by CAP3) -- sequence -- QA read1 qual_start qual_end align_start align_end -- DS (phred header? left empty by CAP3) -- RD read2 ... ---- -- As far as I know, this is only used for nucleotide sequences. module Bio.Alignment.ACE -- | Reading an ACE file. readACE :: FilePath -> IO [[Assembly]] writeACE :: FilePath -> [Assembly] -> IO () data Assembly Asm :: (Sequence Nuc, Gaps) -> Alignment Nuc -> Assembly contig :: Assembly -> (Sequence Nuc, Gaps) fragments :: Assembly -> Alignment Nuc -- | Test parser p on a list of ACE elements ptest :: Show a => String -> AceParser a -> [ACE] -> IO () reads :: Assembly -> Alignment Nuc instance Eq ACE instance Show Assembly instance Show ACE -- | Data types for functorially lifting sequence positions and locations -- onto named sequences. These are useful for taking functions that work -- with sequence positions and locations and associating them specific, -- named sequences. module Bio.Location.OnSeq -- | Sequence name, as in a Sequence type SeqName = SeqData -- | Data type for an object associated with a specific, named sequence data OnSeq a OnSeq :: !SeqName -> !a -> OnSeq a onSeqName :: OnSeq a -> !SeqName onSeqObj :: OnSeq a -> !a -- | Looks up a sequence by name and applies a function to it withSeqData :: Monad m => (SeqData -> a -> m b) -> (SeqName -> m SeqData) -> OnSeq a -> m b -- | Tests a predicate when two objects are on the same sequence, returning -- False if they are on different sequences. andSameSeq :: (a -> b -> Bool) -> OnSeq a -> OnSeq b -> Bool -- | Performs an action when two objects are on the same sequence and -- produces an error otherwise. 
onSameSeq :: (Error e, MonadError e m) => (a -> b -> m c) -> OnSeq a -> OnSeq b -> m c -- | Data type for a collection of objects indexed by sequence name type OnSeqs a = Map SeqName a -- | Lifts a function on an underlying object to look up the sequence name -- in a name-indexed collection. perSeq :: Monoid b => (a -> b -> c) -> OnSeq a -> OnSeqs b -> c -- | Lifts a function that updates an underlying object to look up the -- named sequence and update a named-index collection. perSeqUpdate :: Monoid b => (a -> b -> b) -> OnSeq a -> OnSeqs b -> OnSeqs b -- | Lifts a function on underlying objects to look up a sequence in a -- name-indexed collection withNameAndSeq :: Monad m => (SeqName -> a -> b -> m c) -> OnSeq a -> OnSeqs b -> m c instance Eq a => Eq (OnSeq a) instance Ord a => Ord (OnSeq a) instance Show a => Show (OnSeq a) instance Functor OnSeq -- | Utilities for manipulating nucleotide sequences and locations on -- nucleotide sequences that occur on a forward or a reverse-complement -- strand. module Bio.Location.Strand -- | Sequence strand data Strand Fwd :: Strand RevCompl :: Strand -- | A nucleotide sequence or location on a nucleotide sequence that lies -- on a specific strand and has an orientation. class Stranded s revCompl :: Stranded s => s -> s -- | Convert the orientation of a Stranded thing based on a -- specified Strand stranded :: Stranded s => Strand -> s -> s instance Eq Strand instance Ord Strand instance Show Strand instance Read Strand instance Bounded Strand instance Enum Strand instance Ix Strand instance Stranded ByteString instance Stranded Char instance Stranded Strand -- | Data type for a sequence position. -- -- Zero-based Offset / Int64 indices are used throughout, -- to facilitate direct use of indexing functions on SeqData. 
module Bio.Location.Position -- | Position in a sequence data Pos Pos :: !Offset -> !Strand -> Pos -- | 0-based index of the position offset :: Pos -> !Offset -- | Strand of the position strand :: Pos -> !Strand -- | Returns a position resulting from sliding the original position along -- the sequence by a specified offset. A positive offset will move the -- position away from the 5' end of the forward strand of the sequence -- regardless of the strand of the position itself. Thus, -- --
-- slide (revCompl pos) off == revCompl (slide pos off) --slide :: Pos -> Offset -> Pos -- | Extract the nucleotide at a specific sequence position. If the -- position lies outside the bounds of the sequence, an error results. seqNt :: (Error e, MonadError e m) => SeqData -> Pos -> m Char -- | As seqNt, extract the nucleotide at a specific sequence -- position, but return N when the position lies outside the -- bounds of the sequence. -- --
-- seqNtPadded sequ pos == (either (const 'N') id . seqNt sequ) pos --seqNtPadded :: SeqData -> Pos -> Char -- | Display a human-friendly, zero-based representation of a sequence -- position. display :: Pos -> String instance Eq Pos instance Ord Pos instance Show Pos instance Read Pos instance Ix Pos instance Stranded Pos -- | Data type for a sequence location consisting of a contiguous range of -- positions on the sequence. -- -- Throughout, sequence position refers to a Pos which -- includes a strand. An index into a sequence is referred to as an -- offset, and is generally of type Offset. module Bio.Location.ContigLocation -- | Contiguous sequence location defined by a span of sequence positions, -- lying on a specific strand of the sequence. data ContigLoc ContigLoc :: !Offset -> !Offset -> !Strand -> ContigLoc -- | The offset of the 5' end of the location, as a 0-based index offset5 :: ContigLoc -> !Offset -- | The length of the location length :: ContigLoc -> !Offset -- | The strand of the location strand :: ContigLoc -> !Strand -- | Create a sequence location lying between 0-based starting and ending -- offsets. When start < end, the location will be on the forward -- strand, otherwise it will be on the reverse complement strand. fromStartEnd :: Offset -> Offset -> ContigLoc -- | Create a sequence location from the sequence position of the start of -- the location and the length of the location. The strand of the -- location, and the direction it extends from the starting position, are -- determined by the strand of the starting position. fromPosLen :: Pos -> Offset -> ContigLoc -- | The bounds of a sequence location. This is a pair consisting of the -- lowest and highest sequence offsets covered by the region. The bounds -- ignore the strand of the sequence location, and the first element of -- the pair will always be lower than the second. bounds :: ContigLoc -> (Offset, Offset) -- | Sequence position of the start of the location. 
This is the 5' end on -- the location strand, which will have a higher offset than -- endPos if the location is on the RevCompl strand. startPos :: ContigLoc -> Pos -- | Sequence position of the end of the location, as described in -- startPos. endPos :: ContigLoc -> Pos -- | Given a sequence position and a sequence location relative to the same -- sequence, compute a new position representing the original position -- relative to the subsequence defined by the location. If the sequence -- position lies outside of the sequence location, Nothing is -- returned; thus, the offset of the new position will always be in the -- range [0, length cloc - 1]. posInto :: Pos -> ContigLoc -> Maybe Pos -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies outside -- the location, Nothing is returned. -- -- This function inverts posInto when the sequence position -- actually lies within the location. posOutof :: Pos -> ContigLoc -> Maybe Pos -- | Returns True when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos -> ContigLoc -> Bool -- | Returns True when two sequence locations overlap at any -- position. overlaps :: ContigLoc -> ContigLoc -> Bool -- | Extract the nucleotide SeqData for the sequence location. If -- any part of the location lies outside the bounds of the sequence, an -- error results. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData -- | As seqData, extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds of -- the sequence are returned as N rather than producing an -- error. seqDataPadded :: SeqData -> ContigLoc -> SeqData -- | Returns a location resulting from sliding the original location along -- the sequence by a specified offset. 
A positive offset will move the -- location away from the 5' end of the forward strand of the sequence -- regardless of the strand of the location itself. Thus, -- --
-- slide (revCompl cloc) off == revCompl (slide cloc off) --slide :: Offset -> ContigLoc -> ContigLoc -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (5\' extension, /3' -- extension/). The 5' extension is applied to the 5' end of the location -- on the location strand; if the location is on the RevCompl -- strand, the 5' end will have a higher offset than the 3' end and this -- offset will increase by the amount of the 5' extension. Similarly, the -- 3' extension is applied to the 3' end of the location. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc -- | Display a human-friendly, zero-based representation of a sequence -- location. display :: ContigLoc -> String instance Eq ContigLoc instance Ord ContigLoc instance Show ContigLoc instance Stranded ContigLoc -- | Data type for a more general sequence location consisting of -- potentially disjoint ranges of positions on the sequence. -- -- Throughout, sequence position refers to a Pos which -- includes a strand. An index into a sequence is referred to as an -- offset, and is generally of type Offset. module Bio.Location.Location -- | General (disjoint) sequence region consisting of a concatenated set of -- contiguous regions (see ContigLoc). newtype Loc Loc :: [ContigLoc] -> Loc -- | The bounds of a sequence location. This is a pair consisting of the -- lowest and highest sequence offsets covered by the region. The bounds -- ignore the strand of the sequence location, and the first element of -- the pair will always be lower than the second. Even if the positions -- in the location do not run monotonically through the location, the -- overall lowest and highest sequence offsets are returned. bounds :: Loc -> (Offset, Offset) -- | Returns the length of the region length :: Loc -> Offset -- | Sequence position of the start of the location. 
This is the 5' end on -- the location strand, which will have a higher offset than -- endPos if the location is on the RevCompl strand. startPos :: Loc -> Pos -- | Sequence position of the end of the location, as described in -- startPos. endPos :: Loc -> Pos -- | Given a sequence position and a sequence location relative to the same -- sequence, compute a new position representing the original position -- relative to the subsequence defined by the location. If the sequence -- position lies outside of the sequence location, Nothing is -- returned; thus, the offset of the new position will always be in the -- range [0, length cloc - 1]. -- -- When the sequence positions in the location are not monotonic, there -- may be multiple possible posInto solutions. That is, if the same outer -- sequence position is covered by two different contiguous blocks of the -- location, then it would have two possible sequence positions relative -- to the location. In this case, the position 5'-most in the location -- orientation is returned. posInto :: Pos -> Loc -> Maybe Pos -- | Given a sequence location and a sequence position within that -- location, compute a new position representing the original position -- relative to the outer sequence. If the sequence position lies outside -- the location, Nothing is returned. -- -- This function inverts posInto when the sequence position -- actually lies within the location. Due to the -- possibility of redundant location-relative positions for a given -- absolute position, posInto does not necessarily invert -- posOutof. posOutof :: Pos -> Loc -> Maybe Pos -- | Returns True when a sequence position lies within a sequence -- location on the same sequence, and occupies the same strand. isWithin :: Pos -> Loc -> Bool -- | Returns True when two sequence locations overlap at any -- position. overlaps :: Loc -> Loc -> Bool -- | Extract the nucleotide SeqData for the sequence location. 
If -- any part of the location lies outside the bounds of the sequence, an -- error results. seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData -- | As seqData, extract the nucleotide subsequence for the -- location. Any positions in the location lying outside the bounds of -- the sequence are returned as N rather than producing an -- error. seqDataPadded :: SeqData -> Loc -> SeqData -- | Returns a sequence location produced by extending the original -- location on each end, based on a pair of (5\' extension, /3' -- extension/). These add contiguous positions to the 5' and 3' ends of -- the original location. The 5' extension is applied to the 5' end of -- the location on the location strand; if the location is on the -- RevCompl strand, the 5' end will have a higher offset than the -- 3' end and this offset will increase by the amount of the 5' -- extension. Similarly, the 3' extension is applied to the 3' end of the -- location. extend :: (Offset, Offset) -> Loc -> Loc -- | Display a human-friendly, zero-based representation of a sequence -- location. display :: Loc -> String instance Eq Loc instance Ord Loc instance Show Loc instance Stranded Loc -- | Data types for sequence locations and sequence positions associated -- with specific, named sequences. module Bio.Location.SeqLocation -- | A position on a named sequence type SeqPos = OnSeq Pos -- | A location consisting of a contiguous span of positions on a named -- sequence. type ContigSeqLoc = OnSeq ContigLoc -- | Test whether a sequence position lies within a sequence location. This -- requires that the position lie within the location as per -- isWithin and have the same sequence name. withinContigSeqLoc :: SeqPos -> ContigSeqLoc -> Bool -- | A general location, consisting of spans of sequence positions on a -- specific, named sequence. type SeqLoc = OnSeq Loc -- | Test whether a sequence position lies within a sequence location. 
This -- requires that the position lie within the location as per -- isWithin and have the same sequence name. isWithin :: SeqPos -> SeqLoc -> Bool -- | Test whether two sequence locations overlap in any position. This -- requires that the locations overlap as per overlaps and have -- the same sequence name. overlaps :: SeqLoc -> SeqLoc -> Bool -- | Extract the subsequence specified by a sequence location from a -- sequence database. The sequence name is used to retrieve the full -- sequence and the subsequence is extracted as by seqData seqData :: (Error e, MonadError e m) => (SeqName -> m SeqData) -> SeqLoc -> m SeqData -- | Display a human-friendly representation of a SeqPos displaySeqPos :: SeqPos -> String -- | Display a human-friendly representation of a ContigSeqLoc displayContigSeqLoc :: ContigSeqLoc -> String -- | Display a human-friendly representation of a SeqLoc display :: SeqLoc -> String -- | This module provides a data type to represent an alignment produced by -- the Bowtie short-read alignment tool (see -- http://bowtie-bio.sourceforge.net/index.shtml). -- -- The simple accessors recapitulate the details of the Bowtie alignment -- output. The position of the alignment is given by the "0-based offset -- into the reference sequence where leftmost character of the alignment -- occurs". Thus, for forward-strand alignments this is the 5' end of the -- query sequence while for reverse-complement alignments this is the 3' -- end of the query sequence. Similarly, the query sequence and query -- quality are shown in reference forward strand orientation, and thus -- may be reverse complemented. 
module Bio.Alignment.Bowtie data Align Align :: !SeqName -> !Strand -> !SeqName -> !Offset -> !SeqData -> !QualData -> ![Mismatch] -> Align -- | Name of the query sequence name :: Align -> !SeqName -- | Strand of the alignment on the reference sequence strand :: Align -> !Strand -- | Name of the reference sequence refname :: Align -> !SeqName -- | Zero-based offset of the left-most aligned position in the reference leftoffset :: Align -> !Offset -- | Query sequence, in the reference forward strand orientation sequ :: Align -> !SeqData -- | Query quality, in the reference forward strand orientation qual :: Align -> !QualData -- | Mismatches mismatches :: Align -> ![Mismatch] -- | Representation of a single mismatch in a bowtie alignment data Mismatch Mismatch :: !Offset -> !Char -> !Char -> Mismatch -- | Offset of the mismatch site from the 5' end of the query mmoffset :: Mismatch -> !Offset -- | Reference nucleotide refbase :: Mismatch -> !Char -- | Query nucleotide readbase :: Mismatch -> !Char -- | Returns the length of the query sequence length :: Align -> Offset -- | Returns the number of mismatches in the alignment nmismatch :: Align -> Int -- | Query sequence as given in the query file querySequ :: Align -> SeqData -- | Query quality as given in the query file queryQual :: Align -> QualData -- | As refCSeqLoc but without the reference sequence name. refCLoc :: Align -> ContigLoc -- | Returns the sequence location covered by the query in the alignment. -- This will be a sequence location on the reference sequence and may run -- on the forward or the reverse complement strand. refCSeqLoc :: Align -> ContigSeqLoc -- | Returns the sequence location covered by the query, as -- refCSeqLoc, as a SeqLoc location. refSeqLoc :: Align -> SeqLoc -- | Returns the sequence position of the start of the query sequence -- alignment. 
This will include the strand of the alignment and will not -- be the same as the position computed from leftoffset when the -- alignment is on the reverse complement strand. refSeqPos :: Align -> SeqPos -- | Sequence position of a mismatch on the reference sequence. mismatchSeqPos :: Align -> Mismatch -> SeqPos -- | Parses a line of Bowtie output to produce an Align parse :: ByteString -> Either String Align -- | Returns true when two alignments were derived from the same sequencing -- read. As Bowtie writes alignments of query sequences in their order in -- the query file, all alignments of a given read are grouped together -- and the lists of all alignments for each read can be gathered with -- --
-- groupBy sameRead -- sameRead :: Align -> Align -> Bool instance Read Mismatch instance Show Mismatch instance Eq Mismatch instance Ord Mismatch instance Read Align instance Show Align instance Eq Align instance Ord Align module Bio.Alignment.Soap -- | Alignment output from SOAP data SoapAlign SA :: !SeqName -> !SeqData -> !QualData -> !Int -> !Char -> !Offset -> !Strand -> !SeqName -> !Offset -> !Int -> ![SoapAlignMismatch] -> SoapAlign name :: SoapAlign -> !SeqName -- | Reference strand orientation sequence sequ :: SoapAlign -> !SeqData -- | Reference strand orientation quality data qual :: SoapAlign -> !QualData nhit :: SoapAlign -> !Int pairend :: SoapAlign -> !Char length :: SoapAlign -> !Offset strand :: SoapAlign -> !Strand refname :: SoapAlign -> !SeqName -- | 1-based index, as output by SOAP, of reference strand 5' end refstart :: SoapAlign -> !Offset nmismatch :: SoapAlign -> !Int mismatches :: SoapAlign -> ![SoapAlignMismatch] data SoapAlignMismatch SAM :: !Char -> !Char -> !Offset -> !Qual -> SoapAlignMismatch -- | Read nt in reference strand orientation readnt :: SoapAlignMismatch -> !Char -- | Reference nt in reference strand orientation refnt :: SoapAlignMismatch -> !Char -- | Offset from reference strand 5' end in reference strand orientation offset :: SoapAlignMismatch -> !Offset -- | Quality score of read nt qualnt :: SoapAlignMismatch -> !Qual refSeqPos :: SoapAlign -> SeqPos refCSeqLoc :: SoapAlign -> ContigSeqLoc refSeqLoc :: SoapAlign -> SeqLoc mismatchSeqPos :: SoapAlign -> SoapAlignMismatch -> SeqPos parse :: (Error e, MonadError e m) => ByteString -> m SoapAlign unparse :: SoapAlign -> ByteString parseMismatch :: (Error e, MonadError e m) => ByteString -> m SoapAlignMismatch unparseMismatch :: SoapAlignMismatch -> ByteString group :: [SoapAlign] -> [[SoapAlign]] instance Read SoapAlignMismatch instance Show SoapAlignMismatch instance Eq SoapAlignMismatch instance Ord SoapAlignMismatch instance Read SoapAlign instance Show SoapAlign
instance Eq SoapAlign instance Ord SoapAlign -- | Model the BED format, according to the spec at -- http://genome.ucsc.edu/FAQ/FAQformat#format1 module Bio.Alignment.BED -- | The BED data type. Note that the specification allows a variable number -- of fields, with only the first three required. This definition -- requires all fields to be present. data BED BED :: ByteString -> Offset -> Offset -> ByteString -> Int -> Dir -> Offset -> Offset -> (Word8, Word8, Word8) -> [(Offset, Offset)] -> BED chrom :: BED -> ByteString chromStart :: BED -> Offset chromEnd :: BED -> Offset name :: BED -> ByteString -- | Range 0..1000 score :: BED -> Int strand :: BED -> Dir thickStart :: BED -> Offset thickEnd :: BED -> Offset -- | Available BED files appear to not support this format. RGB is -- therefore ignored (read and written as '0') itemRGB :: BED -> (Word8, Word8, Word8) -- | Lists of length blockCount, blockStarts are relative to chromStart blockSizeStart :: BED -> [(Offset, Offset)] -- | Yet another direction data structure. data Dir Fwd :: Dir Rev :: Dir readBED :: FilePath -> IO [BED] writeBED :: FilePath -> [BED] -> IO () instance Eq Dir instance Show BED instance Read Dir instance Show Dir -- | Efficient lookup of sequence positions and locations in a large map of -- target locations. For example, target locations might represent a -- collection of genes annotated on a chromosome. The LocMap would -- efficiently find which gene(s) overlapped a sequence position on that -- chromosome. -- -- Target locations are assigned to one or more zones based on -- bounds. Query locations are then tested only against the target -- locations in the relevant zones. module Bio.Location.LocMap -- | Data structure allowing efficient lookup of target sequence locations -- that overlap a query location. Target locations can be paired with an -- arbitrary object. data LocMap a -- | Create a LocMap from an association list of target locations.
fromList :: Offset -> [(Loc, a)] -> LocMap a -- | Find the (possibly empty) list of target locations and associated -- objects that contain a sequence position, in the sense of -- isWithin lookupWithin :: Pos -> LocMap a -> [(Loc, a)] -- | Find the (possibly empty) list of target locations and associated -- objects that overlap a sequence location, in the sense of -- overlaps lookupOverlaps :: Loc -> LocMap a -> [(Loc, a)] -- | Remove a target location and object association from the map, if it is -- present. If it is present multiple times, only the first occurrence -- will be deleted. delete :: Eq a => (Loc, a) -> LocMap a -> LocMap a -- | Generalized version of delete that removes the first target -- location / object association that satisfies a predicate function. deleteBy :: ((Loc, a) -> Bool) -> LocMap a -> LocMap a -- | Insert a new target association into a target location map. insert :: Loc -> a -> LocMap a -> LocMap a checkInvariants :: LocMap a -> [String] instance Monoid (LocMap a) -- | Efficient lookup of query positions in a collection of target sequence -- locations where positions and locations are associated with specific -- sequence names. This is an extension of LocMap to use -- locations and positions on named sequences as in SeqLocation. module Bio.Location.SeqLocMap -- | A data structure for efficiently finding target sequence locations -- (SeqLoc.Loc) that overlap query positions or locations. Each -- target location can be associated with an arbitrary additional value -- in the lookup map. type SeqLocMap a = OnSeqs (LocMap a) -- | Empty lookup map. empty :: SeqLocMap a -- | Creates a SeqLocMap from a list of target locations and their -- associated objects fromList :: [(SeqLoc, a)] -> SeqLocMap a -- | Inserts a new target location and associated object into the location -- lookup map. 
insert :: SeqLoc -> a -> SeqLocMap a -> SeqLocMap a -- | Find the (possibly empty) list of target locations and associated -- objects that contain a sequence position, in the sense of -- Loc.isWithin. lookupWithin :: SeqPos -> SeqLocMap a -> [(SeqLoc, a)] -- | Find the (possibly empty) list of target locations and associated -- objects that overlap a sequence location, in the sense of -- Loc.overlaps. lookupOverlaps :: SeqLoc -> SeqLocMap a -> [(SeqLoc, a)] module Bio.GFF3.Feature data GFFAttr GFFAttr :: !ByteString -> ![ByteString] -> GFFAttr attrTag :: GFFAttr -> !ByteString attrValues :: GFFAttr -> ![ByteString] data Feature Feature :: !ByteString -> !ByteString -> !ByteString -> !Offset -> !Offset -> !Maybe Double -> !Maybe Strand -> !Maybe Offset -> ![GFFAttr] -> Feature seqid :: Feature -> !ByteString source :: Feature -> !ByteString ftype :: Feature -> !ByteString start :: Feature -> !Offset end :: Feature -> !Offset score :: Feature -> !Maybe Double strand :: Feature -> !Maybe Strand phase :: Feature -> !Maybe Offset attributes :: Feature -> ![GFFAttr] length :: Feature -> Offset parse :: (Error e, MonadError e m) => ByteString -> m Feature unparse :: Feature -> ByteString parseWithFasta :: (Error e, MonadError e m) => ByteString -> m ([Feature], [ByteString]) attrByTag :: ByteString -> Feature -> [ByteString] ids :: Feature -> [ByteString] parentIds :: Feature -> [ByteString] contigLoc :: Feature -> ContigLoc loc :: Feature -> Loc seqLoc :: Feature -> SeqLoc name :: (Error e, MonadError e m) => Feature -> m SeqName instance Eq Feature instance Ord Feature instance Show Feature instance Eq GFFAttr instance Ord GFFAttr instance Show GFFAttr module Bio.GFF3.FeatureHier data FeatureHier features :: FeatureHier -> (Set Feature) lookupId :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m Feature lookupIdChildren :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m [Feature] fromList :: (Error e, MonadError e m) => [Feature] -> m FeatureHier 
insert :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier delete :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier parents :: FeatureHier -> Feature -> [Feature] children :: FeatureHier -> Feature -> [Feature] parentsM :: MonadReader FeatureHier m => Feature -> m [Feature] childrenM :: MonadReader FeatureHier m => Feature -> m [Feature] checkInvariants :: FeatureHier -> [String] instance Show FeatureHier module Bio.GFF3.FeatureHierSequences data FeatureHierSequences features :: FeatureHierSequences -> Set Feature sequences :: FeatureHierSequences -> [Sequence a] fromLists :: (Error e, MonadError e m) => [Feature] -> [Sequence a] -> m FeatureHierSequences parse :: (Error e, MonadError e m) => ByteString -> m FeatureHierSequences lookupId :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m Feature parents :: FeatureHierSequences -> Feature -> [Feature] children :: FeatureHierSequences -> Feature -> [Feature] seqData :: (Error e, MonadError e m) => FeatureHierSequences -> SeqLoc -> m SeqData getSequence :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m SeqData featureSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) runGFF :: FilePath -> (ErrorT String (Reader FeatureHierSequences) a) -> ErrorT String IO a runGFFIO :: FilePath -> (ErrorT String (ReaderT FeatureHierSequences IO) a) -> ErrorT String IO a asksGFF :: (Error e, MonadError e m, MonadReader FeatureHierSequences m) => (FeatureHierSequences -> a -> m b) -> a -> m b instance Show FeatureHierSequences module Bio.GFF3.SGD chromosomes :: FeatureHierSequences -> [Feature] genes :: FeatureHierSequences -> [Feature] rRNAs :: FeatureHierSequences -> [Feature] sortExons :: (Error e, MonadError e m) => [Feature] -> m [Feature] geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m 
SeqLoc geneCDSes :: FeatureHierSequences -> Feature -> [Feature] noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a) noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc noncodingExons :: FeatureHierSequences -> Feature -> [Feature] namedSLM :: FeatureHierSequences -> SeqLocMap Feature geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SeqLocMap Feature) -- | This is a meta-module importing and re-exporting sequence-related -- stuff. -- -- It encompasses the Bio.Sequence.SeqData, -- Bio.Sequence.Fasta, and Bio.Sequence.TwoBit modules. module Bio.Sequence -- | A sequence consists of a header, the sequence data itself, and -- optional quality data. The type parameter is a phantom type to -- separate nucleotide and amino acid sequences data Sequence t -- | header and actual sequence Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence t data Unknown -- | An offset, index, or length of a SeqData type Offset = Int64 -- | The basic data type used in Sequences type SeqData = ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Return sequence length. seqlength :: Sequence a -> Offset -- | Return sequence label (first word of header) seqlabel :: Sequence a -> SeqData -- | Return full header. seqheader :: Sequence a -> SeqData -- | Return the sequence data. seqdata :: Sequence a -> SeqData -- | Return the quality data, or error if none exist. Use hasqual if in -- doubt. seqqual :: Sequence a -> QualData -- | Read the character at the specified position in the sequence. (!) 
:: Sequence a -> Offset -> Char appendHeader :: Sequence a -> String -> Sequence a -- | Modify the header by appending text, or by replacing all but the -- sequence label (i.e. first word). setHeader :: Sequence a -> String -> Sequence a -- | Convert a String to SeqData fromStr :: String -> SeqData -- | Convert a SeqData to a String toStr :: SeqData -> String -- | Complement a single character. I.e. identify the nucleotide it can -- hybridize with. Note that for multiple nucleotides, you usually want -- the reverse complement (see revcompl for that). compl :: Char -> Char -- | Calculate the reverse complement. This is only relevant for the -- nucleotide alphabet, and it leaves other characters unmodified. revcompl :: Sequence Nuc -> Sequence Nuc -- | Calculate the reverse complement for SeqData only. revcompl' :: SeqData -> SeqData -- | For type tagging sequences (protein sequences use Amino below) data Nuc castToNuc :: Sequence a -> Sequence Nuc data Amino Ala :: Amino Arg :: Amino Asn :: Amino Asp :: Amino Cys :: Amino Gln :: Amino Glu :: Amino Gly :: Amino His :: Amino Ile :: Amino Leu :: Amino Lys :: Amino Met :: Amino Phe :: Amino Pro :: Amino Ser :: Amino Thr :: Amino Tyr :: Amino Trp :: Amino Val :: Amino STP :: Amino Asx :: Amino Glx :: Amino Xle :: Amino Xaa :: Amino -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence Nuc -> Offset -> [Amino] -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData castToAmino :: Sequence a -> Sequence Amino -- | Returns a sequence with all internal storage freshly copied and with -- sequence and quality data present as a single chunk.
-- -- By freshly copying internal storage, defragSeq allows garbage -- collection of the original data source whence the sequence was read; -- otherwise, use of just a short sequence name can cause an entire -- sequence file buffer to be retained. -- -- By compacting sequence data into a single chunk, defragSeq -- avoids linear-time traversal of sequence chunks during random access -- into sequence data. defragSeq :: Sequence t -> Sequence t -- | map over sequences, treating them as a sequence of (char,word8) pairs. -- This will work on sequences without quality, as long as the function -- doesn't try to examine it. The current implementation is not very -- efficient. seqmap :: ((Char, Qual) -> (Char, Qual)) -> Sequence t -> Sequence t -- | Read nucleotide sequences in any format - Fasta, SFF, FastQ, 2bit, -- PHD... Todo: detect Illumina vs Sanger FastQ, transparent compression readNuc :: FilePath -> IO [Sequence Nuc] -- | Read protein sequences in any supported format (i.e. Fasta) readProt :: FilePath -> IO [Sequence Amino] -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence Unknown] -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence Unknown] -- | Write sequences to a FASTA-formatted file. Line length is 60. writeFasta :: FilePath -> [Sequence a] -> IO () -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence a] -> IO () -- | Read quality data for sequences from a file. readQual :: FilePath -> IO [Sequence Unknown] -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence a] -> IO () hWriteQual :: Handle -> [Sequence a] -> IO () -- | Read sequence and associated quality. Will error if the sequences and -- qualities do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence Unknown] -- | Write sequence and quality data simultaneously. This may be more -- laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence a] -> IO () hWriteFastaQual :: Handle -> Handle -> [Sequence a] -> IO () readFastQ :: FilePath -> IO [Sequence Nuc] writeFastQ :: FilePath -> [Sequence Nuc] -> IO () hReadFastQ :: Handle -> IO [Sequence Nuc] hWriteFastQ :: Handle -> [Sequence Nuc] -> IO () readSangerQ :: FilePath -> IO [Sequence Nuc] writeSangerQ :: FilePath -> [Sequence Nuc] -> IO () hReadSangerQ :: Handle -> IO [Sequence Nuc] hWriteSangerQ :: Handle -> [Sequence Nuc] -> IO () readIllumina :: FilePath -> IO [Sequence Nuc] writeIllumina :: FilePath -> [Sequence Nuc] -> IO () hReadIllumina :: Handle -> IO [Sequence Nuc] hWriteIllumina :: Handle -> [Sequence Nuc] -> IO () -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO (Sequence Nuc) -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO (Sequence Nuc) -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: ByteString -> [Sequence Nuc] -- | Read sequences from a file in 2bit format and unmarshall/deserialize -- into Sequence format. read2Bit :: FilePath -> IO [Sequence Nuc] -- | Read sequences from a file handle in the 2bit format and -- unmarshall/deserialize into Sequence format. hRead2Bit :: Handle -> IO [Sequence Nuc] -- | This is a struct for containing a set of hashing functions data HashF k HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k -- | calculates the hash at a given offset in the sequence hash :: HashF k -> SeqData -> Offset -> Maybe k -- | calculate all hashes from a sequence, and their indices hashes :: HashF k -> SeqData -> [(k, Offset)] -- | for sorting hashes ksort :: HashF k -> [k] -> [k] -- | Contigous constructs an int/eger from a contiguous k-word. contigous :: Integral k => Int -> HashF k -- | Like contigous, but returns the same hash for a word and its -- reverse complement.
rcontig :: Integral k => Int -> HashF k -- | Like rcontig, but ignoring monomers (i.e. arbitrarily long -- runs of a single nucleotide are treated the same as a single nucleotide). rcpacked :: Integral k => Int -> HashF k class KWords s kwords :: KWords s => Int -> s -> [s] entropy :: (Ord str, KWords str) => Int -> str -> Double -- | Multiple alignments. module Bio.Alignment.Multiple -- | Progressive multiple alignment. Calculate a tree from agglomerative -- clustering, then align at each branch going bottom up. Returns a list -- of columns (rows?). progressive :: (Sequence a -> Sequence a -> (Double, EditList)) -> [Sequence a] -> [String] -- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B -- and B|C. This is central for Coffee evaluation of alignments, -- and T-Coffee construction of alignments. indirect :: EditList -> EditList -> EditList