-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A bioinformatics library -- -- This is a collection of data structures and algorithms I've found -- useful when building various bioinformatics-related tools and -- utilities. -- -- Current list of features includes: a Sequence data type supporting -- protein and nucleotide sequences and conversion between them, quality -- data, reading and writing Fasta formatted files, reading TwoBit and -- phd formats. Rudimentary support for doing alignments - including -- dynamic adjustment of scores based on sequence quality - and Blast -- output parsing. Partly implemented single linkage clustering, and -- multiple alignment. Reading Gene Ontology (GO) annotations (GOA) and -- definitions/hierarchy. -- -- The Darcs repository is at: -- http://malde.org/~ketil/biohaskell/biolib. @package bio @version 0.3.5 module Bio.GFF3.Escape unEscapeByteString :: (Error e, MonadError e m) => ByteString -> m ByteString escapeByteString :: (Char -> Bool) -> ByteString -> ByteString escapeAllBut :: String -> ByteString -> ByteString escapeAllOf :: String -> ByteString -> ByteString -- | Lazy "many" combinator for Parsec. Courtesy of Tomasz Zielonka. module Bio.Util.Parsex lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a] -- | Utility module, with various useful stuff. module Bio.Util lines :: ByteString -> [ByteString] -- | Break a list of bytestrings on a predicate. splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]] -- | Output (to stderr) progress while evaluating a lazy list. Useful for -- generating output while (conceptually, at least) in pure code countIO :: String -> String -> Int -> [a] -> IO [a] -- | A lazier version of Control.Monad.sequence in Control.Monad, -- needed by countIO above. sequence' :: [IO a] -> IO [a] -- | Workaround, the current Data.ByteString.Lazy.Char8 contains a -- bug in Data.ByteString.Lazy.Char8.lines. 
mylines :: ByteString -> [ByteString] -- | Implement clustering module Bio.Clustering -- | Data structure for storing hierarchical clusters data Clustered score datum Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum Leaf :: datum -> Clustered score datum -- | Single linkage agglomerative clustering. Cluster elements by slurping -- a sorted list of pairs with score (i.e. triples :-) Keeps a set of -- contained elements at each branch's root, so O(n log n), and requires -- elements to be in Ord. For this to work, the triples must be sorted on -- score. Earlier scores in the list will make up the lower nodes, so -- sort descending for similarity, ascending for distance. cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a] instance (Show score, Show datum) => Show (Clustered score datum) -- | This module implements a hierarchical data structure for BLAST -- results, there is an alternative flat structure in the -- Bio.Alignment.BlastFlat module. -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions of BLAST, so expect some -- incompatibilities if you're using a different BLAST version. -- -- For parsing BLAST results, the XML format (blastall -m 7) is by far -- the most robust choice, and is implemented in the -- Bio.Alignment.BlastXML module. -- -- The format is straightforward (and non-recursive). For more -- information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.BlastData -- | The sequence id, i.e. the first word of the header field. type SeqId = ByteString -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. 
data Strand Plus :: Strand Minus :: Strand -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | A BlastResult is the root of the hierarchy. data BlastResult BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult blastprogram :: BlastResult -> !ByteString blastversion :: BlastResult -> !ByteString blastdate :: BlastResult -> !ByteString blastreferences :: BlastResult -> !ByteString database :: BlastResult -> !ByteString dbsequences :: BlastResult -> !Integer dbchars :: BlastResult -> !Integer results :: BlastResult -> [BlastRecord] -- | Each query sequence generates a BlastRecord data BlastRecord BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord query :: BlastRecord -> !SeqId qlength :: BlastRecord -> !Int hits :: BlastRecord -> [BlastHit] -- | Each match between a query and a target sequence (or subject) is a -- BlastHit. data BlastHit BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit subject :: BlastHit -> !SeqId slength :: BlastHit -> !Int matches :: BlastHit -> [BlastMatch] -- | A BlastHit may contain multiple separate matches (typically -- when an indel causes a frameshift that blastx is unable to bridge). 
data BlastMatch BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch bits :: BlastMatch -> !Double e_val :: BlastMatch -> !Double identity :: BlastMatch -> (Int, Int) q_from :: BlastMatch -> !Int q_to :: BlastMatch -> !Int h_from :: BlastMatch -> !Int h_to :: BlastMatch -> !Int aux :: BlastMatch -> !Aux instance Show BlastMatch instance Show BlastHit instance Show BlastRecord instance Show BlastResult instance Show Aux instance Eq Aux instance Read Strand instance Show Strand instance Eq Strand -- | This module implements a parser for BLAST results. -- -- This module is DEPRECATED. It is *very* recommended that you run blast -- with XML output instead, and use the BlastXML module to parse it. -- Don't say I didn't warn you! -- -- BLAST is a tool for searching in (biological) sequences for -- similarity. This library is tested against NCBI-blast version 2.2.14. -- There exist several independent versions, so expect some -- incompatibilities if you're using a different BLAST version. -- -- The format is straightforward (and non-recursive), and this -- implementation uses a simple line-based, hierarchical parser. -- -- For more information on BLAST, check -- http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html module Bio.Alignment.Blast parse :: ByteString -> BlastResult -- | Parse blast XML output. -- -- If you use a recent version of NCBI BLAST and specify XML output -- (blastall -m 7), this module should be able to parse the result into a -- hierarchical BlastResult structure. -- -- While the process may consume a bit of memory, the parsing is lazy, -- and file sizes of several gigabytes can be parsed (see e.g. the xml2x -- tool for an example). To parse XML, we use Text.HTML.TagSoup. 
module Bio.Alignment.BlastXML -- | Parse BLAST results in XML format readXML :: FilePath -> IO [BlastResult] -- | This module implements a "flattened" data structure for Blast hits, as -- opposed to the hierarchical structure in -- Bio.Alignment.BlastData. -- -- The flat data type is useful in many cases where it is more natural to -- see the result as a set of rows (e.g. for insertion in a database). -- -- It would probably be more (memory-) efficient to go the other way -- (i.e. from flat to hierarchical), as passing the current, partially -- built BlastFlat object down the stream of results and stamping -- out a stream of completed ones. (See -- Bio.Alignment.BlastXML.breaks for this week's most cumbersome -- use of parallelism to avoid the memory issue.) module Bio.Alignment.BlastFlat -- | The BlastFlat data structure contains information about a single match data BlastFlat BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat query :: BlastFlat -> !SeqId qlength :: BlastFlat -> !Int subject :: BlastFlat -> !SeqId slength :: BlastFlat -> !Int bits :: BlastFlat -> !Double e_val :: BlastFlat -> !Double identity :: BlastFlat -> (Int, Int) q_from :: BlastFlat -> !Int q_to :: BlastFlat -> !Int h_from :: BlastFlat -> !Int h_to :: BlastFlat -> !Int aux :: BlastFlat -> !Aux readXML :: FilePath -> IO [BlastFlat] -- | Convert BlastRecords into BlastFlats (representing a depth-first -- traversal of the BlastRecord structure.) 
flatten :: [BlastRecord] -> [BlastFlat] -- | Each query sequence generates a BlastRecord data BlastRecord blastprogram :: BlastResult -> ByteString blastversion :: BlastResult -> ByteString blastdate :: BlastResult -> ByteString blastreferences :: BlastResult -> ByteString database :: BlastResult -> ByteString dbsequences :: BlastResult -> Integer dbchars :: BlastResult -> Integer results :: BlastResult -> [BlastRecord] -- | The Aux field in the BLAST output includes match information that -- depends on the BLAST flavor (blastn, blastx, or blastp). This data -- structure captures those variations. data Aux -- | blastn Strands :: !Strand -> !Strand -> Aux -- | blastx Frame :: !Strand -> !Int -> Aux -- | The Strand indicates the direction of the match, i.e. the plain -- sequence or its reverse complement. data Strand Plus :: Strand Minus :: Strand -- | GeneOntology - parse and index Gene Ontology Annotations In -- particular, the file 'gene_association.goa_uniprot' that contains -- links between GO terms and UniProt accessions. -- --
-- AS contigs reads -- CO contig_name bases reads segments compl (CAP3: segments=0) -- sequence -- BQ base_qualities -- AF read1 compl padded_start_consensus (negatives meaning?) -- AF read2 .. -- BS segments -- RD read1 bases info_items info_tags (latter two set to 0 by CAP3) -- sequence -- QA read1 qual_start qual_end align_start align_end -- DS (phred header? left empty by CAP3) -- RD read2 ... --module Bio.Alignment.ACE -- | Reading an ACE file. readACE :: FilePath -> IO [[Assembly]] writeACE :: FilePath -> [Assembly] -> IO () data Assembly Asm :: (Sequence, Gaps) -> Alignment -> Assembly contig :: Assembly -> (Sequence, Gaps) fragments :: Assembly -> Alignment -- | Test parser p on a list of ACE elements ptest :: (Show a) => String -> AceParser a -> [ACE] -> IO () reads :: Assembly -> Alignment instance Eq ACE instance Show Assembly instance Show ACE module Bio.Util.TestBase data Test T :: String -> t -> Test newtype Nucleotide N :: Char -> Nucleotide newtype Quality Q :: Word8 -> Quality fromN :: Nucleotide -> Char fromQ :: Quality -> Word8 -- | For testing, variable lengths newtype EST E :: Sequence -> EST newtype ESTq Eq :: Sequence -> ESTq newtype Protein P :: Sequence -> Protein -- | For benchmarking, fixed lengths newtype EST_short ES :: Sequence -> EST_short newtype EST_long EL :: Sequence -> EST_long newtype EST_set ESet :: [Sequence] -> EST_set -- | Take time (CPU and wall clock) and report it time :: String -> IO () -> IO () -- | Print a CPUTime difference showT :: (Integral a) => a -> String -- | Shamelessly stolen from FPS integralRandomR :: (Integral a, RandomGen g) => (a, a) -> g -> (a, g) -- | Constrained position generators genOffset :: Gen Offset genNonNegOffset :: Gen Offset genPositiveOffset :: Gen Offset instance Show EST_set instance Show EST_long instance Show EST_short instance Show Protein instance Show ESTq instance Show EST instance Show Quality instance Show Nucleotide instance Arbitrary EST_set instance Arbitrary EST_long instance 
Arbitrary EST_short instance Arbitrary Char instance Arbitrary EST instance Arbitrary ESTq instance Arbitrary Quality instance Arbitrary Nucleotide instance Arbitrary Word8 instance Random Word8 module Bio.Location.Strand -- | Sequence strand data Strand Fwd :: Strand RevCompl :: Strand -- | Anything, such as a location or a sequence, which lies on a strand and -- can thus be reverse complemented. class Stranded s revCompl :: (Stranded s) => s -> s stranded :: (Stranded s) => Strand -> s -> s instance Eq Strand instance Ord Strand instance Show Strand instance Read Strand instance Bounded Strand instance Enum Strand instance Ix Strand instance Stranded ByteString instance Stranded Char instance Stranded Strand -- | Positions on a sequence. Zero-based Int64 indices are used throughout, -- to facilitate direct use of indexing functions on SeqData. module Bio.Location.Position -- | Position in a sequence data Pos Pos :: !Offset -> !Strand -> Pos -- | 0-based index of the position offset :: Pos -> !Offset -- | Optional strand of the position strand :: Pos -> !Strand -- | Slide a position by an offset slide :: Pos -> Offset -> Pos seqNt :: (Error e, MonadError e m) => SeqData -> Pos -> m Char seqNtPadded :: SeqData -> Pos -> Char display :: Pos -> String instance Eq Pos instance Ord Pos instance Show Pos instance Read Pos instance Ix Pos instance Stranded Pos -- | Data types for working with locations in a sequence. Zero-based Int64 -- indices are used throughout, to facilitate direct use of indexing -- functions on SeqData. module Bio.Location.ContigLocation -- | Contiguous set of positions in a sequence data ContigLoc ContigLoc :: !Offset -> !Offset -> !Strand -> ContigLoc -- | 5' end of region on target sequence, 0-based index offset5 :: ContigLoc -> !Offset -- | length of region on target sequence length :: ContigLoc -> !Offset -- | strand of region strand :: ContigLoc -> !Strand -- | Create a ContigLoc from 0-based starting and ending positions. 
-- When start is less than end the position will be on the Fwd -- Strand, otherwise it will be on the RevCompl strand. fromStartEnd :: Offset -> Offset -> ContigLoc -- | Create a ContigLoc from a Pos.Pos defining the start -- (ContigLoc 5 prime end) position on the sequence and the -- length. fromPosLen :: Pos -> Offset -> ContigLoc -- | The bounds of a ContigLoc, a pair of the lowest and highest -- sequence indices covered by the region, which ignores the strand of -- the ContigLoc. The first element of the pair will always be -- lower than the second. bounds :: ContigLoc -> (Offset, Offset) -- | 0-based starting (5' in the region orientation) position startPos :: ContigLoc -> Pos -- | 0-based ending (3' in the region orientation) position endPos :: ContigLoc -> Pos -- | Move a ContigLoc region by a specified offset slide :: Offset -> ContigLoc -> ContigLoc -- | ContigLoc extended on the 5' and 3' ends. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc -- | For a Pos and a ContigLoc on the same sequence, find the -- corresponding Pos relative to the ContigLoc, provided it is -- within the ContigLoc. posInto :: Pos -> ContigLoc -> Maybe Pos -- | For a Pos specified relative to a ContigLoc, find the -- corresponding Pos relative to the outer sequence, provided that the -- Pos is within the bounds of the ContigLoc. posOutof :: Pos -> ContigLoc -> Maybe Pos -- | Subsequence SeqData for a ContigLoc, provided that the -- region is entirely within the sequence. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData -- | Subsequence SeqData for a ContigLoc, padded as needed -- with Ns seqDataPadded :: SeqData -> ContigLoc -> SeqData -- | For a Pos and a ContigLoc on the same sequence, is the Pos -- within the ContigLoc. isWithin :: Pos -> ContigLoc -> Bool -- | For a pair of ContigLoc regions on the same sequence, indicates -- if they overlap at all. 
overlaps :: ContigLoc -> ContigLoc -> Bool display :: ContigLoc -> String instance Eq ContigLoc instance Ord ContigLoc instance Show ContigLoc instance Stranded ContigLoc -- | Data types for working with locations in a sequence. Zero-based Int64 -- indices are used throughout, to facilitate direct use of indexing -- functions on SeqData. module Bio.Location.Location -- | General (disjoint) sequence region consisting of a concatenated set of -- contiguous regions newtype Loc Loc :: [ContigLoc] -> Loc -- | The bounds of a Loc, consisting of the lowest & highest -- sequence indices lying within the region. The first element of the -- pair will always be lower than the second. bounds :: Loc -> (Offset, Offset) -- | Length of the region length :: Loc -> Offset -- | 0-based starting (5' in the region orientation) offset of the region -- on its sequence. startPos :: Loc -> Pos -- | 0-based ending (3' in the region orientation) offset of the region on -- its sequence. endPos :: Loc -> Pos -- | Extend a Loc region by incorporating contiguous nucleotide -- regions of the specified lengths on the 5' and 3' ends extend :: (Offset, Offset) -> Loc -> Loc -- | For a Pos and a Loc region on the same sequence, find the -- corresponding Pos relative to the region, if the Pos is within the -- region. If the Loc region has redundant positions for a given -- sequence position, the first is returned. posInto :: Pos -> Loc -> Maybe Pos -- | For a Loc region on a sequence and a Pos relative to the -- region, find the corresponding Pos on the sequence, provided that the -- position is within the bounds of the region. posOutof :: Pos -> Loc -> Maybe Pos -- | For a Pos and a Loc on the same sequence, does the position -- fall within the Loc region? isWithin :: Pos -> Loc -> Bool -- | For a pair of Loc regions on the same sequence, do they overlap -- at all? overlaps :: Loc -> Loc -> Bool -- | Subsequence SeqData for a Loc, provided that the region -- is entirely within the sequence. 
seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData seqDataPadded :: SeqData -> Loc -> SeqData display :: Loc -> String instance Eq Loc instance Ord Loc instance Show Loc instance Stranded Loc -- | A structure to allow fast lookup of objects whose sequence location -- lines up with a given position. module Bio.Location.LocMap -- | Collection mapping a collection of Loc locations, possibly -- overlapping, binned for efficient lookup by position. data LocMap a -- | Create an empty LocMap with a specified position bin size mkLocMap :: Offset -> LocMap a defaultZonesize :: Offset -- | Create a LocMap from an associated list. fromList :: Offset -> [(Loc, a)] -> LocMap a -- | Find the (possibly empty) list of sequence regions and associated -- objects that contain a Pos position, in the sense of withinLoc lookupWithin :: Pos -> LocMap a -> [(Loc, a)] -- | Find the (possibly empty) list of sequence regions and associated -- objects that overlap a Loc region, in the sense of overlapsLoc lookupOverlaps :: Loc -> LocMap a -> [(Loc, a)] -- | Remove a region / object association from the map, if it is present. -- If it is present multiple times, only the first occurrence will be -- deleted. delete :: (Eq a) => (Loc, a) -> LocMap a -> LocMap a -- | Remove the first region / object association satisfying a predicate -- function. 
deleteBy :: ((Loc, a) -> Bool) -> LocMap a -> LocMap a -- | Add an object with an associated Loc sequence region insert :: Loc -> a -> LocMap a -> LocMap a checkInvariants :: LocMap a -> [String] instance Monoid (LocMap a) module Bio.Location.OnSeq type SeqName = SeqData data OnSeq a OnSeq :: !SeqName -> !a -> OnSeq a onSeqName :: OnSeq a -> !SeqName onSeqObj :: OnSeq a -> !a withSeqData :: (Error e, MonadError e m) => (SeqData -> a -> m b) -> (SeqName -> m SeqData) -> OnSeq a -> m b andSameSeq :: (a -> b -> Bool) -> OnSeq a -> OnSeq b -> Bool onSameSeq :: (Monad m) => (a -> b -> m c) -> OnSeq a -> OnSeq b -> m c type OnSeqs a = Map SeqName a perSeq :: (Monoid b) => (a -> b -> c) -> OnSeq a -> OnSeqs b -> c perSeqUpdate :: (Monoid b) => (a -> b -> b) -> OnSeq a -> OnSeqs b -> OnSeqs b withNameAndSeq :: (Monad m) => (SeqName -> a -> b -> m c) -> OnSeq a -> OnSeqs b -> m c instance (Eq a) => Eq (OnSeq a) instance (Ord a) => Ord (OnSeq a) instance (Show a) => Show (OnSeq a) instance Functor OnSeq module Bio.Location.SeqLocation type SeqPos = OnSeq Pos displaySeqPos :: SeqPos -> String type ContigSeqLoc = OnSeq ContigLoc withinContigSeqLoc :: SeqPos -> ContigSeqLoc -> Bool displayContigSeqLoc :: ContigSeqLoc -> String type SeqLoc = OnSeq Loc isWithin :: SeqPos -> SeqLoc -> Bool overlaps :: SeqLoc -> SeqLoc -> Bool seqData :: (Error e, MonadError e m) => (SeqName -> m SeqData) -> SeqLoc -> m SeqData display :: SeqLoc -> String module Bio.Alignment.Soap -- | Alignment output from SOAP data SoapAlign SA :: !SeqName -> !SeqData -> !QualData -> !Int -> !Char -> !Offset -> !Strand -> !SeqName -> !Offset -> !Int -> ![SoapAlignMismatch] -> SoapAlign name :: SoapAlign -> !SeqName -- | Reference strand orientation sequence sequ :: SoapAlign -> !SeqData -- | Reference strand orientation quality data qual :: SoapAlign -> !QualData nhit :: SoapAlign -> !Int pairend :: SoapAlign -> !Char length :: SoapAlign -> !Offset strand :: SoapAlign -> !Strand refname :: SoapAlign -> !SeqName 
-- | 1-based index, as output by SOAP, of reference strand 5' end refstart :: SoapAlign -> !Offset nmismatch :: SoapAlign -> !Int mismatches :: SoapAlign -> ![SoapAlignMismatch] data SoapAlignMismatch SAM :: !Char -> !Char -> !Offset -> !Qual -> SoapAlignMismatch -- | Read nt in reference strand orientation readnt :: SoapAlignMismatch -> !Char -- | Reference nt in reference strand orientation refnt :: SoapAlignMismatch -> !Char -- | Offset from reference strand 5' end in reference strand orientation offset :: SoapAlignMismatch -> !Offset -- | Quality score of read nt qualnt :: SoapAlignMismatch -> !Qual refSeqPos :: SoapAlign -> SeqPos refCSeqLoc :: SoapAlign -> ContigSeqLoc refSeqLoc :: SoapAlign -> SeqLoc mismatchSeqPos :: SoapAlign -> SoapAlignMismatch -> SeqPos parse :: (Error e, MonadError e m) => ByteString -> m SoapAlign unparse :: SoapAlign -> ByteString parseMismatch :: (Error e, MonadError e m) => ByteString -> m SoapAlignMismatch unparseMismatch :: SoapAlignMismatch -> ByteString group :: [SoapAlign] -> [[SoapAlign]] instance Read SoapAlignMismatch instance Show SoapAlignMismatch instance Eq SoapAlignMismatch instance Ord SoapAlignMismatch instance Read SoapAlign instance Show SoapAlign instance Eq SoapAlign instance Ord SoapAlign module Bio.Location.SeqLocMap type SeqLocMap a = OnSeqs (LocMap a) empty :: SeqLocMap a fromList :: [(SeqLoc, a)] -> SeqLocMap a insert :: SeqLoc -> a -> SeqLocMap a -> SeqLocMap a lookupWithin :: SeqPos -> SeqLocMap a -> [(SeqLoc, a)] lookupOverlaps :: SeqLoc -> SeqLocMap a -> [(SeqLoc, a)] module Bio.GFF3.Feature data GFFAttr GFFAttr :: !ByteString -> ![ByteString] -> GFFAttr attrTag :: GFFAttr -> !ByteString attrValues :: GFFAttr -> ![ByteString] data Feature Feature :: !ByteString -> !ByteString -> !ByteString -> !Offset -> !Offset -> !Maybe Double -> !Maybe Strand -> !Maybe Offset -> ![GFFAttr] -> Feature seqid :: Feature -> !ByteString source :: Feature -> !ByteString ftype :: Feature -> !ByteString start :: Feature -> 
!Offset end :: Feature -> !Offset score :: Feature -> !Maybe Double strand :: Feature -> !Maybe Strand phase :: Feature -> !Maybe Offset attributes :: Feature -> ![GFFAttr] length :: Feature -> Offset parse :: (Error e, MonadError e m) => ByteString -> m Feature unparse :: Feature -> ByteString parseWithFasta :: (Error e, MonadError e m) => ByteString -> m ([Feature], [ByteString]) attrByTag :: ByteString -> Feature -> [ByteString] ids :: Feature -> [ByteString] parentIds :: Feature -> [ByteString] contigLoc :: Feature -> ContigLoc loc :: Feature -> Loc seqLoc :: Feature -> SeqLoc name :: (Error e, MonadError e m) => Feature -> m SeqName instance Eq Feature instance Ord Feature instance Show Feature instance Eq GFFAttr instance Ord GFFAttr instance Show GFFAttr module Bio.GFF3.FeatureHier data FeatureHier features :: FeatureHier -> (Set Feature) lookupId :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m Feature lookupIdChildren :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m [Feature] fromList :: (Error e, MonadError e m) => [Feature] -> m FeatureHier insert :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier delete :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier parents :: FeatureHier -> Feature -> [Feature] children :: FeatureHier -> Feature -> [Feature] parentsM :: (MonadReader FeatureHier m) => Feature -> m [Feature] childrenM :: (MonadReader FeatureHier m) => Feature -> m [Feature] checkInvariants :: FeatureHier -> [String] instance Show FeatureHier module Bio.GFF3.FeatureHierSequences data FeatureHierSequences features :: FeatureHierSequences -> Set Feature sequences :: FeatureHierSequences -> [Sequence] fromLists :: (Error e, MonadError e m) => [Feature] -> [Sequence] -> m FeatureHierSequences parse :: (Error e, MonadError e m) => ByteString -> m FeatureHierSequences lookupId :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m Feature parents :: FeatureHierSequences -> 
Feature -> [Feature] children :: FeatureHierSequences -> Feature -> [Feature] seqData :: (Error e, MonadError e m) => FeatureHierSequences -> SeqLoc -> m SeqData getSequence :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m SeqData featureSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence runGFF :: FilePath -> (ErrorT String (Reader FeatureHierSequences) a) -> ErrorT String IO a runGFFIO :: FilePath -> (ErrorT String (ReaderT FeatureHierSequences IO) a) -> ErrorT String IO a asksGFF :: (Error e, MonadError e m, MonadReader FeatureHierSequences m) => (FeatureHierSequences -> a -> m b) -> a -> m b instance Show FeatureHierSequences module Bio.GFF3.SGD chromosomes :: FeatureHierSequences -> [Feature] genes :: FeatureHierSequences -> [Feature] rRNAs :: FeatureHierSequences -> [Feature] sortExons :: (Error e, MonadError e m) => [Feature] -> m [Feature] geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc geneCDSes :: FeatureHierSequences -> Feature -> [Feature] noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m Sequence noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc noncodingExons :: FeatureHierSequences -> Feature -> [Feature] namedSLM :: FeatureHierSequences -> SeqLocMap Feature geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SeqLocMap Feature) -- | This is a meta-module importing and re-exporting sequence-related -- stuff. -- -- It encompasses the Bio.Sequence.SeqData, -- Bio.Sequence.Fasta, and Bio.Sequence.TwoBit modules. module Bio.Sequence -- | A sequence consists of a header, the sequence data itself, and -- optional quality data. 
data Sequence -- | header and actual sequence Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence -- | An offset, index, or length of a SeqData type Offset = Int64 -- | The basic data type used in Sequences type SeqData = ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from -- bad. type Qual = Word8 -- | Quality data is a Qual vector, currently implemented as a -- ByteString. type QualData = ByteString -- | Return sequence length. seqlength :: Sequence -> Offset -- | Return sequence label (first word of header) seqlabel :: Sequence -> SeqData -- | Return full header. seqheader :: Sequence -> SeqData -- | Return the sequence data. seqdata :: Sequence -> SeqData -- | Return the quality data, or error if none exist. Use hasqual if in -- doubt. seqqual :: Sequence -> QualData -- | Read the character at the specified position in the sequence. (!) :: Sequence -> Offset -> Char appendHeader :: Sequence -> String -> Sequence -- | Modify the header by appending text, or by replacing all but the -- sequence label (i.e. first word). setHeader :: Sequence -> String -> Sequence -- | Convert a String to SeqData fromStr :: String -> SeqData -- | Convert a SeqData to a String toStr :: SeqData -> String -- | Complement a single character. I.e. identify the nucleotide it can -- hybridize with. Note that for multiple nucleotides, you usually want -- the reverse complement (see revcompl for that). compl :: Char -> Char -- | Calculate the reverse complement. This is only relevant for the -- nucleotide alphabet, and it leaves other characters unmodified. 
revcompl :: Sequence -> Sequence data Amino Ala :: Amino Arg :: Amino Asn :: Amino Asp :: Amino Cys :: Amino Gln :: Amino Glu :: Amino Gly :: Amino His :: Amino Ile :: Amino Leu :: Amino Lys :: Amino Met :: Amino Phe :: Amino Pro :: Amino Ser :: Amino Thr :: Amino Tyr :: Amino Trp :: Amino Val :: Amino STP :: Amino Asx :: Amino Glx :: Amino Xle :: Amino Xaa :: Amino -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence -> Offset -> [Amino] -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence] -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence] -- | Write sequences to a FASTA-formatted file. Line length is 60. writeFasta :: FilePath -> [Sequence] -> IO () -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence] -> IO () -- | Read quality data for sequences from a file. readQual :: FilePath -> IO [Sequence] -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence] -> IO () hWriteQual :: Handle -> [Sequence] -> IO () -- | Read sequence and associated quality. Will error if the sequences and -- qualities do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence] -- | Write sequence and quality data simultaneously. This may be more -- laziness-friendly. 
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO () hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO () readFastQ :: FilePath -> IO [Sequence] writeFastQ :: FilePath -> [Sequence] -> IO () hReadFastQ :: Handle -> IO [Sequence] hWriteFastQ :: Handle -> [Sequence] -> IO () -- | Parse a .phd file, extracting the contents as a Sequence readPhd :: FilePath -> IO Sequence -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO Sequence -- | Parse a (lazy) ByteString as sequences in the 2bit format. decode2Bit :: ByteString -> [Sequence] -- | Extract sequences from a file in 2bit format. read2Bit :: FilePath -> IO [Sequence] -- | Extract sequences in the 2bit format from a handle. hRead2Bit :: Handle -> IO [Sequence] -- | This is a struct for containing a set of hashing functions data HashF k HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k -- | calculates the hash at a given offset in the sequence hash :: HashF k -> SeqData -> Offset -> Maybe k -- | calculate all hashes from a sequence, and their indices hashes :: HashF k -> SeqData -> [(k, Offset)] -- | for sorting hashes ksort :: HashF k -> [k] -> [k] -- | Contigous constructs an int/eger from a contiguous k-word. contigous :: (Integral k) => Int -> HashF k -- | Like contigous, but returns the same hash for a word and its -- reverse complement. rcontig :: (Integral k) => Int -> HashF k -- | Like rcontig, but ignoring monomers (i.e. arbitrarily long -- runs of a single nucleotide are treated the same as a single nucleotide). rcpacked :: (Integral k) => Int -> HashF k class KWords s kwords :: (KWords s) => Int -> s -> [s] entropy :: (Ord str, KWords str) => Int -> str -> Double -- | Multiple alignments. module Bio.Alignment.Multiple -- | Progressive multiple alignment. Calculate a tree from agglomerative -- clustering, then align at each branch going bottom up. Returns a list -- of columns (rows?). 
progressive :: (Sequence -> Sequence -> (Double, EditList)) -> [Sequence] -> [String] -- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B -- and B|C. This is central for Coffee evaluation of alignments, and -- T-Coffee construction of alignments. indirect :: EditList -> EditList -> EditList