-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | A bioinformatics library
--   
--   This is a collection of data structures and algorithms I've found
--   useful when building various bioinformatics-related tools and
--   utilities.
--   
--   Current list of features includes: a Sequence data type supporting
--   protein and nucleotide sequences and conversion between them, quality
--   data, reading and writing Fasta formatted files, reading TwoBit and
--   phd formats. Rudimentary support for doing alignments - including
--   dynamic adjustment of scores based on sequence quality - and Blast
--   output parsing. Partly implemented single linkage clustering, and
--   multiple alignment.
--   
--   The Darcs repository is at:
--   <a>http://malde.org/~ketil/biohaskell/biolib</a>.
@package bio
@version 0.3.3.2


-- | Lazy <a>many</a> combinator for Parsec. Courtesy of Tomasz Zielonka.
module Bio.Util.Parsex
lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a]


-- | Utility module, with various useful stuff.
module Bio.Util

-- | Break a list of bytestrings on a predicate
splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]]

-- | Output (to stderr) progress while evaluating a lazy list. Useful for
--   generating output while (conceptually, at least) in pure code.
--   Strictness warning!! This doesn't *quite* work in all cases. Why?
countIO :: String -> String -> Int -> [a] -> IO [a]
sequence' :: [IO a] -> IO [a]


-- | Implement clustering
module Bio.Clustering

-- | Data structure for storing hierarchical clusters
data Clustered score datum
Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum
Leaf :: datum -> Clustered score datum

-- | Single linkage agglomerative clustering. Cluster elements by slurping
--   a sorted list of pairs with score (i.e. triples :-) Keeps a set of
--   contained elements at each branch's root, so O(n log n), and requires
--   elements to be in Ord. For this to work, the triples must be sorted on
--   score. Earlier scores in the list will make up the lower nodes, so
--   sort descending for similarity, ascending for distance.
cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a]
instance (Show score, Show datum) => Show (Clustered score datum)


-- | This module implements a hierarchical data structure for BLAST
--   results, there is an alternative flat structure in the
--   <a>Bio.Alignment.BlastFlat</a> module.
--   
--   BLAST is a tool for searching in (biological) sequences for
--   similarity. This library is tested against NCBI-blast version 2.2.14.
--   There exist several independent versions of BLAST, so expect some
--   incompatibilities if you're using a different BLAST version.
--   
--   For parsing BLAST results, the XML format (blastall -m 7) is by far
--   the most robust choice, and is implemented in the
--   <a>Bio.Alignment.BlastXML</a> module.
--   
--   The format is straightforward (and non-recursive). For more
--   information on BLAST, check
--   <a>http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html</a>
module Bio.Alignment.BlastData

-- | The sequence id, i.e. the first word of the header field.
type SeqId = ByteString

-- | The <a>Strand</a> indicates the direction of the match, i.e. the plain
--   sequence or its reverse complement.
data Strand
Plus :: Strand
Minus :: Strand

-- | The Aux field in the BLAST output includes match information that
--   depends on the BLAST flavor (blastn, blastx, or blastp). This data
--   structure captures those variations.
data Aux

-- | blastn
Strands :: !Strand -> !Strand -> Aux

-- | blastx
Frame :: !Strand -> !Int -> Aux

-- | A <a>BlastResult</a> is the root of the hierarchy.
data BlastResult
BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult
blastprogram :: BlastResult -> !ByteString
blastversion :: BlastResult -> !ByteString
blastdate :: BlastResult -> !ByteString
blastreferences :: BlastResult -> !ByteString
database :: BlastResult -> !ByteString
dbsequences :: BlastResult -> !Integer
dbchars :: BlastResult -> !Integer
results :: BlastResult -> [BlastRecord]

-- | Each query sequence generates a <a>BlastRecord</a>
data BlastRecord
BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord
query :: BlastRecord -> !SeqId
qlength :: BlastRecord -> !Int
hits :: BlastRecord -> [BlastHit]

-- | Each match between a query and a target sequence (or subject) is a
--   <a>BlastHit</a>.
data BlastHit
BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit
subject :: BlastHit -> !SeqId
slength :: BlastHit -> !Int
matches :: BlastHit -> [BlastMatch]

-- | A <a>BlastHit</a> may contain multiple separate matches (typically
--   when an indel causes a frameshift that blastx is unable to bridge).
data BlastMatch
BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch
bits :: BlastMatch -> !Double
e_val :: BlastMatch -> !Double
identity :: BlastMatch -> (Int, Int)
q_from :: BlastMatch -> !Int
q_to :: BlastMatch -> !Int
h_from :: BlastMatch -> !Int
h_to :: BlastMatch -> !Int
aux :: BlastMatch -> !Aux
instance Show BlastMatch
instance Show BlastHit
instance Show BlastRecord
instance Show BlastResult
instance Show Aux
instance Eq Aux
instance Read Strand
instance Show Strand
instance Eq Strand


-- | This module implements a "flattened" data structure for Blast hits, as
--   opposed to the hierarchical structure in
--   <a>Bio.Alignment.BlastData</a>.
--   
--   The flat data type is useful in many cases where it is more natural to
--   see the result as a set of rows (e.g. for insertion in a database).
--   
--   It would probably be more (memory-) efficient to go the other way
--   (i.e. from flat to hierarchical), as passing the current, partially
--   built <a>BlastFlat</a> object down the stream of results and stamping
--   out a stream of completed ones. (See
--   <a>Bio.Alignment.BlastXML.breaks</a> for this week's most cumbersome
--   use of parallelism to avoid the memory issue.)
module Bio.Alignment.BlastFlat

-- | The BlastFlat data structure contains information about a single match
data BlastFlat
BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat
query :: BlastFlat -> !SeqId
qlength :: BlastFlat -> !Int
subject :: BlastFlat -> !SeqId
slength :: BlastFlat -> !Int
bits :: BlastFlat -> !Double
e_val :: BlastFlat -> !Double
identity :: BlastFlat -> (Int, Int)
q_from :: BlastFlat -> !Int
q_to :: BlastFlat -> !Int
h_from :: BlastFlat -> !Int
h_to :: BlastFlat -> !Int
aux :: BlastFlat -> !Aux

-- | Convert BlastRecords into BlastFlats (representing a depth-first
--   traversal of the BlastRecord structure.)
flatten :: [BlastRecord] -> [BlastFlat]

-- | Each query sequence generates a <a>BlastRecord</a>
data BlastRecord
blastprogram :: BlastResult -> ByteString
blastversion :: BlastResult -> ByteString
blastdate :: BlastResult -> ByteString
blastreferences :: BlastResult -> ByteString
database :: BlastResult -> ByteString
dbsequences :: BlastResult -> Integer
dbchars :: BlastResult -> Integer
results :: BlastResult -> [BlastRecord]

-- | The Aux field in the BLAST output includes match information that
--   depends on the BLAST flavor (blastn, blastx, or blastp). This data
--   structure captures those variations.
data Aux

-- | blastn
Strands :: !Strand -> !Strand -> Aux

-- | blastx
Frame :: !Strand -> !Int -> Aux

-- | The <a>Strand</a> indicates the direction of the match, i.e. the plain
--   sequence or its reverse complement.
data Strand
Plus :: Strand
Minus :: Strand


-- | This module implements a parser for BLAST results.
--   
--   This module is DEPRECATED. It is *very* recommended that you run blast
--   with XML output instead, and use the BlastXML module to parse it.
--   Don't say I didn't warn you!
--   
--   BLAST is a tool for searching in (biological) sequences for
--   similarity. This library is tested against NCBI-blast version 2.2.14.
--   There exist several independent versions, so expect some
--   incompatibilities if you're using a different BLAST version.
--   
--   The format is straightforward (and non-recursive), and this
--   implementation uses a simple line-based, hierarchical parser.
--   
--   For more information on BLAST, check
--   <a>http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html</a>
module Bio.Alignment.Blast
parse :: ByteString -> BlastResult

module Bio.Alignment.BlastXML
readXML :: FilePath -> IO [BlastResult]


-- | GOA - parse and index Gene Ontology Annotations. In particular, the
--   file 'gene_association.goa_uniprot' that contains links between GO
--   terms and UniProt accessions.
--   
--   (Where to find the hierarchical relationship between GO terms?)
--   <a>http://www.geneontology.org/ontology/gene_ontology.obo</a> contains
--   isA relationships
--   <a>http://www.geneontology.org/GO.format.obo-1_2.shtml</a> describes
--   the format
module Bio.Sequence.GOA

-- | Read the goa_uniprot file (warning: this one is huge!)
readGOA :: FilePath -> IO [Annotation]

-- | Read GO term definitions
readGO :: FilePath -> IO [GoDef]
decomment :: ByteString -> [ByteString]
newtype GoTerm
GO :: Int -> GoTerm
type UniProtAcc = ByteString
data GoClass
Func :: GoClass
Proc :: GoClass
Comp :: GoClass

-- | GOA Annotation - or multiple annotations?
data Annotation
Ann :: !UniProtAcc -> !GoTerm -> !EvidenceCode -> Annotation
mkAnn :: ByteString -> Annotation

-- | GO maps GO terms (GO:xxxx for some number xxxx) to biologically
--   meaningful terms. Defined in
--   <a>http://www.geneontology.org/doc/GO.terms_and_ids</a> The format is
--   <a>GO:0000000 [tab] text string [tab] F|P|C</a>
data GoDef
GoDef :: !GoTerm -> !ByteString -> !GoClass -> GoDef
mkGoDef :: ByteString -> GoDef

-- | Evidence codes describe the type of support for an annotation
--   <a>http://www.geneontology.org/GO.evidence.shtml</a>
data EvidenceCode
IC :: EvidenceCode
IDA :: EvidenceCode
IEA :: EvidenceCode
IEP :: EvidenceCode
IGC :: EvidenceCode
IGI :: EvidenceCode
IMP :: EvidenceCode
IPI :: EvidenceCode
ISS :: EvidenceCode
NAS :: EvidenceCode
ND :: EvidenceCode
RCA :: EvidenceCode
TAS :: EvidenceCode
NR :: EvidenceCode

-- | The vast majority of GOA data is IEA, while the most reliable
--   information is manually curated. Filtering on this is useful to keep
--   data set sizes manageable, too.
isCurated :: EvidenceCode -> Bool
instance Read EvidenceCode
instance Show EvidenceCode
instance Eq EvidenceCode
instance Show GoDef
instance Show Annotation
instance Eq GoTerm
instance Ord GoTerm
instance Show GoClass
instance Read GoClass
instance Show GoTerm
instance Read GoTerm

module Bio.Sequence.Entropy
class KWords s
kwords :: (KWords s) => Int -> s -> [s]
entropy :: (Ord str, KWords str) => Int -> str -> Double
instance KWords [a]


-- | Data structures for manipulating (biological) sequences.
--   
--   Generally supports both nucleotide and protein sequences, some
--   functions, like <tt>revcompl</tt>, only make sense for nucleotides.
module Bio.Sequence.SeqData

-- | A sequence consists of a header, the sequence data itself, and
--   optional quality data.
data Sequence

-- | header and actual sequence
Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence

-- | An offset, index, or length of a <a>SeqData</a>
type Offset = Int64

-- | The basic data type used in <a>Sequence</a>s
type SeqData = ByteString

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8

-- | Quality data is a <a>Qual</a> vector, currently implemented as a
--   ByteString.
type QualData = ByteString

-- | Read the character at the specified position in the sequence.
(!) :: Sequence -> Offset -> Char

-- | Return sequence length.
seqlength :: Sequence -> Offset

-- | Return sequence label (first word of header)
seqlabel :: Sequence -> SeqData

-- | Return full header.
seqheader :: Sequence -> SeqData

-- | Return the sequence data.
seqdata :: Sequence -> SeqData
(?) :: Sequence -> Offset -> Qual

-- | Check whether the sequence has associated quality data.
hasqual :: Sequence -> Bool

-- | Return the quality data, or error if none exist. Use hasqual if in
--   doubt.
seqqual :: Sequence -> QualData

-- | Convert a String to <a>SeqData</a>
fromStr :: String -> SeqData

-- | Convert a <a>SeqData</a> to a String
toStr :: SeqData -> String

-- | Complement a single character. I.e. identify the nucleotide it can
--   hybridize with. Note that for multiple nucleotides, you usually want
--   the reverse complement (see <a>revcompl</a> for that).
compl :: Char -> Char

-- | Calculate the reverse complement. This is only relevant for the
--   nucleotide alphabet, and it leaves other characters unmodified.
revcompl :: Sequence -> Sequence
data Amino
Ala :: Amino
Arg :: Amino
Asn :: Amino
Asp :: Amino
Cys :: Amino
Gln :: Amino
Glu :: Amino
Gly :: Amino
His :: Amino
Ile :: Amino
Leu :: Amino
Lys :: Amino
Met :: Amino
Phe :: Amino
Pro :: Amino
Ser :: Amino
Thr :: Amino
Tyr :: Amino
Trp :: Amino
Val :: Amino
STP :: Amino
Asx :: Amino
Glx :: Amino
Xle :: Amino
Xaa :: Amino

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence. This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence -> Offset -> [Amino]

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
instance Show Amino
instance Eq Amino
instance Show Sequence
instance Eq Sequence


-- | This module incorporates functionality for reading and writing
--   sequence data in the Fasta format. Each sequence consists of a header
--   (with a <a>&gt;</a> prefix) and a set of lines containing the sequence
--   data.
module Bio.Sequence.Fasta

-- | Lazily read sequences from a FASTA-formatted file
readFasta :: FilePath -> IO [Sequence]

-- | Write sequences to a FASTA-formatted file. Line length is 60.
writeFasta :: FilePath -> [Sequence] -> IO ()

-- | Lazily read sequence from handle
hReadFasta :: Handle -> IO [Sequence]

-- | Write sequences in FASTA format to a handle.
hWriteFasta :: Handle -> [Sequence] -> IO ()

-- | Read quality data for sequences from a file.
readQual :: FilePath -> IO [Sequence]

-- | Write quality data for sequences to a file.
writeQual :: FilePath -> [Sequence] -> IO ()
hWriteQual :: Handle -> [Sequence] -> IO ()

-- | Read sequence and associated quality. Will error if the sequences and
--   qualities do not match one-to-one in sequence.
readFastaQual :: FilePath -> FilePath -> IO [Sequence]
hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO ()

-- | Write sequence and quality data simultaneously. This may be more
--   laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO ()
countSeqs :: FilePath -> IO Int

-- | Convert a list of FASTA-formatted lines into a list of sequences.
--   Blank lines are ignored. Comment lines starting with <a>#</a> are allowed
--   between sequences (and ignored). Lines starting with <a>&gt;</a>
--   initiate a new sequence.
mkSeqs :: [ByteString] -> [Sequence]


-- | This module implements the 2bit format for sequences.
--   
--   Based on: <a>http://genome.ucsc.edu/FAQ/FAQformat#format7</a> Note!
--   the description is not accurate, it is missing a reserved word in each
--   sequence record.
--   
--   There are also other, completely different ideas of the 2bit format,
--   e.g. <a>http://jcomeau.freeshell.org/www/genome/2bitformat.html</a>
module Bio.Sequence.TwoBit

-- | Parse a (lazy) ByteString as sequences in the 2bit format.
decode2Bit :: ByteString -> [Sequence]

-- | Extract sequences from a file in 2bit format.
read2Bit :: FilePath -> IO [Sequence]

-- | Extract sequences in the 2bit format from a handle.
hRead2Bit :: Handle -> IO [Sequence]
instance Show SRLE
instance Show SRBE
instance Show SR
instance Show Entry
instance Binary SRLE
instance Binary SRBE
instance Binary Entries
instance Show Entries
instance Binary Entry
instance Binary Header
instance Show Header
instance Arbitrary Word8


-- | Parse phd files (phred base calling output).
module Bio.Sequence.Phd

-- | Parse a .phd file, extracting the contents as a Sequence
readPhd :: FilePath -> IO Sequence

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO Sequence

module Bio.Sequence.HashWord

-- | This is a struct for containing a set of hashing functions
data HashF k
HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k

-- | calculates the hash at a given offset in the sequence
hash :: HashF k -> SeqData -> Offset -> Maybe k

-- | calculate all hashes from a sequence, and their indices
hashes :: HashF k -> SeqData -> [(k, Offset)]

-- | for sorting hashes
ksort :: HashF k -> [k] -> [k]

-- | Adds a default <a>hashes</a> function to a <tt>HashF</tt>, when
--   <a>hash</a> is defined.
genkeys :: HashF k -> HashF k

-- | Contigous constructs an int/eger from a contiguous k-word.
contigous :: (Integral k) => Int -> HashF k

-- | Like <a>contigous</a>, but returns the same hash for a word and its
--   reverse complement.
rcontig :: (Integral k) => Int -> HashF k
compact :: SeqData -> [SeqData]

-- | Like <tt>rcontig</tt>, but ignoring monomers (i.e. arbitrarily long
--   runs of a single nucleotide are treated the same as a single nucleotide).
rcpacked :: (Integral k) => Int -> HashF k
type Shape = String
gapped :: (Integral k) => Shape -> HashF k
isN :: Char -> Bool
n2k :: (Integral k) => Int -> SeqData -> k
n2i' :: (Num a) => a -> SeqData -> a
k2n :: (Integral k) => Int -> k -> SeqData
val :: (Num t) => Char -> t
unval :: (Num a) => a -> Char
complement :: Char -> Char


-- | Data structures and helper functions for calculating alignments
--   
--   There are two ways to view an alignment: either as a list of edits
--   (i.e., insertions, deletions, or substitutions), or as a set of
--   sequences with inserted gaps.
--   
--   The edit list approach is perhaps a more restrictive model but doesn't
--   generalize to multiple alignments.
--   
--   The gap approach is more general, and probably more commonly used by
--   other software (see e.g. the ACE file format).
module Bio.Alignment.AlignData
data Dir
Fwd :: Dir
Rev :: Dir
type Gaps = [Offset]
type Alignment = [(Offset, Dir, Sequence, Gaps)]

-- | Gaps are coded as <a>*</a>s, this function removes them, and returns
--   the sequence along with the list of gap positions.
extractGaps :: SeqData -> (SeqData, Gaps)
insertGaps :: Char -> (SeqData, Gaps) -> SeqData

-- | An Edit is either the insertion, the deletion, or the replacement of a
--   character.
data Edit
Ins :: Chr -> Edit
Del :: Chr -> Edit
Repl :: Chr -> Chr -> Edit

-- | An alignment is a sequence of edits.
type EditList = [Edit]

-- | A substitution matrix gives scores for replacing a character with
--   another. Typically, it will be symmetric.
type SubstMx a = (Chr, Chr) -> a

-- | A Selector consists of a zero element, and a function that chooses a
--   possible Edit operation, and generates an updated result.
type Selector a = [(a, Edit)] -> a

-- | The sequence element type, used in alignments.
type Chr = Word8

-- | Calculate a set of columns containing scores. This represents the
--   columns of the alignment matrix, but will only require linear space
--   for score calculation.
columns :: Selector a -> a -> Sequence -> Sequence -> [[a]]

-- | Evaluate an Edit based on SubstMx and gap penalty
eval :: SubstMx a -> a -> Edit -> a

-- | True if the Edit is a Repl.
isRepl :: Edit -> Bool

-- | turn an alignment into sequences with <a>-</a> representing gaps (for
--   checking, filtering out the <a>-</a> characters should return the
--   original sequences, provided <a>-</a> isn't part of the sequence
--   alphabet)
toStrings :: EditList -> (String, String)
instance Show Edit
instance Eq Edit
instance Eq Dir
instance Show Dir


-- | Common substitution matrices for alignments.
--   
--   When in doubt, use BLOSUM62. Consult
--   <a>http://www.ncbi.nlm.nih.gov/blast/blast_whatsnew.shtml#20051206</a>
--   for some hints on good parameters for nucleotide alignments.
module Bio.Alignment.Matrices
-- | The standard BLOSUM45 matrix.
blosum45 :: (Char, Char) -> Int

-- | The standard BLOSUM62 matrix.
blosum62 :: (Char, Char) -> Int

-- | The standard BLOSUM80 matrix.
blosum80 :: (Char, Char) -> Int

-- | The standard PAM30 matrix.
pam30 :: (Char, Char) -> Int

-- | The standard PAM70 matrix.
pam70 :: (Char, Char) -> Int

-- | Blast defaults, use with gap_open = -5 gap_extend = -3. This should
--   really check for valid nucleotides, and perhaps be more lenient in the
--   case of Ns. Oh well.
blastn_default :: (Num a) => (Chr, Chr) -> a

-- | Construct a simple <a>matrix</a> from match score/mismatch penalty
simpleMx :: (Num a) => a -> a -> (Chr, Chr) -> a


-- | Simple alignment of sequences
--   
--   Standard alignment/edit distance
module Bio.Alignment.SAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> a
local_align :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> EditList

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> a

-- | Calculate alignments.
global_align :: (Num a, Ord a) => SubstMx a -> a -> Sequence -> Sequence -> EditList


-- | Implement alignments/edit distance with affine gap penalties
--   
--   I've seen g = (-10,-1) as the suggested price to pay for gaps using
--   BLOSUM62. Good choice as any, I guess.
module Bio.Alignment.AAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> a

-- | Calculate local alignment (Smith-Waterman)
local_align :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> (a, EditList)

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> a

-- | Calculate global alignment (Needleman-Wunsch)
global_align :: (Num a, Ord a) => SubstMx a -> (a, a) -> Sequence -> Sequence -> (a, EditList)


-- | Quality-aware alignments
--   
--   Generally, quality data are ignored for alignment/pattern searching
--   like Smith-Waterman, Needleman-Wunsch, or BLAST(p|n|x). I believe that
--   accounting for quality will at the very least affect things like BLAST
--   statistics, and e.g. is crucial for good EST annotation using Blastx.
--   
--   This module performs sequence alignments, taking quality values into
--   account.
module Bio.Alignment.QAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> Double

-- | Calculate local alignment (Smith-Waterman)
local_align :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> (Double, EditList)

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> Double

-- | Calculate global alignment (Needleman-Wunsch)
global_align :: QualMx Double -> (Double, Double) -> Sequence -> Sequence -> (Double, EditList)
qualMx :: Qual -> Qual -> (Chr, Chr) -> Double


-- | Read ACE format assembly files
--   
--   These are typically output by sequence assembly tools, like CAP3 or
--   Phrap.
--   
--   Documented in the section labelled "ACE FILE FORMAT" at
--   <a>http://bozeman.mbt.washington.edu/consed/distributions/README.14.0.txt</a>
--   
--   Briefly: each field is a line starting with a two letter code, in some
--   cases followed by data lines terminated by a blank line. -- AS
--   contigs reads -- CO contig_name bases reads segments compl (CAP3:
--   segments=0) -- sequence -- BQ -- base_qualities -- AF read1 compl
--   padded_start_consensus (negatives meaning?) -- AF read2 .. -- BS
--   segments -- RD read1 bases info_items info_tags (latter two set to 0
--   by CAP3) -- sequence -- QA read1 qual_start qual_end align_start
--   align_end -- DS (phred header? left empty by CAP3) -- RD read2 ...
module Bio.Alignment.ACE

-- | Reading an ACE file.
readACE :: FilePath -> IO [[Assembly]]
writeACE :: FilePath -> [Assembly] -> IO ()
data Assembly
Asm :: (Sequence, Gaps) -> Alignment -> Assembly
contig :: Assembly -> (Sequence, Gaps)
fragments :: Assembly -> Alignment

-- | Test parser p on a list of ACE elements
ptest :: (Show a) => String -> AceParser a -> [ACE] -> IO ()
reads :: Assembly -> Alignment
instance Eq ACE
instance Show Assembly
instance Show ACE


-- | This is a meta-module importing and re-exporting sequence-related
--   stuff.
--   
--   It encompasses the <a>Bio.Sequence.SeqData</a>,
--   <a>Bio.Sequence.Fasta</a>, and <a>Bio.Sequence.TwoBit</a> modules.
module Bio.Sequence

-- | A sequence consists of a header, the sequence data itself, and
--   optional quality data.
data Sequence

-- | header and actual sequence
Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence

-- | An offset, index, or length of a <a>SeqData</a>
type Offset = Int64

-- | The basic data type used in <a>Sequence</a>s
type SeqData = ByteString

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8

-- | Quality data is a <a>Qual</a> vector, currently implemented as a
--   ByteString.
type QualData = ByteString

-- | Return sequence length.
seqlength :: Sequence -> Offset

-- | Return sequence label (first word of header)
seqlabel :: Sequence -> SeqData

-- | Return full header.
seqheader :: Sequence -> SeqData

-- | Return the sequence data.
seqdata :: Sequence -> SeqData

-- | Return the quality data, or error if none exist. Use hasqual if in
--   doubt.
seqqual :: Sequence -> QualData

-- | Read the character at the specified position in the sequence.
(!) :: Sequence -> Offset -> Char

-- | Convert a String to <a>SeqData</a>
fromStr :: String -> SeqData

-- | Convert a <a>SeqData</a> to a String
toStr :: SeqData -> String

-- | Complement a single character. I.e. identify the nucleotide it can
--   hybridize with. Note that for multiple nucleotides, you usually want
--   the reverse complement (see <a>revcompl</a> for that).
compl :: Char -> Char

-- | Calculate the reverse complement. This is only relevant for the
--   nucleotide alphabet, and it leaves other characters unmodified.
revcompl :: Sequence -> Sequence
data Amino
Ala :: Amino
Arg :: Amino
Asn :: Amino
Asp :: Amino
Cys :: Amino
Gln :: Amino
Glu :: Amino
Gly :: Amino
His :: Amino
Ile :: Amino
Leu :: Amino
Lys :: Amino
Met :: Amino
Phe :: Amino
Pro :: Amino
Ser :: Amino
Thr :: Amino
Tyr :: Amino
Trp :: Amino
Val :: Amino
STP :: Amino
Asx :: Amino
Glx :: Amino
Xle :: Amino
Xaa :: Amino

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence. This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence -> Offset -> [Amino]

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData

-- | Lazily read sequences from a FASTA-formatted file
readFasta :: FilePath -> IO [Sequence]

-- | Lazily read sequence from handle
hReadFasta :: Handle -> IO [Sequence]

-- | Write sequences to a FASTA-formatted file. Line length is 60.
writeFasta :: FilePath -> [Sequence] -> IO ()

-- | Write sequences in FASTA format to a handle.
hWriteFasta :: Handle -> [Sequence] -> IO ()

-- | Read quality data for sequences from a file.
readQual :: FilePath -> IO [Sequence]

-- | Write quality data for sequences to a file.
writeQual :: FilePath -> [Sequence] -> IO ()
hWriteQual :: Handle -> [Sequence] -> IO ()

-- | Read sequence and associated quality. Will error if the sequences and
--   qualities do not match one-to-one in sequence.
readFastaQual :: FilePath -> FilePath -> IO [Sequence]

-- | Write sequence and quality data simultaneously. This may be more
--   laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO ()
hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO ()

-- | Parse a .phd file, extracting the contents as a Sequence
readPhd :: FilePath -> IO Sequence

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO Sequence

-- | Parse a (lazy) ByteString as sequences in the 2bit format.
decode2Bit :: ByteString -> [Sequence]

-- | Extract sequences from a file in 2bit format.
read2Bit :: FilePath -> IO [Sequence]

-- | Extract sequences in the 2bit format from a handle.
hRead2Bit :: Handle -> IO [Sequence]

-- | This is a struct for containing a set of hashing functions
data HashF k
HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k

-- | calculates the hash at a given offset in the sequence
hash :: HashF k -> SeqData -> Offset -> Maybe k

-- | calculate all hashes from a sequence, and their indices
hashes :: HashF k -> SeqData -> [(k, Offset)]

-- | for sorting hashes
ksort :: HashF k -> [k] -> [k]

-- | Contigous constructs an int/eger from a contiguous k-word.
contigous :: (Integral k) => Int -> HashF k

-- | Like <a>contigous</a>, but returns the same hash for a word and its
--   reverse complement.
rcontig :: (Integral k) => Int -> HashF k

-- | Like <tt>rcontig</tt>, but ignoring monomers (i.e. arbitrarily long
--   runs of a single nucleotide are treated the same as a single nucleotide).
rcpacked :: (Integral k) => Int -> HashF k
class KWords s
kwords :: (KWords s) => Int -> s -> [s]
entropy :: (Ord str, KWords str) => Int -> str -> Double


-- | Multiple alignments.
module Bio.Alignment.Multiple

-- | Progressive multiple alignment. Calculate a tree from agglomerative
--   clustering, then align at each branch going bottom up. Returns a list
--   of columns (rows?).
progressive :: (Sequence -> Sequence -> (Double, EditList)) -> [Sequence] -> [String]

-- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B
--   and B|C. This is central for Coffee evaluation of alignments, and
--   T-Coffee construction of alignments.
indirect :: EditList -> EditList -> EditList