-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | A bioinformatics library
--   
--   This is a collection of data structures and algorithms useful for
--   building bioinformatics-related tools and utilities.
--   
--   Current list of features includes: a Sequence data type supporting
--   protein and nucleotide sequences and conversion between them. As of
--   version 0.4, different kinds of sequence have different types. Support
--   for quality data, reading and writing Fasta formatted files, reading
--   TwoBit and phd formats, and Roche/454 SFF files. Rudimentary (i.e.
--   unoptimized) support for doing alignments - including dynamic
--   adjustment of scores based on sequence quality. Also Blast output
--   parsing. Partly implemented single linkage clustering, and multiple
--   alignment. Reading Gene Ontology (GO) annotations (GOA) and
--   definitions/hierarchy.
--   
--   The Darcs repository is at:
--   <a>http://malde.org/~ketil/biohaskell/biolib</a>.
@package bio
@version 0.5

module Bio.GFF3.Escape
unEscapeByteString :: (Error e, MonadError e m) => ByteString -> m ByteString
escapeByteString :: (Char -> Bool) -> ByteString -> ByteString
escapeAllBut :: String -> ByteString -> ByteString
escapeAllOf :: String -> ByteString -> ByteString


-- | Lazy "many" combinator for Parsec. Courtesy of Tomasz Zielonka.
module Bio.Util.Parsex
lazyMany :: GenParser Char () a -> SourceName -> [Char] -> [a]


-- | Implement clustering
module Bio.Clustering

-- | Data structure for storing hierarchical clusters
data Clustered score datum
Branch :: score -> (Clustered score datum) -> (Clustered score datum) -> Clustered score datum
Leaf :: datum -> Clustered score datum

-- | Single linkage agglomerative clustering. Cluster elements by slurping
--   a sorted list of pairs with score (i.e. triples :-) Keeps a set of
--   contained elements at each branch's root, so O(n log n), and requires
--   elements to be in Ord. For this to work, the triples must be sorted on
--   score. Earlier scores in the list will make up the lower nodes, so
--   sort descending for similarity, ascending for distance.
cluster_sl :: (Ord a, Ord s) => [(s, a, a)] -> [Clustered s a]
instance (Show score, Show datum) => Show (Clustered score datum)


-- | This models the PSL format used by e.g. the alignment tool BLAT. It is
--   a simple, textual representation of (spliced) alignments, with
--   tab-separated fields.
--   
--   See <a>http://genome.ucsc.edu/FAQ/FAQformat#format2</a> for
--   details.
module Bio.Alignment.PSL
data PSL
PSL :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> Int -> ByteString -> ByteString -> Int -> Int -> Int -> ByteString -> Int -> Int -> Int -> Int -> [Int] -> [Int] -> [Int] -> PSL
match :: PSL -> Int
mismatch :: PSL -> Int
repmatch :: PSL -> Int
ncount :: PSL -> Int
qgapcount :: PSL -> Int
qgaplength :: PSL -> Int
tgapcount :: PSL -> Int
tgaplength :: PSL -> Int
strand :: PSL -> ByteString
qname :: PSL -> ByteString
qsize :: PSL -> Int
qstart :: PSL -> Int
qend :: PSL -> Int
tname :: PSL -> ByteString
tsize :: PSL -> Int
tstart :: PSL -> Int
tend :: PSL -> Int
blockcount :: PSL -> Int
blocksizes :: PSL -> [Int]
qstarts :: PSL -> [Int]
tstarts :: PSL -> [Int]
readPSL :: FilePath -> IO [PSL]
writePSL :: FilePath -> [PSL] -> IO ()
parsePSL :: ByteString -> [PSL]
unparsePSL :: [PSL] -> ByteString
pslHeader :: ByteString
instance Eq PSL
instance Show PSL


-- | Utility module, with various useful stuff.
module Bio.Util
lines :: ByteString -> [ByteString]

-- | Break a list of bytestrings on a predicate.
splitWhen :: (ByteString -> Bool) -> [ByteString] -> [[ByteString]]

-- | Output (to stderr) progress while evaluating a lazy list. Useful for
--   generating output while (conceptually, at least) in pure code
countIO :: String -> String -> Int -> [a] -> IO [a]

-- | A lazier version of <tt>Control.Monad.sequence</tt> in
--   <a>Control.Monad</a>, needed by <a>countIO</a> above.
sequence' :: [IO a] -> IO [a]

-- | Workaround, the current <a>Data.ByteString.Lazy.Char8</a> contains a
--   bug in <tt>Data.ByteString.Lazy.Char8.lines</tt>.
mylines :: ByteString -> [ByteString]


-- | This module implements a hierarchical data structure for BLAST
--   results, there is an alternative flat structure in the
--   <a>Bio.Alignment.BlastFlat</a> module.
--   
--   BLAST is a tool for searching in (biological) sequences for
--   similarity. This library is tested against NCBI-blast version 2.2.14.
--   There exist several independent versions of BLAST, so expect some
--   incompatibilities if you're using a different BLAST version.
--   
--   For parsing BLAST results, the XML format (blastall -m 7) is by far
--   the most robust choice, and is implemented in the
--   <a>Bio.Alignment.BlastXML</a> module.
--   
--   The format is straightforward (and non-recursive). For more
--   information on BLAST, check
--   <a>http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html</a>
module Bio.Alignment.BlastData

-- | The sequence id, i.e. the first word of the header field.
type SeqId = ByteString

-- | The <a>Strand</a> indicates the direction of the match, i.e. the plain
--   sequence or its reverse complement.
data Strand
Plus :: Strand
Minus :: Strand

-- | The Aux field in the BLAST output includes match information that
--   depends on the BLAST flavor (blastn, blastx, or blastp). This data
--   structure captures those variations.
data Aux

-- | blastn
Strands :: !Strand -> !Strand -> Aux

-- | blastx
Frame :: !Strand -> !Int -> Aux

-- | A <a>BlastResult</a> is the root of the hierarchy.
data BlastResult
BlastResult :: !ByteString -> !ByteString -> !ByteString -> !ByteString -> !ByteString -> !Integer -> !Integer -> [BlastRecord] -> BlastResult
blastprogram :: BlastResult -> !ByteString
blastversion :: BlastResult -> !ByteString
blastdate :: BlastResult -> !ByteString
blastreferences :: BlastResult -> !ByteString
database :: BlastResult -> !ByteString
dbsequences :: BlastResult -> !Integer
dbchars :: BlastResult -> !Integer
results :: BlastResult -> [BlastRecord]

-- | Each query sequence generates a <a>BlastRecord</a>
data BlastRecord
BlastRecord :: !SeqId -> !Int -> [BlastHit] -> BlastRecord
query :: BlastRecord -> !SeqId
qlength :: BlastRecord -> !Int
hits :: BlastRecord -> [BlastHit]

-- | Each match between a query and a target sequence (or subject) is a
--   <a>BlastHit</a>.
data BlastHit
BlastHit :: !SeqId -> !Int -> [BlastMatch] -> BlastHit
subject :: BlastHit -> !SeqId
slength :: BlastHit -> !Int
matches :: BlastHit -> [BlastMatch]

-- | A <a>BlastHit</a> may contain multiple separate matches (typically
--   when an indel causes a frameshift that blastx is unable to bridge).
data BlastMatch
BlastMatch :: !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastMatch
bits :: BlastMatch -> !Double
e_val :: BlastMatch -> !Double
identity :: BlastMatch -> (Int, Int)
q_from :: BlastMatch -> !Int
q_to :: BlastMatch -> !Int
h_from :: BlastMatch -> !Int
h_to :: BlastMatch -> !Int
aux :: BlastMatch -> !Aux
instance Show BlastMatch
instance Show BlastHit
instance Show BlastRecord
instance Show BlastResult
instance Show Aux
instance Eq Aux
instance Read Strand
instance Show Strand
instance Eq Strand


-- | Parse blast XML output.
--   
--   If you use a recent version of NCBI BLAST and specify XML output
--   (blastall -m 7), this module should be able to parse the result into a
--   hierarchical <a>BlastResult</a> structure.
--   
--   While the process may consume a bit of memory, the parsing is lazy,
--   and file sizes of several gigabytes can be parsed (see e.g. the xml2x
--   tool for an example). To parse XML, we use <tt>Text.HTML.TagSoup</tt>.
module Bio.Alignment.BlastXML

-- | Parse BLAST results in XML format
readXML :: FilePath -> IO [BlastResult]


-- | This module implements a "flattened" data structure for Blast hits, as
--   opposed to the hierarchical structure in
--   <a>Bio.Alignment.BlastData</a>.
--   
--   The flat data type is useful in many cases where it is more natural to
--   see the result as a set of rows (e.g. for insertion in a database).
--   
--   It would probably be more (memory-) efficient to go the other way
--   (i.e. from flat to hierarchical), as passing the current, partially
--   built <a>BlastFlat</a> object down the stream of results and stamping
--   out a stream of completed ones. (See
--   <a>Bio.Alignment.BlastXML.breaks</a> for this week's most cumbersome
--   use of parallelism to avoid the memory issue.)
module Bio.Alignment.BlastFlat

-- | The BlastFlat data structure contains information about a single match
data BlastFlat
BlastFlat :: !SeqId -> !Int -> !SeqId -> !Int -> !Double -> !Double -> (Int, Int) -> !Int -> !Int -> !Int -> !Int -> !Aux -> BlastFlat
query :: BlastFlat -> !SeqId
qlength :: BlastFlat -> !Int
subject :: BlastFlat -> !SeqId
slength :: BlastFlat -> !Int
bits :: BlastFlat -> !Double
e_val :: BlastFlat -> !Double
identity :: BlastFlat -> (Int, Int)
q_from :: BlastFlat -> !Int
q_to :: BlastFlat -> !Int
h_from :: BlastFlat -> !Int
h_to :: BlastFlat -> !Int
aux :: BlastFlat -> !Aux
readXML :: FilePath -> IO [BlastFlat]

-- | Convert BlastRecords into BlastFlats (representing a depth-first
--   traversal of the BlastRecord structure.)
flatten :: [BlastRecord] -> [BlastFlat]

-- | Each query sequence generates a <a>BlastRecord</a>
data BlastRecord
blastprogram :: BlastResult -> ByteString
blastversion :: BlastResult -> ByteString
blastdate :: BlastResult -> ByteString
blastreferences :: BlastResult -> ByteString
database :: BlastResult -> ByteString
dbsequences :: BlastResult -> Integer
dbchars :: BlastResult -> Integer
results :: BlastResult -> [BlastRecord]

-- | The Aux field in the BLAST output includes match information that
--   depends on the BLAST flavor (blastn, blastx, or blastp). This data
--   structure captures those variations.
data Aux

-- | blastn
Strands :: !Strand -> !Strand -> Aux

-- | blastx
Frame :: !Strand -> !Int -> Aux

-- | The <a>Strand</a> indicates the direction of the match, i.e. the plain
--   sequence or its reverse complement.
data Strand
Plus :: Strand
Minus :: Strand


-- | This module implements a parser for BLAST results.
--   
--   This module is DEPRECATED. It is *very* recommended that you run blast
--   with XML output instead, and use the BlastXML module to parse it.
--   Don't say I didn't warn you!
--   
--   BLAST is a tool for searching in (biological) sequences for
--   similarity. This library is tested against NCBI-blast version 2.2.14.
--   There exist several independent versions, so expect some
--   incompatibilities if you're using a different BLAST version.
--   
--   The format is straightforward (and non-recursive), and this
--   implementation uses a simple line-based, hierarchical parser.
--   
--   For more information on BLAST, check
--   <a>http://www.ncbi.nlm.nih.gov/Education/BLASTinfo/information3.html</a>
module Bio.Alignment.Blast
parse :: ByteString -> BlastResult

module Bio.Sequence.SFF_name

-- | Read names encode various information, as per this struct.
data ReadName
ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName
date :: ReadName -> (Int, Int, Int)
time :: ReadName -> (Int, Int, Int)
region :: ReadName -> Int
x_loc :: ReadName -> Int
y_loc :: ReadName -> Int
decodeReadName :: ByteString -> Maybe ReadName
decodeLocation :: ByteString -> Maybe (Int, Int)
decodeDate :: ByteString -> Maybe [Int]
encodeReadName :: ReadName -> ByteString
encodeLocation :: Int -> Int -> ByteString
encodeRegion :: Int -> ByteString
encodeDate :: (Int, Int, Int) -> (Int, Int, Int) -> ByteString
divMods :: Int -> [Int] -> [Int]
decode36 :: ByteString -> Maybe Int
decCh :: Char -> Maybe Int
encode36 :: Int -> ByteString
b36 :: UArray Int Char
instance Show ReadName


-- | GeneOntology - parse and index Gene Ontology Annotations. In
--   particular, the file 'gene_association.goa_uniprot' that contains
--   links between GO terms and UniProt accessions.
--   
--   <ul>
--   <li><a>http://www.geneontology.org/ontology/gene_ontology.obo</a> --
--   Contains the hierarchy including isA relationships.</li>
--   <li><a>http://www.geneontology.org/GO.format.obo-1_2.shtml</a> --
--   Describes the OBO format.</li>
--   <li><a>ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/</a> --
--   Contains the GOA-UniProt mapping (and a README file).</li>
--   <li><a>http://www.geneontology.org/ontology/GO.defs</a> -- Contains GO
--   definitions (not supported here yet).</li>
--   <li><a>http://www.geneontology.org/doc/GO.terms_and_ids</a> -- GO
--   definitions, simpler and more schematically.</li>
--   </ul>
module Bio.Sequence.GeneOntology

-- | A GO term is a positive integer
newtype GoTerm
GO :: Int -> GoTerm

-- | A GoDef maps a <a>GoTerm</a> to a description and a <a>GoClass</a>.
data GoDef
GoDef :: !GoTerm -> !ByteString -> !GoClass -> GoDef

-- | A list of Go definitions, with pointers to parent nodes. Read from the
--   .obo file. The user may construct the explicit hierarchy by storing
--   these in a Map or similar
type GoHierarchy = [(GoDef, [GoTerm])]

-- | Read the GO hierarchy from the obo file. Note that this is not quite a
--   tree structure.
readObo :: FilePath -> IO GoHierarchy

-- | Read GO term definitions, from the GO.terms_and_ids file
readTerms :: FilePath -> IO [GoDef]

-- | A GOA annotation, containing a UniProt identifier, a GoTerm and an
--   evidence code.
data Annotation
Ann :: !UniProtAcc -> !GoTerm -> !EvidenceCode -> Annotation

-- | A UniProt identifier (short string of capitals and numbers).
type UniProtAcc = ByteString
data GoClass
Func :: GoClass
Proc :: GoClass
Comp :: GoClass

-- | Evidence codes describe the type of support for an annotation
--   <a>http://www.geneontology.org/GO.evidence.shtml</a>
data EvidenceCode

-- | Inferred by Curator
IC :: EvidenceCode

-- | Inferred from Direct Assay
IDA :: EvidenceCode

-- | Inferred from Electronic Annotation
IEA :: EvidenceCode

-- | Inferred from Expression Pattern
IEP :: EvidenceCode

-- | Inferred from Genomic Context
IGC :: EvidenceCode

-- | Inferred from Genetic Interaction
IGI :: EvidenceCode

-- | Inferred from Mutant Phenotype
IMP :: EvidenceCode

-- | Inferred from Physical Interaction
IPI :: EvidenceCode

-- | Inferred from Sequence or Structural Similarity
ISS :: EvidenceCode

-- | Non-traceable Author Statement
NAS :: EvidenceCode

-- | No biological Data available
ND :: EvidenceCode

-- | Inferred from Reviewed Computational Analysis
RCA :: EvidenceCode

-- | Traceable Author Statement
TAS :: EvidenceCode

-- | Not Recorded
NR :: EvidenceCode

-- | Read the goa_uniprot file (warning: this one is huge!)
readGOA :: FilePath -> IO [Annotation]

-- | The vast majority of GOA data is IEA, while the most reliable
--   information is manually curated. Filtering on this is useful to keep
--   data set sizes manageable, too.
isCurated :: EvidenceCode -> Bool
decomment :: ByteString -> [ByteString]
instance Read EvidenceCode
instance Show EvidenceCode
instance Eq EvidenceCode
instance Show Annotation
instance Show GoDef
instance Eq GoTerm
instance Ord GoTerm
instance Show GoClass
instance Read GoClass
instance Show GoTerm
instance Read GoTerm


-- | Functionality for manipulating KEGG annotations.
--   
--   KEGG is a bit hard to find, but there exist species-specific tables.
--   Available organisms are listed in the table at
--   
--   <a>ftp://ftp.genome.jp/pub/kegg/genes/etc/all_species.tab</a>
--   
--   Data for each organism is stored in its own subdirectory under
--   
--   <a>ftp://ftp.genome.jp/pub/kegg/genes/organisms/</a>
--   
--   Containing tables linking everything -- including external resources
--   like UniProt, PDB, or NCBI -- together.
module Bio.Sequence.KEGG

-- | Most KEGG files that contain associations, have one association per
--   line, consisting of two items separated by whitespace. This is a
--   generalized reader function.
genReadKegg :: FilePath -> IO [(ByteString, ByteString)]
newtype KO
KO :: ByteString -> KO

-- | Convert UniProt IDs (up:xxxxxx) to the <a>UniProtAcc</a> type.
decodeUP :: ByteString -> UniProtAcc

-- | Convert KO IDs (ko:xxxxx) to the <a>KO</a> data type.
decodeKO :: ByteString -> KO

-- | KEGG uses strings with an identifying prefix for IDs. This helper
--   function checks and removes prefix to construct native values.
removePrefix :: String -> String -> (ByteString -> a) -> ByteString -> a
instance Show KO


-- | Moved to GeneOntology - this is for backwards compatibility.
module Bio.Sequence.GOA
readGO :: FilePath -> IO [GoDef]

module Bio.Sequence.Entropy
class KWords s
kwords :: KWords s => Int -> s -> [s]
entropy :: (Ord str, KWords str) => Int -> str -> Double
instance KWords [a]


-- | Data structures for manipulating (biological) sequences.
--   
--   Generally supports both nucleotide and protein sequences; some
--   functions, like <tt>revcompl</tt>, only make sense for nucleotides.
module Bio.Sequence.SeqData

-- | A sequence consists of a header, the sequence data itself, and
--   optional quality data. The type parameter is a phantom type to
--   separate nucleotide and amino acid sequences
data Sequence t

-- | header and actual sequence
Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence t

-- | An offset, index, or length of a <a>SeqData</a>
type Offset = Int64

-- | The basic data type used in <a>Sequence</a>s
type SeqData = ByteString

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8

-- | Quality data is a <a>Qual</a> vector, currently implemented as a
--   <tt>ByteString</tt>.
type QualData = ByteString

-- | Read the character at the specified position in the sequence.
(!) :: Sequence a -> Offset -> Char

-- | Return sequence length.
seqlength :: Sequence a -> Offset

-- | Return sequence label (first word of header)
seqlabel :: Sequence a -> SeqData

-- | Return full header.
seqheader :: Sequence a -> SeqData

-- | Return the sequence data.
seqdata :: Sequence a -> SeqData
(?) :: Sequence a -> Offset -> Qual

-- | Check whether the sequence has associated quality data.
hasqual :: Sequence a -> Bool

-- | Return the quality data, or error if none exist. Use hasqual if in
--   doubt.
seqqual :: Sequence a -> QualData
appendHeader :: Sequence a -> String -> Sequence a

-- | Modify the header by appending text, or by replacing all but the
--   sequence label (i.e. first word).
setHeader :: Sequence a -> String -> Sequence a

-- | Convert a String to <a>SeqData</a>
fromStr :: String -> SeqData

-- | Convert a <a>SeqData</a> to a String
toStr :: SeqData -> String

-- | Returns a sequence with all internal storage freshly copied and with
--   sequence and quality data present as a single chunk.
--   
--   By freshly copying internal storage, <a>defragSeq</a> allows garbage
--   collection of the original data source whence the sequence was read;
--   otherwise, use of just a short sequence name can cause an entire
--   sequence file buffer to be retained.
--   
--   By compacting sequence data into a single chunk, <a>defragSeq</a>
--   avoids linear-time traversal of sequence chunks during random access
--   into sequence data.
defragSeq :: Sequence t -> Sequence t

-- | map over sequences, treating them as a sequence of (char,word8) pairs.
--   This will work on sequences without quality, as long as the function
--   doesn't try to examine it. The current implementation is not very
--   efficient.
seqmap :: ((Char, Qual) -> (Char, Qual)) -> Sequence t -> Sequence t

-- | Phantom type functionality, unchecked conversion between sequence
--   types
castSeq :: Sequence a -> Sequence b

-- | Complement a single character. I.e. identify the nucleotide it can
--   hybridize with. Note that for multiple nucleotides, you usually want
--   the reverse complement (see <a>revcompl</a> for that).
compl :: Char -> Char

-- | Calculate the reverse complement. This is only relevant for the
--   nucleotide alphabet, and it leaves other characters unmodified.
revcompl :: Sequence Nuc -> Sequence Nuc

-- | Calculate the reverse complement for SeqData only.
revcompl' :: SeqData -> SeqData

-- | For type tagging sequences (protein sequences use <a>Amino</a> below)
data Nuc
castToNuc :: Sequence a -> Sequence Nuc
data Amino
Ala :: Amino
Arg :: Amino
Asn :: Amino
Asp :: Amino
Cys :: Amino
Gln :: Amino
Glu :: Amino
Gly :: Amino
His :: Amino
Ile :: Amino
Leu :: Amino
Lys :: Amino
Met :: Amino
Phe :: Amino
Pro :: Amino
Ser :: Amino
Thr :: Amino
Tyr :: Amino
Trp :: Amino
Val :: Amino
STP :: Amino
Asx :: Amino
Glx :: Amino
Xle :: Amino
Xaa :: Amino

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence. This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence Nuc -> Offset -> [Amino]

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
castToAmino :: Sequence a -> Sequence Amino

-- | A simple function to display a sequence: we generate the sequence
--   string and call putStrLn.
putSeqLn :: Sequence a -> Int -> Int -> [(Int, Int)] -> IO ()

-- | Returns a properly formatted and probably highlighted string
--   representation of a sequence. Highlighting is done using ANSI-Escape
--   sequences.
seqToStr :: Sequence a -> Int -> Int -> [(Int, Int)] -> [Char]
data Unknown
instance Show Amino
instance Eq Amino
instance Eq (Sequence t)
instance Show (Sequence a)


-- | This module incorporates functionality for reading and writing
--   sequence data in the Fasta format. Each sequence consists of a header
--   (with a <a>&gt;</a> prefix) and a set of lines containing the sequence
--   data.
--   
--   As Fasta is used for both amino acids and nucleotides, the resulting
--   <a>Sequence</a>s are type-tagged with <a>Unknown</a>. If you know the
--   type of sequence you are reading, use <a>castToAmino</a> or
--   <a>castToNuc</a>.
module Bio.Sequence.Fasta

-- | Lazily read sequences from a FASTA-formatted file
readFasta :: FilePath -> IO [Sequence Unknown]

-- | Write sequences to a FASTA-formatted file. Line length is 60.
writeFasta :: FilePath -> [Sequence a] -> IO ()

-- | Lazily read sequence from handle
hReadFasta :: Handle -> IO [Sequence Unknown]

-- | Write sequences in FASTA format to a handle.
hWriteFasta :: Handle -> [Sequence a] -> IO ()

-- | Read quality data for sequences from a file.
readQual :: FilePath -> IO [Sequence Unknown]

-- | Write quality data for sequences to a file.
writeQual :: FilePath -> [Sequence a] -> IO ()
hWriteQual :: Handle -> [Sequence a] -> IO ()

-- | Read sequence and associated quality. Will error if the sequences and
--   qualities do not match one-to-one in sequence.
readFastaQual :: FilePath -> FilePath -> IO [Sequence Unknown]
hWriteFastaQual :: Handle -> Handle -> [Sequence a] -> IO ()

-- | Write sequence and quality data simultaneously. This may be more
--   laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence a] -> IO ()
countSeqs :: FilePath -> IO Int

-- | Convert a list of FASTA-formatted lines into a list of sequences.
--   Blank lines are ignored. Comment lines starting with <a>#</a> are allowed
--   between sequences (and ignored). Lines starting with <a>&gt;</a>
--   initiate a new sequence.
mkSeqs :: [ByteString] -> [Sequence Unknown]

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8


-- | Support the FastQ format that combines sequence and quality. See:
--   
--   <ul>
--   <li><a>http://www.bioperl.org/wiki/FASTQ_sequence_format</a></li>
--   </ul>
--   
--   Of course, this is yet another vaguely defined pseudo-standard with
--   conflicting definitions. Of course Solexa had to go and invent not
--   one, but two different, and indistinguishably so, ways to do it:
--   
--   <ul>
--   
--   <li><a>http://www.bcgsc.ca/pipermail/ssrformat/2007-March/000137.html</a></li>
--   <li><a>http://maq.sourceforge.net/fastq.shtml</a></li>
--   <li><a>http://en.wikipedia.org/wiki/FASTQ_format</a></li>
--   </ul>
--   
--   Sanger-style FastQ-format is supported with the (h)read/writeSangerQ
--   functions, and the new Illumina/Solexa-style with
--   (h)read/writeIllumina.
--   
--   As far as I know, FastQ is only used for nucleotide sequences, never
--   amino acid.
module Bio.Sequence.FastQ
readFastQ :: FilePath -> IO [Sequence Nuc]
hReadFastQ :: Handle -> IO [Sequence Nuc]

-- | Parse one FastQ entry, suitable for using in <a>unfoldr</a> over
--   <a>lines</a> from a file
parse :: [ByteString] -> Maybe (Either String (Sequence Nuc), [ByteString])
writeFastQ :: FilePath -> [Sequence Nuc] -> IO ()
hWriteFastQ :: Handle -> [Sequence Nuc] -> IO ()
unparse :: Sequence Nuc -> ByteString
readSangerQ :: FilePath -> IO [Sequence Nuc]
hReadSangerQ :: Handle -> IO [Sequence Nuc]
writeSangerQ :: FilePath -> [Sequence Nuc] -> IO ()
hWriteSangerQ :: Handle -> [Sequence Nuc] -> IO ()
readIllumina :: FilePath -> IO [Sequence Nuc]
hReadIllumina :: Handle -> IO [Sequence Nuc]
writeIllumina :: FilePath -> [Sequence Nuc] -> IO ()
hWriteIllumina :: Handle -> [Sequence Nuc] -> IO ()


-- | Parse phd files (phred base calling output).
module Bio.Sequence.Phd

-- | Parse a .phd file, extracting the contents as a Sequence
readPhd :: FilePath -> IO (Sequence Nuc)

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO (Sequence Nuc)


-- | This module implements the 2bit format for sequences.
--   
--   Based on: <a>http://genome.ucsc.edu/FAQ/FAQformat#format7</a> Note!
--   the description is not accurate, it is missing a reserved word in each
--   sequence record.
--   
--   There are also other, completely different ideas of the 2bit format,
--   e.g. <a>http://jcomeau.freeshell.org/www/genome/2bitformat.html</a>
module Bio.Sequence.TwoBit

-- | Parse a (lazy) ByteString as sequences in the 2bit format.
decode2Bit :: ByteString -> [Sequence Nuc]

-- | Read sequences from a file in 2bit format and unmarshall/deserialize
--   into Sequence format.
read2Bit :: FilePath -> IO [Sequence Nuc]

-- | Read sequences from a file handle in the 2bit format and
--   unmarshall/deserialize into Sequence format.
hRead2Bit :: Handle -> IO [Sequence Nuc]

-- | Marshall from neutral representation to the 2Bit ByteString rep
encode2Bit :: [Sequence a] -> ByteString

-- | Marshall/serialize [Sequence] into 2Bit format and write to a file.
write2Bit :: FilePath -> [Sequence a] -> IO ()

-- | Marshall/serialize [Sequence] into 2Bit format and write to a file
--   using handle.
hWrite2Bit :: Handle -> [Sequence a] -> IO ()
instance Show SRLE
instance Show SRBE
instance Show SR
instance Show Entry
instance Binary SRLE
instance Binary SRBE
instance Binary Entries
instance Show Entries
instance Binary Entry
instance Binary Header
instance Show Header


-- | Read and write the SFF file format used by Roche/454 sequencing to
--   store flowgram data.
--   
--   A flowgram is a series of values (intensities) representing
--   homopolymer runs of A,G,C, and T in a fixed cycle, and usually
--   displayed as a histogram.
--   
--   This file is based on information in the Roche FLX manual. Among other
--   sources for information about the format, are The Staden Package,
--   which contains an io_lib with a C routine for parsing this format.
--   According to comments in the sources, the io_lib implementation is
--   based on a file called getsff.c, which I've been unable to track down.
--   Other software parsing SFFs are QIIME, sff_extract, and Celera's
--   sffToCa.
--   
--   It is believed that all values are stored big endian.
module Bio.Sequence.SFF

-- | The data structure storing the contents of an SFF file (modulo the
--   index)
data SFF
SFF :: !CommonHeader -> [ReadBlock] -> SFF

-- | SFF has a 31-byte common header
--   
--   The format is open to having the index anywhere between reads, we
--   should really keep count and check for each read. In practice, it
--   seems to be placed after the reads.
--   
--   The following two fields are considered part of the header, but as
--   they are static, they are not part of the data structure
--   
--   <pre>
--      
--   magic   :: Word32   -- 0x2e736666, i.e. the string ".sff"
--   version :: Word32   -- 0x00000001
--   </pre>
data CommonHeader
CommonHeader :: Int64 -> Int32 -> Int32 -> Int16 -> Int16 -> Word8 -> ByteString -> ByteString -> CommonHeader

-- | Points to a text(?) section
index_offset :: CommonHeader -> Int64
index_length :: CommonHeader -> Int32
num_reads :: CommonHeader -> Int32
key_length :: CommonHeader -> Int16
flow_length :: CommonHeader -> Int16
flowgram_fmt :: CommonHeader -> Word8
flow :: CommonHeader -> ByteString
key :: CommonHeader -> ByteString

-- | Each Read has a fixed read header, containing various information.
data ReadHeader
ReadHeader :: Int16 -> Int32 -> Int16 -> Int16 -> Int16 -> Int16 -> ByteString -> ReadHeader
name_length :: ReadHeader -> Int16
num_bases :: ReadHeader -> Int32
clip_qual_left :: ReadHeader -> Int16
clip_qual_right :: ReadHeader -> Int16
clip_adapter_left :: ReadHeader -> Int16
clip_adapter_right :: ReadHeader -> Int16
read_name :: ReadHeader -> ByteString

-- | This contains the actual flowgram for a single read.
data ReadBlock
ReadBlock :: !ReadHeader -> !ByteString -> !ByteString -> !SeqData -> !QualData -> ReadBlock
read_header :: ReadBlock -> !ReadHeader
flow_data :: ReadBlock -> !ByteString
flow_index :: ReadBlock -> !ByteString
bases :: ReadBlock -> !SeqData
quality :: ReadBlock -> !QualData

-- | Read an SFF file.
readSFF :: FilePath -> IO SFF

-- | Write an <a>SFF</a> to the specified file name
writeSFF :: FilePath -> SFF -> IO ()

-- | Write an <a>SFF</a> to the specified file name, but go back and update
--   the read count. Useful if you want to output a lazy stream of
--   <a>ReadBlock</a>s. Returns the number of reads written.
writeSFF' :: FilePath -> SFF -> IO Int

-- | Read an SFF file, but be resilient against errors.
recoverSFF :: FilePath -> IO SFF

-- | Extract the sequences from an <a>SFF</a> data structure.
sffToSequence :: SFF -> [Sequence Nuc]

-- | Extract the sequence information from a <a>ReadBlock</a>.
rbToSequence :: ReadBlock -> Sequence Nuc

-- | Trim a read according to clipping information
trim :: ReadBlock -> ReadBlock

-- | Trim a read to specific sequence position, inclusive bounds. The
--   current implementation has the unintended side effect of always
--   trimming the flowgram down to a basecalled position. Note that you
--   can't (easily) write trimmed <a>ReadBlock</a>s to a file, since they
--   need to have the same number of flows as given in the
--   <tt>CommonHeader</tt>.
trimFromTo :: Integral i => i -> i -> ReadBlock -> ReadBlock

-- | Extract the read without the initial (TCAG) key.
trimKey :: CommonHeader -> Sequence Nuc -> Maybe (Sequence Nuc)

-- | Convert a sequence position to the corresponding flow position
baseToFlowPos :: Integral i => ReadBlock -> i -> Int

-- | Convert a flow position to the corresponding sequence position
flowToBasePos :: Integral i => ReadBlock -> i -> Int

-- | Trim a <a>ReadBlock</a> limiting the number of flows. If writing to an
--   SFF file, make sure you update the <a>CommonHeader</a> accordingly.
--   See <tt>examples/Flx.hs</tt> for how to use this.
trimFlows :: Integral i => i -> ReadBlock -> ReadBlock

-- | test serialization by output'ing the header and first two reads in an
--   SFF, and the same after a decode + encode cycle.
test :: FilePath -> IO ()

-- | Convert a file by decoding it and re-encoding it. This will lose the
--   index (which isn't really necessary)
convert :: FilePath -> IO ()

-- | Helper function to access the flowgram
flowgram :: ReadBlock -> [Flow]

-- | Extract the sequence with masked bases in lower case
masked_bases :: ReadBlock -> SeqData

-- | Extract the index as absolute coordinates, not relative.
cumulative_index :: ReadBlock -> [Int]

-- | Pack a list of flows into the corresponding binary structure (the
--   flow_data field)
packFlows :: [Flow] -> ByteString

-- | Unpack the flow_data field into a list of flow values
unpackFlows :: ByteString -> [Flow]

-- | The type of flowgram value
type Flow = Int16

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8
type Index = Word8

-- | The basic data type used in <a>Sequence</a>s
type SeqData = ByteString

-- | Quality data is a <a>Qual</a> vector, currently implemented as a
--   <tt>ByteString</tt>.
type QualData = ByteString

-- | Read names encode various information, as per this struct.
data ReadName
ReadName :: (Int, Int, Int) -> (Int, Int, Int) -> Int -> Int -> Int -> ReadName
date :: ReadName -> (Int, Int, Int)
time :: ReadName -> (Int, Int, Int)
region :: ReadName -> Int
x_loc :: ReadName -> Int
y_loc :: ReadName -> Int
decodeReadName :: ByteString -> Maybe ReadName
encodeReadName :: ReadName -> ByteString
instance Binary PartialReadHeader
instance Binary RSFF
instance Show ReadBlock
instance Binary ReadHeader
instance Show ReadHeader
instance Binary CommonHeader
instance Show CommonHeader
instance Binary SFF
instance Show SFF
instance Binary RBI


-- | This implements a number of filters used in the Titanium pipeline,
--   based on published documentation.
module Bio.Sequence.SFF_filters

-- | DiscardFilters determine whether a read is to be retained or discarded
type DiscardFilter = ReadBlock -> Bool

-- | This filter discards empty sequences.
discard_empty :: DiscardFilter

-- | Discard sequences that don't have the given key tag (typically TCAG)
--   at the start of the read.
discard_key :: String -> DiscardFilter

-- | <ol>
--   <li>2.2.1.2 The <a>dots</a> filter discards sequences where the last
--   positive flow is before flow 84, and flows with &gt;5% dots (i.e.
--   three successive noise values) before the last positive flow. The
--   percentage can be given as a parameter.</li>
--   </ol>
discard_dots :: Double -> DiscardFilter

-- | <ol>
--   <li>2.2.1.3 The <a>mixed</a> filter discards sequences with more than
--   70% positive flows. Also, discard with &lt;30% noise, &gt;20% middle
--   (0.45..0.75) or &lt;30% positive.</li>
--   </ol>
discard_mixed :: DiscardFilter

-- | Discard a read if the number of untrimmed flows is less than n (n=186
--   for Titanium)
discard_length :: Int -> DiscardFilter

-- | TrimFilters modify the read, typically trimming it for quality
type TrimFilter = ReadBlock -> ReadBlock

-- | <ol>
--   <li>2.2.1.4 Signal intensity trim - trim back until &lt;3% borderline
--   flows (0.5..0.7). Then trim borderline values or dots from the end
--   (use a window).</li>
--   </ol>
trim_sigint :: TrimFilter
sigint :: ReadBlock -> Int

-- | <ol>
--   <li>2.2.1.5 Primer filter. This looks for the B-adaptor at the end of
--   the read. The 454 implementation isn't very effective at finding
--   mutated adaptors.</li>
--   </ol>
trim_primer :: String -> TrimFilter
find_primer :: String -> ReadBlock -> Int

-- | <ol>
--   <li>2.2.1.7 Quality score trimming trims using a 10-base window until
--   a Q20 average is found.</li>
--   </ol>
trim_qual20 :: Int -> TrimFilter
qual20 :: Int -> ReadBlock -> Int

-- | List length as a double (eliminates many instances of fromIntegral)
dlength :: [a] -> Double

-- | Calculate average of a list
avg :: Integral a => [a] -> Double

-- | Translate a number of flows to position in sequence, and update
--   clipping data accordingly
clipFlows :: ReadBlock -> Int -> ReadBlock

-- | Update clip_qual_right if more severe than previous value
clipSeq :: ReadBlock -> Int -> ReadBlock

module Bio.Sequence.HashWord

-- | This is a struct for containing a set of hashing functions
data HashF k
HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k

-- | calculates the hash at a given offset in the sequence
hash :: HashF k -> SeqData -> Offset -> Maybe k

-- | calculate all hashes from a sequence, and their indices
hashes :: HashF k -> SeqData -> [(k, Offset)]

-- | for sorting hashes
ksort :: HashF k -> [k] -> [k]

-- | Adds a default <a>hashes</a> function to a <tt>HashF</tt>, when
--   <a>hash</a> is defined.
genkeys :: HashF k -> HashF k

-- | Contigous constructs an int/eger from a contiguous k-word.
contigous :: Integral k => Int -> HashF k

-- | Like <a>contigous</a>, but returns the same hash for a word and its
--   reverse complement.
rcontig :: Integral k => Int -> HashF k
compact :: SeqData -> [SeqData]

-- | Like <tt>rcontig</tt>, but ignoring monomers (i.e. arbitrarily long
--   runs of a single nucleotide are treated the same as a single
--   nucleotide).
rcpacked :: Integral k => Int -> HashF k
type Shape = String
gapped :: Integral k => Shape -> HashF k
isN :: Char -> Bool
n2k :: Integral k => Int -> SeqData -> k
n2i' :: Num a => a -> SeqData -> a
k2n :: Integral k => Int -> k -> SeqData
val :: Num t => Char -> t
unval :: Num a => a -> Char
complement :: Char -> Char


-- | Encodes a table of amino acid properties. Based on Livingstone &amp;
--   Barton, CABIOS, 9, 745-756, 1993, as seen at:
--   <a>http://www.compbio.dundee.ac.uk/user/ws-dev1/jalview/latest/help/html/misc/aaproperties.html</a>
--   NB: based on the graphic, not the table (in which P is polar, but T is
--   not).
module Bio.Sequence.AminoProperties
type AAProp = Amino -> Bool
aromatic :: AAProp
hydrophobic :: AAProp
polar :: AAProp
small :: AAProp
tiny :: AAProp
charged :: AAProp
negative :: AAProp
positive :: AAProp
aliphatic :: AAProp
hydropathy :: Amino -> Double
mass :: Amino -> Double

-- | The propensities for forming secondary structures. From Zvelebil and
--   Baum: Understanding Bioinformatics, Chapter 11 citing Chou and Fasman.
--   Today, more complex methods like GOR are recommended instead.
strandP :: Amino -> Double
helixP :: Amino -> Double


-- | Data structures and helper functions for calculating alignments
--   
--   There are two ways to view an alignment: either as a list of edits
--   (i.e., insertions, deletions, or substitutions), or as a set of
--   sequences with inserted gaps.
--   
--   The edit list approach is perhaps a more restrictive model but doesn't
--   generalize to multiple alignments.
--   
--   The gap approach is more general, and probably more commonly used by
--   other software (see e.g. the ACE file format).
module Bio.Alignment.AlignData
data Dir
Fwd :: Dir
Rev :: Dir
type Gaps = [Offset]
type Alignment a = [(Offset, Dir, Sequence a, Gaps)]

-- | Gaps are coded as <a>*</a>s; this function removes them, and returns
--   the sequence along with the list of gap positions. Note that gaps are
--   positioned relative to the *gapped* sequence (contrast to
--   stmassembler/Cluster.hs).
extractGaps :: SeqData -> (SeqData, Gaps)
insertGaps :: Char -> (SeqData, Gaps) -> SeqData

-- | An Edit is either the insertion, the deletion, or the replacement of a
--   character.
data Edit
Ins :: Chr -> Edit
Del :: Chr -> Edit
Repl :: Chr -> Chr -> Edit

-- | An alignment is a sequence of edits.
type EditList = [Edit]

-- | A substitution matrix gives scores for replacing a character with
--   another. Typically, it will be symmetric. It is type-tagged with the
--   alphabet - Nuc or Amino.
type SubstMx t a = (Chr, Chr) -> a

-- | A Selector consists of a zero element, and a function that chooses a
--   possible Edit operation, and generates an updated result.
type Selector a = [(a, Edit)] -> a

-- | The sequence element type, used in alignments.
type Chr = Word8

-- | Calculate a set of columns containing scores. This represents the
--   columns of the alignment matrix, but will only require linear space
--   for score calculation.
columns :: Selector a -> a -> Sequence b -> Sequence b -> [[a]]

-- | Evaluate an Edit based on SubstMx and gap penalty
eval :: SubstMx t a -> a -> Edit -> a

-- | True if the Edit is a Repl.
isRepl :: Edit -> Bool
on :: (t1 -> t1 -> t) -> (t2 -> t1) -> t2 -> t2 -> t
showalign :: EditList -> [Char]

-- | turn an alignment into sequences with <a>-</a> representing gaps (for
--   checking, filtering out the <a>-</a> characters should return the
--   original sequences, provided <a>-</a> isn't part of the sequence
--   alphabet)
toStrings :: EditList -> (String, String)
instance Show Edit
instance Eq Edit
instance Eq Dir
instance Show Dir


-- | Common substitution matrices for alignments.
--   
--   When in doubt, use BLOSUM62. Consult
--   <a>http://www.ncbi.nlm.nih.gov/blast/blast_whatsnew.shtml#20051206</a>
--   for some hints on good parameters for nucleotide alignments.
--   
--   See also <a>http://en.wikipedia.org/wiki/Substitution_matrix</a> for a
--   summary about the difference between the different matrices.
module Bio.Alignment.Matrices

-- | BLOSUM45 matrix, suitable for distantly related sequences
blosum45 :: (Chr, Chr) -> Int

-- | The standard BLOSUM62 matrix.
blosum62 :: (Chr, Chr) -> Int

-- | BLOSUM80 matrix, suitable for closely related sequences.
blosum80 :: (Chr, Chr) -> Int

-- | The standard PAM30 matrix
pam30 :: (Chr, Chr) -> Int

-- | The standard PAM70 matrix.
pam70 :: (Chr, Chr) -> Int

-- | Blast defaults, use with gap_open = -5, gap_extend = -3. This should
--   really check for valid nucleotides, and perhaps be more lenient in the
--   case of Ns. Oh well.
blastn_default :: Num a => (Chr, Chr) -> a

-- | Construct a simple <a>matrix</a> from match score/mismatch penalty
simpleMx :: Num a => a -> a -> (Chr, Chr) -> a


-- | Simple alignment of sequences
--   
--   Standard alignment/edit distance
module Bio.Alignment.SAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: (Num a, Ord a) => SubstMx t a -> a -> Sequence t -> Sequence t -> a
local_align :: (Num a, Ord a) => SubstMx t a -> a -> Sequence t -> Sequence t -> EditList

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: (Num a, Ord a) => SubstMx t a -> a -> Sequence t -> Sequence t -> a

-- | Calculate alignments.
global_align :: (Num a, Ord a) => SubstMx t a -> a -> Sequence t -> Sequence t -> EditList


-- | Implement alignments/edit distance with affine gap penalties
--   
--   I've seen g = (-10,-1) as the suggested price to pay for gaps using
--   BLOSUM62. Good choice as any, I guess.
module Bio.Alignment.AAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: (Num a, Ord a) => SubstMx t a -> (a, a) -> Sequence t -> Sequence t -> a

-- | Calculate local alignment (Smith-Waterman)
local_align :: (Num a, Ord a) => SubstMx t a -> (a, a) -> Sequence t -> Sequence t -> (a, EditList)

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: (Num a, Ord a) => SubstMx t a -> (a, a) -> Sequence t -> Sequence t -> a

-- | Calculate global alignment (Needleman-Wunsch)
global_align :: (Num a, Ord a) => SubstMx t a -> (a, a) -> Sequence t -> Sequence t -> (a, EditList)


-- | Quality-aware alignments
--   
--   Generally, quality data are ignored for alignment/pattern searching
--   like Smith-Waterman, Needleman-Wunsch, or BLAST(p|n|x). I believe that
--   accounting for quality will at the very least affect things like BLAST
--   statistics, and e.g. is crucial for good EST annotation using Blastx.
--   
--   This module performs sequence alignments, taking quality values into
--   account.
--   
--   See also
--   <a>http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btn052v1</a>.
module Bio.Alignment.QAlign

-- | Calculate local edit distance (Smith-Waterman alignment score)
local_score :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> Double

-- | Calculate local alignment (Smith-Waterman) (can we replace uncurry
--   max' with fst - a local alignment must always end on a subst, no?)
local_align :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> (Double, EditList)

-- | Calculate global edit distance (Needleman-Wunsch alignment score)
global_score :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> Double

-- | Calculate global alignment (Needleman-Wunsch)
global_align :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> (Double, EditList)

-- | Calculate best overlap score, where gaps at the edges are free. The
--   starting point is like for local score (0 cost for initial indels),
--   the result is the maximum anywhere in the last column or bottom row of
--   the matrix.
overlap_score :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> Double

-- | Calculate best overlap score, where gaps at the edges are free. The
--   starting point is like for local score (0 cost for initial indels),
--   the result is the maximum anywhere in the last column or bottom row of
--   the matrix.
overlap_align :: QualMx t Double -> (Double, Double) -> Sequence t -> Sequence t -> (Double, EditList)
qualMx :: Qual -> Qual -> (Chr, Chr) -> Double
test :: IO ()


-- | Read ACE format assembly files
--   
--   These are typically output by sequence assembly tools, like CAP3 or
--   Phrap.
--   
--   Documented in the section labelled "ACE FILE FORMAT" at
--   <a>http://bozeman.mbt.washington.edu/consed/distributions/README.14.0.txt</a>
--   
--   Briefly: each field is a line starting with a two letter code, in some
--   cases followed by data lines terminated by a blank line. Here's a
--   brief example of what an ACE file looks like:
--   
--   <pre>
--   AS contigs reads
--   CO contig_name bases reads segments compl (CAP3: segments=0)
--   sequence
--   BQ base_qualities
--   AF read1 compl padded_start_consensus (negatives meaning?)
--   AF read2 ..
--   BS segments
--   RD read1 bases info_items info_tags (latter two set to 0 by CAP3)
--   sequence
--   QA read1 qual_start qual_end align_start align_end
--   DS (phred header? left empty by CAP3)
--   RD read2 ...
--   </pre>
--   
--   As far as I know, this is only used for nucleotide sequences.
module Bio.Alignment.ACE

-- | Reading an ACE file.
readACE :: FilePath -> IO [[Assembly]]
writeACE :: FilePath -> [Assembly] -> IO ()
data Assembly
Asm :: (Sequence Nuc, Gaps) -> Alignment Nuc -> Assembly
contig :: Assembly -> (Sequence Nuc, Gaps)
fragments :: Assembly -> Alignment Nuc

-- | Test parser p on a list of ACE elements
ptest :: Show a => String -> AceParser a -> [ACE] -> IO ()
reads :: Assembly -> Alignment Nuc
instance Eq ACE
instance Show Assembly
instance Show ACE


-- | Data types for functorially lifting sequence positions and locations
--   onto named sequences. These are useful for taking functions that work
--   with sequence positions and locations and associating them with
--   specific, named sequences.
module Bio.Location.OnSeq

-- | Sequence name, as in a <a>Sequence</a>
type SeqName = SeqData

-- | Data type for an object associated with a specific, named sequence
data OnSeq a
OnSeq :: !SeqName -> !a -> OnSeq a
onSeqName :: OnSeq a -> !SeqName
onSeqObj :: OnSeq a -> !a

-- | Looks up a sequence by name and applies a function to it
withSeqData :: Monad m => (SeqData -> a -> m b) -> (SeqName -> m SeqData) -> OnSeq a -> m b

-- | Tests a predicate when two objects are on the same sequence, returning
--   <tt>False</tt> if they are on different sequences.
andSameSeq :: (a -> b -> Bool) -> OnSeq a -> OnSeq b -> Bool

-- | Performs an action when two objects are on the same sequence and
--   produces an error otherwise.
onSameSeq :: (Error e, MonadError e m) => (a -> b -> m c) -> OnSeq a -> OnSeq b -> m c

-- | Data type for a collection of objects indexed by sequence name
type OnSeqs a = Map SeqName a

-- | Lifts a function on an underlying object to look up the sequence name
--   in a name-indexed collection.
perSeq :: Monoid b => (a -> b -> c) -> OnSeq a -> OnSeqs b -> c

-- | Lifts a function that updates an underlying object to look up the
--   named sequence and update a named-index collection.
perSeqUpdate :: Monoid b => (a -> b -> b) -> OnSeq a -> OnSeqs b -> OnSeqs b

-- | Lifts a function on underlying objects to look up a sequence in a
--   name-indexed collection
withNameAndSeq :: Monad m => (SeqName -> a -> b -> m c) -> OnSeq a -> OnSeqs b -> m c
instance Eq a => Eq (OnSeq a)
instance Ord a => Ord (OnSeq a)
instance Show a => Show (OnSeq a)
instance Functor OnSeq


-- | Utilities for manipulating nucleotide sequences and locations on
--   nucleotide sequences that occur on a forward or a reverse-complement
--   strand.
module Bio.Location.Strand

-- | Sequence strand
data Strand
Fwd :: Strand
RevCompl :: Strand

-- | A nucleotide sequence or location on a nucleotide sequence that lies
--   on a specific strand and has an orientation.
class Stranded s
revCompl :: Stranded s => s -> s

-- | Convert the orientation of a <a>Stranded</a> thing based on a
--   specified <a>Strand</a>
stranded :: Stranded s => Strand -> s -> s
instance Eq Strand
instance Ord Strand
instance Show Strand
instance Read Strand
instance Bounded Strand
instance Enum Strand
instance Ix Strand
instance Stranded ByteString
instance Stranded Char
instance Stranded Strand


-- | Data type for a sequence position.
--   
--   Zero-based <a>Offset</a> / <tt>Int64</tt> indices are used throughout,
--   to facilitate direct use of indexing functions on <a>SeqData</a>.
module Bio.Location.Position

-- | Position in a sequence
data Pos
Pos :: !Offset -> !Strand -> Pos

-- | 0-based index of the position
offset :: Pos -> !Offset

-- | Strand of the position
strand :: Pos -> !Strand

-- | Returns a position resulting from sliding the original position along
--   the sequence by a specified offset. A positive offset will move the
--   position away from the 5' end of the forward strand of the sequence
--   regardless of the strand of the position itself. Thus,
--   
--   <pre>
--   slide (revCompl pos) off == revCompl (slide pos off)
--   </pre>
slide :: Pos -> Offset -> Pos

-- | Extract the nucleotide at a specific sequence position. If the
--   position lies outside the bounds of the sequence, an error results.
seqNt :: (Error e, MonadError e m) => SeqData -> Pos -> m Char

-- | As <a>seqNt</a>, extract the nucleotide at a specific sequence
--   position, but return <tt>N</tt> when the position lies outside the
--   bounds of the sequence.
--   
--   <pre>
--   seqNtPadded sequ pos == (either 'N' id . seqNt sequ) pos
--   </pre>
seqNtPadded :: SeqData -> Pos -> Char

-- | Display a human-friendly, zero-based representation of a sequence
--   position.
display :: Pos -> String
instance Eq Pos
instance Ord Pos
instance Show Pos
instance Read Pos
instance Ix Pos
instance Stranded Pos


-- | Data type for a sequence location consisting of a contiguous range of
--   positions on the sequence.
--   
--   Throughout, <i>sequence position</i> refers to a <a>Pos</a> which
--   includes a strand. An index into a sequence is referred to as an
--   <i>offset</i>, and is generally of type <a>Offset</a>.
module Bio.Location.ContigLocation

-- | Contiguous sequence location defined by a span of sequence positions,
--   lying on a specific strand of the sequence.
data ContigLoc
ContigLoc :: !Offset -> !Offset -> !Strand -> ContigLoc

-- | The offset of the 5' end of the location, as a 0-based index
offset5 :: ContigLoc -> !Offset

-- | The length of the location
length :: ContigLoc -> !Offset

-- | The strand of the location
strand :: ContigLoc -> !Strand

-- | Create a sequence location lying between 0-based starting and ending
--   offsets. When <tt>start &lt; end</tt>, the location will be on the
--   forward strand, otherwise it will be on the reverse complement strand.
fromStartEnd :: Offset -> Offset -> ContigLoc

-- | Create a sequence location from the sequence position of the start of
--   the location and the length of the location. The strand of the
--   location, and the direction it extends from the starting position, are
--   determined by the strand of the starting position.
fromPosLen :: Pos -> Offset -> ContigLoc

-- | The bounds of a sequence location. This is a pair consisting of the
--   lowest and highest sequence offsets covered by the region. The bounds
--   ignore the strand of the sequence location, and the first element of
--   the pair will always be lower than the second.
bounds :: ContigLoc -> (Offset, Offset)

-- | Sequence position of the start of the location. This is the 5' end on
--   the location strand, which will have a higher offset than
--   <a>endPos</a> if the location is on the <a>RevCompl</a> strand.
startPos :: ContigLoc -> Pos

-- | Sequence position of the end of the location, as described in
--   <a>startPos</a>.
endPos :: ContigLoc -> Pos

-- | Given a sequence position and a sequence location relative to the same
--   sequence, compute a new position representing the original position
--   relative to the subsequence defined by the location. If the sequence
--   position lies outside of the sequence location, <tt>Nothing</tt> is
--   returned; thus, the offset of the new position will always be in the
--   range <tt>[0, length cloc - 1]</tt>.
posInto :: Pos -> ContigLoc -> Maybe Pos

-- | Given a sequence location and a sequence position within that
--   location, compute a new position representing the original position
--   relative to the outer sequence. If the sequence position lies outside
--   the location, <tt>Nothing</tt> is returned.
--   
--   This function inverts <a>posInto</a> when the sequence position is
--   actually within the location.
posOutof :: Pos -> ContigLoc -> Maybe Pos

-- | Returns <tt>True</tt> when a sequence position lies within a sequence
--   location on the same sequence, and occupies the same strand.
isWithin :: Pos -> ContigLoc -> Bool

-- | Returns <tt>True</tt> when two sequence locations overlap at any
--   position.
overlaps :: ContigLoc -> ContigLoc -> Bool

-- | Extract the nucleotide <a>SeqData</a> for the sequence location. If
--   any part of the location lies outside the bounds of the sequence, an
--   error results.
seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData

-- | As <a>seqData</a>, extract the nucleotide subsequence for the
--   location. Any positions in the location lying outside the bounds of
--   the sequence are returned as <tt>N</tt> rather than producing an
--   error.
seqDataPadded :: SeqData -> ContigLoc -> SeqData

-- | Returns a location resulting from sliding the original location along
--   the sequence by a specified offset. A positive offset will move the
--   location away from the 5' end of the forward strand of the sequence
--   regardless of the strand of the location itself. Thus,
--   
--   <pre>
--   slide (revCompl cloc) off == revCompl (slide cloc off)
--   </pre>
slide :: Offset -> ContigLoc -> ContigLoc

-- | Returns a sequence location produced by extending the original
--   location on each end, based on a pair of (<i>5' extension</i>, <i>3'
--   extension</i>). The 5' extension is applied to the 5' end of the location
--   on the location strand; if the location is on the <a>RevCompl</a>
--   strand, the 5' end will have a higher offset than the 3' end and this
--   offset will increase by the amount of the 5' extension. Similarly, the
--   3' extension is applied to the 3' end of the location.
extend :: (Offset, Offset) -> ContigLoc -> ContigLoc

-- | Display a human-friendly, zero-based representation of a sequence
--   location.
display :: ContigLoc -> String
instance Eq ContigLoc
instance Ord ContigLoc
instance Show ContigLoc
instance Stranded ContigLoc


-- | Data type for a more general sequence location consisting of
--   potentially disjoint ranges of positions on the sequence.
--   
--   Throughout, <i>sequence position</i> refers to a <a>Pos</a> which
--   includes a strand. An index into a sequence is referred to as an
--   <i>offset</i>, and is generally of type <a>Offset</a>.
module Bio.Location.Location

-- | General (disjoint) sequence region consisting of a concatenated set of
--   contiguous regions (see <a>ContigLoc</a>).
newtype Loc
Loc :: [ContigLoc] -> Loc

-- | The bounds of a sequence location. This is a pair consisting of the
--   lowest and highest sequence offsets covered by the region. The bounds
--   ignore the strand of the sequence location, and the first element of
--   the pair will always be lower than the second. Even if the positions
--   in the location do not run monotonically through the location, the
--   overall lowest and highest sequence offsets are returned.
bounds :: Loc -> (Offset, Offset)

-- | Returns the length of the region
length :: Loc -> Offset

-- | Sequence position of the start of the location. This is the 5' end on
--   the location strand, which will have a higher offset than
--   <a>endPos</a> if the location is on the <a>RevCompl</a> strand.
startPos :: Loc -> Pos

-- | Sequence position of the end of the location, as described in
--   <a>startPos</a>.
endPos :: Loc -> Pos

-- | Given a sequence position and a sequence location relative to the same
--   sequence, compute a new position representing the original position
--   relative to the subsequence defined by the location. If the sequence
--   position lies outside of the sequence location, <tt>Nothing</tt> is
--   returned; thus, the offset of the new position will always be in the
--   range <tt>[0, length cloc - 1]</tt>.
--   
--   When the sequence positions in the location are not monotonic, there
--   may be multiple possible posInto solutions. That is, if the same outer
--   sequence position is covered by two different contiguous blocks of the
--   location, then it would have two possible sequence positions relative
--   to the location. In this case, the position 5'-most in the location
--   orientation is returned.
posInto :: Pos -> Loc -> Maybe Pos

-- | Given a sequence location and a sequence position within that
--   location, compute a new position representing the original position
--   relative to the outer sequence. If the sequence position lies outside
--   the location, <tt>Nothing</tt> is returned.
--   
--   This function inverts <a>posInto</a> when the sequence position is
--   actually within the location. Due to the possibility of redundant
--   location-relative positions for a given absolute position,
--   <a>posInto</a> does not necessarily invert <a>posOutof</a>.
posOutof :: Pos -> Loc -> Maybe Pos

-- | Returns <tt>True</tt> when a sequence position lies within a sequence
--   location on the same sequence, and occupies the same strand.
isWithin :: Pos -> Loc -> Bool

-- | Returns <tt>True</tt> when two sequence locations overlap at any
--   position.
overlaps :: Loc -> Loc -> Bool

-- | Extract the nucleotide <a>SeqData</a> for the sequence location. If
--   any part of the location lies outside the bounds of the sequence, an
--   error results.
seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData

-- | As <a>seqData</a>, extract the nucleotide subsequence for the
--   location. Any positions in the location lying outside the bounds of
--   the sequence are returned as <tt>N</tt> rather than producing an
--   error.
seqDataPadded :: SeqData -> Loc -> SeqData

-- | Returns a sequence location produced by extending the original
--   location on each end, based on a pair of (<i>5' extension</i>, <i>3'
--   extension</i>). These add contiguous positions to the 5' and 3' ends of
--   the original location. The 5' extension is applied to the 5' end of
--   the location on the location strand; if the location is on the
--   <a>RevCompl</a> strand, the 5' end will have a higher offset than the
--   3' end and this offset will increase by the amount of the 5'
--   extension. Similarly, the 3' extension is applied to the 3' end of the
--   location.
extend :: (Offset, Offset) -> Loc -> Loc

-- | Display a human-friendly, zero-based representation of a sequence
--   location.
display :: Loc -> String
instance Eq Loc
instance Ord Loc
instance Show Loc
instance Stranded Loc


-- | Data types for sequence locations and sequence positions associated
--   with specific, named sequences.
module Bio.Location.SeqLocation

-- | A position on a named sequence
type SeqPos = OnSeq Pos

-- | A location consisting of a contiguous span of positions on a named
--   sequence.
type ContigSeqLoc = OnSeq ContigLoc

-- | Test whether a sequence position lies within a sequence location. This
--   requires that the position lie within the location as per
--   <a>isWithin</a> and have the same sequence name.
withinContigSeqLoc :: SeqPos -> ContigSeqLoc -> Bool

-- | A general location, consisting of spans of sequence positions on a
--   specific, named sequence.
type SeqLoc = OnSeq Loc

-- | Test whether a sequence position lies within a sequence location. This
--   requires that the position lie within the location as per
--   <a>isWithin</a> and have the same sequence name.
isWithin :: SeqPos -> SeqLoc -> Bool

-- | Test whether two sequence locations overlap in any position. This
--   requires that the locations overlap as per <a>overlaps</a> and have
--   the same sequence name.
overlaps :: SeqLoc -> SeqLoc -> Bool

-- | Extract the subsequence specified by a sequence location from a
--   sequence database. The sequence name is used to retrieve the full
--   sequence and the subsequence is extracted as by <a>seqData</a>
seqData :: (Error e, MonadError e m) => (SeqName -> m SeqData) -> SeqLoc -> m SeqData

-- | Display a human-friendly representation of a <a>SeqPos</a>
displaySeqPos :: SeqPos -> String

-- | Display a human-friendly representation of a <a>ContigSeqLoc</a>
displayContigSeqLoc :: ContigSeqLoc -> String

-- | Display a human-friendly representation of a <a>SeqLoc</a>
display :: SeqLoc -> String


-- | This module provides a data type to represent an alignment produced by
--   the Bowtie short-read alignment tool (see
--   <a>http://bowtie-bio.sourceforge.net/index.shtml</a>).
--   
--   The simple accessors recapitulate the details of the Bowtie alignment
--   output. The position of the alignment is given by the "0-based offset
--   into the reference sequence where leftmost character of the alignment
--   occurs". Thus, for forward-strand alignments this is the 5' end of the
--   query sequence while for reverse-complement alignments this is the 3'
--   end of the query sequence. Similarly, the query sequence and query
--   quality are shown in reference forward strand orientation, and thus
--   may be reverse complemented.
module Bio.Alignment.Bowtie
data Align
Align :: !SeqName -> !Strand -> !SeqName -> !Offset -> !SeqData -> !QualData -> ![Mismatch] -> Align

-- | Name of the query sequence
name :: Align -> !SeqName

-- | Strand of the alignment on the reference sequence
strand :: Align -> !Strand

-- | Name of the reference sequence
refname :: Align -> !SeqName

-- | Zero-based offset of the left-most aligned position in the reference
leftoffset :: Align -> !Offset

-- | Query sequence, in the reference forward strand orientation
sequ :: Align -> !SeqData

-- | Query quality, in the reference forward strand orientation
qual :: Align -> !QualData

-- | Mismatches
mismatches :: Align -> ![Mismatch]

-- | Representation of a single mismatch in a bowtie alignment
data Mismatch
Mismatch :: !Offset -> !Char -> !Char -> Mismatch

-- | Offset of the mismatch site from the 5' end of the query
mmoffset :: Mismatch -> !Offset

-- | Reference nucleotide
refbase :: Mismatch -> !Char

-- | Query nucleotide
readbase :: Mismatch -> !Char

-- | Returns the length of the query sequence
length :: Align -> Offset

-- | Returns the number of mismatches in the alignment
nmismatch :: Align -> Int

-- | Query sequence as given in the query file
querySequ :: Align -> SeqData

-- | Query quality as given in the query file
queryQual :: Align -> QualData

-- | As <a>refCSeqLoc</a> but without the reference sequence name.
refCLoc :: Align -> ContigLoc

-- | Returns the sequence location covered by the query in the alignment.
--   This will be a sequence location on the reference sequence and may run
--   on the forward or the reverse complement strand.
refCSeqLoc :: Align -> ContigSeqLoc

-- | Returns the sequence location covered by the query, as
--   <a>refCSeqLoc</a>, as a <a>SeqLoc</a> location.
refSeqLoc :: Align -> SeqLoc

-- | Returns the sequence position of the start of the query sequence
--   alignment. This will include the strand of the alignment and will not
--   be the same as the position computed from <a>leftoffset</a> when the
--   alignment is on the reverse complement strand.
refSeqPos :: Align -> SeqPos

-- | Sequence position of a mismatch on the reference sequence.
mismatchSeqPos :: Align -> Mismatch -> SeqPos

-- | Parses a line of Bowtie output to produce an <a>Align</a>
parse :: ByteString -> Either String Align

-- | Returns true when two alignments were derived from the same sequencing
--   read. As Bowtie writes alignments of query sequences in their order in
--   the query file, all alignments of a given read are grouped together
--   and the lists of all alignments for each read can be gathered with
--   
--   <pre>
--   groupBy sameRead
--   </pre>
sameRead :: Align -> Align -> Bool
instance Read Mismatch
instance Show Mismatch
instance Eq Mismatch
instance Ord Mismatch
instance Read Align
instance Show Align
instance Eq Align
instance Ord Align

module Bio.Alignment.Soap

-- | Alignment output from SOAP
data SoapAlign
SA :: !SeqName -> !SeqData -> !QualData -> !Int -> !Char -> !Offset -> !Strand -> !SeqName -> !Offset -> !Int -> ![SoapAlignMismatch] -> SoapAlign
name :: SoapAlign -> !SeqName

-- | Reference strand orientation sequence
sequ :: SoapAlign -> !SeqData

-- | Reference strand orientation quality data
qual :: SoapAlign -> !QualData
nhit :: SoapAlign -> !Int
pairend :: SoapAlign -> !Char
length :: SoapAlign -> !Offset
strand :: SoapAlign -> !Strand
refname :: SoapAlign -> !SeqName

-- | 1-based index, as output by SOAP, of reference strand 5' end
refstart :: SoapAlign -> !Offset
nmismatch :: SoapAlign -> !Int
mismatches :: SoapAlign -> ![SoapAlignMismatch]
data SoapAlignMismatch
SAM :: !Char -> !Char -> !Offset -> !Qual -> SoapAlignMismatch

-- | Read nt in reference strand orientation
readnt :: SoapAlignMismatch -> !Char

-- | Reference nt in reference strand orientation
refnt :: SoapAlignMismatch -> !Char

-- | Offset from reference strand 5' end in reference strand orientation
offset :: SoapAlignMismatch -> !Offset

-- | Quality score of read nt
qualnt :: SoapAlignMismatch -> !Qual
refSeqPos :: SoapAlign -> SeqPos
refCSeqLoc :: SoapAlign -> ContigSeqLoc
refSeqLoc :: SoapAlign -> SeqLoc
mismatchSeqPos :: SoapAlign -> SoapAlignMismatch -> SeqPos
parse :: (Error e, MonadError e m) => ByteString -> m SoapAlign
unparse :: SoapAlign -> ByteString
parseMismatch :: (Error e, MonadError e m) => ByteString -> m SoapAlignMismatch
unparseMismatch :: SoapAlignMismatch -> ByteString
group :: [SoapAlign] -> [[SoapAlign]]
instance Read SoapAlignMismatch
instance Show SoapAlignMismatch
instance Eq SoapAlignMismatch
instance Ord SoapAlignMismatch
instance Read SoapAlign
instance Show SoapAlign
instance Eq SoapAlign
instance Ord SoapAlign


-- | Model the BED format, according to the spec at
--   http://genome.ucsc.edu/FAQ/FAQformat#format1
module Bio.Alignment.BED

-- | The BED data type. Note that the specification allows a variable
--   number of fields, with only the three first required. This definition
--   requires all fields to be present.
data BED
BED :: ByteString -> Offset -> Offset -> ByteString -> Int -> Dir -> Offset -> Offset -> (Word8, Word8, Word8) -> [(Offset, Offset)] -> BED
chrom :: BED -> ByteString
chromStart :: BED -> Offset
chromEnd :: BED -> Offset
name :: BED -> ByteString

-- | Range 0..1000
score :: BED -> Int
strand :: BED -> Dir
thickStart :: BED -> Offset
thickEnd :: BED -> Offset

-- | Available BED files appear to not support this format. RGB is
--   therefore ignored (read and written as '0')
itemRGB :: BED -> (Word8, Word8, Word8)

-- | Lists of length blockCount; blockStarts are relative to chromStart
blockSizeStart :: BED -> [(Offset, Offset)]

-- | Yet another direction data structure.
data Dir
Fwd :: Dir
Rev :: Dir
readBED :: FilePath -> IO [BED]
writeBED :: FilePath -> [BED] -> IO ()
instance Eq Dir
instance Show BED
instance Read Dir
instance Show Dir


-- | Efficient lookup of sequence positions and locations in a large map of
--   target locations. For example, target locations might represent a
--   collection of genes annotated on a chromosome. The <a>LocMap</a> would
--   efficiently find which gene(s) overlapped a sequence position on that
--   chromosome.
--   
--   Target locations are assigned to one or more zones based on
--   <a>bounds</a>. Query locations are then tested only against the target
--   locations in the relevant zones.
module Bio.Location.LocMap

-- | Data structure allowing efficient lookup of target sequence locations
--   that overlap a query location. Target locations can be paired with an
--   arbitrary object.
data LocMap a

-- | Create a <a>LocMap</a> from an association list of target locations.
fromList :: Offset -> [(Loc, a)] -> LocMap a

-- | Find the (possibly empty) list of target locations and associated
--   objects that contain a sequence position, in the sense of
--   <a>isWithin</a>
lookupWithin :: Pos -> LocMap a -> [(Loc, a)]

-- | Find the (possibly empty) list of target locations and associated
--   objects that overlap a sequence location, in the sense of
--   <a>overlaps</a>
lookupOverlaps :: Loc -> LocMap a -> [(Loc, a)]

-- | Remove a target location and object association from the map, if it is
--   present. If it is present multiple times, only the first occurrence
--   will be deleted.
delete :: Eq a => (Loc, a) -> LocMap a -> LocMap a

-- | Generalized version of <a>delete</a> that removes the first target
--   location / object association that satisfies a predicate function.
deleteBy :: ((Loc, a) -> Bool) -> LocMap a -> LocMap a

-- | Insert a new target association into a target location map.
insert :: Loc -> a -> LocMap a -> LocMap a
checkInvariants :: LocMap a -> [String]
instance Monoid (LocMap a)


-- | Efficient lookup of query positions in a collection of target sequence
--   locations where positions and locations are associated with specific
--   sequence names. This is an extension of <tt>LocMap</tt> to use
--   locations and positions on named sequences as in <tt>SeqLocation</tt>.
module Bio.Location.SeqLocMap

-- | A data structure for efficiently finding target sequence locations
--   (<tt>SeqLoc.Loc</tt>) that overlap query positions or locations. Each
--   target location can be associated with an arbitrary additional value
--   in the lookup map.
type SeqLocMap a = OnSeqs (LocMap a)

-- | Empty lookup map.
empty :: SeqLocMap a

-- | Creates a <a>SeqLocMap</a> from a list of target locations and their
--   associated objects
fromList :: [(SeqLoc, a)] -> SeqLocMap a

-- | Inserts a new target location and associated object into the location
--   lookup map.
insert :: SeqLoc -> a -> SeqLocMap a -> SeqLocMap a

-- | Find the (possibly empty) list of target locations and associated
--   objects that contain a sequence position, in the sense of
--   <tt>Loc.isWithin</tt>.
lookupWithin :: SeqPos -> SeqLocMap a -> [(SeqLoc, a)]

-- | Find the (possibly empty) list of target locations and associated
--   objects that overlap a sequence location, in the sense of
--   <tt>Loc.overlaps</tt>.
lookupOverlaps :: SeqLoc -> SeqLocMap a -> [(SeqLoc, a)]

module Bio.GFF3.Feature
data GFFAttr
GFFAttr :: !ByteString -> ![ByteString] -> GFFAttr
attrTag :: GFFAttr -> !ByteString
attrValues :: GFFAttr -> ![ByteString]
data Feature
Feature :: !ByteString -> !ByteString -> !ByteString -> !Offset -> !Offset -> !Maybe Double -> !Maybe Strand -> !Maybe Offset -> ![GFFAttr] -> Feature
seqid :: Feature -> !ByteString
source :: Feature -> !ByteString
ftype :: Feature -> !ByteString
start :: Feature -> !Offset
end :: Feature -> !Offset
score :: Feature -> !Maybe Double
strand :: Feature -> !Maybe Strand
phase :: Feature -> !Maybe Offset
attributes :: Feature -> ![GFFAttr]
length :: Feature -> Offset
parse :: (Error e, MonadError e m) => ByteString -> m Feature
unparse :: Feature -> ByteString
parseWithFasta :: (Error e, MonadError e m) => ByteString -> m ([Feature], [ByteString])
attrByTag :: ByteString -> Feature -> [ByteString]
ids :: Feature -> [ByteString]
parentIds :: Feature -> [ByteString]
contigLoc :: Feature -> ContigLoc
loc :: Feature -> Loc
seqLoc :: Feature -> SeqLoc
name :: (Error e, MonadError e m) => Feature -> m SeqName
instance Eq Feature
instance Ord Feature
instance Show Feature
instance Eq GFFAttr
instance Ord GFFAttr
instance Show GFFAttr

module Bio.GFF3.FeatureHier
data FeatureHier
features :: FeatureHier -> (Set Feature)
lookupId :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m Feature
lookupIdChildren :: (Error e, MonadError e m) => FeatureHier -> ByteString -> m [Feature]
fromList :: (Error e, MonadError e m) => [Feature] -> m FeatureHier
insert :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier
delete :: (Error e, MonadError e m) => Feature -> FeatureHier -> m FeatureHier
parents :: FeatureHier -> Feature -> [Feature]
children :: FeatureHier -> Feature -> [Feature]
parentsM :: MonadReader FeatureHier m => Feature -> m [Feature]
childrenM :: MonadReader FeatureHier m => Feature -> m [Feature]
checkInvariants :: FeatureHier -> [String]
instance Show FeatureHier

module Bio.GFF3.FeatureHierSequences
data FeatureHierSequences
features :: FeatureHierSequences -> Set Feature
sequences :: FeatureHierSequences -> [Sequence a]
fromLists :: (Error e, MonadError e m) => [Feature] -> [Sequence a] -> m FeatureHierSequences
parse :: (Error e, MonadError e m) => ByteString -> m FeatureHierSequences
lookupId :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m Feature
parents :: FeatureHierSequences -> Feature -> [Feature]
children :: FeatureHierSequences -> Feature -> [Feature]
seqData :: (Error e, MonadError e m) => FeatureHierSequences -> SeqLoc -> m SeqData
getSequence :: (Error e, MonadError e m) => FeatureHierSequences -> SeqName -> m SeqData
featureSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a)
runGFF :: FilePath -> (ErrorT String (Reader FeatureHierSequences) a) -> ErrorT String IO a
runGFFIO :: FilePath -> (ErrorT String (ReaderT FeatureHierSequences IO) a) -> ErrorT String IO a
asksGFF :: (Error e, MonadError e m, MonadReader FeatureHierSequences m) => (FeatureHierSequences -> a -> m b) -> a -> m b
instance Show FeatureHierSequences

module Bio.GFF3.SGD
chromosomes :: FeatureHierSequences -> [Feature]
genes :: FeatureHierSequences -> [Feature]
rRNAs :: FeatureHierSequences -> [Feature]
sortExons :: (Error e, MonadError e m) => [Feature] -> m [Feature]
geneSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a)
geneSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc
geneCDSes :: FeatureHierSequences -> Feature -> [Feature]
noncodingSequence :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m (Sequence a)
noncodingSeqLoc :: (Error e, MonadError e m) => FeatureHierSequences -> Feature -> m SeqLoc
noncodingExons :: FeatureHierSequences -> Feature -> [Feature]
namedSLM :: FeatureHierSequences -> SeqLocMap Feature
geneCDS_SLM :: (Error e, MonadError e m) => FeatureHierSequences -> m (SeqLocMap Feature)


-- | This is a meta-module importing and re-exporting sequence-related
--   stuff.
--   
--   It encompasses the <a>Bio.Sequence.SeqData</a>,
--   <a>Bio.Sequence.Fasta</a>, and <a>Bio.Sequence.TwoBit</a> modules.
module Bio.Sequence

-- | A sequence consists of a header, the sequence data itself, and
--   optional quality data. The type parameter is a phantom type to
--   separate nucleotide and amino acid sequences
data Sequence t

-- | header, actual sequence, and optional quality data
Seq :: !SeqData -> !SeqData -> !Maybe QualData -> Sequence t
data Unknown

-- | An offset, index, or length of a <a>SeqData</a>
type Offset = Int64

-- | The basic data type used in <a>Sequence</a>s
type SeqData = ByteString

-- | Basic type for quality data. Range 0..255. Typical Phred output is in
--   the range 6..50, with 20 as the line in the sand separating good from
--   bad.
type Qual = Word8

-- | Quality data is a <a>Qual</a> vector, currently implemented as a
--   <tt>ByteString</tt>.
type QualData = ByteString

-- | Return sequence length.
seqlength :: Sequence a -> Offset

-- | Return sequence label (first word of header)
seqlabel :: Sequence a -> SeqData

-- | Return full header.
seqheader :: Sequence a -> SeqData

-- | Return the sequence data.
seqdata :: Sequence a -> SeqData

-- | Return the quality data, or error if none exist. Use hasqual if in
--   doubt.
seqqual :: Sequence a -> QualData

-- | Read the character at the specified position in the sequence.
(!) :: Sequence a -> Offset -> Char
appendHeader :: Sequence a -> String -> Sequence a

-- | Modify the header by appending text, or by replacing all but the
--   sequence label (i.e. first word).
setHeader :: Sequence a -> String -> Sequence a

-- | Convert a String to <a>SeqData</a>
fromStr :: String -> SeqData

-- | Convert a <a>SeqData</a> to a String
toStr :: SeqData -> String

-- | Complement a single character. I.e. identify the nucleotide it can
--   hybridize with. Note that for multiple nucleotides, you usually want
--   the reverse complement (see <a>revcompl</a> for that).
compl :: Char -> Char

-- | Calculate the reverse complement. This is only relevant for the
--   nucleotide alphabet, and it leaves other characters unmodified.
revcompl :: Sequence Nuc -> Sequence Nuc

-- | Calculate the reverse complement for SeqData only.
revcompl' :: SeqData -> SeqData

-- | For type tagging sequences (protein sequences use <a>Amino</a> below)
data Nuc
castToNuc :: Sequence a -> Sequence Nuc
data Amino
Ala :: Amino
Arg :: Amino
Asn :: Amino
Asp :: Amino
Cys :: Amino
Gln :: Amino
Glu :: Amino
Gly :: Amino
His :: Amino
Ile :: Amino
Leu :: Amino
Lys :: Amino
Met :: Amino
Phe :: Amino
Pro :: Amino
Ser :: Amino
Thr :: Amino
Tyr :: Amino
Trp :: Amino
Val :: Amino
STP :: Amino
Asx :: Amino
Glx :: Amino
Xle :: Amino
Xaa :: Amino

-- | Translate a nucleotide sequence into the corresponding protein
--   sequence. This works rather blindly, with no attempt to identify ORFs
--   or otherwise QA the result.
translate :: Sequence Nuc -> Offset -> [Amino]

-- | Convert a sequence in IUPAC format to a list of amino acids.
fromIUPAC :: SeqData -> [Amino]

-- | Convert a list of amino acids to a sequence in IUPAC format.
toIUPAC :: [Amino] -> SeqData
castToAmino :: Sequence a -> Sequence Amino

-- | Returns a sequence with all internal storage freshly copied and with
--   sequence and quality data present as a single chunk.
--   
--   By freshly copying internal storage, <a>defragSeq</a> allows garbage
--   collection of the original data source whence the sequence was read;
--   otherwise, use of just a short sequence name can cause an entire
--   sequence file buffer to be retained.
--   
--   By compacting sequence data into a single chunk, <a>defragSeq</a>
--   avoids linear-time traversal of sequence chunks during random access
--   into sequence data.
defragSeq :: Sequence t -> Sequence t

-- | map over sequences, treating them as a sequence of (char,word8) pairs.
--   This will work on sequences without quality, as long as the function
--   doesn't try to examine it. The current implementation is not very
--   efficient.
seqmap :: ((Char, Qual) -> (Char, Qual)) -> Sequence t -> Sequence t

-- | Read nucleotide sequences in any format - Fasta, SFF, FastQ, 2bit,
--   PHD... Todo: detect Illumina vs Sanger FastQ, transparent compression
readNuc :: FilePath -> IO [Sequence Nuc]

-- | Read protein sequences in any supported format (i.e. Fasta)
readProt :: FilePath -> IO [Sequence Amino]

-- | Lazily read sequences from a FASTA-formatted file
readFasta :: FilePath -> IO [Sequence Unknown]

-- | Lazily read sequence from handle
hReadFasta :: Handle -> IO [Sequence Unknown]

-- | Write sequences to a FASTA-formatted file. Line length is 60.
writeFasta :: FilePath -> [Sequence a] -> IO ()

-- | Write sequences in FASTA format to a handle.
hWriteFasta :: Handle -> [Sequence a] -> IO ()

-- | Read quality data for sequences from a file.
readQual :: FilePath -> IO [Sequence Unknown]

-- | Write quality data for sequences to a file.
writeQual :: FilePath -> [Sequence a] -> IO ()
hWriteQual :: Handle -> [Sequence a] -> IO ()

-- | Read sequence and associated quality. Will error if the sequences and
--   qualities do not match one-to-one in sequence.
readFastaQual :: FilePath -> FilePath -> IO [Sequence Unknown]

-- | Write sequence and quality data simultaneously. This may be more
--   laziness-friendly.
writeFastaQual :: FilePath -> FilePath -> [Sequence a] -> IO ()
hWriteFastaQual :: Handle -> Handle -> [Sequence a] -> IO ()
readFastQ :: FilePath -> IO [Sequence Nuc]
writeFastQ :: FilePath -> [Sequence Nuc] -> IO ()
hReadFastQ :: Handle -> IO [Sequence Nuc]
hWriteFastQ :: Handle -> [Sequence Nuc] -> IO ()
readSangerQ :: FilePath -> IO [Sequence Nuc]
writeSangerQ :: FilePath -> [Sequence Nuc] -> IO ()
hReadSangerQ :: Handle -> IO [Sequence Nuc]
hWriteSangerQ :: Handle -> [Sequence Nuc] -> IO ()
readIllumina :: FilePath -> IO [Sequence Nuc]
writeIllumina :: FilePath -> [Sequence Nuc] -> IO ()
hReadIllumina :: Handle -> IO [Sequence Nuc]
hWriteIllumina :: Handle -> [Sequence Nuc] -> IO ()

-- | Parse a .phd file, extracting the contents as a Sequence
readPhd :: FilePath -> IO (Sequence Nuc)

-- | Parse .phd contents from a handle
hReadPhd :: Handle -> IO (Sequence Nuc)

-- | Parse a (lazy) ByteString as sequences in the 2bit format.
decode2Bit :: ByteString -> [Sequence Nuc]

-- | Read sequences from a file in 2bit format and
--   unmarshall/deserialize into Sequence format.
read2Bit :: FilePath -> IO [Sequence Nuc]

-- | Read sequences from a file handle in the 2bit format and
--   unmarshall/deserialize into Sequence format.
hRead2Bit :: Handle -> IO [Sequence Nuc]

-- | This is a struct for containing a set of hashing functions
data HashF k
HF :: (SeqData -> Offset -> Maybe k) -> (SeqData -> [(k, Offset)]) -> ([k] -> [k]) -> HashF k

-- | calculates the hash at a given offset in the sequence
hash :: HashF k -> SeqData -> Offset -> Maybe k

-- | calculate all hashes from a sequence, and their indices
hashes :: HashF k -> SeqData -> [(k, Offset)]

-- | for sorting hashes
ksort :: HashF k -> [k] -> [k]

-- | Contigous constructs an int/eger from a contiguous k-word.
contigous :: Integral k => Int -> HashF k

-- | Like <a>contigous</a>, but returns the same hash for a word and its
--   reverse complement.
rcontig :: Integral k => Int -> HashF k

-- | Like <tt>rcontig</tt>, but ignoring monomers (i.e. arbitrarily long
--   runs of a single nucleotide are treated the same as a single
--   nucleotide).
rcpacked :: Integral k => Int -> HashF k
class KWords s
kwords :: KWords s => Int -> s -> [s]
entropy :: (Ord str, KWords str) => Int -> str -> Double


-- | Multiple alignments.
module Bio.Alignment.Multiple

-- | Progressive multiple alignment. Calculate a tree from agglomerative
--   clustering, then align at each branch going bottom up. Returns a list
--   of columns (rows?).
progressive :: (Sequence a -> Sequence a -> (Double, EditList)) -> [Sequence a] -> [String]

-- | Derive alignments indirectly, i.e. calculate A|C using alignments A|B
--   and B|C. This is central for <tt>Coffee</tt> evaluation of alignments,
--   and T-Coffee construction of alignments.
indirect :: EditList -> EditList -> EditList