-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | A package with basic parsing utilities for several Bioinformatic data formats.
--
-- Contains utilities to parse and write Eigenstrat, Fasta, FreqSum, VCF,
-- Plink and other file formats used in population genetics analyses.
@package sequence-formats
@version 1.5.0
-- | This module contains helper functions for file parsing.
module SequenceFormats.Utils
-- | A function to help with reporting parsing errors to stderr. Returns a
-- clean Producer over the parsed datatype.
liftParsingErrors :: MonadThrow m => Either (ParsingError, Producer ByteString m r) () -> Producer a m ()
-- | A helper function to parse a text producer, properly reporting all
-- errors to stderr.
consumeProducer :: MonadThrow m => Parser a -> Producer ByteString m () -> Producer a m ()
readFileProd :: MonadSafe m => FilePath -> Producer ByteString m ()
-- | An exception type for parsing BioInformatic file formats.
data SeqFormatException
SeqFormatException :: String -> SeqFormatException
-- | A wrapper datatype for Chromosome names.
newtype Chrom
Chrom :: ByteString -> Chrom
[unChrom] :: Chrom -> ByteString
word :: Parser ByteString
instance GHC.Classes.Eq SequenceFormats.Utils.Chrom
instance GHC.Classes.Eq SequenceFormats.Utils.SeqFormatException
instance GHC.Show.Show SequenceFormats.Utils.SeqFormatException
instance GHC.Show.Show SequenceFormats.Utils.Chrom
instance GHC.Classes.Ord SequenceFormats.Utils.Chrom
instance GHC.Exception.Type.Exception SequenceFormats.Utils.SeqFormatException
-- | A module to read and write allele sharing histograms, as defined here:
-- https://rarecoal-docs.readthedocs.io/en/latest/rarecoal.html#histogram-files
module SequenceFormats.RareAlleleHistogram
-- | A datatype to represent an Allele Sharing Histogram:
data RareAlleleHistogram
RareAlleleHistogram :: [String] -> [Int] -> Int -> Int -> [Int] -> [SitePattern] -> Int64 -> Map SitePattern Int64 -> Maybe (Map SitePattern (Double, Double)) -> RareAlleleHistogram
-- | A list of branch names
[raNames] :: RareAlleleHistogram -> [String]
-- | A list of haploid sample sizes.
[raNVec] :: RareAlleleHistogram -> [Int]
-- | The minimum allele count
[raMinAf] :: RareAlleleHistogram -> Int
-- | The maximum allele count
[raMaxAf] :: RareAlleleHistogram -> Int
-- | A list of branch indices that were used to condition the allele
-- sharing pattern
[raConditionOn] :: RareAlleleHistogram -> [Int]
-- | A list of patterns that are excluded.
[raExcludePatterns] :: RareAlleleHistogram -> [SitePattern]
-- | The total number of non-missing sites in the genome.
[raTotalNrSites] :: RareAlleleHistogram -> Int64
-- | The actual data, a dictionary from allele sharing patterns to observed
-- numbers.
[raCounts] :: RareAlleleHistogram -> Map SitePattern Int64
-- | An optional dictionary that contains Jackknife estimates and standard
-- deviations for each pattern frequency.
[raJackknifeEstimates] :: RareAlleleHistogram -> Maybe (Map SitePattern (Double, Double))
-- | Read a histogram from a File Handle.
readHistogramFromHandle :: MonadIO m => Handle -> m RareAlleleHistogram
-- | A simple type synonym for the SitePattern, represented as a list of
-- Integers that represents each pattern across the branches.
type SitePattern = [Int]
-- | Read a histogram from a FilePath
readHistogram :: MonadIO m => FilePath -> m RareAlleleHistogram
-- | Write a histogram to stdout
writeHistogramStdOut :: MonadIO m => RareAlleleHistogram -> m ()
-- | Write a histogram to a file
writeHistogramFile :: MonadIO m => FilePath -> RareAlleleHistogram -> m ()
-- | A simple function to convert a pattern into a String.
showSitePattern :: SitePattern -> String
instance GHC.Show.Show SequenceFormats.RareAlleleHistogram.RareAlleleHistogram
instance GHC.Classes.Eq SequenceFormats.RareAlleleHistogram.RareAlleleHistogram
module SequenceFormats.Pileup
-- | Read a pileup-formatted file from StdIn, for reading from an external
-- command `samtools mpileup`.
readPileupFromStdIn :: (MonadIO m, MonadThrow m) => Producer PileupRow m ()
-- | Read pileup from a file.
readPileupFromFile :: MonadSafe m => FilePath -> Producer PileupRow m ()
-- | A datatype to represent a single pileup row for multiple individuals.
-- The constructor arguments are: Chromosome, Position, Reference
-- Allele, Pileup String per individual, Strand information.
data PileupRow
PileupRow :: Chrom -> Int -> Char -> [String] -> [[Strand]] -> PileupRow
-- | The chromosome
[pileupChrom] :: PileupRow -> Chrom
-- | The position
[pileupPos] :: PileupRow -> Int
-- | The reference base
[pileupRef] :: PileupRow -> Char
-- | The base string
[pileupBases] :: PileupRow -> [String]
[pileupStrandInfo] :: PileupRow -> [[Strand]]
-- | A datatype to represent the strand orientation of a single base.
data Strand
ForwardStrand :: Strand
ReverseStrand :: Strand
instance GHC.Show.Show SequenceFormats.Pileup.PileupRow
instance GHC.Classes.Eq SequenceFormats.Pileup.PileupRow
instance GHC.Show.Show SequenceFormats.Pileup.Strand
instance GHC.Classes.Eq SequenceFormats.Pileup.Strand
-- | Module to parse and write freqSum files. The freqsum format is defined
-- here:
-- https://rarecoal-docs.readthedocs.io/en/latest/rarecoal-tools.html#vcf2freqsum
module SequenceFormats.FreqSum
-- | A function to read a freqsum file from StdIn. Returns a pair of a
-- freqSum Header and a Producer over all lines.
readFreqSumStdIn :: (MonadIO m, MonadThrow m) => m (FreqSumHeader, Producer FreqSumEntry m ())
-- | A function to read a freqsum file from a file. Returns a pair of a
-- freqSum Header and a Producer over all lines.
readFreqSumFile :: MonadSafe m => FilePath -> m (FreqSumHeader, Producer FreqSumEntry m ())
-- | A Datatype to denote a single freqSum line
data FreqSumEntry
FreqSumEntry :: Chrom -> Int -> Maybe ByteString -> Maybe Double -> Char -> Char -> [Maybe Int] -> FreqSumEntry
-- | The chromosome of the site
[fsChrom] :: FreqSumEntry -> Chrom
-- | The position of the site
[fsPos] :: FreqSumEntry -> Int
-- | An optional parameter to take the snpId. This is not parsed from or
-- printed to freqSum format but is used in internal conversions from
-- Eigenstrat.
[fsSnpId] :: FreqSumEntry -> Maybe ByteString
-- | An optional parameter to take the genetic pos. This is not parsed from
-- or printed to freqSum format but is used in internal conversions from
-- Eigenstrat.
[fsGeneticPos] :: FreqSumEntry -> Maybe Double
-- | The reference allele
[fsRef] :: FreqSumEntry -> Char
-- | The alternative allele
[fsAlt] :: FreqSumEntry -> Char
-- | A list of allele counts in each group. Nothing denotes missing data.
[fsCounts] :: FreqSumEntry -> [Maybe Int]
-- | A Datatype representing the Header
data FreqSumHeader
FreqSumHeader :: [String] -> [Int] -> FreqSumHeader
-- | A list of individual or group names
[fshNames] :: FreqSumHeader -> [String]
-- | A list of haplotype counts per individual/group.
[fshCounts] :: FreqSumHeader -> [Int]
-- | A function to write freqSum data to StdOut. Expects the freqSum header
-- as argument, and then returns a Consumer that accepts freqSum entries.
printFreqSumStdOut :: MonadIO m => FreqSumHeader -> Consumer FreqSumEntry m ()
-- | A function that writes a freqSum file. Expects the FilePath and the
-- freqSum header as arguments, and then returns a Consumer that accepts
-- freqSum entries.
printFreqSumFile :: MonadSafe m => FilePath -> FreqSumHeader -> Consumer FreqSumEntry m ()
-- | This function converts a single freqSum entry to a printable freqSum
-- line.
freqSumEntryToText :: FreqSumEntry -> ByteString
instance GHC.Show.Show SequenceFormats.FreqSum.FreqSumEntry
instance GHC.Classes.Eq SequenceFormats.FreqSum.FreqSumEntry
instance GHC.Show.Show SequenceFormats.FreqSum.FreqSumHeader
instance GHC.Classes.Eq SequenceFormats.FreqSum.FreqSumHeader
-- | Module to read and parse through a Fasta file. The Fasta format is
-- defined here: https://en.wikipedia.org/wiki/FASTA_format
module SequenceFormats.Fasta
-- | This function takes a Bytestring-Producer over a Fasta-file, reads in
-- the first header and then returns a producer over its sequence. The
-- return of that producer is the Bytestring-Producer of the rest of the
-- fasta file.
readNextFastaEntry :: MonadIO m => Producer ByteString m () -> m (Chrom, Producer ByteString m (Producer ByteString m ()))
-- | A function to select out a specific chromosome from a Fasta File.
-- Expects a file handle to the file and a chromosome. Note that by
-- Chromosome I simply denote a fasta header line, as is the case for
-- example for the human reference genome. Returns a Bytestring-Producer
-- over the single sequence following the specified header (the
-- chromosome).
loadFastaChrom :: Handle -> Chrom -> IO (Producer ByteString IO ())
-- | Module to read and parse Eigenstrat-formatted genotype data. The
-- Eigenstrat format is defined at
-- https://github.com/argriffing/eigensoft/blob/master/CONVERTF/README.
module SequenceFormats.Eigenstrat
-- | A datatype to represent a single genomic SNP. The constructor
-- arguments are: Chromosome, Position, Genetic Position, SNP ID,
-- Reference Allele, Alternative Allele.
data EigenstratSnpEntry
EigenstratSnpEntry :: Chrom -> Int -> Double -> ByteString -> Char -> Char -> EigenstratSnpEntry
[snpChrom] :: EigenstratSnpEntry -> Chrom
[snpPos] :: EigenstratSnpEntry -> Int
[snpGeneticPos] :: EigenstratSnpEntry -> Double
[snpId] :: EigenstratSnpEntry -> ByteString
[snpRef] :: EigenstratSnpEntry -> Char
[snpAlt] :: EigenstratSnpEntry -> Char
-- | A datatype to represent a single individual. The constructor arguments
-- are: Name, Sex and Population Name
data EigenstratIndEntry
EigenstratIndEntry :: String -> Sex -> String -> EigenstratIndEntry
-- | Function to read an Eigenstrat individual file. Returns the Eigenstrat
-- Individual Entries as list.
readEigenstratInd :: MonadIO m => FilePath -> m [EigenstratIndEntry]
-- | A datatype to represent the genotype of an individual at a SNP.
data GenoEntry
HomRef :: GenoEntry
Het :: GenoEntry
HomAlt :: GenoEntry
Missing :: GenoEntry
-- | Vector of the genotypes of all individuals at a single SNP.
type GenoLine = Vector GenoEntry
-- | A datatype to represent Sex in an Eigenstrat Individual file
data Sex
Male :: Sex
Female :: Sex
Unknown :: Sex
-- | Function to read a Snp File from StdIn. Returns a Pipes-Producer over
-- the EigenstratSnpEntries.
readEigenstratSnpStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
-- | Function to read a Snp File from a file. Returns a Pipes-Producer over
-- the EigenstratSnpEntries.
readEigenstratSnpFile :: MonadSafe m => FilePath -> Producer EigenstratSnpEntry m ()
-- | Function to read a full Eigenstrat database from files. Returns a pair
-- of the Eigenstrat Individual Entries, and a joint Producer over the
-- snp entries and the genotypes.
readEigenstrat :: MonadSafe m => FilePath -> FilePath -> FilePath -> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ())
-- | Function to write an Eigenstrat Database. Returns a consumer expecting
-- joint Snp- and Genotype lines.
writeEigenstrat :: MonadSafe m => FilePath -> FilePath -> FilePath -> [EigenstratIndEntry] -> Consumer (EigenstratSnpEntry, GenoLine) m ()
-- | Function to write an Eigenstrat Ind file.
writeEigenstratIndFile :: MonadIO m => FilePath -> [EigenstratIndEntry] -> m ()
-- | Function to write an Eigenstrat Snp File. Returns a consumer expecting
-- EigenstratSnpEntries.
writeEigenstratSnp :: MonadIO m => Handle -> Consumer EigenstratSnpEntry m ()
-- | Function to write an Eigenstrat Geno File. Returns a consumer expecting
-- Eigenstrat Genolines.
writeEigenstratGeno :: MonadIO m => Handle -> Consumer GenoLine m ()
instance GHC.Show.Show SequenceFormats.Eigenstrat.GenoEntry
instance GHC.Classes.Eq SequenceFormats.Eigenstrat.GenoEntry
instance GHC.Show.Show SequenceFormats.Eigenstrat.EigenstratIndEntry
instance GHC.Classes.Eq SequenceFormats.Eigenstrat.EigenstratIndEntry
instance GHC.Show.Show SequenceFormats.Eigenstrat.Sex
instance GHC.Classes.Eq SequenceFormats.Eigenstrat.Sex
instance GHC.Show.Show SequenceFormats.Eigenstrat.EigenstratSnpEntry
instance GHC.Classes.Eq SequenceFormats.Eigenstrat.EigenstratSnpEntry
module SequenceFormats.Plink
-- | Function to read a Bim File from StdIn. Returns a Pipes-Producer over
-- the EigenstratSnpEntries.
readBimStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m ()
-- | Function to read a Bim File from a file. Returns a Pipes-Producer over
-- the EigenstratSnpEntries.
readBimFile :: MonadSafe m => FilePath -> Producer EigenstratSnpEntry m ()
-- | Function to write a Bim file. Returns a consumer expecting
-- EigenstratSnpEntries.
writeBim :: MonadIO m => Handle -> Consumer EigenstratSnpEntry m ()
-- | Function to read a Plink fam file. Returns the Eigenstrat Individual
-- Entries as list.
readFamFile :: MonadIO m => FilePath -> m [EigenstratIndEntry]
-- | A function to read a bed file from a file. Returns a Producer over all
-- lines.
readPlinkBedFile :: MonadSafe m => FilePath -> Int -> m (Producer GenoLine m ())
-- | A module to help with parsing VCF files. The VCF format is defined
-- here: https://en.wikipedia.org/wiki/Variant_Call_Format
module SequenceFormats.VCF
-- | A datatype to represent the VCF Header. Most comments are simply
-- parsed as entire lines, but the very last comment line, containing the
-- sample names, is separated out
data VCFheader
VCFheader :: [String] -> [String] -> VCFheader
-- | A list containing all comment lines starting with ##
[vcfHeaderComments] :: VCFheader -> [String]
-- | The list of sample names parsed from the last comment line, which
-- starts with a single #
[vcfSampleNames] :: VCFheader -> [String]
-- | A Datatype representing a single VCF entry.
data VCFentry
VCFentry :: Chrom -> Int -> Maybe ByteString -> ByteString -> [ByteString] -> Double -> Maybe ByteString -> [ByteString] -> [ByteString] -> [[ByteString]] -> VCFentry
-- | The chromosome
[vcfChrom] :: VCFentry -> Chrom
-- | The position
[vcfPos] :: VCFentry -> Int
-- | The SNP ID if non-missing
[vcfId] :: VCFentry -> Maybe ByteString
-- | The reference allele (supports also multi-character alleles for
-- Indels)
[vcfRef] :: VCFentry -> ByteString
-- | The alternative alleles, each one possibly consisting of multiple
-- characters
[vcfAlt] :: VCFentry -> [ByteString]
-- | The quality value
[vcfQual] :: VCFentry -> Double
-- | The Filter value, if non-missing.
[vcfFilter] :: VCFentry -> Maybe ByteString
-- | A list of Info fields
[vcfInfo] :: VCFentry -> [ByteString]
-- | A list of format tags
[vcfFormatString] :: VCFentry -> [ByteString]
-- | A list of format fields for each sample.
[vcfGenotypeInfo] :: VCFentry -> [[ByteString]]
-- | Reading a VCF from StdIn. Returns a VCFHeader and a Producer over
-- VCFentries.
readVCFfromStdIn :: (MonadIO m, MonadThrow m) => m (VCFheader, Producer VCFentry m ())
-- | Reading a VCF from a file. Returns a VCFHeader and a Producer over
-- VCFentries.
readVCFfromFile :: MonadSafe m => FilePath -> m (VCFheader, Producer VCFentry m ())
-- | reads a VCFheader and VCFentries from a text producer.
readVCFfromProd :: MonadThrow m => Producer ByteString m () -> m (VCFheader, Producer VCFentry m ())
-- | Extracts the genotype fields (for each sample) from a VCF entry
getGenotypes :: VCFentry -> Either String [ByteString]
-- | Extracts the dosages (the sum of non-reference alleles) per sample
-- (returns a Left Error if it fails.)
getDosages :: VCFentry -> Either String [Maybe Int]
-- | returns True if the SNP is a biallelic Transversion SNP (i.e. one of
-- GT, GC, AT, AC)
isTransversionSnp :: ByteString -> [ByteString] -> Bool
-- | Converts a VCFentry to the simpler FreqSum format (returns a Left
-- Error if it fails.)
vcfToFreqSumEntry :: VCFentry -> Either String FreqSumEntry
-- | returns True if the SNP is biallelic.
isBiallelicSnp :: ByteString -> [ByteString] -> Bool
instance GHC.Classes.Eq SequenceFormats.VCF.VCFentry
instance GHC.Show.Show SequenceFormats.VCF.VCFentry
instance GHC.Show.Show SequenceFormats.VCF.VCFheader