-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | A package with basic parsing utilities for several Bioinformatic data formats. -- -- Contains utilities to parse and write Eigenstrat, Fasta, FreqSum, VCF, -- Plink and other file formats used in population genetics analyses. @package sequence-formats @version 1.7.0 -- | This module contains helper functions for file parsing. module SequenceFormats.Utils -- | A function to help with reporting parsing errors to stderr. Returns a -- clean Producer over the parsed datatype. liftParsingErrors :: MonadThrow m => Either (ParsingError, Producer ByteString m r) () -> Producer a m () -- | A helper function to parse a text producer, properly reporting all -- errors to stderr. consumeProducer :: MonadThrow m => Parser a -> Producer ByteString m () -> Producer a m () readFileProd :: MonadSafe m => FilePath -> Producer ByteString m () -- | An exception type for parsing BioInformatic file formats. data SeqFormatException SeqFormatException :: String -> SeqFormatException -- | A wrapper datatype for Chromosome names. 
newtype Chrom Chrom :: ByteString -> Chrom [unChrom] :: Chrom -> ByteString word :: Parser ByteString instance GHC.Classes.Eq SequenceFormats.Utils.SeqFormatException instance GHC.Show.Show SequenceFormats.Utils.SeqFormatException instance GHC.Classes.Eq SequenceFormats.Utils.Chrom instance GHC.Show.Show SequenceFormats.Utils.Chrom instance GHC.Classes.Ord SequenceFormats.Utils.Chrom instance GHC.Exception.Type.Exception SequenceFormats.Utils.SeqFormatException -- | A module to read and write allele sharing histograms, as defined here: -- https://rarecoal-docs.readthedocs.io/en/latest/rarecoal.html#histogram-files module SequenceFormats.RareAlleleHistogram -- | A datatype to represent an Allele Sharing Histogram: data RareAlleleHistogram RareAlleleHistogram :: [String] -> [Int] -> Int -> Int -> [Int] -> [SitePattern] -> Int64 -> Map SitePattern Int64 -> Maybe (Map SitePattern (Double, Double)) -> RareAlleleHistogram -- | A list of branch names [raNames] :: RareAlleleHistogram -> [String] -- | A list of haploid sample sizes. [raNVec] :: RareAlleleHistogram -> [Int] -- | The minimum allele count [raMinAf] :: RareAlleleHistogram -> Int -- | The maximum allele count [raMaxAf] :: RareAlleleHistogram -> Int -- | A list of branch indices that were used to condition the allele -- sharing pattern [raConditionOn] :: RareAlleleHistogram -> [Int] -- | A list of patterns that are excluded. [raExcludePatterns] :: RareAlleleHistogram -> [SitePattern] -- | The total number of non-missing sites in the genome. [raTotalNrSites] :: RareAlleleHistogram -> Int64 -- | The actual data, a dictionary from allele sharing patterns to observed -- numbers. [raCounts] :: RareAlleleHistogram -> Map SitePattern Int64 -- | An optional dictionary that contains Jackknife estimates and standard -- deviations for each pattern frequency. [raJackknifeEstimates] :: RareAlleleHistogram -> Maybe (Map SitePattern (Double, Double)) -- | Read a histogram from a File Handle. 
readHistogramFromHandle :: MonadIO m => Handle -> m RareAlleleHistogram -- | A simple type synonym for the SitePattern, represented as a list of -- Integers that represents each pattern across the branches. type SitePattern = [Int] -- | Read a histogram from a FilePath readHistogram :: MonadIO m => FilePath -> m RareAlleleHistogram -- | Write a histogram to stdout writeHistogramStdOut :: MonadIO m => RareAlleleHistogram -> m () -- | Write a histogram to a file writeHistogramFile :: MonadIO m => FilePath -> RareAlleleHistogram -> m () -- | A simple function to convert a pattern into a String. showSitePattern :: SitePattern -> String instance GHC.Show.Show SequenceFormats.RareAlleleHistogram.RareAlleleHistogram instance GHC.Classes.Eq SequenceFormats.RareAlleleHistogram.RareAlleleHistogram module SequenceFormats.Pileup -- | Read a pileup-formatted file from StdIn, for reading from an external -- command `samtools mpileup`. readPileupFromStdIn :: (MonadIO m, MonadThrow m) => Producer PileupRow m () -- | Read pileup from a file. readPileupFromFile :: MonadSafe m => FilePath -> Producer PileupRow m () -- | A datatype to represent a single pileup row for multiple individuals. -- The constructor arguments are: Chromosome, Position, Reference -- Allele, Pileup String per individual, Strand orientations per -- individual data PileupRow PileupRow :: Chrom -> Int -> Char -> [String] -> [[Strand]] -> PileupRow -- | The chromosome [pileupChrom] :: PileupRow -> Chrom -- | The position [pileupPos] :: PileupRow -> Int -- | The reference base [pileupRef] :: PileupRow -> Char -- | The base string [pileupBases] :: PileupRow -> [String] [pileupStrandInfo] :: PileupRow -> [[Strand]] -- | A datatype to represent the strand orientation of a single base. 
data Strand ForwardStrand :: Strand ReverseStrand :: Strand instance GHC.Show.Show SequenceFormats.Pileup.Strand instance GHC.Classes.Eq SequenceFormats.Pileup.Strand instance GHC.Show.Show SequenceFormats.Pileup.PileupRow instance GHC.Classes.Eq SequenceFormats.Pileup.PileupRow -- | Module to parse and write freqSum files. The freqsum format is defined -- here: -- https://rarecoal-docs.readthedocs.io/en/latest/rarecoal-tools.html#vcf2freqsum module SequenceFormats.FreqSum -- | A function to read a freqsum file from StdIn. Returns a pair of a -- freqSum Header and a Producer over all lines. readFreqSumStdIn :: (MonadIO m, MonadThrow m) => m (FreqSumHeader, Producer FreqSumEntry m ()) -- | A function to read a freqsum file from a file. Returns a pair of a -- freqSum Header and a Producer over all lines. readFreqSumFile :: MonadSafe m => FilePath -> m (FreqSumHeader, Producer FreqSumEntry m ()) -- | A Datatype to denote a single freqSum line data FreqSumEntry FreqSumEntry :: Chrom -> Int -> Maybe ByteString -> Maybe Double -> Char -> Char -> [Maybe Int] -> FreqSumEntry -- | The chromosome of the site [fsChrom] :: FreqSumEntry -> Chrom -- | The position of the site [fsPos] :: FreqSumEntry -> Int -- | An optional parameter to take the snpId. This is not parsed from or -- printed to freqSum format but is used in internal conversions from -- Eigenstrat. [fsSnpId] :: FreqSumEntry -> Maybe ByteString -- | An optional parameter to take the genetic pos. This is not parsed from -- or printed to freqSum format but is used in internal conversions from -- Eigenstrat. [fsGeneticPos] :: FreqSumEntry -> Maybe Double -- | The reference allele [fsRef] :: FreqSumEntry -> Char -- | The alternative allele [fsAlt] :: FreqSumEntry -> Char -- | A list of allele counts in each group. Nothing denotes missing data. 
[fsCounts] :: FreqSumEntry -> [Maybe Int] -- | A Datatype representing the Header data FreqSumHeader FreqSumHeader :: [String] -> [Int] -> FreqSumHeader -- | A list of individual or group names [fshNames] :: FreqSumHeader -> [String] -- | A list of haplotype counts per individual/group. [fshCounts] :: FreqSumHeader -> [Int] -- | A function to write freqSum data to StdOut. Expects the freqSum header -- as argument, and then returns a Consumer that accepts freqSum entries. printFreqSumStdOut :: MonadIO m => FreqSumHeader -> Consumer FreqSumEntry m () -- | A function that writes a freqSum file. Expects the FilePath and the -- freqSum header as arguments, and then returns a Consumer that accepts -- freqSum entries. printFreqSumFile :: MonadSafe m => FilePath -> FreqSumHeader -> Consumer FreqSumEntry m () -- | This function converts a single freqSum entry to a printable freqSum -- line. freqSumEntryToText :: FreqSumEntry -> ByteString instance GHC.Show.Show SequenceFormats.FreqSum.FreqSumHeader instance GHC.Classes.Eq SequenceFormats.FreqSum.FreqSumHeader instance GHC.Show.Show SequenceFormats.FreqSum.FreqSumEntry instance GHC.Classes.Eq SequenceFormats.FreqSum.FreqSumEntry -- | Module to read and parse through a Fasta file. The Fasta format is -- defined here: https://en.wikipedia.org/wiki/FASTA_format module SequenceFormats.Fasta -- | This function takes a Bytestring-Producer over a Fasta-file, reads in -- the first header and then returns a producer over its sequence. The -- return of that producer is the Bytestring-Producer of the rest of the -- fasta file. readNextFastaEntry :: MonadIO m => Producer ByteString m () -> m (Chrom, Producer ByteString m (Producer ByteString m ())) -- | A function to select out a specific chromosome from a Fasta File. -- Expects a file handle to the file and a chromosome. Note that by -- Chromosome I simply denote a fasta header line, as is the case for -- example for the human reference genome. 
Returns a Bytestring-Producer -- over the single sequence following the specified header (the -- chromosome). loadFastaChrom :: Handle -> Chrom -> IO (Producer ByteString IO ()) -- | Module to read and parse Eigenstrat-formatted genotype data. The -- Eigenstrat format is defined at -- https://github.com/argriffing/eigensoft/blob/master/CONVERTF/README. module SequenceFormats.Eigenstrat -- | A datatype to represent a single genomic SNP. The constructor -- arguments are: Chromosome, Position, Genetic Position, SNP ID, -- Reference Allele, Alternative Allele. data EigenstratSnpEntry EigenstratSnpEntry :: Chrom -> Int -> Double -> ByteString -> Char -> Char -> EigenstratSnpEntry [snpChrom] :: EigenstratSnpEntry -> Chrom [snpPos] :: EigenstratSnpEntry -> Int [snpGeneticPos] :: EigenstratSnpEntry -> Double [snpId] :: EigenstratSnpEntry -> ByteString [snpRef] :: EigenstratSnpEntry -> Char [snpAlt] :: EigenstratSnpEntry -> Char -- | A datatype to represent a single individual. The constructor arguments -- are: Name, Sex and Population Name data EigenstratIndEntry EigenstratIndEntry :: String -> Sex -> String -> EigenstratIndEntry -- | Function to read an Eigenstrat individual file. Returns the Eigenstrat -- Individual Entries as list. readEigenstratInd :: MonadIO m => FilePath -> m [EigenstratIndEntry] -- | A datatype to represent the genotype of an individual at a SNP. data GenoEntry HomRef :: GenoEntry Het :: GenoEntry HomAlt :: GenoEntry Missing :: GenoEntry -- | Vector of the genotypes of all individuals at a single SNP. type GenoLine = Vector GenoEntry -- | A datatype to represent Sex in an Eigenstrat Individual file data Sex Male :: Sex Female :: Sex Unknown :: Sex -- | Function to read a Snp File from StdIn. Returns a Pipes-Producer over -- the EigenstratSnpEntries. readEigenstratSnpStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m () -- | Function to read a Snp File from a file. Returns a Pipes-Producer over -- the EigenstratSnpEntries. 
readEigenstratSnpFile :: MonadSafe m => FilePath -> Producer EigenstratSnpEntry m () -- | Function to read a full Eigenstrat database from files. Returns a pair -- of the Eigenstrat Individual Entries, and a joint Producer over the -- snp entries and the genotypes. readEigenstrat :: MonadSafe m => FilePath -> FilePath -> FilePath -> m ([EigenstratIndEntry], Producer (EigenstratSnpEntry, GenoLine) m ()) -- | Function to write an Eigenstrat Database. Returns a consumer expecting -- joint Snp- and Genotype lines. writeEigenstrat :: MonadSafe m => FilePath -> FilePath -> FilePath -> [EigenstratIndEntry] -> Consumer (EigenstratSnpEntry, GenoLine) m () -- | Function to write an Eigenstrat Ind file. writeEigenstratIndFile :: MonadIO m => FilePath -> [EigenstratIndEntry] -> m () -- | Function to write an Eigenstrat Snp File. Returns a consumer expecting -- EigenstratSnpEntries. writeEigenstratSnp :: MonadIO m => Handle -> Consumer EigenstratSnpEntry m () -- | Function to write an Eigenstrat Geno File. Returns a consumer expecting -- Eigenstrat Genolines. writeEigenstratGeno :: MonadIO m => Handle -> Consumer GenoLine m () instance GHC.Show.Show SequenceFormats.Eigenstrat.EigenstratSnpEntry instance GHC.Classes.Eq SequenceFormats.Eigenstrat.EigenstratSnpEntry instance GHC.Show.Show SequenceFormats.Eigenstrat.Sex instance GHC.Classes.Eq SequenceFormats.Eigenstrat.Sex instance GHC.Show.Show SequenceFormats.Eigenstrat.EigenstratIndEntry instance GHC.Classes.Eq SequenceFormats.Eigenstrat.EigenstratIndEntry instance GHC.Show.Show SequenceFormats.Eigenstrat.GenoEntry instance GHC.Classes.Eq SequenceFormats.Eigenstrat.GenoEntry module SequenceFormats.Plink -- | Function to read a Bim File from StdIn. Returns a Pipes-Producer over -- the EigenstratSnpEntries. readBimStdIn :: (MonadThrow m, MonadIO m) => Producer EigenstratSnpEntry m () -- | Function to read a Bim File from a file. Returns a Pipes-Producer over -- the EigenstratSnpEntries. 
readBimFile :: MonadSafe m => FilePath -> Producer EigenstratSnpEntry m () -- | Function to write a Bim file. Returns a consumer expecting -- EigenstratSnpEntries. writeBim :: MonadIO m => Handle -> Consumer EigenstratSnpEntry m () -- | Function to read a Plink fam file. Returns the Plink Fam Entries as -- list. readFamFile :: MonadIO m => FilePath -> m [PlinkFamEntry] -- | A function to read a bed file from a file. Returns a Producer over all -- lines. readPlinkBedFile :: MonadSafe m => FilePath -> Int -> m (Producer GenoLine m ()) -- | Function to read a full Plink dataset from files. Returns a pair of -- the Plink Individual Entries, and a joint Producer over the snp -- entries and the genotypes. readPlink :: MonadSafe m => FilePath -> FilePath -> FilePath -> m ([PlinkFamEntry], Producer (EigenstratSnpEntry, GenoLine) m ()) -- | Function to write a Plink Database. Returns a consumer expecting joint -- Snp- and Genotype lines. writePlink :: MonadSafe m => FilePath -> FilePath -> FilePath -> [PlinkFamEntry] -> Consumer (EigenstratSnpEntry, GenoLine) m () data PlinkFamEntry PlinkFamEntry :: String -> String -> String -> String -> Sex -> String -> PlinkFamEntry [_famFamilyID] :: PlinkFamEntry -> String [_famIndividualID] :: PlinkFamEntry -> String [_famFatherID] :: PlinkFamEntry -> String [_famMotherID] :: PlinkFamEntry -> String [_famSexCode] :: PlinkFamEntry -> Sex [_famPhenotype] :: PlinkFamEntry -> String plinkFam2EigenstratInd :: PlinkPopNameMode -> PlinkFamEntry -> EigenstratIndEntry eigenstratInd2PlinkFam :: PlinkPopNameMode -> EigenstratIndEntry -> PlinkFamEntry data PlinkPopNameMode PlinkPopNameAsFamily :: PlinkPopNameMode PlinkPopNameAsPhenotype :: PlinkPopNameMode PlinkPopNameAsBoth :: PlinkPopNameMode instance GHC.Show.Show SequenceFormats.Plink.PlinkFamEntry instance GHC.Classes.Eq SequenceFormats.Plink.PlinkFamEntry instance GHC.Show.Show SequenceFormats.Plink.PlinkPopNameMode instance GHC.Classes.Eq SequenceFormats.Plink.PlinkPopNameMode 
module SequenceFormats.Bed data BedEntry BedEntry :: Chrom -> Int -> Int -> BedEntry bedFileParser :: Parser BedEntry readBedFile :: MonadSafe m => FilePath -> Producer BedEntry m () data IntervalStatus BedBehind :: IntervalStatus BedOn :: IntervalStatus BedAhead :: IntervalStatus filterThroughBed :: Monad m => Producer BedEntry m () -> (a -> (Chrom, Int)) -> Producer a m () -> Producer a m () instance GHC.Classes.Eq SequenceFormats.Bed.BedEntry instance GHC.Show.Show SequenceFormats.Bed.BedEntry -- | A module to help with parsing VCF files. The VCF format is defined -- here: https://en.wikipedia.org/wiki/Variant_Call_Format module SequenceFormats.VCF -- | A datatype to represent the VCF Header. Most comments are simply -- parsed as entire lines, but the very last comment line, containing the -- sample names, is separated out data VCFheader VCFheader :: [String] -> [String] -> VCFheader -- | A list containing all comment lines starting with ## [vcfHeaderComments] :: VCFheader -> [String] -- | The list of sample names parsed from the last comment line, which -- starts with a single # [vcfSampleNames] :: VCFheader -> [String] -- | A Datatype representing a single VCF entry. data VCFentry VCFentry :: Chrom -> Int -> Maybe ByteString -> ByteString -> [ByteString] -> Double -> Maybe ByteString -> [ByteString] -> [ByteString] -> [[ByteString]] -> VCFentry -- | The chromosome [vcfChrom] :: VCFentry -> Chrom -- | The position [vcfPos] :: VCFentry -> Int -- | The SNP ID if non-missing [vcfId] :: VCFentry -> Maybe ByteString -- | The reference allele (supports also multi-character alleles for -- Indels) [vcfRef] :: VCFentry -> ByteString -- | The alternative alleles, each one possibly consisting of multiple -- characters [vcfAlt] :: VCFentry -> [ByteString] -- | The quality value [vcfQual] :: VCFentry -> Double -- | The Filter value, if non-missing. 
[vcfFilter] :: VCFentry -> Maybe ByteString -- | A list of Info fields [vcfInfo] :: VCFentry -> [ByteString] -- | A list of format tags [vcfFormatString] :: VCFentry -> [ByteString] -- | A list of format fields for each sample. [vcfGenotypeInfo] :: VCFentry -> [[ByteString]] -- | Reading a VCF from StdIn. Returns a VCFHeader and a Producer over -- VCFentries. readVCFfromStdIn :: (MonadIO m, MonadThrow m) => m (VCFheader, Producer VCFentry m ()) -- | Reading a VCF from a file. Returns a VCFHeader and a Producer over -- VCFentries. readVCFfromFile :: MonadSafe m => FilePath -> m (VCFheader, Producer VCFentry m ()) -- | reads a VCFheader and VCFentries from a text producer. readVCFfromProd :: MonadThrow m => Producer ByteString m () -> m (VCFheader, Producer VCFentry m ()) -- | Extracts the genotype fields (for each sample) from a VCF entry getGenotypes :: VCFentry -> Either String [ByteString] -- | Extracts the dosages (the sum of non-reference alleles) per sample -- (returns a Left Error if it fails.) getDosages :: VCFentry -> Either String [Maybe Int] -- | returns True if the SNP is a biallelic Transversion SNP (i.e. one of -- GT, GC, AT, AC) isTransversionSnp :: ByteString -> [ByteString] -> Bool -- | Converts a VCFentry to the simpler FreqSum format (returns a Left -- Error if it fails.) vcfToFreqSumEntry :: VCFentry -> Either String FreqSumEntry -- | returns True if the SNP is biallelic. 
isBiallelicSnp :: ByteString -> [ByteString] -> Bool instance GHC.Show.Show SequenceFormats.VCF.VCFheader instance GHC.Classes.Eq SequenceFormats.VCF.VCFentry instance GHC.Show.Show SequenceFormats.VCF.VCFentry module SequenceFormats.Genomic class Genomic a genomicPosition :: Genomic a => a -> (Chrom, Int) genomicChrom :: Genomic a => a -> Chrom genomicBase :: Genomic a => a -> Int chromFilter :: Genomic e => [Chrom] -> e -> Bool genomicFilterThroughBed :: (Monad m, Genomic e) => Producer BedEntry m () -> Producer e m () -> Producer e m () instance SequenceFormats.Genomic.Genomic SequenceFormats.Eigenstrat.EigenstratSnpEntry instance SequenceFormats.Genomic.Genomic SequenceFormats.FreqSum.FreqSumEntry instance SequenceFormats.Genomic.Genomic SequenceFormats.Pileup.PileupRow instance SequenceFormats.Genomic.Genomic SequenceFormats.VCF.VCFentry