/* synced_bcf_reader.h -- stream through multiple VCF files. Copyright (C) 2012-2014 Genome Research Ltd. Author: Petr Danecek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* The synced_bcf_reader allows to keep multiple VCFs open and stream them using the next_line iterator in a seamless matter without worrying about chromosomes and synchronizing the sites. This is used by vcfcheck to compare multiple VCFs simultaneously and is used also for merging, creating intersections, etc. The synced_bcf_reader also provides API for reading indexed BCF/VCF, hiding differences in BCF/VCF opening, indexing and reading. Example of usage: bcf_srs_t *sr = bcf_sr_init(); for (i=0; ihas_line[i] #define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL) #define bcf_sr_swap_line(_readers, i, lieu) { bcf1_t *tmp = lieu; lieu = (_readers)->readers[i].buffer[0]; (_readers)->readers[i].buffer[0] = tmp; } #define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0) #define bcf_sr_get_header(_readers, i) (_readers)->readers[i].header #define bcf_sr_get_reader(_readers, i) &((_readers)->readers[i]) /** * bcf_sr_seek() - set all readers to selected position * @seq: sequence name; NULL to seek to start * @pos: 0-based coordinate */ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); /** * bcf_sr_set_samples() - sets active samples * @readers: holder of the open readers * @samples: this can be one of: file name with one sample per line; * or column-separated list of samples; or '-' for a list of * samples shared by all files. If first character is the * exclamation mark, all but the listed samples are included. * @is_file: 0: list of samples; 1: file with sample names * * Returns 1 if the call succeeded, or 0 on error. */ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); /** * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions * @readers: holder of the open readers * @targets: list of regions, one-based and inclusive. * @is_fname: 0: targets is a comma-separated list of regions (chr,chr:from-to) * 1: targets is a tabix indexed file with a list of regions * ( or ) * * Returns 0 if the call succeeded, or -1 on error. * * Both functions behave the same way, unlisted positions will be skipped by * bcf_sr_next_line(). However, there is an important difference: regions use * index to jump to desired positions while targets streams the whole files * and merely skip unlisted positions. * * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which * is intepreted as a 1-based column index in the tab-delimited file where * alleles are listed. This in principle enables to perform the COLLAPSE_* * logic also with tab-delimited files. However, the current implementation * considers the alleles merely as a suggestion for prioritizing one of possibly * duplicate VCF lines. It is up to the caller to examine targets->als if * perfect match is sought after. Note that the duplicate positions in targets * file are currently not supported. * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. */ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); /* * bcf_sr_regions_init() * @regions: regions can be either a comma-separated list of regions * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or * tab-delimited file (the default). Uncompressed files * are stored in memory while bgzip-compressed and tabix-indexed * region files are streamed. * @is_file: 0: regions is a comma-separated list of regions * (chr|chr:pos|chr:from-to|chr:from-) * 1: VCF, BED or tab-delimited file * @chr, from, to: * Column indexes of chromosome, start position and end position * in the tab-delimited file. The positions are 1-based and * inclusive. * These parameters are ignored when reading from VCF, BED or * tabix-indexed files. When end position column is not present, * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. */ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to); void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); /* * bcf_sr_regions_seek() - seek to the chromosome block * * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and * reg->start,reg->end to -1. */ int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); /* * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1 * when all regions have been read. The fields reg->seq, reg->start and * reg->end are filled with the genomic coordinates on succes or with * NULL,-1,-1 when no region is available. The coordinates are 0-based, * inclusive. */ int bcf_sr_regions_next(bcf_sr_regions_t *reg); /* * bcf_sr_regions_overlap() - checks if the interval overlaps any of * the regions, the coordinates are 0-based, inclusive. The coordinate queries * must come in ascending order. * * Returns 0 if the position is in regions; -1 if the position is not in the * regions and more regions exist; -2 if not in the regions and there are no more * regions left. */ int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end); /* * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until * all remaining records are processed. */ void bcf_sr_regions_flush(bcf_sr_regions_t *regs); #ifdef __cplusplus } #endif #endif