/* Copyright (c) 2013 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger Institute nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include "cram/sam_header.h" #include "cram/string_alloc.h" static void sam_hdr_error(char *msg, char *line, int len, int lno) { int j; for (j = 0; j < len && line[j] != '\n'; j++) ; fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line); } void sam_hdr_dump(SAM_hdr *hdr) { khint_t k; int i; printf("===DUMP===\n"); for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { SAM_hdr_type *t1, *t2; char c[2]; if (!kh_exist(hdr->h, k)) continue; t1 = t2 = kh_val(hdr->h, k); c[0] = kh_key(hdr->h, k)>>8; c[1] = kh_key(hdr->h, k)&0xff; printf("Type %.2s, count %d\n", c, t1->prev->order+1); do { SAM_hdr_tag *tag; printf(">>>%d ", t1->order); for (tag = t1->tag; tag; tag=tag->next) { printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3); } putchar('\n'); t1 = t1->next; } while (t1 != t2); } /* Dump out PG chains */ printf("\n@PG chains:\n"); for (i = 0; i < hdr->npg_end; i++) { int j; printf(" %d:", i); for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) { printf("%s%d(%.*s)", j == hdr->pg_end[i] ? " " : "->", j, hdr->pg[j].name_len, hdr->pg[j].name); } printf("\n"); } puts("===END DUMP==="); } /* Updates the hash tables in the SAM_hdr structure. * * Returns 0 on success; * -1 on failure */ static int sam_hdr_update_hashes(SAM_hdr *sh, int type, SAM_hdr_type *h_type) { /* Add to reference hash? */ if ((type>>8) == 'S' && (type&0xff) == 'Q') { SAM_hdr_tag *tag; SAM_SQ *new_ref; int nref = sh->nref; new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref)); if (!new_ref) return -1; sh->ref = new_ref; tag = h_type->tag; sh->ref[nref].name = NULL; sh->ref[nref].len = 0; sh->ref[nref].ty = h_type; sh->ref[nref].tag = tag; while (tag) { if (tag->str[0] == 'S' && tag->str[1] == 'N') { if (!(sh->ref[nref].name = malloc(tag->len))) return -1; strncpy(sh->ref[nref].name, tag->str+3, tag->len-3); sh->ref[nref].name[tag->len-3] = 0; } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { sh->ref[nref].len = atoi(tag->str+3); } tag = tag->next; } if (sh->ref[nref].name) { khint_t k; int r; k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r); if (-1 == r) return -1; kh_val(sh->ref_hash, k) = nref; } else { return -1; // SN should be present, according to spec. } sh->nref++; } /* Add to read-group hash? */ if ((type>>8) == 'R' && (type&0xff) == 'G') { SAM_hdr_tag *tag; SAM_RG *new_rg; int nrg = sh->nrg; new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg)); if (!new_rg) return -1; sh->rg = new_rg; tag = h_type->tag; sh->rg[nrg].name = NULL; sh->rg[nrg].name_len = 0; sh->rg[nrg].ty = h_type; sh->rg[nrg].tag = tag; sh->rg[nrg].id = nrg; while (tag) { if (tag->str[0] == 'I' && tag->str[1] == 'D') { if (!(sh->rg[nrg].name = malloc(tag->len))) return -1; strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3); sh->rg[nrg].name[tag->len-3] = 0; sh->rg[nrg].name_len = strlen(sh->rg[nrg].name); } tag = tag->next; } if (sh->rg[nrg].name) { khint_t k; int r; k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r); if (-1 == r) return -1; kh_val(sh->rg_hash, k) = nrg; } else { return -1; // ID should be present, according to spec. } sh->nrg++; } /* Add to program hash? */ if ((type>>8) == 'P' && (type&0xff) == 'G') { SAM_hdr_tag *tag; SAM_PG *new_pg; int npg = sh->npg; new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg)); if (!new_pg) return -1; sh->pg = new_pg; tag = h_type->tag; sh->pg[npg].name = NULL; sh->pg[npg].name_len = 0; sh->pg[npg].ty = h_type; sh->pg[npg].tag = tag; sh->pg[npg].id = npg; sh->pg[npg].prev_id = -1; while (tag) { if (tag->str[0] == 'I' && tag->str[1] == 'D') { if (!(sh->pg[npg].name = malloc(tag->len))) return -1; strncpy(sh->pg[npg].name, tag->str+3, tag->len-3); sh->pg[npg].name[tag->len-3] = 0; sh->pg[npg].name_len = strlen(sh->pg[npg].name); } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { // Resolve later if needed khint_t k; char tmp = tag->str[tag->len]; tag->str[tag->len] = 0; k = kh_get(m_s2i, sh->pg_hash, tag->str+3); tag->str[tag->len] = tmp; if (k != kh_end(sh->pg_hash)) { int p_id = kh_val(sh->pg_hash, k); sh->pg[npg].prev_id = sh->pg[p_id].id; /* Unmark previous entry as a PG termination */ if (sh->npg_end > 0 && sh->pg_end[sh->npg_end-1] == p_id) { sh->npg_end--; } else { int i; for (i = 0; i < sh->npg_end; i++) { if (sh->pg_end[i] == p_id) { memmove(&sh->pg_end[i], &sh->pg_end[i+1], (sh->npg_end-i-1)*sizeof(*sh->pg_end)); sh->npg_end--; } } } } else { sh->pg[npg].prev_id = -1; } } tag = tag->next; } if (sh->pg[npg].name) { khint_t k; int r; k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r); if (-1 == r) return -1; kh_val(sh->pg_hash, k) = npg; } else { return -1; // ID should be present, according to spec. } /* Add to npg_end[] array. Remove later if we find a PP line */ if (sh->npg_end >= sh->npg_end_alloc) { int *new_pg_end; int new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4; new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int)); if (!new_pg_end) return -1; sh->npg_end_alloc = new_alloc; sh->pg_end = new_pg_end; } sh->pg_end[sh->npg_end++] = npg; sh->npg++; } return 0; } /* * Appends a formatted line to an existing SAM header. * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with * optional new-line. If it contains more than 1 line then multiple lines * will be added in order. * * Input text is of maximum length len or as terminated earlier by a NUL. * Len may be 0 if unknown, in which case lines must be NUL-terminated. * * Returns 0 on success * -1 on failure */ int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) { int i, lno, text_offset; char *hdr; if (!len) len = strlen(lines); text_offset = ks_len(&sh->text); if (EOF == kputsn(lines, len, &sh->text)) return -1; hdr = ks_str(&sh->text) + text_offset; for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) { khint32_t type; khint_t k; int l_start = i, new; SAM_hdr_type *h_type; SAM_hdr_tag *h_tag, *last; if (hdr[i] != '@') { int j; for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) ; sam_hdr_error("Header line does not start with '@'", &hdr[l_start], len - l_start, lno); return -1; } type = (hdr[i+1]<<8) | hdr[i+2]; if (hdr[i+1] < 'A' || hdr[i+1] > 'z' || hdr[i+2] < 'A' || hdr[i+2] > 'z') { sam_hdr_error("Header line does not have a two character key", &hdr[l_start], len - l_start, lno); return -1; } i += 3; if (hdr[i] == '\n') continue; // Add the header line type if (!(h_type = pool_alloc(sh->type_pool))) return -1; if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new))) return -1; // Form the ring, either with self or other lines of this type if (!new) { SAM_hdr_type *t = kh_val(sh->h, k), *p; p = t->prev; assert(p->next == t); p->next = h_type; h_type->prev = p; t->prev = h_type; h_type->next = t; h_type->order = p->order+1; } else { kh_val(sh->h, k) = h_type; h_type->prev = h_type->next = h_type; h_type->order = 0; } // Parse the tags on this line last = NULL; if ((type>>8) == 'C' && (type&0xff) == 'O') { int j; if (hdr[i] != '\t') { sam_hdr_error("Missing tab", &hdr[l_start], len - l_start, lno); return -1; } for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++) ; if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool))) return -1; h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); h_tag->len = j-i; h_tag->next = NULL; if (!h_tag->str) return -1; i = j; } else { do { int j; if (hdr[i] != '\t') { sam_hdr_error("Missing tab", &hdr[l_start], len - l_start, lno); return -1; } for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++) ; if (!(h_tag = pool_alloc(sh->tag_pool))) return -1; h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i); h_tag->len = j-i; h_tag->next = NULL; if (!h_tag->str) return -1; if (h_tag->len < 3 || h_tag->str[2] != ':') { sam_hdr_error("Malformed key:value pair", &hdr[l_start], len - l_start, lno); return -1; } if (last) last->next = h_tag; else h_type->tag = h_tag; last = h_tag; i = j; } while (i < len && hdr[i] != '\0' && hdr[i] != '\n'); } /* Update RG/SQ hashes */ if (-1 == sam_hdr_update_hashes(sh, type, h_type)) return -1; } return 0; } /* * Adds a single line to a SAM header. * Specify type and one or more key,value pairs, ending with the NULL key. * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL). * * Returns index for specific entry on success (eg 2nd SQ, 4th RG) * -1 on failure */ int sam_hdr_add(SAM_hdr *sh, const char *type, ...) { va_list args; va_start(args, type); return sam_hdr_vadd(sh, type, args, NULL); } /* * sam_hdr_add with a va_list interface. * * Note: this function invokes va_arg at least once, making the value * of ap indeterminate after the return. The caller should call * va_start/va_end before/after calling this function or use va_copy. */ int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) { va_list args; SAM_hdr_type *h_type; SAM_hdr_tag *h_tag, *last; int new; khint32_t type_i = (type[0]<<8) | type[1], k; if (EOF == kputc_('@', &sh->text)) return -1; if (EOF == kputsn(type, 2, &sh->text)) return -1; if (!(h_type = pool_alloc(sh->type_pool))) return -1; if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new))) return -1; // Form the ring, either with self or other lines of this type if (!new) { SAM_hdr_type *t = kh_val(sh->h, k), *p; p = t->prev; assert(p->next == t); p->next = h_type; h_type->prev = p; t->prev = h_type; h_type->next = t; h_type->order = p->order + 1; } else { kh_val(sh->h, k) = h_type; h_type->prev = h_type->next = h_type; h_type->order = 0; } last = NULL; // Any ... varargs va_start(args, ap); for (;;) { char *k, *v; int idx; if (!(k = (char *)va_arg(args, char *))) break; v = va_arg(args, char *); if (EOF == kputc_('\t', &sh->text)) return -1; if (!(h_tag = pool_alloc(sh->tag_pool))) return -1; idx = ks_len(&sh->text); if (EOF == kputs(k, &sh->text)) return -1; if (EOF == kputc_(':', &sh->text)) return -1; if (EOF == kputs(v, &sh->text)) return -1; h_tag->len = ks_len(&sh->text) - idx; h_tag->str = string_ndup(sh->str_pool, ks_str(&sh->text) + idx, h_tag->len); h_tag->next = NULL; if (!h_tag->str) return -1; if (last) last->next = h_tag; else h_type->tag = h_tag; last = h_tag; } va_end(args); // Plus the specified va_list params for (;;) { char *k, *v; int idx; if (!(k = (char *)va_arg(ap, char *))) break; v = va_arg(ap, char *); if (EOF == kputc_('\t', &sh->text)) return -1; if (!(h_tag = pool_alloc(sh->tag_pool))) return -1; idx = ks_len(&sh->text); if (EOF == kputs(k, &sh->text)) return -1; if (EOF == kputc_(':', &sh->text)) return -1; if (EOF == kputs(v, &sh->text)) return -1; h_tag->len = ks_len(&sh->text) - idx; h_tag->str = string_ndup(sh->str_pool, ks_str(&sh->text) + idx, h_tag->len); h_tag->next = NULL; if (!h_tag->str) return -1; if (last) last->next = h_tag; else h_type->tag = h_tag; last = h_tag; } va_end(ap); if (EOF == kputc('\n', &sh->text)) return -1; int itype = (type[0]<<8) | type[1]; if (-1 == sam_hdr_update_hashes(sh, itype, h_type)) return -1; return h_type->order; } /* * Returns the first header item matching 'type'. If ID is non-NULL it checks * for the tag ID: and compares against the specified ID. * * Returns NULL if no type/ID is found */ SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type, char *ID_key, char *ID_value) { SAM_hdr_type *t1, *t2; int itype = (type[0]<<8)|(type[1]); khint_t k; /* Special case for types we have prebuilt hashes on */ if (ID_key) { if (type[0] == 'S' && type[1] == 'Q' && ID_key[0] == 'S' && ID_key[1] == 'N') { k = kh_get(m_s2i, hdr->ref_hash, ID_value); return k != kh_end(hdr->ref_hash) ? hdr->ref[kh_val(hdr->ref_hash, k)].ty : NULL; } if (type[0] == 'R' && type[1] == 'G' && ID_key[0] == 'I' && ID_key[1] == 'D') { k = kh_get(m_s2i, hdr->rg_hash, ID_value); return k != kh_end(hdr->rg_hash) ? hdr->rg[kh_val(hdr->rg_hash, k)].ty : NULL; } if (type[0] == 'P' && type[1] == 'G' && ID_key[0] == 'I' && ID_key[1] == 'D') { k = kh_get(m_s2i, hdr->pg_hash, ID_value); return k != kh_end(hdr->pg_hash) ? hdr->pg[kh_val(hdr->pg_hash, k)].ty : NULL; } } k = kh_get(sam_hdr, hdr->h, itype); if (k == kh_end(hdr->h)) return NULL; if (!ID_key) return kh_val(hdr->h, k); t1 = t2 = kh_val(hdr->h, k); do { SAM_hdr_tag *tag; for (tag = t1->tag; tag; tag = tag->next) { if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) { char *cp1 = tag->str+3; char *cp2 = ID_value; while (*cp1 && *cp1 == *cp2) cp1++, cp2++; if (*cp2 || *cp1) continue; return t1; } } t1 = t1->next; } while (t1 != t2); return NULL; } /* * As per SAM_hdr_type, but returns a complete line of formatted text * for a specific head type/ID combination. If ID is NULL then it returns * the first line of the specified type. * * The returned string is malloced and should be freed by the calling * function with free(). * * Returns NULL if no type/ID is found. */ char *sam_hdr_find_line(SAM_hdr *hdr, char *type, char *ID_key, char *ID_value) { SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value); kstring_t ks = KS_INITIALIZER; SAM_hdr_tag *tag; int r = 0; if (!ty) return NULL; // Paste together the line from the hashed copy r |= (kputc_('@', &ks) == EOF); r |= (kputs(type, &ks) == EOF); for (tag = ty->tag; tag; tag = tag->next) { r |= (kputc_('\t', &ks) == EOF); r |= (kputsn(tag->str, tag->len, &ks) == EOF); } if (r) { KS_FREE(&ks); return NULL; } return ks_str(&ks); } /* * Looks for a specific key in a single sam header line. * If prev is non-NULL it also fills this out with the previous tag, to * permit use in key removal. *prev is set to NULL when the tag is the first * key in the list. When a tag isn't found, prev (if non NULL) will be the last * tag in the existing list. * * Returns the tag pointer on success * NULL on failure */ SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh, SAM_hdr_type *type, char *key, SAM_hdr_tag **prev) { SAM_hdr_tag *tag, *p = NULL; for (tag = type->tag; tag; p = tag, tag = tag->next) { if (tag->str[0] == key[0] && tag->str[1] == key[1]) { if (prev) *prev = p; return tag; } } if (prev) *prev = p; return NULL; } /* * Adds or updates tag key,value pairs in a header line. * Eg for adding M5 tags to @SQ lines or updating sort order for the * @HD line (although use the sam_hdr_sort_order() function for * HD manipulation, which is a wrapper around this funuction). * * Specify multiple key,value pairs ending in NULL. * * Returns 0 on success * -1 on failure */ int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) { va_list ap; va_start(ap, type); for (;;) { char *k, *v; int idx; SAM_hdr_tag *tag, *prev; if (!(k = (char *)va_arg(ap, char *))) break; v = va_arg(ap, char *); tag = sam_hdr_find_key(hdr, type, k, &prev); if (!tag) { if (!(tag = pool_alloc(hdr->tag_pool))) return -1; if (prev) prev->next = tag; else type->tag = tag; tag->next = NULL; } idx = ks_len(&hdr->text); if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0) return -1; tag->len = ks_len(&hdr->text) - idx; tag->str = string_ndup(hdr->str_pool, ks_str(&hdr->text) + idx, tag->len); if (!tag->str) return -1; } va_end(ap); return 0; } #define K(a) (((a)[0]<<8)|((a)[1])) /* * Returns the sort order: */ enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) { return hdr->sort_order; } static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) { khint_t k; enum sam_sort_order so; so = ORDER_UNKNOWN; k = kh_get(sam_hdr, hdr->h, K("HD")); if (k != kh_end(hdr->h)) { SAM_hdr_type *ty = kh_val(hdr->h, k); SAM_hdr_tag *tag; for (tag = ty->tag; tag; tag = tag->next) { if (tag->str[0] == 'S' && tag->str[1] == 'O') { if (strcmp(tag->str+3, "unsorted") == 0) so = ORDER_UNSORTED; else if (strcmp(tag->str+3, "queryname") == 0) so = ORDER_NAME; else if (strcmp(tag->str+3, "coordinate") == 0) so = ORDER_COORD; else if (strcmp(tag->str+3, "unknown") != 0) fprintf(stderr, "Unknown sort order field: %s\n", tag->str+3); } } } return so; } /* * Reconstructs the kstring from the header hash table. * Returns 0 on success * -1 on failure */ int sam_hdr_rebuild(SAM_hdr *hdr) { /* Order: HD then others */ kstring_t ks = KS_INITIALIZER; khint_t k; k = kh_get(sam_hdr, hdr->h, K("HD")); if (k != kh_end(hdr->h)) { SAM_hdr_type *ty = kh_val(hdr->h, k); SAM_hdr_tag *tag; if (EOF == kputs("@HD", &ks)) return -1; for (tag = ty->tag; tag; tag = tag->next) { if (EOF == kputc_('\t', &ks)) return -1; if (EOF == kputsn_(tag->str, tag->len, &ks)) return -1; } if (EOF == kputc('\n', &ks)) return -1; } for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) { SAM_hdr_type *t1, *t2; if (!kh_exist(hdr->h, k)) continue; if (kh_key(hdr->h, k) == K("HD")) continue; t1 = t2 = kh_val(hdr->h, k); do { SAM_hdr_tag *tag; char c[2]; if (EOF == kputc_('@', &ks)) return -1; c[0] = kh_key(hdr->h, k)>>8; c[1] = kh_key(hdr->h, k)&0xff; if (EOF == kputsn_(c, 2, &ks)) return -1; for (tag = t1->tag; tag; tag=tag->next) { if (EOF == kputc_('\t', &ks)) return -1; if (EOF == kputsn_(tag->str, tag->len, &ks)) return -1; } if (EOF == kputc('\n', &ks)) return -1; t1 = t1->next; } while (t1 != t2); } if (ks_str(&hdr->text)) KS_FREE(&hdr->text); hdr->text = ks; return 0; } /* * Creates an empty SAM header, ready to be populated. * * Returns a SAM_hdr struct on success (free with sam_hdr_free()) * NULL on failure */ SAM_hdr *sam_hdr_new() { SAM_hdr *sh = calloc(1, sizeof(*sh)); if (!sh) return NULL; sh->h = kh_init(sam_hdr); if (!sh->h) goto err; sh->ID_cnt = 1; sh->ref_count = 1; sh->nref = 0; sh->ref = NULL; if (!(sh->ref_hash = kh_init(m_s2i))) goto err; sh->nrg = 0; sh->rg = NULL; if (!(sh->rg_hash = kh_init(m_s2i))) goto err; sh->npg = 0; sh->pg = NULL; sh->npg_end = sh->npg_end_alloc = 0; sh->pg_end = NULL; if (!(sh->pg_hash = kh_init(m_s2i))) goto err; KS_INIT(&sh->text); if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag)))) goto err; if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type)))) goto err; if (!(sh->str_pool = string_pool_create(8192))) goto err; return sh; err: if (sh->h) kh_destroy(sam_hdr, sh->h); if (sh->tag_pool) pool_destroy(sh->tag_pool); if (sh->type_pool) pool_destroy(sh->type_pool); if (sh->str_pool) string_pool_destroy(sh->str_pool); free(sh); return NULL; } /* * Tokenises a SAM header into a hash table. * Also extracts a few bits on specific data types, such as @RG lines. * * Returns a SAM_hdr struct on success (free with sam_hdr_free()) * NULL on failure */ SAM_hdr *sam_hdr_parse_(const char *hdr, int len) { /* Make an empty SAM_hdr */ SAM_hdr *sh; sh = sam_hdr_new(); if (NULL == sh) return NULL; if (NULL == hdr) return sh; // empty header is permitted /* Parse the header, line by line */ if (-1 == sam_hdr_add_lines(sh, hdr, len)) { sam_hdr_free(sh); return NULL; } /* Obtain sort order */ sh->sort_order = sam_hdr_parse_sort_order(sh); //sam_hdr_dump(sh); //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL); //sam_hdr_rebuild(sh); //printf(">>%s<<", ks_str(sh->text)); //parse_references(sh); //parse_read_groups(sh); sam_hdr_link_pg(sh); //sam_hdr_dump(sh); return sh; } /* * Produces a duplicate copy of hdr and returns it. * Returns NULL on failure */ SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) { if (-1 == sam_hdr_rebuild(hdr)) return NULL; return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr)); } /*! Increments a reference count on hdr. * * This permits multiple files to share the same header, all calling * sam_hdr_free when done, without causing errors for other open files. */ void sam_hdr_incr_ref(SAM_hdr *hdr) { hdr->ref_count++; } /*! Increments a reference count on hdr. * * This permits multiple files to share the same header, all calling * sam_hdr_free when done, without causing errors for other open files. * * If the reference count hits zero then the header is automatically * freed. This makes it a synonym for sam_hdr_free(). */ void sam_hdr_decr_ref(SAM_hdr *hdr) { sam_hdr_free(hdr); } /*! Deallocates all storage used by a SAM_hdr struct. * * This also decrements the header reference count. If after decrementing * it is still non-zero then the header is assumed to be in use by another * caller and the free is not done. * * This is a synonym for sam_hdr_dec_ref(). */ void sam_hdr_free(SAM_hdr *hdr) { if (!hdr) return; if (--hdr->ref_count > 0) return; if (ks_str(&hdr->text)) KS_FREE(&hdr->text); if (hdr->h) kh_destroy(sam_hdr, hdr->h); if (hdr->ref_hash) kh_destroy(m_s2i, hdr->ref_hash); if (hdr->ref) { int i; for (i = 0; i < hdr->nref; i++) if (hdr->ref[i].name) free(hdr->ref[i].name); free(hdr->ref); } if (hdr->rg_hash) kh_destroy(m_s2i, hdr->rg_hash); if (hdr->rg) { int i; for (i = 0; i < hdr->nrg; i++) if (hdr->rg[i].name) free(hdr->rg[i].name); free(hdr->rg); } if (hdr->pg_hash) kh_destroy(m_s2i, hdr->pg_hash); if (hdr->pg) { int i; for (i = 0; i < hdr->npg; i++) if (hdr->pg[i].name) free(hdr->pg[i].name); free(hdr->pg); } if (hdr->pg_end) free(hdr->pg_end); if (hdr->type_pool) pool_destroy(hdr->type_pool); if (hdr->tag_pool) pool_destroy(hdr->tag_pool); if (hdr->str_pool) string_pool_destroy(hdr->str_pool); free(hdr); } int sam_hdr_length(SAM_hdr *hdr) { return ks_len(&hdr->text); } char *sam_hdr_str(SAM_hdr *hdr) { return ks_str(&hdr->text); } /* * Looks up a reference sequence by name and returns the numerical ID. * Returns -1 if unknown reference. */ int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) { khint_t k = kh_get(m_s2i, hdr->ref_hash, ref); return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k); } /* * Looks up a read-group by name and returns a pointer to the start of the * associated tag list. * * Returns NULL on failure */ SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) { khint_t k = kh_get(m_s2i, hdr->rg_hash, rg); return k == kh_end(hdr->rg_hash) ? NULL : &hdr->rg[kh_val(hdr->rg_hash, k)]; } /* * Fixes any PP links in @PG headers. * If the entries are in order then this doesn't need doing, but incase * our header is out of order this goes through the sh->pg[] array * setting the prev_id field. * * Note we can have multiple complete chains. This code should identify the * tails of these chains as these are the entries we have to link to in * subsequent PP records. * * Returns 0 on sucess * -1 on failure (indicating broken PG/PP records) */ int sam_hdr_link_pg(SAM_hdr *hdr) { int i, j, ret = 0; hdr->npg_end_alloc = hdr->npg; hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end)); if (!hdr->pg_end) return -1; for (i = 0; i < hdr->npg; i++) hdr->pg_end[i] = i; for (i = 0; i < hdr->npg; i++) { khint_t k; SAM_hdr_tag *tag; char tmp; for (tag = hdr->pg[i].tag; tag; tag = tag->next) { if (tag->str[0] == 'P' && tag->str[1] == 'P') break; } if (!tag) { /* Chain start points */ continue; } tmp = tag->str[tag->len]; tag->str[tag->len] = 0; k = kh_get(m_s2i, hdr->pg_hash, tag->str+3); tag->str[tag->len] = tmp; if (k == kh_end(hdr->pg_hash)) { ret = -1; continue; } hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id; hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1; } for (i = j = 0; i < hdr->npg; i++) { if (hdr->pg_end[i] != -1) hdr->pg_end[j++] = hdr->pg_end[i]; } hdr->npg_end = j; return ret; } /* * Returns a unique ID from a base name. * * The value returned is valid until the next call to * this function. */ const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) { khint_t k = kh_get(m_s2i, sh->pg_hash, name); if (k == kh_end(sh->pg_hash)) return name; do { sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++); k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf); } while (k != kh_end(sh->pg_hash)); return sh->ID_buf; } /* * Add an @PG line. * * If we wish complete control over this use sam_hdr_add() directly. This * function uses that, but attempts to do a lot of tedious house work for * you too. * * - It will generate a suitable ID if the supplied one clashes. * - It will generate multiple @PG records if we have multiple PG chains. * * Call it as per sam_hdr_add() with a series of key,value pairs ending * in NULL. * * Returns 0 on success * -1 on failure */ int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) { va_list args; if (sh->npg_end) { /* Copy ends array to avoid us looping while modifying it */ int *end = malloc(sh->npg_end * sizeof(int)); int i, nends = sh->npg_end; if (!end) return -1; memcpy(end, sh->pg_end, nends * sizeof(*end)); for (i = 0; i < nends; i++) { va_start(args, name); if (-1 == sam_hdr_vadd(sh, "PG", args, "ID", sam_hdr_PG_ID(sh, name), "PN", name, "PP", sh->pg[end[i]].name, NULL)) { free(end); return -1; } va_end(args); } free(end); } else { va_start(args, name); if (-1 == sam_hdr_vadd(sh, "PG", args, "ID", sam_hdr_PG_ID(sh, name), "PN", name, NULL)) return -1; va_end(args); } //sam_hdr_dump(sh); return 0; } /* * A function to help with construction of CL tags in @PG records. * Takes an argc, argv pair and returns a single space-separated string. * This string should be deallocated by the calling function. * * Returns malloced char * on success * NULL on failure */ char *stringify_argv(int argc, char *argv[]) { char *str, *cp; size_t nbytes = 1; int i, j; /* Allocate */ for (i = 0; i < argc; i++) { nbytes += strlen(argv[i]) + 1; } if (!(str = malloc(nbytes))) return NULL; /* Copy */ cp = str; for (i = 0; i < argc; i++) { j = 0; while (argv[i][j]) { if (argv[i][j] == '\t') *cp++ = ' '; else *cp++ = argv[i][j]; j++; } *cp++ = ' '; } *cp++ = 0; return str; }