From 3c90481d2a552e3b476694184a3e35e983eea2a5 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 21 Oct 2024 12:11:02 +0200 Subject: [PATCH] Add support for matching VCF lines by ID (PR #1844) See https://github.com/samtools/bcftools/issues/1739 --- bcf_sr_sort.c | 16 +++++++++++++--- bcf_sr_sort.h | 12 ++++++++++-- htslib/synced_bcf_reader.h | 3 ++- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index 01e98bb39..73be004c9 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2021,2024 Genome Research Ltd. Author: Petr Danecek @@ -32,6 +32,7 @@ #include "htslib/khash_str2int.h" #include "htslib/kbitset.h" +// Variant types and pair-wise compatibility of their combinations, see bcf_sr_init_scores() #define SR_REF 1 #define SR_SNP 2 #define SR_INDEL 4 @@ -366,7 +367,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, // group VCFs into groups, each with a unique combination of variants in the duplicate lines int ireader,ivar,irec,igrp,ivset,iact; for (ireader=0; ireadernreaders; ireader++) srt->vcf_buf[ireader].nrec = 0; - for (iact=0; iactnactive; iact++) + for (iact=0; iactnactive; iact++) // process each of the active readers, ie which still have a record to process { ireader = srt->active[iact]; bcf_sr_t *reader = &readers->readers[ireader]; @@ -384,6 +385,11 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, srt->off[srt->noff++] = srt->str.l; size_t beg = srt->str.l; int end_pos = -1; + if ( srt->pair & BCF_SR_PAIR_ID ) + { + kputs(line->d.id,&srt->str); + kputc(':',&srt->str); + } for (ivar=1; ivarn_allele; ivar++) { if ( ivar>1 ) kputc(',',&srt->str); @@ -417,7 +423,10 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } // Create new variant or attach to existing one. But careful, there can be duplicate - // records with the same POS,REF,ALT (e.g. in dbSNP-b142) + // records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, use a + // hash table (srt->var_str2int) and a counter (var_idx) to ensure they are + // treated as separate variants, while still allowing them to be matched + // between readers. char *var_str = beg + srt->str.s; int ret, var_idx = 0, var_end = srt->str.l; while ( 1 ) @@ -435,6 +444,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } if ( ret==-1 ) { + // the variant is not present, insert ivar = srt->nvar++; hts_expand0(var_t,srt->nvar,srt->mvar,srt->var); srt->var[ivar].nvcf = 0; diff --git a/bcf_sr_sort.h b/bcf_sr_sort.h index c8bd787a1..447e8bf7a 100644 --- a/bcf_sr_sort.h +++ b/bcf_sr_sort.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2019,2024 Genome Research Ltd. Author: Petr Danecek @@ -55,6 +55,9 @@ typedef struct } var_t; +// Group is a set of variants in duplicate records within one VCF. They are identified with a key (used only +// for debugging), such as C>A,C>G;C>T. Commas separate alleles in a multiallelic record, semicolons separate +// VCF lines. typedef struct { char *key; // only for debugging @@ -67,7 +70,7 @@ typedef struct { int nvar, mvar, *var; // list of compatible variants that can be output together int cnt; // number of readers in this group - kbitset_t *mask; // which groups are populated in this set (replace with expandable bitmask) + kbitset_t *mask; // which groups are populated in this set } varset_t; @@ -100,8 +103,13 @@ sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); void bcf_sr_sort_reset(sr_sort_t *srt); int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos); + +// initialize a new position using the i-th reader int bcf_sr_sort_set_active(sr_sort_t *srt, int i); + +// add i-th reader with the same position, assumed bcf_sr_sort_set_active() was called with another reader int bcf_sr_sort_add_active(sr_sort_t *srt, int i); + void bcf_sr_sort_destroy(sr_sort_t *srt); void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i); diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 9a6b48438..fb42877c9 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2024 Genome Research Ltd. Author: Petr Danecek @@ -89,6 +89,7 @@ extern "C" { #define BCF_SR_PAIR_SNP_REF (1<<4) // allow REF-only records with SNPs #define BCF_SR_PAIR_INDEL_REF (1<<5) // allow REF-only records with indels #define BCF_SR_PAIR_EXACT (1<<6) // require the exact same set of alleles in all files +#define BCF_SR_PAIR_ID (1<<7) // require matching IDs (overlap) #define BCF_SR_PAIR_BOTH (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS) #define BCF_SR_PAIR_BOTH_REF (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS|BCF_SR_PAIR_SNP_REF|BCF_SR_PAIR_INDEL_REF)