diff --git a/leviosam-test.py b/leviosam-test.py index b23cb49..4fb1cfa 100644 --- a/leviosam-test.py +++ b/leviosam-test.py @@ -49,7 +49,8 @@ def setUpClass(cls): for param in params: process = subprocess.Popen( ['./leviosam', - 'lift', '-l', 'testdata/wg-maj.lft', '-a', param['sam'], '-p', param['out_prefix']], + 'lift', '-l', 'testdata/wg-maj.lft', '-a', param['sam'], + '-p', param['out_prefix']], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() print(f'Lifted {param["out_prefix"]}.sam') @@ -82,12 +83,12 @@ def test_identical_fields(self): assert param['mode'] in ['SE', 'PE'] # Each subtest here is one parameter set (e.g. BWA-SE, BT2-PE) - with self.subTest(aln_param_idx=i): + with self.subTest(aln_param_idx=param): dict_lifted = self.read_sam_file_as_dict(f'{param["out_prefix"]}.sam') dict_gold = self.read_sam_file_as_dict(f'{param["gold"]}') for test_idx, test_field in enumerate(dict_test_field.keys()): - with self.subTest(test_idx=test_idx): + with self.subTest(test_idx=test_field): for k in dict_lifted.keys(): # Single-end files. if param['mode'] == 'SE': diff --git a/leviosam.cpp b/leviosam.cpp index cbfb069..b5fb90a 100644 --- a/leviosam.cpp +++ b/leviosam.cpp @@ -9,19 +9,17 @@ #include #include -using NameMap = std::vector>; -using LengthMap = std::unordered_map; - struct lift_opts { std::string vcf_fname = ""; std::string sample = ""; std::string outpre = ""; + std::string out_format = "sam"; std::string lift_fname = ""; std::string sam_fname = ""; std::string cmd = ""; std::string haplotype = "0"; int threads = 1; - int chunk_size = 64; + int chunk_size = 256; int verbose = 0; NameMap name_map; LengthMap length_map; @@ -61,10 +59,6 @@ LengthMap parse_length_map(const char* fname) { return lengths; } -lift::LiftMap lift_from_vcf(std::string fname, - std::string sample, - std::string haplotype, - NameMap names, LengthMap lengths); void serialize_run(lift_opts args) { lift::LiftMap l(lift_from_vcf(args.vcf_fname, args.sample, args.haplotype, args.name_map, args.length_map)); @@ -79,58 +73,24 @@ void serialize_run(lift_opts args) { } -char* get_PG(bam_hdr_t* hdr) { - char* hdr_txt = (char*) malloc(sizeof(char) * hdr->l_text + 1); - char* buf = (char*) malloc(sizeof(char) * hdr->l_text); - buf[0] = '\0'; - std::strcpy(hdr_txt, hdr->text); - char* token = std::strtok(hdr_txt, "\n"); - while (token != NULL) { - if (token[0] == '@' && token[1] == 'P' && token[2] == 'G') { - strcat(buf, token); - } - token = std::strtok(NULL, "\n"); - } - free(hdr_txt); - return buf; -} - -/* just gets AS for now till I get around to iterating through all the tags - */ -std::string tag_to_string(const bam1_t* rec) { - std::string tag_string(""); - uint8_t* aux = bam_aux_get(rec, "AS"); - if (aux != NULL) { - tag_string += "AS:i:"; - tag_string += std::to_string(bam_aux2i(aux)); - } - aux = bam_aux_get(rec, "NM"); - if (aux != NULL) { - tag_string += "\tNM:i:"; - tag_string += std::to_string(bam_aux2i(aux)); - } - return tag_string; -} - void read_and_lift( std::mutex* mutex_fread, std::mutex* mutex_fwrite, std::mutex* mutex_vec, samFile* sam_fp, - FILE* out_sam_fp, + samFile* out_sam_fp, bam_hdr_t* hdr, int chunk_size, lift::LiftMap* l, std::vector* chroms_not_found ){ std::vector aln_vec; - std::vector tmp_write_buffer(chunk_size); for (int i = 0; i < chunk_size; i++){ bam1_t* aln = bam_init1(); aln_vec.push_back(aln); } int read = 1; - while(read >= 0){ + while (read >= 0){ int num_actual_reads = chunk_size; { // read from SAM, protected by mutex @@ -144,112 +104,54 @@ void read_and_lift( } } for (int i = 0; i < num_actual_reads; i++){ - std::string sam_out; bam1_t* aln = aln_vec[i]; bam1_core_t c = aln->core; - sam_out += bam_get_qname(aln); - sam_out += "\t"; - sam_out += std::to_string(c.flag); - sam_out += "\t"; std::string s1_name, s2_name; size_t pos; - if (c.flag & BAM_FUNMAP){ // If unmapped - if (((c.flag & BAM_FPAIRED) && (c.flag & BAM_FMUNMAP)) || // paired-end, and mate unmapped - !(c.flag & BAM_FPAIRED) // single-end - ){ - sam_out += "*\t0\t0\t*\t"; - } - else { - s2_name = hdr->target_name[c.mtid]; - s1_name = l->get_other_name(s2_name); - // chroms_not_found needs to be protected because chroms_not_found is shared - pos = l->s2_to_s1(s2_name, c.mpos, chroms_not_found, mutex_vec) + 1; - // REF - sam_out += s1_name.data(); - sam_out += "\t"; - // POS - sam_out += std::to_string(pos); - sam_out += "\t"; - // QUAL & CIGAR - sam_out += "0\t*\t"; - } - } - else { + // If a read is mapped, lift its position. + // If a read is unmapped, but its mate is mapped, lift its mates position. + // Otherwise leave it unchanged. + if (!(c.flag & BAM_FUNMAP)) { s2_name = hdr->target_name[c.tid]; s1_name = l->get_other_name(s2_name); // chroms_not_found needs to be protected because chroms_not_found is shared - pos = l->s2_to_s1(s2_name, c.pos, chroms_not_found, mutex_vec) + 1; - // REF - sam_out += s1_name.data(); - sam_out += "\t"; - // POS - sam_out += std::to_string(pos); - sam_out += "\t"; - // QUAL - sam_out += std::to_string(c.qual); - sam_out += "\t"; + pos = l->s2_to_s1(s2_name, c.pos, chroms_not_found, mutex_vec); // CIGAR - sam_out += l->cigar_s2_to_s1(s2_name, aln).data(); - sam_out += "\t"; + l->cigar_s2_to_s1(s2_name, aln); + aln->core.pos = pos; + } else if ((c.flag & BAM_FPAIRED) && !(c.flag & BAM_FMUNMAP)) { + s2_name = hdr->target_name[c.mtid]; + s1_name = l->get_other_name(s2_name); + // chroms_not_found needs to be protected because chroms_not_found is shared + pos = l->s2_to_s1(s2_name, c.mpos, chroms_not_found, mutex_vec); + aln->core.pos = pos; } - if (c.flag & BAM_FPAIRED) { // mate information - if (c.flag & BAM_FMUNMAP){ // mate is unmapped - size_t mpos = l->s2_to_s1(s2_name, c.pos, chroms_not_found, mutex_vec) + 1; - sam_out += "=\t"; - sam_out += std::to_string(mpos); - sam_out += "\t0\t"; - } - else{ // mate is mapped + // Lift mate position. + if (c.flag & BAM_FPAIRED) { + // If the mate is unmapped, use the lifted position of the read. + // If the mate is mapped, lift its position. + if (c.flag & BAM_FMUNMAP){ + aln->core.mpos = aln->core.pos; + } else { std::string ms2_name(hdr->target_name[c.mtid]); std::string ms1_name(l->get_other_name(ms2_name)); - size_t mpos = l->s2_to_s1(ms2_name, c.mpos, chroms_not_found, mutex_vec) + 1; - // RNEXT - // If IDs match, print "="; else, print RNAME for the mate - sam_out += (c.tid == c.mtid)? "=" : ms1_name.data(); - sam_out += "\t"; - sam_out += std::to_string(mpos); // PNEXT - size_t pos_5, mpos_5; // 5'-end position + size_t mpos = l->s2_to_s1(ms2_name, c.mpos, chroms_not_found, mutex_vec); + aln->core.mpos = mpos; int isize = (c.isize == 0)? 0 : c.isize + (mpos - c.mpos) - (pos - c.pos); - sam_out += "\t"; - sam_out += std::to_string(isize); // TLEN (or ISIZE) - sam_out += "\t"; - } - } - else { // single-end - sam_out += "*\t0\t0\t"; - } - // get query sequence - std::string query_seq(""); - uint8_t* seq = bam_get_seq(aln); - for (auto i = 0; i < c.l_qseq; ++i) { - query_seq += seq_nt16_str[bam_seqi(seq, i)]; - } - sam_out += query_seq.data(); - sam_out += "\t"; - // get quality - std::string qual_seq(""); - uint8_t* qual = bam_get_qual(aln); - if (qual[0] == 255) qual_seq = "*"; - else { - for (auto i = 0; i < c.l_qseq; ++i) { - qual_seq += (char) (qual[i] + 33); + aln->core.isize = isize; } } - sam_out += qual_seq.data(); - sam_out += "\t"; - // TODO reconcile any tags that also need to be added. - sam_out += tag_to_string(aln).data(); - sam_out += "\n"; - - tmp_write_buffer[i] = sam_out; - // fprintf(out_sam_fp, "%s", sam_out.c_str()); } { // write to file, thread corruption protected by lock_guard std::lock_guard g(*mutex_fwrite); // std::thread::id this_id = std::this_thread::get_id(); for (int i = 0; i < num_actual_reads; i++){ - fprintf(out_sam_fp, "%s", tmp_write_buffer[i].c_str()); + auto flag_write = sam_write1(out_sam_fp, hdr, aln_vec[i]); + if (flag_write < 0){ + std::cerr << "[Error] Failed to write record " << bam_get_qname(aln_vec[i]) << "\n"; + exit(1); + } } } } @@ -276,60 +178,46 @@ void lift_run(lift_opts args) { fprintf(stderr, "loaded liftmap!\n"); - samFile* sam_fp; - // read sam file here - if (args.sam_fname == "") { - sam_fp = sam_open("-", "r"); - } - else - sam_fp = sam_open(args.sam_fname.data(), "r"); + samFile* sam_fp = (args.sam_fname == "")? + sam_open("-", "r") : sam_open(args.sam_fname.data(), "r"); bam_hdr_t* hdr = sam_hdr_read(sam_fp); // the "ref" lengths are all stored in the levio structure. How do we loop over it? std::vector contig_names; std::vector contig_reflens; std::tie(contig_names, contig_reflens) = l.get_s1_lens(); - // for now we'll just write out the samfile raw - FILE* out_sam_fp; - if (args.outpre == "") - out_sam_fp = stdout; - else - out_sam_fp = fopen((args.outpre + ".sam").data(), "w"); - fprintf(out_sam_fp, "@HD\tVN:1.6\tSO:unknown\n"); - for (auto i = 0; i < hdr->n_targets; i++){ + + std::string out_mode = (args.out_format == "sam")? "w" : "wb"; + samFile* out_sam_fp = (args.outpre == "-" || args.outpre == "")? + sam_open("-", out_mode.data()) : + sam_open((args.outpre + "." + args.out_format).data(), out_mode.data()); + + // Write SAM headers. We update contig lengths if needed. + sam_hdr_add_pg(hdr, "leviosam", "VN", VERSION, "CL", args.cmd.data(), NULL); + for (auto i = 0; i < hdr->n_targets; i++) { auto contig_itr = std::find(contig_names.begin(), contig_names.end(), hdr->target_name[i]); // If a contig is in contig_names, look up the associated length. - if (contig_itr != contig_names.end()){ - fprintf(out_sam_fp, "@SQ\tSN:%s\tLN:%ld\n", - contig_names[contig_itr - contig_names.begin()].data(), - contig_reflens[contig_itr - contig_names.begin()]); - } - // if a contig is not found in vcf/lft, print it as it was before levio - else{ - fprintf(out_sam_fp, "@SQ\tSN:%s\tLN:%u\n", hdr->target_name[i], hdr->target_len[i]); + if (contig_itr != contig_names.end()) { + auto len_str = std::to_string(contig_reflens[contig_itr - contig_names.begin()]); + if (sam_hdr_update_line( + hdr, "SQ", + "SN", contig_names[contig_itr - contig_names.begin()].data(), + "LN", len_str.data(), NULL) < 0) { + std::cerr << "[Error] failed when converting contig length for " + << hdr->target_name[i] << "\n"; + exit(1); + } } } - /* This is only supported in the latest dev release of htslib as of July 22. - * add back in at next htslib release - kstring_t s = KS_INITIALIZE; - sam_hdr_find_line_id(hdr, "PG", NULL, NULL, &s); - fprintf(out_sam_fp, "%s\n", s.s); - ks_free(&s); - */ - char* prev_pg = get_PG(hdr); - fprintf(out_sam_fp, "%s\n", prev_pg); - free(prev_pg); - fprintf(out_sam_fp, "@PG\tID:leviosam\tPN:leviosam\tCL:\"%s\"\n", args.cmd.data()); + auto write_hdr = sam_hdr_write(out_sam_fp, hdr); - // store chromosomes found in SAM but not in lft - // use a vector to avoid printing out the same warning msg multiple times + // Store chromosomes found in SAM but not in the VCF. + // We use a vector to avoid printing out the same warning msg multiple times. std::vector chroms_not_found; const int num_threads = args.threads; const int chunk_size = args.chunk_size; std::vector threads; - std::mutex mutex_fread; - std::mutex mutex_fwrite; - std::mutex mutex_vec; + std::mutex mutex_fread, mutex_fwrite, mutex_vec; for (int j = 0; j < num_threads; j++){ threads.push_back( std::thread( @@ -351,9 +239,10 @@ void lift_run(lift_opts args) { threads[j].join(); } threads.clear(); - fclose(out_sam_fp); + sam_close(out_sam_fp); } + lift::LiftMap lift_from_vcf(std::string fname, std::string sample, std::string haplotype, @@ -376,13 +265,16 @@ std::string make_cmd(int argc, char** argv) { } return cmd; } + + void print_serialize_help_msg(){ fprintf(stderr, "\n"); fprintf(stderr, "Usage: leviosam serialize [options]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -v string Build a leviosam index using a VCF file.\n"); fprintf(stderr, " -s string The sample used to build leviosam index (-v needs to be set).\n"); - fprintf(stderr, " -p string The prefix of the output files.\n"); + fprintf(stderr, " -p string The prefix of the output file.\n"); + fprintf(stderr, " -O format Output file format, can be sam or bam. [sam]\n"); fprintf(stderr, " -g 0/1 The haplotype used to index leviosam. [0] \n"); fprintf(stderr, " -n string Path to a name map file.\n"); fprintf(stderr, " This can be used to map '1' to 'chr1', or vice versa.\n"); @@ -394,22 +286,24 @@ void print_lift_help_msg(){ fprintf(stderr, "\n"); fprintf(stderr, "Usage: leviosam lift [options]\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a string The SAM file to be lifted. \n"); + fprintf(stderr, " -a string Path to the SAM/BAM file to be lifted. \n"); fprintf(stderr, " Leave empty or set to \"-\" to read from stdin.\n"); - fprintf(stderr, " -l string Path to a pre-built leviosam index.\n"); + fprintf(stderr, " -l string Path to a leviosam index.\n"); fprintf(stderr, " -v string If -l is not specified, can build indexes using a VCF file.\n"); fprintf(stderr, " -t INT Number of threads used. [1] \n"); - fprintf(stderr, " -T INT Size of a processing chunk. [64] \n"); + fprintf(stderr, " -T INT Chunk size for each thread. [256] \n"); + fprintf(stderr, " Each thread queries <-T> reads, lifts, and writes.\n"); + fprintf(stderr, " Setting a higher <-T> uses slightly more memory but might benefit thread scaling.\n"); fprintf(stderr, " The options for serialize can also be used here, if -v is set.\n"); print_serialize_help_msg(); } void print_main_help_msg(){ fprintf(stderr, "\n"); - fprintf(stderr, "Program: leviosam (lift over SAM based on VCF)\n"); + fprintf(stderr, "Program: leviosam (lift over SAM/BAM based on VCF)\n"); fprintf(stderr, "Usage: leviosam [options]\n\n"); fprintf(stderr, "Commands:serialize build leviosam indexes\n"); - fprintf(stderr, " lift perform lifting\n"); + fprintf(stderr, " lift lift alignments\n"); fprintf(stderr, "Options: -h Print detailed usage.\n"); fprintf(stderr, "\n"); } @@ -424,13 +318,14 @@ int main(int argc, char** argv) { {"prefix", required_argument, 0, 'p'}, {"leviosam", required_argument, 0, 'l'}, {"sam", required_argument, 0, 'a'}, + {"out_format", required_argument, 0, 'O'}, {"haplotype", required_argument, 0, 'g'}, {"threads", required_argument, 0, 't'}, {"chunk_size", required_argument, 0, 'T'}, {"verbose", no_argument, &args.verbose, 1}, }; int long_index = 0; - while((c = getopt_long(argc, argv, "hv:s:p:l:a:g:n:k:t:T:", long_options, &long_index)) != -1) { + while((c = getopt_long(argc, argv, "hv:s:p:l:a:O:g:n:k:t:T:", long_options, &long_index)) != -1) { switch (c) { case 'v': args.vcf_fname = optarg; @@ -447,6 +342,9 @@ int main(int argc, char** argv) { case 'a': args.sam_fname = optarg; break; + case 'O': + args.out_format = optarg; + break; case 'g': args.haplotype = optarg; break; @@ -480,6 +378,12 @@ int main(int argc, char** argv) { fprintf(stderr, "invalid haplotype %s\n", args.haplotype.c_str()); exit(1); } + + if (args.out_format != "sam" && args.out_format != "bam") { + fprintf(stderr, "Not supported extension format %s\n", args.out_format.c_str()); + exit(1); + } + if (!strcmp(argv[optind], "lift")) { lift_run(args); } else if (!strcmp(argv[optind], "serialize")) { diff --git a/leviosam.hpp b/leviosam.hpp index ce5f784..08ceb04 100644 --- a/leviosam.hpp +++ b/leviosam.hpp @@ -20,6 +20,12 @@ * Created: July 2019 */ +const char* VERSION("0.2"); + +using NameMap = std::vector>; +using LengthMap = std::unordered_map; + + static inline void die(std::string msg) { fprintf(stderr, "%s\n", msg.data()); exit(1); @@ -83,15 +89,22 @@ class Lift { return del_rs0(ins_sls0(p+1)); } - // convert a CIGAR string in an s2 alignment to the corresponding CIGAR string in the s1 alignment - // input: bam1_t alignment record against s2 sequence - // output: CIGAR string (as std::string) wrt s1 sequence - std::string cigar_s2_to_s1(bam1_t* b) { - auto x = del_sls0(b->core.pos+1); + + /* Core function to lift a CIGAR string from s1 to s2 space. + * + * Input: + * An alignment struct we'd like to update. + * This function is not going to update the bam1_t stuct. + * + * Output: + * A vector, each element is an unit32_t corresponding to htslib CIGAR value + */ + std::vector cigar_s2_to_s1_core(bam1_t* b){ + auto x = del_sls0(b->core.pos + 1); int y = 0; // read2alt cigar pointer uint32_t* cigar = bam_get_cigar(b); - std::string out_cigar = ""; std::vector cigar_ops; + std::vector new_cigar_ops; for (int i = 0; i < b->core.n_cigar; ++i) { for (int j = 0; j < bam_cigar_oplen(cigar[i]); ++j) { cigar_ops.push_back(bam_cigar_op(cigar[i])); @@ -101,24 +114,30 @@ class Lift { while (y < iters) { int cop = cigar_ops[y]; if (del[x]) { // skip ahead in alt2ref cigar - if (out_cigar[out_cigar.length()-1] == 'I'){ - out_cigar.pop_back(); - out_cigar += "M"; + if (new_cigar_ops.empty()){ + new_cigar_ops.push_back(BAM_CDEL); + } else { + if (new_cigar_ops.back() == BAM_CINS){ + new_cigar_ops[new_cigar_ops.size() - 1] = BAM_CMATCH; + } else { + new_cigar_ops.push_back(BAM_CDEL); + } } - else - out_cigar += "D"; ++x; } else if (cop == BAM_CINS) { // skip ahead in read2alt cigar - if (out_cigar[out_cigar.length()-1] == 'D'){ - out_cigar.pop_back(); - out_cigar += "M"; + if (!new_cigar_ops.empty()){ + if (new_cigar_ops.back() == BAM_CDEL){ + new_cigar_ops[new_cigar_ops.size() - 1] = BAM_CMATCH; + } else{ + new_cigar_ops.push_back(BAM_CINS); + } + } else { + new_cigar_ops.push_back(BAM_CINS); } - else - out_cigar += "I"; ++y; } else if (cop == BAM_CSOFT_CLIP || cop == BAM_CHARD_CLIP || cop == BAM_CPAD){ - out_cigar += (cop == BAM_CSOFT_CLIP)? "S" : - (cop == BAM_CHARD_CLIP)? "H" : "P"; + // TODO: check the case where S is followed by D + new_cigar_ops.push_back(cop); ++y; } else if (cop == BAM_CBACK) { // skip. We don't support B Ops. @@ -126,43 +145,114 @@ class Lift { ++y; } else if (cop == BAM_CMATCH || cop == BAM_CDIFF || cop == BAM_CEQUAL) { // M if (ins[x]){ - if (out_cigar[out_cigar.length()-1] == 'D'){ - out_cigar.pop_back(); - out_cigar += "M"; + if (new_cigar_ops.empty()){ + new_cigar_ops.push_back(BAM_CINS); + } else if (new_cigar_ops.back() == BAM_CDEL){ + new_cigar_ops[new_cigar_ops.size() - 1] = BAM_CMATCH; + } else { + new_cigar_ops.push_back(BAM_CINS); } - else - out_cigar += "I"; // IM + } else { + new_cigar_ops.push_back(BAM_CMATCH); } - else out_cigar += "M"; // MM ++x; ++y; } else if (cop == BAM_CDEL || cop == BAM_CREF_SKIP) { // D - if (ins[x]) out_cigar += ""; // ID - insertion is invalidated - else{ - if (out_cigar[out_cigar.length()-1] == 'I'){ - out_cigar.pop_back(); - out_cigar += "M"; + if (!ins[x]) { + if (new_cigar_ops.empty()){ + new_cigar_ops.push_back(BAM_CDEL); + } else if (new_cigar_ops.back() == BAM_CINS){ + new_cigar_ops[new_cigar_ops.size() - 1] = BAM_CMATCH; + } else { + new_cigar_ops.push_back(BAM_CDEL); } - else - out_cigar += "D"; // MD } ++x; ++y; } } + return new_cigar_ops; + } - std::string out = ""; + + /* Convert a CIGAR string in an s2 alignment to the corresponding CIGAR string in the s1 alignment + * + * Input: bam1_t alignment record against s2 sequence + * + * Output: + * The bam1_t struct is updated if there's a change in its CIGAR. + * bam1_t->core.n_cigar specifies numbers of operators in a CIGAR; + * bam_get_cigar(bam1_t) points to the occurrences. + */ + void cigar_s2_to_s1(bam1_t* b) { + uint32_t* cigar = bam_get_cigar(b); + // Get lifted CIGAR. + auto new_cigar_ops = cigar_s2_to_s1_core(b); + // Prepare to update CIGAR in the bam1_t struct. + std::vector cigar_op_len; + std::vector cigar_op; int z = 1; - for (size_t i = 1; i < out_cigar.size(); ++i) { - if (out_cigar[i] == out_cigar[i-1]) { + for (int i = 1; i < new_cigar_ops.size(); i++){ + if (new_cigar_ops[i] == new_cigar_ops[i-1]){ z++; } else { - out += std::to_string(z); - out += out_cigar[i-1]; + cigar_op_len.push_back(z); + cigar_op.push_back(new_cigar_ops[i - 1]); z = 1; } } - out += std::to_string(z); - out += out_cigar[out_cigar.size() - 1]; - return out; + cigar_op_len.push_back(z); + cigar_op.push_back(new_cigar_ops[new_cigar_ops.size() - 1]); + std::vector new_cigar; + for (int i = 0; i < cigar_op_len.size(); i++){ + new_cigar.push_back(bam_cigar_gen(cigar_op_len[i], cigar_op[i])); + } + // Adjust data position when n_cigar is changed by levioSAM. n_cigar could either be + // increased or decreased. + // + // Adapted from samtools/sam.c + // https://github.com/samtools/htslib/blob/2264113e5df1946210828e45d29c605915bd3733/sam.c#L515 + if (b->core.n_cigar != cigar_op_len.size()){ + auto cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + auto fake_bytes = b->core.n_cigar * 4; + b->core.n_cigar = (uint32_t)cigar_op_len.size(); + auto n_cigar4 = b->core.n_cigar * 4; + auto orig_len = b->l_data; + if (n_cigar4 > fake_bytes){ + // Check if we need to update `b->m_data`. + // + // Adapted from htslib/sam_internal.h + // https://github.com/samtools/htslib/blob/31f0a76d338c9bf3a6893b71dd437ef5fcaaea0e/sam_internal.h#L48 + auto new_m_data = (size_t) b->l_data + n_cigar4 - fake_bytes; + kroundup32(new_m_data); + if (new_m_data > b->m_data){ + auto new_data = static_cast(realloc(b->data, new_m_data)); + if (!new_data){ + std::cerr << "[Error] Failed to expand a bam1_t struct for " << bam_get_qname(b) << "\n"; + std::cerr << "This is likely due to out of memory\n"; + exit(1); + } + b->data = new_data; + b->m_data = new_m_data; + cigar = bam_get_cigar(b); + } + } + // Update memory usage of data. + b->l_data = b->l_data - fake_bytes + n_cigar4; + // Move data to the correct place. + memmove(b->data + cigar_st + n_cigar4, + b->data + cigar_st + fake_bytes, + orig_len - (cigar_st + fake_bytes)); + // If new n_cigar is greater, copy the real CIGAR to the right place. + // Skipped this if new n_cigar is smaller than the original value. + if (n_cigar4 > fake_bytes){ + memcpy(b->data + cigar_st, + b->data + (n_cigar4 - fake_bytes) + 8, + n_cigar4); + } + b->core.n_cigar = cigar_op_len.size(); + } + for (int i = 0; i < b->core.n_cigar; i++){ + *(cigar + i) = new_cigar[i]; + } } // returns size of s1 sequence @@ -413,48 +503,22 @@ class LiftMap { return n; } - // converts CIGAR string of s2 alignment to corresponding CIGAR string for the s1 alignment - // input: sequence name, bam1_t alignment record object (via htslib) - // ouput: CIGAR string of s1 alignment (as std::string) - // if liftover is not defined, then returns empty string - std::string cigar_s2_to_s1(std::string n, bam1_t* b) { + /* Convert a CIGAR string in an s2 alignment to the corresponding CIGAR string in the s1 alignment + * + * Input: bam1_t alignment record against s2 sequence + * + * The bam1_t struct is updated if there's a change in its CIGAR. + * bam1_t->core.n_cigar specifies numbers of operators in a CIGAR; + * bam_get_cigar(bam1_t) points to the occurrences. + * + * If liftover is not defined, do not update the bam1_t struct. + */ + void cigar_s2_to_s1(const std::string& n, bam1_t* b) { auto it = s2_map.find(n); if (it != s2_map.end()) { - return lmap[it->second].cigar_s2_to_s1(b); - } else { // returns original cigar string - std::string out_cigar = ""; - auto cigar = bam_get_cigar(b); - for (int i = 0; i < b->core.n_cigar; ++i) { - for (int j = 0; j < bam_cigar_oplen(cigar[i]); ++j) { - if (bam_cigar_op(cigar[i]) == BAM_CINS) - out_cigar += "I"; - else if (bam_cigar_op(cigar[i]) == BAM_CDEL) - out_cigar += "D"; - else if (bam_cigar_op(cigar[i]) == BAM_CMATCH) - out_cigar += "M"; - else if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) - out_cigar += "S"; - else if (bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP) - out_cigar += "H"; - else if (bam_cigar_op(cigar[i]) == BAM_CPAD) - out_cigar += "P"; - } - } - std::string out = ""; - int z = 1; - for (size_t i = 1; i < out_cigar.size(); ++i) { - if (out_cigar[i] == out_cigar[i-1]) { - z++; - } else { - out += std::to_string(z); - out += out_cigar[i-1]; - z = 1; - } - } - out += std::to_string(z); - out += out_cigar[out_cigar.size() - 1]; - return out; + lmap[it->second].cigar_s2_to_s1(b); } + // If not found, no update. } // saves to stream @@ -588,6 +652,12 @@ class LiftMap { }; }; +lift::LiftMap lift_from_vcf(std::string fname, + std::string sample, + std::string haplotype, + NameMap names, LengthMap lengths); + + void print_main_help_msg(); void print_lift_help_msg(); void print_serialize_help_msg(); diff --git a/scripts/chain_utils.py b/scripts/chain_utils.py new file mode 100644 index 0000000..8e43b5f --- /dev/null +++ b/scripts/chain_utils.py @@ -0,0 +1,315 @@ +''' +This script includes two functionalities: chain2vcf and chain2bed + +chain2vcf: Converts a chain file to a VCF file. + + This conversion is not capble of handling translocations and inversions. Users should specify + a list of "chain ids" that contain only indels. + +Example: + # This output VCF is used to convert h38 coordinates to T2T coordinates. + python chain_utils.py chain2vcf -i hg38.t2t-chm13-v1.0.over.chain -c 1-23 -o main_chain-h38.t2t_chm13.vcf + + +chain2bed: Converts a chain file to a BED file. + + The conversion only supports one chain per contig (we currently don't support translocations). + Users need to provide a list of chain ids for conversion. + The output BED file will contain the "chain block" regions. +''' + +import argparse +import sys + +def parse_args(): + parser = argparse.ArgumentParser() + + subparsers = parser.add_subparsers() + #subparsers = parser.add_subparsers(dest='subparser') + parser_c2v = subparsers.add_parser('chain2vcf') + parser_c2v.add_argument( + '-i', '--in_chain', required=True, + help='Path to the chain file to be converted.' + ) + parser_c2v.add_argument( + '-c', '--chain_ids', required=True, + help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".' + ) + parser_c2v.add_argument( + '-o', '--out_vcf', + help='Path to the output VCF file.' + ) + parser_c2v.set_defaults(func=chain2vcf) + + parser_c2b = subparsers.add_parser('chain2bed') + parser_c2b.add_argument( + '-i', '--in_chain', required=True, + help='Path to the chain file to be converted.' + ) + parser_c2b.add_argument( + '-c', '--chain_ids', required=True, + help='Chain ids to be converted, separated by commas and hyphens. E.g., "1-9", "1,5-8".' + ) + parser_c2b.add_argument( + '-o', '--out_bed', + help='Path to the output BED file.' + ) + parser_c2b.set_defaults(func=chain2bed) + + args = parser.parse_args() + args.func(args) + + #kwargs = vars(parser.parse_args()) + #globals()[kwargs.pop('subparser')](**kwargs) + + +class Chain_Fields(): + HEADER = 0 + SCORE = 1 + TNAME = 2 + TSIZE = 3 + TSTRAND = 4 + TSTART = 5 + TEND = 6 + QNAME = 7 + QSIZE = 8 + QSTRAND = 9 + QSTART = 10 + QEND = 11 + CID = 12 + ALN_SIZE = 0 + ALN_DT = 1 + ALN_DQ = 2 + + +class Chain2Bed(): + def __init__(self, out_bed=sys.stdout, chain_hdr='', CF=None): + assert(chain_hdr[CF.HEADER] == 'chain') + assert(chain_hdr[CF.TNAME] == chain_hdr[CF.QNAME]) + self.out_bed = out_bed + self.CF = CF + self.contig = chain_hdr[self.CF.QNAME] + self.qstart = int(chain_hdr[self.CF.QSTART]) + self.tstart = int(chain_hdr[self.CF.TSTART]) + self.bed_records = [] + + def add(self, chain_aln): + if len(self.bed_records) == 0 or self.tstart != self.bed_records[-1][1]: + self.bed_records.append([self.tstart, self.tstart + int(chain_aln[self.CF.ALN_SIZE])]) + else: + self.bed_records[-1][1] = self.tstart + int(chain_aln[self.CF.ALN_SIZE]) + + self.tstart += int(chain_aln[self.CF.ALN_SIZE]) + int(chain_aln[self.CF.ALN_DT]) + self.qstart += int(chain_aln[self.CF.ALN_SIZE]) + int(chain_aln[self.CF.ALN_DQ]) + + def write(self): + for record in self.bed_records: + print(f'{self.contig}\t{record[0]}\t{record[1]}', file=self.out_bed) + + + +class Chain2Vcf(): + def __init__(self, out_vcf=sys.stdout, chain_hdr='', CF=None): + assert(chain_hdr[CF.HEADER] == 'chain') + # assert(chain_hdr[CF.TNAME] == chain_hdr[CF.QNAME]) + + self.out_vcf = out_vcf + self.CF = CF + + self.contig = chain_hdr[self.CF.QNAME] + # Add one b/c VCF format is one-based. + self.qstart = int(chain_hdr[self.CF.QSTART]) + 1 + self.tstart = int(chain_hdr[self.CF.TSTART]) + 1 + + if chain_hdr[self.CF.TNAME] == chain_hdr[self.CF.QNAME]: + # INS w.r.t the query sequence. + if self.qstart < self.tstart: + ref = 'A' + qry = 'A' * (self.tstart - self.qstart + 1) + print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', + file=self.out_vcf) + # DEL + elif self.qstart > self.tstart: + ref = 'A' * (self.qstart - self.tstart + 1) + qry = 'A' + print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', + file=self.out_vcf) + else: + len_t = int(chain_hdr[self.CF.TEND]) - int(chain_hdr[self.CF.TSTART]) + len_q = int(chain_hdr[self.CF.QEND]) - int(chain_hdr[self.CF.QSTART]) + if len_t > len_q: + ref = 'A' + qry = 'A' * (len_t - len_q + 1) + print(f'{self.contig}\t{self.qstart}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', + file=self.out_vcf) + elif len_q > len_t: + ref = 'A' * (len_q - len_t + 1) + qry = 'A' + print(f'{self.contig}\t{self.qstart - len_q}\t.\t{ref}\t{qry}\t.\tAUTO\tTPOS={self.tstart}', + file=self.out_vcf) + + + + + def add(self, chain_aln): + self.tstart += int(chain_aln[self.CF.ALN_SIZE]) + self.qstart += int(chain_aln[self.CF.ALN_SIZE]) + + len_t = int(chain_aln[self.CF.ALN_DT]) + len_q = int(chain_aln[self.CF.ALN_DQ]) + + if (len_t > len_q): + # Insertion wrt the query seq + len_t -= len_q + len_q = 0 + seq_t = 'A' * (len_t + 1) + seq_q = 'A' * (len_q + 1) + print(f'{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t' + + f'TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}', + file=self.out_vcf) + elif (len_t < len_q): + # Deletion wrt the query seq + len_q -= len_t + len_t = 0 + seq_t = 'A' * (len_t + 1) + seq_q = 'A' * (len_q + 1) + print(f'{self.contig}\t{self.qstart}\t.\t{seq_q}\t{seq_t}\t.\tAUTO\t' + + f'TPOS={self.tstart};ALN_DT={int(chain_aln[self.CF.ALN_DT])};ALN_DQ={int(chain_aln[self.CF.ALN_DQ])}', + file=self.out_vcf) + self.tstart += int(chain_aln[self.CF.ALN_DT]) + self.qstart += int(chain_aln[self.CF.ALN_DQ]) + + + +# Parse raw `chain_ids` and return a list of chain ids. +# +# Input: +# chain_ids: A string of chain_ids that are separated by commas and hyphens. +# Output: +# A list containing individual chain ids. +def parse_chain_id(chain_ids): + list_chain = [] + chain_ids = chain_ids.split(',') + for c in chain_ids: + c = c.split('-') + if len(c) == 1: + list_chain.append(c[0]) + elif len(c) == 2: + for i in range(int(c[0]), int(c[1]) + 1): + list_chain.append(str(i)) + return list_chain + + +def get_contig_name(fn, chain_ids, CF): + dict_cid_contig = {} + with open(fn, 'r') as f: + for line in f: + line = line.split() + if len(line) == 13: + if line[CF.CID] in chain_ids: + dict_cid_contig[line[CF.CID]] = line[CF.TNAME] + return dict_cid_contig + + +def write_vcf_hdr(f_out, chain_ids, dict_cid_contig): + print('##fileformat=VCFv4.3', file=f_out) + print('##FILTER=', file=f_out) + print(dict_cid_contig) + for cid, contig in dict_cid_contig.items(): + print(f'##contig=', file=f_out) + print('##INFO=', + file=f_out) + print('##INFO=', + file=f_out) + print('##INFO=', + file=f_out) + print('#CHROM POS ID REF ALT QUAL FILTER INFO', file=f_out) + + +def chain2bed(args): + print('chain2bed', file=sys.stderr) + # Constants. + CF = Chain_Fields() + + chain_ids = parse_chain_id(args.chain_ids) + print('Chain IDs to be processed', chain_ids, file=sys.stderr) + + if not args.out_bed: + f_out = sys.stdout + else: + f_out = open(args.out_bed, 'w') + + f = open(args.in_chain, 'r') + current_id = '' + chain = None + for line in f: + line = line.split() + # Chain header + if len(line) == 13: + if line[CF.CID] in chain_ids: + chain = Chain2Bed(out_bed=f_out, chain_hdr=line, CF=CF) + if line[CF.TNAME] != line[CF.QNAME]: + del chain + chain = None + elif len(line) == 0: + current_id = '' + if chain: + del chain + chain = None + elif len(line) == 3: + if chain: + chain.add(line) + elif len(line) == 1: + if chain: + chain.write() + del chain + chain = None + + + +def chain2vcf(args): + # Constants. + CF = Chain_Fields() + + chain_ids = parse_chain_id(args.chain_ids) + if not args.out_vcf: + f_out = sys.stdout + else: + f_out = open(args.out_vcf, 'w') + + write_vcf_hdr(f_out, chain_ids, + get_contig_name(args.in_chain, chain_ids, CF)) + + f = open(args.in_chain, 'r') + current_id = '' + chain = None + for line in f: + line = line.split() + # Chain header + if len(line) == 13: + if line[CF.CID] in chain_ids: + # if line[CF.QNAME] in ['chr2']: + chain = Chain2Vcf(out_vcf=f_out, chain_hdr=line, CF=CF) + if line[CF.TNAME] != line[CF.QNAME]: + print('[Error] Contig names mismatch', + line[CF.TNAME], line[CF.QNAME], file=sys.stderr) + exit() + del chain + chain = None + elif len(line) == 0: + current_id = '' + if chain: + del chain + chain = None + elif len(line) == 3: + if chain: + chain.add(line) + elif len(line) == 1: + if chain: + del chain + chain = None + + +if __name__ == '__main__': + parse_args() + diff --git a/scripts/compare_sam.py b/scripts/compare_sam.py new file mode 100644 index 0000000..cb4dc31 --- /dev/null +++ b/scripts/compare_sam.py @@ -0,0 +1,103 @@ +"""Compares two SAM files and report a summary. +""" +import argparse +import sys + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '-q', '--input_query', + help='Path to a query SAM file.' + ) + parser.add_argument( + '-b', '--input_baseline', + help='Path to a baseline SAM file.' + ) + parser.add_argument( + '-o', '--out', default=sys.stdout, + help='Path to the ouput report. [stdout]' + ) + args = parser.parse_args() + return args + + +class Summary(): + def __init__(self): + self.pos_diff = [] + self.mapq_diff = [] + self.cigar_diff = [] + self.pos_diff_records = [] + + def update(self, query, baseline): + if len(query) == 0 or len(baseline) == 0: + return + if query[0] == baseline[0]: # contig + self.pos_diff.append(abs(query[1] - baseline[1])) + else: + self.pos_diff.append(-1) + if not (query[0] == baseline[0] and query[1] == baseline[1]): + self.pos_diff_records.append([query, baseline]) + self.mapq_diff.append(query[2] - baseline[2]) + self.cigar_diff.append(query[3] == baseline[3]) + + def report(self, f_out): + print('## Position', file=f_out) + print(f'{self.pos_diff.count(0) / len(self.pos_diff)} ({self.pos_diff.count(0)}/{len(self.pos_diff)})', file=f_out) + for i in range(30): + print(self.pos_diff_records[i], file=f_out) + + print('## MAPQ', file=f_out) + print(f'{self.mapq_diff.count(0) / len(self.mapq_diff)} ({self.mapq_diff.count(0)}/{len(self.mapq_diff)})', file=f_out) + + print('## CIGAR', file=f_out) + print(f'{self.cigar_diff.count(True) / len(self.cigar_diff)} ({self.cigar_diff.count(True)}/{len(self.cigar_diff)})', file=f_out) + + +def process_sam_line(line, dict_sam): + line = line.split() + name = line[0] + flag = int(line[1]) + contig = line[2] + pos = int(line[3]) + mapq = int(line[4]) + cigar = line[5] + + # Ignore unmapped reads. + if flag & 4: + return + + if flag & 64: + if dict_sam.setdefault(name, [[], []]): + dict_sam[name][0] = [contig, pos, mapq, cigar] + elif flag & 128: + if dict_sam.setdefault(name, [[], []]): + dict_sam[name][1] = [contig, pos, mapq, cigar] + else: + print(line) + + +def read_sam_as_dict(fn): + dict_sam = {} + with open(fn, 'r') as f: + for line in f: + if line[0] != '@': + process_sam_line(line, dict_sam) + return dict_sam + + +def compare_sam(args): + summary = Summary() + dict_query = read_sam_as_dict(args.input_query) + dict_baseline = read_sam_as_dict(args.input_baseline) + for i_q, [name, [first_seg, second_seg]] in enumerate(dict_query.items()): + if dict_baseline.get(name): + summary.update(first_seg, dict_baseline[name][0]) + summary.update(second_seg, dict_baseline[name][1]) + summary.report(args.out) + + +if __name__ == '__main__': + args = parse_args() + compare_sam(args) + + diff --git a/scripts/gen_length_map.py b/scripts/gen_length_map.py new file mode 100644 index 0000000..7df83a2 --- /dev/null +++ b/scripts/gen_length_map.py @@ -0,0 +1,32 @@ +import argparse +import sys + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '-f', '--faidx', + help='Path to the .fa.idx file.' + ) + parser.add_argument( + '-o', '--out', + help='Path to the output length map.' + ) + args = parser.parse_args() + return args + + +def gen_length_map(args): + f = open(args.faidx, 'r') + if not args.out: + f_out = sys.stdout + else: + f_out = open(args.out, 'w') + for line in f: + line = line.split() + print(f'{line[0]}\t{line[1]}', file=f_out) + + +if __name__ == '__main__': + args = parse_args() + gen_length_map(args) + diff --git a/testdata/toy.sam b/testdata/toy.sam new file mode 100644 index 0000000..3c9dba4 --- /dev/null +++ b/testdata/toy.sam @@ -0,0 +1,37 @@ +@HD VN:1.0 SO:unsorted +@SQ SN:chr1 LN:248954750 +@SQ SN:chr2 LN:242192349 +@SQ SN:chr3 LN:198294095 +@SQ SN:chr4 LN:190213022 +@SQ SN:chr5 LN:181538717 +@SQ SN:chr6 LN:170806452 +@SQ SN:chr7 LN:159345940 +@SQ SN:chr8 LN:145138783 +@SQ SN:chr9 LN:138394715 +@SQ SN:chr10 LN:133796194 +@SQ SN:chr11 LN:135085765 +@SQ SN:chr12 LN:133274462 +@SQ SN:chr13 LN:114364425 +@SQ SN:chr14 LN:107043763 +@SQ SN:chr15 LN:101991347 +@SQ SN:chr16 LN:90337982 +@SQ SN:chr17 LN:83257045 +@SQ SN:chr18 LN:80372693 +@SQ SN:chr19 LN:58617405 +@SQ SN:chr20 LN:64444158 +@SQ SN:chr21 LN:46709851 +@SQ SN:chr22 LN:50818266 +@SQ SN:chrX LN:156040881 +@SQ SN:chrY LN:57227415 +@SQ SN:chrM LN:16569 +@PG ID:bowtie2 PN:bowtie2 VN:2.3.4.3 CL:"/home-1/cnaechy1@jhu.edu/miniconda3/bin/bowtie2-align-s --wrapper basic-0 -p 16 -x /net/langmead-bigmem-ib.bluecrab.cluster/storage2/naechyun/refflow-exp/snakemake/SRR622457_se/major/indexes/wg-maj -U ../../data/NA12878-nygc_1-sorted-10M.fq" +toy1 0 chr1 949000 42 71M * 0 0 AATGCCACATGGGGCTCCGTCCCGAGCAACCAGAAGACGCAGCAGCAACAGCAAAGACAGCAAAGAGTGCC ??????????????????????????????????????????????????????????????????????? +toy2 0 chr1 949050 42 30M * 0 0 GCAAAGACAGCAAAGAGTGCCAGACACAGG ?????????????????????????????? +A00217:76:HFLT3DSXX:1:1101:1606:27978 99 chr1 81889506 42 150M = 81889757 400 AGGTTAACTCAACAGCCAAGTGCCAGCCGTGGGAGATACTGGGAAGAAACTTGGACTAAGTTGGGGGTCGGGGAGAATTTGGAACAAACAGCAAGAAGGTAATTAAGAGTTGGGAGAGGTGGGGATTAGGAAAAATGTATGCAATATATA ?????????????????????????????+??????????????????????????????????????????????????????????????????+????????+???????????????????+?????????????????+?????? AS:i:-5 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94A55 YS:i:-8 YT:Z:CP +A00217:76:HFLT3DSXX:1:1101:1606:27978 147 chr1 81889757 42 19M1I130M = 81889506 -400 GTAGGGTGCAGAGATACACGGCAGTCTTTGAAAAAACTGGCCCATCTCCATGTTCACCTAGGTCTACAGTCTGGTAGCTCTGAACCAGTAAGATTATAGAAATCTTGATATCTTCTCTTTGGTTCTCAGGGTGAGAAATTGTATATTTTT ??????????????5?????????????????????????????????????????????????+????????????????????????????????????????????????????????????????????????????????????? AS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 NM:i:1 MD:Z:149 YS:i:-5 YT:Z:CP +A00217:76:HFLT3DSXX:1:1101:1606:27999 99 chr1 81889506 42 150M = 81889757 400 AGGTTAACTCAACAGCCAAGTGCCAGCCGTGGGAGATACTGGGAAGAAACTTGGACTAAGTTGGGGGTCGGGGAGAATTTGGAACAAACAGCAAGAAGGTAATTAAGAGTTGGGAGAGGTGGGGATTAGGAAAAATGTATGCAATATATA ?????????????????????????????+??????????????????????????????????????????????????????????????????+????????+???????????????????+?????????????????+?????? AS:i:-5 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94A55 YS:i:-8 YT:Z:CP +A00217:76:HFLT3DSXX:1:1101:1606:27999 147 chr1 81889757 42 19M1I1M1D1M1I127M = 81889506 -400 GTAGGGTGCAGAGATACACGGCAGTCTTTGAAAAAACTGGCCCATCTCCATGTTCACCTAGGTCTACAGTCTGGTAGCTCTGAACCAGTAAGATTATAGAAATCTTGATATCTTCTCTTTGGTTCTCAGGGTGAGAAATTGTATATTTTT ??????????????5?????????????????????????????????????????????????+????????????????????????????????????????????????????????????????????????????????????? AS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 NM:i:1 MD:Z:149 YS:i:-5 YT:Z:CP +A00217:76:HFLT3DSXX:1:1101:1127:2597 89 chr19 44832790 42 150M = 44832790 0 ACTGTAGCCTCAACATCCCAGGGCTCAGGTGATCCTCCCATCTCAGCCTCCTGAGTAGCTGGGATTACAGGAGCATGCCACCACACCTGGCTAATTTTTGTACTTTTTGTAGAGATAGGGTTTCACCATGTTGCCCAGGCTGGTCTTGAA ???????????????????????????????????????????????????+????????????????????????????????+?5????????????????++???????5????????????????????????5??????5????5 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:150 YT:Z:UP +A00217:76:HFLT3DSXX:1:1101:1127:2597 165 chr19 44832790 0 * = 44832790 0 GGGTGGGGGGATGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGATGAACGATCTCCGCTCACTGCAACCTCTGCATCTAGGGGTCAAGTGATTCGCCTGCATCCACCTACCAATTATCGAGGGCTACAGGCGCCCGCCACCACGCCG ??????5??????????+??????????????????5?5+??+5??++?+?+????5?+?5????????+5???5+?+??5++??+?+?????????&5????+++++5?5+??++++?++&??++????+???5???&??5++?55??' YT:Z:UP +A00217:76:HFLT3DSXX:1:1101:1063:33771 99 chr9 26829129 3 150M = 26829474 491 CCTATAACACTGCAATACCACCTTGTTGTCAGTGTAAACAAGGGCATAGCCCGAAAGCACTGAGGCCACTGACAACCCATAGCCTTCCTATCAAAAATCCTTAACCCAGCAGGTTTCCTAACAGGCGATCTAAATCTTAATTAATTACCA ?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? AS:i:-40 XS:i:0 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:11C9T23G5T30T4T35T1G24 YS:i:-74 YT:Z:CP +A00217:76:HFLT3DSXX:1:1101:1063:33771 147 chr9 26829474 3 27M1I51M1I30M2I38M = 26829129 -491 GCCATCTATACCAATTCTAAGTTAATATGGACCGAACGAGGTTTTATTAATAGCAAAGAAAAATTAAAATCTCAAACTTAACAAGGTTTTGAACCAAAGTAAAGTTTGCTAAAAATTAACAGTGTAACATGCATTATCCTACTACCACAC ??????????5??????????????+?+5??????+??????????????5?????????????5????????????????????????????5????????????????????+???????5?+?????????+??????????????? AS:i:-74 XS:i:-5 XN:i:0 XM:i:10 XO:i:3 XG:i:4 NM:i:14 MD:Z:31T3T34C1C15C3T8T8G7G4T22 YS:i:-40 YT:Z:CP diff --git a/testdata/toy.vcf b/testdata/toy.vcf new file mode 100644 index 0000000..0daf56e --- /dev/null +++ b/testdata/toy.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.3 +##FILTER= +##fileDate=26022019_15h52m43s +##source=IGSRpipeline +##reference=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 949046 . A AACAGCAAAG . PASS AC=4418;AN=5096;DP=23115;AF=0.87;EAS_AF=0.92;EUR_AF=0.94;AFR_AF=0.72;AMR_AF=0.89;SAS_AF=0.92;VT=INDEL;NS=2548 +chr1 949049 . A AG . PASS AC=4418;AN=5096;DP=23115;AF=0.87;EAS_AF=0.92;EUR_AF=0.94;AFR_AF=0.72;AMR_AF=0.89;SAS_AF=0.92;VT=INDEL;NS=2548 +chr1 949054 . GA G . PASS AC=4418;AN=5096;DP=23115;AF=0.87;EAS_AF=0.92;EUR_AF=0.94;AFR_AF=0.72;AMR_AF=0.89;SAS_AF=0.92;VT=INDEL;NS=2548 +chr1 949057 . T TGC . PASS AC=4418;AN=5096;DP=23115;AF=0.87;EAS_AF=0.92;EUR_AF=0.94;AFR_AF=0.72;AMR_AF=0.89;SAS_AF=0.92;VT=INDEL;NS=2548