From 223d7693c2b668c485bb1fdbe7061440dff6519e Mon Sep 17 00:00:00 2001 From: Timofey Prodanov Date: Thu, 23 Nov 2023 10:06:44 +0100 Subject: [PATCH] Detect paired/unpaired read bias --- requirements.txt | 1 + src/parascopy/__init__.py | 2 +- src/parascopy/call_variants.py | 26 ++- src/parascopy/inner/bam_file.py | 298 +++++++++++++++++------------- src/parascopy/inner/common.py | 8 + src/parascopy/inner/genome.py | 2 +- src/parascopy/inner/paralog_cn.py | 5 +- src/parascopy/inner/variants.py | 129 ++++++++----- src/parascopy/pool_reads.py | 12 +- 9 files changed, 288 insertions(+), 195 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4d287ce..32a3fea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ simpleeval>=0.9 parasail>=1.2 biopython>=1.70 intervaltree>=3.0 +construct>=2.10 diff --git a/src/parascopy/__init__.py b/src/parascopy/__init__.py index 0911011..ced8a80 100644 --- a/src/parascopy/__init__.py +++ b/src/parascopy/__init__.py @@ -1,7 +1,7 @@ __pkg_name__ = 'parascopy' __title__ = 'Parascopy' -__version__ = '1.14.0-0' +__version__ = '1.14.1-0' __author__ = 'Timofey Prodanov, Vikas Bansal' __license__ = 'MIT' diff --git a/src/parascopy/call_variants.py b/src/parascopy/call_variants.py index 4a37470..fa9ee0a 100644 --- a/src/parascopy/call_variants.py +++ b/src/parascopy/call_variants.py @@ -149,8 +149,13 @@ def _create_complement_dupl_tree(duplications, table, genome, padding): def _write_record_coordinates(filenames, samples, locus, duplications, table, genome, args): ix_filename = bam_file_.CoordinatesIndex.index_name(filenames.read_coord) - if args.rerun != 'full' and os.path.isfile(filenames.read_coord) and common.non_empty_file(ix_filename): - return + if args.rerun != 'full' and os.path.isfile(filenames.read_coord) and os.path.isfile(ix_filename): + try: + coord_index = bam_file_.CoordinatesIndex(filenames.read_coord, samples, genome) + if coord_index.version >= 2: + return coord_index + except: + common.log('WARN: Could not read BAM coordinates file, writing it anew') common.log('[{}] Writing read coordinates'.format(locus.name)) with pysam.AlignmentFile(filenames.pooled) as in_bam: @@ -158,7 +163,7 @@ def _write_record_coordinates(filenames, samples, locus, duplications, table, ge max_mate_dist = int(comment_dict['max_mate_dist']) unique_tree = _create_complement_dupl_tree(duplications, table, genome, padding=max_mate_dist * 2) bam_file_.write_record_coordinates(in_bam, samples, unique_tree, genome, filenames.read_coord, comment_dict) - common.log('[{}] Finished writing read coordinates'.format(locus.name)) + return bam_file_.CoordinatesIndex(filenames.read_coord, samples, genome) def _analyze_sample(locus, sample_id, sample, all_read_allele_obs, coord_index, cn_profiles, assume_cn, @@ -208,7 +213,7 @@ def _analyze_sample(locus, sample_id, sample, all_read_allele_obs, coord_index, common.log('[{}] Sample {}: selected {} informative PSVs'.format(locus.name, sample, len(informative_psv_gts))) else: informative_psv_gts = potential_psv_gts - read_collection.add_psv_observations(informative_psv_gts, coord_index.max_mate_dist, varcall_params.no_mate_penalty) + read_collection.add_psv_observations(informative_psv_gts, varcall_params.no_mate_penalty) if debug_out is not None: read_collection.debug_read_probs(genome, debug_out) @@ -274,7 +279,7 @@ def analyze_locus(locus, model_params, data, samples, assume_cn): varcall_params = variants_.VarCallParameters(args, samples) filenames.read_coord = os.path.join(filenames.subdir, 'read_coordinates.bin') - _write_record_coordinates(filenames, samples, locus, duplications, table, genome, args) + coord_index = _write_record_coordinates(filenames, samples, locus, duplications, table, genome, args) cn_profiles = paralog_cn.CopyNumProfiles(filenames.cn_res, genome, samples, locus.chrom_id) filenames.read_allele = os.path.join(filenames.subdir, 'read_allele_obs.bin') @@ -299,8 +304,8 @@ def analyze_locus(locus, model_params, data, samples, assume_cn): extra_files = dict(genotypes='genotypes.csv', psv_use='psv_use.csv') if args.debug: extra_files.update(dict(debug_reads='debug_reads.log', debug_obs='debug_obs.log', debug='debug.log')) - with bam_file_.CoordinatesIndex(filenames.read_coord, samples, genome) as coord_index, \ - OutputFiles(filenames.subdir, extra_files) as out: + # Use `coord_index as coord_index` to open & close inner files. + with coord_index as coord_index, OutputFiles(filenames.subdir, extra_files) as out: if args.debug: out.debug_reads.write('name\thash\n') _debug_write_read_hashes(filenames.pooled, out.debug_reads) @@ -490,8 +495,10 @@ def main(prog_name=None, in_argv=None): 'genotype (genotype quality >= ) [default: %(default)s].') call_args.add_argument('--limit-depth', nargs=2, type=int, metavar=' ', default=(3, 2000), help='Min and max variant read depth [default: 3, 2000].') - call_args.add_argument('--max-strand-bias', type=float, metavar='', default=30, - help='Maximum strand bias (Phred p-value score) [default: %(default)s].') + call_args.add_argument('--strand-bias', type=float, metavar='', default=30, + help='Maximum strand bias (Phred score) [default: %(default)s].') + call_args.add_argument('--unpaired-bias', type=float, metavar='', default=30, + help='Maximum unpaired bias (Phred score) [default: %(default)s].') call_args.add_argument('--max-agcn', type=int, metavar='', default=10, help='Maximum aggregate copy number [default: %(default)s].') @@ -537,6 +544,7 @@ def main(prog_name=None, in_argv=None): else: assert args.output != args.parascopy common.mkdir(args.output) + common.mkdir(os.path.join(args.output, 'loci')) separate_output = True detect_cn.write_command(os.path.join(args.output, 'command_call.txt')) diff --git a/src/parascopy/inner/bam_file.py b/src/parascopy/inner/bam_file.py index d7b1531..f1b8715 100644 --- a/src/parascopy/inner/bam_file.py +++ b/src/parascopy/inner/bam_file.py @@ -1,4 +1,5 @@ import re +import io import struct from enum import Enum import tempfile @@ -6,6 +7,7 @@ import numpy as np from collections import defaultdict from intervaltree import IntervalTree +import construct from .duplication import Duplication from .genome import Interval @@ -157,56 +159,67 @@ def has_orig_aln(self): return self == ReadStatus.Realigned -def _aln_b_is_true_position(cigar_a, cigar_b, aln_region_b, strand_b, baseq, unique_tree, min_tail): - has_clipping_a = True if cigar_a is None else cigar_a.has_true_clipping(baseq) - has_clipping_b = cigar_b.has_true_clipping(common.cond_reverse(baseq, strand=strand_b)) - # Same as (not has_clipping_a or has_clipping_b). - if has_clipping_a <= has_clipping_b: +def _old_aln_is_true(cigar1, cigar2, aln_region2, strand2, baseq, unique_tree, min_tail): + """ + Checks if the old alignment (*2) represents the true location of the read. + """ + has_clipping1 = True if cigar1 is None else cigar1.has_true_clipping(baseq) + has_clipping2 = cigar2.has_true_clipping(common.cond_reverse(baseq, strand=strand2)) + # Same as (not has_clipping1 or has_clipping2). + if has_clipping1 <= has_clipping2: return False - return unique_tree.intersection_size(aln_region_b) >= min_tail + return unique_tree.intersection_size(aln_region2) >= min_tail class RecordCoord: - class LocationInfo(Enum): - """ - Alignment region can have unknown status (unknown if correct or incorrect), - certainly correct (it is known that self.aln_region is the true location), - or certainly incorrect (self.aln_region is not the true location, however the true location is unknown). - """ - Unknown = 0 - CertIncorrect = 1 - CertCorrect = 2 - - IS_PAIRED = 0b00000001 - IS_REVERSE = 0b00000010 + IS_PAIRED = 0b00000001 + IS_REVERSE = 0b00000010 + # There is no new alignment. + NEW_UNMAPPED = 0b00000100 + # There is no need to store old location, it is equal to the new. + OLD_EQ_NEW = 0b00001000 LOC_INFO_SHIFT = 6 - MAX_U16 = 0xffff - byte_struct = None - @staticmethod - def init_byte_struct(): - """ - Binary format takes 19 bytes per read: - Name Bytes Comment - hash 8 Hash of the read name, last bit is 1 if this is the first mate. - seq_len 2 - chrom_id 2 - aln_start 4 0-based position - aln_len 2 - flag 1 - """ - if RecordCoord.byte_struct is None: - RecordCoord.byte_struct = struct.Struct('=QHHIHB') + # + # Name Bytes Comment + # hash 8 Hash of the read name, last bit is 1 if this is the first mate. + # flag 1 Information about the read: strand, SE/PE, location info. + # seq_len V + # new_chrom_id V New alignment location. Only present if !NEW_UNMAPPED. + # new_start V 0-based start + # new_len V + # old_chrom_id V Original alignment location. Only present if !OLD_EQ_NEW. + # old_start V 0-based start + # old_len V + # + RecordStruct = construct.Struct( + 'hash' / construct.Int64un, # un = Unsigned native, + 'flag' / construct.Int8un, + 'seq_len' / construct.VarInt, + 'new_region' / construct.IfThenElse(construct.this.flag & NEW_UNMAPPED, + construct.Pass, construct.VarInt[3]), + 'old_region' / construct.IfThenElse(construct.this.flag & OLD_EQ_NEW, + construct.Pass, construct.VarInt[3]), + ) - def __init__(self): - RecordCoord.init_byte_struct() + class LocationInfo(Enum): + Unknown = 0 + # New location is certainly incorrect + NewIncorrect = 1 + # Old location was certainly correct + OldCorrect = 2 + def __init__(self): self.read_hash = None - self.seq_len = None - self.aln_region = None + self.is_paired = None + self.is_reverse = None self.location_info = RecordCoord.LocationInfo.Unknown - self.is_paired = False - self.is_reverse = False + self.seq_len = None + # New and old alignment regions (can be the same). + self.new_region = None + self.old_region = None + # It is possible that there are many realignments of the same read. + # Here, we will refer all of them to each other. self.other_entries = None def add_entry(self, other): @@ -215,17 +228,22 @@ def add_entry(self, other): else: self.other_entries = [other] - def get_certainly_correct_location(self): - return self.aln_region if self.location_info == RecordCoord.LocationInfo.CertCorrect else None + def get_true_location(self): + """ + Returns a true location, if it is certainly known. Otherwise, returns None. + """ + return self.old_region if self.location_info == RecordCoord.LocationInfo.OldCorrect else None - def get_certainly_incorrect_locations(self): + def get_forbidden_locations(self): + """ + Returns all forbidden locations, if they are known. + """ res = [] - if self.location_info == RecordCoord.LocationInfo.CertIncorrect: - res.append(self.aln_region) - if self.other_entries: - for entry in self.other_entries: - if entry.location_info == RecordCoord.LocationInfo.CertIncorrect: - res.append(entry.aln_region) + if self.location_info == RecordCoord.LocationInfo.NewIncorrect: + res.append(self.new_region) + for entry in self.other_entries or (): + if entry.location_info == RecordCoord.LocationInfo.NewIncorrect: + res.append(entry.new_region) return tuple(res) @classmethod @@ -237,79 +255,90 @@ def from_pooled_record(cls, record, unique_tree, genome, min_mapq=50, min_unique self.is_paired = record.is_paired self.is_reverse = record.is_reverse - if not record.is_unmapped: - chrom_a = genome.chrom_id(record.reference_name) - self.aln_region = aln_region_a = Interval(chrom_a, record.reference_start, record.reference_end) - cigar_a = Cigar.from_pysam_tuples(record.cigartuples) - mapq_a = record.mapping_quality - # print(' Alignment A: {} {} {}'.format(aln_region_a.to_str(genome), cigar_a.to_str('.'), mapq_a)) - # print(' unique tail A = {}'.format(unique_tree.intersection_size(aln_region_a))) + if record.is_unmapped: + cigar1 = None else: - cigar_a = None - - if record.has_tag('OA'): - oa_tag = record.get_tag('OA').split(',') - chrom_b = genome.chrom_id(oa_tag[0]) - start_b = int(oa_tag[1]) - 1 - strand_b = oa_tag[2] == '+' - cigar_b = Cigar(oa_tag[3]) - mapq_b = int(oa_tag[4]) - - aln_region_b = Interval(chrom_b, start_b, start_b + cigar_b.ref_len) - if self.aln_region is None: - self.aln_region = aln_region_b - # print(' Alignment B: {} {} {}'.format(aln_region_b.to_str(genome), cigar_b.to_str('.'), mapq_b)) - # print(' unique tail B = {}'.format(unique_tree.intersection_size(aln_region_b))) - - # Do not check mapq_a because it is always 60. - if mapq_b >= min_mapq and _aln_b_is_true_position( - cigar_a, cigar_b, aln_region_b, strand_b, record.query_qualities, unique_tree, min_unique_tail): - self.aln_region = aln_region_b - self.location_info = RecordCoord.LocationInfo.CertCorrect - elif cigar_a is None or cigar_b.aligned_len > cigar_a.aligned_len + min_unique_tail: - self.location_info = RecordCoord.LocationInfo.CertIncorrect - - elif mapq_a >= min_mapq and not cigar_a.has_true_clipping(record.query_qualities) and \ - unique_tree.intersection_size(aln_region_a) >= min_unique_tail: - self.aln_region = aln_region_a - self.location_info = RecordCoord.LocationInfo.CertCorrect - # print(' -> location info = {}'.format(self.location_info)) - - assert self.aln_region is not None + chrom1 = genome.chrom_id(record.reference_name) + self.new_region = Interval(chrom1, record.reference_start, record.reference_end) + cigar1 = Cigar.from_pysam_tuples(record.cigartuples) + mapq1 = record.mapping_quality + + # Parsing old alignment. + oa_tag = record.get_tag('OA').split(',') + chrom2 = genome.chrom_id(oa_tag[0]) + start2 = int(oa_tag[1]) - 1 + strand2 = oa_tag[2] == '+' + cigar2 = Cigar(oa_tag[3]) + mapq2 = int(oa_tag[4]) + self.old_region = Interval(chrom2, start2, start2 + cigar2.ref_len) + + if self.new_region is not None and self.new_region == self.old_region \ + and not cigar1.has_true_clipping(record.query_qualities) \ + and unique_tree.intersection_size(self.new_region) >= min_unique_tail: + self.location_info = RecordCoord.LocationInfo.OldCorrect + + elif mapq2 >= min_mapq and _old_aln_is_true(cigar1, cigar2, self.old_region, strand2, + record.query_qualities, unique_tree, min_unique_tail): + self.location_info = RecordCoord.LocationInfo.OldCorrect + + elif cigar1 is not None and cigar2.aligned_len > cigar1.aligned_len + min_unique_tail: + # Redundant to add NewIncorrect if cigar1 is None. + self.location_info = RecordCoord.LocationInfo.NewIncorrect return self def write_binary(self, out): + data = construct.Container(hash=self.read_hash, seq_len=self.seq_len, new_region=None, old_region=None) flag = 0 if self.is_paired: flag |= RecordCoord.IS_PAIRED if self.is_reverse: flag |= RecordCoord.IS_REVERSE - flag |= self.location_info.value << RecordCoord.LOC_INFO_SHIFT - out.write(RecordCoord.byte_struct.pack( - self.read_hash, - min(self.seq_len, RecordCoord.MAX_U16), - self.aln_region.chrom_id, - self.aln_region.start, - min(len(self.aln_region), RecordCoord.MAX_U16), - flag, - )) + if self.new_region is None: + flag |= RecordCoord.NEW_UNMAPPED + else: + data.new_region = (self.new_region.chrom_id, self.new_region.start, len(self.new_region)) + + if self.new_region is not None and self.new_region == self.old_region: + flag |= RecordCoord.OLD_EQ_NEW + else: + data.old_region = (self.old_region.chrom_id, self.old_region.start, len(self.old_region)) + + data.flag = flag | (self.location_info.value << RecordCoord.LOC_INFO_SHIFT) + out.write(RecordCoord.RecordStruct.build(data)) + + @staticmethod + def parse_interval(seq): + if seq is None: + return None + chrom, start, length = seq + return Interval(chrom, start, start + length) @classmethod - def from_binary(cls, buffer, offset=0): + def from_binary(cls, stream, offset=0): self = cls() - (read_hash, seq_len, chrom_id, start, region_len, flag) = RecordCoord.byte_struct.unpack_from(buffer, offset) - self.read_hash = np.uint64(read_hash) - self.seq_len = seq_len - self.aln_region = Interval(chrom_id, start, start + region_len) + struct = RecordCoord.RecordStruct.parse_stream(stream) + self.read_hash = np.uint64(struct.hash) + flag = struct.flag self.is_paired = bool(flag & RecordCoord.IS_PAIRED) self.is_reverse = bool(flag & RecordCoord.IS_REVERSE) self.location_info = RecordCoord.LocationInfo(flag >> RecordCoord.LOC_INFO_SHIFT) + + self.seq_len = struct.seq_len + self.new_region = RecordCoord.parse_interval(struct.new_region) + self.old_region = RecordCoord.parse_interval(struct.old_region) + assert self.new_region is not None or (flag & RecordCoord.NEW_UNMAPPED) + if self.old_region is None: + assert flag & RecordCoord.OLD_EQ_NEW + self.old_region = self.new_region return self def to_str(self, genome=None): - return 'Hash {}: status {}, length {}, alignment {}, {}'.format(self.read_hash, self.status.name, self.seq_len, - self.aln_region.to_str(genome) if genome else repr(self.aln_region), self.location_info) + return '{:x}: length {}, new {}, old {}, {}'.format( + self.read_hash, self.seq_len, + '*' if self.new_region is None else (self.new_region.to_str(genome) if genome else repr(self.new_region)), + '*' if self.old_region is None else (self.old_region.to_str(genome) if genome else repr(self.old_region)), + self.location_info) def write_record_coordinates(in_bam, samples, unique_tree, genome, out_filename, comment_dict): @@ -319,33 +348,40 @@ def write_record_coordinates(in_bam, samples, unique_tree, genome, out_filename, read_groups[read_group] = samples.id_or_none(sample) with tempfile.TemporaryDirectory(prefix='parascopy') as wdir: - tmp_files = [] + n_entries = [0] * len(samples) + tmp_filenames = [os.path.join(wdir, str(i)) for i in range(n_samples)] try: - for i in range(n_samples): - tmp_files.append(open(os.path.join(wdir, str(i)), 'wb')) + tmp_files = [] + for tmp_filename in tmp_filenames: + tmp_files.append(open(tmp_filename, 'wb')) for record in in_bam: coord = RecordCoord.from_pooled_record(record, unique_tree, genome) sample_id = read_groups[record.get_tag('RG')] if sample_id is not None: coord.write_binary(tmp_files[sample_id]) + n_entries[sample_id] += 1 finally: for f in tmp_files: f.close() with open(out_filename, 'wb') as out: - index_str = [genome.table_header()] + # Header begins with the version of the coordinates file. + index_str = ['#v2\n', genome.table_header()] for key, val in comment_dict.items(): index_str.append('# {}={}\n'.format(key, val)) + index_str.append('#sample\tstart_offset\tend_offset\tn_entries\n') offset = 0 - for i in range(n_samples): - with open(os.path.join(wdir, str(i)), 'rb') as tmp_file: + for sample, tmp_filename, sample_entries in zip(samples, tmp_filenames, n_entries): + with open(tmp_filename, 'rb') as tmp_file: data = tmp_file.read() out.write(data) new_offset = offset + len(data) - index_str.append('{}\t{}\t{}\n'.format(samples[i], offset, new_offset)) + index_str.append('{}\t{}\t{}\t{}\n'.format(sample, offset, new_offset, sample_entries)) offset = new_offset - with open(CoordinatesIndex.index_name(out_filename), 'w') as out_index: - out_index.writelines(index_str) + + # Write full index at once so that we have no situation, where both files exist, but are incomplete. + with open(CoordinatesIndex.index_name(out_filename), 'w') as out_index: + out_index.writelines(index_str) class CoordinatesIndex: @@ -354,40 +390,50 @@ def index_name(path): return path + '.ix' def __init__(self, path, samples, genome): + self.version = 0 self.index = [None] * len(samples) comment_dict = {} - with open(self.index_name(path)) as in_index: - if not genome.matches_header(next(in_index)): + with open(CoordinatesIndex.index_name(path)) as f: + line = next(f) + if line.startswith('#v'): + self.version = int(line[2:]) + line = next(f) + if not genome.matches_header(line): raise ValueError('Input reference fasta does not match read coordinates file {}.\n'.format(path) + 'Consider deleting it or run parascopy with --rerun full.') - for line in in_index: - if line.startswith('# ') and '=' in line: - key, val = line[2:].strip().split('=', 1) - comment_dict[key] = val + + for line in f: + if line.startswith('#'): + if '=' in line: + key, val = line[2:].strip().split('=', 1) + comment_dict[key] = val continue - sample, start, end = line.strip().split('\t') - self.index[samples.id(sample)] = (int(start), int(end)) + sample, start, end, n_entries = line.strip().split('\t') + self.index[samples.id(sample)] = (int(start), int(end), int(n_entries)) self.max_mate_dist = int(comment_dict['max_mate_dist']) self.path = path self.file = None - RecordCoord.init_byte_struct() def load(self, sample_id): - start, end = self.index[sample_id] + start, end, n_entries = self.index[sample_id] length = end - start self.file.seek(start) - buffer = self.file.read(length) - struct_size = RecordCoord.byte_struct.size + stream = io.BytesIO(self.file.read(length)) coordinates = {} - for offset in range(0, length, struct_size): - coord = RecordCoord.from_binary(buffer, offset) + for _ in range(n_entries): + coord = RecordCoord.from_binary(stream) if coord.read_hash in coordinates: coordinates[coord.read_hash].add_entry(coord) else: coordinates[coord.read_hash] = coord + try: + if stream.__getstate__()[1] != length: + common.log('WARN: Possibly, not all entries were read from the coordinates file') + except AttributeError: + pass return coordinates def open(self): diff --git a/src/parascopy/inner/common.py b/src/parascopy/inner/common.py index 86dd072..3847a2a 100644 --- a/src/parascopy/inner/common.py +++ b/src/parascopy/inner/common.py @@ -457,3 +457,11 @@ def letter_suffix(index, chars=string.ascii_lowercase): if index < n: return chars[index] return chars[index // n - 1] + chars[index % n] + + +class Undefined: + def __bool__(self): + raise ValueError('Cannot cast UNDEF to bool') + + +UNDEF = Undefined() diff --git a/src/parascopy/inner/genome.py b/src/parascopy/inner/genome.py index 8ed00bd..d75aa74 100644 --- a/src/parascopy/inner/genome.py +++ b/src/parascopy/inner/genome.py @@ -433,7 +433,7 @@ def start_distance(self, other): def distance(self, other): """ - Returns distance between closest points of two duplications. + Returns distance between closest points of two intervals. Returns sys.maxsize if intervals lie on different chromosomes. """ if self._chrom_id != other._chrom_id: diff --git a/src/parascopy/inner/paralog_cn.py b/src/parascopy/inner/paralog_cn.py index 01f2782..feb1d41 100644 --- a/src/parascopy/inner/paralog_cn.py +++ b/src/parascopy/inner/paralog_cn.py @@ -576,9 +576,8 @@ def _get_ref_pscns(samples, genome, region_group, const_regions, modified_ref_cn return ([None] * n_samples,) * 2 return [def_agcn] * n_samples, [def_pscn] * n_samples - undefined = object() - ref_agcns = [undefined] * n_samples - ref_pscns = [undefined] * n_samples + ref_agcns = [common.UNDEF] * n_samples + ref_pscns = [common.UNDEF] * n_samples for i, region_ix in enumerate(region_group.region_ixs): const_region = const_regions[region_ix] diff --git a/src/parascopy/inner/variants.py b/src/parascopy/inner/variants.py index fc62670..b6c4d8f 100644 --- a/src/parascopy/inner/variants.py +++ b/src/parascopy/inner/variants.py @@ -458,43 +458,38 @@ def __init__(self, start): self.start = start -def strand_bias_test(forw_counts, rev_counts): +def strand_bias_test(strand_counts): """ + Input: np.array((2, n_alleles)). Find strand bias for each allele and returns an array with p-values. For each allele, run Fisher Exact test comparing forward/reverse counts against the sum forward/reverse counts for all other alleles. + + Returns Fisher test Phred-scores for each allele. """ - n_alleles = len(forw_counts) - forw_sum = np.sum(forw_counts) - rev_sum = np.sum(rev_counts) + n_alleles = strand_counts.shape[1] + forw_sum, rev_sum = np.sum(strand_counts, axis=1) total_sum = forw_sum + rev_sum - pvals = np.ones(n_alleles) + phreds = np.zeros(n_alleles) if forw_sum == 0 or rev_sum == 0: # One of the rows is 0 -- all p-values = 1. - return pvals + return phreds if n_alleles == 2: - col_sum1 = forw_counts[0] + rev_counts[0] - if col_sum1 == 0 or col_sum1 == total_sum: - # One of the columns is 0 -- all p-values = 1. - return pvals - - matrix = (forw_counts, rev_counts) - pvals[:] = fisher_exact(matrix)[1] - return pvals + if 0 < np.sum(strand_counts[:, 0]) < total_sum: + pval = fisher_exact(strand_counts)[1] + # Use +0 to remove -0. + phreds[:] = -10.0 * np.log10(pval) + 0.0 + # Otherwise, if there are no observations for one of the alleles -- all p-values = 1. + return phreds for i in range(n_alleles): - forw_i = forw_counts[i] - rev_i = rev_counts[i] - col_sum_i = forw_i + rev_i - if col_sum_i == 0 or col_sum_i == total_sum: - # One of the columns is 0 -- current p-values = 1. - continue - - matrix = ((forw_i, forw_sum - forw_i), (rev_i, rev_sum - rev_i)) - pvals[i] = fisher_exact(matrix)[1] - return pvals + forw_i, rev_i = strand_counts[:, i] + if 0 < forw_i + rev_i < total_sum: + pval = fisher_exact(((forw_i, forw_sum - forw_i), (rev_i, rev_sum - rev_i)))[1] + phreds[i] = -10.0 * np.log10(pval) + 0.0 + return phreds def _round_qual(qual): @@ -912,16 +907,24 @@ def update_vcf_records(self, gt_pred, genome): good_allele_counts = [0] * curr_n_alleles strand_allele_counts = [0] * (2 * curr_n_alleles) strand_phred = [0] * curr_n_alleles + paired_allele_counts = [0] * (2 * curr_n_alleles) + paired_phred = [0] * curr_n_alleles for old_allele_ix, new_allele_ix in enumerate(old_to_new): all_allele_counts[new_allele_ix] = int(gt_pred.all_allele_counts[old_allele_ix]) good_allele_counts[new_allele_ix] = int(gt_pred.good_allele_counts[old_allele_ix]) - strand_allele_counts[2 * new_allele_ix] = int(gt_pred.forw_counts[old_allele_ix]) - strand_allele_counts[2 * new_allele_ix + 1] = int(gt_pred.rev_counts[old_allele_ix]) - strand_phred[new_allele_ix] = gt_pred.strand_phred[old_allele_ix] + strand_allele_counts[2 * new_allele_ix] = int(gt_pred.strand_counts[0, old_allele_ix]) + strand_allele_counts[2 * new_allele_ix + 1] = int(gt_pred.strand_counts[1, old_allele_ix]) + strand_phred[new_allele_ix] = np.floor(gt_pred.strand_phred[old_allele_ix]) + paired_allele_counts[2 * new_allele_ix] = int(gt_pred.paired_counts[0, old_allele_ix]) + paired_allele_counts[2 * new_allele_ix + 1] = int(gt_pred.paired_counts[1, old_allele_ix]) + paired_phred[new_allele_ix] = np.floor(gt_pred.paired_phred[old_allele_ix]) rec_fmt['AD'] = all_allele_counts rec_fmt['ADq'] = good_allele_counts rec_fmt['SB'] = strand_allele_counts rec_fmt['FS'] = strand_phred + if True: # any(unpaired_count > 0 for unpaired_count in itertools.islice(paired_allele_counts, 1, None, 2)): + rec_fmt['PP'] = paired_allele_counts + rec_fmt['FP'] = paired_phred if gt_filter: rec_fmt['FILT'] = gt_filter.to_tuple() if i == 0: @@ -1118,7 +1121,12 @@ def create_vcf_headers(genome, argv, samples): 'Description="Number of pooled observations for each allele and strand ' '(forward REF, reverse REF, forward ALT[1], reverse ALT[1], etc).">') vcf_header.add_line('##FORMAT=') + 'Description="Phred-scaled p-value each allele checking if there is a strand bias">') + vcf_header.add_line('##FORMAT=') + vcf_header.add_line('##FORMAT=') if i == 1: vcf_header.add_line('##FORMAT=') vcf_header.add_line('##FORMAT== varcall_params.max_strand_bias: self.filter.add(VarFilter.StrandBias) - # abs to get rid of -0. - self.strand_phred = np.abs(np.floor(self.strand_phred)) + self.paired_phred = strand_bias_test(self.paired_counts) + if np.max(self.paired_phred) >= varcall_params.max_unpaired_bias: + self.filter.add(VarFilter.UnpairedBias) def update_read_locations(self): var_positions = tuple(var_pos.region for var_pos in self.variant_obs.variant_positions) @@ -2364,7 +2394,8 @@ def __init__(self, args, samples): self.min_read_depth = args.limit_depth[0] self.max_read_depth = args.limit_depth[1] - self.max_strand_bias = args.max_strand_bias + self.max_strand_bias = args.strand_bias + self.max_unpaired_bias = args.unpaired_bias self.min_freebayes_qual = args.limit_qual # def _set_use_af(self, args, samples): diff --git a/src/parascopy/pool_reads.py b/src/parascopy/pool_reads.py index 5ac3dc6..d0a4212 100644 --- a/src/parascopy/pool_reads.py +++ b/src/parascopy/pool_reads.py @@ -17,11 +17,11 @@ from . import long_version -_UNDEF = object() +UNDEF = common.UNDEF def _create_record(orig_record, header, read_groups, status, max_mate_dist, - *, dupl_strand=True, start=_UNDEF, cigar_tuples=_UNDEF, seq=_UNDEF, qual=_UNDEF): + *, dupl_strand=True, start=UNDEF, cigar_tuples=UNDEF, seq=UNDEF, qual=UNDEF): """ Creates a new record by taking the orig_record as a template. If start, cigar_tuples, seq, qual are not provided, take them frmo the original record. @@ -29,17 +29,17 @@ def _create_record(orig_record, header, read_groups, status, max_mate_dist, record = pysam.AlignedSegment(header) record.query_name = orig_record.query_name record.query_sequence = common.cond_rev_comp(orig_record.query_sequence, strand=dupl_strand) \ - if seq is _UNDEF else seq + if seq is UNDEF else seq record.query_qualities = common.cond_reverse(orig_record.query_qualities, strand=dupl_strand) \ - if qual is _UNDEF else qual + if qual is UNDEF else qual - if cigar_tuples is _UNDEF: + if cigar_tuples is UNDEF: assert dupl_strand cigar_tuples = orig_record.cigartuples # This is either input cigar_tuples, or orig_record.cigartuples. if cigar_tuples: record.reference_id = 0 - record.reference_start = orig_record.reference_start if start is _UNDEF else start + record.reference_start = orig_record.reference_start if start is UNDEF else start record.mapping_quality = 60 record.cigartuples = cigar_tuples if orig_record.is_reverse == dupl_strand: