From b096aca215724de7f70fd89371b9cc88c9db3054 Mon Sep 17 00:00:00 2001 From: Adam Ewing Date: Tue, 24 Jan 2017 12:25:52 +1000 Subject: [PATCH] allow per-variant SV VAF adjustment via varfile --- bin/addsv.py | 43 ++++++++++++++++++++++++++++++++++--------- test_data/test_sv.txt | 10 +++++----- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/bin/addsv.py b/bin/addsv.py index ab11059..05415fb 100644 --- a/bin/addsv.py +++ b/bin/addsv.py @@ -345,7 +345,7 @@ def makemut(args, bedline, alignopts): chrom = c[0] start = int(c[1]) end = int(c[2]) - araw = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP + araw = c[3:] # INV, DEL, INS, DUP, TRN # translocation specific trn_chrom = None @@ -355,6 +355,10 @@ def makemut(args, bedline, alignopts): is_transloc = c[3] == 'TRN' if is_transloc: + araw = [c[3]] + if len(c) > 7: + araw += c[7:] + start -= 3000 end += 3000 if start < 0: start = 0 @@ -366,16 +370,18 @@ def makemut(args, bedline, alignopts): actions = map(lambda x: x.strip(),' '.join(araw).split(',')) - svfrac = float(args.svfrac) # default, can be overridden by cnv file + svfrac = float(args.svfrac) # default, can be overridden by cnv file or per-variant + + cn = 1.0 if cnv: # CNV file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom,start,end): cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(("copy number in sv region:",chrom,str(start),str(end),"=",str(cn))) + "\n") - svfrac = 1.0/float(cn) - assert svfrac <= 1.0 - sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n") + svfrac = svfrac/float(cn) + assert svfrac <= 1.0, 'copy number from %s must be at least 1: %s' % (args.cnvfile, snregion.stri[()]) + sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted default MAF: " + str(svfrac) + "\n") print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c print "INFO\t" + now() + "\t" + mutid + "\tlength:", end-start @@ -485,22 +491,28 @@ def makemut(args, bedline, alignopts): assert len(a) > 1 # insertion syntax: INS [optional TSDlen] insseqfile = a[1] if not (os.path.exists(insseqfile) or insseqfile == 'RND'): # not a file... is it a sequence? (support indel ins.) - assert re.search('^[ATGCatgc]*$',insseqfile) # make sure it's a sequence + assert re.search('^[ATGCatgc]*$',insseqfile), "cannot determine SV type: %s" % insseqfile # make sure it's a sequence insseq = insseqfile.upper() insseqfile = None if len(a) > 2: # field 5 for insertion is TSD Length tsdlen = int(a[2]) - if len(a) > 3: # field 5 for insertion is motif, format = 'NNNN/NNNN where / is cut site + if len(a) > 3: # field 6 for insertion is motif, format = 'NNNN^NNNN where ^ is cut site ins_motif = a[3] assert '^' in ins_motif, 'insertion motif specification requires cut site defined by ^' + if len(a) > 4: # field 7 is VAF + svfrac = float(a[4])/cn + if action == 'DUP': if len(a) > 1: ndups = int(a[1]) else: ndups = 1 + if len(a) > 2: # VAF + svfrac = float(a[2])/cn + if action == 'DEL': if len(a) > 1: dsize = float(a[1]) @@ -511,9 +523,19 @@ def makemut(args, bedline, alignopts): else: dsize = 1.0 + if len(a) > 2: # VAF + svfrac = float(a[2])/cn + if action == 'TRN': - pass + if len(a) > 1: + svfrac = float(a[1])/cn + if action == 'INV': + if len(a) > 1: + svfrac = float(a[1])/cn + + + print "INFO\t" + now() + "\t" + mutid + "\tfinal VAF accounting for copy number %f: %f" % (cn, svfrac) logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n") @@ -579,6 +601,9 @@ def makemut(args, bedline, alignopts): print "INFO\t" + now() + "\t" + mutid + "\tset paired end mean distance: " + str(args.ismean) print "INFO\t" + now() + "\t" + mutid + "\tset paired end distance stddev: " + str(args.issd) + + + # simulate reads (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, actions, exclude, pemean, pesd, args.tmpdir, mutid=mutid, seed=args.seed, trn_contig=trn_maxcontig) @@ -748,7 +773,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='adds SVs to reads, outputs modified reads as .bam along with mates') parser.add_argument('-v', '--varfile', dest='varFileName', required=True, - help='whitespace-delimited target regions to try and add a SV: chrom,start,stop,action,seqfile (if insertion),TSDlength (if insertion)') + help='whitespace-delimited target regions for SV spike-in, see manual for syntax') parser.add_argument('-f', '--bamfile', dest='bamFileName', required=True, help='sam/bam file from which to obtain reads') parser.add_argument('-r', '--reference', dest='refFasta', required=True, diff --git a/test_data/test_sv.txt b/test_data/test_sv.txt index 50c7a19..e079d3f 100644 --- a/test_data/test_sv.txt +++ b/test_data/test_sv.txt @@ -1,5 +1,5 @@ -22 34171055 34184700 INS ACTAACCTGCACAATGTGCACATGTACCCTAAAACTTAGAGTATAATAAAAAAAA 13 AA^TTTT -22 33607508 33607508 TRN 22 34314981 34314981 -22 33871043 33884754 DEL 0.9 -22 34071043 34084754 INV -22 34271043 34284754 DUP 3 +22 34171055 34184700 INS ACTAACCTGCACAATGTGCACATGTACCCTAAAACTTAGAGTATAATAAAAAAAA 13 AA^TTTT 0.8 +22 33607508 33607508 TRN 22 34314981 34314981 0.9 +22 33871043 33884754 DEL 0.9 0.75 +22 34071043 34084754 INV 0.9 +22 34271043 34284754 DUP 3 0.9