diff --git a/badread/simulate.py b/badread/simulate.py index 8ba3e53..92714d5 100644 --- a/badread/simulate.py +++ b/badread/simulate.py @@ -37,6 +37,7 @@ def simulate(args, output=sys.stderr): ref_seqs, ref_depths, ref_circular = load_reference(args.reference, output) rev_comp_ref_seqs = {name: reverse_complement(seq) for name, seq in ref_seqs.items()} frag_lengths = FragmentLengths(args.mean_frag_length, args.frag_length_stdev, output) + adjust_depths(ref_seqs, ref_depths, ref_circular, frag_lengths, args) identities = Identities(args.mean_identity, args.identity_stdev, args.max_identity, output) error_model = ErrorModel(args.error_model, output) qscore_model = QScoreModel(args.qscore_model, output) @@ -157,7 +158,7 @@ def get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_con # repeatedly until we get a result. for _ in range(100): seq, info = get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs, - ref_contig_weights, ref_circular, args) + ref_contig_weights, ref_circular) if seq != '': return seq, info sys.exit('Error: failed to generate any sequence fragments - are your read lengths ' @@ -180,7 +181,7 @@ def get_fragment_type(args): def get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs, - ref_contig_weights, ref_circular, args): + ref_contig_weights, ref_circular): if len(ref_contigs) == 1: contig = ref_contigs[0] @@ -200,14 +201,10 @@ def get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs, info.append('0-' + str(len(seq))) return seq, info - # If the reference contig is circular and the fragment length is too long, then we either - # fail to get the read (if --small_plasmid_bias was used) or bring the fragment size back - # down to the contig size. + # If the reference contig is circular and the fragment length is too long, then we fail to get + # the read. 
if fragment_length > len(seq) and ref_circular[contig]: - if args.small_plasmid_bias: - return '', '' - else: - fragment_length = len(seq) + return '', '' start_pos = random.randint(0, len(seq)-1) end_pos = start_pos + fragment_length @@ -489,3 +486,26 @@ def print_intro(output): print('', file=output) print(f'Badread v{__version__}', file=output) print(f'long read simulation', file=output) + + +def adjust_depths(ref_seqs, ref_depths, ref_circular, frag_lengths, args): + sampled_lengths = [frag_lengths.get_fragment_length() for x in range(100000)] + total = sum(sampled_lengths) + for ref_name, ref_seq in ref_seqs.items(): + ref_len = len(ref_seq) + ref_circ = ref_circular[ref_name] + + # Circular plasmids may have to have their depth increased to compensate for misses. + if not args.small_plasmid_bias and ref_circ: + passing_total = sum(length for length in sampled_lengths if length <= ref_len) + if passing_total == 0: + sys.exit('Error: fragment length distribution incompatible with reference lengths ' + '- try running with --small_plasmid_bias to avoid this error') + adjustment = total / passing_total + ref_depths[ref_name] *= adjustment + + # Linear plasmids may have to have their depth increased to compensate for truncations. + if not ref_circ: + passing_total = sum(min(ref_len, length) for length in sampled_lengths) + adjustment = total / passing_total + ref_depths[ref_name] *= adjustment diff --git a/badread/version.py b/badread/version.py index 1dc0c1e..fe569a1 100644 --- a/badread/version.py +++ b/badread/version.py @@ -14,4 +14,4 @@ If not, see . 
""" -__version__ = '0.1.3' +__version__ = '0.1.4' diff --git a/test/test_fragments.py b/test/test_fragments.py index 0a3be66..75ca488 100644 --- a/test/test_fragments.py +++ b/test/test_fragments.py @@ -322,28 +322,6 @@ def setUp(self): def tearDown(self): self.null.close() - def test_bias_off(self): - # When bias is off, the fragments end up at the reference length (even though the frag - # length distribution wants them bigger). - start_adapt_rate, start_adapt_amount = 0.0, 0.0 - end_adapt_rate, end_adapt_amount = 0.0, 0.0 - Args = collections.namedtuple('Args', ['start_adapter_seq', 'end_adapter_seq', - 'junk_reads', 'random_reads', 'chimeras', - 'glitch_rate', 'glitch_size', 'glitch_skip', - 'small_plasmid_bias']) - args = Args(start_adapter_seq='', end_adapter_seq='', - junk_reads=0, random_reads=0, chimeras=0, - glitch_rate=0, glitch_size=0, glitch_skip=0, - small_plasmid_bias=False) - for _ in range(self.trials): - fragment, info = \ - badread.simulate.build_fragment(self.lengths, self.ref_seqs, self.rev_comp_ref_seqs, - self.ref_contigs, self.ref_contig_weights, - self.ref_circular, args, start_adapt_rate, - start_adapt_amount, end_adapt_rate, - end_adapt_amount) - self.assertEqual(len(fragment), 1000) - def test_bias_on(self): # When bias is on, the program quits with an error, because it can't make any fragments. start_adapt_rate, start_adapt_amount = 0.0, 0.0 diff --git a/test/test_simulate2.py b/test/test_simulate2.py index 1626631..ee956b0 100644 --- a/test/test_simulate2.py +++ b/test/test_simulate2.py @@ -134,19 +134,6 @@ def test_counts_3(self): self.assertGreater(ratio, 1.6) self.assertLess(ratio, 2.4) - def test_plasmid_bias_off(self): - # Contig H is 1% the size of contig G but 100x the depth, so it should get about the same - # number of reads. 
- ref_filename = os.path.join(os.path.dirname(__file__), 'test_ref_5.fasta') - with badread.misc.captured_output() as (out, err): - sequence(ref_filename) - out, err = out.getvalue().strip(), err.getvalue().strip() - counts = count_ref_contigs(out) - g_count, h_count = counts['G'], counts['H'] - ratio = h_count / g_count - self.assertGreater(ratio, 0.6) - self.assertLess(ratio, 1.4) - def test_plasmid_bias_on(self): # When --small_plasmid_bias is on, contig H gets few to no reads, because its length # (50 bp) is much less than the mean fragment length (100 bp).