CAST-genomics · mlamkin7 · Feb 23, 2023 · Feb 16, 2023 · Feb 17, 2023 · Feb 17, 2023
diff --git a/docs/commands/simgenotype.rst b/docs/commands/simgenotype.rst
@@ -37,6 +37,7 @@ Parameter Descriptions
 * ``--region`` - Limit the simulation to a region within a single chromosome. Overwrites chroms with the chrom listed in this region. eg 1:1-10000 [Optional]
 * ``--pop_field`` - Flag for ouputting population field in VCF output. Note this flag does not work when your output is in PGEN format. [Optional]
 * ``--sample_field`` - Flag for ouputting sample field in VCF output. Note this flag does not work when your output is in PGEN format. Should only be used for debugging. [Optional]
+* ``--no_replacement`` - Flag for deteremining during the VCF generation process whether we grab samples' haplotypes with or without replacement from the reference VCF file. Default = False (With replacement) [Optional]
 * ``--verbosity`` - What level of output the logger should print to stdout. Please see `logging levels <https://docs.python.org/3/library/logging.html>`_ for output levels. Default = INFO [Optional]
 * ``--only_breakpoint`` - Flag which when provided only outputs the breakpoint file. Note you will not need to provide a ``--ref_vcf`` or ``--sample_info`` file and can instead put NA. eg.  ``--ref_vcf NA`` and ``--sample_info NA`` [Optional]
 

diff --git a/haptools/__main__.py b/haptools/__main__.py
@@ -182,6 +182,16 @@ def karyogram(bp, sample, out, title, centromeres, colors, verbosity):
         " flag does not work when your output file is in PGEN format."
     ),
 )
+@click.option(
+    "--no_replacement",
+    is_flag=True,
+    required=False,
+    default=False,
+    help=(
+        "Flag used to determine whether to sample reference haplotypes"
+        " with or without replacement. (Default = Replacement)"
+    ),
+)
 @click.option(
     "--only_breakpoint",
     is_flag=True,
@@ -211,6 +221,7 @@ def simgenotype(
     region,
     pop_field,
     sample_field,
+    no_replacement,
     only_breakpoint,
     verbosity,
 ):
@@ -264,7 +275,15 @@ def simgenotype(
 
     # simulate breakpoints
     popsize = validate_params(
-        model, mapdir, chroms, popsize, ref_vcf, sample_info, region, only_breakpoint
+        model,
+        mapdir,
+        chroms,
+        popsize,
+        ref_vcf,
+        sample_info,
+        no_replacement,
+        region,
+        only_breakpoint,
     )
     samples, pop_dict, breakpoints = simulate_gt(
         model, mapdir, chroms, region, popsize, log, seed
@@ -284,6 +303,7 @@ def simgenotype(
             region,
             pop_field,
             sample_field,
+            no_replacement,
             out,
             log,
         )

diff --git a/haptools/sim_genotype.py b/haptools/sim_genotype.py
@@ -22,6 +22,7 @@ def output_vcf(
         region, 
         pop_field, 
         sample_field, 
+        no_replacement,
         out, 
         log
     ):
@@ -62,6 +63,9 @@ def output_vcf(
         Flag to determine whether to have the population field in the VCF file output
     sample_field: boolean
         Flag to determine whether to have the sample field in the VCF file output
+    no_replacement: boolean
+        Flag to determine whether we sample from the reference VCF with or without
+        replacement. When True there will be no replacement.
     out: str
         output prefix
     log: log object
@@ -91,6 +95,11 @@ def output_vcf(
             pop_sample[pop].append(sample)
     log.debug(f"Filtered sample info to limit populations within model file. Populations: {pops}.")
 
+    # Shuffle samples for each pop if no_replacement to have random selection of samples
+    if no_replacement:
+        for key in pop_sample.keys():
+            np.random.shuffle(pop_sample[key])
+
     # check if pops without admixed is same as grabbed populations
     assert len(pops)-1 == len(list(set(pop_sample.keys())))
 
@@ -113,6 +122,9 @@ def output_vcf(
     for ind, sample in enumerate(vcf.samples):
         sample_dict[sample] = ind
 
+    # initialize hap_used array which should contain list of lists for each reference sample's genome and what segment has been used for each
+    haps_used = [[] for samp in range(len(vcf.samples)*2)]
+
     log.debug(f"Created index array storing per sample which segment is being processed.")
 
     # Load populations and samples from sample info and breakpoints as 2 matrices with size variants x samples x 2
@@ -140,8 +152,10 @@ def output_vcf(
         for chrom in chroms:
             # limit reference vcf variants to current chrom
             ref_vars_chrom = ref_vars["pos"][ref_vars["chrom"] == f"{cur_chrom}{chrom}"]
-            # Convert haplotype to numpy array of segment position, populations, and samples
-            hap_positions, hap_pops, hap_samples_name, hap_samples_ind = _convert_haplotype(haplotype, chrom, pop_dict, pop_sample, sample_dict)
+
+            # Convert haplotype to numpy array of segment position, populations, samples, and sample haplotypes
+            hap_positions, hap_pops, hap_samples_name, hap_samples_ind, hap_sample_haps = \
+                    _convert_haplotype(haplotype, chrom, pop_dict, pop_sample, sample_dict, haps_used, no_replacement)
 
             # if the variant position is = breakpoint end then we consider it part of that bkp
             bkp_pos = np.searchsorted(ref_vars_chrom, hap_positions, side='right')
@@ -153,12 +167,19 @@ def output_vcf(
             # create ref_gts from mapping hap_samples to their respective genotypes 
             ref_sample_inds_chrom = np.repeat(hap_samples_ind, inter_len)
 
+            # select which alleles for each segment
+            if no_replacement:
+                ref_sample_haps_chrom = np.repeat(hap_sample_haps, inter_len)
+
+
             # grab random haplotype from samples and use as gts for our simulated samples
             end_var = cur_var+ref_sample_inds_chrom.shape[0]
             gt_vars = np.arange(cur_var, end_var, 1)
+            if not no_replacement:
+                ref_sample_haps_chrom = np.random.randint(2, size=len(gt_vars))
             ref_gts_chrom = vcf.data[ref_sample_inds_chrom,
                                      gt_vars,
-                                     np.random.randint(2, size=len(gt_vars))]
+                                     ref_sample_haps_chrom]
             ref_gts[cur_var:end_var] = ref_gts_chrom
 
             if pop_field:
@@ -208,21 +229,49 @@ def output_vcf(
 
     return
 
-def _convert_haplotype(haplotype, chrom, pop_dict, pop_sample, sample_dict):
+def _convert_haplotype(haplotype, chrom, pop_dict, pop_sample, sample_dict, haps_used, no_replacement):
     if chrom == 'X': chrom = 23
     # Do binary search to find beginning of chr in haplotype segments
     hap_start_ind = start_segment(0, int(chrom), haplotype)
     hap_subset = haplotype[hap_start_ind:]
     hap_pos = []
     hap_pops = []
+    hap_inds = []
     hap_samples = []
     hap_samples_ind = []
 
     # collect all segments within chromosome
     for segment in hap_subset:
         if not segment.get_chrom() == int(chrom):
             break
-        sample_name = np.random.choice(pop_sample[pop_dict[segment.get_pop()]])
+
+        # Grab reference sample to take variants for current segment
+        population = pop_dict[segment.get_pop()]
+
+        # Sample without replacement by keeping track of all segments used for each sample
+        if no_replacement:
+            if not hap_pos:
+                sample_name, hap_ind = _find_random_sample(
+                                                pop_sample[population],
+                                                sample_dict,
+                                                haps_used,
+                                                chrom,
+                                                0,
+                                                segment.get_end_coord(),
+                                                )
+            else:
+                sample_name, hap_ind = _find_random_sample(
+                                                pop_sample[population],
+                                                sample_dict,
+                                                haps_used,
+                                                chrom,
+                                                hap_pos[-1]+1,
+                                                segment.get_end_coord(),
+                                                )
+            hap_inds.append(hap_ind)
+        else:
+            sample_name = np.random.choice(pop_sample[population])
+
         hap_pos.append(segment.get_end_coord())
         hap_pops.append(segment.get_pop())
         hap_samples.append(sample_name)
@@ -231,7 +280,30 @@ def _convert_haplotype(haplotype, chrom, pop_dict, pop_sample, sample_dict):
     return np.asarray(hap_pos, dtype=np.int64), \
            np.asarray(hap_pops, dtype=np.uint8), \
            np.asarray(hap_samples, dtype=object), \
-           np.asarray(hap_samples_ind, dtype=np.int32)
+           np.asarray(hap_samples_ind, dtype=np.int32), \
+           np.asarray(hap_inds, dtype=np.uint8)
+
+def _find_random_sample(samples, sample_dict, haps_used, chrom, start_coord, end_coord):
+    for sample in samples:
+        for haplotype in range(2):
+            # Determine whether the coord we want to sample is being used
+            if _find_coord(haps_used[sample_dict[sample]*2+haplotype], chrom, start_coord, end_coord):
+                continue
+            else:
+                return sample, haplotype
+
+    raise Exception(f"No available sample for the current coords {start_coord}-{end_coord}.")
+
+def _find_coord(cur_hap, chrom, start_coord, end_coord):
+    if cur_hap:
+        for coords in cur_hap:
+            # check for whether the segment has already been used or partially been used
+            if chrom == coords[0]:
+                if start_coord <= coords[1] < end_coord or start_coord < coords[2] <= end_coord: 
+                    return True
+    # segment isn't found so add segment to current sample's haplotype
+    cur_hap.append((chrom, start_coord, end_coord))
+    return False
 
 def simulate_gt(model_file, coords_dir, chroms, region, popsize, log, seed=None):
     """
@@ -738,7 +810,7 @@ def start_segment(start, chrom, segments):
 
     return len(segments)
 
-def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region=None, only_bp=False):
+def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, no_replacement, region=None, only_bp=False):
     # validate model file
     mfile = open(model, 'r')
     num_samples, *pops = mfile.readline().strip().split()
@@ -832,19 +904,27 @@ def validate_params(model, mapdir, chroms, popsize, invcf, sample_info, region=N
 
     # validate sample_info file (ensure pops given are in model file and samples in vcf file)
     # ensure sample_info col 2 in pops
+    total_per_pop = {}
     sample_pops = set()
     for line in open(sample_info, 'r'):
         sample = line.split()[0]
         info_pop = line.split()[1]
         sample_pops.add(info_pop)
+        if not total_per_pop.get(info_pop, False):
+            total_per_pop[info_pop] = 1
+        else:
+            total_per_pop[info_pop] += 1
 
         if sample not in vcf_samples:
             raise Exception(f"Sample {sample} in sampleinfo file is not present in the vcf file.")
 
-    # Ensure that all populations from the model file are listed in the sample info file
+    # Ensure that all populations from the model file are listed in the sample info file 
+    #    and that there is sufficient enough samples when no_replacement is specified
     for model_pop in pops[1:]:
         if model_pop not in list(sample_pops):
             raise Exception(f"Population {model_pop} in model file is not present in the sample info file.")
+        if no_replacement and total_per_pop[model_pop] < num_samples:
+            raise Exception(f"Population {model_pop} does not have enough samples to sample without replacement. Please ensure that each population specified has >= total number of samples output: {num_samples}")
 
     # Ensure that the region parameter can be properly interpreted
     if region:

diff --git a/tests/data/outvcf_info-one_CEU.tab b/tests/data/outvcf_info-one_CEU.tab
@@ -0,0 +1,5 @@
+HG00096 CEU
+HG00097 YRI
+HG00099 YRI
+HG00100 YRI
+HG00101 YRI
diff --git a/tests/data/outvcf_test_no_replace.bp b/tests/data/outvcf_test_no_replace.bp
@@ -0,0 +1,20 @@
+Sample_1_1
+YRI	1	59423086	85.107755
+CEU	1	239403765	266.495714
+YRI	2	229668157	244.341689
+YRI	23	229668157	244.341689
+Sample_1_2
+YRI	1	59423086	85.107755
+YRI	1	239403765	266.495714
+CEU	2	229668157	244.341689
+YRI	23	229668157	244.341689
+Sample_2_1
+CEU	1	59423086	85.107755
+YRI	1	239403765	266.495714
+CEU	2	229668157	244.341689
+YRI	23	229668157	244.341689
+Sample_2_2
+CEU	1	59423086	85.107755
+CEU	1	239403765	266.495714
+YRI	2	229668157	244.341689
+YRI	23	229668157	244.341689
diff --git a/tests/data/outvcf_test_no_replace.vcf b/tests/data/outvcf_test_no_replace.vcf
@@ -0,0 +1,11 @@
+##fileformat=VCFv4.2
+##filedate=20180225
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##contig=<ID=chr1>
+##contig=<ID=chr2>
+##contig=<ID=chrX>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG00096	HG00097	HG00099	HG00100	HG00101
+chr1	10114	1:10114:T:C	T	C	.	.	.	GT	1|1	1|1	0|0	0|0	0|0
+chr1	59423090	1:59423090:A:G	A	G	.	.	.	GT	0|0	0|0	1|1	1|1	1|1
+chr2	10122	2:10122:A:G	A	G	.	.	.	GT	0|0	0|0	1|1	1|1	1|1
+chrX	10122	X:10122:A:G	A	G	.	.	.	GT	0|0	0|0	1|1	1|1	1|1
diff --git a/tests/data/outvcf_test_no_replace2.bp b/tests/data/outvcf_test_no_replace2.bp
@@ -0,0 +1,20 @@
+Sample_1_1
+CEU	1	59423086	85.107755
+CEU	1	239403765	266.495714
+YRI	2	229668157	244.341689
+YRI	23	229668157	244.341689
+Sample_1_2
+CEU	1	59423086	85.107755
+CEU	1	239403765	266.495714
+CEU	2	229668157	244.341689
+CEU	23	229668157	244.341689
+Sample_2_1
+CEU	1	59423086	85.107755
+CEU	1	239403765	266.495714
+CEU	2	229668157	244.341689
+YRI	23	229668157	244.341689
+Sample_2_2
+CEU	1	59423086	85.107755
+CEU	1	239403765	266.495714
+YRI	2	229668157	244.341689
+CEU	23	229668157	244.341689