Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Added utilization of logging class to karyogram and simgenotype #159

Merged
merged 14 commits into from
Jan 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 31 additions & 19 deletions haptools/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,28 @@ def main():
required=False,
help="Optional color dictionary. Format is e.g. 'YRI:blue,CEU:green'",
)
def karyogram(bp, sample, out, title, centromeres, colors):
@click.option(
"-v",
"--verbosity",
type=click.Choice(["CRITICAL", "INFO", "WARNING", "INFO", "DEBUG", "NOTSET"]),
default="INFO",
show_default="only errors",
help="The level of verbosity desired",
)
def karyogram(bp, sample, out, title, centromeres, colors, verbosity):
"""
Visualize a karyogram of local ancestry tracks
"""
from .karyogram import PlotKaryogram
from .logging import getLogger

log = getLogger(name="karyogram", level=verbosity)

if colors is not None:
colors = dict([item.split(":") for item in colors.split(",")])
log.info("Generating Karyogram...")
PlotKaryogram(
bp, sample, out, centromeres_file=centromeres, title=title, colors=colors
bp, sample, out, log, centromeres_file=centromeres, title=title, colors=colors
)


Expand Down Expand Up @@ -151,14 +163,12 @@ def karyogram(bp, sample, out, title, centromeres, colors):
),
)
@click.option(
"--verbose",
hidden=True,
is_flag=True,
required=False,
help=(
"Output time metrics for each section, breakpoint simulation, vcf creation, "
"and total exection."
),
"-v",
"--verbosity",
type=click.Choice(["CRITICAL", "INFO", "WARNING", "INFO", "DEBUG", "NOTSET"]),
default="INFO",
show_default="only errors",
help="The level of verbosity desired",
)
def simgenotype(
invcf,
Expand All @@ -171,7 +181,7 @@ def simgenotype(
chroms,
region,
only_breakpoint,
verbose,
verbosity,
):
"""
Simulate admixed genomes under a pre-defined model.
Expand All @@ -184,7 +194,9 @@ def simgenotype(
validate_params,
write_breakpoints,
)
from .logging import getLogger

log = getLogger(name="simgenotype", level=verbosity)
start = time.time()

# parse region and chroms parameters
Expand Down Expand Up @@ -215,21 +227,21 @@ def simgenotype(
popsize = validate_params(
model, mapdir, chroms, popsize, invcf, sample_info, region, only_breakpoint
)
samples, breakpoints = simulate_gt(model, mapdir, chroms, region, popsize, seed)
breakpoints = write_breakpoints(samples, breakpoints, out)
samples, breakpoints = simulate_gt(
model, mapdir, chroms, region, popsize, log, seed
)
breakpoints = write_breakpoints(samples, breakpoints, out, log)
bp_end = time.time()

# simulate vcfs
vcf_start = time.time()
if not only_breakpoint:
# TODO add region functionality
output_vcf(breakpoints, chroms, model, invcf, sample_info, region, out)
output_vcf(breakpoints, chroms, model, invcf, sample_info, region, out, log)
end = time.time()

if verbose:
print(f"Time elapsed for breakpoint simulation: {bp_end - start}")
print(f"Time elapse for creating vcf: {end - vcf_start}")
print(f"Time elapsed for simgenotype execution: {end - start}")
log.debug(f"Time elapsed for breakpoint simulation: {bp_end - start}")
log.debug(f"Time elapse for creating vcf: {end - vcf_start}")
log.debug(f"Time elapsed for simgenotype execution: {end - start}")


@main.command()
Expand Down
8 changes: 7 additions & 1 deletion haptools/karyogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def GetChromOrder(sample_blocks):
chroms.sort()
return chroms

def PlotKaryogram(bp_file, sample_name, out_file,
def PlotKaryogram(bp_file, sample_name, out_file, log,
centromeres_file=None, title=None, colors=None):
"""
Plot a karyogram based on breakpoints output by haptools simgenotypes
Expand All @@ -212,6 +212,8 @@ def PlotKaryogram(bp_file, sample_name, out_file,
Sample ID to plot
out_file : str
Name of output file
log: log object
Outputs messages to the appropriate channel.
centromeres_file : str, optional
Path to file with centromere coordinates.
Format: chrom, chromstart_cm, centromere_cm, chromend_cm
Expand All @@ -225,6 +227,7 @@ def PlotKaryogram(bp_file, sample_name, out_file,
"""
# Parse haplotype blocks from the bp file for the
# specified sample
log.info("Collecting Haplotype Blocks...")
sample_blocks = GetHaplotypeBlocks(bp_file, sample_name, centromeres_file)
if len(sample_blocks) == 0:
sys.stderr.write("ERROR: no haplotype blocks identified for %s. "%sample_name)
Expand All @@ -251,13 +254,15 @@ def PlotKaryogram(bp_file, sample_name, out_file,

# Set up colors
if colors is None:
log.info("Colors not given. Setting up colors...")
num_pops = len(pop_list)
cmap = plt.cm.get_cmap('rainbow', num_pops)
colors = dict(zip(pop_list, cmap(list(range(num_pops)))))

# Optionally, plot centromeres/telomeres
clipmask_perchrom = None
if centromeres_file is not None:
log.info("Centromeres present, adding into figure...")
clipmask_perchrom = GetCentromereClipMask(centromeres_file, chrom_order)

# Plot the actual haplotype blocks
Expand All @@ -279,6 +284,7 @@ def PlotKaryogram(bp_file, sample_name, out_file,
ax.set_ylim(len(chrom_order)+3, -3)

fig.savefig(out_file)
log.info(f"Karyogram Complete! Saved to {out_file}")

def GetCentromereClipMask(centromeres_file, chrom_order):
"""
Expand Down
32 changes: 18 additions & 14 deletions haptools/sim_genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import re
import sys
import glob
import time
import numpy as np
from cyvcf2 import VCF
from pysam import VariantFile
Expand All @@ -13,7 +12,7 @@
from .data import GenotypesRefAlt, GenotypesPLINK


def output_vcf(breakpoints, chroms, model_file, vcf_file, sampleinfo_file, region, out):
def output_vcf(breakpoints, chroms, model_file, vcf_file, sampleinfo_file, region, out, log):
"""
Takes in simulated breakpoints and uses reference files, vcf and sampleinfo,
to create simulated variants output in file: out + .vcf
Expand Down Expand Up @@ -48,9 +47,11 @@ def output_vcf(breakpoints, chroms, model_file, vcf_file, sampleinfo_file, regio
within that region.
out: str
output prefix
log: log object
Outputs messages to the appropriate channel.
"""

print(f"Outputting VCF file {out}.vcf")
log.info(f"Outputting VCF file {out}.vcf.gz")

# details to know
# vcf file: how to handle samples and which sample is which haplotype block randomly choose out of current population types
Expand Down Expand Up @@ -104,10 +105,10 @@ def output_vcf(breakpoints, chroms, model_file, vcf_file, sampleinfo_file, regio
# Note: comment out the code below to enable (very experimental!) PGEN support
# curr_bkps = current_bkps.copy()
# _write_pgen(breakpoints, chroms, region, hapblock_samples, curr_bkps, output_samples, vcf_file, out+".pgen")
_write_vcf(breakpoints, chroms, region, hapblock_samples, vcf.samples, current_bkps, output_samples, vcf, out+".vcf.gz")
_write_vcf(breakpoints, chroms, region, hapblock_samples, vcf.samples, current_bkps, output_samples, vcf, out+".vcf.gz", log)
return

def _write_vcf(breakpoints, chroms, region, hapblock_samples, vcf_samples, current_bkps, out_samples, in_vcf, out_vcf):
def _write_vcf(breakpoints, chroms, region, hapblock_samples, vcf_samples, current_bkps, out_samples, in_vcf, out_vcf, log):
"""
in_vcf = cyvcf2 variants we are reading in
out_vcf = output vcf file we output too
Expand All @@ -127,10 +128,9 @@ def _write_vcf(breakpoints, chroms, region, hapblock_samples, vcf_samples, curre
try:
write_vcf.header.add_samples(out_samples)
except AttributeError:
print(
log.warning(
"Upgrade to pysam >=0.19.1 to reduce the time required to create "
"VCFs. See https://github.com/pysam-developers/pysam/issues/1104",
file = sys.stderr,
"VCFs. See https://github.com/pysam-developers/pysam/issues/1104"
)
for sample in out_samples:
write_vcf.header.add_sample(sample)
Expand Down Expand Up @@ -282,7 +282,7 @@ def _write_pgen(breakpoints, chroms, region, hapblock_samples, current_bkps, out

gts.write()

def simulate_gt(model_file, coords_dir, chroms, region, popsize, seed=None):
def simulate_gt(model_file, coords_dir, chroms, region, popsize, log, seed=None):
"""
Simulate admixed genotypes based on the parameters of model_file.

Expand Down Expand Up @@ -313,6 +313,8 @@ def simulate_gt(model_file, coords_dir, chroms, region, popsize, seed=None):
within that region.
popsize: int
Size of population created for each generation.
log: log object
Outputs messages to the appropriate channel.
seed: int
Seed used for randomization.

Expand All @@ -328,7 +330,7 @@ def simulate_gt(model_file, coords_dir, chroms, region, popsize, seed=None):
# initialize seed used for breakpoints
if seed:
np.random.seed(seed)
print(f"Using seed {seed}")
log.info(f"Using seed {seed}")

# load population samples and labels to be simulated
mfile = open(model_file, 'r')
Expand Down Expand Up @@ -426,13 +428,13 @@ def numeric_alpha(x):
sim_gens = cur_gen - prev_gen

# sim generation
print(f"Simulating generation {prev_gen+1}")
log.info(f"Simulating generation {prev_gen+1}")
next_gen_samples = _simulate(popsize, pops, pop_fracs, prev_gen, chroms,
coords, end_coords, recomb_probs, next_gen_samples)

# simulate remaining generations
for i in range(1, sim_gens):
print(f"Simulating generation {prev_gen+i+1}")
log.info(f"Simulating generation {prev_gen+i+1}")

# update pop_fracs to have 100% admixture since this generation has not been specified in model file
pop_fracs = [0]*len(pops)
Expand All @@ -447,7 +449,7 @@ def numeric_alpha(x):
mfile.close()
return num_samples, next_gen_samples

def write_breakpoints(samples, breakpoints, out):
def write_breakpoints(samples, breakpoints, out, log):
"""
Write out a subsample of breakpoints to out determined by samples.

Expand All @@ -461,14 +463,16 @@ def write_breakpoints(samples, breakpoints, out):
of ancestors for this person.
out: str
output prefix used to output the breakpoint file
log: log object
Outputs messages to the appropriate channel.

Returns
-------
breakpoints: list[list[HaplotypeSegment]]
subsampled breakpoints only containing number of samples
"""
breakpt_file = out + '.bp'
print(f"Outputting breakpoint file {breakpt_file}")
log.info(f"Outputting breakpoint file {breakpt_file}")

# randomly sample breakpoints to get the correct amount of samples to output
breakpoints = np.array(breakpoints, dtype=object)
Expand Down
26 changes: 18 additions & 8 deletions tests/test_outputvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,21 @@
import numpy as np
from cyvcf2 import VCF
from pathlib import Path
from haptools.logging import getLogger
from haptools.sim_genotype import output_vcf, validate_params, simulate_gt
from haptools.admix_storage import HaplotypeSegment

DATADIR = Path(__file__).parent.joinpath("data")


def _get_files():
log = getLogger(name="test", level="INFO")
bkp_file = DATADIR.joinpath("outvcf_test.bp")
model_file = DATADIR.joinpath("outvcf_gen.dat")
vcf_file = DATADIR.joinpath("outvcf_test.vcf")
sampleinfo_file = DATADIR.joinpath("outvcf_info.tab")
out_prefix = DATADIR.joinpath("outvcf_out")
return bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix
return bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix, log


def _get_breakpoints(bkp_file):
Expand Down Expand Up @@ -49,15 +51,15 @@ def _get_breakpoints(bkp_file):
def test_alt_chrom_name():
# Test when the ref VCF has chr{X|\d+} form
# read in all files and breakpoints
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix, log = _get_files()
bkp_file = DATADIR.joinpath("outvcf_test_chr.bp")
vcf_file = DATADIR.joinpath("outvcf_test_chr.vcf")
chroms = ["1", "2", "X"]
bkps = _get_breakpoints(bkp_file)

# generate output vcf file
output_vcf(
bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)
bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix), log
)

# read in vcf file
Expand Down Expand Up @@ -97,13 +99,13 @@ def test_alt_chrom_name():

def test_vcf_output():
# read in all files and breakpoints
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix, log = _get_files()
chroms = ["1", "2"]
bkps = _get_breakpoints(bkp_file)

# generate output vcf file
output_vcf(
bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix)
bkps, chroms, model_file, vcf_file, sampleinfo_file, None, str(out_prefix), log
)

# Expected output for each variant (note these are phased so order matters)
Expand Down Expand Up @@ -147,8 +149,9 @@ def test_region_bkp():
coords_dir = DATADIR.joinpath("map")
chroms = ["22"]
seed = 100
log = getLogger(name="test", level="INFO")
num_samples, all_samples = simulate_gt(
modelfile, coords_dir, chroms, region, popsize, seed
modelfile, coords_dir, chroms, region, popsize, log, seed
)

# Make sure lowest bkp listed is 16111 and greatest is 18674
Expand All @@ -160,11 +163,18 @@ def test_region_bkp():

def test_region_vcf():
region = {"chr": "2", "start": 1, "end": 10122}
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix, log = _get_files()
bkps = _get_breakpoints(bkp_file)
chroms = ["2"]
output_vcf(
bkps, chroms, model_file, vcf_file, sampleinfo_file, region, str(out_prefix)
bkps,
chroms,
model_file,
vcf_file,
sampleinfo_file,
region,
str(out_prefix),
log,
)

vcf = VCF(str(out_prefix) + ".vcf.gz")
Expand Down