Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Development #112

Merged
merged 4 commits into from
Dec 14, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 17 additions & 18 deletions bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def parse_args():
return parser.parse_args()


def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):
def map_pos_to_gene_protein(pos, aa_names, GENE_PROTEIN_POSITIONS_DICT):
"""This function is inspired/lifted from Ivan's code.
Map a series of nucleotide positions to SARS-CoV-2 genes.
See https://www.ncbi.nlm.nih.gov/nuccore/MN908947.
:param pos: Nucleotide position pandas series from VCF
:param aa_names: aa_names pandas series
:param GENE_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
:param GENE_PROTEIN_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
:type pos: int
:return: series containing SARS-CoV-2 chromosome region names at each
nucleotide position in ``pos``
Expand All @@ -72,10 +72,10 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):

# loop through genes dict to get gene names
df["gene_names"] = df["POS"]
for gene in GENE_POSITIONS_DICT["genes"]:
for gene in GENE_PROTEIN_POSITIONS_DICT["genes"]:
# get nucleotide coordinates for this gene
start = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
end = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
start = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
end = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
# for all the mutations that are found in this region,
# assign this gene name
gene_mask = pos.astype(int).between(start, end, inclusive="both")
Expand All @@ -88,11 +88,11 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):

# loop through proteins dict to get protein names
df["protein_names"] = df["POS"]
for protein in GENE_POSITIONS_DICT["proteins"]:
start = GENE_POSITIONS_DICT["proteins"][protein][
for protein in GENE_PROTEIN_POSITIONS_DICT["proteins"]:
start = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein][
"g.coordinates"]["from"]
end = GENE_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
protein_name = GENE_POSITIONS_DICT["proteins"][protein]["name"]
end = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
protein_name = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["name"]
# get protein names for all mutations that are within range
protein_mask = pos.astype(int).between(start, end, inclusive="both")
df["protein_names"][protein_mask] = protein_name
Expand Down Expand Up @@ -133,7 +133,7 @@ def clade_defining_threshold(threshold, df):
'FILTER', 'INFO', 'FORMAT', 'unknown']


def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
def vcftogvf(var_data, strain, GENE_PROTEIN_POSITIONS_DICT, names_to_split):
df = pd.read_csv(var_data, sep='\t', names=vcf_colnames)
# remove pragmas
df = df[~df['#CHROM'].str.contains("#")]
Expand Down Expand Up @@ -170,6 +170,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
# hgvs names
hgvs = eff_info[3].str.rsplit(pat='c.').apply(pd.Series)
hgvs_protein = hgvs[0].str[:-1]

hgvs_nucleotide = 'g.' + hgvs[1] # change 'c.' to 'g.' for nucleotide names

new_df['nt_name'] = hgvs_nucleotide
Expand Down Expand Up @@ -201,7 +202,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):

# gene and protein name extraction
gene_names, protein_names = map_pos_to_gene_protein(
df['POS'].astype(int), new_df['aa_name'], GENE_POSITIONS_DICT)
df['POS'].astype(int), new_df['aa_name'], GENE_PROTEIN_POSITIONS_DICT)
new_df['#attributes'] = 'chrom_region=' + gene_names + ';'
new_df['#attributes'] = new_df['#attributes'] + 'protein=' + \
protein_names + ';'
Expand Down Expand Up @@ -372,13 +373,10 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):

def add_functions(gvf, annotation_file, clade_file, strain):
attributes = gvf["#attributes"].str.split(pat=';').apply(pd.Series)

# remember this includes nucleotide names where there are no
# protein names
hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1]
hgvs_nucleotide = attributes[1].str.split(pat='=').apply(pd.Series)[
1]
# drop the prefix
gvf["mutation"] = hgvs_protein.str[2:]
gvf["mutation"] = attributes[0].str.split(pat='=').apply(pd.Series)[1]

# merge annotated vcf and functional annotation files by
# 'mutation' column in the gvf
Expand All @@ -390,6 +388,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
# add functional annotations
merged_df = pd.merge(df, gvf, on=['mutation'], how='right')

print(merged_df)
# collect all mutation groups (including reference mutation) in a
# column, sorted alphabetically
# this is more roundabout than it needs to be; streamline with
Expand Down Expand Up @@ -474,7 +473,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
merged_df[column] = merged_df[column].fillna('')
merged_df["#attributes"] = merged_df["#attributes"].astype(
str) + key + '=' + merged_df[column].astype(str) + ';'

# get clade_defining status, and then info from clades file
# load clade-defining mutations file
clades = pd.read_csv(clade_file, sep='\t', header=0)
Expand Down Expand Up @@ -612,7 +611,7 @@ def find_sample_size(table, lineage):
print("Processing: " + file)

# create gvf from annotated vcf (ignoring pragmas for now)
gvf = vcftogvf(file, args.strain, GENE_POSITIONS_DICT,
gvf = vcftogvf(file, args.strain, GENE_PROTEIN_POSITIONS_DICT,
args.names_to_split)
# add functional annotations
if args.names:
Expand Down