Skip to content

Commit

Permalink
Merge pull request #112 from cidgoh/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
anwarMZ authored Dec 14, 2022
2 parents 1cd46fb + 8c5afec commit 672375c
Showing 1 changed file with 17 additions and 18 deletions.
35 changes: 17 additions & 18 deletions bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def parse_args():
return parser.parse_args()


def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):
def map_pos_to_gene_protein(pos, aa_names, GENE_PROTEIN_POSITIONS_DICT):
"""This function is inspired/lifted from Ivan's code.
Map a series of nucleotide positions to SARS-CoV-2 genes.
See https://www.ncbi.nlm.nih.gov/nuccore/MN908947.
:param pos: Nucleotide position pandas series from VCF
:param aa_names: aa_names pandas series
:param GENE_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
:param GENE_PROTEIN_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
:type pos: int
:return: series containing SARS-CoV-2 chromosome region names at each
nucleotide position in ``pos``
Expand All @@ -72,10 +72,10 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):

# loop through genes dict to get gene names
df["gene_names"] = df["POS"]
for gene in GENE_POSITIONS_DICT["genes"]:
for gene in GENE_PROTEIN_POSITIONS_DICT["genes"]:
# get nucleotide coordinates for this gene
start = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
end = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
start = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
end = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
# for all the mutations that are found in this region,
# assign this gene name
gene_mask = pos.astype(int).between(start, end, inclusive="both")
Expand All @@ -88,11 +88,11 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):

# loop through proteins dict to get protein names
df["protein_names"] = df["POS"]
for protein in GENE_POSITIONS_DICT["proteins"]:
start = GENE_POSITIONS_DICT["proteins"][protein][
for protein in GENE_PROTEIN_POSITIONS_DICT["proteins"]:
start = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein][
"g.coordinates"]["from"]
end = GENE_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
protein_name = GENE_POSITIONS_DICT["proteins"][protein]["name"]
end = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
protein_name = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["name"]
# get protein names for all mutations that are within range
protein_mask = pos.astype(int).between(start, end, inclusive="both")
df["protein_names"][protein_mask] = protein_name
Expand Down Expand Up @@ -133,7 +133,7 @@ def clade_defining_threshold(threshold, df):
'FILTER', 'INFO', 'FORMAT', 'unknown']


def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
def vcftogvf(var_data, strain, GENE_PROTEIN_POSITIONS_DICT, names_to_split):
df = pd.read_csv(var_data, sep='\t', names=vcf_colnames)
# remove pragmas
df = df[~df['#CHROM'].str.contains("#")]
Expand Down Expand Up @@ -170,6 +170,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
# hgvs names
hgvs = eff_info[3].str.rsplit(pat='c.').apply(pd.Series)
hgvs_protein = hgvs[0].str[:-1]

hgvs_nucleotide = 'g.' + hgvs[1] # change 'c.' to 'g.' for nucleotide names

new_df['nt_name'] = hgvs_nucleotide
Expand Down Expand Up @@ -201,7 +202,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):

# gene and protein name extraction
gene_names, protein_names = map_pos_to_gene_protein(
df['POS'].astype(int), new_df['aa_name'], GENE_POSITIONS_DICT)
df['POS'].astype(int), new_df['aa_name'], GENE_PROTEIN_POSITIONS_DICT)
new_df['#attributes'] = 'chrom_region=' + gene_names + ';'
new_df['#attributes'] = new_df['#attributes'] + 'protein=' + \
protein_names + ';'
Expand Down Expand Up @@ -372,13 +373,10 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):

def add_functions(gvf, annotation_file, clade_file, strain):
attributes = gvf["#attributes"].str.split(pat=';').apply(pd.Series)

# remember this includes nucleotide names where there are no
# protein names
hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1]
hgvs_nucleotide = attributes[1].str.split(pat='=').apply(pd.Series)[
1]
# drop the prefix
gvf["mutation"] = hgvs_protein.str[2:]
gvf["mutation"] = attributes[0].str.split(pat='=').apply(pd.Series)[1]

# merge annotated vcf and functional annotation files by
# 'mutation' column in the gvf
Expand All @@ -390,6 +388,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
# add functional annotations
merged_df = pd.merge(df, gvf, on=['mutation'], how='right')

print(merged_df)
# collect all mutation groups (including reference mutation) in a
# column, sorted alphabetically
# this is more roundabout than it needs to be; streamline with
Expand Down Expand Up @@ -474,7 +473,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
merged_df[column] = merged_df[column].fillna('')
merged_df["#attributes"] = merged_df["#attributes"].astype(
str) + key + '=' + merged_df[column].astype(str) + ';'

# get clade_defining status, and then info from clades file
# load clade-defining mutations file
clades = pd.read_csv(clade_file, sep='\t', header=0)
Expand Down Expand Up @@ -612,7 +611,7 @@ def find_sample_size(table, lineage):
print("Processing: " + file)

# create gvf from annotated vcf (ignoring pragmas for now)
gvf = vcftogvf(file, args.strain, GENE_POSITIONS_DICT,
gvf = vcftogvf(file, args.strain, GENE_PROTEIN_POSITIONS_DICT,
args.names_to_split)
# add functional annotations
if args.names:
Expand Down

0 comments on commit 672375c

Please sign in to comment.