cidgoh · anwarMZ · Dec 14, 2022 · Dec 13, 2022 · Dec 13, 2022 · Dec 14, 2022
diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py
@@ -55,13 +55,13 @@ def parse_args():
     return parser.parse_args()
 
 
-def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):
+def map_pos_to_gene_protein(pos, aa_names, GENE_PROTEIN_POSITIONS_DICT):
     """This function is inspired/lifted from Ivan's code.
     Map a series of nucleotide positions to SARS-CoV-2 genes.
     See https://www.ncbi.nlm.nih.gov/nuccore/MN908947.
     :param pos: Nucleotide position pandas series from VCF
     :param aa_names: aa_names pandas series
-    :param GENE_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
+    :param GENE_PROTEIN_POSITIONS_DICT: Dictionary of gene positions from cov_lineages
     :type pos: int
     :return: series containing SARS-CoV-2 chromosome region names at each
     nucleotide position in ``pos``
@@ -72,10 +72,10 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):
 
     # loop through genes dict to get gene names
     df["gene_names"] = df["POS"]
-    for gene in GENE_POSITIONS_DICT["genes"]:
+    for gene in GENE_PROTEIN_POSITIONS_DICT["genes"]:
         # get nucleotide coordinates for this gene
-        start = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
-        end = GENE_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
+        start = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["from"]
+        end = GENE_PROTEIN_POSITIONS_DICT["genes"][gene]["coordinates"]["to"]
         # for all the mutations that are found in this region,
         # assign this gene name
         gene_mask = pos.astype(int).between(start, end, inclusive="both")
@@ -88,11 +88,11 @@ def map_pos_to_gene_protein(pos, aa_names, GENE_POSITIONS_DICT):
 
     # loop through proteins dict to get protein names
     df["protein_names"] = df["POS"]
-    for protein in GENE_POSITIONS_DICT["proteins"]:
-        start = GENE_POSITIONS_DICT["proteins"][protein][
+    for protein in GENE_PROTEIN_POSITIONS_DICT["proteins"]:
+        start = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein][
             "g.coordinates"]["from"]
-        end = GENE_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
-        protein_name = GENE_POSITIONS_DICT["proteins"][protein]["name"]
+        end = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["g.coordinates"]["to"]
+        protein_name = GENE_PROTEIN_POSITIONS_DICT["proteins"][protein]["name"]
         # get protein names for all mutations that are within range
         protein_mask = pos.astype(int).between(start, end, inclusive="both")
         df["protein_names"][protein_mask] = protein_name
@@ -133,7 +133,7 @@ def clade_defining_threshold(threshold, df):
                 'FILTER', 'INFO', 'FORMAT', 'unknown']
 
 
-def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
+def vcftogvf(var_data, strain, GENE_PROTEIN_POSITIONS_DICT, names_to_split):
     df = pd.read_csv(var_data, sep='\t', names=vcf_colnames)
     # remove pragmas
     df = df[~df['#CHROM'].str.contains("#")]
@@ -170,6 +170,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
     # hgvs names
     hgvs = eff_info[3].str.rsplit(pat='c.').apply(pd.Series)
     hgvs_protein = hgvs[0].str[:-1]
+
     hgvs_nucleotide = 'g.' + hgvs[1] # change 'c.' to 'g.' for nucleotide names
 
     new_df['nt_name'] = hgvs_nucleotide
@@ -201,7 +202,7 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
 
     # gene and protein name extraction
     gene_names, protein_names = map_pos_to_gene_protein(
-        df['POS'].astype(int), new_df['aa_name'], GENE_POSITIONS_DICT)
+        df['POS'].astype(int), new_df['aa_name'], GENE_PROTEIN_POSITIONS_DICT)
     new_df['#attributes'] = 'chrom_region=' + gene_names + ';'
     new_df['#attributes'] = new_df['#attributes'] + 'protein=' + \
         protein_names + ';'
@@ -372,13 +373,10 @@ def vcftogvf(var_data, strain, GENE_POSITIONS_DICT, names_to_split):
 
 def add_functions(gvf, annotation_file, clade_file, strain):
     attributes = gvf["#attributes"].str.split(pat=';').apply(pd.Series)
+
     # remember this includes nucleotide names where there are no
     # protein names
-    hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1]
-    hgvs_nucleotide = attributes[1].str.split(pat='=').apply(pd.Series)[
-        1]
-    # drop the prefix
-    gvf["mutation"] = hgvs_protein.str[2:]
+    gvf["mutation"] = attributes[0].str.split(pat='=').apply(pd.Series)[1]
 
     # merge annotated vcf and functional annotation files by
     # 'mutation' column in the gvf
@@ -390,6 +388,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         # add functional annotations
     merged_df = pd.merge(df, gvf, on=['mutation'], how='right')
 
+    print(merged_df)
     # collect all mutation groups (including reference mutation) in a
     # column, sorted alphabetically
     # this is more roundabout than it needs to be; streamline with
@@ -474,7 +473,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         merged_df[column] = merged_df[column].fillna('')
         merged_df["#attributes"] = merged_df["#attributes"].astype(
             str) + key + '=' + merged_df[column].astype(str) + ';'
-
+    
     # get clade_defining status, and then info from clades file
     # load clade-defining mutations file
     clades = pd.read_csv(clade_file, sep='\t', header=0)
@@ -612,7 +611,7 @@ def find_sample_size(table, lineage):
     print("Processing: " + file)
 
     # create gvf from annotated vcf (ignoring pragmas for now)
-    gvf = vcftogvf(file, args.strain, GENE_POSITIONS_DICT,
+    gvf = vcftogvf(file, args.strain, GENE_PROTEIN_POSITIONS_DICT,
                    args.names_to_split)
     # add functional annotations
     if args.names: