'INFO' column improved parsing, only merge clade-defining file if str…

…ain exists in file
cidgoh · Aug 9, 2021 · 2c40e9b · 2c40e9b · miseminger · Aug 9, 2021
1 parent e272727
commit 2c40e9b
Showing 1 changed file with 36 additions and 29 deletions.
diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py
@@ -24,7 +24,6 @@
 import glob
 import os
 import numpy as np
-from cyvcf2 import VCF, Writer
 
 
 def parse_args():
@@ -62,15 +61,16 @@ def vcftogvf(var_data, strain):
     eff_info = df['INFO'].str.findall('\((.*?)\)') #series: extract everything between parentheses as elements of a list
     eff_info = eff_info.apply(pd.Series)[0] #take first element of list
     eff_info = eff_info.str.split(pat='|').apply(pd.Series) #split at pipe, form dataframe
-
     #hgvs names
     hgvs = eff_info[3].str.rsplit(pat='c.').apply(pd.Series)
     hgvs_protein = hgvs[0].str[:-1]
-    hgvs_protein.replace(r'^\s+$', np.nan, regex=True)
     hgvs_nucleotide = 'c.' + hgvs[1]
-    new_df['#attributes'] = new_df['#attributes'].astype(str) + 'Name=' + hgvs_protein + ';'
-    new_df['#attributes'] = new_df['#attributes'].astype(str) + 'nt_name=' + hgvs_nucleotide + ';'
 
+    #use nucleotide name where protein name doesn't exist (for 'Name' attribute)
+    Names = hgvs_protein
+    Names[~Names.str.contains("p.")] =  hgvs_nucleotide #fill in empty protein name spaces with nucleotide names ("c."...)
+
+    new_df['#attributes'] = new_df['#attributes'].astype(str) + 'Name=' + Names + ';'
     new_df['#attributes'] = new_df['#attributes'].astype(str) + 'nt_name=' + hgvs_nucleotide + ';'
     new_df['#attributes'] = new_df['#attributes'].astype(str) + 'gene=' + eff_info[5] + ';' #gene names
     new_df['#attributes'] = new_df['#attributes'].astype(str) + 'mutation_type=' + eff_info[1] + ';' #mutation type 
@@ -84,32 +84,28 @@ def vcftogvf(var_data, strain):
             key = 'Variant_seq'
         new_df['#attributes'] = new_df['#attributes'].astype(str) + key + '=' + df[column].astype(str) + ';'
 
-    #add ao, dp, ro
+    #make 'INFO' column easier to extract attributes from
     info = df['INFO'].str.split(pat=';').apply(pd.Series) #split at ;, form dataframe
-    new_df['#attributes'] = new_df['#attributes'] + info[5].str.lower() + ';' #ao
-    new_df['#attributes'] = new_df['#attributes'] + info[7].str.lower() + ';' #dp
-    new_df['#attributes'] = new_df['#attributes'] + info[28].str.lower() + ';' #ro
+    for column in info.columns:
+        split = info[column].str.split(pat='=').apply(pd.Series)
+        title = split[0].drop_duplicates().tolist()[0].lower()
+        content = split[1]
+        info[column] = content #ignore "tag=" in column content
+        info.rename(columns={column:title}, inplace=True) #make attribute tag as column label
 
+    #add 'INFO' attributes by name
+    for column in ['ao','dp','ro']:
+        new_df['#attributes'] = new_df['#attributes'].astype(str) + column + '=' + info[column].astype(str) + ';'
+
     #add strain name
     new_df['#attributes'] = new_df['#attributes'] + 'viral_lineage=' + strain + ';'
-
-    #add WHO strain name
-    alt_strain_names = {'B.1.1.7': 'Alpha', 'B.1.351': 'Beta', 'P.1': 'Gamma', 'B.1.617.2': 'Delta', 'B.1.427': 'Epsilon', 'B.1.429': 'Epsilon', 'P.2': 'Zeta', 'B.1.525': 'Eta', 'P.3': 'Theta', 'B.1.526': 'Iota', 'B.1.617.1': 'Kappa'}
-    new_df['#attributes'] = new_df['#attributes'] + 'who_label=' + alt_strain_names.get(strain) + ';'
-
-    #add VOC/VOI designation
-    if strain in {'Alpha', 'Beta', 'Gamma', 'Delta'}:
-        new_df['#attributes'] = new_df['#attributes'] + 'status=VOC;'
-    else:
-        new_df['#attributes'] = new_df['#attributes'] + 'status=VOI;'
-
     #remove starting NaN; leave trailing ';'
     new_df['#attributes'] = new_df['#attributes'].str[3:]
 
     #fill in other GVF columns
     new_df['#seqid'] = df['#CHROM']
     new_df['#source'] = '.'
-    new_df['#type'] = info[40].str.split(pat='=').apply(pd.Series)[1]
+    new_df['#type'] = info['type']
     new_df['#start'] = df['POS']
     new_df['#end'] = (df['POS'].astype(int) + df['ALT'].str.len() - 1).astype(str)  #this needs fixing
     new_df['#score'] = '.'
@@ -127,20 +123,17 @@ def add_functions(gvf, annotation_file, clade_file, strain):
 
     #load files into Pandas dataframes
     df = pd.read_csv(annotation_file, sep='\t', header=0) #load functional annotations spreadsheet
-    clades = pd.read_csv(clade_file, sep='\t', header=0, usecols=['strain', 'mutation']) #load clade-defining mutations file
-    clades = clades.loc[clades.strain == strain] #only look at the relevant part of that file
 
     attributes = gvf["#attributes"].str.split(pat=';').apply(pd.Series)
-    hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1]
+    hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1] #remember this includes nucleotide names where there are no protein names
     hgvs_nucleotide = attributes[1].str.split(pat='=').apply(pd.Series)[1]
     gvf["mutation"] = hgvs_protein.str[2:] #drop the prefix
 
     #merge annotated vcf and functional annotation files by 'mutation' column in the gvf
     for column in df.columns:
         df[column] = df[column].str.lstrip()
     merged_df = pd.merge(df, gvf, on=['mutation'], how='right') #add functional annotations
-    merged_df = pd.merge(clades, merged_df, on=['mutation'], how='right') #add clade-defining mutations
-
+
     #collect all mutation groups (including reference mutation) in a column, sorted alphabetically
     #this is more roundabout than it needs to be; streamline with grouby() later
     merged_df["mutation_group"] = merged_df["comb_mutation"].astype(str) + ", '" + merged_df["mutation"].astype(str) + "'"
@@ -185,10 +178,23 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         else:
             merged_df["#attributes"] = merged_df["#attributes"].astype(str) + key + '=' + merged_df[column].astype(str) + ';'
 
-    #change clade-defining attribute to True/False depending on content of 'strain' column
-    merged_df.loc[merged_df.strain == strain, "#attributes"] = merged_df.loc[merged_df.strain == strain, "#attributes"].astype(str)  + "clade_defining=True;"
-    merged_df.loc[merged_df.strain != strain, "#attributes"] = merged_df.loc[merged_df.strain != strain, "#attributes"].astype(str)  + "clade_defining=False;"
 
+    #if strain is in clades file, merge that too
+    clades = pd.read_csv(clade_file, sep='\t', header=0) #load clade-defining mutations file
+    available_strains = clades['strain'].drop_duplicates().tolist() 
+    if strain in available_strains:
+        clades = clades.loc[clades.strain == strain] #only look at the relevant part of that file
+        #extract status and WHO strain name from clades file
+        voc_status = clades['voc_status'].drop_duplicates().tolist()[0]
+        who_label = clades['who_name'].drop_duplicates().tolist()[0]
+        #merge clades with function-annotated dataframe
+        merged_df = pd.merge(clades, merged_df, on=['mutation'], how='right') #add clade-defining mutations
+        #change clade-defining attribute to True/False depending on content of 'strain' column
+        merged_df.loc[merged_df.strain == strain, "#attributes"] = merged_df.loc[merged_df.strain == strain, "#attributes"].astype(str)  + "clade_defining=True;"
+        merged_df.loc[merged_df.strain != strain, "#attributes"] = merged_df.loc[merged_df.strain != strain, "#attributes"].astype(str)  + "clade_defining=False;"
+        merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "who_label=" + who_label + ';'
+        merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voc_status=" + voc_status + ';'
+
     #add ID to attributes
     merged_df["#attributes"] = 'ID=' + merged_df['id'].astype(str) + ';' + merged_df["#attributes"].astype(str)
 
@@ -280,6 +286,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
         pat = r'.*?' + '(.*)_ids.*'
         match = re.search(pat, file.split("/")[-1])
         strain = match.group(1)
+        #strain = 'moose'
         print("Strain: ", strain)
 
         #create gvf from annotated vcf (ignoring pragmas for now)