Skip to content

Commit

Permalink
'INFO' column improved parsing, only merge clade-defining file if str…
Browse files Browse the repository at this point in the history
…ain exists in file
  • Loading branch information
miseminger committed Aug 9, 2021
1 parent e272727 commit 2c40e9b
Showing 1 changed file with 36 additions and 29 deletions.
65 changes: 36 additions & 29 deletions bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import glob
import os
import numpy as np
from cyvcf2 import VCF, Writer


def parse_args():
Expand Down Expand Up @@ -62,15 +61,16 @@ def vcftogvf(var_data, strain):
eff_info = df['INFO'].str.findall('\((.*?)\)') #series: extract everything between parentheses as elements of a list
eff_info = eff_info.apply(pd.Series)[0] #take first element of list
eff_info = eff_info.str.split(pat='|').apply(pd.Series) #split at pipe, form dataframe

#hgvs names
hgvs = eff_info[3].str.rsplit(pat='c.').apply(pd.Series)
hgvs_protein = hgvs[0].str[:-1]
hgvs_protein.replace(r'^\s+$', np.nan, regex=True)
hgvs_nucleotide = 'c.' + hgvs[1]
new_df['#attributes'] = new_df['#attributes'].astype(str) + 'Name=' + hgvs_protein + ';'
new_df['#attributes'] = new_df['#attributes'].astype(str) + 'nt_name=' + hgvs_nucleotide + ';'

#use nucleotide name where protein name doesn't exist (for 'Name' attribute)
Names = hgvs_protein
Names[~Names.str.contains("p.")] = hgvs_nucleotide #fill in empty protein name spaces with nucleotide names ("c."...)

new_df['#attributes'] = new_df['#attributes'].astype(str) + 'Name=' + Names + ';'
new_df['#attributes'] = new_df['#attributes'].astype(str) + 'nt_name=' + hgvs_nucleotide + ';'
new_df['#attributes'] = new_df['#attributes'].astype(str) + 'gene=' + eff_info[5] + ';' #gene names
new_df['#attributes'] = new_df['#attributes'].astype(str) + 'mutation_type=' + eff_info[1] + ';' #mutation type
Expand All @@ -84,32 +84,28 @@ def vcftogvf(var_data, strain):
key = 'Variant_seq'
new_df['#attributes'] = new_df['#attributes'].astype(str) + key + '=' + df[column].astype(str) + ';'

#add ao, dp, ro
#make 'INFO' column easier to extract attributes from
info = df['INFO'].str.split(pat=';').apply(pd.Series) #split at ;, form dataframe
new_df['#attributes'] = new_df['#attributes'] + info[5].str.lower() + ';' #ao
new_df['#attributes'] = new_df['#attributes'] + info[7].str.lower() + ';' #dp
new_df['#attributes'] = new_df['#attributes'] + info[28].str.lower() + ';' #ro
for column in info.columns:
split = info[column].str.split(pat='=').apply(pd.Series)
title = split[0].drop_duplicates().tolist()[0].lower()
content = split[1]
info[column] = content #ignore "tag=" in column content
info.rename(columns={column:title}, inplace=True) #make attribute tag as column label

#add 'INFO' attributes by name
for column in ['ao','dp','ro']:
new_df['#attributes'] = new_df['#attributes'].astype(str) + column + '=' + info[column].astype(str) + ';'

#add strain name
new_df['#attributes'] = new_df['#attributes'] + 'viral_lineage=' + strain + ';'

#add WHO strain name
alt_strain_names = {'B.1.1.7': 'Alpha', 'B.1.351': 'Beta', 'P.1': 'Gamma', 'B.1.617.2': 'Delta', 'B.1.427': 'Epsilon', 'B.1.429': 'Epsilon', 'P.2': 'Zeta', 'B.1.525': 'Eta', 'P.3': 'Theta', 'B.1.526': 'Iota', 'B.1.617.1': 'Kappa'}
new_df['#attributes'] = new_df['#attributes'] + 'who_label=' + alt_strain_names.get(strain) + ';'

#add VOC/VOI designation
if strain in {'Alpha', 'Beta', 'Gamma', 'Delta'}:
new_df['#attributes'] = new_df['#attributes'] + 'status=VOC;'
else:
new_df['#attributes'] = new_df['#attributes'] + 'status=VOI;'

#remove starting NaN; leave trailing ';'
new_df['#attributes'] = new_df['#attributes'].str[3:]

#fill in other GVF columns
new_df['#seqid'] = df['#CHROM']
new_df['#source'] = '.'
new_df['#type'] = info[40].str.split(pat='=').apply(pd.Series)[1]
new_df['#type'] = info['type']
new_df['#start'] = df['POS']
new_df['#end'] = (df['POS'].astype(int) + df['ALT'].str.len() - 1).astype(str) #this needs fixing
new_df['#score'] = '.'
Expand All @@ -127,20 +123,17 @@ def add_functions(gvf, annotation_file, clade_file, strain):

#load files into Pandas dataframes
df = pd.read_csv(annotation_file, sep='\t', header=0) #load functional annotations spreadsheet
clades = pd.read_csv(clade_file, sep='\t', header=0, usecols=['strain', 'mutation']) #load clade-defining mutations file
clades = clades.loc[clades.strain == strain] #only look at the relevant part of that file

attributes = gvf["#attributes"].str.split(pat=';').apply(pd.Series)
hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1]
hgvs_protein = attributes[0].str.split(pat='=').apply(pd.Series)[1] #remember this includes nucleotide names where there are no protein names
hgvs_nucleotide = attributes[1].str.split(pat='=').apply(pd.Series)[1]
gvf["mutation"] = hgvs_protein.str[2:] #drop the prefix

#merge annotated vcf and functional annotation files by 'mutation' column in the gvf
for column in df.columns:
df[column] = df[column].str.lstrip()
merged_df = pd.merge(df, gvf, on=['mutation'], how='right') #add functional annotations
merged_df = pd.merge(clades, merged_df, on=['mutation'], how='right') #add clade-defining mutations


#collect all mutation groups (including reference mutation) in a column, sorted alphabetically
#this is more roundabout than it needs to be; streamline with grouby() later
merged_df["mutation_group"] = merged_df["comb_mutation"].astype(str) + ", '" + merged_df["mutation"].astype(str) + "'"
Expand Down Expand Up @@ -185,10 +178,23 @@ def add_functions(gvf, annotation_file, clade_file, strain):
else:
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + key + '=' + merged_df[column].astype(str) + ';'

#change clade-defining attribute to True/False depending on content of 'strain' column
merged_df.loc[merged_df.strain == strain, "#attributes"] = merged_df.loc[merged_df.strain == strain, "#attributes"].astype(str) + "clade_defining=True;"
merged_df.loc[merged_df.strain != strain, "#attributes"] = merged_df.loc[merged_df.strain != strain, "#attributes"].astype(str) + "clade_defining=False;"

#if strain is in clades file, merge that too
clades = pd.read_csv(clade_file, sep='\t', header=0) #load clade-defining mutations file
available_strains = clades['strain'].drop_duplicates().tolist()
if strain in available_strains:
clades = clades.loc[clades.strain == strain] #only look at the relevant part of that file
#extract status and WHO strain name from clades file
voc_status = clades['voc_status'].drop_duplicates().tolist()[0]
who_label = clades['who_name'].drop_duplicates().tolist()[0]
#merge clades with function-annotated dataframe
merged_df = pd.merge(clades, merged_df, on=['mutation'], how='right') #add clade-defining mutations
#change clade-defining attribute to True/False depending on content of 'strain' column
merged_df.loc[merged_df.strain == strain, "#attributes"] = merged_df.loc[merged_df.strain == strain, "#attributes"].astype(str) + "clade_defining=True;"
merged_df.loc[merged_df.strain != strain, "#attributes"] = merged_df.loc[merged_df.strain != strain, "#attributes"].astype(str) + "clade_defining=False;"
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "who_label=" + who_label + ';'
merged_df["#attributes"] = merged_df["#attributes"].astype(str) + "voc_status=" + voc_status + ';'

#add ID to attributes
merged_df["#attributes"] = 'ID=' + merged_df['id'].astype(str) + ';' + merged_df["#attributes"].astype(str)

Expand Down Expand Up @@ -280,6 +286,7 @@ def add_functions(gvf, annotation_file, clade_file, strain):
pat = r'.*?' + '(.*)_ids.*'
match = re.search(pat, file.split("/")[-1])
strain = match.group(1)
#strain = 'moose'
print("Strain: ", strain)

#create gvf from annotated vcf (ignoring pragmas for now)
Expand Down

1 comment on commit 2c40e9b

@miseminger
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that this also uses the nucleotide-level HGVS name in the "Name" attribute where there is no amino acid name (eg. in the 5' UTR). This is to ensure that each mutation group has a unique ID, and to make sure that there is always a name viewable in the visualization. The other addition here is that the VOC/VOI status and WHO labels now come from clade_defining_mutations.tsv rather than from a dictionary in the script; this is to streamline updating as new variants arise.

Please sign in to comment.