Skip to content

Commit

Permalink
add support for case fusioninspector filters out a fusion from fusion…
Browse files Browse the repository at this point in the history
…report
  • Loading branch information
rannick committed Nov 7, 2023
1 parent f4c0912 commit 4337f4a
Showing 1 changed file with 26 additions and 15 deletions.
41 changes: 26 additions & 15 deletions bin/vcf_collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
logger = logging.getLogger()


def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample, out):
def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, hgnc, gtf, out):
"""
Process FusionInspector and FusionReport data,
merge with GTF from FusionInspector and HGNC database,
Expand All @@ -33,14 +33,17 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample
.reset_index()
)

df = build_hgnc_dataframe(hgnc).merge(
df = build_hgcn_dataframe(hgnc).merge(
merged_df, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id"
)
df = df.rename(columns={"hgnc_id": "Left_hgnc_id"})
df = build_hgnc_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id")
df = build_hgcn_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id")
df = df.rename(columns={"hgnc_id": "Right_hgnc_id"})
gtf_df = build_gtf_dataframe(gtf)
all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id")
all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0)
all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].astype(int)

all_df = all_df[(all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])]
all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"})
all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
Expand Down Expand Up @@ -73,7 +76,10 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample
]
].drop_duplicates()
all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id")
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0)
all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int)
all_df = all_df[(all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])]

all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"})
all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
all_df = all_df[
Expand Down Expand Up @@ -195,7 +201,6 @@ def build_fusioninspector_dataframe(file):
df = df.rename(columns={"#FusionName": "FUSION"})
df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True)
df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True)
df[["GeneA", "GeneB"]] = df["FUSION"].str.split("--", expand=True)
df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True)
df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True)
return df.set_index(["FUSION"])
Expand Down Expand Up @@ -234,11 +239,11 @@ def read_build_fusionreport(fusionreport_file):
from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line]
expression = from_html[0].split('], "tool')[0]
fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression))
if "arriba" not in fusion_report.columns:
if not "arriba" in fusion_report.columns:
fusion_report["arriba"] = ""
if "fusioncatcher" not in fusion_report.columns:
if not "fusioncatcher" in fusion_report.columns:
fusion_report["fusioncatcher"] = ""
if "starfusion" not in fusion_report.columns:
if not "starfusion" in fusion_report.columns:
fusion_report["starfusion"] = ""
fusion_report["arriba"] = fusion_report[["arriba"]].apply(
replace_value_with_column_name, args=("true", "arriba"), axis=1
Expand All @@ -254,7 +259,11 @@ def read_build_fusionreport(fusionreport_file):
)
fusion_report.columns = fusion_report.columns.str.upper()
fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x))
return fusion_report[["FUSION", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(["FUSION"])
fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)

return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
["FUSION"]
)


def column_manipulation(df):
Expand All @@ -270,10 +279,12 @@ def column_manipulation(df):
df["REF"] = "N"
df["INFO"] = ""
df["Sample"] = ""
df["Strand1"] = df["Strand1"].astype(str)

for index, row in df.iterrows():
# ALT
if not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]:
if row["Strand1"] == "nan":
df.loc[index, "ALT"] = "nan"
elif not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]:
df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"])
elif row["Strand1"] == "-" and row["Strand2"] == "-":
df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"])
Expand All @@ -283,12 +294,12 @@ def column_manipulation(df):
df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"])
else:
df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"])
# INFO

df.loc[index, "INFO"] = (
"SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};"
"FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
"TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};"
"EXON_NUMBER_B={};ANNOTATIONS={}".format(
"FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
"TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};"
"ANNOTATIONS={}".format(
row["ChromosomeA"],
row["ChromosomeB"],
row["GeneA"],
Expand Down Expand Up @@ -347,7 +358,7 @@ def write_vcf(df_to_print, header, out_file):
f.write(header.rstrip("\r\n") + "\n" + content)


def build_hgnc_dataframe(file):
def build_hgcn_dataframe(file):
"""
Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns.
"""
Expand Down

0 comments on commit 4337f4a

Please sign in to comment.