Merge pull request #18 from Leon-Bichmann/feature/vcf_to_fasts

dockerImage added
FRED-2 · Jul 20, 2018 · 15f43d2 · 15f43d2
2 parents d1c5bcf + 7a4235d
commit 15f43d2
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 8 deletions.
diff --git a/knime/descriptors/variants2proteins.ctd b/knime/descriptors/variants2proteins.ctd
@@ -1,10 +1,10 @@
 <?xml version="1.0" ?>
 <tool name="Variants2Proteins" version="1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://github.com/genericworkflownodes/CTDopts/raw/master/schemas/CTD_0_3.xsd">
-	<manual>Variants2Proteins consumes a VCF File and generates all possible neoantigen proteins based on the annotated variants contained in the VCF file by extracting the annotated transcript sequences from Ensemble and integrating the variants. Optionally, it consumes a text file, containing gene IDs of the reference system used for annotation, which are used as filter during the neoantigen generation. Additionally the user can specify whether frameshift mutations, or deletions and insertions, besides single point mutations, should be considered. NeoEpitopePrediction currently supports  annotations for GRCh37 and GRCh38 only.
+	<manual>Variants2Proteins consumes a VCF file and generates all possible neoantigen proteins based on the annotated variants contained in the VCF file by extracting the annotated transcript sequences from Ensembl and integrating the variants. Optionally, it consumes a text file containing gene IDs of the reference system used for annotation, which are used as a filter during neoantigen generation. Additionally, the user can specify whether frameshift mutations, deletions, and insertions should be considered besides single point mutations. Variants2Proteins currently supports annotations for GRCh37 and GRCh38 only.
 
 	Please cite the original publication of the used prediction method alongside ImmunoNodes.
 	</manual>
-	<description>Variants2Proteins consumes a VCF File and generates all possible neoantigen proteins based on the annotated variants contained in the VCF file by extracting the annotated transcript sequences from Ensemble and integrating the variants. Optionally, it consumes a text file, containing gene IDs of the reference system used for annotation, which are used as filter during the neoantigen generation. Additionally the user can specify whether frameshift mutations, or deletions and insertions, besides single point mutations, should be considered. NeoEpitopePrediction currently supports  annotations for GRCh37 and GRCh38 only.
+	<description>Variants2Proteins consumes a VCF file and generates all possible neoantigen proteins based on the annotated variants contained in the VCF file by extracting the annotated transcript sequences from Ensembl and integrating the variants. Optionally, it consumes a text file containing gene IDs of the reference system used for annotation, which are used as a filter during neoantigen generation. Additionally, the user can specify whether frameshift mutations, deletions, and insertions should be considered besides single point mutations. Variants2Proteins currently supports annotations for GRCh37 and GRCh38 only.
 
 	Please cite the original publication of the used prediction method alongside ImmunoNodes.
 	</description>

diff --git a/knime/plugin.properties b/knime/plugin.properties
@@ -29,3 +29,4 @@ tool.CleavagePrediction.dockerImage=aperim/immunonodes
 tool.EpitopeConservation.dockerImage=aperim/immunonodes
 tool.Distance2SelfCalculation.dockerImage=aperim/immunonodes
 tool.Distance2SelfGeneration.dockerImage=aperim/immunonodes
+tool.Variants2Proteins.dockerImage=aperim/immunonodes
diff --git a/src/variants2proteins.py b/src/variants2proteins.py
@@ -31,6 +31,7 @@
 import time
 import sys
 import argparse
+import logging
 
 from Fred2.Core import Protein, Peptide, Allele, MutationSyntax, Variant
 from Fred2.Core.Variant import VariationType
@@ -88,9 +89,14 @@ def get_type(ref, alt):
             isSynonymous = False
 
             for co in info.split(","):
-                #Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE">
-                _,var_type,_,gene,_,transcript_type,transcript_id,_,_,_,_,_,_,transcript_pos,prot_pos,aa_mutation = co.strip().split("|")[:16]
-                HGNC_ID=co.strip().split("|")[22]
+                #skip additional info fields without annotation
+                try:
+                    #Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE">
+                    _,var_type,_,gene,_,transcript_type,transcript_id,_,_,_,_,_,_,transcript_pos,prot_pos,aa_mutation = co.strip().split("|")[:16]
+                    HGNC_ID=co.strip().split("|")[22]
+                except ValueError:
+                    logging.warning("INFO field in different format in line: {}, skipping...".format(str(i)))
+                    continue
 
                 #pass every other feature type except Transcript (RegulatoryFeature, MotifFeature, etc.)
                 #pass genes that are uninteresting for us
@@ -102,9 +108,8 @@ def get_type(ref, alt):
                     #generate mutation syntax
 
                     #positioning in Fred2 is 0-based!!!
-                    if transcript_pos != "":
-                        coding[transcript_id] = MutationSyntax(transcript_id, int(transcript_pos)-1, 
-                            -1 if prot_pos  == "" else int(prot_pos)-1, co, "", geneID=HGNC_ID)
+                    if transcript_pos != "" and '?' not in transcript_pos:
+                        coding[transcript_id] = MutationSyntax(transcript_id, int(transcript_pos.split("-")[0])-1, -1 if prot_pos  == "" else int(prot_pos.split("-")[0])-1, co, "", geneID=HGNC_ID)
 
                 #is variant synonymous?
                 isSynonymous = any(t == "synonymous_variant" for t in var_type.split("&"))