Skip to content

Commit

Permalink
Adds support for the conversion of Structural Variants
Browse files Browse the repository at this point in the history
* Updated the code to add support for the conversion of Structural Variants.

* Exposed a new parameter 'genomic_source_class' to configure Genomic Source Class.
  • Loading branch information
Rohan-cod committed Jul 18, 2021
1 parent 8aa49a1 commit b55d343
Show file tree
Hide file tree
Showing 11 changed files with 626 additions and 1,273 deletions.
99 changes: 97 additions & 2 deletions vcf2fhir/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,33 @@
import logging
import re
from collections import OrderedDict
from enum import Enum


general_logger = logging.getLogger("vcf2fhir.general")

# SVTYPE values that are handled as structural variants.
SVs = {
    'INS',
    'DEL',
    'DUP',
    'CNV',
    'INV',
}

# Order in which the variant components are emitted.
VARIANT_COMPONENTS_ORDER = [
    'dna_change_type_component',
    'ref_seq_id_component',
    'genomic_source_class_component',
    'allelic_state_component',
    'allelic_frequency_component',
    'copy_number_component',
    'ref_allele_component',
    'alt_allele_component',
    'genomic_coord_system_component',
    'exact_start_end_component',
    'outer_start_end_component',
    'inner_start_end_component',
]

# Display strings of the supported genomic source classes.
GERMLINE = 'Germline'
SOMATIC = 'Somatic'
MIXED = 'Mixed'

# SVTYPE -> [code, display] of the corresponding DNA change type.
SVTYPE_TO_DNA_CHANGE_TYPE = {
    'CNV': ['SO:0001019', 'copy_number_variation'],
    'DUP': ['SO:1000035', 'duplication'],
    'INV': ['SO:1000036', 'inversion'],
    'DEL': ['SO:0000159', 'deletion'],
    'INS': ['SO:0000667', 'insertion'],
}

# Genomic source class -> code. MIXED has no entry here, so a lookup
# with .get() yields None for it.
GENOMIC_SOURCE_CLASS_TO_CODE = {
    GERMLINE: 'LA6683-2',
    SOMATIC: 'LA6684-0',
}

"""
/**
Expand All @@ -19,6 +42,17 @@
"""


class Genomic_Source_Class(Enum):
    """Enumeration of the supported genomic source classes.

    Each member's value is the module-level display string of the same
    name (GERMLINE, SOMATIC, MIXED).
    """

    GERMLINE = GERMLINE
    SOMATIC = SOMATIC
    MIXED = MIXED

    @classmethod
    def set_(cls):
        """Return a set holding the value of every member."""
        return {member.value for member in cls}


def get_fhir_date():
z = datetime.datetime.now(pytz.timezone(
'UTC')).strftime("%Y-%m-%dT%H:%M:%S%z")
Expand Down Expand Up @@ -61,6 +95,7 @@ def get_sequence_relation(phased_rec_map):
def get_allelic_state(record, ratio_ad_dp):
allelic_state = ''
allelic_code = ''
allelic_frequency = None
# Using the first sample
sample = record.samples[0]
alleles = sample.gt_alleles
Expand All @@ -85,8 +120,10 @@ def get_allelic_state(record, ratio_ad_dp):
len(sample.data.AD) > 0):
ratio = float(
sample.data.AD[0]) / float(sample.data.DP)
allelic_frequency = ratio
else:
ratio = float(sample.data.AD) / float(sample.data.DP)
allelic_frequency = ratio
if ratio > ratio_ad_dp:
allelic_state = "homoplasmic"
allelic_code = "LA6704-6"
Expand All @@ -101,7 +138,11 @@ def get_allelic_state(record, ratio_ad_dp):
_error_log_allelicstate(record)
else:
_error_log_allelicstate(record)
return {'ALLELE': allelic_state, 'CODE': allelic_code}
return {
'ALLELE': allelic_state,
'CODE': allelic_code,
'FREQUENCY': allelic_frequency
}


def extract_chrom_identifier(chrom):
Expand Down Expand Up @@ -143,13 +184,67 @@ def get_codeable_concept(system, code, display=None):
return concept.CodeableConcept(codeable_concept)


# def validate_alt_simple(alt):
# alt_pattern_simple =\
# re.compile(
# r"^([a-zA-Z]+)(,[a-zA-Z]+)*$|" +
# # Match character string and comma separated
# # list of character strings
# r"^\.$"
# # Match '.'
# )
# alt = ",".join(list(map(str, list(alt))))
# if alt_pattern_simple.match(alt):
# return True
# else:
# return False


# def validate_alt_sv(alt):
# alt_pattern_sv =\
# re.compile(
# r"^([a-zA-Z]+)(,[a-zA-Z]+)*$|" +
# # Match character string and comma separated
# # list of character strings
# r"^\.$|" +
# # Match '.'
# r"^(<[A-Z*\d*]+>)(,<[A-Z*\d*]+>)*$"
# # Match angle-bracketed token and comma
# # separated angle-bracketed token list
# )

# alt = ",".join(list(map(str, list(alt))))
# if alt_pattern_sv.match(alt):
# return True
# else:
# return False


# def validate_alt_simple_char(alt):
# alt = ",".join(list(map(str, list(alt))))
# if alt.isalpha():
# return True
# else:
# return False


def _error_log_allelicstate(record):
    """Log an error saying the allelic state of ``record`` is undetermined.

    The message includes the record itself and the data of its first
    sample, which is reported as the considered sample.
    """
    first_sample_data = record.samples[0].data
    general_logger.error(
        "Cannot Determine AllelicState for: %s , considered sample: %s",
        record, first_sample_data)


def get_dna_chg(svtype):
    """Return the DNA change type coding for a structural-variant type.

    Parameters
    ----------
    svtype : str
        A key of SVTYPE_TO_DNA_CHANGE_TYPE (e.g. 'DEL', 'INS').

    Returns
    -------
    dict
        ``{"CODE": <code>, "DISPLAY": <display>}`` built from the first
        and second item of the mapping entry.

    Raises
    ------
    ValueError
        If ``svtype`` has no entry in SVTYPE_TO_DNA_CHANGE_TYPE.
    """
    dna_chg = SVTYPE_TO_DNA_CHANGE_TYPE.get(svtype)
    if dna_chg is None:
        # Fail with an explicit message instead of the opaque
        # "'NoneType' object is not subscriptable" from indexing None.
        raise ValueError(
            "Unsupported structural variant type: %r (expected one of %s)"
            % (svtype, ", ".join(sorted(SVTYPE_TO_DNA_CHANGE_TYPE))))
    return {"CODE": dna_chg[0], "DISPLAY": dna_chg[1]}


def get_genomic_source_class(genomic_source_class):
    """Build the code/display pair for a genomic source class.

    Parameters
    ----------
    genomic_source_class : str
        Looked up in GENOMIC_SOURCE_CLASS_TO_CODE and echoed back as the
        display text.

    Returns
    -------
    dict
        ``{"CODE": <code or None>, "DISPLAY": genomic_source_class}``;
        "CODE" is None when the class has no entry in
        GENOMIC_SOURCE_CLASS_TO_CODE.
    """
    return {
        "CODE": GENOMIC_SOURCE_CLASS_TO_CODE.get(genomic_source_class),
        "DISPLAY": genomic_source_class,
    }


def createOrderedDict(value_from, order):
value_to = OrderedDict()
for key in order:
Expand Down
34 changes: 15 additions & 19 deletions vcf2fhir/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class Converter(object):
If allelic depth (FORMAT.AD) / read depth (FORMAT.DP) is \
greater than ratio_ad_dp then allelic state is
homoplasmic; else heteroplasmic.
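**genomic_source_class** (optional):
Genomic source class of the variants: 'germline', 'somatic' or
'mixed' (case-insensitive). Default value is 'somatic'.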
Returns
-------
Expand All @@ -90,16 +92,10 @@ class Converter(object):
"""

def __init__(
self,
vcf_filename=None,
ref_build=None,
patient_id=None,
has_tabix=False,
conv_region_filename=None,
conv_region_dict=None,
region_studied_filename=None,
nocall_filename=None,
ratio_ad_dp=0.99):
self, vcf_filename=None, ref_build=None, patient_id=None,
has_tabix=False, conv_region_filename=None, conv_region_dict=None,
region_studied_filename=None, nocall_filename=None,
ratio_ad_dp=0.99, genomic_source_class='somatic'):

super(Converter, self).__init__()
if not (vcf_filename):
Expand Down Expand Up @@ -167,12 +163,18 @@ def __init__(
if not validate_ratio_ad_dp(ratio_ad_dp):
raise Exception("Please provide a valid 'ratio_ad_dp'")

if genomic_source_class.title() not in Genomic_Source_Class.set_():
raise Exception(
("Please provide a valid Genomic Source Class " +
"('germline' or 'somatic' or 'mixed')"))

self.ratio_ad_dp = ratio_ad_dp
self.has_tabix = has_tabix
self.patient_id = patient_id
self.ref_build = ref_build
self.nocall_filename = nocall_filename
self.conv_region_filename = conv_region_filename
self.genomic_source_class = genomic_source_class.title()
general_logger.info("Converter class instantiated successfully")

def convert(self, output_filename='fhir.json'):
Expand All @@ -187,15 +189,9 @@ def convert(self, output_filename='fhir.json'):
"""
general_logger.info("Starting VCF to FHIR Conversion")
_get_fhir_json(
self._vcf_reader,
self.ref_build,
self.patient_id,
self.has_tabix,
self.conversion_region,
self.region_studied,
self.nocall_region,
self.ratio_ad_dp,
output_filename)
self._vcf_reader, self.ref_build, self.patient_id, self.has_tabix,
self.conversion_region, self.region_studied, self.nocall_region,
self.ratio_ad_dp, self.genomic_source_class, output_filename)
general_logger.info("Completed VCF to FHIR Conversion")

def _generate_exception(self, msg):
Expand Down
Loading

0 comments on commit b55d343

Please sign in to comment.