diff --git a/pyard/ard.py b/pyard/ard.py index e292db4..05b2647 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -124,7 +124,7 @@ def __init__( dr.generate_serology_mapping( self.db_connection, imgt_version, self.serology_mapping, self._redux_allele ) - self.valid_serology_set = dr.build_valid_serology_set(self.db_connection) + self.valid_serology_set = SerologyMapping.get_valid_serology_names() # Load V2 to V3 mappings dr.generate_v2_to_v3_mapping(self.db_connection, imgt_version) @@ -436,16 +436,23 @@ def redux(self, glstring: str, redux_type: VALID_REDUCTION_TYPES) -> str: is_hla_prefix = HLA_regex.search(loc_antigen) if is_hla_prefix: loc_antigen = loc_antigen.split("-")[1] - if self.is_XX(glstring, loc_antigen, code): - if is_hla_prefix: - reduced_alleles = self.redux( - "/".join(self.code_mappings.xx_codes[loc_antigen]), redux_type - ) - return "/".join(["HLA-" + a for a in reduced_alleles.split("/")]) + if code == "XX": + if self.is_XX(glstring, loc_antigen, code): + if is_hla_prefix: + reduced_alleles = self.redux( + "/".join(self.code_mappings.xx_codes[loc_antigen]), + redux_type, + ) + return "/".join( + ["HLA-" + a for a in reduced_alleles.split("/")] + ) + else: + return self.redux( + "/".join(self.code_mappings.xx_codes[loc_antigen]), + redux_type, + ) else: - return self.redux( - "/".join(self.code_mappings.xx_codes[loc_antigen]), redux_type - ) + raise InvalidTypingError(f"{glstring} is not valid XX code") # Handle MAC if self._config["reduce_MAC"] and code.isalpha(): @@ -633,7 +640,13 @@ def find_broad_splits(self, allele) -> tuple: return self.serology_mapping.find_splits(allele) def find_associated_antigen(self, serology) -> str: - return self.serology_mapping.serology_associated_map.get(serology, serology) + return self.serology_mapping.find_associated_antigen(serology) + + @functools.lru_cache() + def find_xx_from_serology(self, serology): + if self.is_serology(serology): + return db.find_xx_for_serology(self.db_connection, serology) + raise InvalidAlleleError(f"{serology} is not a valid serology") def _get_alleles(self, code, locus_antigen) -> Iterable[str]: """ diff --git a/pyard/data_repository.py b/pyard/data_repository.py index c6f7b7e..6a2db41 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -27,7 +27,7 @@ import pyard.load from pyard.smart_sort import smart_sort_comparator from . import db -from .serology import broad_splits_dna_mapping, get_all_valid_serology_names +from .constants import expression_chars from .load import ( load_g_group, load_p_group, @@ -35,7 +35,6 @@ load_serology_mappings, load_latest_version, ) -from .constants import expression_chars from .mappings import ( ars_mapping_tables, ARSMapping, @@ -50,6 +49,7 @@ number_of_fields, get_1field_allele, ) +from .serology import broad_splits_dna_mapping, SerologyMapping def expression_reduce(df): @@ -356,7 +356,10 @@ def to_serological_name(locus_name: str): def generate_serology_mapping( - db_connection: sqlite3.Connection, imgt_version, serology_mapping, redux_function + db_connection: sqlite3.Connection, + imgt_version: str, + serology_mapping: SerologyMapping, + redux_function, ): if not db.table_exists(db_connection, "serology_mapping"): df_sero = load_serology_mappings(imgt_version) @@ -412,22 +415,28 @@ def generate_serology_mapping( if split in sero_mapping: sero_mapping[broad] = sero_mapping[split] - # re-sort allele lists into smartsort order - for sero in sero_mapping.keys(): - sero_mapping[sero] = ( - "/".join( - sorted( - sero_mapping[sero][0], - key=functools.cmp_to_key(smart_sort_comparator), - ) - ), - "/".join( - sorted( - sero_mapping[sero][1], - key=functools.cmp_to_key(smart_sort_comparator), + # Create a mapping of serology to alleles, lgx_alleles and associated XX allele + serology_xx_mapping = serology_mapping.get_xx_mappings() + # re-sort allele lists into smart-sort order + for sero in serology_xx_mapping: + if sero in sero_mapping: + sero_mapping[sero] = ( + "/".join( + sorted( + sero_mapping[sero][0], + key=functools.cmp_to_key(smart_sort_comparator), + ) + ), + "/".join( + sorted( + sero_mapping[sero][1], + key=functools.cmp_to_key(smart_sort_comparator), + ), ), - ), - ) + serology_xx_mapping[sero], + ) + else: + sero_mapping[sero] = (None, None, serology_xx_mapping[sero]) db.save_serology_mappings(db_connection, sero_mapping) @@ -483,12 +492,3 @@ def generate_cwd_mapping(db_connection: sqlite3.Connection): if not db.table_exists(db_connection, "cwd2"): cwd2_map = pyard.load.load_cwd2() db.save_cwd2(db_connection, cwd2_map) - - -def build_valid_serology_set(db_connection: sqlite3.Connection): - valid_serology_names = get_all_valid_serology_names() - # Save to db if `valid_serology` table is not present - if not db.table_exists(db_connection, "valid_serology"): - db.save_set(db_connection, "valid_serology", valid_serology_names, "serology") - - return set(valid_serology_names) diff --git a/pyard/db.py b/pyard/db.py index 230904e..65f6a52 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -412,6 +412,22 @@ def find_serology_for_allele( return serology_mapping +def find_xx_for_serology(connection: sqlite3.Connection, serology: str) -> str: + """ + Find the corresponding XX allele for the given serology + + :param connection: db connection of type sqlite.Connection + :param serology: serology for which to find XX allele + :return: XX allele for given serology + """ + query = f"SELECT xx FROM serology_mapping WHERE serology = ?" + cursor = connection.execute(query, (serology,)) + results = cursor.fetchone() + if results: + return results[0] + return None + + def get_user_version(connection: sqlite3.Connection) -> int: """ Retrieve user_version from db @@ -424,9 +440,7 @@ def get_user_version(connection: sqlite3.Connection) -> int: version = result[0] cursor.close() - if version: - return version - return None + return version def set_user_version(connection: sqlite3.Connection, version: int): @@ -579,15 +593,16 @@ def save_serology_mappings(db_connection, sero_mapping): # Create table create_table_sql = f"""CREATE TABLE serology_mapping ( serology TEXT PRIMARY KEY, - allele_list TEXT NOT NULL, - lgx_allele_list TEXT NOT NULL + allele_list TEXT, + lgx_allele_list TEXT, + xx TEXT NOT NULL )""" cursor.execute(create_table_sql) - rows = ((k, v[0], v[1]) for k, v in sero_mapping.items()) + rows = ((k, v[0], v[1], v[2]) for k, v in sero_mapping.items()) # insert - cursor.executemany(f"INSERT INTO serology_mapping VALUES (?, ?, ?)", rows) + cursor.executemany(f"INSERT INTO serology_mapping VALUES (?, ?, ?, ?)", rows) # commit transaction - writes to the db db_connection.commit() diff --git a/pyard/serology.py b/pyard/serology.py index 5abb7b8..b2a5bf5 100644 --- a/pyard/serology.py +++ b/pyard/serology.py @@ -20,15 +20,72 @@ # > http://www.fsf.org/licensing/licenses/lgpl.html # > http://www.opensource.org/licenses/lgpl-license.php # -from pyard.constants import HLA_regex +import re +from pyard.constants import HLA_regex # # HLA Antigens # List of all recognised serological collected from: # https://hla.alleles.org/antigens/recognised_serology.html # -def get_all_valid_serology_names(): + + +# -# +# Broad, Splits and Associated Antigens +# http://hla.alleles.org/antigens/broads_splits.html +# +# +# Mapping Generated from `dna_relshp.csv` file +# +broad_splits_dna_mapping = { + "A*09": ["A*23", "A*24"], + "A*10": ["A*25", "A*26", "A*34", "A*66"], + "A*19": ["A*29", "A*30", "A*31", "A*32", "A*33", "A*74"], + "A*28": ["A*68", "A*69"], + "B*05": ["B*51", "B*52"], + "B*12": ["B*44", "B*45"], + "B*16": ["B*38", "B*39"], + "B*17": ["B*57", "B*58"], + "B*21": ["B*49", "B*50"], + "B*22": ["B*54", "B*55", "B*56"], + "C*10": ["C*03", "C*04"], + "DQB1*01": ["DQB1*05", "DQB1*06"], + "DRB1*02": ["DRB1*15", "DRB1*16"], + "DRB1*06": ["DRB1*13", "DRB1*14"], +} + +serology_xx_exception_mapping = { + # Locus B + # Broad B40 + "B60": "B*40:XX", + "B61": "B*40:XX", + # Broad B14 + "B64": "B*14:XX", + "B65": "B*14:XX", + # Broad B15 + "B62": "B*15:XX", + "B63": "B*15:XX", + "B70": "B*15:XX", + "B75": "B*15:XX", + "B76": "B*15:XX", + "B77": "B*15:XX", + # Broad B70 + "B71": "B*15:XX", + "B72": "B*15:XX", + "DR17": "DRB1*03:XX", + "DR18": "DRB1*03:XX", + # Locus DQB1 + # Broad DQ3 + "DQ7": "DQB1*03:XX", + "DQ8": "DQB1*03:XX", + "DQ9": "DQB1*03:XX", +} + +sero_antigen_regex = re.compile(r"(\D+)(\d+)") + + +class SerologyMapping: valid_serology_map = { "A": [ "A1", @@ -153,7 +210,7 @@ def get_all_valid_serology_names(): "Dw25", "Dw26", ], - "DR": [ + "DRB1": [ "DR1", "DR103", "DR2", @@ -179,40 +236,10 @@ def get_all_valid_serology_names(): "DR52", "DR53", ], - "DQ": ["DQ1", "DQ2", "DQ3", "DQ4", "DQ5", "DQ6", "DQ7", "DQ8", "DQ9"], - "DP": ["DPw1", "DPw2", "DPw3", "DPw4", "DPw5", "DPw6"], + "DQB1": ["DQ1", "DQ2", "DQ3", "DQ4", "DQ5", "DQ6", "DQ7", "DQ8", "DQ9"], + "DPB1": ["DPw1", "DPw2", "DPw3", "DPw4", "DPw5", "DPw6"], } - all_serology_names = [x for v in valid_serology_map.values() for x in v] - return all_serology_names - - -# -# -# Broad, Splits and Associated Antigens -# http://hla.alleles.org/antigens/broads_splits.html -# -# -# Mapping Generated from `dna_relshp.csv` file -# -broad_splits_dna_mapping = { - "A*09": ["A*23", "A*24"], - "A*10": ["A*25", "A*26", "A*34", "A*66"], - "A*19": ["A*29", "A*30", "A*31", "A*32", "A*33", "A*74"], - "A*28": ["A*68", "A*69"], - "B*05": ["B*51", "B*52"], - "B*12": ["B*44", "B*45"], - "B*16": ["B*38", "B*39"], - "B*17": ["B*57", "B*58"], - "B*21": ["B*49", "B*50"], - "B*22": ["B*54", "B*55", "B*56"], - "C*10": ["C*03", "C*04"], - "DQB1*01": ["DQB1*05", "DQB1*06"], - "DRB1*02": ["DRB1*15", "DRB1*16"], - "DRB1*06": ["DRB1*13", "DRB1*14"], -} - - -class SerologyMapping: def __init__(self, broad_splits_mapping, associated_mapping): self.broad_splits_map = broad_splits_mapping self.serology_associated_map = associated_mapping @@ -237,8 +264,43 @@ def find_splits(self, allele: str) -> tuple: if allele_name in mapping[broad]: return self._get_mapping(broad, mapping, prefix) - @staticmethod - def _get_mapping(broad, mapping, prefix): + def find_associated_antigen(self, serology): + return self.serology_associated_map.get(serology, serology) + + def get_xx_mappings(self): + all_xx_mappings = {} + for locus, serologies in SerologyMapping.valid_serology_map.items(): + xx_mapping = { + serology: self._map_serology_to_xx(locus, serology) + for serology in serologies + } + all_xx_mappings.update(xx_mapping) + return all_xx_mappings + + @classmethod + def get_valid_serology_names(cls): + all_serology_names = {x for v in cls.valid_serology_map.values() for x in v} + return all_serology_names + + def _map_serology_to_xx(self, locus, serology): + if serology in serology_xx_exception_mapping.keys(): + return serology_xx_exception_mapping[serology] + + # Use the associated serology for XX version + serology = self.find_associated_antigen(serology) + + # Extract just the digits + antigen_group = sero_antigen_regex.match(serology).group(2) + # Pad numbers with 0 for single digit numbers + antigen_group_num = int(antigen_group) + if antigen_group_num < 10: + antigen_group = f"{antigen_group_num:02}" + + # Build the XX allele + return f"{locus}*{antigen_group}:XX" + + @classmethod + def _get_mapping(cls, broad, mapping, prefix): if prefix: return "HLA-" + broad, list(map(lambda x: "HLA-" + x, mapping[broad])) else: diff --git a/tests/features/serology.feature b/tests/features/serology.feature index e815c54..a7df0d5 100644 --- a/tests/features/serology.feature +++ b/tests/features/serology.feature @@ -42,17 +42,44 @@ Feature: Serology Reduction All recognized serology are valid, even those with no corresponding DNA alleles. - Given the allele as + Given the serology typing is When checking for validity of the allele in non-strict mode Then the validness of the allele is Examples: - | Allele | Validity | - | DR7 | Valid | - | DR99 | Invalid | - | A10 | Valid | - | A101 | Invalid | - | DQ8 | Valid | - | DQ20 | InValid | - | DPw6 | Valid | - | DPw7 | InValid | + | Serology | Validity | + | DR7 | Valid | + | DR99 | Invalid | + | A10 | Valid | + | A101 | Invalid | + | DQ8 | Valid | + | DQ20 | InValid | + | DPw6 | Valid | + | DPw7 | InValid | + + Scenario Outline: Serology XX Mapping + + Serology to XX Mappings + + Given the serology typing is + When finding the XX version of the serology + Then the XX version is + + Examples: + | Serology | XX | + | A9 | A*09:XX | + | A23 | A*23:XX | + | A24 | A*24:XX | + | A2403 | A*24:XX | + | B70 | B*15:XX | + | B71 | B*15:XX | + | B72 | B*15:XX | + | B15 | B*15:XX | + | B40 | B*40:XX | + | B60 | B*40:XX | + | B703 | B*07:XX | + | DQ1 | DQB1*01:XX | + | DQ3 | DQB1*03:XX | + | DQ7 | DQB1*03:XX | + | DQ8 | DQB1*03:XX | + | DQ9 | DQB1*03:XX | diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py index 10d6a17..a8534ca 100644 --- a/tests/steps/redux_allele.py +++ b/tests/steps/redux_allele.py @@ -130,3 +130,13 @@ def step_impl(context): def step_impl(context, validity): valid = validity == "Valid" assert_that(context.is_valid, is_(valid)) + + +@when("finding the XX version of the serology") +def step_impl(context): + context.xx_version = context.ard_non_strict.find_xx_from_serology(context.allele) + + +@then("the XX version is {XX}") +def step_impl(context, XX): + assert_that(context.xx_version, is_(XX))