diff --git a/Dockerfile b/Dockerfile index 615b46b..1ddede0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL MAINTAINER="Pradeep Bashyal" WORKDIR /app -ARG PY_ARD_VERSION=1.0.11 +ARG PY_ARD_VERSION=1.1.0 COPY requirements.txt /app RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/Makefile b/Makefile index d2bf2d3..05bd3c1 100644 --- a/Makefile +++ b/Makefile @@ -98,10 +98,10 @@ dist: clean ## builds source and wheel package ls -l dist docker-build: ## build a docker image for the service - docker build -t pyard-service:latest . + docker build --platform=linux/amd64 -t nmdpbioinformatics/pyard-service:latest . docker: docker-build ## build a docker image and run the service - docker run --rm --name pyard-service -p 8080:8080 pyard-service:latest + docker run --platform=linux/amd64 --rm --name pyard-service -p 8080:8080 nmdpbioinformatics/pyard-service:latest install: clean ## install the package to the active Python's site-packages pip install --upgrade pip diff --git a/api-spec.yaml b/api-spec.yaml index f472d8c..62becb7 100644 --- a/api-spec.yaml +++ b/api-spec.yaml @@ -2,7 +2,7 @@ openapi: 3.0.3 info: title: ARD Reduction description: Reduce to ARD Level - version: "1.0.11" + version: "1.1.0" servers: - url: 'http://localhost:8080' tags: diff --git a/pyard/__init__.py b/pyard/__init__.py index 59c9437..6e0591d 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -22,12 +22,11 @@ # > http://www.opensource.org/licenses/lgpl-license.php # from .blender import blender as dr_blender -from .broad_splits import find_splits as find_broad_splits from .constants import DEFAULT_CACHE_SIZE from .misc import get_imgt_db_versions as db_versions __author__ = """NMDP Bioinformatics""" -__version__ = "1.0.11" +__version__ = "1.1.0" def init( diff --git a/pyard/ard.py b/pyard/ard.py index 0a950c4..b7953ec 100644 --- a/pyard/ard.py +++ b/pyard/ard.py @@ -113,10 +113,16 @@ def __init__( ) # Load Serology mappings - 
broad_splits.broad_splits_ser_mapping = ( - dr.generate_serology_broad_split_mapping(self.db_connection, imgt_version) + broad_splits_mapping, associated_mapping = dr.generate_broad_splits_mapping( + self.db_connection, imgt_version + ) + self.serology_mapping = broad_splits.SerologyMapping( + broad_splits_mapping, associated_mapping + ) + + dr.generate_serology_mapping( + self.db_connection, self.serology_mapping, imgt_version ) - dr.generate_serology_mapping(self.db_connection, imgt_version) # Load V2 to V3 mappings dr.generate_v2_to_v3_mapping(self.db_connection, imgt_version) # Save IMGT database version @@ -608,6 +614,12 @@ def is_exp_allele(self, allele): """ return allele in self.allele_group.exp_alleles + def find_broad_splits(self, allele) -> tuple: + return self.serology_mapping.find_splits(allele) + + def find_associated_antigen(self, serology) -> str: + return self.serology_mapping.serology_associated_map.get(serology, serology) + def _get_alleles(self, code, locus_antigen) -> Iterable[str]: """ Look up allele code in database and generate alleles diff --git a/pyard/broad_splits.py b/pyard/broad_splits.py index c42224c..f3c1d37 100644 --- a/pyard/broad_splits.py +++ b/pyard/broad_splits.py @@ -20,7 +20,8 @@ # > http://www.fsf.org/licensing/licenses/lgpl.html # > http://www.opensource.org/licenses/lgpl-license.php # -import re + +from pyard.constants import HLA_regex # # Broad, Splits and Associated Antigens @@ -46,35 +47,35 @@ "DRB1*06": ["DRB1*13", "DRB1*14"], } -# Loaded at runtime -broad_splits_ser_mapping = None - -HLA_regex = re.compile("^HLA-") - -def find_splits(allele: str) -> tuple: - if HLA_regex.search(allele): - prefix = True - allele_name = allele.split("-")[1] - else: - prefix = False - allele_name = allele +class SerologyMapping: + def __init__(self, broad_splits_mapping, associated_mapping): + self.broad_splits_map = broad_splits_mapping + self.serology_associated_map = associated_mapping - if "*" in allele_name: - mapping = 
broad_splits_dna_mapping - else: - mapping = broad_splits_ser_mapping + def find_splits(self, allele: str) -> tuple: + if HLA_regex.search(allele): + prefix = True + allele_name = allele.split("-")[1] + else: + prefix = False + allele_name = allele - if allele_name in mapping: - return _get_mapping(allele_name, mapping, prefix) + if "*" in allele_name: + mapping = broad_splits_dna_mapping + else: + mapping = self.broad_splits_map - for broad in mapping: - if allele_name in mapping[broad]: - return _get_mapping(broad, mapping, prefix) + if allele_name in mapping: + return self._get_mapping(allele_name, mapping, prefix) + for broad in mapping: + if allele_name in mapping[broad]: + return self._get_mapping(broad, mapping, prefix) -def _get_mapping(broad, mapping, prefix): - if prefix: - return "HLA-" + broad, list(map(lambda x: "HLA-" + x, mapping[broad])) - else: - return broad, mapping[broad] + @staticmethod + def _get_mapping(broad, mapping, prefix): + if prefix: + return "HLA-" + broad, list(map(lambda x: "HLA-" + x, mapping[broad])) + else: + return broad, mapping[broad] diff --git a/pyard/data_repository.py b/pyard/data_repository.py index 789e063..8df94d7 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -26,7 +26,8 @@ import pyard.load from pyard.smart_sort import smart_sort_comparator -from . import db, broad_splits +from . 
import db +from .broad_splits import broad_splits_dna_mapping from .load import ( load_g_group, load_p_group, @@ -216,7 +217,7 @@ def generate_alleles_and_xx_codes_and_who( xx_codes = xx_df.groupby(["1d"]).apply(lambda x: list(x["Allele"])).to_dict() # Update xx codes with broads and splits - for broad, splits in broad_splits.broad_splits_dna_mapping.items(): + for broad, splits in broad_splits_dna_mapping.items(): for split in splits: if broad in xx_codes: xx_codes[broad].extend(xx_codes[split]) @@ -354,7 +355,9 @@ def to_serological_name(locus_name: str): return sero_name -def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version): +def generate_serology_mapping( + db_connection: sqlite3.Connection, serology_mapping, imgt_version +): if not db.table_exists(db_connection, "serology_mapping"): df_sero = load_serology_mappings(imgt_version) @@ -396,7 +399,7 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version): # map alleles for split serology to their corresponding broad # Update xx codes with broads and splits - for broad, splits in broad_splits.broad_splits_ser_mapping.items(): + for broad, splits in serology_mapping.broad_splits_map.items(): for split in splits: try: sero_mapping[broad] = "/".join( @@ -450,15 +453,19 @@ def get_db_version(db_connection: sqlite3.Connection): return db.get_user_version(db_connection) -def generate_serology_broad_split_mapping( - db_connection: sqlite3.Connection, imgt_version -): +def generate_broad_splits_mapping(db_connection: sqlite3.Connection, imgt_version): if not db.table_exists(db_connection, "serology_broad_split_mapping"): - sero_mapping = pyard.load.load_serology_broad_split_mapping(imgt_version) + sero_mapping, associated_mapping = pyard.load.load_serology_broad_split_mapping( + imgt_version + ) db.save_serology_broad_split_mappings(db_connection, sero_mapping) - return sero_mapping + db.save_serology_associated_mappings(db_connection, associated_mapping) + return 
sero_mapping, associated_mapping + + sero_mapping = db.load_serology_broad_split_mappings(db_connection) + associated_mapping = db.load_serology_associated_mappings(db_connection) - return db.load_serology_broad_split_mappings(db_connection) + return sero_mapping, associated_mapping def generate_cwd_mapping(db_connection: sqlite3.Connection): diff --git a/pyard/db.py b/pyard/db.py index 9973997..cd7f33f 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -609,12 +609,19 @@ def load_v2_v3_mappings(db_connection): def load_serology_broad_split_mappings(db_connection): sero_mapping = load_dict( - db_connection, "serology_broad_split_mapping", ("serology", "splits") + db_connection, "serology_broad_split_mapping", ("broad", "splits") ) sero_splits = {k: v.split("/") for k, v in sero_mapping.items()} return sero_splits +def load_serology_associated_mappings(db_connection): + associated_mapping = load_dict( + db_connection, "serology_associated_mappings", ("associated", "antigen") + ) + return associated_mapping + + def save_serology_broad_split_mappings(db_connection, sero_mapping): # Save the `splits` as a "/" delimited string to db sero_splits = {sero: "/".join(splits) for sero, splits in sero_mapping.items()} @@ -622,7 +629,16 @@ def save_serology_broad_split_mappings(db_connection, sero_mapping): db_connection, table_name="serology_broad_split_mapping", dictionary=sero_splits, - columns=("serology", "splits"), + columns=("broad", "splits"), + ) + + +def save_serology_associated_mappings(db_connection, associated_mapping): + save_dict( + db_connection, + table_name="serology_associated_mappings", + dictionary=associated_mapping, + columns=("associated", "antigen"), ) diff --git a/pyard/load.py b/pyard/load.py index e99f8af..6ce2a3c 100644 --- a/pyard/load.py +++ b/pyard/load.py @@ -20,7 +20,7 @@ # > http://www.opensource.org/licenses/lgpl-license.php # import sys -from typing import Dict, List +from typing import Dict, List, Tuple from urllib.error import URLError from 
pyard.misc import get_G_name, get_2field_allele, get_3field_allele, get_P_name @@ -38,7 +38,7 @@ def add_locus_name(locus: str, splits: str) -> List: # Derived from rel_ser_ser.txt # https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/wmda/rel_ser_ser.txt # -def load_serology_broad_split_mapping(imgt_version: str) -> Dict: +def load_serology_broad_split_mapping(imgt_version: str) -> Tuple[Dict, Dict]: import pandas as pd ser_ser_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/rel_ser_ser.txt" @@ -47,21 +47,36 @@ def load_serology_broad_split_mapping(imgt_version: str) -> Dict: ser_ser_url, skiprows=6, names=["Locus", "A", "Splits", "Associated"], - usecols=[0, 1, 2], dtype="string", sep=";", - ).dropna() + ) except URLError as e: print(f"Error downloading {ser_ser_url}", e, file=sys.stderr) sys.exit(1) - df_p["Sero"] = df_p["Locus"] + df_p["A"] - df_p["Splits"] = df_p[["Locus", "Splits"]].apply( + splits_df = df_p[["Locus", "A", "Splits"]].dropna() + associated_df = df_p[["Locus", "A", "Associated"]].dropna() + + splits_df["Sero"] = splits_df["Locus"] + splits_df["A"] + splits_df["Splits"] = splits_df[["Locus", "Splits"]].apply( lambda x: add_locus_name(x["Locus"], x["Splits"]), axis=1 ) + splits_df = splits_df.astype({"A": "int32"}).sort_values(by=["Locus", "A"]) + + associated_df["Sero"] = associated_df["Locus"] + associated_df["A"] + associated_df["Associated"] = associated_df[["Locus", "Associated"]].apply( + lambda x: add_locus_name(x["Locus"], x["Associated"]), axis=1 + ) + associated_df = associated_df.astype({"A": "int32"}).sort_values(by=["Locus", "A"]) + + splits_mapping = splits_df[["Sero", "Splits"]].set_index("Sero")["Splits"].to_dict() + associated_mapping = ( + associated_df.explode("Associated")[["Associated", "Sero"]] + .set_index("Associated")["Sero"] + .to_dict() + ) - sero_mapping = df_p[["Sero", "Splits"]].set_index("Sero")["Splits"].to_dict() - return sero_mapping + return splits_mapping, associated_mapping def load_g_group(imgt_version): diff 
--git a/scripts/pyard b/scripts/pyard index 630d212..d03b7f8 100755 --- a/scripts/pyard +++ b/scripts/pyard @@ -33,7 +33,7 @@ from pyard.exceptions import InvalidAlleleError, InvalidTypingError, InvalidMACE from pyard.misc import get_data_dir, get_imgt_version -def find_similar_alleles(prefix): +def find_similar_alleles(ard, prefix): alleles = ard.similar_alleles(prefix) if alleles: for allele in alleles: @@ -62,8 +62,8 @@ def expand_mac_code(): sys.exit(0) -def find_broad_splits(): - mapping = pyard.find_broad_splits(args.splits) +def find_broad_splits(ard): + mapping = ard.find_broad_splits(args.splits) if mapping: print(f"{mapping[0]} = {'/'.join(mapping[1])}") sys.exit(0) @@ -166,7 +166,7 @@ if __name__ == "__main__": # Handle --splits option if args.splits: - find_broad_splits() + find_broad_splits(ard) # Handle --expand-mac option if args.expand_mac: @@ -178,7 +178,7 @@ if __name__ == "__main__": # Handle --similar option if args.similar_allele: - find_similar_alleles(args.similar_allele) + find_similar_alleles(ard, args.similar_allele) try: if args.cwd: diff --git a/setup.cfg b/setup.cfg index 9f09fc8..d1c5d9b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.11 +current_version = 1.1.0 commit = True tag = True diff --git a/setup.py b/setup.py index d7e4745..83ec1d8 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ setup( name="py-ard", - version="1.0.11", + version="1.1.0", description="ARD reduction for HLA with Python", long_description=readme, long_description_content_type="text/markdown", diff --git a/tests/features/broad_splits.feature b/tests/features/broad_splits.feature index 0dbfbbc..5f6489f 100644 --- a/tests/features/broad_splits.feature +++ b/tests/features/broad_splits.feature @@ -31,3 +31,19 @@ Feature: Broad Splits for DNA/Serology | DQB1*05 | DQB1*06 | DQB1*01 | | B*55 | B*54/B*56 | B*22 | | A25 | A26/A34/A66 | A10 | + + + Scenario Outline: Associated Serology + + Given the serology antigen is <Serology> + 
When looking for associated serology + Then the associated serology is found to be <Associated Serology> + + Examples: Alleles to Serology + | Serology | Associated Serology | + | A23 | A23 | + | A24 | A24 | + | A2403 | A24 | + | DR1403 | DR14 | + | DR1404 | DR14 | + | B5 | B5 | diff --git a/tests/steps/broad_splits.py b/tests/steps/broad_splits.py index e9e0c8a..ab41445 100644 --- a/tests/steps/broad_splits.py +++ b/tests/steps/broad_splits.py @@ -32,7 +32,7 @@ def step_impl(context, broad): @when("it is expanded to the splits") def step_impl(context): - mapping = pyard.find_broad_splits(context.broad) + mapping = context.ard.find_broad_splits(context.broad) splits = mapping[1] context.splits = "/".join(splits) @@ -49,7 +49,7 @@ def step_impl(context, split): @when("split is searched in the mappings") def step_impl(context): - mapping = pyard.find_broad_splits(context.split) + mapping = context.ard.find_broad_splits(context.split) context.broad = mapping[0] splits = mapping[1] splits.remove(context.split) @@ -64,3 +64,18 @@ def step_impl(context, siblings): @step("the corresponding broad is {broad}") def step_impl(context, broad): assert_that(context.broad, is_(broad)) + + +@given("the serology antigen is {serology}") +def step_impl(context, serology): + context.serology = serology + + +@when("looking for associated serology") +def step_impl(context): + context.associated_antigen = context.ard.find_associated_antigen(context.serology) + + +@then("the associated serology is found to be {associated_antigen}") +def step_impl(context, associated_antigen): + assert_that(context.associated_antigen, is_(associated_antigen))