Skip to content

Commit

Permalink
Merge pull request #303 from pbashyal-nmdp/support-associated-serology
Browse files Browse the repository at this point in the history
Support Serology Associated Antigens
  • Loading branch information
mmaiers-nmdp authored Feb 14, 2024
2 parents 25d798d + 7b17e5a commit 4665c1d
Show file tree
Hide file tree
Showing 14 changed files with 146 additions and 65 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ LABEL MAINTAINER="Pradeep Bashyal"

WORKDIR /app

ARG PY_ARD_VERSION=1.0.11
ARG PY_ARD_VERSION=1.1.0

COPY requirements.txt /app
RUN pip install --no-cache-dir --upgrade pip && \
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ dist: clean ## builds source and wheel package
ls -l dist

docker-build: ## build a docker image for the service
docker build -t pyard-service:latest .
docker build --platform=linux/amd64 -t nmdpbioinformatics/pyard-service:latest .

docker: docker-build ## build a docker image and run the service
docker run --rm --name pyard-service -p 8080:8080 pyard-service:latest
docker run --platform=linux/amd64 --rm --name pyard-service -p 8080:8080 nmdpbioinformatics/pyard-service:latest

install: clean ## install the package to the active Python's site-packages
pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ openapi: 3.0.3
info:
title: ARD Reduction
description: Reduce to ARD Level
version: "1.0.11"
version: "1.1.0"
servers:
- url: 'http://localhost:8080'
tags:
Expand Down
3 changes: 1 addition & 2 deletions pyard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@
# > http://www.opensource.org/licenses/lgpl-license.php
#
from .blender import blender as dr_blender
from .broad_splits import find_splits as find_broad_splits
from .constants import DEFAULT_CACHE_SIZE
from .misc import get_imgt_db_versions as db_versions

__author__ = """NMDP Bioinformatics"""
__version__ = "1.0.11"
__version__ = "1.1.0"


def init(
Expand Down
18 changes: 15 additions & 3 deletions pyard/ard.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ def __init__(
)

# Load Serology mappings
broad_splits.broad_splits_ser_mapping = (
dr.generate_serology_broad_split_mapping(self.db_connection, imgt_version)
broad_splits_mapping, associated_mapping = dr.generate_broad_splits_mapping(
self.db_connection, imgt_version
)
self.serology_mapping = broad_splits.SerologyMapping(
broad_splits_mapping, associated_mapping
)

dr.generate_serology_mapping(
self.db_connection, self.serology_mapping, imgt_version
)
dr.generate_serology_mapping(self.db_connection, imgt_version)
# Load V2 to V3 mappings
dr.generate_v2_to_v3_mapping(self.db_connection, imgt_version)
# Save IMGT database version
Expand Down Expand Up @@ -608,6 +614,12 @@ def is_exp_allele(self, allele):
"""
return allele in self.allele_group.exp_alleles

def find_broad_splits(self, allele) -> tuple:
    """
    Look up the broad/split grouping for ``allele``.

    Delegates to ``find_splits`` on this instance's ``serology_mapping``
    and hands back its result untouched.

    :param allele: name to look up; forwarded unchanged
    :return: result of ``self.serology_mapping.find_splits(allele)``
    """
    mapping = self.serology_mapping
    return mapping.find_splits(allele)

def find_associated_antigen(self, serology) -> str:
    """
    Resolve a serology name through the associated-antigen map.

    :param serology: serology name to look up
    :return: the entry stored for ``serology`` in
        ``self.serology_mapping.serology_associated_map``, or
        ``serology`` itself when the map has no entry for it
    """
    associated = self.serology_mapping.serology_associated_map
    if serology in associated:
        return associated[serology]
    # No entry: the input is returned unchanged.
    return serology

def _get_alleles(self, code, locus_antigen) -> Iterable[str]:
"""
Look up allele code in database and generate alleles
Expand Down
55 changes: 28 additions & 27 deletions pyard/broad_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
# > http://www.fsf.org/licensing/licenses/lgpl.html
# > http://www.opensource.org/licenses/lgpl-license.php
#
import re

from pyard.constants import HLA_regex

#
# Broad, Splits and Associated Antigens
Expand All @@ -46,35 +47,35 @@
"DRB1*06": ["DRB1*13", "DRB1*14"],
}

# Loaded at runtime
broad_splits_ser_mapping = None

HLA_regex = re.compile("^HLA-")


def find_splits(allele: str) -> tuple:
if HLA_regex.search(allele):
prefix = True
allele_name = allele.split("-")[1]
else:
prefix = False
allele_name = allele
class SerologyMapping:
def __init__(self, broad_splits_mapping, associated_mapping):
self.broad_splits_map = broad_splits_mapping
self.serology_associated_map = associated_mapping

if "*" in allele_name:
mapping = broad_splits_dna_mapping
else:
mapping = broad_splits_ser_mapping
def find_splits(self, allele: str) -> tuple:
if HLA_regex.search(allele):
prefix = True
allele_name = allele.split("-")[1]
else:
prefix = False
allele_name = allele

if allele_name in mapping:
return _get_mapping(allele_name, mapping, prefix)
if "*" in allele_name:
mapping = broad_splits_dna_mapping
else:
mapping = self.broad_splits_map

for broad in mapping:
if allele_name in mapping[broad]:
return _get_mapping(broad, mapping, prefix)
if allele_name in mapping:
return self._get_mapping(allele_name, mapping, prefix)

for broad in mapping:
if allele_name in mapping[broad]:
return self._get_mapping(broad, mapping, prefix)

def _get_mapping(broad, mapping, prefix):
if prefix:
return "HLA-" + broad, list(map(lambda x: "HLA-" + x, mapping[broad]))
else:
return broad, mapping[broad]
@staticmethod
def _get_mapping(broad, mapping, prefix):
if prefix:
return "HLA-" + broad, list(map(lambda x: "HLA-" + x, mapping[broad]))
else:
return broad, mapping[broad]
27 changes: 17 additions & 10 deletions pyard/data_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@

import pyard.load
from pyard.smart_sort import smart_sort_comparator
from . import db, broad_splits
from . import db
from .broad_splits import broad_splits_dna_mapping
from .load import (
load_g_group,
load_p_group,
Expand Down Expand Up @@ -216,7 +217,7 @@ def generate_alleles_and_xx_codes_and_who(
xx_codes = xx_df.groupby(["1d"]).apply(lambda x: list(x["Allele"])).to_dict()

# Update xx codes with broads and splits
for broad, splits in broad_splits.broad_splits_dna_mapping.items():
for broad, splits in broad_splits_dna_mapping.items():
for split in splits:
if broad in xx_codes:
xx_codes[broad].extend(xx_codes[split])
Expand Down Expand Up @@ -354,7 +355,9 @@ def to_serological_name(locus_name: str):
return sero_name


def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
def generate_serology_mapping(
db_connection: sqlite3.Connection, serology_mapping, imgt_version
):
if not db.table_exists(db_connection, "serology_mapping"):
df_sero = load_serology_mappings(imgt_version)

Expand Down Expand Up @@ -396,7 +399,7 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):

# map alleles for split serology to their corresponding broad
# Update xx codes with broads and splits
for broad, splits in broad_splits.broad_splits_ser_mapping.items():
for broad, splits in serology_mapping.broad_splits_map.items():
for split in splits:
try:
sero_mapping[broad] = "/".join(
Expand Down Expand Up @@ -450,15 +453,19 @@ def get_db_version(db_connection: sqlite3.Connection):
return db.get_user_version(db_connection)


def generate_serology_broad_split_mapping(
db_connection: sqlite3.Connection, imgt_version
):
def generate_broad_splits_mapping(db_connection: sqlite3.Connection, imgt_version):
if not db.table_exists(db_connection, "serology_broad_split_mapping"):
sero_mapping = pyard.load.load_serology_broad_split_mapping(imgt_version)
sero_mapping, associated_mapping = pyard.load.load_serology_broad_split_mapping(
imgt_version
)
db.save_serology_broad_split_mappings(db_connection, sero_mapping)
return sero_mapping
db.save_serology_associated_mappings(db_connection, associated_mapping)
return sero_mapping, associated_mapping

sero_mapping = db.load_serology_broad_split_mappings(db_connection)
associated_mapping = db.load_serology_associated_mappings(db_connection)

return db.load_serology_broad_split_mappings(db_connection)
return sero_mapping, associated_mapping


def generate_cwd_mapping(db_connection: sqlite3.Connection):
Expand Down
20 changes: 18 additions & 2 deletions pyard/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,20 +609,36 @@ def load_v2_v3_mappings(db_connection):

def load_serology_broad_split_mappings(db_connection):
sero_mapping = load_dict(
db_connection, "serology_broad_split_mapping", ("serology", "splits")
db_connection, "serology_broad_split_mapping", ("broad", "splits")
)
sero_splits = {k: v.split("/") for k, v in sero_mapping.items()}
return sero_splits


def load_serology_associated_mappings(db_connection):
    """
    Read the ``serology_associated_mappings`` table via ``load_dict``.

    :param db_connection: connection forwarded to ``load_dict``
    :return: whatever ``load_dict`` produces for the
        ``("associated", "antigen")`` column pair
    """
    table_name = "serology_associated_mappings"
    column_names = ("associated", "antigen")
    return load_dict(db_connection, table_name, column_names)


def save_serology_broad_split_mappings(db_connection, sero_mapping):
# Save the `splits` as a "/" delimited string to db
sero_splits = {sero: "/".join(splits) for sero, splits in sero_mapping.items()}
save_dict(
db_connection,
table_name="serology_broad_split_mapping",
dictionary=sero_splits,
columns=("serology", "splits"),
columns=("broad", "splits"),
)


def save_serology_associated_mappings(db_connection, associated_mapping):
    """
    Store the associated-antigen mapping via ``save_dict``.

    :param db_connection: connection forwarded to ``save_dict``
    :param associated_mapping: dictionary passed unchanged to
        ``save_dict`` for the ``serology_associated_mappings`` table
    """
    target_table = "serology_associated_mappings"
    column_names = ("associated", "antigen")
    save_dict(
        db_connection,
        table_name=target_table,
        dictionary=associated_mapping,
        columns=column_names,
    )


Expand Down
31 changes: 23 additions & 8 deletions pyard/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# > http://www.opensource.org/licenses/lgpl-license.php
#
import sys
from typing import Dict, List
from typing import Dict, List, Tuple
from urllib.error import URLError

from pyard.misc import get_G_name, get_2field_allele, get_3field_allele, get_P_name
Expand All @@ -38,7 +38,7 @@ def add_locus_name(locus: str, splits: str) -> List:
# Derived from rel_ser_ser.txt
# https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/wmda/rel_ser_ser.txt
#
def load_serology_broad_split_mapping(imgt_version: str) -> Dict:
def load_serology_broad_split_mapping(imgt_version: str) -> Tuple[Dict, Dict]:
import pandas as pd

ser_ser_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/rel_ser_ser.txt"
Expand All @@ -47,21 +47,36 @@ def load_serology_broad_split_mapping(imgt_version: str) -> Dict:
ser_ser_url,
skiprows=6,
names=["Locus", "A", "Splits", "Associated"],
usecols=[0, 1, 2],
dtype="string",
sep=";",
).dropna()
)
except URLError as e:
print(f"Error downloading {ser_ser_url}", e, file=sys.stderr)
sys.exit(1)

df_p["Sero"] = df_p["Locus"] + df_p["A"]
df_p["Splits"] = df_p[["Locus", "Splits"]].apply(
splits_df = df_p[["Locus", "A", "Splits"]].dropna()
associated_df = df_p[["Locus", "A", "Associated"]].dropna()

splits_df["Sero"] = splits_df["Locus"] + splits_df["A"]
splits_df["Splits"] = splits_df[["Locus", "Splits"]].apply(
lambda x: add_locus_name(x["Locus"], x["Splits"]), axis=1
)
splits_df = splits_df.astype({"A": "int32"}).sort_values(by=["Locus", "A"])

associated_df["Sero"] = associated_df["Locus"] + associated_df["A"]
associated_df["Associated"] = associated_df[["Locus", "Associated"]].apply(
lambda x: add_locus_name(x["Locus"], x["Associated"]), axis=1
)
associated_df = associated_df.astype({"A": "int32"}).sort_values(by=["Locus", "A"])

splits_mapping = splits_df[["Sero", "Splits"]].set_index("Sero")["Splits"].to_dict()
associated_mapping = (
associated_df.explode("Associated")[["Associated", "Sero"]]
.set_index("Associated")["Sero"]
.to_dict()
)

sero_mapping = df_p[["Sero", "Splits"]].set_index("Sero")["Splits"].to_dict()
return sero_mapping
return splits_mapping, associated_mapping


def load_g_group(imgt_version):
Expand Down
10 changes: 5 additions & 5 deletions scripts/pyard
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ from pyard.exceptions import InvalidAlleleError, InvalidTypingError, InvalidMACE
from pyard.misc import get_data_dir, get_imgt_version


def find_similar_alleles(prefix):
def find_similar_alleles(ard, prefix):
alleles = ard.similar_alleles(prefix)
if alleles:
for allele in alleles:
Expand Down Expand Up @@ -62,8 +62,8 @@ def expand_mac_code():
sys.exit(0)


def find_broad_splits():
mapping = pyard.find_broad_splits(args.splits)
def find_broad_splits(ard):
mapping = ard.find_broad_splits(args.splits)
if mapping:
print(f"{mapping[0]} = {'/'.join(mapping[1])}")
sys.exit(0)
Expand Down Expand Up @@ -166,7 +166,7 @@ if __name__ == "__main__":

# Handle --splits option
if args.splits:
find_broad_splits()
find_broad_splits(ard)

# Handle --expand-mac option
if args.expand_mac:
Expand All @@ -178,7 +178,7 @@ if __name__ == "__main__":

# Handle --similar option
if args.similar_allele:
find_similar_alleles(args.similar_allele)
find_similar_alleles(ard, args.similar_allele)

try:
if args.cwd:
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.0.11
current_version = 1.1.0
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

setup(
name="py-ard",
version="1.0.11",
version="1.1.0",
description="ARD reduction for HLA with Python",
long_description=readme,
long_description_content_type="text/markdown",
Expand Down
16 changes: 16 additions & 0 deletions tests/features/broad_splits.feature
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,19 @@ Feature: Broad Splits for DNA/Serology
| DQB1*05 | DQB1*06 | DQB1*01 |
| B*55 | B*54/B*56 | B*22 |
| A25 | A26/A34/A66 | A10 |


Scenario Outline: Associated Serology

Given the serology antigen is <Serology>
When looking for associated serology
Then the associated serology is found to be <Associated Serology>

Examples: Serology to Associated Serology
| Serology | Associated Serology |
| A23 | A23 |
| A24 | A24 |
| A2403 | A24 |
| DR1403 | DR14 |
| DR1404 | DR14 |
| B5 | B5 |
Loading

0 comments on commit 4665c1d

Please sign in to comment.