From 1e4a9b1ce02fe8e1a00de50b3225946af6ada82d Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 9 Nov 2024 11:36:17 +0100 Subject: [PATCH 1/8] Structure identification for models --- evcouplings/compare/protocol.py | 104 +++++++++++++++++++++++++++++++- evcouplings/compare/sifts.py | 35 ++++++++++- 2 files changed, 135 insertions(+), 4 deletions(-) diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index 1f07002f..a53404e4 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -28,7 +28,7 @@ intra_dists, multimer_dists, remap_chains, inter_dists, remap_complex_chains ) -from evcouplings.compare.sifts import SIFTS, SIFTSResult +from evcouplings.compare.sifts import SIFTS, SIFTSResult, SIFTSWithDynamicUpdate from evcouplings.compare.ecs import ( coupling_scores_compared, add_precision ) @@ -582,6 +582,108 @@ def _individual_distance_map_config_result(individual_distance_map_table): return individual_maps_result +def _map_alphafold_hits(modeldb_list_file, relevant_ids): + """ + Turn list of AlphaFoldDB sequence hits into structure table + compatible with SIFTS object + + Parameters + ---------- + modeldb_list_file: str + Path to accession_ids.csv file from AlphaFoldDB + relevant_ids: set(str) + DB entry identifiers to include in table (without leading "AFDB:" prefix) + + Returns + ------- + pd.DataFrame + Structure table for SIFTS object + """ + with open(modeldb_list_file) as f: + _table = [] + for line in f: + uniprot_ac, start, end, model_id, model_version = line.strip().split(",") + model_id_with_prefix = "AFDB:" + model_id + start, end = int(start), int(end) + if model_id_with_prefix in relevant_ids: + _table.append({ + "uniprot_ac": model_id_with_prefix, + "pdb_id": f"{model_id}-model_v{model_version}", + "pdb_chain": "A", + "resseq_start": start, + "resseq_end": end, + "coord_start": start, + "coord_end": end, + "uniprot_start": start, + "uniprot_end": end, + }) + + return pd.DataFrame(_table) + + +def _identify_predicted_structures(**kwargs): + """ + Find suitable predicted structures for mapping on ECs + + Parameters + ---------- + **kwargs + See check_required in code below + + Returns + ------- + SIFTSResult + Identified structures and residue index mappings + """ + check_required( + kwargs, + [ + "modeldb_type", "modeldb_sequence_file", "modeldb_list_file", + "model_max_num_hits", + ] + ) + + # implement custom logic for different model database types in code to avoid + # complicated configuration setup covering all eventualities + AVAILABLE_DB_TYPES = ["alphafolddb_v4"] + modeldb_type = kwargs["modeldb_type"] + if modeldb_type not in AVAILABLE_DB_TYPES: + raise InvalidParameterError( + f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_DB_TYPES)}" + ) + + table_callback = None + if modeldb_type == "alphafolddb_v4": + table_callback = lambda ali, hits: ( + _map_alphafold_hits( + kwargs["modeldb_list_file"], set(hits.uniprot_ac) + ), hits + ) + + assert table_callback is not None, "table_callback not defined, this indicates logic error above" + + # create patched SIFTS object with all model sequences, but only instantiate table + # property once sequences are found to avoid loading gigabytes of tables with pandas + s = SIFTSWithDynamicUpdate( + table_callback=table_callback, + sequence_file=kwargs["modeldb_sequence_file"] + ) + + # run sequence-based query against model database, always use jackhmmer for this + sifts_map = s.by_alignment(**{ + **kwargs, + "pdb_alignment_method": "jackhmmer" + }) + + sifts_map_full = deepcopy(sifts_map) + + # reduce number of structures/hits + if kwargs["model_max_num_hits"] is not None: + sifts_map.hits = sifts_map.hits.iloc[:kwargs["model_max_num_hits"]] + + return sifts_map, sifts_map_full + + def standard(**kwargs): """ Protocol: diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 275b9848..29781e4f 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -255,8 +255,12 @@ def find_homologs(pdb_alignment_method="jackhmmer", **kwargs): # read hmmer hittable and simplify hits = read_hmmer_domtbl(ar["hittable_file"]) - hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1]) - hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2]) + try: + hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1]) + hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2]) + except IndexError: + hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].copy() + hits.loc[:, "uniprot_id"] = pd.NA hits = hits.rename( columns={ @@ -855,10 +859,15 @@ def _create_mapping(r): **kwargs ) + if callable(self.table): + table, hits = self.table(ali, hits) + else: + table = self.table + # merge with internal table to identify overlap of # aligned regions and regions with structural coverage hits = hits.merge( - self.table, on="uniprot_ac", suffixes=("", "_") + table, on="uniprot_ac", suffixes=("", "_") ) # add 1 to end of range since overlap function treats @@ -1002,3 +1011,23 @@ def _agg_type(x): } return SIFTSResult(hits_grouped, mappings) + + +class SIFTSWithDynamicUpdate(SIFTS): + def __init__(self, table_callback, sequence_file): + """ + Version of SIFTS class that allows to dynamically + instantiate table after searching sequence file + (to avoid loading large tables) + + Parameters + ---------- + table_callback: callable + Function that ingests alignment (evcouplings.align.Alignment) and hit table (pd.DataFrame) + and returns dynamic structure mapping table (pd.DataFrame) and hit table (pd.DataFrame) + sequence_file: str + Path to database with sequences corresponding to structures + """ + # note: do *not* call super init to pass verification of input files + self.table = table_callback + self.sequence_file = sequence_file From d78920631c3d84b06836cb5770d8aa8f02ea676a Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 9 Nov 2024 16:27:00 +0100 Subject: [PATCH 2/8] model comparison skeleton added --- evcouplings/compare/pdb.py | 6 +- evcouplings/compare/protocol.py | 107 ++++++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 6 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c13a74a9..ce449368 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -1280,7 +1280,7 @@ def get_chain(self, chain, model=0): return Chain(res_df, coord_df) -def load_structures(pdb_ids, structure_dir=None, raise_missing=True): +def load_structures(pdb_ids, structure_dir=None, raise_missing=True, extension=".bcif.gz"): """ Load PDB structures from files / web @@ -1298,6 +1298,8 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True): Raise a ResourceError exception if any of the PDB IDs cannot be loaded. If False, missing entries will be ignored. + extension: str, optional (default: ".bcif.gz") + File extension to be added to identifier if loading file locally Returns ------- @@ -1320,7 +1322,7 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True): has_file = False if structure_dir is not None: - structure_file = path.join(structure_dir, pdb_id + ".mmtf") + structure_file = path.join(structure_dir, pdb_id + extension) has_file = valid_file(structure_file) try: diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index a53404e4..f6adc27b 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -35,6 +35,8 @@ from evcouplings.visualize import pairs, misc SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})" +AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"] +ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{model_id}.cif" def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR, @@ -645,11 +647,11 @@ def _identify_predicted_structures(**kwargs): # implement custom logic for different model database types in code to avoid # complicated configuration setup covering all eventualities - AVAILABLE_DB_TYPES = ["alphafolddb_v4"] modeldb_type = kwargs["modeldb_type"] - if modeldb_type not in AVAILABLE_DB_TYPES: + + if modeldb_type not in AVAILABLE_MODEL_DB_TYPES: raise InvalidParameterError( - f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_DB_TYPES)}" + f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_MODEL_DB_TYPES)}" ) table_callback = None @@ -684,6 +686,85 @@ def _identify_predicted_structures(**kwargs): return sifts_map, sifts_map_full +def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True): + if modeldb_type == "alphafolddb_v4": + if structure_dir is not None: + raise NotImplementedError("Local file retrieval not implemented") + + pass + else: + raise InvalidParameterError("Invalid modeldb_type") + + return {} + + +def models(**kwargs): + """ + # TODO: document parameters/return values + """ + check_required( + kwargs, + [ + "prefix", "ec_file", "modeldb_type", "modeldb_file_dir", + # "min_sequence_distance", + # "pdb_mmtf_dir", "atom_filter", "compare_multimer", + # "distance_cutoff", "target_sequence_file", + # "scale_sizes", + ] + ) + + prefix = kwargs["prefix"] + outcfg = { + "model_structure_hits_file": prefix + "_model_hits.csv", + "model_structure_hits_unfiltered_file": prefix + "_model_hits_unfiltered.csv", + } + + # make sure EC file exists + verify_resources( + "EC file does not exist", + kwargs["ec_file"] + ) + + # make sure output directory exists + create_prefix_folders(prefix) + + # store auxiliary files here (too much for average user) + aux_prefix = insert_dir(prefix, "aux_models", rootname_subdir=False) + create_prefix_folders(aux_prefix) + + # Step 1: Identify models for comparison + sifts_map, sifts_map_full = _identify_predicted_structures(**{ + **kwargs, + "prefix": aux_prefix, + }) + + # save selected PDB hits + sifts_map.hits.to_csv( + outcfg["model_structure_hits_file"], index=True + ) + + # also save full list of hits + sifts_map_full.hits.to_csv( + outcfg["model_structure_hits_unfiltered_file"], index=True + ) + + # Step 2: Compute distance maps + + # load all structures at once + structures = _load_models( + sifts_map.hits.pdb_id, + kwargs["modeldb_type"], + kwargs["modeldb_file_dir"], + raise_missing=False + ) + + # TODO: implement remaining logic + # TODO: implement structure mapping + # print(structures) + + return outcfg + + def standard(**kwargs): """ Protocol: @@ -939,6 +1020,21 @@ def standard(**kwargs): ec_table, d_intra, d_multimer, sifts_map, **kwargs ) + # Step 5: check if comparison to models is enabled, then run this protocol as well + if kwargs.get("compare_to_models"): + # create subdirectory for running models comparison + # aux_prefix_models = insert_dir( + # prefix, "models", rootname_subdir=False + # ) + # create_prefix_folders(aux_prefix_models) + + # apply protocol and update output + outcfg_models = models(**kwargs) + outcfg = { + **outcfg, + **outcfg_models, + } + return outcfg @@ -1318,7 +1414,10 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): "standard": standard, # comparison for protein complexes - "complex": complex + "complex": complex, + + # comparison to predicted models + "models": models, } From 0c9e3baa02bf08f05636f8cc8d2cbdfdf34507e1 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 10 Nov 2024 14:05:38 +0100 Subject: [PATCH 3/8] add regular CIF parsing to PDB class --- evcouplings/compare/pdb.py | 131 +++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 33 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index ce449368..2f7c99f3 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -17,6 +17,7 @@ import requests import msgpack from Bio.PDB.binary_cif import _decode +from Bio.PDB.MMCIF2Dict import MMCIF2Dict from evcouplings.utils.config import InvalidParameterError from evcouplings.utils.constants import AA3_to_AA1 @@ -412,9 +413,9 @@ class PDB: Holds PDB structure from binaryCIF format; supersedes original PDB class based on MMTF format (renamed to MmtfPDB, cf. below) due to MMTF retirement in 2024 """ - def __init__(self, filehandle, keep_full_data=False): + def __init__(self, filehandle, binary=True, keep_full_data=False): """ - Initialize by parsing binaryCIF from open filehandle. + Initialize by parsing binaryCIF/CIF from open filehandle. Recommended to use from_file() and from_id() class methods to create object. Column extraction and decoding based on https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py @@ -423,18 +424,23 @@ def __init__(self, filehandle, keep_full_data=False): ---------- filehandle: file-like object Open filehandle (binary) from which to read binaryCIF data + binary: bool (default: True) + Indicates if file is binaryCIF (true) or regular text-based CIF file (false) keep_full_data: bool (default: False) Associate raw extracted data with object """ - # unpack information in bCIF file - raw_data = msgpack.unpack( - filehandle, use_list=True - ) + if binary: + # unpack information in bCIF file + raw_data = msgpack.unpack( + filehandle, use_list=True + ) - data = { - f"{category['name']}.{column['name']}": column - for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"] - } + data = { + f"{category['name']}.{column['name']}": column + for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"] + } + else: + data = MMCIF2Dict(filehandle) ATOM_TARGET_COLS = { "_atom_site.pdbx_PDB_model_num": "model_number", @@ -498,23 +504,55 @@ def __init__(self, filehandle, keep_full_data=False): self.data = None # decode information into dataframe with BioPython helper method - self.atom_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() - }).assign( - # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str - # (e.g. entry 6swy) - auth_asym_id=lambda df: df.auth_asym_id.astype(str), - label_asym_id=lambda df: df.label_asym_id.astype(str), - ) + if binary: + self.atom_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() + }).assign( + # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str + # (e.g. entry 6swy) + auth_asym_id=lambda df: df.auth_asym_id.astype(str), + label_asym_id=lambda df: df.label_asym_id.astype(str), + ) + else: + self.atom_table = pd.DataFrame({ + name: data[source_column] for source_column, name in ATOM_TARGET_COLS.items() + }).replace( + # replace with empty values for consistency with bCIF parsing (otherwise could use pd.NA here) + {"?": "", ".": ""} + ).assign( + # update column types for float columns + x=lambda df: pd.to_numeric(df.x), + y=lambda df: pd.to_numeric(df.y), + z=lambda df: pd.to_numeric(df.z), + occupancy=lambda df: pd.to_numeric(df.occupancy), + b_factor=lambda df: pd.to_numeric(df.b_factor), + # update data types for int columns + model_number=lambda df: df.model_number.astype("int32"), + id=lambda df: df.id.astype("int32"), + label_entity_id=lambda df: df.label_entity_id.astype("int32"), + # align behaviour with missing value coding in bCIF parser for label_seq_id + label_seq_id=lambda df: df.label_seq_id.replace("", 0).astype("int32"), + auth_seq_id=lambda df: df.auth_seq_id.astype("int32"), + ) # decode information into dataframe with BioPython helper method; note this section may not be # present if no helices exist in the structure try: - self.conf_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items() - }).query( - # there are a handful of PDB entries that have (probably wrong) secondary structure assignments - # extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error + if binary: + self.conf_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items() + }) + else: + self.conf_table = pd.DataFrame({ + name: data[source_column] for source_column, name in CONF_TARGET_COLS.items() + }).assign( + beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"), + end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"), + ) + + # there are a handful of PDB entries that have (probably wrong) secondary structure assignments + # extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error + self.conf_table = self.conf_table.query( "beg_label_asym_id == end_label_asym_id" ) except KeyError: @@ -523,9 +561,19 @@ def __init__(self, filehandle, keep_full_data=False): # decode information into dataframe with BioPython helper method; note this section may not be # present if no sheets exist in the structure try: - self.sheet_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() - }) + if binary: + self.sheet_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() + }) + else: + self.sheet_table = pd.DataFrame({ + name: data[source_column] for source_column, name in SHEET_TARGET_COLS.items() + }).assign( + id=lambda df: df.id.astype("int32"), + beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"), + end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"), + ) + except KeyError: self.sheet_table = None @@ -593,9 +641,14 @@ def __init__(self, filehandle, keep_full_data=False): @classmethod def from_file(cls, filename, keep_full_data=False): """ - Initialize structure from binaryCIF file + Initialize structure from binaryCIF or CIF file (gzipped or not). + + Note for simplicity this function will determine if CIF or bCIF, and gzipped or not solely on + case-independent filename extensions (.cif.gz, .cif, .bcif.gz, .bcif) as supplied by the PDB + rather than performing checks on the file itself. If this logic does not hold in your use case, + supply an appropriate filehandle and binary=True/False directly to the constructor of this class. - inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py + Inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py Parameters ---------- @@ -610,11 +663,23 @@ def from_file(cls, filename, keep_full_data=False): initialized PDB structure """ try: - with ( - gzip.open(filename, mode="rb") - if filename.lower().endswith(".gz") else open(filename, mode="rb") - ) as f: - return cls(f, keep_full_data=keep_full_data) + fnl = filename.lower() + + # determine if gzipped or not, use appropriate function to open file + if fnl.endswith(".gz"): + openfunc = gzip.open + else: + openfunc = open + + # check if binaryCIF or text-based CIF, adjust file open mode accordingly + binary = fnl.endswith(".bcif") or fnl.endswith(".bcif.gz") + if binary: + mode = "rb" + else: + mode = "r" + + with openfunc(filename, mode=mode) as f: + return cls(f, binary=binary, keep_full_data=keep_full_data) except IOError as e: raise ResourceError( "Could not open file {}".format(filename) From 0df4bf8c865b9a33e6aac7eef29efff702d0e7c8 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 10 Nov 2024 15:36:09 +0100 Subject: [PATCH 4/8] first complete draft of models protocol --- evcouplings/compare/protocol.py | 224 +++++++++++++++++++++++++++++--- 1 file changed, 207 insertions(+), 17 deletions(-) diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index f6adc27b..7fc57f4b 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -11,6 +11,7 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np +from io import StringIO from evcouplings.align.alignment import ( read_fasta, parse_header @@ -20,10 +21,10 @@ ) from evcouplings.utils.system import ( - create_prefix_folders, insert_dir, verify_resources, + create_prefix_folders, insert_dir, verify_resources, get, ResourceError ) from evcouplings.couplings import Segment -from evcouplings.compare.pdb import load_structures +from evcouplings.compare.pdb import load_structures, PDB from evcouplings.compare.distances import ( intra_dists, multimer_dists, remap_chains, inter_dists, remap_complex_chains @@ -36,7 +37,7 @@ SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})" AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"] -ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{model_id}.cif" +ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{id}.cif" def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR, @@ -687,36 +688,105 @@ def _identify_predicted_structures(**kwargs): def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True): + """ + Load structure models from files/web + + Parameters + ---------- + model_ids : Iterable + List / iterable containing model identifiers + to be loaded. + modeldb_type : str + Model database type + structure_dir : str, optional (default: None) + Path to directory with structures (if not available, will be fetched from web) + raise_missing : bool, optional (default: True) + Raise a ResourceError exception if any of the + PDB IDs cannot be loaded. If False, missing + entries will be ignored. + + Returns + ------- + structures : dict(str -> PDB) + Dictionary containing loaded structures. + Keys (PDB identifiers) will be lower-case. + + Raises + ------ + ResourceError + Raised if raise_missing is True and any of the given + PDB IDs cannot be loaded. + """ + if modeldb_type not in AVAILABLE_MODEL_DB_TYPES: + raise InvalidParameterError( + f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_MODEL_DB_TYPES)}" + ) + + if structure_dir is not None: + raise NotImplementedError( + "Local file retrieval currently not implemented" + ) + + # implement database-specific retrieval behaviour here if modeldb_type == "alphafolddb_v4": - if structure_dir is not None: - raise NotImplementedError("Local file retrieval not implemented") + make_download_url = lambda model_id: ALPHAFOLDDB_DOWNLOAD_URL.format(id=model_id) - pass - else: - raise InvalidParameterError("Invalid modeldb_type") + structures = {} + + for model_id in model_ids: + try: + data = get( + make_download_url(model_id) + ) + + structures[model_id] = PDB( + StringIO(data.text), binary=False + ) + except ResourceError as e: + if raise_missing: + raise - return {} + return structures def models(**kwargs): """ - # TODO: document parameters/return values + Protocol: + Compare ECs for single proteins (or domains) + to 3D structure models + + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the pipeline """ check_required( kwargs, [ "prefix", "ec_file", "modeldb_type", "modeldb_file_dir", - # "min_sequence_distance", - # "pdb_mmtf_dir", "atom_filter", "compare_multimer", - # "distance_cutoff", "target_sequence_file", - # "scale_sizes", + "atom_filter", "distance_cutoff", "min_sequence_distance", + "target_sequence_file", ] ) prefix = kwargs["prefix"] outcfg = { + "model_ec_compared_all_file": prefix + "_model_CouplingScoresCompared_all.csv", + "model_ec_compared_longrange_file": prefix + "_model_CouplingScoresCompared_longrange.csv", "model_structure_hits_file": prefix + "_model_hits.csv", "model_structure_hits_unfiltered_file": prefix + "_model_hits_unfiltered.csv", + + # cannot have the distmap files end with "_file" because there are + # two files (.npy and .csv), which would cause problems with automatic + # checking if those files exist + "model_distmap_monomer": prefix + "_model_distance_map_monomer", + # residue map for all individual distance maps before aggregation + "model_distmap_monomer_residues_file": prefix + "_model_distance_map_monomer_residues.csv", } # make sure EC file exists @@ -758,9 +828,129 @@ def models(**kwargs): raise_missing=False ) - # TODO: implement remaining logic - # TODO: implement structure mapping - # print(structures) + if len(sifts_map.hits) > 0: + d_intra = intra_dists( + sifts_map, structures, atom_filter=kwargs["atom_filter"], + output_prefix=aux_prefix + "model_distmap_intra" + ) + + residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["model_distmap_monomer"]) + + # store residue map (monomer) + d_intra.aggregated_residue_maps.to_csv( + outcfg["model_distmap_monomer_residues_file"], index=False + ) + + # TODO: for now, create additional entries rather than removing distmap_monomer for compatibility reasons, + # but eventually drop the one above + outcfg["model_distmap_monomer_files"] = { + residue_table_filename: {"file_type": "residue_table"}, + dist_mat_filename: {"file_type": "distance_matrix"} + } + + d_intra_individual_maps = d_intra.individual_distance_map_table + # also store individual intra distance matrices (should always be present) + if d_intra_individual_maps is not None: + outcfg["model_distmap_monomer_individual_files"] = _individual_distance_map_config_result( + d_intra_individual_maps + ) + + # save contacts to separate file + outcfg["model_monomer_contacts_file"] = prefix + "_model_contacts_monomer.csv" + d_intra.contacts( + kwargs["distance_cutoff"] + ).to_csv( + outcfg["model_monomer_contacts_file"], index=False + ) + + # at this point, also create remapped structures (e.g. for + # later comparison of folding results) + verify_resources( + "Target sequence file does not exist", + kwargs["target_sequence_file"] + ) + + # create target sequence map for remapping structure + with open(kwargs["target_sequence_file"]) as f: + header, seq = next(read_fasta(f)) + + seq_id, seq_start, seq_end = parse_header(header) + seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) + + # remap structures, swap mapping index and filename in + # dictionary so we have a list of files in the dict keys + # + # remapped structures have side chains taken off and changed + # residue types, since e.g. maxcluster cannot handle mismatches + # well. Also create structures that are just renumbered (but have + # original side chains and residue names) for visualization asf. + for name, sequence_map, atom_filter in [ + ("remapped", seqmap, ("N", "CA", "C", "O")), + ("renumbered", None, None) + ]: + outcfg[name + "_model_pdb_files"] = { + filename: mapping_index for mapping_index, filename in + remap_chains( + sifts_map, + "{}_{}".format(aux_prefix, name), + structures=structures, + sequence=sequence_map, + atom_filter=atom_filter + ).items() + } + else: + # if no structures, can not compute distance maps + d_intra = None + outcfg["model_distmap_monomer"] = None + outcfg["model_distmap_monomer_residues_file"] = None + outcfg["remapped_model_pdb_files"] = None + outcfg["renumbered_model_pdb_files"] = None + + # Step 3: Compare ECs to distance maps + + ec_table = pd.read_csv(kwargs["ec_file"]) + + # identify number of sites in EC model + num_sites = len( + set.union(set(ec_table.i.unique()), set(ec_table.j.unique())) + ) + + for out_file, min_seq_dist in [ + ("model_ec_compared_longrange_file", kwargs["min_sequence_distance"]), + ("model_ec_compared_all_file", 0), + ]: + # compare ECs only if we minimally have intra distance map + if d_intra is not None: + coupling_scores_compared( + ec_table, d_intra, None, + dist_cutoff=kwargs["distance_cutoff"], + output_file=outcfg[out_file], + min_sequence_dist=min_seq_dist, + score="score" + ) + else: + outcfg[out_file] = None + + # also create line-drawing script if we made the csv + if outcfg["model_ec_compared_longrange_file"] is not None: + ecs_longrange = pd.read_csv(outcfg["model_ec_compared_longrange_file"]) + + outcfg["model_ec_lines_compared_pml_file"] = prefix + "_model_draw_ec_lines_compared.pml" + pairs.ec_lines_pymol_script( + ecs_longrange.iloc[:num_sites, :], + outcfg["model_ec_lines_compared_pml_file"], + distance_cutoff=kwargs["distance_cutoff"], + score_column="score" + ) + + # Step 4: Make contact map plots + # if no structures available, defaults to EC-only plot + outcfg["model_contact_map_files"] = _make_contact_maps( + ec_table, d_intra, None, sifts_map, **{ + **kwargs, + "prefix": kwargs["prefix"] + "_model" + } + ) return outcfg From 3a14a63f528e1a027a576d0c3d6981645ebcfc32 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 10 Nov 2024 15:42:40 +0100 Subject: [PATCH 5/8] fix file names and no plotting if no structures --- evcouplings/compare/protocol.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index 7fc57f4b..abcda3cc 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -888,7 +888,7 @@ def models(**kwargs): ("remapped", seqmap, ("N", "CA", "C", "O")), ("renumbered", None, None) ]: - outcfg[name + "_model_pdb_files"] = { + outcfg[f"model_{name}_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, @@ -903,8 +903,8 @@ def models(**kwargs): d_intra = None outcfg["model_distmap_monomer"] = None outcfg["model_distmap_monomer_residues_file"] = None - outcfg["remapped_model_pdb_files"] = None - outcfg["renumbered_model_pdb_files"] = None + outcfg["model_remapped_pdb_files"] = None + outcfg["model_renumbered_pdb_files"] = None # Step 3: Compare ECs to distance maps @@ -943,14 +943,15 @@ def models(**kwargs): score_column="score" ) - # Step 4: Make contact map plots - # if no structures available, defaults to EC-only plot - outcfg["model_contact_map_files"] = _make_contact_maps( - ec_table, d_intra, None, sifts_map, **{ - **kwargs, - "prefix": kwargs["prefix"] + "_model" - } - ) + # Step 4: Make contact map plots; + # unlike in standard protocol, only make plots of structures available + if len(sifts_map.hits) > 0: + outcfg["model_contact_map_files"] = _make_contact_maps( + ec_table, d_intra, None, sifts_map, **{ + **kwargs, + "prefix": kwargs["prefix"] + "_model" + } + ) return outcfg From ff159142834b32f5ba3fabfbbea1880cba613c1c Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 11 Nov 2024 16:20:43 +0100 Subject: [PATCH 6/8] allow to specify model comparison settings via config --- config/sample_config_monomer.txt | 17 +++++++++++++++++ evcouplings/compare/protocol.py | 32 ++++++++++++++------------------ 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index e1260d1d..be1f599c 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -284,6 +284,17 @@ compare: # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. pdb_alignment_method: jackhmmer + # Uncomment to enable comparison to predicted models (e.g. AlphaFoldDB, as specified in databases section below); + # will inherit configuration as passed to compare stage, with any keys specified in compared_to_models subsection + # taking precedence + compare_models: + max_num_hits: 1 + pdb_alignment_method: jackhmmer + use_bitscores: True + domain_threshold: 1.5 + sequence_threshold: 1.5 + iterations: 1 + # Settings for Mutation effect predictions mutate: # Options: standard @@ -414,6 +425,12 @@ databases: sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta + # Mapping onto predicted 3D structure models + # modeldb_type: alphafolddb_v4 + # modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2022-11-14/sequences.fasta + # modeldb_list_file: /n/groups/marks/databases/alphafolddb/2022-11-14/accession_ids.csv + # modeldb_file_dir: + # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required. tools: jackhmmer: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index abcda3cc..b25825f1 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -642,7 +642,7 @@ def _identify_predicted_structures(**kwargs): kwargs, [ "modeldb_type", "modeldb_sequence_file", "modeldb_list_file", - "model_max_num_hits", + "max_num_hits", ] ) @@ -672,17 +672,13 @@ def _identify_predicted_structures(**kwargs): sequence_file=kwargs["modeldb_sequence_file"] ) - # run sequence-based query against model database, always use jackhmmer for this - sifts_map = s.by_alignment(**{ - **kwargs, - "pdb_alignment_method": "jackhmmer" - }) - + # run sequence-based query against model database + sifts_map = s.by_alignment(**kwargs) sifts_map_full = deepcopy(sifts_map) # reduce number of structures/hits - if kwargs["model_max_num_hits"] is not None: - sifts_map.hits = sifts_map.hits.iloc[:kwargs["model_max_num_hits"]] + if kwargs["max_num_hits"] is not None: + sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]] return sifts_map, sifts_map_full @@ -1211,16 +1207,16 @@ def standard(**kwargs): ec_table, d_intra, d_multimer, sifts_map, **kwargs ) - # Step 5: check if comparison to models is enabled, then run this protocol as well - if kwargs.get("compare_to_models"): - # create subdirectory for running models comparison - # aux_prefix_models = insert_dir( - # prefix, "models", rootname_subdir=False - # ) - # create_prefix_folders(aux_prefix_models) - + # Step 5: check if comparison to models is enabled (optional argument), then run this protocol as well + models_config = kwargs.get("compare_models") + if models_config is not None: # apply protocol and update output - outcfg_models = models(**kwargs) + outcfg_models = models( + **{ + **kwargs, + **models_config, + } + ) outcfg = { **outcfg, **outcfg_models, From 95e56fb553a958b06fd7c9c94014a52a1a233340 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 11 Nov 2024 16:26:29 +0100 Subject: [PATCH 7/8] comment out model comparison settings by default --- config/sample_config_monomer.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index be1f599c..fa02863d 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -287,13 +287,13 @@ compare: # Uncomment to enable comparison to predicted models (e.g. AlphaFoldDB, as specified in databases section below); # will inherit configuration as passed to compare stage, with any keys specified in compared_to_models subsection # taking precedence - compare_models: - max_num_hits: 1 - pdb_alignment_method: jackhmmer - use_bitscores: True - domain_threshold: 1.5 - sequence_threshold: 1.5 - iterations: 1 + # compare_models: + # max_num_hits: 1 + # pdb_alignment_method: jackhmmer + # use_bitscores: True + # domain_threshold: 1.5 + # sequence_threshold: 1.5 + # iterations: 1 # Settings for Mutation effect predictions mutate: From 745d180a62628579d59928df2a002e7808c10bb2 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 13 Nov 2024 17:34:06 +0100 Subject: [PATCH 8/8] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 63bda4e4..6441e79d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "evcouplings" -version = "0.2.1" +version = "0.3.0" description = "A Framework for evolutionary couplings analysis" readme = "README.md" license = "MIT"