From 1e4a9b1ce02fe8e1a00de50b3225946af6ada82d Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Sat, 9 Nov 2024 11:36:17 +0100
Subject: [PATCH 1/8] Structure identification for models

---
 evcouplings/compare/protocol.py | 104 +++++++++++++++++++++++++++++++-
 evcouplings/compare/sifts.py    |  35 ++++++++++-
 2 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
index 1f07002f..a53404e4 100644
--- a/evcouplings/compare/protocol.py
+++ b/evcouplings/compare/protocol.py
@@ -28,7 +28,7 @@
     intra_dists, multimer_dists, remap_chains,
     inter_dists, remap_complex_chains
 )
-from evcouplings.compare.sifts import SIFTS, SIFTSResult
+from evcouplings.compare.sifts import SIFTS, SIFTSResult, SIFTSWithDynamicUpdate
 from evcouplings.compare.ecs import (
     coupling_scores_compared, add_precision
 )
@@ -582,6 +582,108 @@ def _individual_distance_map_config_result(individual_distance_map_table):
     return individual_maps_result
 
 
+def _map_alphafold_hits(modeldb_list_file, relevant_ids):
+    """
+    Turn list of AlphaFoldDB sequence hits into structure table
+    compatible with SIFTS object
+
+    Parameters
+    ----------
+    modeldb_list_file: str
+        Path to accession_ids.csv file from AlphaFoldDB
+    relevant_ids: set(str)
+        DB entry identifiers to include in table (without leading "AFDB:" prefix)
+
+    Returns
+    -------
+    pd.DataFrame
+        Structure table for SIFTS object
+    """
+    with open(modeldb_list_file) as f:
+        _table = []
+        for line in f:
+            uniprot_ac, start, end, model_id, model_version = line.strip().split(",")
+            model_id_with_prefix = "AFDB:" + model_id
+            start, end = int(start), int(end)
+            if model_id_with_prefix in relevant_ids:
+                _table.append({
+                    "uniprot_ac": model_id_with_prefix,
+                    "pdb_id": f"{model_id}-model_v{model_version}",
+                    "pdb_chain": "A",
+                    "resseq_start": start,
+                    "resseq_end": end,
+                    "coord_start": start,
+                    "coord_end": end,
+                    "uniprot_start": start,
+                    "uniprot_end": end,
+                })
+
+    return pd.DataFrame(_table)
+
+
+def _identify_predicted_structures(**kwargs):
+    """
+    Find suitable predicted structures for mapping on ECs
+
+    Parameters
+    ----------
+    **kwargs
+        See check_required in code below
+
+    Returns
+    -------
+    SIFTSResult
+        Identified structures and residue index mappings
+    """
+    check_required(
+        kwargs,
+        [
+            "modeldb_type", "modeldb_sequence_file", "modeldb_list_file",
+            "model_max_num_hits",
+        ]
+    )
+
+    # implement custom logic for different model database types in code to avoid
+    # complicated configuration setup covering all eventualities
+    AVAILABLE_DB_TYPES = ["alphafolddb_v4"]
+    modeldb_type = kwargs["modeldb_type"]
+    if modeldb_type not in AVAILABLE_DB_TYPES:
+        raise InvalidParameterError(
+            f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_DB_TYPES)}"
+        )
+
+    table_callback = None
+    if modeldb_type == "alphafolddb_v4":
+        table_callback = lambda ali, hits: (
+            _map_alphafold_hits(
+                kwargs["modeldb_list_file"], set(hits.uniprot_ac)
+            ), hits
+        )
+
+    assert table_callback is not None, "table_callback not defined, this indicates logic error above"
+
+    # create patched SIFTS object with all model sequences, but only instantiate table
+    # property once sequences are found to avoid loading gigabytes of tables with pandas
+    s = SIFTSWithDynamicUpdate(
+        table_callback=table_callback,
+        sequence_file=kwargs["modeldb_sequence_file"]
+    )
+
+    # run sequence-based query against model database, always use jackhmmer for this
+    sifts_map = s.by_alignment(**{
+        **kwargs,
+        "pdb_alignment_method": "jackhmmer"
+    })
+
+    sifts_map_full = deepcopy(sifts_map)
+
+    # reduce number of structures/hits
+    if kwargs["model_max_num_hits"] is not None:
+        sifts_map.hits = sifts_map.hits.iloc[:kwargs["model_max_num_hits"]]
+
+    return sifts_map, sifts_map_full
+
+
 def standard(**kwargs):
     """
     Protocol:
diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py
index 275b9848..29781e4f 100644
--- a/evcouplings/compare/sifts.py
+++ b/evcouplings/compare/sifts.py
@@ -255,8 +255,12 @@ def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
     # read hmmer hittable and simplify
     hits = read_hmmer_domtbl(ar["hittable_file"])
 
-    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1])
-    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2])
+    try:
+        hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[1])
+        hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(lambda x: x.split("|")[2])
+    except IndexError:
+        hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].copy()
+        hits.loc[:, "uniprot_id"] = pd.NA
 
     hits = hits.rename(
         columns={
@@ -855,10 +859,15 @@ def _create_mapping(r):
             **kwargs
         )
 
+        if callable(self.table):
+            table, hits = self.table(ali, hits)
+        else:
+            table = self.table
+
         # merge with internal table to identify overlap of
         # aligned regions and regions with structural coverage
         hits = hits.merge(
-            self.table, on="uniprot_ac", suffixes=("", "_")
+            table, on="uniprot_ac", suffixes=("", "_")
         )
 
         # add 1 to end of range since overlap function treats
@@ -1002,3 +1011,23 @@ def _agg_type(x):
         }
 
         return SIFTSResult(hits_grouped, mappings)
+
+
+class SIFTSWithDynamicUpdate(SIFTS):
+    def __init__(self, table_callback, sequence_file):
+        """
+        Version of SIFTS class that allows to dynamically
+        instantiate table after searching sequence file
+        (to avoid loading large tables)
+
+        Parameters
+        ----------
+        table_callback: callable
+            Function that ingests alignment (evcouplings.align.Alignment) and hit table (pd.DataFrame)
+            and returns dynamic structure mapping table (pd.DataFrame) and hit table (pd.DataFrame)
+        sequence_file: str
+            Path to database with sequences corresponding to structures
+        """
+        # note: do *not* call super init to pass verification of input files
+        self.table = table_callback
+        self.sequence_file = sequence_file

From d78920631c3d84b06836cb5770d8aa8f02ea676a Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Sat, 9 Nov 2024 16:27:00 +0100
Subject: [PATCH 2/8] model comparison skeleton added

---
 evcouplings/compare/pdb.py      |   6 +-
 evcouplings/compare/protocol.py | 107 ++++++++++++++++++++++++++++++--
 2 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
index c13a74a9..ce449368 100644
--- a/evcouplings/compare/pdb.py
+++ b/evcouplings/compare/pdb.py
@@ -1280,7 +1280,7 @@ def get_chain(self, chain, model=0):
         return Chain(res_df, coord_df)
 
 
-def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
+def load_structures(pdb_ids, structure_dir=None, raise_missing=True, extension=".bcif.gz"):
     """
     Load PDB structures from files / web
 
@@ -1298,6 +1298,8 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
         Raise a ResourceError exception if any of the
         PDB IDs cannot be loaded. If False, missing
         entries will be ignored.
+    extension: str, optional (default: ".bcif.gz")
+        File extension to be added to identifier if loading file locally
 
     Returns
     -------
@@ -1320,7 +1322,7 @@ def load_structures(pdb_ids, structure_dir=None, raise_missing=True):
 
         has_file = False
         if structure_dir is not None:
-            structure_file = path.join(structure_dir, pdb_id + ".mmtf")
+            structure_file = path.join(structure_dir, pdb_id + extension)
             has_file = valid_file(structure_file)
 
         try:
diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
index a53404e4..f6adc27b 100644
--- a/evcouplings/compare/protocol.py
+++ b/evcouplings/compare/protocol.py
@@ -35,6 +35,8 @@
 from evcouplings.visualize import pairs, misc
 
 SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})"
+AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"]
+ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{model_id}.cif"
 
 
 def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR,
@@ -645,11 +647,11 @@ def _identify_predicted_structures(**kwargs):
 
     # implement custom logic for different model database types in code to avoid
     # complicated configuration setup covering all eventualities
-    AVAILABLE_DB_TYPES = ["alphafolddb_v4"]
     modeldb_type = kwargs["modeldb_type"]
-    if modeldb_type not in AVAILABLE_DB_TYPES:
+
+    if modeldb_type not in AVAILABLE_MODEL_DB_TYPES:
         raise InvalidParameterError(
-            f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_DB_TYPES)}"
+            f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_MODEL_DB_TYPES)}"
         )
 
     table_callback = None
@@ -684,6 +686,85 @@ def _identify_predicted_structures(**kwargs):
     return sifts_map, sifts_map_full
 
 
+def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True):
+    if modeldb_type == "alphafolddb_v4":
+        if structure_dir is not None:
+            raise NotImplementedError("Local file retrieval not implemented")
+
+        pass
+    else:
+        raise InvalidParameterError("Invalid modeldb_type")
+
+    return {}
+
+
+def models(**kwargs):
+    """
+    # TODO: document parameters/return values
+    """
+    check_required(
+        kwargs,
+        [
+            "prefix", "ec_file", "modeldb_type", "modeldb_file_dir",
+            # "min_sequence_distance",
+            # "pdb_mmtf_dir", "atom_filter", "compare_multimer",
+            # "distance_cutoff", "target_sequence_file",
+            # "scale_sizes",
+        ]
+    )
+
+    prefix = kwargs["prefix"]
+    outcfg = {
+        "model_structure_hits_file": prefix + "_model_hits.csv",
+        "model_structure_hits_unfiltered_file": prefix + "_model_hits_unfiltered.csv",
+    }
+
+    # make sure EC file exists
+    verify_resources(
+        "EC file does not exist",
+        kwargs["ec_file"]
+    )
+
+    # make sure output directory exists
+    create_prefix_folders(prefix)
+
+    # store auxiliary files here (too much for average user)
+    aux_prefix = insert_dir(prefix, "aux_models", rootname_subdir=False)
+    create_prefix_folders(aux_prefix)
+
+    # Step 1: Identify models for comparison
+    sifts_map, sifts_map_full = _identify_predicted_structures(**{
+        **kwargs,
+        "prefix": aux_prefix,
+    })
+
+    # save selected PDB hits
+    sifts_map.hits.to_csv(
+        outcfg["model_structure_hits_file"], index=True
+    )
+
+    # also save full list of hits
+    sifts_map_full.hits.to_csv(
+        outcfg["model_structure_hits_unfiltered_file"], index=True
+    )
+
+    # Step 2: Compute distance maps
+
+    # load all structures at once
+    structures = _load_models(
+        sifts_map.hits.pdb_id,
+        kwargs["modeldb_type"],
+        kwargs["modeldb_file_dir"],
+        raise_missing=False
+    )
+
+    # TODO: implement remaining logic
+    # TODO: implement structure mapping
+    # print(structures)
+
+    return outcfg
+
+
 def standard(**kwargs):
     """
     Protocol:
@@ -939,6 +1020,21 @@ def standard(**kwargs):
         ec_table, d_intra, d_multimer, sifts_map, **kwargs
     )
 
+    # Step 5: check if comparison to models is enabled, then run this protocol as well
+    if kwargs.get("compare_to_models"):
+        # create subdirectory for running models comparison
+        # aux_prefix_models = insert_dir(
+        #     prefix, "models", rootname_subdir=False
+        # )
+        # create_prefix_folders(aux_prefix_models)
+
+        # apply protocol and update output
+        outcfg_models = models(**kwargs)
+        outcfg = {
+            **outcfg,
+            **outcfg_models,
+        }
+
     return outcfg
 
 
@@ -1318,7 +1414,10 @@ def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):
     "standard": standard,
 
     # comparison for protein complexes
-    "complex": complex
+    "complex": complex,
+
+    # comparison to predicted models
+    "models": models,
 }
 
 

From 0c9e3baa02bf08f05636f8cc8d2cbdfdf34507e1 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Sun, 10 Nov 2024 14:05:38 +0100
Subject: [PATCH 3/8] add regular CIF parsing to PDB class

---
 evcouplings/compare/pdb.py | 131 +++++++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 33 deletions(-)

diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
index ce449368..2f7c99f3 100644
--- a/evcouplings/compare/pdb.py
+++ b/evcouplings/compare/pdb.py
@@ -17,6 +17,7 @@
 import requests
 import msgpack
 from Bio.PDB.binary_cif import _decode
+from Bio.PDB.MMCIF2Dict import MMCIF2Dict
 
 from evcouplings.utils.config import InvalidParameterError
 from evcouplings.utils.constants import AA3_to_AA1
@@ -412,9 +413,9 @@ class PDB:
     Holds PDB structure from binaryCIF format; supersedes original PDB class based
     on MMTF format (renamed to MmtfPDB, cf. below) due to MMTF retirement in 2024
     """
-    def __init__(self, filehandle, keep_full_data=False):
+    def __init__(self, filehandle, binary=True, keep_full_data=False):
         """
-        Initialize by parsing binaryCIF from open filehandle.
+        Initialize by parsing binaryCIF/CIF from open filehandle.
         Recommended to use from_file() and from_id() class methods to create object.
 
         Column extraction and decoding based on https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
@@ -423,18 +424,23 @@ def __init__(self, filehandle, keep_full_data=False):
         ----------
         filehandle: file-like object
             Open filehandle (binary) from which to read binaryCIF data
+        binary: bool (default: True)
+            Indicates if file is binaryCIF (true) or regular text-based CIF file (false)
         keep_full_data: bool (default: False)
             Associate raw extracted data with object
         """
-        # unpack information in bCIF file
-        raw_data = msgpack.unpack(
-            filehandle, use_list=True
-        )
+        if binary:
+            # unpack information in bCIF file
+            raw_data = msgpack.unpack(
+                filehandle, use_list=True
+            )
 
-        data = {
-            f"{category['name']}.{column['name']}": column
-            for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"]
-        }
+            data = {
+                f"{category['name']}.{column['name']}": column
+                for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"]
+            }
+        else:
+            data = MMCIF2Dict(filehandle)
 
         ATOM_TARGET_COLS = {
             "_atom_site.pdbx_PDB_model_num": "model_number",
@@ -498,23 +504,55 @@ def __init__(self, filehandle, keep_full_data=False):
             self.data = None
 
         # decode information into dataframe with BioPython helper method
-        self.atom_table = pd.DataFrame({
-            name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items()
-        }).assign(
-            # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str
-            # (e.g. entry 6swy)
-            auth_asym_id=lambda df: df.auth_asym_id.astype(str),
-            label_asym_id=lambda df: df.label_asym_id.astype(str),
-        )
+        if binary:
+            self.atom_table = pd.DataFrame({
+                name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items()
+            }).assign(
+                # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str
+                # (e.g. entry 6swy)
+                auth_asym_id=lambda df: df.auth_asym_id.astype(str),
+                label_asym_id=lambda df: df.label_asym_id.astype(str),
+            )
+        else:
+            self.atom_table = pd.DataFrame({
+                name: data[source_column] for source_column, name in ATOM_TARGET_COLS.items()
+            }).replace(
+                # replace with empty values for consistency with bCIF parsing (otherwise could use pd.NA here)
+                {"?": "", ".": ""}
+            ).assign(
+                # update column types for float columns
+                x=lambda df: pd.to_numeric(df.x),
+                y=lambda df: pd.to_numeric(df.y),
+                z=lambda df: pd.to_numeric(df.z),
+                occupancy=lambda df: pd.to_numeric(df.occupancy),
+                b_factor=lambda df: pd.to_numeric(df.b_factor),
+                # update data types for int columns
+                model_number=lambda df: df.model_number.astype("int32"),
+                id=lambda df: df.id.astype("int32"),
+                label_entity_id=lambda df: df.label_entity_id.astype("int32"),
+                # align behaviour with missing value coding in bCIF parser for label_seq_id
+                label_seq_id=lambda df: df.label_seq_id.replace("", 0).astype("int32"),
+                auth_seq_id=lambda df: df.auth_seq_id.astype("int32"),
+            )
 
         # decode information into dataframe with BioPython helper method; note this section may not be
         # present if no helices exist in the structure
         try:
-            self.conf_table = pd.DataFrame({
-                name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
-            }).query(
-                # there are a handful of PDB entries that have (probably wrong) secondary structure assignments
-                # extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
+            if binary:
+                self.conf_table = pd.DataFrame({
+                    name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
+                })
+            else:
+                self.conf_table = pd.DataFrame({
+                    name: data[source_column] for source_column, name in CONF_TARGET_COLS.items()
+                }).assign(
+                    beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"),
+                    end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"),
+                )
+
+            # there are a handful of PDB entries that have (probably wrong) secondary structure assignments
+            # extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
+            self.conf_table = self.conf_table.query(
                 "beg_label_asym_id == end_label_asym_id"
             )
         except KeyError:
@@ -523,9 +561,19 @@ def __init__(self, filehandle, keep_full_data=False):
         # decode information into dataframe with BioPython helper method; note this section may not be
         # present if no sheets exist in the structure
         try:
-            self.sheet_table = pd.DataFrame({
-                name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
-            })
+            if binary:
+                self.sheet_table = pd.DataFrame({
+                    name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items()
+                })
+            else:
+                self.sheet_table = pd.DataFrame({
+                    name: data[source_column] for source_column, name in SHEET_TARGET_COLS.items()
+                }).assign(
+                    id=lambda df: df.id.astype("int32"),
+                    beg_label_seq_id=lambda df: df.beg_label_seq_id.astype("int32"),
+                    end_label_seq_id=lambda df: df.end_label_seq_id.astype("int32"),
+                )
+
         except KeyError:
             self.sheet_table = None
 
@@ -593,9 +641,14 @@ def __init__(self, filehandle, keep_full_data=False):
     @classmethod
     def from_file(cls, filename, keep_full_data=False):
         """
-        Initialize structure from binaryCIF file
+        Initialize structure from binaryCIF or CIF file (gzipped or not).
+
+        Note for simplicity this function will determine if CIF or bCIF, and gzipped or not solely on
+        case-independent filename extensions (.cif.gz, .cif, .bcif.gz, .bcif) as supplied by the PDB
+        rather than performing checks on the file itself. If this logic does not hold in your use case,
+        supply an appropriate filehandle and binary=True/False directly  to the constructor of this class.
 
-        inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
+        Inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py
 
         Parameters
         ----------
@@ -610,11 +663,23 @@ def from_file(cls, filename, keep_full_data=False):
             initialized PDB structure
         """
         try:
-            with (
-                    gzip.open(filename, mode="rb")
-                    if filename.lower().endswith(".gz") else open(filename, mode="rb")
-            ) as f:
-                return cls(f, keep_full_data=keep_full_data)
+            fnl = filename.lower()
+
+            # determine if gzipped or not, use appropriate function to open file
+            if fnl.endswith(".gz"):
+                openfunc = gzip.open
+            else:
+                openfunc = open
+
+            # check if binaryCIF or text-based CIF, adjust file open mode accordingly
+            binary = fnl.endswith(".bcif") or fnl.endswith(".bcif.gz")
+            if binary:
+                mode = "rb"
+            else:
+                mode = "r"
+
+            with openfunc(filename, mode=mode) as f:
+                return cls(f, binary=binary, keep_full_data=keep_full_data)
         except IOError as e:
             raise ResourceError(
                 "Could not open file {}".format(filename)

From 0df4bf8c865b9a33e6aac7eef29efff702d0e7c8 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Sun, 10 Nov 2024 15:36:09 +0100
Subject: [PATCH 4/8] first complete draft of models protocol

---
 evcouplings/compare/protocol.py | 224 +++++++++++++++++++++++++++++---
 1 file changed, 207 insertions(+), 17 deletions(-)

diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
index f6adc27b..7fc57f4b 100644
--- a/evcouplings/compare/protocol.py
+++ b/evcouplings/compare/protocol.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+from io import StringIO
 
 from evcouplings.align.alignment import (
     read_fasta, parse_header
@@ -20,10 +21,10 @@
 )
 
 from evcouplings.utils.system import (
-    create_prefix_folders, insert_dir, verify_resources,
+    create_prefix_folders, insert_dir, verify_resources, get, ResourceError
 )
 from evcouplings.couplings import Segment
-from evcouplings.compare.pdb import load_structures
+from evcouplings.compare.pdb import load_structures, PDB
 from evcouplings.compare.distances import (
     intra_dists, multimer_dists, remap_chains,
     inter_dists, remap_complex_chains
@@ -36,7 +37,7 @@
 
 SIFTS_TABLE_FORMAT_STR = "{pdb_id}:{pdb_chain} ({coord_start}-{coord_end})"
 AVAILABLE_MODEL_DB_TYPES = ["alphafolddb_v4"]
-ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{model_id}.cif"
+ALPHAFOLDDB_DOWNLOAD_URL = "https://alphafold.ebi.ac.uk/files/{id}.cif"
 
 
 def print_pdb_structure_info(sifts_result, format_string=SIFTS_TABLE_FORMAT_STR,
@@ -687,36 +688,105 @@ def _identify_predicted_structures(**kwargs):
 
 
 def _load_models(model_ids, modeldb_type, structure_dir=None, raise_missing=True):
+    """
+    Load structure models from files/web
+
+    Parameters
+    ----------
+    model_ids : Iterable
+        List / iterable containing model identifiers
+        to be loaded.
+    modeldb_type : str
+        Model database type
+    structure_dir : str, optional (default: None)
+        Path to directory with structures (if not available, will be fetched from web)
+    raise_missing : bool, optional (default: True)
+        Raise a ResourceError exception if any of the
+        PDB IDs cannot be loaded. If False, missing
+        entries will be ignored.
+
+    Returns
+    -------
+    structures : dict(str -> PDB)
+        Dictionary containing loaded structures.
+        Keys (PDB identifiers) will be lower-case.
+
+    Raises
+    ------
+    ResourceError
+        Raised if raise_missing is True and any of the given
+        PDB IDs cannot be loaded.
+    """
+    if modeldb_type not in AVAILABLE_MODEL_DB_TYPES:
+        raise InvalidParameterError(
+            f"Model DB type {modeldb_type} not available, valid options are: {', '.join(AVAILABLE_MODEL_DB_TYPES)}"
+        )
+
+    if structure_dir is not None:
+        raise NotImplementedError(
+            "Local file retrieval currently not implemented"
+        )
+
+    # implement database-specific retrieval behaviour here
     if modeldb_type == "alphafolddb_v4":
-        if structure_dir is not None:
-            raise NotImplementedError("Local file retrieval not implemented")
+        make_download_url = lambda model_id: ALPHAFOLDDB_DOWNLOAD_URL.format(id=model_id)
 
-        pass
-    else:
-        raise InvalidParameterError("Invalid modeldb_type")
+    structures = {}
+
+    for model_id in model_ids:
+        try:
+            data = get(
+                make_download_url(model_id)
+            )
+
+            structures[model_id] = PDB(
+                StringIO(data.text), binary=False
+            )
+        except ResourceError as e:
+            if raise_missing:
+                raise
 
-    return {}
+    return structures
 
 
 def models(**kwargs):
     """
-    # TODO: document parameters/return values
+    Protocol:
+    Compare ECs for single proteins (or domains)
+    to 3D structure models
+
+    Parameters
+    ----------
+    Mandatory kwargs arguments:
+        See list below in code where calling check_required
+
+    Returns
+    -------
+    outcfg : dict
+        Output configuration of the pipeline
     """
     check_required(
         kwargs,
         [
             "prefix", "ec_file", "modeldb_type", "modeldb_file_dir",
-            # "min_sequence_distance",
-            # "pdb_mmtf_dir", "atom_filter", "compare_multimer",
-            # "distance_cutoff", "target_sequence_file",
-            # "scale_sizes",
+            "atom_filter", "distance_cutoff", "min_sequence_distance",
+            "target_sequence_file",
         ]
     )
 
     prefix = kwargs["prefix"]
     outcfg = {
+        "model_ec_compared_all_file": prefix + "_model_CouplingScoresCompared_all.csv",
+        "model_ec_compared_longrange_file": prefix + "_model_CouplingScoresCompared_longrange.csv",
         "model_structure_hits_file": prefix + "_model_hits.csv",
         "model_structure_hits_unfiltered_file": prefix + "_model_hits_unfiltered.csv",
+
+        # cannot have the distmap files end with "_file" because there are
+        # two files (.npy and .csv), which would cause problems with automatic
+        # checking if those files exist
+        "model_distmap_monomer": prefix + "_model_distance_map_monomer",
+        # residue map for all individual distance maps before aggregation
+        "model_distmap_monomer_residues_file": prefix + "_model_distance_map_monomer_residues.csv",
     }
 
     # make sure EC file exists
@@ -758,9 +828,129 @@ def models(**kwargs):
         raise_missing=False
     )
 
-    # TODO: implement remaining logic
-    # TODO: implement structure mapping
-    # print(structures)
+    if len(sifts_map.hits) > 0:
+        d_intra = intra_dists(
+            sifts_map, structures, atom_filter=kwargs["atom_filter"],
+            output_prefix=aux_prefix + "model_distmap_intra"
+        )
+
+        residue_table_filename, dist_mat_filename = d_intra.to_file(outcfg["model_distmap_monomer"])
+
+        # store residue map (monomer)
+        d_intra.aggregated_residue_maps.to_csv(
+            outcfg["model_distmap_monomer_residues_file"], index=False
+        )
+
+        # TODO: for now, create additional entries rather than removing distmap_monomer for compatibility reasons,
+        # but eventually drop the one above
+        outcfg["model_distmap_monomer_files"] = {
+            residue_table_filename: {"file_type": "residue_table"},
+            dist_mat_filename: {"file_type": "distance_matrix"}
+        }
+
+        d_intra_individual_maps = d_intra.individual_distance_map_table
+        # also store individual intra distance matrices (should always be present)
+        if d_intra_individual_maps is not None:
+            outcfg["model_distmap_monomer_individual_files"] = _individual_distance_map_config_result(
+                d_intra_individual_maps
+            )
+
+        # save contacts to separate file
+        outcfg["model_monomer_contacts_file"] = prefix + "_model_contacts_monomer.csv"
+        d_intra.contacts(
+            kwargs["distance_cutoff"]
+        ).to_csv(
+            outcfg["model_monomer_contacts_file"], index=False
+        )
+
+        # at this point, also create remapped structures (e.g. for
+        # later comparison of folding results)
+        verify_resources(
+            "Target sequence file does not exist",
+            kwargs["target_sequence_file"]
+        )
+
+        # create target sequence map for remapping structure
+        with open(kwargs["target_sequence_file"]) as f:
+            header, seq = next(read_fasta(f))
+
+        seq_id, seq_start, seq_end = parse_header(header)
+        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))
+
+        # remap structures, swap mapping index and filename in
+        # dictionary so we have a list of files in the dict keys
+        #
+        # remapped structures have side chains taken off and changed
+        # residue types, since e.g. maxcluster cannot handle mismatches
+        # well. Also create structures that are just renumbered (but have
+        # original side chains and residue names) for visualization asf.
+        for name, sequence_map, atom_filter in [
+            ("remapped", seqmap, ("N", "CA", "C", "O")),
+            ("renumbered", None, None)
+        ]:
+            outcfg[name + "_model_pdb_files"] = {
+                filename: mapping_index for mapping_index, filename in
+                remap_chains(
+                    sifts_map,
+                    "{}_{}".format(aux_prefix, name),
+                    structures=structures,
+                    sequence=sequence_map,
+                    atom_filter=atom_filter
+                ).items()
+            }
+    else:
+        # if no structures, can not compute distance maps
+        d_intra = None
+        outcfg["model_distmap_monomer"] = None
+        outcfg["model_distmap_monomer_residues_file"] = None
+        outcfg["remapped_model_pdb_files"] = None
+        outcfg["renumbered_model_pdb_files"] = None
+
+    # Step 3: Compare ECs to distance maps
+
+    ec_table = pd.read_csv(kwargs["ec_file"])
+
+    # identify number of sites in EC model
+    num_sites = len(
+        set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))
+    )
+
+    for out_file, min_seq_dist in [
+        ("model_ec_compared_longrange_file", kwargs["min_sequence_distance"]),
+        ("model_ec_compared_all_file", 0),
+    ]:
+        # compare ECs only if we minimally have intra distance map
+        if d_intra is not None:
+            coupling_scores_compared(
+                ec_table, d_intra, None,
+                dist_cutoff=kwargs["distance_cutoff"],
+                output_file=outcfg[out_file],
+                min_sequence_dist=min_seq_dist,
+                score="score"
+            )
+        else:
+            outcfg[out_file] = None
+
+    # also create line-drawing script if we made the csv
+    if outcfg["model_ec_compared_longrange_file"] is not None:
+        ecs_longrange = pd.read_csv(outcfg["model_ec_compared_longrange_file"])
+
+        outcfg["model_ec_lines_compared_pml_file"] = prefix + "_model_draw_ec_lines_compared.pml"
+        pairs.ec_lines_pymol_script(
+            ecs_longrange.iloc[:num_sites, :],
+            outcfg["model_ec_lines_compared_pml_file"],
+            distance_cutoff=kwargs["distance_cutoff"],
+            score_column="score"
+        )
+
+    # Step 4: Make contact map plots
+    # if no structures available, defaults to EC-only plot
+    outcfg["model_contact_map_files"] = _make_contact_maps(
+        ec_table, d_intra, None, sifts_map, **{
+            **kwargs,
+            "prefix": kwargs["prefix"] + "_model"
+        }
+    )
 
     return outcfg
 

From 3a14a63f528e1a027a576d0c3d6981645ebcfc32 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Sun, 10 Nov 2024 15:42:40 +0100
Subject: [PATCH 5/8] fix file names and no plotting if no structures

---
 evcouplings/compare/protocol.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
index 7fc57f4b..abcda3cc 100644
--- a/evcouplings/compare/protocol.py
+++ b/evcouplings/compare/protocol.py
@@ -888,7 +888,7 @@ def models(**kwargs):
             ("remapped", seqmap, ("N", "CA", "C", "O")),
             ("renumbered", None, None)
         ]:
-            outcfg[name + "_model_pdb_files"] = {
+            outcfg[f"model_{name}_pdb_files"] = {
                 filename: mapping_index for mapping_index, filename in
                 remap_chains(
                     sifts_map,
@@ -903,8 +903,8 @@ def models(**kwargs):
         d_intra = None
         outcfg["model_distmap_monomer"] = None
         outcfg["model_distmap_monomer_residues_file"] = None
-        outcfg["remapped_model_pdb_files"] = None
-        outcfg["renumbered_model_pdb_files"] = None
+        outcfg["model_remapped_pdb_files"] = None
+        outcfg["model_renumbered_pdb_files"] = None
 
     # Step 3: Compare ECs to distance maps
 
@@ -943,14 +943,15 @@ def models(**kwargs):
             score_column="score"
         )
 
-    # Step 4: Make contact map plots
-    # if no structures available, defaults to EC-only plot
-    outcfg["model_contact_map_files"] = _make_contact_maps(
-        ec_table, d_intra, None, sifts_map, **{
-            **kwargs,
-            "prefix": kwargs["prefix"] + "_model"
-        }
-    )
+    # Step 4: Make contact map plots;
+    # unlike in standard protocol, only make plots of structures available
+    if len(sifts_map.hits) > 0:
+        outcfg["model_contact_map_files"] = _make_contact_maps(
+            ec_table, d_intra, None, sifts_map, **{
+                **kwargs,
+                "prefix": kwargs["prefix"] + "_model"
+            }
+        )
 
     return outcfg
 

From ff159142834b32f5ba3fabfbbea1880cba613c1c Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Mon, 11 Nov 2024 16:20:43 +0100
Subject: [PATCH 6/8] allow to specify model comparison settings via config

---
 config/sample_config_monomer.txt | 17 +++++++++++++++++
 evcouplings/compare/protocol.py  | 32 ++++++++++++++------------------
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt
index e1260d1d..be1f599c 100644
--- a/config/sample_config_monomer.txt
+++ b/config/sample_config_monomer.txt
@@ -284,6 +284,17 @@ compare:
     # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
     pdb_alignment_method: jackhmmer
 
+    # Uncomment to enable comparison to predicted models (e.g. AlphaFoldDB, as specified in databases section below);
+    # will inherit configuration as passed to compare stage, with any keys specified in compared_to_models subsection
+    # taking precedence
+    compare_models:
+        max_num_hits: 1
+        pdb_alignment_method: jackhmmer
+        use_bitscores: True
+        domain_threshold: 1.5
+        sequence_threshold: 1.5
+        iterations: 1
+
 # Settings for Mutation effect predictions
 mutate:
     # Options: standard
@@ -414,6 +425,12 @@ databases:
     sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.csv
     sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.o2.fasta
 
+    # Mapping onto predicted 3D structure models
+    # modeldb_type: alphafolddb_v4
+    # modeldb_sequence_file: /n/groups/marks/databases/alphafolddb/2022-11-14/sequences.fasta
+    # modeldb_list_file: /n/groups/marks/databases/alphafolddb/2022-11-14/accession_ids.csv
+    # modeldb_file_dir:
+
 # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
 tools:
     jackhmmer: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer
diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py
index abcda3cc..b25825f1 100644
--- a/evcouplings/compare/protocol.py
+++ b/evcouplings/compare/protocol.py
@@ -642,7 +642,7 @@ def _identify_predicted_structures(**kwargs):
         kwargs,
         [
             "modeldb_type", "modeldb_sequence_file", "modeldb_list_file",
-            "model_max_num_hits",
+            "max_num_hits",
         ]
     )
 
@@ -672,17 +672,13 @@ def _identify_predicted_structures(**kwargs):
         sequence_file=kwargs["modeldb_sequence_file"]
     )
 
-    # run sequence-based query against model database, always use jackhmmer for this
-    sifts_map = s.by_alignment(**{
-        **kwargs,
-        "pdb_alignment_method": "jackhmmer"
-    })
-
+    # run sequence-based query against model database
+    sifts_map = s.by_alignment(**kwargs)
     sifts_map_full = deepcopy(sifts_map)
 
     # reduce number of structures/hits
-    if kwargs["model_max_num_hits"] is not None:
-        sifts_map.hits = sifts_map.hits.iloc[:kwargs["model_max_num_hits"]]
+    if kwargs["max_num_hits"] is not None:
+        sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]]
 
     return sifts_map, sifts_map_full
 
@@ -1211,16 +1207,16 @@ def standard(**kwargs):
         ec_table, d_intra, d_multimer, sifts_map, **kwargs
     )
 
-    # Step 5: check if comparison to models is enabled, then run this protocol as well
-    if kwargs.get("compare_to_models"):
-        # create subdirectory for running models comparison
-        # aux_prefix_models = insert_dir(
-        #     prefix, "models", rootname_subdir=False
-        # )
-        # create_prefix_folders(aux_prefix_models)
-
+    # Step 5: check if comparison to models is enabled (optional argument), then run this protocol as well
+    models_config = kwargs.get("compare_models")
+    if models_config is not None:
         # apply protocol and update output
-        outcfg_models = models(**kwargs)
+        outcfg_models = models(
+            **{
+                **kwargs,
+                **models_config,
+            }
+        )
         outcfg = {
             **outcfg,
             **outcfg_models,

From 95e56fb553a958b06fd7c9c94014a52a1a233340 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Mon, 11 Nov 2024 16:26:29 +0100
Subject: [PATCH 7/8] comment out model comparison settings by default

---
 config/sample_config_monomer.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt
index be1f599c..fa02863d 100644
--- a/config/sample_config_monomer.txt
+++ b/config/sample_config_monomer.txt
@@ -287,13 +287,13 @@ compare:
     # Uncomment to enable comparison to predicted models (e.g. AlphaFoldDB, as specified in databases section below);
     # will inherit configuration as passed to compare stage, with any keys specified in compared_to_models subsection
     # taking precedence
-    compare_models:
-        max_num_hits: 1
-        pdb_alignment_method: jackhmmer
-        use_bitscores: True
-        domain_threshold: 1.5
-        sequence_threshold: 1.5
-        iterations: 1
+    # compare_models:
+    #     max_num_hits: 1
+    #     pdb_alignment_method: jackhmmer
+    #     use_bitscores: True
+    #     domain_threshold: 1.5
+    #     sequence_threshold: 1.5
+    #     iterations: 1
 
 # Settings for Mutation effect predictions
 mutate:

From 745d180a62628579d59928df2a002e7808c10bb2 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Wed, 13 Nov 2024 17:34:06 +0100
Subject: [PATCH 8/8] version bump

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 63bda4e4..6441e79d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "evcouplings"
-version = "0.2.1"
+version = "0.3.0"
 description = "A Framework for evolutionary couplings analysis"
 readme = "README.md"
 license = "MIT"