From 0e64dfdac1279ebe3820f87813d4ca7f87816fed Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Thu, 16 Mar 2023 13:46:17 +0100 Subject: [PATCH 1/8] Add support for computational structure entries --- src/biotite/database/rcsb/query.py | 51 ++++++++++++++++++++++++++---- tests/database/test_rcsb.py | 33 ++++++++++++++++++- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index d8d9e4830..62211a421 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -33,6 +33,7 @@ class Query(metaclass=abc.ABCMeta): This is the abstract base class for all queries. """ + @abc.abstractmethod def get_content(self): """ @@ -41,6 +42,11 @@ def get_content(self): This content is converted into JSON by the :func:`search` and :func:`count` methods. + + Returns + ------- + content : dict + The content dictionary for the ``'query'`` attributes. """ pass @@ -449,7 +455,7 @@ def get_content(self): return content -def count(query, return_type="entry"): +def count(query, return_type="entry", content_types=("experimental",)): """ Count PDB entries that meet the given query requirements, via the RCSB search API. @@ -470,6 +476,14 @@ def count(query, return_type="entry"): - ``'non_polymer_entity'``: All matching non-polymeric entities are counted. - ``'polymer_instance'``: All matching chains are counted. + content_types : iterable of {"experimental", "computational"}, optional + Specify whether experimental and computational structures should + be included. + At least one of them needs to be specified. + By default only experimental structures are included. + Note, that identifiers for computational structures cannot be + downloaded via :func:`biotite.database.rcsb.fetch()` as they + point to *AlphaFold DB* and *ModelArchive*. 
Returns ------- @@ -492,13 +506,20 @@ def count(query, return_type="entry"): "polymer_entity", "non_polymer_entity", ]: raise ValueError(f"'{return_type}' is an invalid return type") + + request_options = {"return_counts": True} + + if len(content_types) == 0: + raise ValueError("At least one content type must be specified") + for content_type in content_types: + if content_type not in ("experimental", "computational"): + raise ValueError(f"Unknown content type '{content_type}'") + request_options["results_content_type"] = content_types query_dict = { "query": query.get_content(), "return_type": return_type, - "request_options": { - "return_counts": True - } + "request_options": request_options } r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) @@ -515,7 +536,7 @@ def count(query, return_type="entry"): raise RequestError(f"Error {r.status_code}") -def search(query, return_type="entry", range=None, sort_by=None): +def search(query, return_type="entry", range=None, sort_by=None, content_types=("experimental",)): """ Get all PDB IDs that meet the given query requirements, via the RCSB search API. @@ -547,11 +568,22 @@ def search(query, return_type="entry", range=None, sort_by=None): The range is zero-indexed and the stop value is exclusive. sort_by : str, optional If specified, the returned PDB IDs are sorted by the values - of the given field name in descending order. + of the given field name. A complete list of the available fields is documented at ``_. and ``_. + If a string is given sorting is performed in descending order. + To choose the order a :class:`Sorting` object needs to be + provided. + content_types : iterable of {"experimental", "computational"}, optional + Specify whether experimental and computational structures should + be included. + At least one of them needs to be specified. + By default only experimental structures are included. 
+ Note, that identifiers for computational structures cannot be + downloaded via :func:`biotite.database.rcsb.fetch()` as they + point to *AlphaFold DB* and *ModelArchive*. Returns ------- @@ -584,6 +616,13 @@ def search(query, return_type="entry", range=None, sort_by=None): if sort_by is not None: request_options["sort"] = [{"sort_by": sort_by}] + + if len(content_types) == 0: + raise ValueError("At least one content type must be specified") + for content_type in content_types: + if content_type not in ("experimental", "computational"): + raise ValueError(f"Unknown content type '{content_type}'") + request_options["results_content_type"] = content_types if range is None: request_options["return_all_hits"] = True diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 3626afa61..effc3403a 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -141,6 +141,7 @@ def test_search_sequence(): ref_sequence, "protein", min_identity=IDENTIY_CUTOFF ) test_ids = rcsb.search(query) + assert test_ids >= 2 for id in test_ids: fasta_file = fasta.FastaFile.read(rcsb.fetch(id, "fasta")) @@ -172,7 +173,7 @@ def test_search_motif(): MOTIF = "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H." 
query = rcsb.MotifQuery(MOTIF, "prosite", "protein") test_count = rcsb.count(query) - assert test_count == pytest.approx(558, rel=0.1) + assert test_count == pytest.approx(580, rel=0.1) @pytest.mark.skipif( @@ -258,6 +259,36 @@ def test_search_sort(): assert resolutions == list(reversed(sorted(resolutions))) +def test_search_content_types(): + # Query to limit the number of returned results + # for improved performance + query = rcsb.FieldQuery( + "rcsb_entity_host_organism.scientific_name", + exact_match="Homo sapiens" + ) + experimental_set = set(rcsb.search(query, content_types=["experimental"])) + computational_set = set(rcsb.search(query, content_types=["computational"])) + combined_set = set(rcsb.search(query, content_types=["experimental", "computational"])) + + # If there are no results, the following tests make no sense + assert len(combined_set) > 0 + # There should be no common elements + assert len(experimental_set & computational_set) == 0 + # The combined search should include the contents of both searches + assert len(experimental_set | computational_set) == len(combined_set) + + assert rcsb.count(query, content_types=["experimental"]) == len(experimental_set) + assert rcsb.count(query, content_types=["computational"]) == len(computational_set) + assert rcsb.count(query, content_types=["experimental", "computational"]) == len(combined_set) + + # Expect an exception if no content_type + with pytest.raises(ValueError): + rcsb.search(query, content_types=[]) + with pytest.raises(ValueError): + rcsb.count(query, content_types=[]) + + + @pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" From cfb3578bf40692a8048b9a4ad9c3f8f6614f24bb Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Thu, 16 Mar 2023 15:30:19 +0100 Subject: [PATCH 2/8] Improve requests with sorting and grouping --- src/biotite/database/rcsb/query.py | 265 +++++++++++++++++++++++------ tests/database/test_rcsb.py | 3 +- 2 files changed, 217 insertions(+), 
51 deletions(-) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 62211a421..5609b09d6 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -7,6 +7,8 @@ __all__ = ["Query", "SingleQuery", "CompositeQuery", "BasicQuery", "FieldQuery", "SequenceQuery", "StructureQuery", "MotifQuery", + "Sorting", + "Grouping", "DepositGrouping", "IdentityGrouping", "UniprotGrouping", "search", "count"] import abc @@ -33,7 +35,6 @@ class Query(metaclass=abc.ABCMeta): This is the abstract base class for all queries. """ - @abc.abstractmethod def get_content(self): """ @@ -41,7 +42,7 @@ def get_content(self): ``'query'`` attribute in the RCSB search API. This content is converted into JSON by the :func:`search` - and :func:`count` methods. + and :func:`count` functions. Returns ------- @@ -68,7 +69,6 @@ class SingleQuery(Query, metaclass=abc.ABCMeta): This is the abstract base class for all queries that are terminal nodes. """ - @abc.abstractmethod def get_content(self): return {"parameters": {}} @@ -455,7 +455,164 @@ def get_content(self): return content -def count(query, return_type="entry", content_types=("experimental",)): + + +class Sorting: + + def __init__(self, field, descending=True): + self._field = field + self._descending = descending + + @property + def field(self): + return self._field + + @property + def descending(self): + return self._descending + + def get_content(self): + """ + Get the sorting content, i.e. the data belonging to the + ``'sort'`` and ``'ranking_criteria_type'`` attributes in the + RCSB search API. + + This content is converted into JSON by the :func:`search` + function. + + Returns + ------- + content : dict + The content dictionary for the ``'sort'`` and + ``'ranking_criteria_type'`` attributes. 
+ """ + direction = "desc" if self._descending else "asc" + return { + "sort_by" : self._field, + "direction" : direction + } + + + + +class Grouping(metaclass=abc.ABCMeta): + """ + A representation of the JSON grouping options of the RCSB search + API. + + Parameters + ---------- + sort_by : str or Sorting, optional + If specified, the returned PDB IDs are sorted by the values + of the given field name. + A complete list of the available fields is documented at + ``_. + and + ``_. + If a string is given, sorting is performed in descending order. + To choose the order a :class:`Sorting` object needs to be + provided. + + Attributes + ---------- + sorting : Sorting + The sorting of the :class:`Grouping`. + """ + + def __init__(self, sort_by=None): + if sort_by is None: + self._sorting = None + elif isinstance(sort_by, Sorting): + self._sorting = sort_by + else: + self._sorting = Sorting(sort_by) + + @abc.abstractmethod + def get_content(self): + """ + Get the grouping content, i.e. the data belonging to the + ``'group_by'`` attribute in the RCSB search API. + + This content is converted into JSON by the :func:`search` + and :func:`count` functions. + + ABSTRACT: Override when inheriting. + + Returns + ------- + content : dict + The content dictionary for the ``'group_by'`` attributes. + """ + if self._sorting is not None: + return {"ranking_criteria_type" : self.sorting} + else: + return {} + + @abc.abstractmethod + def is_compatible_return_type(self, return_type): + """ + Check whether this :class:`Group` is compatible with the + RCSB search API ``return_type``. + + ABSTRACT: Override when inheriting. + + Parameter + --------- + return_type : str + The ``return_type`` attribute to be checked. + + Returns + ------- + is_compatible : bool + True, if this :class:`Group` is compatible with the + `return_type`, false otherwise. 
+ """ + pass + + +class DepositGrouping(Grouping): + + def get_content(self): + content = super().get_content() + content["aggregation_method"] = "matching_deposit_group_id" + return content + + def is_compatible_return_type(self, return_type): + return return_type == "entry" + + +class IdentityGrouping(Grouping): + + def __init__(self, similarity_cutoff, sort_by=None): + super().__init__(sort_by) + self._similarity_cutoff = similarity_cutoff + + def get_content(self): + content = super().get_content() + content["aggregation_method"] = "sequence_identity" + content["similarity_cutoff"] = str(self.self._similarity_cutoff) + return content + + def is_compatible_return_type(self, return_type): + return return_type == "polymer_entity" + + +class UniprotGrouping(Grouping): + + def get_content(self): + content = super().get_content() + content["aggregation_method"] = "matching_uniprot_accession" + return content + + def is_compatible_return_type(self, return_type): + return return_type == "polymer_entity" + + + + + +def count(query, return_type="entry", group_by=None, + content_types=("experimental",)): """ Count PDB entries that meet the given query requirements, via the RCSB search API. 
@@ -501,26 +658,12 @@ def count(query, return_type="entry", content_types=("experimental",)): >>> print(sorted(ids)) ['1EJG', '1I0T', '2GLT', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H'] """ - if return_type not in [ - "entry", "polymer_instance", "assembly", - "polymer_entity", "non_polymer_entity", - ]: - raise ValueError(f"'{return_type}' is an invalid return type") - - request_options = {"return_counts": True} + query_dict = _initialize_query_dict( + query, return_type, group_by, content_types + ) - if len(content_types) == 0: - raise ValueError("At least one content type must be specified") - for content_type in content_types: - if content_type not in ("experimental", "computational"): - raise ValueError(f"Unknown content type '{content_type}'") - request_options["results_content_type"] = content_types + query_dict["request_options"]["return_counts"] = True - query_dict = { - "query": query.get_content(), - "return_type": return_type, - "request_options": request_options - } r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) if r.status_code == 200: @@ -536,7 +679,8 @@ def count(query, return_type="entry", content_types=("experimental",)): raise RequestError(f"Error {r.status_code}") -def search(query, return_type="entry", range=None, sort_by=None, content_types=("experimental",)): +def search(query, return_type="entry", range=None, sort_by=None, group_by=None, + return_groups=False, content_types=("experimental",)): """ Get all PDB IDs that meet the given query requirements, via the RCSB search API. @@ -562,11 +706,11 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=( (more exactly ``'asym_id'``) is returned (e.g. ``'XXXX.A'``). range : tuple(int, int), optional - If this parameter is specified, the only PDB IDs in this range + If this parameter is specified, only PDB IDs in this range are selected from all matching PDB IDs and returned (pagination). 
The range is zero-indexed and the stop value is exclusive. - sort_by : str, optional + sort_by : str or Sorting, optional If specified, the returned PDB IDs are sorted by the values of the given field name. A complete list of the available fields is documented at @@ -574,7 +718,7 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=( and ``_. If a string is given sorting is performed in descending order. - To choose the order a :class:`Sorting` object needs to be + To choose the order, a :class:`Sorting` object needs to be provided. content_types : iterable of {"experimental", "computational"}, optional Specify whether experimental and computational structures should @@ -606,40 +750,32 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=( >>> print(sorted(search(query, return_type="polymer_instance"))) ['1EJG.A', '1I0T.A', '1I0T.B', '2GLT.A', '3NIR.A', '3P4J.A', '3P4J.B', '4JLJ.A', '4JLJ.B', '5D8V.A', '5NW3.A', '7ATG.A', '7ATG.B', '7R0H.A'] """ - if return_type not in [ - "entry", "polymer_instance", "assembly", - "polymer_entity", "non_polymer_entity", - ]: - raise ValueError(f"'{return_type}' is an invalid return type") - - request_options = {} + query_dict = _initialize_query_dict( + query, return_type, group_by, content_types + ) + + if return_groups is True: + query_dict["request_options"]["group_by_return_type"] = "groups" + else: + query_dict["request_options"]["group_by_return_type"] = "representatives" if sort_by is not None: - request_options["sort"] = [{"sort_by": sort_by}] - - if len(content_types) == 0: - raise ValueError("At least one content type must be specified") - for content_type in content_types: - if content_type not in ("experimental", "computational"): - raise ValueError(f"Unknown content type '{content_type}'") - request_options["results_content_type"] = content_types + if isinstance(sort_by, Sorting): + sorting = sort_by + else: + sorting = Sorting(sort_by) + 
query_dict["request_options"]["sort"] = [sorting.get_content()] if range is None: - request_options["return_all_hits"] = True + query_dict["request_options"]["return_all_hits"] = True elif range[1] <= range[0]: raise ValueError("Range stop must be greater than range start") else: - request_options["paginate"] = { + query_dict["request_options"]["paginate"] = { "start": int(range[0]), "rows": int(range[1]) - int(range[0]) } - query_dict = { - "query": query.get_content(), - "return_type": return_type, - "request_options": request_options - } - r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) if r.status_code == 200: @@ -655,6 +791,37 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=( raise RequestError(f"Error {r.status_code}") +def _initialize_query_dict(query, return_type, group_by, return_groups, content_types): + """ + Initialize the request parameter dictionary with attributes that + `count()` and `search()` have in common. + """ + if return_type not in [ + "entry", "polymer_instance", "assembly", + "polymer_entity", "non_polymer_entity", + ]: + raise ValueError(f"'{return_type}' is an invalid return type") + + request_options = {} + + if len(content_types) == 0: + raise ValueError("At least one content type must be specified") + for content_type in content_types: + if content_type not in ("experimental", "computational"): + raise ValueError(f"Unknown content type '{content_type}'") + request_options["results_content_type"] = content_types + + if group_by is not None: + request_options["group_by"] = group_by.get_content() + + query_dict = { + "query": query.get_content(), + "return_type": return_type, + "request_options": request_options + } + return query_dict + + def _to_isoformat(object): """ Convert a datetime into the specifc ISO 8601 format required by the RCSB. 
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index effc3403a..49338bcbc 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -141,7 +141,7 @@ def test_search_sequence(): ref_sequence, "protein", min_identity=IDENTIY_CUTOFF ) test_ids = rcsb.search(query) - assert test_ids >= 2 + assert len(test_ids) >= 2 for id in test_ids: fasta_file = fasta.FastaFile.read(rcsb.fetch(id, "fasta")) @@ -288,7 +288,6 @@ def test_search_content_types(): rcsb.count(query, content_types=[]) - @pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" From 466d6bd6c97e620be6f69977d40b9692c882f0fb Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Thu, 16 Mar 2023 16:41:51 +0100 Subject: [PATCH 3/8] Add tests and fixes --- src/biotite/database/rcsb/query.py | 34 ++++++++---- tests/database/test_rcsb.py | 85 ++++++++++++++++++++++++++++-- 2 files changed, 106 insertions(+), 13 deletions(-) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 5609b09d6..741fc6220 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -544,7 +544,7 @@ def get_content(self): The content dictionary for the ``'group_by'`` attributes. 
""" if self._sorting is not None: - return {"ranking_criteria_type" : self.sorting} + return {"ranking_criteria_type" : self._sorting.get_content()} else: return {} @@ -585,12 +585,16 @@ class IdentityGrouping(Grouping): def __init__(self, similarity_cutoff, sort_by=None): super().__init__(sort_by) + if similarity_cutoff not in (100, 95, 90, 70, 50, 30): + raise ValueError( + f"A similarity cutoff of {similarity_cutoff}% is not supported" + ) self._similarity_cutoff = similarity_cutoff def get_content(self): content = super().get_content() content["aggregation_method"] = "sequence_identity" - content["similarity_cutoff"] = str(self.self._similarity_cutoff) + content["similarity_cutoff"] = self._similarity_cutoff return content def is_compatible_return_type(self, return_type): @@ -667,7 +671,10 @@ def count(query, return_type="entry", group_by=None, r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) if r.status_code == 200: - return r.json()["total_count"] + if group_by is None: + return r.json()["total_count"] + else: + return r.json()["group_by_count"] elif r.status_code == 204: # Search did not return any results return 0 @@ -754,10 +761,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, query, return_type, group_by, content_types ) - if return_groups is True: - query_dict["request_options"]["group_by_return_type"] = "groups" - else: - query_dict["request_options"]["group_by_return_type"] = "representatives" + if group_by is not None: + if return_groups: + query_dict["request_options"]["group_by_return_type"] \ + = "groups" + else: + query_dict["request_options"]["group_by_return_type"] \ + = "representatives" if sort_by is not None: if isinstance(sort_by, Sorting): @@ -779,7 +789,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, r = requests.get(_search_url, params={"json": json.dumps(query_dict)}) if r.status_code == 200: - return [result["identifier"] for result in 
r.json()["result_set"]] + if group_by is None or not return_groups: + return [result["identifier"] for result in r.json()["result_set"]] + else: + return [ + [result["identifier"] for result in group["result_set"]] + for group in r.json()["group_set"] + ] elif r.status_code == 204: # Search did not return any results return [] @@ -791,7 +807,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, raise RequestError(f"Error {r.status_code}") -def _initialize_query_dict(query, return_type, group_by, return_groups, content_types): +def _initialize_query_dict(query, return_type, group_by, content_types): """ Initialize the request parameter dictionary with attributes that `count()` and `search()` have in common. diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 49338bcbc..37fd49d0b 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -243,22 +243,35 @@ def test_search_range(seed): cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" ) -def test_search_sort(): +@pytest.mark.parametrize("as_sorting_object", [False, True]) +def test_search_sort(as_sorting_object): query = rcsb.FieldQuery( "rcsb_entity_host_organism.scientific_name", exact_match="Homo sapiens" ) - entries = rcsb.search(query, sort_by="reflns.d_resolution_high") + if as_sorting_object: + sort_by = rcsb.Sorting("reflns.d_resolution_high", descending=False) + else: + sort_by = "reflns.d_resolution_high" + entries = rcsb.search(query, sort_by=sort_by) resolutions = [] for pdb_id in entries[:5]: pdbx_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx")) resolutions.append(float(pdbx_file["reflns"]["d_resolution_high"])) - # Check if values are sorted in descending order - assert resolutions == list(reversed(sorted(resolutions))) + if as_sorting_object: + # In the tested case the Sorting object uses ascending order + assert resolutions == list(sorted(resolutions)) + else: + # Check if values are sorted in descending order 
+ assert resolutions == list(reversed(sorted(resolutions))) +@pytest.mark.skipif( + cannot_connect_to(RCSB_URL), + reason="RCSB PDB is not available" +) def test_search_content_types(): # Query to limit the number of returned results # for improved performance @@ -288,6 +301,70 @@ def test_search_content_types(): rcsb.count(query, content_types=[]) +@pytest.mark.skipif( + cannot_connect_to(RCSB_URL), + reason="RCSB PDB is not available" +) +def test_search_identity_grouping(): + """ + Expect the same result as the example in the RCSB search API + tutorial. + """ + REF_GROUPS = set([ + ('1ZHM_1',), + ( + '3P8X_1', '7QPP_1', '3X36_1', '3CS6_1', '3CS4_1', '3A78_1', + '3A40_1', '3A3Z_1', '2HB8_1', '2HB7_1', '2HAS_1', '2HAR_1', + '2HAM_1', '1TXI_1', '4G2I_1', '3TKC_1', '3OGT_1', '3KPZ_1', + '1IE9_1', '1IE8_1', '1DB1_1', '5YT2_1', '5YSY_1', '5GT4_1', + '3WGP_1', '3W0Y_1', '3W0C_1', '3W0A_1', '3AZ3_1', '3AZ2_1', + '3AZ1_1' + ), + ('3D44_1',), + ('6RA4_1',), + ('3B9V_1',), + ('2FC0_1', '2FBY_1'), + ('5GJH_1',), + ('2IGP_1',), + ('5LF7_13', '5LF4_13', '5LF1_13', '5LEY_13', '5LE5_13'), + ('1GBU_2',) + ]) + REF_COUNT = 9597 + + query = ( + rcsb.FieldQuery( + "rcsb_entity_source_organism.taxonomy_lineage.name", + exact_match="Homo sapiens" + ) + & rcsb.FieldQuery( + "exptl.method", + exact_match="X-RAY DIFFRACTION" + ) + & rcsb.FieldQuery( + "rcsb_entry_info.resolution_combined", + range_closed=(1.0, 2.0) + ) + ) + grouping = rcsb.IdentityGrouping( + 100, sort_by="entity_poly.rcsb_sample_sequence_length" + ) + + test_groups = rcsb.search( + query, "polymer_entity", + group_by=grouping, return_groups=True + ) + test_representatives = rcsb.search( + query, "polymer_entity", + group_by=grouping, return_groups=False + ) + test_count = rcsb.count(query, "polymer_entity", group_by=grouping) + + # List is not hashable + assert set([tuple(group) for group in test_groups]) == REF_GROUPS + assert set(test_representatives) == [group[0] for group in REF_GROUPS] + assert test_count == 
pytest.approx(REF_COUNT, rel = 0.1) + + @pytest.mark.skipif( cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" From 206c3c004a0ce70fc60d0772e75a33bd1a3cebee Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 17 Mar 2023 13:41:47 +0100 Subject: [PATCH 4/8] Add new classes to correct category in API documentation --- doc/apidoc.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/apidoc.json b/doc/apidoc.json index 5138cdfbc..755518839 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -48,6 +48,13 @@ "MotifQuery", "StructureQuery" ], + "Sorting and grouping" : [ + "Sorting", + "Grouping", + "DepositGrouping", + "IdentityGrouping", + "UniprotGrouping" + ], "Search and fetch" : [ "count", "search", From d17d5415b6f9449139183d3a08c899755da60eb8 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 17 Mar 2023 13:42:22 +0100 Subject: [PATCH 5/8] Add docstrings, change return value --- src/biotite/database/rcsb/query.py | 148 +++++++++++++++++++++++++---- 1 file changed, 131 insertions(+), 17 deletions(-) diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py index 741fc6220..ed18c09ce 100644 --- a/src/biotite/database/rcsb/query.py +++ b/src/biotite/database/rcsb/query.py @@ -17,7 +17,7 @@ from datetime import datetime import numpy as np import requests -from ...sequence.seqtypes import ProteinSequence, NucleotideSequence +from ...sequence.seqtypes import NucleotideSequence from ..error import RequestError @@ -503,8 +503,8 @@ class Grouping(metaclass=abc.ABCMeta): Parameters ---------- sort_by : str or Sorting, optional - If specified, the returned PDB IDs are sorted by the values - of the given field name. + If specified, the returned PDB IDs within each group are sorted + by the values of the given field name. A complete list of the available fields is documented at ``_. and @@ -512,11 +512,6 @@ class Grouping(metaclass=abc.ABCMeta): If a string is given, sorting is performed in descending order. 
To choose the order a :class:`Sorting` object needs to be provided. - - Attributes - ---------- - sorting : Sorting - The sorting of the :class:`Grouping`. """ def __init__(self, sort_by=None): @@ -571,6 +566,28 @@ def is_compatible_return_type(self, return_type): class DepositGrouping(Grouping): + """ + This class groups PDB entries if they were deposited as a + collection. + Such a group usually contain the same protein with e.g. a different + bound molecule. + + This :class:`Grouping` is only applicable, if the + :func:`count()`/:func:`search()` return type is set to ``entry``. + + Parameters + ---------- + sort_by : str or Sorting, optional + If specified, the returned PDB IDs within each group are sorted + by the values of the given field name. + A complete list of the available fields is documented at + ``_. + and + ``_. + If a string is given, sorting is performed in descending order. + To choose the order a :class:`Sorting` object needs to be + provided. + """ def get_content(self): content = super().get_content() @@ -582,7 +599,34 @@ def is_compatible_return_type(self, return_type): class IdentityGrouping(Grouping): + """ + This class groups protein chains with a given sequence identity + with each other. + This :class:`Grouping` is only applicable, if the + :func:`count()`/:func:`search()` return type is set to + ``polymer_entity``. + + Parameters + ---------- + similarity_cutoff : {100, 95, 90, 70, 50, 30} + The sequence identity in percent at which the structures are + grouped. + In other words, a returned group contains sequences that have + `similarity_cutoff` sequence identity with each other. + Since the PDB uses precalculated clusters, only certain values + are available. + sort_by : str or Sorting, optional + If specified, the returned PDB IDs within each group are sorted + by the values of the given field name. + A complete list of the available fields is documented at + ``_. + and + ``_. 
+ If a string is given, sorting is performed in descending order. + To choose the order a :class:`Sorting` object needs to be + provided. + """ def __init__(self, similarity_cutoff, sort_by=None): super().__init__(sort_by) if similarity_cutoff not in (100, 95, 90, 70, 50, 30): @@ -602,6 +646,27 @@ def is_compatible_return_type(self, return_type): class UniprotGrouping(Grouping): + """ + This class groups protein chains that point to the same *Uniprot* + accession ID. + + This :class:`Grouping` is only applicable, if the + :func:`count()`/:func:`search()` return type is set to + ``polymer_entity``. + + Parameters + ---------- + sort_by : str or Sorting, optional + If specified, the returned PDB IDs within each group are sorted + by the values of the given field name. + A complete list of the available fields is documented at + ``_. + and + ``_. + If a string is given, sorting is performed in descending order. + To choose the order a :class:`Sorting` object needs to be + provided. + """ def get_content(self): content = super().get_content() @@ -637,6 +702,9 @@ def count(query, return_type="entry", group_by=None, - ``'non_polymer_entity'``: All matching non-polymeric entities are counted. - ``'polymer_instance'``: All matching chains are counted. + group_by : Grouping + If this parameter is set, the number of groups is returned + instead. content_types : iterable of {"experimental", "computational"}, optional Specify whether experimental and computational structures should be included. @@ -648,9 +716,16 @@ def count(query, return_type="entry", group_by=None, Returns ------- - ids : list of str - A list of strings containing all PDB IDs that meet the query - requirements. + count : int + The total number of PDB IDs (or groups) that would be returned + by calling :func:`search()` using the same parameters. + + Notes + ----- + If `group_by` is set, the number of results may be lower than in an + ungrouped query, as grouping is not applicable to all structures. 
+ For example a DNA structure has no associated *Uniprot* accession + and hence is omitted by :class:`UniprotGrouping`. Examples -------- @@ -727,6 +802,17 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, If a string is given sorting is performed in descending order. To choose the order, a :class:`Sorting` object needs to be provided. + group_by : Grouping + If this parameter is set, the PDB IDs that meet the query + requirements, are grouped according to the given criterion. + return_groups : boolean, optional + Only has effect, if `group_by` is set. + By default the representative with the highest rank in each + group is returned. + The rank is determined by the `sort_by` parameter of + :class:`Grouping` provided in `group_by`. + If set to true, groups containing all structures belonging to + the group are returned instead. content_types : iterable of {"experimental", "computational"}, optional Specify whether experimental and computational structures should be included. @@ -738,9 +824,25 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, Returns ------- - ids : list of str - A list of strings containing all PDB IDs that meet the query - requirements. + ids : list of str or dict (str -> list of str) + If `return_groups` is false (default case), a list of strings + containing all PDB IDs that meet the query requirements is + returned. + If `return_groups` is set to true a dictionary of groups is + returned. + This dictionary maps group identifiers to a list of all PDB IDs + belonging to this group. + + Notes + ----- + If `group_by` is set, the number of results may be lower than in an + ungrouped query, as grouping is not applicable to all structures. + For example a DNA structure has no associated *Uniprot* accession + and hence is omitted by :class:`UniprotGrouping`. + + Also note that `sort_by` does not affect the order within a group. 
+ This order is determined by the `sort_by` parameter of the + :class:`Grouping`. Examples -------- @@ -756,6 +858,11 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, ['7ATG', '5NW3', '5D8V'] >>> print(sorted(search(query, return_type="polymer_instance"))) ['1EJG.A', '1I0T.A', '1I0T.B', '2GLT.A', '3NIR.A', '3P4J.A', '3P4J.B', '4JLJ.A', '4JLJ.B', '5D8V.A', '5NW3.A', '7ATG.A', '7ATG.B', '7R0H.A'] + >>> print(search( + ... query, return_type="polymer_entity", return_groups=True, + ... group_by=UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"), + ... )) + {'P24297': ['5NW3_1'], 'P04425': ['2GLT_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']} """ query_dict = _initialize_query_dict( query, return_type, group_by, content_types @@ -792,10 +899,12 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None, if group_by is None or not return_groups: return [result["identifier"] for result in r.json()["result_set"]] else: - return [ - [result["identifier"] for result in group["result_set"]] + return { + group["identifier"] : [ + result["identifier"] for result in group["result_set"] + ] for group in r.json()["group_set"] - ] + } elif r.status_code == 204: # Search did not return any results return [] @@ -828,6 +937,11 @@ def _initialize_query_dict(query, return_type, group_by, content_types): request_options["results_content_type"] = content_types if group_by is not None: + if not group_by.is_compatible_return_type(return_type): + raise ValueError( + f"Return type '{return_type}' is not compatible " + f"with the given Grouping" + ) request_options["group_by"] = group_by.get_content() query_dict = { From 088ae875c45e9d444bc6204729726bb18298a112 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 17 Mar 2023 13:42:30 +0100 Subject: [PATCH 6/8] Add tests --- tests/database/test_rcsb.py | 108 +++++++++++++++++++++++------------- 1 file changed, 
69 insertions(+), 39 deletions(-) diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py index 37fd49d0b..ba8924f95 100644 --- a/tests/database/test_rcsb.py +++ b/tests/database/test_rcsb.py @@ -305,64 +305,94 @@ def test_search_content_types(): cannot_connect_to(RCSB_URL), reason="RCSB PDB is not available" ) -def test_search_identity_grouping(): - """ - Expect the same result as the example in the RCSB search API - tutorial. - """ - REF_GROUPS = set([ - ('1ZHM_1',), +@pytest.mark.parametrize( + "grouping, resolution_threshold, return_type, ref_groups", + [ ( - '3P8X_1', '7QPP_1', '3X36_1', '3CS6_1', '3CS4_1', '3A78_1', - '3A40_1', '3A3Z_1', '2HB8_1', '2HB7_1', '2HAS_1', '2HAR_1', - '2HAM_1', '1TXI_1', '4G2I_1', '3TKC_1', '3OGT_1', '3KPZ_1', - '1IE9_1', '1IE8_1', '1DB1_1', '5YT2_1', '5YSY_1', '5GT4_1', - '3WGP_1', '3W0Y_1', '3W0C_1', '3W0A_1', '3AZ3_1', '3AZ2_1', - '3AZ1_1' + rcsb.IdentityGrouping( + 100, sort_by="rcsb_accession_info.initial_release_date" + ), + 0.7, + "polymer_entity", + set([ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("7VOS_1", "5D8V_1", "3A38_1"), + ("1UCS_1",), + ("3NIR_1", "1EJG_1"), + ]) ), - ('3D44_1',), - ('6RA4_1',), - ('3B9V_1',), - ('2FC0_1', '2FBY_1'), - ('5GJH_1',), - ('2IGP_1',), - ('5LF7_13', '5LF4_13', '5LF1_13', '5LEY_13', '5LE5_13'), - ('1GBU_2',) - ]) - REF_COUNT = 9597 + ( + rcsb.UniprotGrouping( + sort_by="rcsb_accession_info.initial_release_date" + ), + 0.7, + "polymer_entity", + set([ + ("3X2M_1",), + ("6E6O_1",), + ("1YK4_1",), + ("5NW3_1",), + ("1US0_1",), + ("4HP2_1",), + ("2DSX_1",), + ("2VB1_1",), + ("7VOS_1", "5D8V_1", "3A38_1"), + ("1UCS_1",), + ("3NIR_1", "1EJG_1"), + ]) + ), + + ( + rcsb.DepositGrouping( + sort_by="rcsb_accession_info.initial_release_date" + ), + 0.9, + "entry", + set([ + ("5R32",), + ("5RDH", "5RBR"), + ]) + ) + ] +) +def test_search_grouping(grouping, resolution_threshold, return_type, + ref_groups): + 
""" + Check whether the same result as in a known example is achieved. + """ query = ( rcsb.FieldQuery( - "rcsb_entity_source_organism.taxonomy_lineage.name", - exact_match="Homo sapiens" - ) - & rcsb.FieldQuery( "exptl.method", exact_match="X-RAY DIFFRACTION" ) & rcsb.FieldQuery( "rcsb_entry_info.resolution_combined", - range_closed=(1.0, 2.0) + range_closed=(0.0, resolution_threshold) ) ) - grouping = rcsb.IdentityGrouping( - 100, sort_by="entity_poly.rcsb_sample_sequence_length" - ) - test_groups = rcsb.search( - query, "polymer_entity", + test_groups = list(rcsb.search( + query, return_type, group_by=grouping, return_groups=True - ) + ).values()) test_representatives = rcsb.search( - query, "polymer_entity", + query, return_type, group_by=grouping, return_groups=False ) - test_count = rcsb.count(query, "polymer_entity", group_by=grouping) + test_count = rcsb.count(query, return_type, group_by=grouping) # List is not hashable - assert set([tuple(group) for group in test_groups]) == REF_GROUPS - assert set(test_representatives) == [group[0] for group in REF_GROUPS] - assert test_count == pytest.approx(REF_COUNT, rel = 0.1) + assert set([tuple(group) for group in test_groups]) == ref_groups + assert set(test_representatives) == set([group[0] for group in ref_groups]) + assert test_count == len(ref_groups) @pytest.mark.skipif( From f929e46627cb35d37659bff877fc6661f14418ff Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 17 Mar 2023 13:43:04 +0100 Subject: [PATCH 7/8] Remove warning by removing doctest The function is deprecated so removing the doctest is reasonable --- src/biotite/application/viennarna/rnafold.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py index c621767fd..52fca90c2 100644 --- a/src/biotite/application/viennarna/rnafold.py +++ b/src/biotite/application/viennarna/rnafold.py @@ -174,16 +174,6 @@ def get_mfe(self): ------- mfe : float 
The minimum free energy. - - Examples - -------- - - >>> sequence = NucleotideSequence("CGACGTAGATGCTAGCTGACTCGATGC") - >>> app = RNAfoldApp(sequence) - >>> app.start() - >>> app.join() - >>> print(app.get_mfe()) - -1.3 """ warnings.warn( "'get_mfe()' is deprecated, use 'get_free_energy()' instead", From 5311f8a15dcb88afe495cf6a921d0c14238ade6d Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 17 Mar 2023 13:43:42 +0100 Subject: [PATCH 8/8] Explain grouping in tutorial --- doc/tutorial/src/database.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/doc/tutorial/src/database.py b/doc/tutorial/src/database.py index e46188edd..59d6aecc0 100644 --- a/doc/tutorial/src/database.py +++ b/doc/tutorial/src/database.py @@ -105,6 +105,30 @@ print(rcsb.search(composite_query)) ######################################################################## # Often the structures behind the obtained PDB IDs have a degree of # redundancy. # For example they may represent the same protein sequences or result # from the same set of experiments. # You may use :class:`Grouping` of structures to group redundant # entries or even return only single representatives of each group. query = rcsb.BasicQuery("Transketolase") # Group PDB IDs from the same collection print(rcsb.search( query, group_by=rcsb.DepositGrouping(), return_groups=True )) # Get only a single representative of each group print(rcsb.search( query, group_by=rcsb.DepositGrouping(), return_groups=False )) ######################################################################## # Note that grouping may omit PDB IDs in search results, if such PDB IDs # cannot be grouped. # In the example shown above, not all structures are returned: # only a few PDB entries were uploaded as part of a collection # and hence only those are part of the search results. # # Fetching files from the NCBI Entrez database # -------------------------------------------- #