From 0e64dfdac1279ebe3820f87813d4ca7f87816fed Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Thu, 16 Mar 2023 13:46:17 +0100
Subject: [PATCH 1/8] Add support for computational structure entries

---
 src/biotite/database/rcsb/query.py | 51 ++++++++++++++++++++++++++----
 tests/database/test_rcsb.py        | 33 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py
index d8d9e4830..62211a421 100644
--- a/src/biotite/database/rcsb/query.py
+++ b/src/biotite/database/rcsb/query.py
@@ -33,6 +33,7 @@ class Query(metaclass=abc.ABCMeta):
     
     This is the abstract base class for all queries.
     """
+
     @abc.abstractmethod
     def get_content(self):
         """
@@ -41,6 +42,11 @@ def get_content(self):
 
         This content is converted into JSON by the :func:`search`
         and :func:`count` methods.
+
+        Returns
+        -------
+        content : dict
+            The content dictionary for the ``'query'`` attributes.
         """
         pass
 
@@ -449,7 +455,7 @@ def get_content(self):
         return content
 
 
-def count(query, return_type="entry"):
+def count(query, return_type="entry", content_types=("experimental",)):
     """
     Count PDB entries that meet the given query requirements,
     via the RCSB search API.
@@ -470,6 +476,14 @@ def count(query, return_type="entry"):
         - ``'non_polymer_entity'``: All matching non-polymeric entities
           are counted.
         - ``'polymer_instance'``: All matching chains are counted.
+    content_types : iterable of {"experimental", "computational"}, optional
+        Specify whether experimental and computational structures should
+        be included.
+        At least one of them needs to be specified.
+        By default only experimental structures are included.
+        Note, that identifiers for computational structures cannot be
+        downloaded via :func:`biotite.database.rcsb.fetch()` as they
+        point to *AlphaFold DB* and *ModelArchive*.
 
     Returns
     -------
@@ -492,13 +506,20 @@ def count(query, return_type="entry"):
         "polymer_entity", "non_polymer_entity",
     ]:
         raise ValueError(f"'{return_type}' is an invalid return type")
+
+    request_options = {"return_counts": True}
+
+    if len(content_types) == 0:
+        raise ValueError("At least one content type must be specified")
+    for content_type in content_types:
+        if content_type not in ("experimental", "computational"):
+            raise ValueError(f"Unknown content type '{content_type}'")
+    request_options["results_content_type"] = content_types
     
     query_dict = {
         "query": query.get_content(),
         "return_type": return_type,
-        "request_options": {
-            "return_counts": True
-        }
+        "request_options": request_options
     }
     r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
     
@@ -515,7 +536,7 @@ def count(query, return_type="entry"):
             raise RequestError(f"Error {r.status_code}")
 
 
-def search(query, return_type="entry", range=None, sort_by=None):
+def search(query, return_type="entry", range=None, sort_by=None, content_types=("experimental",)):
     """
     Get all PDB IDs that meet the given query requirements,
     via the RCSB search API.
@@ -547,11 +568,22 @@ def search(query, return_type="entry", range=None, sort_by=None):
         The range is zero-indexed and the stop value is exclusive.
     sort_by : str, optional
         If specified, the returned PDB IDs are sorted by the values
-        of the given field name in descending order.
+        of the given field name.
         A complete list of the available fields is documented at
         `<https://search.rcsb.org/structure-search-attributes.html>`_.
         and
         `<https://search.rcsb.org/chemical-search-attributes.html>`_.
+        If a string is given sorting is performed in descending order.
+        To choose the order a :class:`Sorting` object needs to be
+        provided.
+    content_types : iterable of {"experimental", "computational"}, optional
+        Specify whether experimental and computational structures should
+        be included.
+        At least one of them needs to be specified.
+        By default only experimental structures are included.
+        Note, that identifiers for computational structures cannot be
+        downloaded via :func:`biotite.database.rcsb.fetch()` as they
+        point to *AlphaFold DB* and *ModelArchive*.
 
     Returns
     -------
@@ -584,6 +616,13 @@ def search(query, return_type="entry", range=None, sort_by=None):
 
     if sort_by is not None:
         request_options["sort"] = [{"sort_by": sort_by}]
+    
+    if len(content_types) == 0:
+        raise ValueError("At least one content type must be specified")
+    for content_type in content_types:
+        if content_type not in ("experimental", "computational"):
+            raise ValueError(f"Unknown content type '{content_type}'")
+    request_options["results_content_type"] = content_types
 
     if range is None:
         request_options["return_all_hits"] = True
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index 3626afa61..effc3403a 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -141,6 +141,7 @@ def test_search_sequence():
         ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
     )
     test_ids = rcsb.search(query)
+    assert test_ids >= 2
 
     for id in test_ids:
         fasta_file = fasta.FastaFile.read(rcsb.fetch(id, "fasta"))
@@ -172,7 +173,7 @@ def test_search_motif():
     MOTIF = "C-x(2,4)-C-x(3)-[LIVMFYWC]-x(8)-H-x(3,5)-H."
     query = rcsb.MotifQuery(MOTIF, "prosite", "protein")
     test_count = rcsb.count(query)
-    assert test_count == pytest.approx(558, rel=0.1)
+    assert test_count == pytest.approx(580, rel=0.1)
 
 
 @pytest.mark.skipif(
@@ -258,6 +259,36 @@ def test_search_sort():
     assert resolutions == list(reversed(sorted(resolutions)))
 
 
+def test_search_content_types():
+    # Query to limit the number of returned results
+    # for improved performance
+    query = rcsb.FieldQuery(
+        "rcsb_entity_host_organism.scientific_name",
+        exact_match="Homo sapiens"
+    )
+    experimental_set =  set(rcsb.search(query, content_types=["experimental"]))
+    computational_set = set(rcsb.search(query, content_types=["computational"]))
+    combined_set =      set(rcsb.search(query, content_types=["experimental", "computational"]))
+
+    # If there are no results, the following tests make no sense
+    assert len(combined_set) > 0
+    # There should be no common elements
+    assert len(experimental_set & computational_set) == 0
+    # The combined search should include the contents of both searches
+    assert len(experimental_set | computational_set) == len(combined_set)
+
+    assert rcsb.count(query, content_types=["experimental"]) == len(experimental_set)
+    assert rcsb.count(query, content_types=["computational"]) == len(computational_set)
+    assert rcsb.count(query, content_types=["experimental", "computational"]) == len(combined_set)
+
+    # Expect an exception if no content_type
+    with pytest.raises(ValueError):
+        rcsb.search(query, content_types=[])
+    with pytest.raises(ValueError):
+        rcsb.count(query, content_types=[])
+
+
+
 @pytest.mark.skipif(
     cannot_connect_to(RCSB_URL),
     reason="RCSB PDB is not available"

From cfb3578bf40692a8048b9a4ad9c3f8f6614f24bb Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Thu, 16 Mar 2023 15:30:19 +0100
Subject: [PATCH 2/8] Improve requests with sorting and grouping

---
 src/biotite/database/rcsb/query.py | 265 +++++++++++++++++++++++------
 tests/database/test_rcsb.py        |   3 +-
 2 files changed, 217 insertions(+), 51 deletions(-)

diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py
index 62211a421..5609b09d6 100644
--- a/src/biotite/database/rcsb/query.py
+++ b/src/biotite/database/rcsb/query.py
@@ -7,6 +7,8 @@
 __all__ = ["Query", "SingleQuery", "CompositeQuery",
            "BasicQuery", "FieldQuery",
            "SequenceQuery", "StructureQuery", "MotifQuery",
+           "Sorting",
+           "Grouping", "DepositGrouping", "IdentityGrouping", "UniprotGrouping",
            "search", "count"]
 
 import abc
@@ -33,7 +35,6 @@ class Query(metaclass=abc.ABCMeta):
     
     This is the abstract base class for all queries.
     """
-
     @abc.abstractmethod
     def get_content(self):
         """
@@ -41,7 +42,7 @@ def get_content(self):
         ``'query'`` attribute in the RCSB search API.
 
         This content is converted into JSON by the :func:`search`
-        and :func:`count` methods.
+        and :func:`count` functions.
 
         Returns
         -------
@@ -68,7 +69,6 @@ class SingleQuery(Query, metaclass=abc.ABCMeta):
     This is the abstract base class for all queries that are
     terminal nodes.
     """
-
     @abc.abstractmethod
     def get_content(self):
         return {"parameters": {}}
@@ -455,7 +455,164 @@ def get_content(self):
         return content
 
 
-def count(query, return_type="entry", content_types=("experimental",)):
+
+
+class Sorting:
+
+    def __init__(self, field, descending=True):
+        self._field = field
+        self._descending = descending
+    
+    @property
+    def field(self):
+        return self._field
+
+    @property
+    def descending(self):
+        return self._descending
+    
+    def get_content(self):
+        """
+        Get the sorting content, i.e. the data belonging to the
+        ``'sort'`` and ``'ranking_criteria_type'`` attributes in the
+        RCSB search API.
+
+        This content is converted into JSON by the :func:`search`
+        function.
+
+        Returns
+        -------
+        content : dict
+            The content dictionary for the ``'sort'`` and
+            ``'ranking_criteria_type'`` attributes.
+        """
+        direction = "desc" if self._descending else "asc"
+        return {
+            "sort_by" : self._field,
+            "direction" : direction
+        }
+
+
+
+
+class Grouping(metaclass=abc.ABCMeta):
+    """
+    A representation of the JSON grouping options of the RCSB search 
+    API.
+
+    Parameters
+    ----------
+    sort_by : str or Sorting, optional
+        If specified, the returned PDB IDs are sorted by the values
+        of the given field name.
+        A complete list of the available fields is documented at
+        `<https://search.rcsb.org/structure-search-attributes.html>`_.
+        and
+        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
+        If a string is given, sorting is performed in descending order.
+        To choose the order a :class:`Sorting` object needs to be
+        provided.
+    
+    Attributes
+    ----------
+    sorting : Sorting
+        The sorting of the :class:`Grouping`.
+    """
+
+    def __init__(self, sort_by=None):
+        if sort_by is None:
+            self._sorting = None
+        elif isinstance(sort_by, Sorting):
+            self._sorting = sort_by
+        else:
+            self._sorting = Sorting(sort_by)
+    
+    @abc.abstractmethod
+    def get_content(self):
+        """
+        Get the grouping content, i.e. the data belonging to the
+        ``'group_by'`` attribute in the RCSB search API.
+
+        This content is converted into JSON by the :func:`search`
+        and :func:`count` functions.
+
+        ABSTRACT: Override when inheriting.
+
+        Returns
+        -------
+        content : dict
+            The content dictionary for the ``'group_by'`` attributes.
+        """
+        if self._sorting is not None:
+            return {"ranking_criteria_type" : self.sorting}
+        else:
+            return {}
+    
+    @abc.abstractmethod
+    def is_compatible_return_type(self, return_type):
+        """
+        Check whether this :class:`Grouping` is compatible with the
+        RCSB search API ``return_type``.
+
+        ABSTRACT: Override when inheriting.
+
+        Parameters
+        ----------
+        return_type : str
+            The ``return_type`` attribute to be checked.
+        
+        Returns
+        -------
+        is_compatible : bool
+            True, if this :class:`Grouping` is compatible with the
+            `return_type`, false otherwise.
+        """
+        pass
+
+
+class DepositGrouping(Grouping):
+
+    def get_content(self):
+        content = super().get_content()
+        content["aggregation_method"] = "matching_deposit_group_id"
+        return content
+    
+    def is_compatible_return_type(self, return_type):
+        return return_type == "entry"
+
+
+class IdentityGrouping(Grouping):
+
+    def __init__(self, similarity_cutoff, sort_by=None):
+        super().__init__(sort_by)
+        self._similarity_cutoff = similarity_cutoff
+
+    def get_content(self):
+        content = super().get_content()
+        content["aggregation_method"] = "sequence_identity"
+        content["similarity_cutoff"] = str(self.self._similarity_cutoff)
+        return content
+    
+    def is_compatible_return_type(self, return_type):
+        return return_type == "polymer_entity"
+
+
+class UniprotGrouping(Grouping):
+
+    def get_content(self):
+        content = super().get_content()
+        content["aggregation_method"] = "matching_uniprot_accession"
+        return content
+    
+    def is_compatible_return_type(self, return_type):
+        return return_type == "polymer_entity"
+
+
+
+
+
+def count(query, return_type="entry", group_by=None,
+          content_types=("experimental",)):
     """
     Count PDB entries that meet the given query requirements,
     via the RCSB search API.
@@ -501,26 +658,12 @@ def count(query, return_type="entry", content_types=("experimental",)):
     >>> print(sorted(ids))
     ['1EJG', '1I0T', '2GLT', '3NIR', '3P4J', '4JLJ', '5D8V', '5NW3', '7ATG', '7R0H']
     """
-    if return_type not in [
-        "entry", "polymer_instance", "assembly",
-        "polymer_entity", "non_polymer_entity",
-    ]:
-        raise ValueError(f"'{return_type}' is an invalid return type")
-
-    request_options = {"return_counts": True}
+    query_dict = _initialize_query_dict(
+        query, return_type, group_by, content_types
+    )
 
-    if len(content_types) == 0:
-        raise ValueError("At least one content type must be specified")
-    for content_type in content_types:
-        if content_type not in ("experimental", "computational"):
-            raise ValueError(f"Unknown content type '{content_type}'")
-    request_options["results_content_type"] = content_types
+    query_dict["request_options"]["return_counts"] = True
     
-    query_dict = {
-        "query": query.get_content(),
-        "return_type": return_type,
-        "request_options": request_options
-    }
     r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
     
     if r.status_code == 200:
@@ -536,7 +679,8 @@ def count(query, return_type="entry", content_types=("experimental",)):
             raise RequestError(f"Error {r.status_code}")
 
 
-def search(query, return_type="entry", range=None, sort_by=None, content_types=("experimental",)):
+def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
+           return_groups=False, content_types=("experimental",)):
     """
     Get all PDB IDs that meet the given query requirements,
     via the RCSB search API.
@@ -562,11 +706,11 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=(
           (more exactly ``'asym_id'``) is returned (e.g. ``'XXXX.A'``).
     
     range : tuple(int, int), optional
-        If this parameter is specified, the only PDB IDs in this range
+        If this parameter is specified, only PDB IDs in this range
         are selected from all matching PDB IDs and returned
         (pagination).
         The range is zero-indexed and the stop value is exclusive.
-    sort_by : str, optional
+    sort_by : str or Sorting, optional
         If specified, the returned PDB IDs are sorted by the values
         of the given field name.
         A complete list of the available fields is documented at
@@ -574,7 +718,7 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=(
         and
         `<https://search.rcsb.org/chemical-search-attributes.html>`_.
         If a string is given sorting is performed in descending order.
-        To choose the order a :class:`Sorting` object needs to be
+        To choose the order, a :class:`Sorting` object needs to be
         provided.
     content_types : iterable of {"experimental", "computational"}, optional
         Specify whether experimental and computational structures should
@@ -606,40 +750,32 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=(
     >>> print(sorted(search(query, return_type="polymer_instance")))
     ['1EJG.A', '1I0T.A', '1I0T.B', '2GLT.A', '3NIR.A', '3P4J.A', '3P4J.B', '4JLJ.A', '4JLJ.B', '5D8V.A', '5NW3.A', '7ATG.A', '7ATG.B', '7R0H.A']
     """
-    if return_type not in [
-        "entry", "polymer_instance", "assembly",
-        "polymer_entity", "non_polymer_entity",
-    ]:
-        raise ValueError(f"'{return_type}' is an invalid return type")
-    
-    request_options = {}
+    query_dict = _initialize_query_dict(
+        query, return_type, group_by, content_types
+    )
+
+    if return_groups is True:
+        query_dict["request_options"]["group_by_return_type"] = "groups"
+    else:
+        query_dict["request_options"]["group_by_return_type"] = "representatives"
 
     if sort_by is not None:
-        request_options["sort"] = [{"sort_by": sort_by}]
-    
-    if len(content_types) == 0:
-        raise ValueError("At least one content type must be specified")
-    for content_type in content_types:
-        if content_type not in ("experimental", "computational"):
-            raise ValueError(f"Unknown content type '{content_type}'")
-    request_options["results_content_type"] = content_types
+        if isinstance(sort_by, Sorting):
+            sorting = sort_by
+        else:
+            sorting = Sorting(sort_by)
+        query_dict["request_options"]["sort"] = [sorting.get_content()]
 
     if range is None:
-        request_options["return_all_hits"] = True
+        query_dict["request_options"]["return_all_hits"] = True
     elif range[1] <= range[0]:
         raise ValueError("Range stop must be greater than range start")
     else:
-        request_options["paginate"] = {
+        query_dict["request_options"]["paginate"] = {
             "start": int(range[0]),
             "rows": int(range[1]) - int(range[0])
         }
 
-    query_dict = {
-        "query": query.get_content(),
-        "return_type": return_type,
-        "request_options": request_options
-    }
-
     r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
     
     if r.status_code == 200:
@@ -655,6 +791,37 @@ def search(query, return_type="entry", range=None, sort_by=None, content_types=(
             raise RequestError(f"Error {r.status_code}")
 
 
+def _initialize_query_dict(query, return_type, group_by, return_groups, content_types):
+    """
+    Initialize the request parameter dictionary with attributes that
+    `count()` and `search()` have in common.
+    """
+    if return_type not in [
+        "entry", "polymer_instance", "assembly",
+        "polymer_entity", "non_polymer_entity",
+    ]:
+        raise ValueError(f"'{return_type}' is an invalid return type")
+    
+    request_options = {}
+    
+    if len(content_types) == 0:
+        raise ValueError("At least one content type must be specified")
+    for content_type in content_types:
+        if content_type not in ("experimental", "computational"):
+            raise ValueError(f"Unknown content type '{content_type}'")
+    request_options["results_content_type"] = content_types
+
+    if group_by is not None:
+        request_options["group_by"] = group_by.get_content()
+
+    query_dict = {
+        "query": query.get_content(),
+        "return_type": return_type,
+        "request_options": request_options
+    }
+    return query_dict
+
+
 def _to_isoformat(object):
     """
     Convert a datetime into the specifc ISO 8601 format required by the RCSB.
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index effc3403a..49338bcbc 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -141,7 +141,7 @@ def test_search_sequence():
         ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
     )
     test_ids = rcsb.search(query)
-    assert test_ids >= 2
+    assert len(test_ids) >= 2
 
     for id in test_ids:
         fasta_file = fasta.FastaFile.read(rcsb.fetch(id, "fasta"))
@@ -288,7 +288,6 @@ def test_search_content_types():
         rcsb.count(query, content_types=[])
 
 
-
 @pytest.mark.skipif(
     cannot_connect_to(RCSB_URL),
     reason="RCSB PDB is not available"

From 466d6bd6c97e620be6f69977d40b9692c882f0fb Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Thu, 16 Mar 2023 16:41:51 +0100
Subject: [PATCH 3/8] Add tests and fixes

---
 src/biotite/database/rcsb/query.py | 34 ++++++++----
 tests/database/test_rcsb.py        | 85 ++++++++++++++++++++++++++++--
 2 files changed, 106 insertions(+), 13 deletions(-)

diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py
index 5609b09d6..741fc6220 100644
--- a/src/biotite/database/rcsb/query.py
+++ b/src/biotite/database/rcsb/query.py
@@ -544,7 +544,7 @@ def get_content(self):
             The content dictionary for the ``'group_by'`` attributes.
         """
         if self._sorting is not None:
-            return {"ranking_criteria_type" : self.sorting}
+            return {"ranking_criteria_type" : self._sorting.get_content()}
         else:
             return {}
     
@@ -585,12 +585,16 @@ class IdentityGrouping(Grouping):
 
     def __init__(self, similarity_cutoff, sort_by=None):
         super().__init__(sort_by)
+        if similarity_cutoff not in (100, 95, 90, 70, 50, 30):
+            raise ValueError(
+                f"A similarity cutoff of {similarity_cutoff}% is not supported"
+            )
         self._similarity_cutoff = similarity_cutoff
 
     def get_content(self):
         content = super().get_content()
         content["aggregation_method"] = "sequence_identity"
-        content["similarity_cutoff"] = str(self.self._similarity_cutoff)
+        content["similarity_cutoff"] = self._similarity_cutoff
         return content
     
     def is_compatible_return_type(self, return_type):
@@ -667,7 +671,10 @@ def count(query, return_type="entry", group_by=None,
     r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
     
     if r.status_code == 200:
-        return r.json()["total_count"]
+        if group_by is None:
+            return r.json()["total_count"]
+        else:
+            return r.json()["group_by_count"]
     elif r.status_code == 204:
         # Search did not return any results
         return 0
@@ -754,10 +761,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
         query, return_type, group_by, content_types
     )
 
-    if return_groups is True:
-        query_dict["request_options"]["group_by_return_type"] = "groups"
-    else:
-        query_dict["request_options"]["group_by_return_type"] = "representatives"
+    if group_by is not None:
+        if return_groups:
+            query_dict["request_options"]["group_by_return_type"] \
+                = "groups"
+        else:
+            query_dict["request_options"]["group_by_return_type"] \
+                = "representatives"
 
     if sort_by is not None:
         if isinstance(sort_by, Sorting):
@@ -779,7 +789,13 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
     r = requests.get(_search_url, params={"json": json.dumps(query_dict)})
     
     if r.status_code == 200:
-        return [result["identifier"] for result in r.json()["result_set"]]
+        if group_by is None or not return_groups:
+            return [result["identifier"] for result in r.json()["result_set"]]
+        else:
+            return [
+                [result["identifier"] for result in group["result_set"]]
+                for group in r.json()["group_set"]
+            ]
     elif r.status_code == 204:
         # Search did not return any results
         return []
@@ -791,7 +807,7 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
             raise RequestError(f"Error {r.status_code}")
 
 
-def _initialize_query_dict(query, return_type, group_by, return_groups, content_types):
+def _initialize_query_dict(query, return_type, group_by, content_types):
     """
     Initialize the request parameter dictionary with attributes that
     `count()` and `search()` have in common.
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index 49338bcbc..37fd49d0b 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -243,22 +243,35 @@ def test_search_range(seed):
     cannot_connect_to(RCSB_URL),
     reason="RCSB PDB is not available"
 )
-def test_search_sort():
+@pytest.mark.parametrize("as_sorting_object", [False, True])
+def test_search_sort(as_sorting_object):
     query = rcsb.FieldQuery(
         "rcsb_entity_host_organism.scientific_name",
         exact_match="Homo sapiens"
     )
-    entries = rcsb.search(query, sort_by="reflns.d_resolution_high")
+    if as_sorting_object:
+        sort_by = rcsb.Sorting("reflns.d_resolution_high", descending=False)
+    else:
+        sort_by = "reflns.d_resolution_high"
+    entries = rcsb.search(query, sort_by=sort_by)
     
     resolutions = []
     for pdb_id in entries[:5]:
         pdbx_file = pdbx.PDBxFile.read(rcsb.fetch(pdb_id, "pdbx"))
         resolutions.append(float(pdbx_file["reflns"]["d_resolution_high"]))
     
-    # Check if values are sorted in descending order
-    assert resolutions == list(reversed(sorted(resolutions)))
+    if as_sorting_object:
+        # In the tested case the Sorting object uses ascending order
+        assert resolutions == list(sorted(resolutions))
+    else:
+        # Check if values are sorted in descending order
+        assert resolutions == list(reversed(sorted(resolutions)))
 
 
+@pytest.mark.skipif(
+    cannot_connect_to(RCSB_URL),
+    reason="RCSB PDB is not available"
+)
 def test_search_content_types():
     # Query to limit the number of returned results
     # for improved performance
@@ -288,6 +301,70 @@ def test_search_content_types():
         rcsb.count(query, content_types=[])
 
 
+@pytest.mark.skipif(
+    cannot_connect_to(RCSB_URL),
+    reason="RCSB PDB is not available"
+)
+def test_search_identity_grouping():
+    """
+    Expect the same result as the example in the RCSB search API
+    tutorial.
+    """
+    REF_GROUPS = set([
+        ('1ZHM_1',),
+        (
+            '3P8X_1', '7QPP_1', '3X36_1', '3CS6_1', '3CS4_1', '3A78_1',
+            '3A40_1', '3A3Z_1', '2HB8_1', '2HB7_1', '2HAS_1', '2HAR_1',
+            '2HAM_1', '1TXI_1', '4G2I_1', '3TKC_1', '3OGT_1', '3KPZ_1',
+            '1IE9_1', '1IE8_1', '1DB1_1', '5YT2_1', '5YSY_1', '5GT4_1',
+            '3WGP_1', '3W0Y_1', '3W0C_1', '3W0A_1', '3AZ3_1', '3AZ2_1',
+            '3AZ1_1'
+        ),
+        ('3D44_1',),
+        ('6RA4_1',),
+        ('3B9V_1',),
+        ('2FC0_1', '2FBY_1'),
+        ('5GJH_1',),
+        ('2IGP_1',),
+        ('5LF7_13', '5LF4_13', '5LF1_13', '5LEY_13', '5LE5_13'),
+        ('1GBU_2',)
+    ])
+    REF_COUNT = 9597
+
+    query = (
+        rcsb.FieldQuery(
+            "rcsb_entity_source_organism.taxonomy_lineage.name",
+            exact_match="Homo sapiens"
+        )
+        & rcsb.FieldQuery(
+            "exptl.method",
+            exact_match="X-RAY DIFFRACTION"
+        )
+        & rcsb.FieldQuery(
+            "rcsb_entry_info.resolution_combined",
+            range_closed=(1.0, 2.0)
+        )
+    )
+    grouping = rcsb.IdentityGrouping(
+        100, sort_by="entity_poly.rcsb_sample_sequence_length"
+    )
+
+    test_groups = rcsb.search(
+        query, "polymer_entity",
+        group_by=grouping, return_groups=True
+    )
+    test_representatives = rcsb.search(
+        query, "polymer_entity",
+        group_by=grouping, return_groups=False
+    )
+    test_count = rcsb.count(query, "polymer_entity", group_by=grouping)
+    
+    # List is not hashable
+    assert set([tuple(group) for group in test_groups]) == REF_GROUPS
+    assert set(test_representatives) == [group[0] for group in REF_GROUPS]
+    assert test_count == pytest.approx(REF_COUNT, rel = 0.1)
+
+
 @pytest.mark.skipif(
     cannot_connect_to(RCSB_URL),
     reason="RCSB PDB is not available"

From 206c3c004a0ce70fc60d0772e75a33bd1a3cebee Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 17 Mar 2023 13:41:47 +0100
Subject: [PATCH 4/8] Add new classes to correct category in API documentation

---
 doc/apidoc.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/apidoc.json b/doc/apidoc.json
index 5138cdfbc..755518839 100644
--- a/doc/apidoc.json
+++ b/doc/apidoc.json
@@ -48,6 +48,13 @@
             "MotifQuery",
             "StructureQuery"
         ],
+        "Sorting and grouping" : [
+            "Sorting",
+            "Grouping",
+            "DepositGrouping",
+            "IdentityGrouping",
+            "UniprotGrouping"
+        ],
         "Search and fetch" : [
             "count",
             "search",

From d17d5415b6f9449139183d3a08c899755da60eb8 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 17 Mar 2023 13:42:22 +0100
Subject: [PATCH 5/8] Add docstrings, change return value

---
 src/biotite/database/rcsb/query.py | 148 +++++++++++++++++++++++++----
 1 file changed, 131 insertions(+), 17 deletions(-)

diff --git a/src/biotite/database/rcsb/query.py b/src/biotite/database/rcsb/query.py
index 741fc6220..ed18c09ce 100644
--- a/src/biotite/database/rcsb/query.py
+++ b/src/biotite/database/rcsb/query.py
@@ -17,7 +17,7 @@
 from datetime import datetime
 import numpy as np
 import requests
-from ...sequence.seqtypes import ProteinSequence, NucleotideSequence
+from ...sequence.seqtypes import NucleotideSequence
 from ..error import RequestError
 
 
@@ -503,8 +503,8 @@ class Grouping(metaclass=abc.ABCMeta):
     Parameters
     ----------
     sort_by : str or Sorting, optional
-        If specified, the returned PDB IDs are sorted by the values
-        of the given field name.
+        If specified, the returned PDB IDs within each group are sorted
+        by the values of the given field name.
         A complete list of the available fields is documented at
         `<https://search.rcsb.org/structure-search-attributes.html>`_.
         and
@@ -512,11 +512,6 @@ class Grouping(metaclass=abc.ABCMeta):
         If a string is given, sorting is performed in descending order.
         To choose the order a :class:`Sorting` object needs to be
         provided.
-    
-    Attributes
-    ----------
-    sorting : Sorting
-        The sorting of the :class:`Grouping`.
     """
 
     def __init__(self, sort_by=None):
@@ -571,6 +566,28 @@ def is_compatible_return_type(self, return_type):
 
 
 class DepositGrouping(Grouping):
+    """
+    This class groups PDB entries if they were deposited as a
+    collection.
+    Such a group usually contains the same protein with e.g. a different
+    bound molecule.
+
+    This :class:`Grouping` is only applicable, if the
+    :func:`count()`/:func:`search()` return type is set to ``entry``.
+
+    Parameters
+    ----------
+    sort_by : str or Sorting, optional
+        If specified, the returned PDB IDs within each group are sorted
+        by the values of the given field name.
+        A complete list of the available fields is documented at
+        `<https://search.rcsb.org/structure-search-attributes.html>`_.
+        and
+        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
+        If a string is given, sorting is performed in descending order.
+        To choose the order a :class:`Sorting` object needs to be
+        provided.
+    """
 
     def get_content(self):
         content = super().get_content()
@@ -582,7 +599,34 @@ def is_compatible_return_type(self, return_type):
 
 
 class IdentityGrouping(Grouping):
+    """
+    This class groups protein chains with a given sequence identity
+    with each other.
 
+    This :class:`Grouping` is only applicable, if the
+    :func:`count()`/:func:`search()` return type is set to
+    ``polymer_entity``.
+
+    Parameters
+    ----------
+    similarity_cutoff : {100, 95, 90, 70, 50, 30}
+        The sequence identity in percent at which the structures are
+        grouped.
+        In other words, a returned group contains sequences that have
+        `similarity_cutoff` sequence identity with each other.
+        Since the PDB uses precalculated clusters, only certain values
+        are available.
+    sort_by : str or Sorting, optional
+        If specified, the returned PDB IDs within each group are sorted
+        by the values of the given field name.
+        A complete list of the available fields is documented at
+        `<https://search.rcsb.org/structure-search-attributes.html>`_.
+        and
+        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
+        If a string is given, sorting is performed in descending order.
+        To choose the order a :class:`Sorting` object needs to be
+        provided.
+    """
     def __init__(self, similarity_cutoff, sort_by=None):
         super().__init__(sort_by)
         if similarity_cutoff not in (100, 95, 90, 70, 50, 30):
@@ -602,6 +646,27 @@ def is_compatible_return_type(self, return_type):
 
 
 class UniprotGrouping(Grouping):
+    """
+    This class groups protein chains that point to the same *Uniprot*
+    accession ID.
+
+    This :class:`Grouping` is only applicable, if the
+    :func:`count()`/:func:`search()` return type is set to
+    ``polymer_entity``.
+
+    Parameters
+    ----------
+    sort_by : str or Sorting, optional
+        If specified, the returned PDB IDs within each group are sorted
+        by the values of the given field name.
+        A complete list of the available fields is documented at
+        `<https://search.rcsb.org/structure-search-attributes.html>`_.
+        and
+        `<https://search.rcsb.org/chemical-search-attributes.html>`_.
+        If a string is given, sorting is performed in descending order.
+        To choose the order a :class:`Sorting` object needs to be
+        provided.
+    """
 
     def get_content(self):
         content = super().get_content()
@@ -637,6 +702,9 @@ def count(query, return_type="entry", group_by=None,
         - ``'non_polymer_entity'``: All matching non-polymeric entities
           are counted.
         - ``'polymer_instance'``: All matching chains are counted.
+    group_by : Grouping, optional
+        If this parameter is set, the number of groups is returned
+        instead.
     content_types : iterable of {"experimental", "computational"}, optional
         Specify whether experimental and computational structures should
         be included.
@@ -648,9 +716,16 @@ def count(query, return_type="entry", group_by=None,
 
     Returns
     -------
-    ids : list of str
-        A list of strings containing all PDB IDs that meet the query
-        requirements.
+    count : int
+        The total number of PDB IDs (or groups) that would be returned
+        by calling :func:`search()` using the same parameters.
+    
+    Notes
+    -----
+    If `group_by` is set, the number of results may be lower than in an
+    ungrouped query, as grouping is not applicable to all structures.
+    For example a DNA structure has no associated *Uniprot* accession
+    and hence is omitted by :class:`UniprotGrouping`.
     
     Examples
     --------
@@ -727,6 +802,17 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
         If a string is given sorting is performed in descending order.
         To choose the order, a :class:`Sorting` object needs to be
         provided.
+    group_by : Grouping, optional
+        If this parameter is set, the PDB IDs that meet the query
+        requirements are grouped according to the given criterion.
+    return_groups : boolean, optional
+        Only has effect, if `group_by` is set.
+        By default the representative with the highest rank in each
+        group is returned.
+        The rank is determined by the `sort_by` parameter of
+        :class:`Grouping` provided in `group_by`.
+        If set to true, groups containing all structures belonging to
+        the group are returned instead.
     content_types : iterable of {"experimental", "computational"}, optional
         Specify whether experimental and computational structures should
         be included.
@@ -738,9 +824,25 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
 
     Returns
     -------
-    ids : list of str
-        A list of strings containing all PDB IDs that meet the query
-        requirements.
+    ids : list of str or dict (str -> list of str)
+        If `return_groups` is false (default case), a list of strings
+        containing all PDB IDs that meet the query requirements is
+        returned.
+        If `return_groups` is set to true a dictionary of groups is
+        returned.
+        This dictionary maps group identifiers to a list of all PDB IDs
+        belonging to this group.
+    
+    Notes
+    -----
+    If `group_by` is set, the number of results may be lower than in an
+    ungrouped query, as grouping is not applicable to all structures.
+    For example a DNA structure has no associated *Uniprot* accession
+    and hence is omitted by :class:`UniprotGrouping`.
+
+    Also note that `sort_by` does not affect the order within a group.
+    This order is determined by the `sort_by` parameter of the
+    :class:`Grouping`.
 
     Examples
     --------
@@ -756,6 +858,11 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
     ['7ATG', '5NW3', '5D8V']
     >>> print(sorted(search(query, return_type="polymer_instance")))
     ['1EJG.A', '1I0T.A', '1I0T.B', '2GLT.A', '3NIR.A', '3P4J.A', '3P4J.B', '4JLJ.A', '4JLJ.B', '5D8V.A', '5NW3.A', '7ATG.A', '7ATG.B', '7R0H.A']
+    >>> print(search(
+    ...     query, return_type="polymer_entity", return_groups=True,
+    ...     group_by=UniprotGrouping(sort_by="rcsb_accession_info.initial_release_date"),
+    ... ))
+    {'P24297': ['5NW3_1'], 'P04425': ['2GLT_1'], 'P27707': ['4JLJ_1'], 'P80176': ['5D8V_1'], 'O29777': ['7R0H_1'], 'P01542': ['3NIR_1', '1EJG_1']}
     """
     query_dict = _initialize_query_dict(
         query, return_type, group_by, content_types
@@ -792,10 +899,12 @@ def search(query, return_type="entry", range=None, sort_by=None, group_by=None,
         if group_by is None or not return_groups:
             return [result["identifier"] for result in r.json()["result_set"]]
         else:
-            return [
-                [result["identifier"] for result in group["result_set"]]
+            return {
+                group["identifier"] : [
+                    result["identifier"] for result in group["result_set"]
+                ]
                 for group in r.json()["group_set"]
-            ]
+            }
     elif r.status_code == 204:
         # Search did not return any results
         return []
@@ -828,6 +937,11 @@ def _initialize_query_dict(query, return_type, group_by, content_types):
     request_options["results_content_type"] = content_types
 
     if group_by is not None:
+        if not group_by.is_compatible_return_type(return_type):
+            raise ValueError(
+                f"Return type '{return_type}' is not compatible "
+                f"with the given Grouping"
+            )
         request_options["group_by"] = group_by.get_content()
 
     query_dict = {

From 088ae875c45e9d444bc6204729726bb18298a112 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 17 Mar 2023 13:42:30 +0100
Subject: [PATCH 6/8] Add tests

---
 tests/database/test_rcsb.py | 108 +++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 39 deletions(-)

diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index 37fd49d0b..ba8924f95 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -305,64 +305,94 @@ def test_search_content_types():
     cannot_connect_to(RCSB_URL),
     reason="RCSB PDB is not available"
 )
-def test_search_identity_grouping():
-    """
-    Expect the same result as the example in the RCSB search API
-    tutorial.
-    """
-    REF_GROUPS = set([
-        ('1ZHM_1',),
+@pytest.mark.parametrize(
+    "grouping, resolution_threshold, return_type, ref_groups",
+    [
         (
-            '3P8X_1', '7QPP_1', '3X36_1', '3CS6_1', '3CS4_1', '3A78_1',
-            '3A40_1', '3A3Z_1', '2HB8_1', '2HB7_1', '2HAS_1', '2HAR_1',
-            '2HAM_1', '1TXI_1', '4G2I_1', '3TKC_1', '3OGT_1', '3KPZ_1',
-            '1IE9_1', '1IE8_1', '1DB1_1', '5YT2_1', '5YSY_1', '5GT4_1',
-            '3WGP_1', '3W0Y_1', '3W0C_1', '3W0A_1', '3AZ3_1', '3AZ2_1',
-            '3AZ1_1'
+            rcsb.IdentityGrouping(
+                100, sort_by="rcsb_accession_info.initial_release_date"
+            ),
+            0.7,
+            "polymer_entity",
+            set([
+                ("3X2M_1",),
+                ("6E6O_1",),
+                ("1YK4_1",),
+                ("5NW3_1",),
+                ("1US0_1",),
+                ("4HP2_1",),
+                ("2DSX_1",),
+                ("2VB1_1",),
+                ("7VOS_1", "5D8V_1", "3A38_1"),
+                ("1UCS_1",),
+                ("3NIR_1", "1EJG_1"),
+            ])
         ),
-        ('3D44_1',),
-        ('6RA4_1',),
-        ('3B9V_1',),
-        ('2FC0_1', '2FBY_1'),
-        ('5GJH_1',),
-        ('2IGP_1',),
-        ('5LF7_13', '5LF4_13', '5LF1_13', '5LEY_13', '5LE5_13'),
-        ('1GBU_2',)
-    ])
-    REF_COUNT = 9597
 
+        (
+            rcsb.UniprotGrouping(
+                sort_by="rcsb_accession_info.initial_release_date"
+            ),
+            0.7,
+            "polymer_entity",
+            set([
+                ("3X2M_1",),
+                ("6E6O_1",),
+                ("1YK4_1",),
+                ("5NW3_1",),
+                ("1US0_1",),
+                ("4HP2_1",),
+                ("2DSX_1",),
+                ("2VB1_1",),
+                ("7VOS_1", "5D8V_1", "3A38_1"),
+                ("1UCS_1",),
+                ("3NIR_1", "1EJG_1"),
+            ])
+        ),
+
+        (
+            rcsb.DepositGrouping(
+                sort_by="rcsb_accession_info.initial_release_date"
+            ),
+            0.9,
+            "entry",
+            set([
+                ("5R32",),
+                ("5RDH", "5RBR"),
+            ])
+        )
+    ]
+)
+def test_search_grouping(grouping, resolution_threshold, return_type,
+                         ref_groups):
+    """
+    Check whether the same result as in a known example is achieved.
+    """
     query = (
         rcsb.FieldQuery(
-            "rcsb_entity_source_organism.taxonomy_lineage.name",
-            exact_match="Homo sapiens"
-        )
-        & rcsb.FieldQuery(
             "exptl.method",
             exact_match="X-RAY DIFFRACTION"
         )
         & rcsb.FieldQuery(
             "rcsb_entry_info.resolution_combined",
-            range_closed=(1.0, 2.0)
+            range_closed=(0.0, resolution_threshold)
         )
     )
-    grouping = rcsb.IdentityGrouping(
-        100, sort_by="entity_poly.rcsb_sample_sequence_length"
-    )
 
-    test_groups = rcsb.search(
-        query, "polymer_entity",
+    test_groups = list(rcsb.search(
+        query, return_type,
         group_by=grouping, return_groups=True
-    )
+    ).values())
     test_representatives = rcsb.search(
-        query, "polymer_entity",
+        query, return_type,
         group_by=grouping, return_groups=False
     )
-    test_count = rcsb.count(query, "polymer_entity", group_by=grouping)
+    test_count = rcsb.count(query, return_type, group_by=grouping)
     
     # List is not hashable
-    assert set([tuple(group) for group in test_groups]) == REF_GROUPS
-    assert set(test_representatives) == [group[0] for group in REF_GROUPS]
-    assert test_count == pytest.approx(REF_COUNT, rel = 0.1)
+    assert set([tuple(group) for group in test_groups]) == ref_groups
+    assert set(test_representatives) == set([group[0] for group in ref_groups])
+    assert test_count == len(ref_groups)
 
 
 @pytest.mark.skipif(

From f929e46627cb35d37659bff877fc6661f14418ff Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 17 Mar 2023 13:43:04 +0100
Subject: [PATCH 7/8] Remove warning by removing doctest

The function is deprecated so removing the doctest is reasonable
---
 src/biotite/application/viennarna/rnafold.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/biotite/application/viennarna/rnafold.py b/src/biotite/application/viennarna/rnafold.py
index c621767fd..52fca90c2 100644
--- a/src/biotite/application/viennarna/rnafold.py
+++ b/src/biotite/application/viennarna/rnafold.py
@@ -174,16 +174,6 @@ def get_mfe(self):
         -------
         mfe : float
             The minimum free energy.
-
-        Examples
-        --------
-
-        >>> sequence = NucleotideSequence("CGACGTAGATGCTAGCTGACTCGATGC")
-        >>> app = RNAfoldApp(sequence)
-        >>> app.start()
-        >>> app.join()
-        >>> print(app.get_mfe())
-        -1.3
         """
         warnings.warn(
             "'get_mfe()' is deprecated, use 'get_free_energy()' instead",

From 5311f8a15dcb88afe495cf6a921d0c14238ade6d Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 17 Mar 2023 13:43:42 +0100
Subject: [PATCH 8/8] Explain grouping in tutorial

---
 doc/tutorial/src/database.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/doc/tutorial/src/database.py b/doc/tutorial/src/database.py
index e46188edd..59d6aecc0 100644
--- a/doc/tutorial/src/database.py
+++ b/doc/tutorial/src/database.py
@@ -105,6 +105,30 @@
 print(rcsb.search(composite_query))
 
 ########################################################################
+# Often the structures behind the obtained PDB IDs have a degree of
+# redundancy.
+# For example they may represent the same protein sequences or result
+# from the same set of experiments.
+# You may use :class:`Grouping` of structures to group redundant
+# entries or even return only single representatives of each group.
+
+query = rcsb.BasicQuery("Transketolase")
+# Group PDB IDs from the same collection
+print(rcsb.search(
+    query, group_by=rcsb.DepositGrouping(), return_groups=True
+))
+# Get only a single representative of each group
+print(rcsb.search(
+    query, group_by=rcsb.DepositGrouping(), return_groups=False
+))
+
+########################################################################
+# Note that grouping may omit PDB IDs in search results, if such PDB IDs
+# cannot be grouped.
+# In the example shown above, not all structures can be grouped:
+# only a few PDB entries were uploaded as part of a collection and
+# hence are part of the search results.
+#
 # Fetching files from the NCBI Entrez database
 # --------------------------------------------
 #