Zotero pagination (#288)

* Add responses library for mocking requests * Mock Zotero API and bibtex entries Instead of setting up a testing environment in Zotero, I have mocked the API documented in [Zotero API docs](https://www.zotero.org/support/dev/web_api/v3/basics) especially with the pagination feature that splits the bibtex into chunks with a maximum length that they call 'limit'. In the fixture, the limit is set to 25 or the default. Along with the mock API, I needed a function that can generate n bibtex entries to test paginated and non-paginated results. * Add test loading bibtex from Zotero To test paginated and non-paginated responses, this test parametrizes the mock_zotero_api fixture with API endpoints with 4 and 150 results respectively. * Add function to handle Zotero API requests Zotero API's "Sorting and Pagination" [docs](https://www.zotero.org/support/dev/web_api/v3/basics) explain that results are limited to an integer number of results that can increase to a maximum of 100 results per request. If there are more results that match the search criteria, a "Link" header will be added with a `rel=next` link. The `tempfile_from_zotero_url` function leverages the response header and continues requesting the `next` url until all results are returned. To prevent runaway conditions, an arbitrary hard limit of 999 pages has been placed. * Sanitize Zotero API URL before request To prevent the user from adding problematic query params to the Zotero URL, the `sanitize_zotero_query()` function ensures that the requested format is bibtex and the limit is set to the maximum permitted by Zotero to reduce the total number of requests. The mocked Zotero API needs to represent these latest changes too. * Add responses to GHA testing deps
shyamd · Jan 16, 2025 · dd0695f · dd0695f
1 parent 632ad7a
commit dd0695f
Show file tree

Hide file tree

Showing 5 changed files with 156 additions and 1 deletion.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -57,7 +57,7 @@ jobs:
     - name: Install Python dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install pytest pytest-cov
+        pip install pytest pytest-cov responses
 
     - name: Test with pytest
       run: |

diff --git a/requirements-testing.txt b/requirements-testing.txt
@@ -2,5 +2,6 @@ pytest==8.3.4
 pytest-cov==6.0.0
 pytest-pretty==1.2.0
 mypy==1.14.1
+responses==0.25.6
 ruff==0.9.1
 types-requests~=2.32.0
diff --git a/src/mkdocs_bibtex/utils.py b/src/mkdocs_bibtex/utils.py
@@ -2,6 +2,7 @@
 import re
 import requests
 import tempfile
+import urllib.parse
 from collections import OrderedDict
 from functools import lru_cache
 from itertools import groupby
@@ -290,6 +291,8 @@ def format_bibliography(citation_quads):
 
 def tempfile_from_url(name, url, suffix):
     log.debug(f"Downloading {name} from URL {url} to temporary file...")
+    if urllib.parse.urlparse(url).hostname == "api.zotero.org":
+        return tempfile_from_zotero_url(name, url, suffix)
     for i in range(3):
         try:
             dl = requests.get(url)
@@ -309,3 +312,60 @@ def tempfile_from_url(name, url, suffix):
     raise RuntimeError(
         f"Couldn't successfully download the url: {url}"
     )  # pragma: no cover
+
+
+def tempfile_from_zotero_url(name: str, url: str, suffix: str) -> str:
+    """Download a bibfile from the Zotero API into a temporary file.
+
+    The query string is first normalised with ``sanitize_zotero_query``. Pages
+    are then requested one after another by following the ``next`` entry of
+    ``response.links`` until a response carries no such link.
+
+    Args:
+        name: Label for the download; used only in log messages.
+        url: Zotero API URL to start from.
+        suffix: Suffix given to the temporary file.
+
+    Returns:
+        Path of the temporary file holding the concatenated page bodies. The
+        file is created with ``delete=False``, so it is not removed on close.
+
+    Raises:
+        RuntimeError: If a page answers with a status code other than 200, or
+            if a page could not be fetched after three attempts.
+    """
+    log.debug(f"Downloading {name} from Zotero at {url}")
+    pages = []
+
+    url = sanitize_zotero_query(url)
+    page_url = url
+
+    # Limit the pages requested to 999 arbitrarily. This will support a maximum of ~100k items
+    max_pages = 999
+    for page_count in range(1, max_pages + 1):
+        for _ in range(3):
+            try:
+                response = requests.get(page_url)
+            except requests.exceptions.RequestException:  # pragma: no cover
+                continue
+            if response.status_code != 200:
+                msg = f"Couldn't download the url: {page_url}.\nStatus Code: {response.status_code}"
+                raise RuntimeError(msg)
+            break
+        else:  # pragma: no cover
+            # Every attempt failed: stop here instead of reusing a missing or
+            # stale response from an earlier page.
+            raise RuntimeError(f"Couldn't successfully download the url: {page_url}")
+
+        pages.append(response.text)
+        try:
+            page_url = response.links["next"]["url"]
+        except KeyError:
+            # No "next" link: this was the last page.
+            log.debug(f"Downloaded {page_count} page(s) from {url}")
+            break
+    else:
+        # The page cap was hit while a "next" link was still present, so the
+        # downloaded bibliography is truncated.
+        log.warning(f"Exceeded the maximum number of pages. Found: {max_pages} pages")
+    with tempfile.NamedTemporaryFile(mode="wt", encoding="utf-8", suffix=suffix, delete=False) as file:
+        file.write("".join(pages))
+    log.info(f"{name} downloaded from URL {url} to temporary file ({file.name})")
+    return file.name
+
+
+def sanitize_zotero_query(url: str) -> str:
+    """Return ``url`` with the query parameters this plugin relies on forced in.
+
+    Two parameters are always overridden:
+        - ``format`` becomes ``bibtex``, the only format `mkdocs-bibtex` reads.
+        - ``limit`` becomes ``100`` so that fewer requests are needed.
+
+    Other parameters are kept. Because the query is collected into a dict, a
+    parameter given several times keeps only its last value.
+    """
+    parts = urllib.parse.urlparse(url)
+
+    params = dict(urllib.parse.parse_qsl(parts.query))
+    # Assigning after parsing overrides caller-supplied values; a key that was
+    # already present keeps its position, a new key is appended at the end.
+    params["format"] = "bibtex"
+    params["limit"] = 100
+
+    return parts._replace(query=urllib.parse.urlencode(query=params)).geturl()
diff --git a/test_files/test_plugin.py b/test_files/test_plugin.py
@@ -1,11 +1,42 @@
+import collections.abc
 import os
+import random
+import string
 
 import pytest
+import responses
 
 from mkdocs_bibtex.plugin import BibTexPlugin
 
 module_dir = os.path.dirname(os.path.abspath(__file__))
 test_files_dir = os.path.abspath(os.path.join(module_dir, "..", "test_files"))
+# Zotero endpoint handed to the plugin as `bib_file` in the Zotero loading test.
+MOCK_ZOTERO_URL = "https://api.zotero.org/groups/FOO/collections/BAR/items?format=bibtex"
+
+
+@pytest.fixture
+def mock_zotero_api(request: pytest.FixtureRequest) -> collections.abc.Generator[responses.RequestsMock]:
+    """Mock a paginated Zotero API serving ``request.param`` bibtex entries.
+
+    The generated entries are split into pages of ``limit`` entries. One GET
+    response is registered per page; every page except the last carries a
+    ``Link`` header whose ``rel='next'`` target is the URL of the following
+    page. The first page is registered without a ``start`` parameter.
+    """
+    limit = 100
+    zotero_api_url = f"{MOCK_ZOTERO_URL}&limit={limit}"
+    bibtex_contents = generate_bibtex_entries(request.param)
+
+    pages = [bibtex_contents[i : i + limit] for i in range(0, len(bibtex_contents), limit)]
+    last_page_num = len(pages) - 1
+
+    with responses.RequestsMock() as mock_api:
+        for page_num, page in enumerate(pages):
+            current_start = "" if page_num == 0 else f"&start={page_num * limit}"
+            next_start = f"&start={(page_num + 1) * limit}"
+            link_headers = (
+                {} if page_num == last_page_num else {"Link": f"<{zotero_api_url}{next_start}>; rel='next'"}
+            )
+            mock_api.add(
+                responses.Response(
+                    method="GET",
+                    url=f"{zotero_api_url}{current_start}",
+                    # Send the page as a plain-text body. `json=` would serialise
+                    # it as a JSON string literal (quoted, newlines escaped).
+                    body="\n".join(page),
+                    headers=link_headers,
+                )
+            )
+
+        yield mock_api
 
 
 @pytest.fixture
@@ -48,6 +79,17 @@ def test_bibtex_loading_bibdir():
     assert len(plugin.bib_data.entries) == 2
 
 
+@pytest.mark.parametrize(
+    ("mock_zotero_api", "number_of_entries"),
+    ((4, 4), (150, 150)),
+    indirect=["mock_zotero_api"],
+)
+def test_bibtex_loading_zotero(mock_zotero_api: responses.RequestsMock, number_of_entries: int) -> None:
+    """Load a bib file from the mocked Zotero API and count the parsed entries.
+
+    The first value of each case is passed to the `mock_zotero_api` fixture
+    (indirect parametrization) as the number of entries to serve; the second is
+    the number of entries the plugin is expected to end up with.
+    """
+    zotero_plugin = BibTexPlugin()
+    zotero_plugin.load_config(options={"bib_file": MOCK_ZOTERO_URL}, config_file_path=test_files_dir)
+    zotero_plugin.on_config(zotero_plugin.config)
+
+    loaded_entries = zotero_plugin.bib_data.entries
+    assert len(loaded_entries) == number_of_entries
+
 def test_on_page_markdown(plugin):
     """
     This function just tests to make sure the rendered markdown changees with
@@ -106,3 +148,25 @@ def test_footnote_formatting_config(plugin):
 
     with pytest.raises(Exception):
         bad_plugin.on_config(bad_plugin.config)
+
+def generate_bibtex_entries(n: int) -> list[str]:
+    """Generate n random, well-formed bibtex ``@article`` entries.
+
+    Each cite key ends in ``_<index>``, so keys are unique within one call.
+    Every entry is written on a single line and uses only letters, digits and
+    bibtex punctuation, so it contains nothing that escaping would alter if a
+    caller serialises it.
+
+    Args:
+        n: Number of entries to generate.
+
+    Returns:
+        A list of n bibtex entries, one string per entry.
+    """
+
+    entries = []
+
+    for i in range(n):
+        author_first = "".join(random.choices(string.ascii_letters, k=8))
+        author_last = "".join(random.choices(string.ascii_letters, k=8))
+        title = "".join(random.choices(string.ascii_letters, k=10))
+        journal = "".join(random.choices(string.ascii_uppercase, k=5))
+        year = str(random.randint(1950, 2025))
+
+        fields = ", ".join(
+            (
+                f"title = {{{title}}}",
+                "volume = {1}",
+                f"journal = {{{journal}}}",
+                f"author = {{{author_last}, {author_first}}}",
+                f"year = {{{year}}}",
+            )
+        )
+        # The cite key is followed by a comma and the fields; the entry's brace
+        # is closed only after the last field.
+        entries.append(f"@article{{{author_last}_{i}, {fields}}}")
+    return entries
diff --git a/test_files/test_utils.py b/test_files/test_utils.py
@@ -7,6 +7,7 @@
     format_simple,
     format_pandoc,
     extract_cite_keys,
+    sanitize_zotero_query,
 )
 
 from mkdocs_bibtex.plugin import parse_file
@@ -75,3 +76,32 @@ def test_extract_cite_key():
     """
     assert extract_cite_keys("[@test]") == ["test"]
     assert extract_cite_keys("[@test.3]") == ["test.3"]
+
+
+# Zotero items endpoint without any query string; the cases below append to it.
+EXAMPLE_ZOTERO_API_ENDPOINT = "https://api.zotero.org/groups/FOO/collections/BAR/items"
+
+
+@pytest.mark.parametrize(
+    ("zotero_url", "expected_sanitized_url"),
+    [
+        # No query string: both forced parameters are added.
+        (
+            EXAMPLE_ZOTERO_API_ENDPOINT,
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?format=bibtex&limit=100",
+        ),
+        # A smaller limit is raised to 100.
+        (
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?format=bibtex&limit=25",
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?format=bibtex&limit=100",
+        ),
+        # A different format is replaced by bibtex.
+        (
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?format=json",
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?format=bibtex&limit=100",
+        ),
+        # Unrelated parameters are kept, ahead of the added ones.
+        (
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?sort=dateAdded",
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?sort=dateAdded&format=bibtex&limit=100",
+        ),
+        # A repeated parameter keeps only its last value.
+        (
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?sort=dateAdded&sort=publisher",
+            EXAMPLE_ZOTERO_API_ENDPOINT + "?sort=publisher&format=bibtex&limit=100",
+        ),
+    ],
+)
+def test_sanitize_zotero_query(zotero_url: str, expected_sanitized_url: str) -> None:
+    """Check that `sanitize_zotero_query` rewrites each URL as expected."""
+    sanitized_url = sanitize_zotero_query(url=zotero_url)
+    assert sanitized_url == expected_sanitized_url