Merge pull request #22 from metaodi/server-side-pagination

Use the new server-side pagination of pyodata
metaodi · Aug 31, 2023 · 1fd8c19 · 1fd8c19
2 parents ab77e18 + c324a36
commit 1fd8c19
Show file tree

Hide file tree

Showing 23 changed files with 6,877 additions and 79 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,10 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+      - id: black
+        language_version: python3.8
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project follows [Semantic Versioning](http://semver.org/).
 
 ## [Unreleased]
+### Added
+- Support for server-side pagination (transparent to the user)
+- `ResultVeryLargeWarning` to warn developers about very large queries
+- Logging to help with debugging
+- pre-commit configuration
+
+### Changed
+- Use `black` code style
 
 ## [0.2.1] - 2022-01-31
 ### Fixed

diff --git a/README.md b/README.md
@@ -1,5 +1,7 @@
 [![PyPI Version][pypi-image]][pypi-url]
 [![Build Status][build-image]][build-url]
+[![Code style: black][black-image]][black-url]
+[![pre-commit][pre-commit-image]][pre-commit-url]
 
 
 swissparlpy
@@ -261,3 +263,7 @@ To create a new release, follow these steps (please respect [Semantic Versioning
 [pypi-url]: https://pypi.org/project/swissparlpy/
 [build-image]: https://github.com/metaodi/swissparlpy/actions/workflows/build.yml/badge.svg
 [build-url]: https://github.com/metaodi/swissparlpy/actions/workflows/build.yml
+[black-image]: https://img.shields.io/badge/code%20style-black-000000.svg
+[black-url]: https://github.com/psf/black
+[pre-commit-image]: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit
+[pre-commit-url]: https://github.com/pre-commit/pre-commit
diff --git a/dev_setup.sh b/dev_setup.sh
@@ -5,4 +5,5 @@ source pyenv/bin/activate
 
 pip install --upgrade pip
 pip install flit
+pre-commit install
 flit install -s
diff --git a/examples/Swiss Parliament API.ipynb b/examples/Swiss Parliament API.ipynb
@@ -15,12 +15,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import swissparlpy\n",
+    "import swissparlpy as spp\n",
     "import requests\n",
     "import pandas as pd\n",
     "import os\n",
     "import urllib3\n",
-    "from datetime import datetime"
+    "from datetime import datetime, timezone"
    ]
   },
   {
@@ -54,7 +54,7 @@
    "source": [
     "session = requests.Session()\n",
     "session.verify = False # disable SSL verification\n",
-    "client = swissparlpy.SwissParlClient(session=session)"
+    "client = spp.SwissParlClient(session=session)"
    ]
   },
   {
@@ -74,8 +74,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import swissparlpy as spp\n",
-    "\n",
     "tables = spp.get_tables()\n",
     "glimpse_df = pd.DataFrame(spp.get_glimpse(tables[0]))\n",
     "glimpse_df"
@@ -196,11 +194,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "utc_start_date = datetime.fromisoformat('2019-10-01').astimezone(timezone.utc)\n",
+    "utc_end_Date = datetime.fromisoformat('2019-10-31').astimezone(timezone.utc)\n",
     "business_oct19 = client.get_data(\n",
     "    \"Business\",\n",
     "    Language=\"DE\",\n",
-    "    SubmissionDate__gte=datetime.fromisoformat('2019-10-01'),\n",
-    "    SubmissionDate__lt=datetime.fromisoformat('2019-10-31')\n",
+    "    SubmissionDate__gte=utc_start_date,\n",
+    "    SubmissionDate__lt=utc_end_Date\n",
     ")\n",
     "business_oct19.count"
    ]
@@ -295,11 +295,118 @@
     "df_voting50 = pd.concat([pd.read_pickle(os.path.join(path, x)) for x in os.listdir(path)])\n",
     "df_voting50"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9dc1854c-9f97-4bd4-bc21-50b361efa54b",
+   "metadata": {},
+   "source": [
+    "## Queries with lots of results (server-side pagination)\n",
+    "\n",
+    "The server returns at most 1000 items per request.\n",
+    "swissparlpy handles this server-side pagination transparently, so users of the library do not need to worry about it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "408f6b47-c352-49ac-ac0a-b4fbaa00b7cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "business = client.get_data(\"Business\", Language = \"DE\")\n",
+    "business.count"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6b842d0-ebd5-44a2-9169-0d91128f3da3",
+   "metadata": {},
+   "source": [
+    "As we can see, there are over 50k results.\n",
+    "Initially only the first 1000 are loaded:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b463504-3a2e-417d-b302-47adb70d23a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(business.data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02717c13-01a6-4772-b492-c56afdddac85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "business[1]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49fd8ec7-e5d8-4b10-ba03-1543bf52b81e",
+   "metadata": {},
+   "source": [
+    "But as soon as an element beyond the loaded ones is accessed, more data is lazily loaded:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fafc6266-cc43-4975-9e39-2866f29ef713",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "business[1001]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54556cc8-e076-4ea7-bb10-63aaf9f80e10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(business.data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19628322-4a7f-4720-a910-e2d72c1d744c",
+   "metadata": {},
+   "source": [
+    "If the last element is needed, all the data is loaded (NOTE: this can use a lot of memory):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcdb29ce-30ba-428c-9162-b2c843020227",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "business[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79adb078-dffc-46a2-8746-2b54238b3718",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(business.data)"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -313,7 +420,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,

diff --git a/examples/download_votes_in_batches.py b/examples/download_votes_in_batches.py
@@ -14,7 +14,7 @@ def save_votes_of_session(id, path):
     data = spp.get_data("Voting", Language="DE", IdSession=id)
     print(f"{data.count} rows loaded.")
     df = pd.DataFrame(data)
-    pickle_path = os.path.join(path, f'{id}.pks')
+    pickle_path = os.path.join(path, f"{id}.pks")
     df.to_pickle(pickle_path)
     print(f"Saved pickle at {pickle_path}")
 
@@ -25,7 +25,9 @@ def save_votes_of_session(id, path):
 
 for session in sessions50:
     print(f"Loading session {session['ID']}")
-    save_votes_of_session(session['ID'], path)
+    save_votes_of_session(session["ID"], path)
 
 # Combine to one dataframe
-df_voting50 = pd.concat([pd.read_pickle(os.path.join(path, x)) for x in os.listdir(path)])
+df_voting50 = pd.concat(
+    [pd.read_pickle(os.path.join(path, x)) for x in os.listdir(path)]
+)
diff --git a/examples/filter.py b/examples/filter.py
@@ -2,9 +2,7 @@
 from pprint import pprint
 
 subjects = spp.get_data(
-    table="SubjectBusiness",
-    BusinessShortNumber="05.057",
-    Language="DE"
+    table="SubjectBusiness", BusinessShortNumber="05.057", Language="DE"
 )
 
 print(f"Total rows: {len(subjects)}")

diff --git a/examples/filter_advanced.py b/examples/filter_advanced.py
@@ -3,13 +3,10 @@
 
 
 def name_filter(e):
-    return spp.filter.or_(
-        e.FirstName == 'Stefan',
-        e.LastName == 'Seiler'
-    )
+    return spp.filter.or_(e.FirstName == "Stefan", e.LastName == "Seiler")
 
 
-persons = spp.get_data("Person", filter=name_filter, Language='DE')
+persons = spp.get_data("Person", filter=name_filter, Language="DE")
 
 df = pd.DataFrame(persons)
-print(df[['FirstName', 'LastName']])
+print(df[["FirstName", "LastName"]])
diff --git a/examples/filter_query.py b/examples/filter_query.py
@@ -3,8 +3,8 @@
 
 persons = spp.get_data(
     table="Person",
-    filter="(FirstName eq 'Stefan' or LastName eq 'Seiler') and Language eq 'DE'"
+    filter="(FirstName eq 'Stefan' or LastName eq 'Seiler') and Language eq 'DE'",
 )
 
 df = pd.DataFrame(persons)
-print(df[['FirstName', 'LastName']])
+print(df[["FirstName", "LastName"]])
diff --git a/examples/list_all_tables_and_properties.py b/examples/list_all_tables_and_properties.py
@@ -6,5 +6,5 @@
 for table, props in overview.items():
     print(table)
     for prop in props:
-        print(f' + {prop}')
-    print('')
+        print(f" + {prop}")
+    print("")
diff --git a/examples/pagination.py b/examples/pagination.py
@@ -0,0 +1,23 @@
+import logging
+import swissparlpy
+from pprint import pprint
+
+
+# set up the logger to see debug messages from swissparlpy
+spp_logger = logging.getLogger("swissparlpy.client")
+spp_logger.setLevel(logging.DEBUG)
+
+logging.basicConfig(
+    format="%(asctime)s  %(levelname)-10s %(name)s: %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logging.captureWarnings(True)  # route warnings through the logging system
+
+
+business = swissparlpy.get_data("Business", Language="DE")
+print(f"Count: {business.count}")
+print(f"Internal data: {len(business.data)}")
+
+pprint(business)
+pprint(business[1])
+pprint(business[1001])
diff --git a/examples/slice.py b/examples/slice.py
@@ -1,15 +1,13 @@
 import swissparlpy as spp
 from pprint import pprint
 
-sessions = spp.get_data(
-    table="Session"
-)
+sessions = spp.get_data(table="Session")
 
 print(f"Total rows: {len(sessions)}")
 for session in sessions[5:10]:
     pprint(session)
 
 
 # print any element
-print('')
+print("")
 pprint(sessions[587])
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ classifiers = ["License :: OSI Approved :: MIT License"]
 dynamic = ["version", "description"]
 dependencies = [
     "requests",
-    "pyodata>=1.9.0",
+    "pyodata>=1.10.0",
 ]
 
 [project.optional-dependencies]
@@ -26,6 +26,8 @@ dev = [
     "flit",
     "jupyter",
     "pandas",
+    "black",
+    "pre-commit",
 ]
 
 

diff --git a/setup.cfg b/setup.cfg
@@ -1,10 +1,12 @@
 [flake8]
 max-complexity = 10
 exclude = .git,__pycache__,pyenv,.pytest_cache
-# the new Torvalds default for line length
-max-line-length = 100
 # set to true to check all ignored errors
 disable_noqa = False
 
+# adaptions for black
+max-line-length = 88
+extend-ignore = E203
+
 [tool:pytest]
 addopts = --cov=swissparlpy
diff --git a/swissparlpy/__init__.py b/swissparlpy/__init__.py
@@ -1,11 +1,11 @@
 """Client for Swiss parliament API"""
 
-__version__ = '0.2.1'
-__all__ = ['client', 'errors']
+__version__ = "0.2.1"
+__all__ = ["client", "errors"]
 
 from .errors import SwissParlError  # noqa
 from .client import SwissParlClient
-from pyodata.v2.service import GetEntitySetFilter as filter # noqa
+from pyodata.v2.service import GetEntitySetFilter as filter  # noqa
 
 
 def get_tables():