diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index 4f63c729e..4d486766b 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -15,8 +15,6 @@ jobs:
fail-fast: true
matrix:
include:
- - dockerfile: ./docker/3.8/Debian/Dockerfile
- mtag: py3.8-debian
- dockerfile: ./docker/3.9/Debian/Dockerfile
mtag: py3.9-debian
- dockerfile: ./docker/3.9/Ubuntu/Dockerfile
@@ -25,7 +23,15 @@ jobs:
mtag: py3.10-debian
- dockerfile: ./docker/3.10/Ubuntu/Dockerfile
mtag: py3.10-ubuntu
- - dockerfile: ./docker/3.10/Ubuntu/Dockerfile
+ - dockerfile: ./docker/3.11/Debian/Dockerfile
+ mtag: py3.11-debian
+ - dockerfile: ./docker/3.11/Ubuntu/Dockerfile
+ mtag: py3.11-ubuntu
+ - dockerfile: ./docker/3.12/Debian/Dockerfile
+ mtag: py3.12-debian
+ - dockerfile: ./docker/3.12/Ubuntu/Dockerfile
+ mtag: py3.12-ubuntu
+ - dockerfile: ./docker/3.12/Ubuntu/Dockerfile
mtag: latest
permissions:
contents: read
diff --git a/.github/workflows/mypy-type-check.yml b/.github/workflows/mypy-type-check.yml
index 056a1d1c4..a8d093d1b 100644
--- a/.github/workflows/mypy-type-check.yml
+++ b/.github/workflows/mypy-type-check.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
@@ -39,4 +39,9 @@ jobs:
tiatoolbox/__main__.py \
tiatoolbox/typing.py \
tiatoolbox/tiatoolbox.py \
- tiatoolbox/utils/*.py
+ tiatoolbox/utils/*.py \
+ tiatoolbox/tools/__init__.py \
+ tiatoolbox/tools/stainextract.py \
+ tiatoolbox/tools/pyramid.py \
+ tiatoolbox/tools/tissuemask.py \
+ tiatoolbox/tools/graph.py
diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml
index abdb11527..ffa6961c9 100644
--- a/.github/workflows/pip-install.yml
+++ b/.github/workflows/pip-install.yml
@@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: true
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
os: [ubuntu-22.04, windows-latest, macos-latest]
steps:
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 18a6d5360..5e6d74074 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: true
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v3
@@ -30,7 +30,7 @@ jobs:
sudo apt update
sudo apt-get install -y libopenslide-dev openslide-tools libopenjp2-7 libopenjp2-tools
python -m pip install --upgrade pip
- python -m pip install ruff==0.1.13 pytest pytest-cov pytest-runner
+ python -m pip install ruff==0.2.2 pytest pytest-cov pytest-runner
pip install -r requirements/requirements.txt
- name: Cache tiatoolbox static assets
uses: actions/cache@v3
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bc0650353..60fb72afe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -59,7 +59,7 @@ repos:
- id: rst-directive-colons # Detect mistake of rst directive not ending with double colon.
- id: rst-inline-touching-normal # Detect mistake of inline code touching normal text in rst.
- repo: https://github.com/psf/black
- rev: 24.1.1 # Replace with any tag/version: https://github.com/psf/black/tags
+ rev: 24.2.0 # Replace with any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.+
@@ -68,7 +68,7 @@ repos:
language: python
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: v0.1.14
+ rev: v0.2.2
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
diff --git a/README.md b/README.md
index 0c5de616d..da8c04f06 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@ Prepare a computer as a convenient platform for further development of the Pytho
5. Create virtual environment for TIAToolbox using
```sh
- $ conda create -n tiatoolbox-dev python=3.8 # select version of your choice
+ $ conda create -n tiatoolbox-dev python=3.9 # select version of your choice
$ conda activate tiatoolbox-dev
$ pip install -r requirements/requirements_dev.txt
```
diff --git a/benchmarks/annotation_nquery.ipynb b/benchmarks/annotation_nquery.ipynb
index 458ecbb22..64a58794a 100644
--- a/benchmarks/annotation_nquery.ipynb
+++ b/benchmarks/annotation_nquery.ipynb
@@ -71,7 +71,7 @@
"from shapely.geometry import Polygon\n",
"\n",
"sys.path.append(\"..\") # If running locally without pypi installed tiatoolbox\n",
- "from tiatoolbox.annotation.storage import ( # noqa: E402\n",
+ "from tiatoolbox.annotation.storage import (\n",
" Annotation,\n",
" AnnotationStore,\n",
" DictionaryStore,\n",
diff --git a/benchmarks/annotation_store.ipynb b/benchmarks/annotation_store.ipynb
index 128ad387c..882ab251c 100644
--- a/benchmarks/annotation_store.ipynb
+++ b/benchmarks/annotation_store.ipynb
@@ -1,2703 +1,2704 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aqPkpRk-pT5q"
- },
- "source": [
- "# Benchmarking Annotation Storage\n",
- "\n",
- "Click to open in: \\[[GitHub](https://github.com/TissueImageAnalytics/tiatoolbox/tree/develop/benchmarks/annotation_store.ipynb)\\]\\[[Colab](https://colab.research.google.com/github/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\\[[Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "BS0G58BPpT5s"
- },
- "source": [
- "_In order to run this notebook on a Kaggle platform, 1) click the Kaggle URL 2) click on Settings on the right of the Kaggle screen, 3) log in to your Kaggle account, 4) tick \"Internet\" checkbox under Settings, to enable necessary downloads._\n",
- "\n",
- "**NOTE:** Some parts of this notebook require a lot of memory. Part 2 in particular may not run on memory constrained systems. The notebook will run well on an MacBook Air (M1, 2020) but will use a lot of swap. It may require >64GB of memory for second half to avoid using swap.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EjHQXjqrpT5s"
- },
- "source": [
- "## About This Notebook\n",
- "\n",
- "Managing annotation, either created by hand or from model output, is a\n",
- "common task in computational pathology. For a small number of\n",
- "annotations this may be trivial. However, for large numbers of\n",
- "annotations, it is often necessary to store the annotations in a more\n",
- "structured format such as a database. This is because finding a desired\n",
- "subset of annotations within a very large collection, for example over\n",
- "one million cell boundary polygons derived from running HoVerNet on a\n",
- "WSI, may be very slow if performed in a naive manner. In the toolbox, we\n",
- "implement two storage method to make handling annotations easier:\n",
- "`DictionaryStore` and `SQLiteStore`.\n",
- "\n",
- "### Storage Classes\n",
- "\n",
- "Both stores act as a key-value store where the key is the annotation ID\n",
- "(as a string) and the value is the annotation. This follows the Python\n",
- "[`MutableMapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping)\n",
- "interface meaning that the stores can be used in the same way as a\n",
- "regular Python dictionary (`dict`).\n",
- "\n",
- "The `DictionaryStore` is implemented internally using a Python\n",
- "dictionary. It is a realtively simple class, operating with all\n",
- "annotations in memory and using a simple scan method to search for\n",
- "annotations. This works very well for a small number of annotations. In\n",
- "contrast the `SQLiteStore` is implemented using a SQLite database\n",
- "(either in memory or on disk), it is a more complex class making use of\n",
- "an rtree index to efficiently spatially search for annotations. This is\n",
- "much more suited to a very large number of annotations. However, they\n",
- "both follow the same interface and can be used interchangeably for\n",
- "almost all methods (`SQLiteStore` has some additional methods).\n",
- "\n",
- "### Provided Functionality (Mini Tutorial)\n",
- "\n",
- "The storage classes provide a lot of functionality including. This\n",
- "includes all of the standard `MutableMapping` methods, as well as\n",
- "some additional ones for querying the collection of annotations.\n",
- "Below is a brief summary of the main functionality.\n",
- "\n",
- "#### Adding Annotations\n",
- "\n",
- "```python\n",
- "from tiatoolbox.annotation.storage import Annotation, DictionaryStore, SQliteStore\n",
- "from shapely.geometry import Polygon\n",
- "\n",
- "# Create a new store. If no path is given it is an in-memory store.\n",
- "store = DictionaryStore()\n",
- "\n",
- "# An annotation is a shapely geometry and a JSON serializable dictionary\n",
- "annotation = Annotation(Polygon.from_bounds(0, 0, 1, 1), {\"id\": \"1\"})\n",
- "\n",
- "# Add the annotation to the store in the same way as a dictionary\n",
- "store[\"foo\"] = annotation\n",
- "\n",
- "# Bulk append is also supported. This will be faster in some contexts\n",
- "# (e.g. for an SQLiteStore) than adding them one at a time.\n",
- "# Here we add 100 simple box annotations.\n",
- "# As we have not specified a set of keys to use, a new UUID is generated\n",
- "# for each. The respective generated keys are also returned.\n",
- "annotations = [\n",
- " Annotation(Polygon.from_bounds(n, n, n + 1, n + 1), {\"id\": n}) for n in range(100)\n",
- "]\n",
- "keys = store.append_many(annotations)\n",
- "```\n",
- "\n",
- "#### Removing Annotations\n",
- "\n",
- "```python\n",
- "# Remove an annotation by key\n",
- "del store[\"foo\"]\n",
- "\n",
- "# Bulk removal\n",
- "keys = [\"1234-5676....\", \"...\"] # etc.\n",
- "store.remove_many(keys)\n",
- "```\n",
- "\n",
- "#### Querying Within a Region\n",
- "\n",
- "```python\n",
- "# Find all annotations which intersect a polygon\n",
- "search_region = Polygon.from_bounds(0, 0, 10, 10)\n",
- "result = store.query(search_region)\n",
- "\n",
- "# Find all annotations which are contained within a polygon\n",
- "search_region = Polygon.from_bounds(0, 0, 10, 10)\n",
- "result = store.query(search_region, geometry_predicate=\"contains\")\n",
- "```\n",
- "\n",
- "#### Querying Using A Predicate Statement\n",
- "\n",
- "```python\n",
- "# 'props' is a provided shorthand to access the 'properties' dictionary\n",
- "results = store.query(where=\"propd['id'] == 1\")\n",
- "```\n",
- "\n",
- "#### Serializing and Deserializing\n",
- "\n",
- "```python\n",
- "# Serialize the store to a GeoJSON string\n",
- "json_string = store.to_geojson()\n",
- "\n",
- "# Serialize the store to a GeoJSON file\n",
- "store.to_geojson(\"boxes.geojson\")\n",
- "\n",
- "# Deserialize a GeoJSON string into a store (even of a different type)\n",
- "sqlitestore = SqliteStore.from_geojson(\"boxes.geojson\")\n",
- "\n",
- "# The above is an in-memory store. We can also now write this to disk\n",
- "# as an SQLite database.\n",
- "sqlitestore.dump(\"boxes.db\")\n",
- "```\n",
- "\n",
- "### Benchmarking\n",
- "\n",
- "Here we evaluate the storage efficient and data querying performance of\n",
- "the annotation store versus other common formats. We will evaluate some\n",
- "common situations and use cases including:\n",
- "\n",
- "- Disk I/O (tested with an SSD)\n",
- "- Querying the data for annotations within a box region\n",
- "- Querying the data for annotations within a polygon region\n",
- "- Querying the data with a predicate e.g. 'class=1'\n",
- "\n",
- "All saved output is from running this notebook on a 2020 M1 MacBook Air with 16GB RAM.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aov8ENq2pT5t"
- },
- "source": [
- "## Imports\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "UoMpbDXopT5t"
- },
- "outputs": [],
- "source": [
- "\"\"\"Import modules required to run the Jupyter notebook.\"\"\"\n",
- "\n",
- "from __future__ import annotations\n",
- "\n",
- "# Clear logger to use tiatoolbox.logger\n",
- "import logging\n",
- "\n",
- "if logging.getLogger().hasHandlers():\n",
- " logging.getLogger().handlers.clear()\n",
- "\n",
- "import copy\n",
- "import pickle\n",
- "import sys\n",
- "import tempfile\n",
- "import timeit\n",
- "import uuid\n",
- "from pathlib import Path\n",
- "from typing import TYPE_CHECKING, Any, Generator\n",
- "\n",
- "import numpy as np\n",
- "from IPython.display import display\n",
- "from matplotlib import patheffects\n",
- "from matplotlib import pyplot as plt\n",
- "from shapely import affinity\n",
- "from shapely.geometry import MultiPolygon, Point, Polygon\n",
- "from tqdm.auto import tqdm\n",
- "\n",
- "if TYPE_CHECKING:\n",
- " from numbers import Number\n",
- "\n",
- "sys.path.append(\"..\") # If running locally without pypi installed tiatoolbox\n",
- "\n",
- "from tiatoolbox import logger # noqa: E402\n",
- "from tiatoolbox.annotation.storage import ( # noqa: E402\n",
- " Annotation,\n",
- " DictionaryStore,\n",
- " SQLiteStore,\n",
- ")\n",
- "\n",
- "plt.style.use(\"ggplot\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "nW-UyVQOpT5u"
- },
- "source": [
- "## Data Generation & Utility Functions\n",
- "\n",
- "Here we define some useful functions to generate some artificial data\n",
- "and visualise results.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "N5xNW64UpT5v"
- },
- "outputs": [],
- "source": [
- "def cell_polygon(\n",
- " xy: tuple[Number, Number],\n",
- " n_points: int = 20,\n",
- " radius: Number = 8,\n",
- " noise: Number = 0.01,\n",
- " eccentricity: tuple[Number, Number] = (1, 3),\n",
- " direction: str = \"CCW\",\n",
- " seed: int = 0,\n",
- " *,\n",
- " repeat_first: bool = True,\n",
- ") -> Polygon:\n",
- " \"\"\"Generate a fake cell boundary polygon.\n",
- "\n",
- " Borrowed from tiatoolbox unit tests.\n",
- "\n",
- " Cell boundaries are generated an ellipsoids with randomised eccentricity,\n",
- " added noise, and a random rotation.\n",
- "\n",
- " Args:\n",
- " xy (tuple(int)): The x,y centre point to generate the cell boundary around.\n",
- " n_points (int): Number of points in the boundary. Defaults to 20.\n",
- " radius (float): Radius of the points from the centre. Defaults to 10.\n",
- " noise (float): Noise to add to the point locations. Defaults to 1.\n",
- " eccentricity (tuple(float)): Range of values (low, high) to use for\n",
- " randomised eccentricity. Defaults to (1, 3).\n",
- " repeat_first (bool): Enforce that the last point is equal to the first.\n",
- " direction (str): Ordering of the points. Defaults to \"CCW\". Valid options\n",
- " are: counter-clockwise \"CCW\", and clockwise \"CW\".\n",
- " seed: Seed for the random number generator. Defaults to 0.\n",
- "\n",
- " \"\"\"\n",
- " rand_state = np.random.default_rng().__getstate__()\n",
- " rng_seed = np.random.default_rng(seed)\n",
- "\n",
- " if repeat_first:\n",
- " n_points -= 1\n",
- "\n",
- " # Generate points about an ellipse with random eccentricity\n",
- " x, y = xy\n",
- " alpha = np.linspace(0, 2 * np.pi - (2 * np.pi / n_points), n_points)\n",
- " rx = radius * (rng_seed.random() + 0.5)\n",
- " ry = rng_seed.uniform(*eccentricity) * radius - 0.5 * rx\n",
- " x = rx * np.cos(alpha) + x + (rng_seed.random(n_points) - 0.5) * noise\n",
- " y = ry * np.sin(alpha) + y + (rng_seed.random(n_points) - 0.5) * noise\n",
- " boundary_coords = np.stack([x, y], axis=1).astype(int).tolist()\n",
- "\n",
- " # Copy first coordinate to the end if required\n",
- " if repeat_first:\n",
- " boundary_coords = [*boundary_coords, boundary_coords[0]]\n",
- "\n",
- " # Swap direction\n",
- " if direction.strip().lower() == \"cw\":\n",
- " boundary_coords = boundary_coords[::-1]\n",
- "\n",
- " polygon = Polygon(boundary_coords)\n",
- "\n",
- " # Add random rotation\n",
- " angle = rng_seed.random() * 360\n",
- " polygon = affinity.rotate(polygon, angle, origin=\"centroid\")\n",
- "\n",
- " # Restore the random state\n",
- " np.random.default_rng().__setstate__(rand_state)\n",
- "\n",
- " return polygon"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "jyQEBhNIpT5v"
- },
- "outputs": [],
- "source": [
- "def cell_grid(\n",
- " size: tuple[int, int] = (10, 10),\n",
- " spacing: Number = 25,\n",
- ") -> Generator[Polygon, None, None]:\n",
- " \"\"\"Generate a grid of cell boundaries.\"\"\"\n",
- " return (\n",
- " cell_polygon(xy=np.multiply(ij, spacing), repeat_first=False, seed=n)\n",
- " for n, ij in enumerate(np.ndindex(size))\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "VVjSum_9pT5v"
- },
- "outputs": [],
- "source": [
- "def plot_results(\n",
- " experiments: list[list[Number]],\n",
- " title: str,\n",
- " capsize: int = 5,\n",
- " **kwargs: dict[str, Any],\n",
- ") -> None:\n",
- " \"\"\"Plot the results of a benchmark.\n",
- "\n",
- " Uses the min for the bar height (see See\n",
- " https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat),\n",
- " and plots a min-max error bar.\n",
- "\n",
- " \"\"\"\n",
- " x = range(len(experiments))\n",
- " color = [f\"C{x_i}\" for x_i in x]\n",
- " plt.bar(\n",
- " x=x,\n",
- " height=[min(e) for e in experiments],\n",
- " color=color,\n",
- " yerr=[[0 for e in experiments], [max(e) - min(e) for e in experiments]],\n",
- " capsize=capsize,\n",
- " **kwargs,\n",
- " )\n",
- " for i, (runs, c) in enumerate(zip(experiments, color)):\n",
- " plt.text(\n",
- " i,\n",
- " min(runs),\n",
- " f\" {min(runs):.4f}s\",\n",
- " ha=\"left\",\n",
- " va=\"bottom\",\n",
- " color=c,\n",
- " zorder=10,\n",
- " fontweight=\"bold\",\n",
- " path_effects=[\n",
- " patheffects.withStroke(linewidth=2, foreground=\"w\"),\n",
- " ],\n",
- " )\n",
- " plt.title(title)\n",
- " plt.hlines(\n",
- " 0.5,\n",
- " -0.5,\n",
- " len(experiments) - 0.5,\n",
- " linestyles=\"dashed\",\n",
- " colors=\"black\",\n",
- " alpha=0.5,\n",
- " )\n",
- " plt.yscale(\"log\")\n",
- " plt.xlabel(\"Store Type\")\n",
- " plt.ylabel(\"Time (s)\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "tHEUErSmpT5w"
- },
- "source": [
- "## Display Some Generated Data\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "YUQmgohbpT5w",
- "outputId": "1a0cdee1-e32d-41e9-fb9d-26c5ee572880"
- },
- "outputs": [
+ "cells": [
{
- "data": {
- "image/svg+xml": " ",
- "text/plain": [
- ""
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aqPkpRk-pT5q"
+ },
+ "source": [
+ "# Benchmarking Annotation Storage\n",
+ "\n",
+ "Click to open in: \\[[GitHub](https://github.com/TissueImageAnalytics/tiatoolbox/tree/develop/benchmarks/annotation_store.ipynb)\\]\\[[Colab](https://colab.research.google.com/github/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\\[[Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\n",
+ "\n"
]
- },
- "metadata": {},
- "output_type": "display_data"
},
{
- "data": {
- "image/svg+xml": " ",
- "text/plain": [
- ""
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BS0G58BPpT5s"
+ },
+ "source": [
+ "_In order to run this notebook on a Kaggle platform, 1) click the Kaggle URL 2) click on Settings on the right of the Kaggle screen, 3) log in to your Kaggle account, 4) tick \"Internet\" checkbox under Settings, to enable necessary downloads._\n",
+ "\n",
+ "**NOTE:** Some parts of this notebook require a lot of memory. Part 2 in particular may not run on memory-constrained systems. The notebook will run well on a MacBook Air (M1, 2020) but will use a lot of swap. It may require >64GB of memory for the second half to avoid using swap.\n",
+ "\n"
]
- },
- "metadata": {},
- "output_type": "display_data"
},
{
- "data": {
- "image/svg+xml": " ",
- "text/plain": [
- ""
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EjHQXjqrpT5s"
+ },
+ "source": [
+ "## About This Notebook\n",
+ "\n",
+ "Managing annotations, whether created by hand or from model output, is a\n",
+ "common task in computational pathology. For a small number of\n",
+ "annotations this may be trivial. However, for large numbers of\n",
+ "annotations, it is often necessary to store the annotations in a more\n",
+ "structured format such as a database. This is because finding a desired\n",
+ "subset of annotations within a very large collection, for example over\n",
+ "one million cell boundary polygons derived from running HoVerNet on a\n",
+ "WSI, may be very slow if performed in a naive manner. In the toolbox, we\n",
+ "implement two storage methods to make handling annotations easier:\n",
+ "`DictionaryStore` and `SQLiteStore`.\n",
+ "\n",
+ "### Storage Classes\n",
+ "\n",
+ "Both stores act as a key-value store where the key is the annotation ID\n",
+ "(as a string) and the value is the annotation. This follows the Python\n",
+ "[`MutableMapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping)\n",
+ "interface meaning that the stores can be used in the same way as a\n",
+ "regular Python dictionary (`dict`).\n",
+ "\n",
+ "The `DictionaryStore` is implemented internally using a Python\n",
+ "dictionary. It is a relatively simple class, operating with all\n",
+ "annotations in memory and using a simple scan method to search for\n",
+ "annotations. This works very well for a small number of annotations. In\n",
+ "contrast the `SQLiteStore` is implemented using a SQLite database\n",
+ "(either in memory or on disk). It is a more complex class making use of\n",
+ "an rtree index to efficiently spatially search for annotations. This is\n",
+ "much more suited to a very large number of annotations. However, they\n",
+ "both follow the same interface and can be used interchangeably for\n",
+ "almost all methods (`SQLiteStore` has some additional methods).\n",
+ "\n",
+ "### Provided Functionality (Mini Tutorial)\n",
+ "\n",
+ "The storage classes provide a lot of functionality. This\n",
+ "includes all of the standard `MutableMapping` methods, as well as\n",
+ "some additional ones for querying the collection of annotations.\n",
+ "Below is a brief summary of the main functionality.\n",
+ "\n",
+ "#### Adding Annotations\n",
+ "\n",
+ "```python\n",
+ "from tiatoolbox.annotation.storage import Annotation, DictionaryStore, SQLiteStore\n",
+ "from shapely.geometry import Polygon\n",
+ "\n",
+ "# Create a new store. If no path is given it is an in-memory store.\n",
+ "store = DictionaryStore()\n",
+ "\n",
+ "# An annotation is a shapely geometry and a JSON serializable dictionary\n",
+ "annotation = Annotation(Polygon.from_bounds(0, 0, 1, 1), {\"id\": \"1\"})\n",
+ "\n",
+ "# Add the annotation to the store in the same way as a dictionary\n",
+ "store[\"foo\"] = annotation\n",
+ "\n",
+ "# Bulk append is also supported. This will be faster in some contexts\n",
+ "# (e.g. for an SQLiteStore) than adding them one at a time.\n",
+ "# Here we add 100 simple box annotations.\n",
+ "# As we have not specified a set of keys to use, a new UUID is generated\n",
+ "# for each. The respective generated keys are also returned.\n",
+ "annotations = [\n",
+ " Annotation(Polygon.from_bounds(n, n, n + 1, n + 1), {\"id\": n}) for n in range(100)\n",
+ "]\n",
+ "keys = store.append_many(annotations)\n",
+ "```\n",
+ "\n",
+ "#### Removing Annotations\n",
+ "\n",
+ "```python\n",
+ "# Remove an annotation by key\n",
+ "del store[\"foo\"]\n",
+ "\n",
+ "# Bulk removal\n",
+ "keys = [\"1234-5676....\", \"...\"] # etc.\n",
+ "store.remove_many(keys)\n",
+ "```\n",
+ "\n",
+ "#### Querying Within a Region\n",
+ "\n",
+ "```python\n",
+ "# Find all annotations which intersect a polygon\n",
+ "search_region = Polygon.from_bounds(0, 0, 10, 10)\n",
+ "result = store.query(search_region)\n",
+ "\n",
+ "# Find all annotations which are contained within a polygon\n",
+ "search_region = Polygon.from_bounds(0, 0, 10, 10)\n",
+ "result = store.query(search_region, geometry_predicate=\"contains\")\n",
+ "```\n",
+ "\n",
+ "#### Querying Using A Predicate Statement\n",
+ "\n",
+ "```python\n",
+ "# 'props' is a provided shorthand to access the 'properties' dictionary\n",
+ "results = store.query(where=\"props['id'] == 1\")\n",
+ "```\n",
+ "\n",
+ "#### Serializing and Deserializing\n",
+ "\n",
+ "```python\n",
+ "# Serialize the store to a GeoJSON string\n",
+ "json_string = store.to_geojson()\n",
+ "\n",
+ "# Serialize the store to a GeoJSON file\n",
+ "store.to_geojson(\"boxes.geojson\")\n",
+ "\n",
+ "# Deserialize a GeoJSON file into a store (even of a different type)\n",
+ "sqlitestore = SQLiteStore.from_geojson(\"boxes.geojson\")\n",
+ "\n",
+ "# The above is an in-memory store. We can also now write this to disk\n",
+ "# as an SQLite database.\n",
+ "sqlitestore.dump(\"boxes.db\")\n",
+ "```\n",
+ "\n",
+ "### Benchmarking\n",
+ "\n",
+ "Here we evaluate the storage efficiency and data querying performance of\n",
+ "the annotation store versus other common formats. We will evaluate some\n",
+ "common situations and use cases including:\n",
+ "\n",
+ "- Disk I/O (tested with an SSD)\n",
+ "- Querying the data for annotations within a box region\n",
+ "- Querying the data for annotations within a polygon region\n",
+ "- Querying the data with a predicate e.g. 'class=1'\n",
+ "\n",
+ "All saved output is from running this notebook on a 2020 M1 MacBook Air with 16GB RAM.\n",
+ "\n"
]
- },
- "metadata": {},
- "output_type": "display_data"
},
{
- "data": {
- "image/svg+xml": " ",
- "text/plain": [
- ""
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aov8ENq2pT5t"
+ },
+ "source": [
+ "## Imports\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UoMpbDXopT5t"
+ },
+ "outputs": [],
+ "source": [
+ "\"\"\"Import modules required to run the Jupyter notebook.\"\"\"\n",
+ "\n",
+ "from __future__ import annotations\n",
+ "\n",
+ "# Clear logger to use tiatoolbox.logger\n",
+ "import logging\n",
+ "\n",
+ "if logging.getLogger().hasHandlers():\n",
+ " logging.getLogger().handlers.clear()\n",
+ "\n",
+ "import copy\n",
+ "import pickle\n",
+ "import sys\n",
+ "import tempfile\n",
+ "import timeit\n",
+ "import uuid\n",
+ "from pathlib import Path\n",
+ "from typing import TYPE_CHECKING, Any\n",
+ "\n",
+ "import numpy as np\n",
+ "from IPython.display import display\n",
+ "from matplotlib import patheffects\n",
+ "from matplotlib import pyplot as plt\n",
+ "from shapely import affinity\n",
+ "from shapely.geometry import MultiPolygon, Point, Polygon\n",
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "if TYPE_CHECKING:\n",
+ " from collections.abc import Generator\n",
+ " from numbers import Number\n",
+ "\n",
+ "sys.path.append(\"..\") # If running locally without pypi installed tiatoolbox\n",
+ "\n",
+ "from tiatoolbox import logger\n",
+ "from tiatoolbox.annotation.storage import (\n",
+ " Annotation,\n",
+ " DictionaryStore,\n",
+ " SQLiteStore,\n",
+ ")\n",
+ "\n",
+ "plt.style.use(\"ggplot\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nW-UyVQOpT5u"
+ },
+ "source": [
+ "## Data Generation & Utility Functions\n",
+ "\n",
+ "Here we define some useful functions to generate some artificial data\n",
+ "and visualise results.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "N5xNW64UpT5v"
+ },
+ "outputs": [],
+ "source": [
+ "def cell_polygon(\n",
+ " xy: tuple[Number, Number],\n",
+ " n_points: int = 20,\n",
+ " radius: Number = 8,\n",
+ " noise: Number = 0.01,\n",
+ " eccentricity: tuple[Number, Number] = (1, 3),\n",
+ " direction: str = \"CCW\",\n",
+ " seed: int = 0,\n",
+ " *,\n",
+ " repeat_first: bool = True,\n",
+ ") -> Polygon:\n",
+ " \"\"\"Generate a fake cell boundary polygon.\n",
+ "\n",
+ " Borrowed from tiatoolbox unit tests.\n",
+ "\n",
+ " Cell boundaries are generated as ellipsoids with randomised eccentricity,\n",
+ " added noise, and a random rotation.\n",
+ "\n",
+ " Args:\n",
+ " xy (tuple(int)): The x,y centre point to generate the cell boundary around.\n",
+ " n_points (int): Number of points in the boundary. Defaults to 20.\n",
+ " radius (float): Radius of the points from the centre. Defaults to 8.\n",
+ " noise (float): Noise to add to the point locations. Defaults to 0.01.\n",
+ " eccentricity (tuple(float)): Range of values (low, high) to use for\n",
+ " randomised eccentricity. Defaults to (1, 3).\n",
+ " repeat_first (bool): Enforce that the last point is equal to the first.\n",
+ " direction (str): Ordering of the points. Defaults to \"CCW\". Valid options\n",
+ " are: counter-clockwise \"CCW\", and clockwise \"CW\".\n",
+ " seed: Seed for the random number generator. Defaults to 0.\n",
+ "\n",
+ " \"\"\"\n",
+ " rand_state = np.random.default_rng().__getstate__()\n",
+ " rng_seed = np.random.default_rng(seed)\n",
+ "\n",
+ " if repeat_first:\n",
+ " n_points -= 1\n",
+ "\n",
+ " # Generate points about an ellipse with random eccentricity\n",
+ " x, y = xy\n",
+ " alpha = np.linspace(0, 2 * np.pi - (2 * np.pi / n_points), n_points)\n",
+ " rx = radius * (rng_seed.random() + 0.5)\n",
+ " ry = rng_seed.uniform(*eccentricity) * radius - 0.5 * rx\n",
+ " x = rx * np.cos(alpha) + x + (rng_seed.random(n_points) - 0.5) * noise\n",
+ " y = ry * np.sin(alpha) + y + (rng_seed.random(n_points) - 0.5) * noise\n",
+ " boundary_coords = np.stack([x, y], axis=1).astype(int).tolist()\n",
+ "\n",
+ " # Copy first coordinate to the end if required\n",
+ " if repeat_first:\n",
+ " boundary_coords = [*boundary_coords, boundary_coords[0]]\n",
+ "\n",
+ " # Swap direction\n",
+ " if direction.strip().lower() == \"cw\":\n",
+ " boundary_coords = boundary_coords[::-1]\n",
+ "\n",
+ " polygon = Polygon(boundary_coords)\n",
+ "\n",
+ " # Add random rotation\n",
+ " angle = rng_seed.random() * 360\n",
+ " polygon = affinity.rotate(polygon, angle, origin=\"centroid\")\n",
+ "\n",
+ " # Restore the random state\n",
+ " np.random.default_rng().__setstate__(rand_state)\n",
+ "\n",
+ " return polygon"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jyQEBhNIpT5v"
+ },
+ "outputs": [],
+ "source": [
+ "def cell_grid(\n",
+ " size: tuple[int, int] = (10, 10),\n",
+ " spacing: Number = 25,\n",
+ ") -> Generator[Polygon, None, None]:\n",
+ " \"\"\"Generate a grid of cell boundaries.\"\"\"\n",
+ " return (\n",
+ " cell_polygon(xy=np.multiply(ij, spacing), repeat_first=False, seed=n)\n",
+ " for n, ij in enumerate(np.ndindex(size))\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VVjSum_9pT5v"
+ },
+ "outputs": [],
+ "source": [
+ "def plot_results(\n",
+ " experiments: list[list[Number]],\n",
+ " title: str,\n",
+ " capsize: int = 5,\n",
+ " **kwargs: dict[str, Any],\n",
+ ") -> None:\n",
+ " \"\"\"Plot the results of a benchmark.\n",
+ "\n",
+ " Uses the min for the bar height (see\n",
+ " https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat),\n",
+ " and plots a min-max error bar.\n",
+ "\n",
+ " \"\"\"\n",
+ " x = range(len(experiments))\n",
+ " color = [f\"C{x_i}\" for x_i in x]\n",
+ " plt.bar(\n",
+ " x=x,\n",
+ " height=[min(e) for e in experiments],\n",
+ " color=color,\n",
+ " yerr=[[0 for e in experiments], [max(e) - min(e) for e in experiments]],\n",
+ " capsize=capsize,\n",
+ " **kwargs,\n",
+ " )\n",
+ " for i, (runs, c) in enumerate(zip(experiments, color)):\n",
+ " plt.text(\n",
+ " i,\n",
+ " min(runs),\n",
+ " f\" {min(runs):.4f}s\",\n",
+ " ha=\"left\",\n",
+ " va=\"bottom\",\n",
+ " color=c,\n",
+ " zorder=10,\n",
+ " fontweight=\"bold\",\n",
+ " path_effects=[\n",
+ " patheffects.withStroke(linewidth=2, foreground=\"w\"),\n",
+ " ],\n",
+ " )\n",
+ " plt.title(title)\n",
+ " plt.hlines(\n",
+ " 0.5,\n",
+ " -0.5,\n",
+ " len(experiments) - 0.5,\n",
+ " linestyles=\"dashed\",\n",
+ " colors=\"black\",\n",
+ " alpha=0.5,\n",
+ " )\n",
+ " plt.yscale(\"log\")\n",
+ " plt.xlabel(\"Store Type\")\n",
+ " plt.ylabel(\"Time (s)\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tHEUErSmpT5w"
+ },
+ "source": [
+ "## Display Some Generated Data\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "YUQmgohbpT5w",
+ "outputId": "1a0cdee1-e32d-41e9-fb9d-26c5ee572880"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": " ",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/svg+xml": " ",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/svg+xml": " ",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/svg+xml": " ",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for n in range(4):\n",
+ " display(cell_polygon(xy=(0, 0), n_points=20, repeat_first=False, seed=n))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "APUNL2PtpT5w"
+ },
+ "source": [
+ "### Randomised Cell Boundaries\n",
+ "\n",
+ "Here we create a function to generate a grid of cells for testing. It uses a fixed seed for reproducibility.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SOpBKM7IpT5w"
+ },
+ "source": [
+ "### A Sample 5×5 Grid\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2xA-oG4VpT5w",
+ "outputId": "caea51e4-8a27-4dd1-ed0d-c272b93d8bb7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": " ",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "MultiPolygon(polygons=list(cell_grid(size=(5, 5), spacing=35)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "b6S8vzFipT5w"
+ },
+ "source": [
+ "# Part 1: Small Scale Benchmarking of Annotation Storage\n",
+ "\n",
+ "Using the already defined data generation functions (`cell_polygon` and\n",
+ "`cell_grid`), we create some simple artificial cell boundaries by\n",
+ "creating a circle of points, adding some noise, scaling to introduce\n",
+ "eccentricity, and then rotating. We use 20 points per cell, which is a\n",
+ "reasonably high value for cell annotation. However, this can be\n",
+ "adjusted.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UZMoLDvkpT5x"
+ },
+ "source": [
+ "## 1.1) Appending Annotations (In-Memory & Disk I/O)\n",
+ "\n",
+ "Here we test:\n",
+ "\n",
+ "1. A python dictionary based in-memory store (`DictionaryStore`)\n",
+ "1. An SQLite database based in-memory store (`SQLiteStore`)\n",
+ "\n",
+ "Both of these stores may operate in memory. The `SQLiteStore` may also\n",
+ "be backed by an on-disk file for datasets which are too large to fit in\n",
+ "memory. The `DictionaryStore` class can serialise/deserialise itself\n",
+ "to/from disk in a line-delimited GeoJSON format (each line separated\n",
+ "by `\\n` is a valid GeoJSON object).\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DZBiw_EepT5x"
+ },
+ "outputs": [],
+ "source": [
+ "# Convert to annotations (a dataclass pairing a geometry and (optional)\n",
+ "# key-value properties)\n",
+ "# Run time: ~2s\n",
+ "annotations = [\n",
+ " Annotation(polygon) for polygon in cell_grid(size=(100, 100), spacing=35)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LUVa03F2pT5x"
+ },
+ "source": [
+ "### 1.1.1) In Memory Append\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7PzE7AhdpT5x",
+ "outputId": "974bb3d0-3290-4315-a6fc-3b7ca90072a6"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~5s\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"dict_store.append_many(annotations)\",\n",
+ " setup=\"dict_store = DictionaryStore()\",\n",
+ " globals={\"DictionaryStore\": DictionaryStore, \"annotations\": annotations},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"sql_store.append_many(annotations)\",\n",
+ " setup=\"sql_store = SQLiteStore()\",\n",
+ " globals={\"SQLiteStore\": SQLiteStore, \"annotations\": annotations},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"Time to Append 10,000 Annotations In Memory\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n",
+ "plt.xlim([-0.5, 1.5])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gU6PLE7wpT5x"
+ },
+ "source": [
+ "Note that inserting into the `SQLiteStore` is much slower than the\n",
+ "`DictionaryStore`. Appending to a `DictionaryStore` simply requires\n",
+ "adding a memory reference to a dictionary. Therefore, this is a very\n",
+ "fast operation. On the other hand, for the `SQLiteStore`, the insertion\n",
+ "is slower because the data must be serialised for the database and the\n",
+ "R-Tree spatial index must also be updated. Updating the index is a\n",
+ "relatively expensive operation. However, this spatial index allows for\n",
+ "very fast queries of a very large set of annotations within a set of\n",
+ "spatial bounds.\n",
+ "\n",
+ "Insertion is typically only performed once for each\n",
+ "annotation, whereas queries may be performed many times on the\n",
+ "annotation set. Therefore, it makes sense to trade a more expensive\n",
+ "insertion for fast queries as the cost of insertion will be amortised\n",
+ "over a number of queries on the data. Additionally, data may be written\n",
+ "to the database from multiple threads or subprocesses (so long as a new\n",
+ "instance of `SQLiteStore` is created for each thread or subprocess to\n",
+ "attach to a database on disk) thus freeing up the main thread.\n",
+ "\n",
+ "For comparison, we also benchmark bulk insertion plus serialising to disk\n",
+ "as line-delimited GeoJSON from the `DictionaryStore` as this is the\n",
+ "default serialisation to disk method (`DictionaryStore.dump(file_path)`).\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "t2q9QTCfpT5x",
+ "outputId": "2202c328-ba48-476b-8efa-662678d75135"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~10s\n",
+ "\n",
+ "setup = \"fp.truncate(0)\\nstore = Store(fp)\" # Clear the file\n",
+ "\n",
+ "# Time dictionary store\n",
+ "with tempfile.NamedTemporaryFile(\"w+\") as fp:\n",
+ " dict_runs = timeit.repeat(\n",
+ " (\"store.append_many(annotations)\\nstore.commit()\"),\n",
+ " setup=setup,\n",
+ " globals={\"Store\": DictionaryStore, \"annotations\": annotations, \"fp\": fp},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ " )\n",
+ "\n",
+ "# Time SQLite store\n",
+ "with tempfile.NamedTemporaryFile(\"w+b\") as fp:\n",
+ " sqlite_runs = timeit.repeat(\n",
+ " (\"store.append_many(annotations)\\nstore.commit()\"),\n",
+ " setup=setup,\n",
+ " globals={\"Store\": SQLiteStore, \"annotations\": annotations, \"fp\": fp},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ " )\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"Time to Append & Serialise 10,000 Annotations To Disk\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n",
+ "plt.xlim([-0.5, 1.5])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LKr6FmctpT5x"
+ },
+ "source": [
+ "Here we can see that when we include the serialisation to disk in the\n",
+ "benchmark, the time to insert is much more similar.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "V7WV8wNmpT5x"
+ },
+ "source": [
+ "## 1.2) Box Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "eul4PYZPpT5x",
+ "outputId": "a0131a72-f527-48b1-8aac-8cbccfced2ed"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~20s\n",
+ "\n",
+ "# One time Setup\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "dict_store.append_many(annotations)\n",
+ "sql_store.append_many(annotations)\n",
+ "\n",
+ "rng = np.random.default_rng(123)\n",
+ "boxes = [\n",
+ " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n",
+ "]\n",
+ "stmt = \"for box in boxes:\\n _ = store.query(box)\"\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": dict_store, \"boxes\": boxes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": sql_store, \"boxes\": boxes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"100 Box Queries\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "z9ntCgKapT5x"
+ },
+ "source": [
+ "Here we can see that the `SQLiteStore` is a bit faster. Additionally,\n",
+ "the difference in performance is more pronounced when there are more\n",
+ "annotations in the store (as we will see later in this notebook) or when\n",
+ "just returning keys:\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vfGH6e4upT5x",
+ "outputId": "7cf8bf30-a4c9-4de5-9a5f-f9fd6cffc141"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~15s\n",
+ "\n",
+ "# One time Setup\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "dict_store.append_many(annotations)\n",
+ "sql_store.append_many(annotations)\n",
+ "\n",
+ "rng = np.random.default_rng(123)\n",
+ "boxes = [\n",
+ " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n",
+ "]\n",
+ "stmt = \"for box in boxes:\\n _ = store.iquery(box)\" # Just return the keys (uuids)\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": dict_store, \"boxes\": boxes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": sql_store, \"boxes\": boxes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"100 Box Queries (Key Lookup Only)\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "xVQlsK1MpT5y"
+ },
+ "source": [
+ "## 1.3) Polygon Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fnkdnKWRpT5y",
+ "outputId": "03ccc35c-df96-4d68-9d53-72ac835a9088"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~15s\n",
+ "\n",
+ "# One time Setup\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "dict_store.append_many(annotations)\n",
+ "sql_store.append_many(annotations)\n",
+ "\n",
+ "rng = np.random.default_rng(123)\n",
+ "query_polygons = [\n",
+ " Polygon(\n",
+ " [\n",
+ " (x, y),\n",
+ " (x + 128, y),\n",
+ " (x + 128, y + 128),\n",
+ " (x, y),\n",
+ " ],\n",
+ " )\n",
+ " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
+ "]\n",
+ "stmt = \"for polygon in query_polygons:\\n _ = store.query(polygon)\"\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": dict_store, \"query_polygons\": query_polygons},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": sql_store, \"query_polygons\": query_polygons},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"100 Polygon Queries\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1k1xOgB5pT5y"
+ },
+ "source": [
+ "Here we can see that performing queries within a polygon region is about\n",
+ "10x faster with the `SQLiteStore` than with the `DictionaryStore`.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iYFK95w1pT5y"
+ },
+ "source": [
+ "## 1.4) Predicate Query\n",
+ "\n",
+ "Here we query the whole annotation region but with a predicate to\n",
+ "select only annotations with a given class label whose `vector`\n",
+ "property contains the value 3. We also demonstrate how creating a\n",
+ "database index can dramatically improve the performance of queries.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zNX4UG4BpT5y",
+ "outputId": "97444739-4aa5-42c7-bebc-84a022282ac7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~2m\n",
+ "\n",
+ "# Setup\n",
+ "labelled_annotations = copy.deepcopy(annotations)\n",
+ "for n, annotation in enumerate(labelled_annotations):\n",
+ " annotation.properties[\"class\"] = n % 10\n",
+ " annotation.properties[\"vector\"] = rng.integers(1, 4, 10).tolist()\n",
+ "\n",
+ "predicate = \"(props['class'] == ?) & (3 in props['vector'])\"\n",
+ "classes = rng.integers(0, 10, size=100)\n",
+ "stmt = \"for n in classes:\\n store.query(where=predicate.replace('?', str(n)))\"\n",
+ "\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "\n",
+ "dict_store.append_many(labelled_annotations)\n",
+ "sql_store.append_many(labelled_annotations)\n",
+ "\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": dict_store, \"predicate\": predicate, \"classes\": classes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "dict_result = dict_store.query(where=predicate.replace(\"?\", \"0\"))\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "sql_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n",
+ "\n",
+ "\n",
+ "# Add an index\n",
+ "# Note: Indexes may not always speed up the query (sometimes they can\n",
+ "# actually slow it down), test to make sure.\n",
+ "sql_store.create_index(\"class_lookup\", \"props['class']\")\n",
+ "sql_store.create_index(\"has_3\", \"3 in props['vector']\")\n",
+ "\n",
+ "# Time SQLite store again\n",
+ "sqlite_index_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "sql_index_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n",
+ "\n",
+ "# # Validate the results against each other\n",
+ "# for a, b, c in zip(dict_result, sql_result, sql_index_result):\n",
+ "# assert a.geometry == b.geometry == c.geometry # noqa: ERA001\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs, sqlite_index_runs],\n",
+ " title=\"100 Queries with a Predicate\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"SQLiteStore\\n(with index)\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gp8mq1TNpT5y"
+ },
+ "source": [
+ "### Polygon & Predicate Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Eu0hGvhdpT5y",
+ "outputId": "0d89174e-01e0-4e71-a9c3-e063ed30ca38"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~10s\n",
+ "\n",
+ "# Setup\n",
+ "labelled_annotations = copy.deepcopy(annotations)\n",
+ "for n, annotation in enumerate(labelled_annotations):\n",
+ " annotation.properties[\"class\"] = n % 10\n",
+ "\n",
+ "predicate = \"props['class'] == \"\n",
+ "classes = rng.integers(0, 10, size=50)\n",
+ "query_polygons = [\n",
+ " Polygon(\n",
+ " [\n",
+ " (x, y),\n",
+ " (x + 128, y),\n",
+ " (x + 128, y + 128),\n",
+ " (x, y),\n",
+ " ],\n",
+ " )\n",
+ " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
+ "]\n",
+ "stmt = (\n",
+ " \"for n, poly in zip(classes, query_polygons):\\n\"\n",
+ " \" store.query(poly, where=predicate + str(n))\"\n",
+ ")\n",
+ "\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "\n",
+ "dict_store.append_many(labelled_annotations)\n",
+ "sql_store.append_many(labelled_annotations)\n",
+ "\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\n",
+ " \"store\": dict_store,\n",
+ " \"predicate\": predicate,\n",
+ " \"classes\": classes,\n",
+ " \"query_polygons\": query_polygons,\n",
+ " },\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "dict_result = dict_store.query(query_polygons[0], where=predicate + \"0\")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\n",
+ " \"store\": sql_store,\n",
+ " \"predicate\": predicate,\n",
+ " \"classes\": classes,\n",
+ " \"query_polygons\": query_polygons,\n",
+ " },\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "sql_result = sql_store.query(query_polygons[0], where=predicate + \"0\")\n",
+ "\n",
+ "\n",
+ "# Check that the set difference of bounding boxes is empty i.e. all sets\n",
+ "# of results contain polygons which produce the same set of bounding\n",
+ "# boxes. This avoids being tripped up by slight variations in order or\n",
+ "# coordinate order between the results.\n",
+ "dict_set = {x.geometry.bounds for x in dict_result}\n",
+ "sql_set = {x.geometry.bounds for x in sql_result}\n",
+ "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"100 Queries with a Polygon and Predicate\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kJ8x5tJmpT5y"
+ },
+ "source": [
+ "### Complex Predicate Query\n",
+ "\n",
+ "Here we slightly increase the complexity of the predicate to show how\n",
+ "the complexity of a predicate can dramatically affect the performance\n",
+ "when handling many annotations.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VHb4PqbHpT5y",
+ "outputId": "343b44c7-741d-4e11-9dd2-85f357ba6f32"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run time: ~1m\n",
+ "\n",
+ "# Setup\n",
+ "box = Polygon.from_bounds(0, 0, 1024, 1024)\n",
+ "labelled_annotations = copy.deepcopy(annotations)\n",
+ "for n, annotation in enumerate(labelled_annotations):\n",
+ " annotation.properties[\"class\"] = n % 4\n",
+ " annotation.properties[\"n\"] = n\n",
+ "\n",
+ "predicate = \"(props['n'] > 1000) & (props['n'] % 4 == 0) & (props['class'] == \"\n",
+ "targets = rng.integers(0, 4, size=100)\n",
+ "stmt = \"for n in targets:\\n store.query(box, where=predicate + str(n) + ')')\"\n",
+ "\n",
+ "dict_store = DictionaryStore()\n",
+ "sql_store = SQLiteStore()\n",
+ "\n",
+ "dict_store.append_many(labelled_annotations)\n",
+ "sql_store.append_many(labelled_annotations)\n",
+ "\n",
+ "\n",
+ "# Time dictionary store\n",
+ "dict_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\n",
+ " \"store\": dict_store,\n",
+ " \"predicate\": predicate,\n",
+ " \"targets\": targets,\n",
+ " \"box\": box,\n",
+ " },\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "dict_result = dict_store.query(box, where=predicate + \"0)\")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " stmt,\n",
+ " globals={\n",
+ " \"store\": sql_store,\n",
+ " \"predicate\": predicate,\n",
+ " \"targets\": targets,\n",
+ " \"box\": box,\n",
+ " },\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "sql_result = sql_store.query(box, where=predicate + \"0)\")\n",
+ "\n",
+ "\n",
+ "# Check that the set difference of bounding boxes is empty i.e. all sets\n",
+ "# of results contain polygons which produce the same set of bounding\n",
+ "# boxes. This avoids being tripped up by slight variations in order or\n",
+ "# coordinate order between the results.\n",
+ "dict_set = {x.geometry.bounds for x in dict_result.values()}\n",
+ "sql_set = {x.geometry.bounds for x in sql_result.values()}\n",
+ "\n",
+ "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n",
+ "\n",
+ "# Plot the results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"100 Queries with a Complex Predicate\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CAT0KmS6pT5y"
+ },
+ "source": [
+ "# Part 2: Large Scale Dataset Benchmarking\n",
+ "\n",
+ "Here we generate some sets of annotations with five million items each\n",
+ "(in a 2237 x 2237 grid). One is a set of points, the other a set of\n",
+ "generated cell boundaries.\n",
+ "\n",
+ "The code to generate and write out the annotations to various formats is\n",
+ "included in the following cells. However, some of these take a very long\n",
+ "time to run. A pre-generated dataset is downloaded and then read from\n",
+ "disk instead to save time. However, you may uncomment the generation\n",
+ "code to replicate the original.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nwH5zYFupT5y"
+ },
+ "source": [
+ "## 2.1) Points Dataset\n",
+ "\n",
+ "Here we generate a simple points dataset in a grid. The grid is 2237 x 2237\n",
+ "and contains over 5 million points. We also write this to disk in\n",
+ "various formats. Some formats take a long time and are commented out. A\n",
+ "summary of times for a consumer laptop is shown in a table at the end.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2FjCL2jgpT5y"
+ },
+ "outputs": [],
+ "source": [
+ "# Generate some points with a little noise\n",
+ "# Run time: ~5s\n",
+ "points = np.array(\n",
+ " [\n",
+ " [x, y]\n",
+ " for x in np.linspace(0, 75_000, 2237)\n",
+ " for y in np.linspace(0, 75_000, 2237)\n",
+ " ],\n",
+ ")\n",
+ "# Add some noise between -1 and 1\n",
+ "rng_42 = np.random.default_rng(42)\n",
+ "points += rng_42.uniform(-1, 1, size=(2237**2, 2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DRWABSBVpT5z"
+ },
+ "source": [
+ "### 2.1.1) Writing To Disk\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "x76WbSFdpT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Save as a simple Numpy array (.npy)\n",
+ "# Run time: <1s\n",
+ "np.save(\"points.npy\", points)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "dkKtM-DKpT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Save as compressed NumPy archive (.npz)\n",
+ "# Run time: ~5s\n",
+ "np.savez_compressed(\"points.npz\", points)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rbHdEIbPpT52"
+ },
+ "source": [
+ "Note that the above numpy format is missing the keys (UUIDs) of each point.\n",
+ "This may not be required in all cases. However, for the sake of comparison\n",
+ "we also generate a NumPy archive with keys included. We store the UUIDs\n",
+ "as integers to save space and for a fair comparison where the optimal\n",
+ "storage method is used in each case. Note however that UUIDs are too\n",
+ "large to be a standard C type and therefore are stored as an object\n",
+ "array.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DbLm4l5tpT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Generate UUIDs\n",
+ "# Run time: ~10s\n",
+ "keys = np.array([uuid.uuid4().int for _ in range(len(points))])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zXuAqw0KpT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Generate some UUIDs as keys\n",
+ "# Save in NumPy format (.npz)\n",
+ "# Run time: <1s\n",
+ "np.savez(\"uuid_points.npz\", keys=keys, coords=points)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UAHAgPU4pT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Save in compressed (zip) NumPy format (.npz)\n",
+ "# Run time: ~10s\n",
+ "np.savez_compressed(\"uuid_points_compressed.npz\", keys=keys, coords=points)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "j5wlDFYfpT52"
+ },
+ "outputs": [],
+ "source": [
+ "# Write to SQLite with SQLiteStore\n",
+ "# Run time: ~10m\n",
+ "points_sqlite_store = SQLiteStore(\"points.db\")\n",
+ "_ = points_sqlite_store.append_many(\n",
+ " annotations=(Annotation(Point(x, y)) for x, y in points),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tUekiEqspT53"
+ },
+ "outputs": [],
+ "source": [
+ "# Load a DictionaryStore into memory by copying from the SQLiteStore\n",
+ "# Run time: ~1m 30s\n",
+ "points_dict_store = DictionaryStore(Path(\"points.ndjson\"))\n",
+ "for key, value in points_sqlite_store.items():\n",
+ " points_dict_store[key] = value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Uynntjq7pT53"
+ },
+ "outputs": [],
+ "source": [
+ "# Save as GeoJSON\n",
+ "# Run time: ~1m 30s\n",
+ "points_sqlite_store.to_geojson(\"points.geojson\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4YMuggcgpT53"
+ },
+ "outputs": [],
+ "source": [
+ "# Save as ndjson\n",
+ "# Run time: ~1m 30s\n",
+ "# Spec: https://github.com/ndjson/ndjson-spec\n",
+ "points_sqlite_store.to_ndjson(\"points.ndjson\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lW9NoCPwpT53"
+ },
+ "source": [
+ "### 2.1.2) Points Dataset Statistics Summary\n",
+ "\n",
+ "| Format | Write Time | Size |\n",
+ "| -----------------------------: | ---------: | -----: |\n",
+ "| SQLiteStore (.db) | 6m 20s | 893 MB |\n",
+ "| ndjson | 1m 23s | 667 MB |\n",
+ "| GeoJSON | 1m 42s | 500 MB |\n",
+ "| NumPy + UUID (.npz) | 0.5s | 165 MB |\n",
+ "| NumPy + UUID Compressed (.npz) | 31s | 136 MB |\n",
+ "| NumPy (.npy) | 0.1s | 76 MB |\n",
+ "| NumPy Compressed (.npz) | 3.3s | 66 MB |\n",
+ "\n",
+ "Note that the points SQLite database is significantly larger than the\n",
+ "NumPy arrays on disk. The numpy array is much more storage efficient\n",
+ "partly because there is no R Tree index or unique identifier (UUID)\n",
+ "stored for each point. For a more fair comparison, another NumPy archive\n",
+ "(.npz) is created where the keys are stored along with the coordinates.\n",
+ "\n",
+ "Also note that although the compressed NumPy representation is much\n",
+ "smaller, it must be decompressed in memory before it can be used. The\n",
+ "uncompressed versions may be memory mapped if their size exceeds the\n",
+ "available memory.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "a_3Gz5Q0pT53"
+ },
+ "source": [
+ "### 2.1.3) Simple Box Query\n",
+ "\n",
+ "Here we evaluate the performance of performing a simple box query on the\n",
+ "data. All points which are in the area between 128 and 256 in the x and\n",
+ "y coordinates are retrieved. It is assumed that the data is already in\n",
+ "memory for the NumPy formats. In reality this would not be the case for\n",
+ "the first query, as all data would have to be read from disk, which is a\n",
+ "significant overhead. However, this cost is amortised across many\n",
+ "queries. To ensure the fairest possible comparison, it is assumed that\n",
+ "many queries will be performed, and that this data loading cost is\n",
+ "negligible.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "o9J0d6gdpT53"
+ },
+ "outputs": [],
+ "source": [
+ "box = Polygon.from_bounds(128, 128, 256, 256)\n",
+ "\n",
+ "# Time numpy\n",
+ "numpy_runs = timeit.repeat(\n",
+ " (\n",
+ " \"where = np.all([\"\n",
+ " \"points[:, 0] > 128,\"\n",
+ " \"points[:, 0] < 256,\"\n",
+ " \"points[:, 1] > 128,\"\n",
+ " \"points[:, 1] < 256\"\n",
+ " \"], 0)\\n\"\n",
+ " \"uuids = keys[where]\\n\"\n",
+ " \"result = points[where]\\n\"\n",
+ " ),\n",
+ " globals={\"keys\": keys, \"points\": points, \"np\": np},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time SQLiteStore\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"store.query(box)\",\n",
+ " globals={\"store\": points_sqlite_store, \"box\": box},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time DictionaryStore\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"store.query(box)\",\n",
+ " globals={\"store\": points_dict_store, \"box\": box},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "eX1qqUIipT53",
+ "outputId": "a4033a88-6b2d-4a55-f3f6-ba419ef748c0"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs, numpy_runs],\n",
+ " title=\"Points Box Query (5 Million Points)\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"NumPy Array\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aNU6FP90pT53"
+ },
+ "source": [
+ "Although the NumPy array is very space efficient on disk, it is not as\n",
+ "fast to query as the `SQLiteStore`. The `SQLiteStore` is likely faster\n",
+ "due to the use of the R tree index. Furthermore, the method used to\n",
+ "store the points in a NumPy array is limited in that it does not use\n",
+ "UUIDs, which makes merging two datasets more difficult as the indexes of\n",
+ "points no longer uniquely identify them. Additionally, only homogeneous\n",
+ "data such as two-dimentional coordinates can be practically stored in\n",
+ "this way. If the user would like to store variable length data\n",
+ "structures such as polygons, or even mix data types by storing both\n",
+ "points and polygons, then using raw NumPy arrays in this way can become\n",
+ "cumbersome and begins to offer little benefit in terms of storage\n",
+ "efficiency or query performance.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "c766NXGPpT53"
+ },
+ "source": [
+ "### 2.1.4) Polygon Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6jiMpRnxpT53"
+ },
+ "outputs": [],
+ "source": [
+ "big_triangle = Polygon(\n",
+ " shell=[ # noqa: S604\n",
+ " (1024, 1024),\n",
+ " (1024, 4096),\n",
+ " (4096, 4096),\n",
+ " (1024, 1024),\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "# Time SQLiteStore\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"store.query(polygon)\",\n",
+ " globals={\"store\": points_sqlite_store, \"polygon\": big_triangle},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")\n",
+ "\n",
+ "# Time DictionaryStore\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"store.query(polygon)\",\n",
+ " globals={\"store\": points_dict_store, \"polygon\": big_triangle},\n",
+ " number=1,\n",
+ " repeat=10,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Es2OQ5OdpT53",
+ "outputId": "b98176ee-7003-49f7-f5ca-62b08180b2ee"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"Polygon Query (5 Million Points)\",\n",
+ " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HUBEmZDMpT53"
+ },
+ "source": [
+ "## 2.2) Cell Boundary Polygons Dataset\n",
+ "\n",
+ "Here we generate a much larger and more complex polygon dataset. This\n",
+ "consists of a grid of over 5 million generated cell-boundary-like\n",
+ "polygons.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xhCr_TDVpT53",
+ "outputId": "c02b7a20-6ab1-4cae-b6bb-fb5c6d94cd12"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 5004169/5004169 [10:04<00:00, 8277.35it/s] \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Generate a grid of 5 million cell boundary polygons (2237 x 2237)\n",
+ "# Run time: ~10m\n",
+ "rng_42 = np.random.default_rng(42)\n",
+ "\n",
+ "cell_polygons = [\n",
+ " Annotation(geometry=polygon, properties={\"class\": rng_42.integers(0, 4)})\n",
+ " for polygon in tqdm(cell_grid(size=(2237, 2237), spacing=35), total=2237**2)\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "21RgwKtgpT54"
+ },
+ "source": [
+ "### 2.2.1) Write To Formats For Comparison\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CDVLMRUtpT54"
+ },
+ "outputs": [],
+ "source": [
+ "# Write to an SQLiteStore on disk (SSD for recorded times here)\n",
+ "# Run time: ~30m\n",
+ "cell_sqlite_store = SQLiteStore(\"cells.db\")\n",
+ "_ = cell_sqlite_store.append_many(annotations=cell_polygons)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6Fb4tQHVpT54",
+ "outputId": "fba12c47-e0cb-44fd-ca95-35c38454c9cc"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ }
+ ],
+ "source": [
+ "# Create a copy as an in memory DictionaryStore\n",
+ "# Run time: ~5m\n",
+ "cell_dict_store = DictionaryStore()\n",
+ "for key, value in tqdm( # Show a nice progress bar\n",
+ " cell_sqlite_store.items(),\n",
+ " total=len(cell_sqlite_store),\n",
+ " leave=False,\n",
+ " position=0,\n",
+ "):\n",
+ " cell_dict_store[key] = value"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wXOOuGWypT54",
+ "outputId": "e2fb300e-e5b8-4459-b172-249cda363b50"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 5004169/5004169 [01:26<00:00, 58002.74it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Transform into a numpy array\n",
+ "# Run Time: ~1m\n",
+ "cell_polygons_np = np.array(\n",
+ " [np.array(a.geometry.exterior.coords) for a in tqdm(cell_polygons)],\n",
+ " dtype=object,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yv9VgW9TpT54"
+ },
+ "outputs": [],
+ "source": [
+ "# Create an Nx4 index of (xmin, ymin, xmax, ymax) as a simple spatial\n",
+ "# index to speed up the numpy query.\n",
+ "# Run time: ~1m\n",
+ "min_max_index = np.array(\n",
+ " [(*np.min(coords, 0), *np.max(coords, 0)) for coords in cell_polygons_np],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nFmHxwBwpT54"
+ },
+ "outputs": [],
+ "source": [
+ "# Write to GeoJSON\n",
+ "# Run time: ~10m\n",
+ "\n",
+ "cell_dict_store.to_geojson(\"cells.geojson\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2UH6WdmipT54"
+ },
+ "outputs": [],
+ "source": [
+ "# Write to line delimited JSON (ndjson)\n",
+ "# Run time: ~10m\n",
+ "\n",
+ "cell_dict_store.to_ndjson(\"cells.ndjson\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fw6wg5gapT54",
+ "outputId": "61a32277-fb8d-4bdc-be28-b379cb0a23eb"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cells.ndjson : 40.82% ( 8.82 GiB => 3.60 GiB, cells.ndjson.zstd) \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Zstandard compression of ndjson to demonstrate how well it compresses.\n",
+ "# Gzip may also be used but is slower to compress.\n",
+ "# Run time: ~1m\n",
+ "! zstd -f -k cells.ndjson -o cells.ndjson.zstd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rzGC65zhpT55",
+ "outputId": "75ad772b-5641-4d64-ae16-7d50206e1b85"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cells.db : 75.87% ( 4.87 GiB => 3.69 GiB, cells.db.zstd) \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Zstandard compression of sqlite to demonstrate how well it compresses.\n",
+ "# Gzip may also be used but is slower to compress.\n",
+ "# Run time: ~20s\n",
+ "! zstd -f -k cells.db -o cells.db.zstd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xT0KZLxdpT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Write as a pickle (list)\n",
+ "# Run time: ~2m\n",
+ "with Path(\"cells.pickle\").open(\"wb\") as fh:\n",
+ " pickle.dump(cell_polygons, fh)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-TAWGEu9pT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Write as a pickle (dict)\n",
+ "# Run time: ~15m\n",
+ "with Path(\"cells-dict.pickle\").open(\"wb\") as fh:\n",
+ " pickle.dump(cell_dict_store._rows, fh) # noqa: SLF001"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "I-W4o3GepT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Write dictionary store to a pickle\n",
+ "# Run time: ~20m\n",
+ "with Path(\"cells.pickle\").open(\"wb\") as fh:\n",
+ " pickle.dump(cell_dict_store, fh)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "dALe8k0BpT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Write as numpy object array (similar to writing out with pickle),\n",
+ "# Numpy cannot handle ragged arrays and therefore dtype must be object.\n",
+ "# Run time: ~30m\n",
+ "np.save(\"cells.npy\", np.asanyarray(cell_polygons_np, dtype=object))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hOrGS0HgpT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Create UUIDs, and get the class labels for each cell boundary\n",
+ "# Run time: ~2m\n",
+ "_uuids = [str(uuid.uuid4()) for _ in cell_polygons]\n",
+ "_cls = [x.properties[\"class\"] for x in cell_polygons]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Fs2cz8lVpT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Write as NumPy archive (.npz) with uuid and min_max_index\n",
+ "# Run time: ~40m\n",
+ "np.savez(\n",
+ " \"cells.npz\",\n",
+ " uuids=_uuids,\n",
+ " polygons=cell_polygons_np,\n",
+ " min_max_index=min_max_index,\n",
+ " cls=_cls,\n",
+ ")\n",
+ "\n",
+ "del _uuids, _cls"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4gOTqc03pT55"
+ },
+ "source": [
+ "### 2.2.2) Time To Write Summary Statistics\n",
+ "\n",
+ "The following is a summary of the time required to write each format to\n",
+ "disk and the total disk space occupied by the final output.\n",
+ "\n",
+ "Note that some of these formats, such as GeoJSON, compress well with\n",
+ "schemes such as gzip and zstd, reducing the disk space by approximately\n",
+ "half. Statistics for zstd-compressed data are also reported below. It\n",
+ "should be noted that the data must be decompressed to be usable.\n",
+ "However, for gzip and zstd, this may be done in a streaming fashion from\n",
+ "disk.\n",
+ "\n",
+ "| Format | Write Time | Size |\n",
+ "| ----------------: | ---------: | -----: |\n",
+ "| SQLiteStore (.db) | 33m 48.4s | 4.9 GB |\n",
+ "| GeoJSON | 11m 32.9s | 8.9 GB |\n",
+ "| ndjson | 9m 0.9s | 8.8 GB |\n",
+ "| pickle | 1m 2.9s | 1.8 GB |\n",
+ "| zstd (SQLite) | 18.2s | 3.7 GB |\n",
+ "| zstd (ndjson) | 43.7s | 3.6 GB |\n",
+ "| NumPy (.npy) | 50.3s | 1.8 GB |\n",
+ "| NumPy (.npz) | 55.3s | 2.6 GB |\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wS3sGpnWpT55"
+ },
+ "source": [
+ "### 2.2.3) Box Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MKvKfkyvpT55"
+ },
+ "outputs": [],
+ "source": [
+ "# Run time: ~5m\n",
+ "\n",
+ "# Setup\n",
+ "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n",
+ "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n",
+ "\n",
+ "\n",
+ "# Time DictionaryStore\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"store.query(box)\",\n",
+ " globals={\"store\": cell_dict_store, \"box\": box},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"store.query(box)\",\n",
+ " globals={\"store\": cell_sqlite_store, \"box\": box},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0Yo14C3kpT55",
+ "outputId": "764bc28b-3072-4887-ea88-4c88ffcefb5f"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"Box Query (5 Million Polygons)\",\n",
+ " tick_label=[\n",
+ " \"DictionaryStore\",\n",
+ " \"SQLiteStore\",\n",
+ " ],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ExF-fOGQpT56"
+ },
+ "source": [
+ "### 2.2.4) Polygon Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PcxKapqNpT56"
+ },
+ "outputs": [],
+ "source": [
+ "# Run Time: 35s\n",
+ "\n",
+ "# Setup\n",
+ "big_triangle = Polygon(\n",
+ " shell=[ # noqa: S604\n",
+ " (1024, 1024),\n",
+ " (1024, 4096),\n",
+ " (4096, 4096),\n",
+ " (1024, 1024),\n",
+ " ],\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Time DictionaryStore\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"store.query(polygon)\",\n",
+ " globals={\"store\": cell_dict_store, \"polygon\": big_triangle},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time SQLite store\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"store.query(polygon)\",\n",
+ " globals={\"store\": cell_sqlite_store, \"polygon\": big_triangle},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vqHA50DQpT56",
+ "outputId": "7e837f4c-ada9-400f-b5f3-c59430b137f3"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs],\n",
+ " title=\"Polygon Query (5 Million Polygons)\",\n",
+ " tick_label=[\n",
+ " \"DictionaryStore\",\n",
+ " \"SQLiteStore\",\n",
+ " ],\n",
+ ")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6m-E5AwapT56"
+ },
+ "source": [
+ "### 2.2.5) Predicate Query\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "whEn34rOpT56"
+ },
+ "outputs": [],
+ "source": [
+ "# Run Time: ~10m\n",
+ "\n",
+ "# Setup\n",
+ "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n",
+ "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n",
+ "predicate = \"props['class'] == 0\"\n",
+ "\n",
+ "# Time DictionaryStore\n",
+ "dict_runs = timeit.repeat(\n",
+ " \"store.query(box, predicate)\",\n",
+ " globals={\"store\": cell_dict_store, \"box\": box, \"predicate\": predicate},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time SQLiteStore\n",
+ "sqlite_runs = timeit.repeat(\n",
+ " \"store.query(box, where=predicate)\",\n",
+ " globals={\"store\": cell_sqlite_store, \"box\": box, \"predicate\": predicate},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "np_stmt = f\"\"\"\n",
+ "polygons = [\n",
+ " polygon\n",
+ " for polygon in tqdm(cell_polygons_np)\n",
+ " if np.all([\n",
+ " np.max(polygon, 0) >= ({xmin}, {ymin}), np.min(polygon, 0) <= ({xmax}, {ymax})\n",
+ " ])\n",
+ "]\n",
+ "\"\"\"\n",
+ "\n",
+ "# Time numpy\n",
+ "numpy_runs = timeit.repeat(\n",
+ " np_stmt,\n",
+ " globals={\"cell_polygons_np\": cell_polygons_np, \"np\": np, \"tqdm\": lambda x: x},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time shapely\n",
+ "shapely_runs = timeit.repeat(\n",
+ " \"polygons = [box.intersects(ann.geometry) for ann in cell_polygons]\",\n",
+ " globals={\"box\": box, \"cell_polygons\": cell_polygons},\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")\n",
+ "\n",
+ "# Time box indexed numpy\n",
+ "numpy_index_runs = timeit.repeat(\n",
+ " \"in_box = np.all(min_max_index[:, :2] <= (xmax, ymax), 1) \"\n",
+ " \"& np.all(min_max_index[:, 2:] >= (xmin, ymin), 1)\\n\"\n",
+ " \"polygons = [p for p, w in zip(cell_polygons, in_box) if w]\",\n",
+ " globals={\n",
+ " \"min_max_index\": min_max_index,\n",
+ " \"xmin\": xmin,\n",
+ " \"ymin\": ymin,\n",
+ " \"xmax\": xmax,\n",
+ " \"ymax\": ymax,\n",
+ " \"np\": np,\n",
+ " \"cell_polygons\": cell_polygons,\n",
+ " },\n",
+ " number=1,\n",
+ " repeat=3,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "oRxJTg7BpT56",
+ "outputId": "d235e51a-5109-486e-b779-fe39e5f6ee33"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAF2CAYAAACrlXVQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABZaUlEQVR4nO3dd3hUZfrw8e+Znl5JgNBBSmApIdI7UbCtgIi7NpR3VdBVUdbG6q67LisWwFXEBqhYWFd/YEFYFKRJh4j03iGQXiaT6ef9I2aWIQkEk0zL/bkuLplzzpy5n0PMPU9XVFVVEUIIIWpB4+8AhBBCBD9JJkIIIWpNkokQQohak2QihBCi1iSZCCGEqDVJJkIIIWpNkokQQohak2QiGox77rkHRVE8f2JiYujbty9Lly71yec7HA5efvllunbtSlhYGNHR0QwePJhFixb55POFqE+STESDMnDgQLKyssjKymLTpk2kpaUxatQojhw5Uq+f63A4uO6665gxYwaTJ09m7969bNq0iWHDhnHbbbfx/PPP1+vnV7Db7T75HNEAqUI0EOPHj1eHDx/uday4uFgF1EWLFnkdu//++9XExETVaDSqPXv2VJcvX66qqqparVa1e/fu6s033+y53mKxqJ07d1bHjRtX7WfPmDFDBdRNmzZVOjd9+nRVURR127Ztqqqq6qpVq1RAPXXqlNd1Wq1Wff/99z2vz507p44fP15NTExUIyMj1X79+qlr1qzxnK+4z5IlS9T+/furRqNRff3119XIyEj1k08+8br3sWPHVEVR1FWrVlVbBiEuRWomosGy2+289957GI1G0tLSPMcnTJjA8uXL+fjjj/npp5/o378/N954I/v378doNPLZZ5+xcuVKZs+eDcAjjzyCxWLh3XffrfazPvroI4YPH07v3r0rnXv00UcJCwvjk08+qXHsZWVlDB06lJKSEpYtW8ZPP/3E9ddfzzXXXMO+ffu8rp0yZQpPPvkk+/btY/To0dx+++289957XtfMmzePdu3aMXjw4BrHIIQXf2czIXxl/PjxqlarVSMiItSIiAhVURQ1IiJC/eyzzzzXHDp0SAXUb7/91uu9PXr0UO+9917P6w8++EA1Go3qc889p+r1enXz5s2X/OywsDD1kUceqfb8b37zG/X6669XVbVmNZP3339fTUlJUR0Oh9c1Q4cOVR999FGv+yxYsMDrmu3bt6uAevDgQVVVVdXpdKrNmjVTX3755UuWQYhL0fk3lQnhW7179+bDDz8EwGw289133zF+/HhiYmIYMWIEe/fuBWDQoEFe7xs0aBAbN270vB4/fjxLly7lhRdeYPr06fTq1avWsen1+hpfu3XrVs6dO0dsbKzXcZvNRlhYmNexi2NLS0sjPT2duXPn8tJLL7Fs2TLOnz/P+PHjf3XsQkgyEQ1KWFgY7dq187zu3r07K1euZNq0aYwYMaLa96mqiqIontdms5nMzEy0Wi0HDx687Od26NCB3bt3V3nOarVy5MgRRo4cCYBGo/F8ZgWXy4Xb7fa8drvddOrUicWLF1e6X3h4uNfriIiIStdMnDiRqVOn8o9//IO5c+cyatQokpKSLlsOIaojfSaiwdPpdFgsFgA6d+4MwNq1a72uWbduneccwKRJk9Bqtfzwww98/PHH/Pvf/77kZ9x111388MMPbN68udK5f/3rX5SVlXH33XcDeH6pnz171nPNjh07vJJLeno6R48eJTo6mnbt2nn9adq06WXL/Lvf/Q6r1co777zDt99+y3333XfZ9whxSX5uZhPCZ8aPH68OHDhQzcrKUrOystTDhw+rb775pqrVatV//OMfnutuvfVWtWXLlup///tfdd++feojjzyi6vV6dd++faqqqupHH32kGo1G9aefflJVVVVfffVVNTo6Wj169Gi1n22329Xhw4erSUlJ6vz589WjR4+qe/fuVZ9//nlVp9Op06dP91zrcDjUli1b
qiNHjlT37dunrlu3Th04cKCqKIqnz6SsrEzt3Lmzmp6eri5fvlw9duyYumnTJvWf//ynunjxYlVVq+97qfDggw+qBoNBbdOmjep2u2vxZIVQVUkmosEYP368Cnj+hIWFqampqeorr7yiulwuz3VFRUWeocEGg8FraPChQ4fUqKgo9fXXX/dc73a71ZEjR6q9evVS7XZ7tZ9vs9nU6dOnq126dFGNRqMKqBqNRv36668rXbtp0yY1LS1NNZlMateuXdW1a9dWGhqcm5urTpw4UW3atKmq1+vVpk2bqqNGjVIzMzNVVb18MtmxY4cKqP/85z+v6DkKURVFVWWnRSH84ciRIwwfPpz27dvz9ddfYzKZfPr5S5cuZdSoUZw8eZLGjRv79LNF6JE+EyH8pG3btqxbt47+/ft7jRSrbxaLhf379/P3v/+d22+/XRKJqBNSMxGigXn++ef5xz/+Qa9evfjyyy9lFJeoE0GVTLZs2UJmZibFxcWMGDGCbt26+TskIYQQBEAymTNnDpmZmcTExDBjxgzP8R07dvD+++/jdrsZPnw4o0aN8pwzm8189NFHTJo0yQ8RCyGEuJjf+0yGDBnC1KlTvY653W7mzZvH1KlTmTVrFuvXr+f06dOe84sWLbrkBDMhhBC+5fdkkpqaSmRkpNexw4cP07hxY5KTk9HpdPTr14+tW7eiqioff/wx3bt3p02bNn6KWAghxMUCcjmV/Px8EhISPK8TEhI4dOgQy5YtY9euXVgsFs6dO8e1115b5ftXrFjBihUrAJg+fXrQ7eGg0+lwOp3+DsOnpMwNg5Q5eBgMhiu6PiCTSVXdOIqicP3113P99ddf9v0ZGRlkZGR4Xufm5tZpfPUtMTEx6GKuLSlzwyBlDh41WZbnQn5v5qpKQkICeXl5ntd5eXnExcVd0T22bdvGO++8U9ehCSGEqEJAJpO2bduSlZVFdnY2TqeTDRs2kJ6efkX3SE9P54EHHqinCIUQQlzI781cr732Gnv37qWkpISJEycybtw4hg0bxoQJE5g2bRput5uhQ4fSvHnzK7rvtm3b2L59uyQUIYTwAb/PM/GFC5fyDgbB2sZaG1Lmqmm1WqKjo9FoNKiqSklJCYqiEB4ejk6nQ1EU3G43FouFsrKySu9XFIWIiAiMRiMajQaHw4HZbPZ0CIeFhREeHo5Go8HpdGI2m3E4HISHh2M0GtHpyr9vXngOyvdMCQ8PR1GUSvesbZlDTbCW+Ur7TPxeM6kvUjMRoSAuLo6ftxVyPquMlOYRdOgSgcFgYuOa8+ScL8PhcBMdYyCtdyLR0XqKi4u93h8fH8/pEzb27szCWuaiRatIul2dQElJARERERTlq2xcfY6SYgdNm0WQ1jsRJcKBxaxh6485FOTbURRo3DSc9L6JlFqKiIyMJPuck3UrzmIpddKidSQ9rk7EXFoYdCMnRd0J2WSSnp5+xf0sQgSSyMhIzp2x8dOWXFQVwsJ1KEoEbrfK6ZOltOsQjdOpsiszj3NnLdx531UoSolnNGRYWBi52U5WfHualm0iadU2io1rzmMpdTL42iaUWVx8u+gwUTF62lwVzY6tuRQV2LlxbEtOHivEbnfTvlMMZ09b2LuzALvdzbCRTSnIs7Fs8UkSk0y0ahPFjm15lJqdDMpIwmaz/bLTowZFUXG5XJjN5iprTSK0hGwyESKY6XQ6DPpw1nx/lK5pCfy8/X+jGzUalVvvbI3VVkZ4eDjZ58o4d8aCpdSFRqPB5XIB5XvKnzpuBuA3aQmkNI9g/65CDu0vov+wxhzYU4jLpdKzTyPadYghP9fG8SMlFBXYuKpDNO06RGKz2ejQOZYF7xykIM+Goijs21UIQO+BSTRrEcm5sxaOHChmwLAmFOUrfPOfo5QUO9BooFnLSK65sakkkwYgZJOJNHOJQDJjxgxmzpx52esef/xxpkyZQlxcHD/+cJ74RCOdfhPrSSYajYacnBwAoqKiKMi3k3u+jKTGYURE
aii1uDz3crvdhEeU/y9+6pgZvV5DcbEdVQVzsYOiwvImqYhIPQ6Hg4io8muLihy4sWCz2YiPj2fvz4UAtOsYDUBxUfn7IqN+eV+kHiijpMjOrp/ycTlVxt3dFqfDTX6erfYPTwSFkE0m0swlAsmUKVOYMmWK5/XYsWPR6/UsXLiw0rUmk4mc83YOHyjmhjEtMJvLO7adDjcup5bIyEh0Oh0FeW6Wf32c6FgD197UjKKiIjQajWfmss1mo2OXOM6cLOXn7Xns+ikPg1GL0+FC0ShoNArwv0nCFUNxNJryjvuEhAR2bC0gc3MunbvH0blbDHa7/YL34fV+jUYhMcnEyWNmvvniOPGJJq7qGOPpxBehTf6VhQgwiqJQUuTA7VL55vMTnuPHj5SAAiNuas6RA0X8sPwsTZqGc81NzTAatWh1EWi1Ws6espZ3mqdEYjDouG5UC+x2F6jw5b+Po9UqxMYZiEswAlBcaKdpswiKC8prHLFxRsLCI1i3MosDewrp1T+J7lcnoKoqGo2G2HgDHIGiAjuxcZEUFTrQahWiYvSk921Ei9aRZJ8r4+DeIlZ/d5amzcLRarWe5jcRmiSZCBFgXC4XLVrHcvO4VgCYzQ5WLj1DsxYR9OqXhM3qYsXSMwCUmh18/Z/jAAwdkUJikp51K4+hKAq3/792lJodrF5+lrgEI+ezyijItzFweBNsNhvtO8Xw05ZcNv+Yzanj5vJO/Y4xREbp2b+7gAN7CtFqFQ4fKOLwgSJMYVpuGtuKzt3i2b0jn/Wrsti/20RejpXf9IjHYNCyYfU5XG6ViAgdqlpey9HpNahlIT8DocEL2WQifSYiWNntdlyuPHRGLRqNhpS4aH7TI57EZBNRMVpUVeE3PeIrvc9gLF/QomOXOBSlvM/EYNASl2ikpMhBfKKRPgOTSEjSk5eXR0REBGN+35rdO/IpLnbQf0hj2neOxmw2ExtvrPQZekP5XBQUG2Nub8OeHfnlI8OuaULb9lHYbDaaNo/gxNEScs5baZRsot+QZFAcuN1unzw74T8yaTEABeskp9poaGW+VJ/JxSomKKqqisViwWAwoNfrK13ncrlwOByYTCYArFYrUD5EuGLSo9Vq9RyH8hFfF05aLC0tRVVVIiIi0Ggqr7ZktVqx2+0YDAavSYulpaWeWPV6PYqioKoqNpvNayRXQ/t3huAts0xaFCLEWCwWr9dlZWWXHGp78cTBS00kdDgcFBUVVTpuNpsvGZPdbq/yvhVJRTQ8kkyEqMY3nxXW3b3++ybffjen0vGUlBSv1zdc+yA3jXyozj73ptti6+xeQlxKyCYT6TMRgeSmkQ/VaZIQItCEbDKReSZCCOE7AbmfiRBCiOAiyUQIIUStSTIRQghRayGbTGQPeCGE8B3pgA8yFTvnGQwGtFotUD7mv2KuQMX5sLAwtFotbrcbm81GcXExF85PDQsLIzIystL9LRYLVquViIgI9Hq9Z+JaSUkJNpuNuLg4z+de6OIJcxeS/SyECH0hm0xCVVhYGMbTRylZ/AnOMydRUUmeuYBipXwl18TEROzrvifvy09xnjuNNi6B+Eeew9ikhdfM54iICIpm/AXHySOeY9rYROL//jpGoxHHd19StHE1rqJ8jKndCf/D47jdbpRjB8l94x+V4kqY8nei2nbk/JN/wG3+325/hnapRD70jCeZVCQnWV5DiNAiySTIKIqCKy8XTWQ0bpsV17kzQHmNIyIiAvv6leTP+Aum3oOIvm0CroI80Fb9z+w8dwbVaiWs7xAANBFRqKqKoig4z51B36I1tiXb0CU2BsqXGtdEx2Dq0cdzj9Lvv0YtK0MTE1d+z1PH0SY0wtjtagB0jVM8y3NERkRAcSGq240mNhGbw0FBQUE9PSkhhC9JMgkyVqsV49UDiOk3FOdzD/+STMqZTCYK/7sIJSKS6LF34y4qJLzfULTxiRScO1fl/TThkegap6Br1BhTen9Kf1l7KWL8H9E7HZiXfO65
1uFwYImMRf/7+8vXXzp9DPOXnxI+eASuqFiUX5YY10THomvSDH1KC4zdemG12Qg3F3F+8p24srMAUIwmmrz/DRqNRmopQoQASSZBxuVykZ+fT0JCQqVzer0e+8E9qA4HOc8+BPzS9PXXWYS1al/lukmugjyKP30Pd3Eh+rYdafTye5jNdmw2G8nRUZWur7hHQkICJYs+BiDqlrsoMZuJi/uldnL6BEUL5qCWmjH1GkjiczMo2bQaV3YWSS/PRdeiNfb9u1AMRlSLtdJnCCGCjySTEKKqKooxDNVmI3nWAtDpOXf/GEq+WEDsX2Z6mrAcDgcajYbEZ19Fm9AI1eUif/rTlG1cjf2nzRg6dMVmq367Vb1ej7aoAMva7zF264XSoi22nBw0Gg3Jb3yKEpeA4nKS/eR9WLesw3HiCMbOaShGI9lP/gFtUhNMPXpj7NzDa89yIUTwCtmhwaFKq9WSkJBQaQnyxMRENBoN+lbtANDExqONLd+PQlXd6PV6TMf2o1mzjPgwE1rV7UksqqKgiS6vVaguJ2FhYSQmJnrd32AwkJCQgEajISIigpKvPgW3i6gxd2I2m8tHjpUUozGacDqdqFodmshfajZOB4b2qTT5cCmJf/sXpvR+lC7/Esua5RiNxnp+YkIIXwjZmkmoLvRoNBpxbVlH/ifvePofzj96J8au6cQ/9AzR4+4lZ9d2cl+YUr7NHRB53RgASlctw7LyW8J6DUQJi+DsvTdi7NQNVBXbnp/QJqdg7N4bt05HyVsvYdudCYBtdybnH7yNmLsmEdH1akxuF/nLv0Tfqh367r0oyM5Gr9fjyDpFztMPYOjUFbe5BMfhfRg6dEHfuj2l/12E5ceV6Ju1wnnyGAC65KbYpb9EiJAgm2MFoEttpmMymYgszMWaudHruK5JM5S0fiiKgubMCSyr/wtaLWG9B0GbDjidTjR7fsJx8giRI8eAKQzr1h+x79uJaitDl9KS8KHXU+JyYzQaUbf9iPO893MzpffHlZyCLuccZVvWYkztjq1Za8xmMxqNhkYx0Vg3r8V+eB+4XOhbtiVs8AisbhVjcQGWdStw5pxDYzRh7H412q5Xe8oZiBsI1eUS9P7izyXoZ8yYwcyZMy973eOPP86UKVN8EJF/BOLPdk1c6eZYkkwC0OV++Cp2xruQqqqUlZXhdpcng4rmI7vdjtVqRaPREBYW5mnastvtmEwmz654LpeLsrIyXC4XWq0Wk8mE8svclQoulwur1erZuc/tdntt3KTRaDCZTOh0Os/1FTEZDAbPREtVVXE4HAG/A58kk7p1JbtLhpJA/NmuCdlp0Qdc9/22zu5168YDbC649K52AL3jIvm8bwcASi5zreWXPxdyAY4LXmvf+7raHftcLtcld8y7eOe/ChcnlwtVtzOfECI0SDLxs4oEIYQQwUxGcwkhhKg1SSZCCCFqTZKJEEKIWpNkIoQQotaCqgP+/PnzLFq0CIvFEtLj0oUQItj4vWYyZ84c/vCHP1RKDjt27ODRRx/l4Ycf5ssvvwQgOTmZSZMm+SFKIYQQl+L3ZDJkyBCmTp3qdcztdjNv3jymTp3KrFmzWL9+PadPn/ZThEIIIS7H78kkNTW10vaxhw8fpnHjxiQnJ6PT6ejXrx9bt271U4RCCCEuJyD7TC7eryMhIYFDhw5RUlLCwoULOX78OIsXL2b06NFVvn/FihWsWLECgOnTp1daAbe2ztfp3fyjrp9Jbel0uoCLCQr9HUCtBdIzrVi6J5Bi8oXA/NmuewGZTKpaLkxRFKKiorj//vsv+/6MjAwyMjI8r4NxXZz6FmjPJFjXLwp0gfRMHQ4Her2+2pg0Gk2lrRWgfCmei38nKIqCXq/3rAPncDhwOBxe11x4P5fLhdPp9Ly3Yu05u93uOQ7lWzwYDAYAbDZbnewCGqw/21e6Npffm7mqkpCQQF5enud1Xl6eZxe/mtq2bRvvvPNOXYcmhKgn8fHx5Ofnk52d7flTUlJC
bGys13U6nY6EhATMZjMHDx7k0KFDqKpKQkKCZ3FSRVFISEggNzeX7OxsIiMj0Wq1hIWFkZCQwJkzZzhw4AAGg4G4uDgURSE6Oprw8HBOnjzJ8ePHiYmJITo62g9PIjgFZM2kbdu2ZGVlkZ2dTXx8PBs2bOCRRx65onukp6eTnp5eTxEKIeqaVqvliy++oKioyHOsefPm3HfffV7X6fV6du7cyRdffEF0dDRmsxm3282YMWNITU2luLiY6OhoMjMzWbx4MQCTJk0iOjoak8nEG2+8QW5uLkajka+++orf//73dOrUiXPnzvHee+95aiZfffUVDzzwACZT+YZvJpPJs+q10+mkrKysylaUhsrvyeS1115j7969lJSUMHHiRMaNG8ewYcOYMGEC06ZNw+12M3ToUJo3b35F9w3VzbGECHXdunXjpptuAsqbqi5uvoLyJpjHH3+cxMRETp06xVtvvcWWLVvo3r07RqMRm83G0qVLadKkCVlZ5ZvIhYeHs2HDBnJycrj11lvp0qULL7/8MkuXLqVz586sWrUKm83G5MmTURSF6dOn8/3333P33XdjNpvZvn07+fn5GI1GWrduTatWrSgoKPDpswlkfk8mkydPrvJ4WloaaWlpv/q+UjMRIjgdOHCA48ePEx8fT0ZGBo0bN/Y6b7VaiYqKwmazUVJS4uk3qWiSio6O5oMPPqBr164YjUZPMtHr9Zw8eRKAVq1aodVqadq0KYcOHcJsNnu2Y9DpdJ7mslOnTgGwaNEijh49Svfu3SkqKiIzM5M2bdrU/8MIIgHZZyKEaHhUVSU1NZVrr72WXr16cfr0aT788ENUVUWr1XpdV1JSgtFoxG638/HHHxMTE8NNN92ERqNh+/bt5OXlccMNN3jdX6PRePbb0ev1uN1uTyKyWCyeL6/vvvuup7+1Yl8fi8WCTqcjNjaWnj17Mm7cOK+YRADUTOqLNHMJEVx0Oh0333wzZWVlGAwG8vLyyMzMJCsri9atW3tqC2azGb1eT05ODh9++CGRkZHce++9REZGoigK27dvR6fT8e9//5vs7GwAli5dys0330xMTAxQnhyio6M9tZGYmBgaN25MXFwcR48eJTo6mu+++46oqCgAbr75ZlasWMGmTZsoLCykUaNGPProo354SoErZGsm6enpkkiECBI6nY7c3Fy+//57cnNzOXbsGEeOHPGMytLpdMyYMYMFCxYQExNDTk4O7733Hg6Hg/T0dA4fPsyePXvQarW0a9eOJk2aoNfrPdtba7VaFEWhc+fOAGzatImDBw9y+vRp2rRpQ1hYGLm5uWi1WtLS0igsLKS0tJQePXoAUFRUxA033MBjjz1GamoqOTk52O32SttnN2RSMxFCBAStVsv27dtZuXIlAGFhYYwZM4bw8HBUVaW4uJiIiAgAzp07h8vlAmDZsmUAREVF0b17dwYNGuR5/7fffsuPP/7IiBEjSExMJDExkX79+rF582Y2bdpESkoKo0aNoqysjLy8PD744ANUVcVgMDBgwAD69u2L0+lk27Zt7N69GygfdtynTx9PU5kop6gNYGzb2bNn6/R+dbkHvL9o3/va3yF4CcSJXd98VujvEGrtptti/R2Cx9ixY9Hr9SxcuLDK8xVDd202Gy6Xi4iICKxWK8XFxcTGxmIymYDyDniDwVBlrcBisXiGFhuNRs8cEpfLRU5ODlqt1tNRb7PZCA8Pp7S0FKvVSnx8PFDejBYZGYnL5aK4uBiDwUB0dDQOh4OysjIiIiJQVZWioiKvCY/VCcSf7Zq40kmLIVszEUIEl+LiYoqLiz1Jwmw2e879miG4NpuNc+fOeR1zOp3k5+ejKAqKong62KF8tQBFUdBoNOTl5XnmkJSVlVFWVuY5Z7VaZX5JFSSZCCE8Xn/99Tq7148//sj69esrHU9JSfF63b9/fwYMGFAnn1nTyc2qqlaZEFRV9TSfXck5EcLJRPpMhPCvAQMG1FmSEIEvZJOJTFoUQgjf
kXFtQgghak2SiRBCiFoL2WQiS9ALIYTvSJ+JEEKIWgvZmokQQgjfkWQihBCi1iSZCCGEqLWQTSbSAS+EEL4jHfBCCCFqLWRrJkIIIXxHkokQQohak2QihBCi1iSZCCGEqDVJJkIIIWpNkokQQohaC9lkIvNMhBDCd2SeiRBCiFoL2ZqJEEII35FkIoQQotYkmQghhKg1SSZCCCFqTZKJEEKIWpNkIoQQotYkmQghhKi1oJpnYrVamTt3Ljqdjs6dOzNw4EB/hySEEIIASCZz5swhMzOTmJgYZsyY4Tm+Y8cO3n//fdxuN8OHD2fUqFFs2bKFPn36kJ6ezqxZsySZCCFEgPB7M9eQIUOYOnWq1zG32828efOYOnUqs2bNYv369Zw+fZq8vDwSExMB0Gj8HroQQohfXLJmUlxczNq1a8nMzOTEiRNYLBbCw8Np2bIl3bt3Z8iQIURHR9cqgNTUVLKzs72OHT58mMaNG5OcnAxAv3792Lp1KwkJCeTl5dGqVStUVa3V5wohhKg71SaTTz/9lHXr1tGjRw+GDRtGSkoKYWFhlJWVcebMGfbu3ctTTz3FgAEDuOOOO+o0qPz8fBISEjyvExISOHToENdddx3z588nMzOTnj17Vvv+FStWsGLFCgCmT5/uqc3UlfN1ejf/qOtnUls6nS7gYoJCfwdQa4H3TOtXIJY3MH+26161ySQuLo7XX38dvV5f6Vzr1q0ZMGAAdrudH374oc6DqqrWoSgKJpOJBx988LLvz8jIICMjw/M6Nze3TuMLBYH2TBITEwMuplDQ0J5pIJY3WH+2mzZtekXXV9vxcN1111WZSC5kMBgYOXLkFX1gTVQ0Z1XIy8sjLi7uiu4hS9ALIYTv1KgXe/fu3Z5+jYKCAmbPns2cOXMoLCysl6Datm1LVlYW2dnZOJ1ONmzYcMXLyaenp/PAAw/US3xCCFGVGTNmkJKS4vXHaDRWOnbhyNVQUaNkMm/ePM/oqQULFuByuVAUpU6++b/22ms8++yznD17lokTJ/LDDz+g1WqZMGEC06ZN47HHHqNv3740b978iu4rNRMhhK9NmTKFM2fOeP707duXQYMGeR07c+YMU6ZM8Xeoda5G80zy8/NJTEzE5XLx888/M2fOHHQ6XZ188588eXKVx9PS0khLS/vV95XNsYQQwndqlEzCwsIoLCzk1KlTNGvWDJPJhNPpxOl01nd8QgghgkCNksnIkSN55plncDqd3HPPPQDs37+flJSU+oytVrZt28b27dul30QIIXygRslk1KhR9OrVC41GQ+PGjQGIj49n4sSJ9RpcbUgzlxBC+E6N1+a6eMzxlY5BFkIIEbqqHc31zDPPsHHjxmr7RSqG7F68rlagkNFcQgjhO9XWTB566CE+++wz5s6dS+vWrWnatCkmkwmr1UpWVhZHjx6lS5cuNZqR7g/SzCWEEL5TbTJp1qwZU6ZMobCwkJ07d3Ly5ElKSkqIiIhg0KBB/PGPfyQmJsaXsfrcZ599VulYhw4d+A3gcKt8eTav0vnU6HA6R4dT5nKzJCu/0vmuMRF0iAqj2OFi+fmCSufTYiNpG2ki3+5kZXZhpfO94qNoGW4k2+ZgTU5RpfP9E6JpGmbgbJmd9XnFlc4PbhRDklHPiRMn2LRpU6Xz11xzDfHx8Rw+fJjt27dXOn/dddcRHR3N/v37+fnnnyud/+1vf0tYWBi7d+9mz549lc6PGTMGvV7Pjh07OHDggOd4VFQUJSUl3HbbbQBs3bqVo0ePer1Xp9Nxyy23ALBx40ZOnjzpdd5kMnHzzTcDsHbtWrKysrzOR0VFcf311wOwatWqSguMxsXFce211wLw3XffsWHLGa/zMVGJdO5Uvu3BTzu/p8xq9n5/bGM6te8LwLaflmF3WL3OJ8Y3o327qwHYvO0bXG7vWn9yo1a0bd0DgA1bFnOxpo3b0arFb3C5HGzevqTS+eYpHWme0gm7vYxtO/4LgIVwz/lu3brRsWNH
iouLWbZsWaX3V6x3Z7FYOHLkSOX7N29ObGwsZrOZY8eOVTrfsmVLoqOjKS4u5sSJE5XOt27dmsjISM/I0Iu1bduW8PBw8vLyOHv2bKXz7du3x2g0kpOTw7lz5yqd79ixI8AV/+xVqI+fvezsbLRaLUuXLr2in72CAu/fDUlJSQwdOhSApUuXUlJS4nW+SZMmDBo0CICvvvoKq9XqKY+vXLbPJDY21hNkMJHRXEKIYBEZGYnBYLjsdAuNRkNSUhImk4nCwkIcDgdQnsy0Wq3nXnq9Ho1Gg9vt9nqvVqtFVVXP5yiKUmk7D7fb/atWZVfUBrCWe1XfcmrDdd9v6/R+/qB972t/h+AlEBfD++azQn+HUGs33RZ7Rde//vrr9ROIjzzyyCP+DsHL2LFj0ev1LFy4sMrzWq2W2NhYDJSBNRs1PAWrQ6GoqMjrF7perycuLg6t2wLWbNAYcJuSKbXYUBSFSKMKbof3zRUtFqeekpISYmJiMOpcKGXnQGsqf29pGZHhehSnxettqi6CnLwizxYgNeX3nRaFEKKhiouLQ3vmS1yn/wuqC7QmTO3uQo3uQVHR/5qxTSYTmqzvcB27oOndEEdUl8dRIlvg2vMaat5P3jePbInhN8+REB+P5uwS3KeXgdsOgNL8BiJa3AI563EdnOf1Nk2Xx9Hpml1xWSSZCCGEHxiNRvS2M7hOfYuSmI6m7R24dr2K+/DHhPXqTslFzVSYGqHp8jhKVDvUsytwn1iE+/RStB0nomk5GrXpNQCohXtRTy1Bie2ETq/HnbcD98mvUBoPQtPm9+VJy16ES1VRfrm1psMDYCjvA1ciW6CzuHE7bVC4G+yFoItAiW6LYmpUbXlCdu9bGRoshAhker0etWA3AEpiOhjiUOJ/A64yKDnqtQVIWVkZ9qjfUKS0wGx1Q/RV5ScUPWVlZeRZIyhwN4aYTlB8GBQtmqbX4nQ6UXM2l19riMd9dCFq9kYIS8Jut3vur57/EfXcarDlgS6CyMhI1L3/wr3/bdw5m3EfX4T7xJeXLE+NaiaqqrJy5UrWr19PSUkJr776Knv37qWwsJB+/frV/On5kAwNFkIEMo1Gg2r/pSlLG4bD4UCnDUMFsBehMf5vpXSn00l+fj4RERFE6u249i8AfRSaFr/FYrFgt9sxGo0opSdwF+1HSeqHXYkElwuttbwvUs3ZiBLRHPeRT1CKjxDeaRJusx4lvjsYE1ALdqHmbEHjLEXb7DqcpSchojmaFqNQIluAcum6R41qJp999hmrVq0iIyPD00makJDAV199dcUPUAghRPmoKUUfVf7CVYZOp0N12cpfG6KJiIggPj6euLg4DAYD0dHRRGuLcO14AdwOtN2mYiUSm638PZGRkeX9IoCm2XWYzebyzzBElx9rewfa1IchLBk1bzuqqqJJ6ouS+ij2Zrei7fJ4eVy5meXXtxoLtgLcu17CtfEh3McXXbI8NUoma9as4amnnqJ///4oSnkrW1JSUqWx0kIIIWrG4XCgxHUGQM3fiaLaUQv3gMaIEtUGvWJHf2g2xpwVJCYmEuE6i+vnf4LqRNPuDnBZMakFxMbGotfrMbiLUXO2oMR1wWlsgs1mw+l0osR0KP9AezGqyw7OMjDEoigK7txtKLYcTCYjatFBAE/yUWLao+09E23f2RDZCvX8ukuWp0bNXG63G5PJ5HXMarVWOiaEEKJmbDYbzuiWaJoMRc1ahSt7Q3lfR9vbsbsNGChDLdgFujAA3Pk7yvtTXODe86/ym0S3Q5f6VHmt5Mz/ASpKs+sxm8sn1JaWlhKeNAhytuI+OB+OfgpuF5r2EwBQc7fj3vsGoAAqGOLRtBwDgGvnK+C2gdYEtjyUpEt3adQomfTo0YMFCxYwfvz48gBUlc8++8wzY1YIIcSVKygoILbl7ehTRqKWnUOJaoXNbaKwoIDEhDi0PaeBNgyn04k25Vq4+Be6xoDN4UCr1aJJyYAmQ3AZkinLyQHKKwKFxaXE
dH4SrfUUqrMMJbIFFrsGW34+MVf9P7TNr0e15qHoI3GHN6fEYkNbWkp4zxdQS0+By4ZijEcNv/Rw4Rolk7vvvpvZs2dzzz334HQ6ufvuu+natSt//OMff90T9AGZAS+ECHROp5Pc3NxfZqw3x1lgw+Uqn0SYk5uPVmtAVZ24XOXLsiiKoYp7lHfi63Q6wICr2Hvyr81mIzsnB4MhGkWJwZFX6hlybLVa0etNaDTNcdvcOEvzPZMlI5o2RYnt5LmPwqXVKJmEh4fz5JNPUlhYSG5uLomJicTGxtbkrX4jo7mEEDWRdPiZOrvX39/fwQsf7Kx0/OKNBJ+7pyt/ubd7nXxmdrsXAS67FMuFQ4EvVLEkS21d0aRFg8FAfHw8breb/PzyRQzj4+PrJBAhhAh2f7m3e50liWBTo2Syc+dO3n33XXJ+aYe7UFUr6wohhGhYapRM3n77bW655Rb69++PwVC5zU4IIUTDVqNk4nA4GDp0aKWlioUQQgio4aTFG264ga+++upXrXEvhBAi9NWoZtK7d2+mTZvGl19+SVRUlNe52bNn10tgtSVDg4UQwndqlExmzpxJx44d6du3b9D0mcjQYCGE8J0aJZPs7Gxeeukl6TMRQghRpRplh/T0dHbv3l3fsQghhAhSNR7N9fLLL9OpUydiYmK8zgXykipCCCF8o0bJpHnz5jRv3vzyFwohhGiQapRMbr311vqOQwghRBCrNpns3buX1NRUgEv2l3Tp0qXuoxJCCBFUqk0m8+bNY8aMGQC89dZbVV6jKErAzjMRQgjhO9UmkxkzZvDjjz8yYMAA3nzzTV/GVK3z58+zaNEiLBYLU6ZM8Xc4QgghfnHJocHvvfdenX3QnDlz+MMf/lApCezYsYNHH32Uhx9+mC+//PKS90hOTmbSpEl1FpMQQoi6cckO+Lpci2vIkCGMHDnSq5bjdruZN28ezz77LAkJCTzzzDOkp6fjdrv59NNPvd4/adKkSsOShRBCBIZLJhO3233ZyYo17YBPTU0lOzvb69jhw4dp3LgxycnJAPTr14+tW7cyevRonn766RrdVwghhP9dMpk4HA7efvvtamsote2Az8/PJyEhwfM6ISGBQ4cOVXt9SUkJCxcu5Pjx4yxevJjRo0dXed2KFStYsWIFANOnTycxMfFXx1iV83V6N/+o62dSWzqdLuBigkJ/B1BrgfdM69evKu/huo/DlwLl3/iSycRkMtXraK2qkpSiVL9tfVRUFPfff/9l75uRkUFGRobndW5u7q8LMIQF2jNJTEwMuJhCQUN7pr+mvEn1EIcv1de/cdOmTa/oer+u3JiQkEBeXp7ndV5eHnFxcXVy723btvHOO+/Uyb2EEEJc2iWTSX1vhtW2bVuysrLIzs7G6XSyYcOGOls2Pj09XfYyEUIIH7lkM9eCBQvq7INee+019u7dS0lJCRMnTmTcuHEMGzaMCRMmMG3aNNxuN0OHDq2zNcBkcywhhPCdGq3NVRcmT55c5fG0tDTS0tLq/PNkcywhhPAd2e1KCCFErYVsMpEOeCGE8B2fNXP5mjRzCSGE74RszUQIIYTvhGwykWYuIYTwHWnmEkIIUWshWzMRQgjhOyGbTKSZSwghfEeauYQQQtRayNZMhBBC+I4kEyGEELUWsslE+kyEEMJ3pM9ECCFErYVszUQIIYTvSDIRQghRa5JMhBBC1JokEyGEELUWsslERnMJIYTvyGguIYQQtRayNRMhhBC+I8lECCFErUkyEUIIUWuSTIQQQtSaJBMhhBC1FrLJRIYGCyGE78jQYCGEELUWsjUTIYQQviPJRAghRK1JMhFCCFFrkkyEEELUmiQT4XMzZswgJSXF64/RaKx0bMaMGf4OVQhRQyE7mksErilTpjBlyhTP67Fjx6LX61m4cKEfoxJC1IbUTIQQQtSaJBMhhBC1FlTNXFu2bCEzM5Pi4mJGjBhBt27d/B2SEEIIfJhM5syZQ2ZmJjExMV4dqzt27OD9
99/H7XYzfPhwRo0aVe09evXqRa9evTCbzXz00UeSTIQQIkD4LJkMGTKEkSNH8uabb3qOud1u5s2bx7PPPktCQgLPPPMM6enpuN1uPv30U6/3T5o0iZiYGAAWLVrEiBEjfBW6EEKIy/BZMklNTSU7O9vr2OHDh2ncuDHJyckA9OvXj61btzJ69GiefvrpSvdQVZVPPvmE7t2706ZNm2o/a8WKFaxYsQKA6dOnk5iYWIclgfN1ejf/qOtnUht6vR5FUQIqpnKF/g6g1gLvmdavX1Xew3Ufhy8Fyr+xX/tM8vPzSUhI8LxOSEjg0KFD1V6/bNkydu3ahcVi4dy5c1x77bVVXpeRkUFGRobndW5ubt0FHSIC6Zk4HA70en1AxRQqGtoz/TXlTaqHOHypvv6NmzZtekXX+zWZqKpa6ZiiKNVef/3113P99dfX6N7btm1j+/btPPDAA786PiGEEDXj12SSkJBAXl6e53VeXh5xcXF1cm9Zgl4IIXzHr/NM2rZtS1ZWFtnZ2TidTjZs2CAJQAghgpDPaiavvfYae/fupaSkhIkTJzJu3DiGDRvGhAkTmDZtGm63m6FDh9K8efM6+Txp5hJCCN/xWTKZPHlylcfT0tJIS0ur88+TZi4hhPCdkF1ORfaAF0II3wmq5VSuhNRMhBDCd0K2ZiKEEMJ3QjaZSDOXEEL4jjRzCSGEqLWQrZkIIYTwnZBNJtLMJYQQviPNXEIIIWotZGsmQgghfEeSiRBCiFqTZCKEEKLWQjaZSAe8EEL4jnTACyGEqLWQrZkIIYTwHUkmQgghak2SiRBCiFoL2WQiHfBCCOE70gEvgoLRaCQyMhK9Xg9AWVkZJSUluN3uSteGh4cTGRmJRqPB7XZjsVgwm80AREREEBER4TlXWlpKaWkpBoOB8PBw9Ho9iqKgqiopzR2cOVUKQEqLCLr1TKBxSjiKAkUFdn7ensehfUW+ewhCBLCQTSYidBiNRpz6CJ5bdpB1R3KJC9dz59UtGds1mdzcXK9ro6KiOFrk4rUlO9h3roQ2iRE8PLgtXZNiUFWV/fkO/vXVTxzKNnNVUiSPDmlHx/hoTCYTb6w7xtYTBZhtTkamJjP4qjjOnCqlXcdoOvdPZM66I6z5MheXqpLeIo5pI1M5tK8InU4hMkqPqoK5xIHLpfrpSQnhPyHbzCVCR1RUFC+vPMjaw7k8Mbw9nZvEMGvVIX7OMhMWFua5TqPRoDWE8dTXu8kx2/jbDZ1wut08/dVunBoDisHEU1/tosTq4O83pGK2OXn6q12gN6LVarE6XHRNiSGr2EpxmfOXe0KfwclM/r+fWXs4l8lD2zFjdFf6tIrH5VTpNziZ39/Xjt6/bUK/UU25a1J7ho5o6q9HJYTfSDIRAU1RFGxuhbWHc0ltEsWY7ilM6NMSgG/3nMNoNHquNRgMbD9dSF6pnRGdkhnRqTGjujbF4nCx5nAO64/mUWx1cmOXJlzbKZkbuzSmyOpkw7F87HY7jw9uzeiu3omgSbMIjhRYOJJbyo1dmhCm1+JwuRnbPYWoaD1qso4b393IrfM2c8vcTQx9Yy3tOsX49BkJEQikmUsENK1Wy7liGwAJ4Qbsdjtx4QYAzhVb0Wq1XtdmFVnKr43wvjaryIpJX35t/EXnzhVZsdlsWK1W4H81HYDYOAP7z5cA8FnmadbFmDhbZKVLk2je/X0aqw7lUGJz8vmE3jSONvHzmSKU+nscQgQsqZmIgKaqKuGG8iTgcKtoNBqcrvJO93CDDo1GQ0REhKdz3nOtq/xaxwXXVpxzuiruo/5yTltlRz6Aw+Em0lj+voFtE/nq/n6M7Z7C7qxitp0soG/reEx6DbfO38zN727ku/3nsbvchIVrq7yfEKFKaiYioLlcLpIiDcSE6TmSawaNhoPZ5SOz2ieVJ5DNWVbOFJZx59Ut6JBUnhQOZpeg0+m8rg37pWZSfq4ZB7NLfjkXRViYiaioKApyLZ7Pbt8pls7d4j3XVSSV
SGP5/zZajUJa81iWTRrAvnPFLN93nq92ZdG/Tfmor2OHSnzwhIQIDCGbTLZt28b27dt54IEH/B2KqCW7zcp9/Vrx6spD3P7BFrJLbMSF67m1ewpOp5Mlu7PYeCyf36U1p3W8ieHtG/Hd/mxOFW7l4Hkzac1j6dksGlVV6dc6ga93ZXEwx8z+cyX0a51AanIEAA9+/jPH88qTydK95/jxaC7/vKkLXVNiGHpVI77fn43V4Wbz8Xw6JEXSPSWW9zeeYPOJfJrFhnHgfAlaRaFZbBj7LZZLFUmIkBOyyUTmmYSOkpISRnVOomNydPnQ4DA916cmo3XZsNt1jO3ejAFtEtEqKoWFhfzt+k5kdExm37lifpfWjGFXNaKoqAhVVXnl5s6sOpzLwWwzd6Q3Z2i7RAoLC4mOjmZU16aUWJ1en9042sSa78/yl4wObOiYxIHzJfRrcxUZVzXi7IlSRqYmE2HUkWO2MaxDEv1aJ2A0Q9YZSSaiYQnZZCJCh9vtJi8vj+ZhJsb3SEJVVcrMhbhcLjQaDV0TTXRNjKSosACHw0F+bg7pyWH0SWmM0+mkIC8HVS3vH8nLzaF3kzD6N4/E6XSSl1t+rqCggL5NTV6fuzuzjMwfznPiqJkjB4tp1yGagTGxmHMdfLn5GKVmJ41TwmmdbKJzZDgOh4sTmwo5fcLsj8ckhF9JMhE1cvMn++vsXme++5CsFQsqHU9JSfF63STjblKuHV9nn/vVHR1RVRVLFU1QLper0vE9Pxd6/u6wu9m3q5CLnTtj4ZzUQoSQZCJ8L+Xa8XWaJIQQ/idDg4UQQtSaJBMhhBC1JslECCFErUkyEUIIUWtB1QF/+vRpli5dSklJCb/5zW+49tpr/R2SEEIIfJhM5syZQ2ZmJjExMcyYMcNzfMeOHbz//vu43W6GDx/OqFGjqr1Hs2bNuP/++3G73bKLohBCBBCfJZMhQ4YwcuRI3nzzTc8xt9vNvHnzePbZZ0lISOCZZ54hPT0dt9vNp59+6vX+SZMmERMTw7Zt2/jyyy8ZOXKkr0IXQghxGT5LJqmpqWRnZ3sdO3z4MI0bNyY5ORmAfv36sXXrVkaPHs3TTz9d5X0qlkl58cUXGTBgQL3HLYQQ4vL82meSn59PQkKC53VCQgKHDh2q9vo9e/awefNmnE4nPXr0qPa6FStWsGLFCgCmT59O06Z1vPPdt9vq9n5BYOsTDW/3wAcea3hlnj59ur9D8L2mH/o7gloJlJ9Sv47mqlgv6UKKUv3WQp07d2bChAncf//9l2zmysjIYPr06UH7P0Z1tbJQJmVuGKTMocuvySQhIYG8vDzP67y8POLi4vwYkRBCiF/Dr8mkbdu2ZGVlkZ2djdPpZMOGDbJsvBBCBCGf9Zm89tpr7N27l5KSEiZOnMi4ceMYNmwYEyZMYNq0abjdboYOHUrz5s19FVLAysjI8HcIPidlbhikzKFLUavquBBCCCGugCynIoQQotYkmQghhKg1SSZCCCFqLagWehQi1NjtdjIzM9m3bx8FBQUYDAaaN29OWlpayA5GkTKHZpmlAz5AnD17lrlz51JUVMSMGTM4ceIE27Zt45ZbbvF3aPWmIZb5Qv/5z3/Yvn07nTt3pk2bNkRHR+NwOMjKymL37t04HA7uvvtuWrZs6e9Q64yUOYTLrIqA8Je//EU9dOiQ+sQTT3iOPf74436MqP41xDJfaPv27Zc8X1hYqB4+fNhH0fiGlLmyUCmzNHMFCLvdTrt27byOaTSh3aXVEMt8obS0NK/XFosFRVEICwsDICYmhpiYGH+EVm8acpntdjsGg8HrXHFxcciUWZJJgIiKiuLcuXOetck2bdoU8kvLNMQyV+Xw4cO89dZbWK1WVFUlIiKCiRMn0rZtW3+HVm8aYpmfeeYZHnjgAdq3bw+U/7wvXLiQf/3rX36OrG5In0mAOH/+PO+++y4HDhwgIiKCpKQkHnnkERo1auTv0OpNQyxzVf70pz/x//7f/6NT
p04A7N+/n7lz5/Lqq6/6ObL60xDLfPLkSd566y1SU1MpKCjwrAZy4crpwUxqJgHA7Xbz3Xff8dxzz3m+qVVU+0NVQyxzdcLCwjy/VAE6duwY8s+iIZa5RYsWjB49mtmzZxMWFsbf/va3kEkkIMkkIGg0Go4ePQqAyWTyczS+0RDLXJ22bdvy7rvv0r9/fxRFYcOGDaSmpnqeT5s2bfwcYd1riGV+6623OH/+PK+++ipnz57lpZdeYsSIESGza6w0cwWIBQsWkJWVRd++fTEajZ7jvXv39mNU9ashlrkqf/vb3y55/q9//auPIvGdhljmJUuWcMMNN3j6CC0WCx9++CGTJk3yc2R1Q5JJgJgzZ06Vxx988EEfR+I7DbHMomHLyckhKyuLrl27YrfbcblcIdO8J8lECD+zWCx8/vnn7Nu3D4DU1FTGjh1LeHi4nyOrPw2xzCtWrGDlypWYzWbeeOMNsrKyeO+99/jLX/7i79DqhPSZBIi8vDzmz5/PgQMHUBSFDh06cO+994ZUB93FGmKZqzJnzhxatGjBY489BsDatWuZM2cOf/rTn/wcWf1piGVevnw5L774IlOnTgWgSZMmFBUV+TmqutNwZogFuDlz5pCens4777zD22+/TXp6erXNQKGiIZa5KufPn2fcuHEkJyeTnJzMrbfeyvnz5/0dVr1qiGXW6/XodP/7/u5yuTz9J6FAkkmAKC4uZujQoWi1WrRaLUOGDKG4uNjfYdWrhljmqhgMBvbv3+95vX///kozpUNNQyxzamoqixYtwm63s3PnTmbOnEnPnj39HVadkWauABEdHc3atWsZMGAAAD/++CNRUVF+jqp+NcQyV+W+++7jzTffxGKxoKoqkZGRPPTQQ/4Oq141xDLffvvt/PDDD7Ro0YLvv/+eHj16MHz4cH+HVWekAz5A5ObmMm/ePA4ePIiiKLRv354JEyaQmJjo79DqTUMs86VYLBaAoO2E/u9//8uAAQOIjIys8XuCvczifySZBIj9+/fTsWPHyx4LJQ2xzFUJlZFN//73v1m/fj2tW7dm2LBhdOvWrdo+gVApc01MmTLlkn0jobKEjCSTAPHUU0/x0ksvXfZYKAnlMrvd7hqvgPzqq6/SokULBg8eDJSPbDpx4kRQjmxSVZWff/6Z1atXc+TIEfr27cuwYcNo3Lix13WhVObLycnJAcpHcwEMGjQIgHXr1mE0Ghk7dqzfYqtL0mfiZwcPHuTAgQMUFxezZMkSz3GLxYLb7fZjZPWnIZT54Ycfpk+fPgwdOpRmzZpd8trz5897/RK99dZbeeKJJ+o7xHqhKAqxsbHExsai1WopLS1l5syZdO3alTvvvNNzXSiV+XIqFi49cOAAL7zwguf4HXfcwXPPPSfJRNQNp9OJ1WrF5XJRVlbmOR4eHs7jjz/ux8jqT0Mo86uvvsr69et5++23UVWVoUOH0q9fvyqbcSpGNlU07wXryKalS5eyZs0aoqOjGTZsGHfeeSc6nQ63282jjz7qlUxCpcxXwmq1epX5wIEDWK1WP0dVd6SZK0Dk5OR4vsGYzWYiIiJCagx6VRpKmffu3cu//vUvLBYLvXv3ZuzYsV7NPsePH/eMbAKIiIjgoYceCrptXD/77DOGDRtW5RYCp0+f9qqhhUqZr8TRo0d56623vAYdTJo0KWQWtZRk4mdffPEFffv2JSUlBYfDwT//+U+OHz+OVqvlkUceoWvXrv4Osc41hDK73W4yMzNZtWoVOTk5DBo0iAEDBrB///5qN0S68JfMt99+yw033ODrsH8Vs9l8yfOXGt0VrGWujVAdwSbNXH62YcMGbrnlFgDWrFmDqqrMmzePs2fP8uabb4bEL9aLNYQyP/LII3Tu3Jnf/va3dOjQwXO8T58+7N27t8r3XPjLpWKF2WDw1FNPeWqUF383VRSF2bNnV/veYC3zr+FwONi8eTPZ2dlefYPSZyLqhE6n8/yPuGPHDvr3749Go6FZs2Yh0xl9
sYZQ5ueff77a+TITJkzwcTT168033/R3CEHh5ZdfJjw8nDZt2qDX6/0dTp2TZOJner2ekydPEhsby549e7j77rs952w2mx8jqz+hXOZt27bx1ltveRLmY4895lUzCWWqqrJu3Tqys7MZO3Ysubm5FBYW0q5dO3+HFhDy8/P585//7O8w6o0kEz+75557mDlzJsXFxdxwww0kJSUBkJmZSatWrfwbXD0J5TL/+9//5u9//zspKSkcOnSIjz/+uNqNoO6+++4qBxyoqordbq/vUOvc3LlzURSFPXv2MHbsWEwmE/PmzePFF1/0XBNqZb4S7du35+TJk7Ro0cLfodQLSSZ+dtVVV/Haa6/hcrnQarWe42lpaaSlpfkxsvpTUeaLhUKZtVotKSkpQHk5LzX0c8GCBb4KyycOHz7MSy+9xJNPPgmUd7w7nU6va0KtzFdi//79rF69mqSkJPR6PaqqoihKyMyAl2QSIB555JEaT3ILFYWFhSxcuJCCggKmTp3K6dOnOXjwIMOGDfN3aL9aUVGR10TMi1/feOON/gjLJ7RaLW6321PzKC4uDsmh3r9WxT4moUqSSYC4kkluoWLOnDkMGTKExYsXA+WbBc2aNSuok8nw4cO9JmJe/DqUXXfddbzyyiueLwmbNm3id7/7nb/D8ruKodOhsj1vdSSZBIiwsDAyMjLIyMjwTHL78MMPq5zkFipKSkro168fX375JVD+zbam61kFqltvvdXfIfjNwIEDadOmDbt27QLgiSeeaDC17EupGDpd1ZS+yw2dDiaSTALExZPcbrrpJs8ktxdffLHKSW7Bzmg0UlJS4mkKOXjwYMjUxLKzs1m2bBk5OTm4XC7P8aeeeqrK63NycsjKyqJr167Y7XZcLldQfpO12Wyepq7LdaiHSpkvp6EMnZYZ8AHij3/8I507d2bYsGGVhpLOnz8/5OYmQPnyEu+//75nhEtxcTGPPfZY0I/ogvJv5UOHDqVFixZeta3U1NRK165YsYKVK1diNpt54403yMrK4r333uMvf/mLL0OutS+++IKNGzfSu3dvALZu3UqfPn08E1QvFCplFv8jNZMA4Ha7GTJkSLUzYUMxkQA0b96c559/nrNnz6KqKk2bNq2yKSAY6fV6rr/++hpdu3z5cl588UVPB22TJk0oKiqqz/Dqxfr163nppZc8CzaOGjWKp556qspkEiplFv8T3A3UIUKj0bBnzx5/h+Fzzz77LFqtlubNm9OiRQt0Oh3PPvusv8OqE9dffz2ff/45Bw8e5OjRo54/VdHr9eh0//te53K5gnIUVKNGjXA4HJ7XDoeD5OTkKq8NlTKHyooNdUFqJgGiffv2zJs3j379+mE0Gj3HQ2VF0QsVFhaSn5+P3W7n2LFjntpIWVlZ0M+Ar3Dy5EnWrl3L7t27vZq5/vrXv1a6NjU1lUWLFmG329m5cyfLly+nZ8+evgy3Tuh0Oh5//HG6du2Koijs3LmTjh07Mn/+fMC7hh0qZb6SfWvOnTtHQkICer2ePXv2cOLECQYPHkxERISPoq1f0mcSIKqbJV3VL59gt3r1atasWcORI0do27at57jJZGLIkCGeNvdgNnnyZF599VWvb9/Vcbvd/PDDD+zcuRNVVenWrRvDhw8Pum/qq1evvuT5IUOGeP4eKmUuKytj/fr1rF69+rJD+p944gmmT59OTk4O06ZNo2fPnmRlZfHMM8/4IfK6J8lE+M2mTZvo06ePv8OoF7NmzWLChAnExMT4OxThI5fbt6ZiS+qvv/4avV7Pddddx5NPPsnLL7/sx6jrjjRzBZDMzExOnTrl1e4cKstTX2jt2rUMGjSInJwcr9nhFUJhlnhRURGTJ0+mXbt2XrWTC4cGT5ky5ZLfxINtmY2srCw+/fRTTp8+7fUzfOE8ilAr85UM6ddqtfz444+sWbPG83Nw4bDxYCfJJEC8++672O129uzZw7Bhw9i0aVPIrrZa0S9S1bpVwdbMUZ1x48Zd
9pqnn37aB5H4zpw5cxg3bhwffvghU6dOZdWqVZWuCbUyX8m+NQ8++CDfffcdo0ePJikpiezsbAYOHOjrkOuPKgLClClTvP5bVlamvvDCC/4MyS+WLFni7xD8oqCgQN26dau6detWtaCgwN/h/CpPPvmkqqqq+vjjj3uOPffcc9VeHwplzsnJ8XcIAUNqJgGiYmy+0WgkPz+fqKgosrOz/RyV74XKbnsXLrXudDpxOp2YTCY+/PDDSteuXLmSL774gi5duqCqKu+//z633HJL0K1RZjAYcLvdNGnShP/+97/Ex8dXO3ck2Mt8JfvWhFrTXnUkmQSItLQ0SktLuemmmzxr+QTL/1iisouXWt+yZQuHDx+u8tqvv/6al19+maioKKB8zbJnn3026P79x48fj91u59577+Wzzz5j9+7dPPTQQ1VeG+xlvpJ9ayqa9pYvXw7AoEGDAFi3bp3XNIBgJ8kkQFR0tPfp04eePXvicDhCZp0qAb169eKrr76q8lxCQoLXmlRhYWHVbvkbyCr6+EwmEw8++OAlrw32Ml/JvjWNGjUC4MCBA7zwwgue43fccQfPPfdcyAyykWQSQA4cOFBpYcDBgwf7MaL6UdEEpF4wKr3idajstrd582bP31VV5ciRI9VeGx8fz9SpU0lPT0dRFLZt20bbtm09I92CZXTb2bNn+frrr8nNzfX6Ga5qrlSwl/nX7FtjtVrZv38/HTt2BMr/f79UEgo2kkwCxBtvvMH58+dp1aqV14zpUEwmDWG3ve3bt3v+rtFoSEpK8uxAeLHk5GSvZUfS09MBgm4flFmzZnHNNdeQkZFx2a0Egr3Mv2bfmkmTJvHWW29hsVgACA8PZ9KkSfUapy/JpMUA8dhjjzFz5syQGRpbEzt37uT06dNAeRNJ+/bt/RyRqI2KSXni0i5MJqFEaiYBonnz5hQWFhIXF+fvUOpdbm4ur7zyCiaTiTZt2qCqKps3b8ZgMPDkk0+ydu1ahg8f7u8wf5Uvvvjikuerah8/cuQIixYtqtQ8FCyjfCp2EuzZsyfLly+nV69e6PV6z/nIyMhK7wn2Mle4kn1rHA4HmzdvJjs722uBSOkzEXWqpKSExx9//JIzpkPFvHnzuO6667zWagJYs2aNZ9XgYE0mVY3Osdls/PDDD5SUlFT5i+P111/nrrvuokWLFkFZM714J8FvvvnG63xVOwkGe5krvPLKKwwdOpSePXtetmnv5ZdfJjw8nDZt2ngl21AhySRANKTtXs+ePVspkUB5/9DChQuDuqnkpptu8vy9rKyMpUuXsmrVKvr16+d17kLR0dGePoNg9Nhjj5GQkOCpVa9evZrNmzfTqFGjalcCCPYyV7iSfWvy8/P585//XM8R+Y8kkwCRmppKYWGhZ9RPu3btQnaRwOr2gHC73RgMhqAvt9lsZsmSJaxbt47Bgwfz0ksvVdnUU2HcuHG8/fbbdOnSxesba7Csnvzee+/x3HPPAeWLHS5cuJB7772X48eP88477zBlypRK7wn2Mleo2LemW7duXi0KVW0d0b59e8+uoqFIkkmA2LBhAx9//LFnW9f58+dz1113heSquj179uTtt9/mnnvuwWQyAeXDJj/88EN69Ojh5+hq56OPPmLLli0MHz6cGTNmeMp3KatWreLs2bM4nU6vppJg+cXqdrs9yXLDhg0MHz6cPn360KdPH5544okq3xPsZa5wJfvW7N+/n9WrV5OUlIRer0dVVRRFCbp+oupIMgkQixcv5sUXX/R8Ky8uLuaFF14IyWRy5513snDhQh566CESExNRFIWcnBwGDx7M7bff7u/wamXJkiXodDoWLVrE4sWLPccrfnFUtZzKiRMnmDFjhi/DrFNutxuXy4VWq2X37t3cf//9XueqEuxlrrBlyxZmz55do31rKrYoDlWSTAKE2+32at6JjIwM2S1Bjx8/zo033shtt93GuXPn2L17N5mZmTidTqxW6yWbhALdZ599dsXvueqq
qzh9+vRld+oLVP379+f5558nKioKg8FAp06dgPKdBasb/hrsZa7QsmVLSktLa9Q0G8wDDWpCkkmA6N69O9OmTaN///5AeXNBsDf5VKeijd1gMGA2m/nqq68u28Yeyg4cOMCaNWuCtvljzJgxdOnShcLCQs+WvVD+Benee++t8j3BXuYKNdm3psKLL77oGfXmcDjIzs6madOmzJw505ch1xuZtBhANm3axIEDB1BVldTUVHr16uXvkOrFE088wSuvvALA3LlziY6O9oz6ufBcQ5GTk1Pl8Yo1nUJRqJT54j1LKlT0fV7K0aNHWbFihVezYDCTmkkAqei0DHW/po09FFksFsLDw70WPAx1oVbmmiSN6rRp0+aSa7YFG0kmfvbcc8/xwgsveO1/AZfusA12v6aNPRS9/vrrPP3005Um/UF5+3pVk/2CXaiV+Ur2rblwIUi3282xY8eIjo72Waz1TZq5hF8cPHjQ08ZeMXz27NmzWK3WKsfoCxEMKvatqWpU4ueff+75u1arpVGjRvTu3duzMV6wk2QSIN544w0efvjhyx4ToSMnJ4eIiAhPbWz37t1s3bqVRo0aMXLkyBoNNw02DaHMf/7zn5k2bVq158vKylAUpUZzkILJpReTET5TsXpuBZfLxdGjR/0UjfCFWbNmefazOH78OLNmzSIxMZHjx48zd+5cP0dXP0KtzJs3b/b82bRpE5988km11548eZInn3ySKVOm8Pjjj/PUU09x8uRJH0Zbv4L/a0CQW7x4MYsXL8ZutzN+/HigvL9Ep9ORkZHh5+hEfbLb7cTHxwOwdu1ahg4dyk033YTb7a5275NgF2plvpJ9a959913uvvtuunTpAsCePXt49913+cc//uGTWOubJBM/Gz16NKNHj+bTTz8N+tnf4spc2MK8Z88efv/73wNcdvXZYBZqZb7c9sQXstlsnkQC0LlzZ2w2W32E5ReSTAJEu3btPMMmAUpLS9mzZ0/IzjUR0KVLF2bOnElcXBxms9nzi6agoCAk+g6qEipl/jX71iQlJfHFF18waNAgANatWxd082ouRTrgA0RVk/WefPJJXn75ZT9FJOqbqqps2LCBgoIC+vXr52n+OXbsGEVFRXTv3t2/AdaDUCnzxXu2gPe+NR999FGl82azmf/85z+eicmdOnXi1ltvDerlgy4UPF8FQlxVOf3CndtEaKpYPudCrVu39vy9Yr5RKAmFMv+afWsiIyOZMGGCr0L0OUkmAaJNmzZ8+OGHjBgxAkVRWLZsmcy3CHF/+9vf6N27N1dffTWJiYme406n07NceZcuXarcSCxYhVKZa7pvTXFxMcuXLyciIoJhw4bx0UcfsX//fpKTk7n77rtp3LixH6Kve9LMFSCsViv/93//x65du1BVlW7dujFmzJiQG4su/sdut7Nq1Sp+/PFHsrOzCQ8Px+Fw4Ha76dq1KyNHjqRVq1b+DrNOhUqZL9y3ZuTIkZf8//Qf//gHbdq0wWq1smvXLoYMGUJ6ejr79u3jxx9/5Pnnn/dd4PVIkokQAcDpdFJSUoLBYCAiIsLf4fhEMJf5tttuQ6fTodVqL7sMUkV/qKqqPPjgg7z11luVzoUCaebysw8++IB77rmH6dOnV9lOXNVS1iL06HQ6zx7qDUUwl/lK9q2pGPasKEqltbiCdUh0VSSZ+FnFMMHf/va3fo5ECFEfzp8/z0svvYSqqp6/Q3ktJjs728/R1R1p5gogxcXFACG1kqgQDV11e55UqM0y9oFEkomfqarK559/zvLly1FVFVVV0Wg0XHfddVVOfBJCiEAkycTPlixZwk8//cQDDzxAUlISUF4tnjt3Lt26dePGG2/0c4RCCHF5odP7E6TWrl3Lo48+6kkkAMnJyTz88MOsXbvWj5EJIUTNSTLxM5fLVWUfSXR0tMyAF0IEDRnN5WeXWtwumBa+E0Jc2tmzZ/n666/Jzc31+qL417/+1Y9R1R35beVnx48f9+xjciFVVXE4
HH6ISAhRH2bNmsU111xDRkZGSM0vqSDJxM+uZPKTECJ4aTQarr32Wn+HUW9kNJcQQtQjs9kMwNKlS4mJiaFXr17o9XrP+VBZgl6SiRBC1KOHHnoIRVGq3GZCURRmz57th6jqniQTIYTwAbvdjsFguOyxYBV6vUBCCBGAnnvuuRodC1bSAS+EEPWosLCQ/Px87HY7x44d8zR3lZWVYbPZ/Bxd3ZFkIoQQ9WjHjh2sWbOGvLw8FixY4DluMpn4/e9/78fI6pb0mQghhA9s2rSJPn36+DuMeiPJRAgh6tHatWsZNGgQ33zzTZUb4IXKYq7SzCWEEPWool/EarX6OZL6JTUTIYSoR1u2bKFDhw7ExMT4O5R6JclECCHq0YwZMzh48CBGo5EOHTp4/jRv3tzfodUpSSZCCOED2dnZHDx4kAMHDnDw4EFyc3Np164dzzzzjL9DqxPSZyKEED6QlJSEw+HAbrdjt9s9fw8VUjMRQoh6tGjRIg4ePEhJSQlNmjShffv2XHXVVbRs2TKklqKXmokQQtSjtWvXYjKZSEtLo0OHDlx11VWEh4f7O6w6JzUTIYSoZ2azmQMHDnDgwAEOHTqE1WqlZcuWdOjQgaFDh/o7vDohyUQIIXzE5XJx9OhR9u3bx/fff092dnbIbJAnyUQIIerRtm3bPLWSU6dO0bx5c9q3b+8ZIhwdHe3vEOuE9JkIIUQ9Wr16Ne3bt+fOO++kTZs26HSh+WtXaiZCCFGPVFWtck2uK70m0IXOuDQhhAhAf/vb31i2bBm5ublex51OJ7t372b27NmsWbPGT9HVHamZCCFEPbLb7axatYoff/yR7OxswsPDcTgcuN1uunbtysiRI2nVqpW/w6w1SSZCCOEjTqeTkpISDAYDERER/g6nTkkyEUIIUWvSZyKEEKLWJJkIIYSoNUkmQgghai00Z88IcQX279/Pxx9/zKlTp9BoNDRr1ozx48fTrl07Vq9ezcqVK3nhhRfqPY5FixaxePFiANxuN06nE4PBAECjRo2YOXNmvccgxK8lyUQ0aBaLhenTp/OHP/yBfv364XQ62bdvH3q9vk7u73K50Gq1Nbp2zJgxjBkzBsCnSUyIuiDJRDRoWVlZAAwYMAAAg8FAt27dADh9+jTvvfceTqeTu+66C61WywcffIDFYmH+/Pn89NNPGI1Ghg8fzujRo9FoNJ4k0LZtW9asWcOIESO45ZZbWLhwIRs3bsTpdHL11Vdzzz33eGodl/P1119z8OBB/vSnP3mOzZ8/H41Gwz333MPzzz9P+/bt2bVrF2fPnqVz5848+OCDREZGAnDw4EEWLFjA6dOnadSoEffccw+dO3euy8cohPSZiIatSZMmaDQaZs+ezU8//YTZbPaca9asGffddx/t27fno48+4oMPPgDKf5FbLBZmz57N888/z9q1a1m9erXnfYcOHSI5OZm5c+cyZswYPvnkE7KysnjllVd4/fXXyc/P54svvqhxjAMHDuTnn3+mtLQUKK/tbNiwgUGDBnmuWbNmDZMmTeKdd95Bo9Ewf/58APLz85k+fTpjxoxh/vz53HXXXcyYMYPi4uJaPDUhKpNkIhq08PBw/v73v6MoCu+88w5/+MMfeOmllygsLKzyerfbzYYNG7j99tsJCwsjKSmJG2+8kbVr13quiYuL47rrrkOr1aLX61m5ciXjx48nMjKSsLAwxowZw/r162scY1xcHJ06dWLjxo0A7Nixg6ioKNq0aeO5ZtCgQbRo0QKTycTvfvc7Nm7ciNvtZu3atfTo0YO0tDQ0Gg1du3albdu2ZGZm/roHJkQ1pJlLNHjNmjXjoYceAuDMmTO88cYbfPDBB0yePLnStcXFxTidThITEz3HGjVqRH5+vuf1heeKi4ux2Ww8/fTTnmOqquJ2u68oxsGDB/Pdd9+RkZHBunXrvGolAAkJCV6f73K5KC4uJjc3l02bNrF9+3bPeZfLJc1cos5JMhHiAikpKQwZMoTvv/++
yvPR0dFotVpyc3Np1qwZALm5ucTHx1d5fVRUFAaDgZkzZ1Z7TU1cffXVzJ07l5MnT7J9+3buvPNOr/N5eXmev+fm5qLVaomOjiYhIYGBAwcyceLEX/3ZQtSENHOJBu3MmTN88803nl/Gubm5rF+/nquuugqA2NhY8vPzcTqdAGg0Gvr27cvChQspKysjJyeHJUuWMHDgwCrvr9FoGD58OB988AFFRUVAeT/Gjh07rihOg8FA7969ef3112nXrp1X7Qdg3bp1nD59GpvNxn/+8x/69OmDRqNh4MCBbN++nR07duB2u7Hb7ezZs8cr+QhRF6RmIhq0sLAwDh06xJIlS7BYLISHh9OzZ0/PN/8uXbp4OuI1Gg3z5s1jwoQJzJ8/nz/+8Y8YDAaGDx9+yX2877jjDr744gv+/Oc/U1JSQnx8PNdccw3du3e/oliHDBnCDz/8wKRJkyqdGzRoEG+++SZnz56lU6dOPPjgg0B5k9eTTz7Jxx9/zL/+9S80Gg3t2rXjvvvuu6LPFuJyZKFHIYJEbm4ukydP5t133yU8PNxz/Pnnn2fgwIEMHz7cj9GJhk6auYQIAm63myVLltCvXz+vRCJEoJBkIkSAs1qtjB8/np07dzJu3Dh/hyNElaSZSwghRK1JzUQIIUStSTIRQghRa5JMhBBC1JokEyGEELUmyUQIIUStSTIRQghRa/8fyqAhH2akvfwAAAAASUVORK5CYII=",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run Time: ~5s\n",
+ "\n",
+ "# Plot results\n",
+ "plot_results(\n",
+ " experiments=[dict_runs, sqlite_runs, numpy_runs, shapely_runs, numpy_index_runs],\n",
+ " title=\"Box Query\",\n",
+ " tick_label=[\n",
+ " \"DictionaryStore\",\n",
+ " \"SQLiteStore\",\n",
+ " \"NumPy\\n(Simple Loop)\",\n",
+ " \"Shapely\\n(Simple Loop)\",\n",
+ " \"NumPy\\n(With Bounds Index)\",\n",
+ " ],\n",
+ ")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LJiGGkespT56"
+ },
+ "source": [
+ "## 2.3) Size vs Approximate Lower Bound\n",
+ "\n",
+ "Here we calculate an estimated lower bound on file size by finding\n",
+ "the Shannon entropy of each file. This tells us the theoretical minimum\n",
+ "number of bits per byte. The lowest lower bound is then used as an\n",
+ "estimate of the minimum file size possible to store the annotation data.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0IO10faZpT56",
+ "outputId": "033c2530-072a-4aa5-cf34-c2298e90d86f"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " "
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Approximate Lower Bound Size: 3.60 GB\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r"
+ ]
+ }
+ ],
+ "source": [
+ "# Run Time: ~5m\n",
+ "\n",
+ "\n",
+ "# Files to consider containing keys, geometry, and properties.\n",
+ "# Files which are missing keys e.g. cells.pickle are excluded\n",
+ "# for a fair comparison.\n",
+ "file_names = [\n",
+ " \"cells-dicionary-store.pickle\",\n",
+ " \"cells-dict.pickle\",\n",
+ " \"cells.db\",\n",
+ " \"cells.db.zstd\",\n",
+ " \"cells.geojson\",\n",
+ " \"cells.ndjson\",\n",
+ " \"cells.ndjson.zstd\",\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def human_readible_bytes(byte_count: int) -> tuple[int, str]:\n",
+ " \"\"\"Convert bytes to human readble size and suffix.\"\"\"\n",
+ " byte_count_ref = 1024\n",
+ " for suffix in [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]:\n",
+ " if byte_count < byte_count_ref:\n",
+ " return byte_count, suffix\n",
+ " byte_count /= byte_count_ref\n",
+ " return byte_count, \"PB\"\n",
+ "\n",
+ "\n",
+ "def shannon_entropy(\n",
+ " fp: Path,\n",
+ " sample_size: int = 1e9, # 1GiB\n",
+ " stride: int = 7,\n",
+ " skip: int = 1e5, # 100KiB\n",
+ ") -> float:\n",
+ " \"\"\"Calculate the Shannon entropy of a file from a sample.\n",
+ "\n",
+ " The first `skip` bytes are skipped to avoid sampling low entropy\n",
+ " (highly ordered) parts which commonly occur at the beginning e.g.\n",
+ " headers.\n",
+ "\n",
+ " Args:\n",
+ " fp: File path to calculate entropy of.\n",
+ " sample_size: Number of bytes to sample from the file.\n",
+ " stride: Number of bytes to skip between samples.\n",
+ " skip: Number of bytes to skip before sampling.\n",
+ " \"\"\"\n",
+ " npmmap = np.memmap(Path(fp), dtype=np.uint8, mode=\"r\")\n",
+ " values, counts = np.unique(\n",
+ " npmmap[int(skip) : int(skip + (sample_size * stride)) : int(stride)],\n",
+ " return_counts=True,\n",
+ " )\n",
+ " total = np.sum(counts)\n",
+ " frequencies = {v: 0 for v in range(256)}\n",
+ " for v, x in zip(values, counts):\n",
+ " frequencies[v] = x / total\n",
+ " frequency_array = np.array(list(frequencies.values()))\n",
+ " epsilon = 1e-16\n",
+ " return -np.sum(frequency_array * np.log2(frequency_array + epsilon))\n",
+ "\n",
+ "\n",
+ "# Find the min across all of the representations for the lowest lower\n",
+ "# bound.\n",
+ "bytes_lower_bounds = {\n",
+ " path: (\n",
+ " shannon_entropy(Path(path)) / 8 * len(np.memmap(path, dtype=np.uint8, mode=\"r\"))\n",
+ " )\n",
+ " for path in tqdm(\n",
+ " [Path.cwd() / name for name in file_names],\n",
+ " position=0,\n",
+ " leave=False,\n",
+ " )\n",
+ "}\n",
+ "\n",
+ "lowest_bytes_lower_bound = min(bytes_lower_bounds.values())\n",
+ "\n",
+ "size, suffix = human_readible_bytes(lowest_bytes_lower_bound)\n",
+ "logger.info(\"Approximate Lower Bound Size: %2f %s\", size, suffix)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "chwB3zeupT56"
+ },
+ "source": [
+ "### Plot Results\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "cu5jkrVppT56",
+ "outputId": "bb36aea5-d5d7-4560-a853-d2a8afba0eac"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Get file sizes\n",
+ "file_sizes = {\n",
+ " path: path.stat().st_size for path in [Path.cwd() / name for name in file_names]\n",
+ "}\n",
+ "\n",
+ "# Sort by size\n",
+ "file_sizes = dict(sorted(file_sizes.items(), key=lambda x: x[1]))\n",
+ "\n",
+ "# Plot\n",
+ "plt.bar(\n",
+ " x=range(len(file_sizes)),\n",
+ " height=file_sizes.values(),\n",
+ " tick_label=[p.name for p in file_sizes],\n",
+ " color=[f\"C{i}\" for i in range(len(file_sizes))],\n",
+ ")\n",
+ "plt.xlabel(\"File Name\")\n",
+ "plt.ylabel(\"Bytes\")\n",
+ "plt.xticks(rotation=90)\n",
+ "plt.hlines(\n",
+ " y=lowest_bytes_lower_bound,\n",
+ " xmin=-0.5,\n",
+ " xmax=len(file_sizes) - 0.5,\n",
+ " linestyles=\"dashed\",\n",
+ " color=\"black\",\n",
+ " label=\"Approximate Bytes Lower Bound\",\n",
+ ")\n",
+ "plt.legend()\n",
+ "plt.tight_layout()\n",
+ "plt.title(\"Polygon Annotation File Sizes\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gmuEWlImpT57"
+ },
+ "source": [
+ "The SQLite representation (4.9GB) appears to be quite compact compared\n",
+ "with GeoJSON and ndjson. Although not as compact as a dictionary pickle\n",
+ "or Zstandard compressed ndjson, it offers a good compromise between\n",
+ "compactness and read performance.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Yhe5rMXPpT57"
+ },
+ "source": [
+ "# 3: Extra Bits\n",
+ "\n",
+ "## 3.1) Space Saving\n",
+ "\n",
+ "A lot of space can be saved by rounding the coordinates to the nearest\n",
+ "integer when storing them. Below we make a copy of the dataset with all\n",
+ "coordinates rounded.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "H2Jsc0repT57",
+ "outputId": "d2ca9eff-b67d-4bfc-ad5a-57c87bc6a7da"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 10008338/10008338 [51:00<00:00, 3270.16it/s] \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run Time: ~50m\n",
+ "! rm integer-cells.db\n",
+ "int_cell_sqlite_store = SQLiteStore(\"integer-cells.db\")\n",
+ "\n",
+ "# We use batches of 1000 to speed up appending\n",
+ "batch = {}\n",
+ "batch_size = 1000\n",
+ "for key, annotation in tqdm(cell_sqlite_store.items(), total=len(cell_sqlite_store)):\n",
+ " geometry = Polygon(np.array(annotation.geometry.exterior.coords).round())\n",
+ " rounded_annotation = Annotation(geometry, annotation.properties)\n",
+ " batch[key] = rounded_annotation\n",
+ " if len(batch) >= batch_size:\n",
+ " int_cell_sqlite_store.append_many(batch.values(), batch.keys())\n",
+ " batch = {}\n",
+ "_ = int_cell_sqlite_store.append_many(batch.values(), batch.keys())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U6aooIROpT57"
+ },
+ "source": [
+ "Here the database size is reduced to 2.9GB, down from 4.9GB.\n",
+ "Additionally, when using integer coordinates, the database compresses\n",
+ "much better. Zstandard can compress to approximately 60% of the\n",
+ "original size (and 35% of the floating point coordinate\n",
+ "database size). This may be done for archival purposes.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Q3TJ8XX4pT57",
+ "outputId": "b99d1af7-4c68-4394-cf9a-8bb2b64471a0"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "integer-cells.db : 60.58% ( 2.86 GiB => 1.73 GiB, integer-cells.db.zstd) \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run time: ~15s\n",
+ "! zstd -f -k integer-cells.db -o integer-cells.db.zstd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "alFRiIAbpT57"
+ },
+ "source": [
+ "With higher (slower) compression settings the space can be further\n",
+ "reduced for long term storage.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nVFqovfPpT57",
+ "outputId": "0948bbe6-4252-4c93-eab7-8e3be4e98235"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "integer-cells.db : 51.22% ( 2.86 GiB => 1.47 GiB, integer-cells.db.19.zstd) \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run time: ~20m\n",
+ "! zstd -f -k -19 --long integer-cells.db -o integer-cells.db.19.zstd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "C3voJ43OpT57"
+ },
+ "source": [
+ "## 3.2) Feature Comparison Summary\n",
+ "\n",
+ "Here we briefly summarise some of the positives and negatives of each format and construct a comparison matrix.\n",
+ "\n",
+ "**GeoJSON**\n",
+ "\n",
+ "*Positives*\n",
+ "\n",
+ "- Simple, based on JSON, which is well known.\n",
+ "- Well defined with a public specification.\n",
+ "- Popular format for geometry; many tools work with it.\n",
+ "- Fast to write.\n",
+ "\n",
+ "*Negatives*\n",
+ "\n",
+ "- Requires loading the whole file into memory for parsing. Some\n",
+ " specialised parsers can, in some situations, reduce or avoid this but\n",
+ " it is not possible in general.\n",
+ "- Not a very compact representation.\n",
+ "\n",
+ "**ndjson (One GeoJSON Feature Per Line)**\n",
+ "\n",
+ "*Positives*\n",
+ "\n",
+ "- Simple.\n",
+ "- Easier to parse than JSON/GeoJSON. Each line can be parsed\n",
+ " independently.\n",
+ "- Many tools to parse JSON lines.\n",
+ "- Fast to write.\n",
+ "\n",
+ "*Negatives*\n",
+ "\n",
+ "- Not a very compact representation.\n",
+ "- Requires loading the whole dataset from disk before querying OR\n",
+ " scanning through and reparsing each line for each query.\n",
+ "- Amending annotations can be tricky. The easiest way is to blank out a\n",
+ " line and append a modified copy each time. This could end up\n",
+ " fragmenting the file and wasting a lot of space. More complex methods\n",
+ " could be developed to reduce fragmenting the file.\n",
+ "\n",
+ "**pickle**\n",
+ "\n",
+ "*Positives*\n",
+ "\n",
+ "- Fast to write.\n",
+ "\n",
+ "*Negatives*\n",
+ "\n",
+ "- Vulnerable to arbitrary code execution when loading from disk.\n",
+ "- Requires loading the whole dataset into memory for querying.\n",
+ "\n",
+ "**SQLite (SQLiteStore Flavour)**\n",
+ "\n",
+ "*Positives*\n",
+ "\n",
+ "- Very fast to query (uses an R-TREE index to accelerate\n",
+ " spatial queries).\n",
+ "- Does not require loading data into memory before querying.\n",
+ "- Possible to index property lookups.\n",
+ "\n",
+ "*Negatives*\n",
+ "\n",
+ "- Not the most compact representation on disk.\n",
+ "\n",
+ "### Feature Matrix\n",
+ "\n",
+ "| Format | Size On-Disk | Size In-Memory | Partial Reads | Serialization | Query Performance |\n",
+ "| ----------: | :----------- | :------------- | :------------ | :------------ | :---------------- |\n",
+ "| SQLiteStore | Medium | Small | Yes | Slow | Fast |\n",
+ "| GeoJSON | Large | Large | No | Fast | Slow |\n",
+ "| ndjson | Large | Large | Yes | Fast | Medium |\n",
+ "| pickle | Small | Medium | No | Medium | Slow |\n",
+ "\n"
]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "for n in range(4):\n",
- " display(cell_polygon(xy=(0, 0), n_points=20, repeat_first=False, seed=n))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "APUNL2PtpT5w"
- },
- "source": [
- "### Randomised Cell Boundaries\n",
- "\n",
- "Here we create a function to generate grid of cells for testing. It uses a fixed seed for reproducibility.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "SOpBKM7IpT5w"
- },
- "source": [
- "### A Sample 5×5 Grid\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2xA-oG4VpT5w",
- "outputId": "caea51e4-8a27-4dd1-ed0d-c272b93d8bb7"
- },
- "outputs": [
- {
- "data": {
- "image/svg+xml": " ",
- "text/plain": [
- ""
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "MultiPolygon(polygons=list(cell_grid(size=(5, 5), spacing=35)))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "b6S8vzFipT5w"
- },
- "source": [
- "# Part 1: Small Scale Benchmarking of Annotation Storage\n",
- "\n",
- "Using the already defined data generation functions (`cell_polygon` and\n",
- "`cell_grid`), we create some simple artificial cell boundaries by\n",
- "creating a circle of points, adding some noise, scaling to introduce\n",
- "eccentricity, and then rotating. We use 20 points per cell, which is a\n",
- "reasonably high value for cell annotation. However, this can be\n",
- "adjusted.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "UZMoLDvkpT5x"
- },
- "source": [
- "## 1.1) Appending Annotations (In-Memory & Disk I/O)\n",
- "\n",
- "Here we test:\n",
- "\n",
- "1. A python dictionary based in-memory store (`DictionaryStore`)\n",
- "1. An SQLite database based in-memory store (`SQLiteStore`)\n",
- "\n",
- "Both of these stores may operate in memory. The `SQLiteStore` may also\n",
- "be backed by an on-disk file for datasets which are too large to fit in\n",
- "memory. The `DictionaryStore` class can serialise/deserialise itself\n",
- "to/from disk in a line delimited GeoJSON format (each line seperated\n",
- "by `\\n` is a valid GeoJSON object)\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DZBiw_EepT5x"
- },
- "outputs": [],
- "source": [
- "# Convert to annotations (a dataclass pairing a geometry and (optional)\n",
- "# key-value properties)\n",
- "# Run time: ~2s\n",
- "annotations = [\n",
- " Annotation(polygon) for polygon in cell_grid(size=(100, 100), spacing=35)\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "LUVa03F2pT5x"
- },
- "source": [
- "### 1.1.1) In Memory Append\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "7PzE7AhdpT5x",
- "outputId": "974bb3d0-3290-4315-a6fc-3b7ca90072a6"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~5s\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " \"dict_store.append_many(annotations)\",\n",
- " setup=\"dict_store = DictionaryStore()\",\n",
- " globals={\"DictionaryStore\": DictionaryStore, \"annotations\": annotations},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"sql_store.append_many(annotations)\",\n",
- " setup=\"sql_store = SQLiteStore()\",\n",
- " globals={\"SQLiteStore\": SQLiteStore, \"annotations\": annotations},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"Time to Append 10,000 Annotations In Memory\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n",
- "plt.xlim([-0.5, 1.5])\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "gU6PLE7wpT5x"
- },
- "source": [
- "Note that inserting into the `SQLiteStore` is much slower than the\n",
- "`DictionaryStore`. Appending to a `Dictionary` store simply requires\n",
- "adding a memory reference to a dictionary. Therefore, this is a very\n",
- "fast operation. On the other hand, for the `SQLiteStore`, the insertion\n",
- "is slower because the data must be serialised for the database and the\n",
- "R-Tree spatial index must also be updated. Updating the index is a\n",
- "relatively expensive operation. However, this spatial index allows for\n",
- "very fast queries of a very large set of annotations within a set of\n",
- "spatial bounds.\n",
- "\n",
- "Insertion is typically only performed once for each\n",
- "annotation, whereas queries may be performed many times on the\n",
- "annotation set. Therefore, it makes sense to trade a more expensive\n",
- "insertion for fast queries as the cost of insertion will be amortised\n",
- "over a number of queries on the data. Additionally, data may be written\n",
- "to the database from multiple threads or subprocesses (so long as a new\n",
- "instance of `SQLiteStore` is created for each thread or subprocess to\n",
- "attach to a database on disk) thus freeing up the main thread.\n",
- "\n",
- "For comparison, we also compare bulk insertion plus seralising to disk\n",
- "as line-delimited GeoJSON from the `DictionaryStore` as this is the\n",
- "default serialisation to disk method (`DictionaryStore.dump(file_path`).\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "t2q9QTCfpT5x",
- "outputId": "2202c328-ba48-476b-8efa-662678d75135"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~10s\n",
- "\n",
- "setup = \"fp.truncate(0)\\nstore = Store(fp)\" # Clear the file\n",
- "\n",
- "# Time dictionary store\n",
- "with tempfile.NamedTemporaryFile(\"w+\") as fp:\n",
- " dict_runs = timeit.repeat(\n",
- " (\"store.append_many(annotations)\\nstore.commit()\"),\n",
- " setup=setup,\n",
- " globals={\"Store\": DictionaryStore, \"annotations\": annotations, \"fp\": fp},\n",
- " number=1,\n",
- " repeat=3,\n",
- " )\n",
- "\n",
- "# Time SQLite store\n",
- "with tempfile.NamedTemporaryFile(\"w+b\") as fp:\n",
- " sqlite_runs = timeit.repeat(\n",
- " (\"store.append_many(annotations)\\nstore.commit()\"),\n",
- " setup=setup,\n",
- " globals={\"Store\": SQLiteStore, \"annotations\": annotations, \"fp\": fp},\n",
- " number=1,\n",
- " repeat=3,\n",
- " )\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"Time to Append & Serialise 10,000 Annotations To Disk\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n",
- "plt.xlim([-0.5, 1.5])\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "LKr6FmctpT5x"
- },
- "source": [
- "Here we can see that when we include the serialisation to disk in the\n",
- "benchmark, the time to insert is much more similar.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "V7WV8wNmpT5x"
- },
- "source": [
- "## 1.2) Box Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "eul4PYZPpT5x",
- "outputId": "a0131a72-f527-48b1-8aac-8cbccfced2ed"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~20s\n",
- "\n",
- "# One time Setup\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "dict_store.append_many(annotations)\n",
- "sql_store.append_many(annotations)\n",
- "\n",
- "rng = np.random.default_rng(123)\n",
- "# Each query box is 128 x 128 with its minimum corner at (x, y);\n",
- "# from_bounds takes (xmin, ymin, xmax, ymax), not a width and height.\n",
- "boxes = [\n",
- " Polygon.from_bounds(x, y, x + 128, y + 128)\n",
- " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
- "]\n",
- "stmt = \"for box in boxes:\\n _ = store.query(box)\"\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": dict_store, \"boxes\": boxes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": sql_store, \"boxes\": boxes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"100 Box Queries\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "z9ntCgKapT5x"
- },
- "source": [
- "Here we can see that the `SQLiteStore` is a bit faster. Additionally,\n",
- "the difference in performance is more pronounced when there are more\n",
- "annotations in the store (as we will see later in this notebook) or when\n",
- "just returning keys:\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "vfGH6e4upT5x",
- "outputId": "7cf8bf30-a4c9-4de5-9a5f-f9fd6cffc141"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~15s\n",
- "\n",
- "# One time Setup\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "dict_store.append_many(annotations)\n",
- "sql_store.append_many(annotations)\n",
- "\n",
- "rng = np.random.default_rng(123)\n",
- "# Each query box is 128 x 128 with its minimum corner at (x, y);\n",
- "# from_bounds takes (xmin, ymin, xmax, ymax), not a width and height.\n",
- "boxes = [\n",
- " Polygon.from_bounds(x, y, x + 128, y + 128)\n",
- " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
- "]\n",
- "stmt = \"for box in boxes:\\n _ = store.iquery(box)\" # Just return the keys (uuids)\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": dict_store, \"boxes\": boxes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": sql_store, \"boxes\": boxes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"100 Box Queries (Key Lookup Only)\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "xVQlsK1MpT5y"
- },
- "source": [
- "## 1.3) Polygon Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fnkdnKWRpT5y",
- "outputId": "03ccc35c-df96-4d68-9d53-72ac835a9088"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~15s\n",
- "\n",
- "# One time Setup\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "dict_store.append_many(annotations)\n",
- "sql_store.append_many(annotations)\n",
- "\n",
- "rng = np.random.default_rng(123)\n",
- "query_polygons = [\n",
- " Polygon(\n",
- " [\n",
- " (x, y),\n",
- " (x + 128, y),\n",
- " (x + 128, y + 128),\n",
- " (x, y),\n",
- " ],\n",
- " )\n",
- " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
- "]\n",
- "stmt = \"for polygon in query_polygons:\\n _ = store.query(polygon)\"\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": dict_store, \"query_polygons\": query_polygons},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": sql_store, \"query_polygons\": query_polygons},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"100 Polygon Queries\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "1k1xOgB5pT5y"
- },
- "source": [
- "Here we can see that performing queries within a polygon region is about\n",
- "10x faster with the `SQLiteStore` than with the `DictionaryStore`.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "iYFK95w1pT5y"
- },
- "source": [
- "## 1.4) Predicate Query\n",
- "\n",
- "Here we query the whole annotation region but with a predicate to\n",
- "select only annotations with the class label of 0. We also\n",
- "demonstrate how creating a database index can dramatically improve\n",
- "the performance of queries.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "zNX4UG4BpT5y",
- "outputId": "97444739-4aa5-42c7-bebc-84a022282ac7"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~2m\n",
- "\n",
- "# Setup\n",
- "labelled_annotations = copy.deepcopy(annotations)\n",
- "for n, annotation in enumerate(labelled_annotations):\n",
- " annotation.properties[\"class\"] = n % 10\n",
- " annotation.properties[\"vector\"] = rng.integers(1, 4, 10).tolist()\n",
- "\n",
- "predicate = \"(props['class'] == ?) & (3 in props['vector'])\"\n",
- "classes = rng.integers(0, 10, size=100)\n",
- "stmt = \"for n in classes:\\n store.query(where=predicate.replace('?', str(n)))\"\n",
- "\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "\n",
- "dict_store.append_many(labelled_annotations)\n",
- "sql_store.append_many(labelled_annotations)\n",
- "\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": dict_store, \"predicate\": predicate, \"classes\": classes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "dict_result = dict_store.query(where=predicate.replace(\"?\", \"0\"))\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "sql_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n",
- "\n",
- "\n",
- "# Add an index\n",
- "# Note: Indexes may not always speed up the query (sometimes they can\n",
- "# actually slow it down), test to make sure.\n",
- "sql_store.create_index(\"class_lookup\", \"props['class']\")\n",
- "sql_store.create_index(\"has_3\", \"3 in props['vector']\")\n",
- "\n",
- "# Time SQLite store again\n",
- "sqlite_index_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "sql_index_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n",
- "\n",
- "# # Validate the results against each other\n",
- "# for a, b, c in zip(dict_result, sql_result, sql_index_result):\n",
- "# assert a.geometry == b.geometry == c.geometry # noqa: ERA001\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs, sqlite_index_runs],\n",
- " title=\"100 Queries with a Predicate\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"SQLiteStore\\n(with index)\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "gp8mq1TNpT5y"
- },
- "source": [
- "### Polygon & Predicate Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Eu0hGvhdpT5y",
- "outputId": "0d89174e-01e0-4e71-a9c3-e063ed30ca38"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~10s\n",
- "\n",
- "# Setup\n",
- "labelled_annotations = copy.deepcopy(annotations)\n",
- "for n, annotation in enumerate(labelled_annotations):\n",
- " annotation.properties[\"class\"] = n % 10\n",
- "\n",
- "predicate = \"props['class'] == \"\n",
- "# One class per query polygon, so that zip() in stmt yields all 100 queries\n",
- "classes = rng.integers(0, 10, size=100)\n",
- "query_polygons = [\n",
- " Polygon(\n",
- " [\n",
- " (x, y),\n",
- " (x + 128, y),\n",
- " (x + 128, y + 128),\n",
- " (x, y),\n",
- " ],\n",
- " )\n",
- " for x, y in rng.integers(0, 1000, size=(100, 2))\n",
- "]\n",
- "stmt = (\n",
- " \"for n, poly in zip(classes, query_polygons):\\n\"\n",
- " \" store.query(poly, where=predicate + str(n))\"\n",
- ")\n",
- "\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "\n",
- "dict_store.append_many(labelled_annotations)\n",
- "sql_store.append_many(labelled_annotations)\n",
- "\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\n",
- " \"store\": dict_store,\n",
- " \"predicate\": predicate,\n",
- " \"classes\": classes,\n",
- " \"query_polygons\": query_polygons,\n",
- " },\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "dict_result = dict_store.query(query_polygons[0], where=predicate + \"0\")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\n",
- " \"store\": sql_store,\n",
- " \"predicate\": predicate,\n",
- " \"classes\": classes,\n",
- " \"query_polygons\": query_polygons,\n",
- " },\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "sql_result = sql_store.query(query_polygons[0], where=predicate + \"0\")\n",
- "\n",
- "\n",
- "# Check that the set difference of bounding boxes is empty i.e. all sets\n",
- "# of results contain polygons which produce the same set of bounding\n",
- "# boxes. This avoids being tripped up by slight variations in order or\n",
- "# coordinate order between the results.\n",
- "dict_set = {x.geometry.bounds for x in dict_result}\n",
- "sql_set = {x.geometry.bounds for x in sql_result}\n",
- "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"100 Queries with a Polygon and Predicate\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kJ8x5tJmpT5y"
- },
- "source": [
- "### Complex Predicate Query\n",
- "\n",
- "Here we slightly increase the complexity of the predicate to show how\n",
- "the complexity of a predicate can dramatically affect the performance\n",
- "when handling many annotations.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "VHb4PqbHpT5y",
- "outputId": "343b44c7-741d-4e11-9dd2-85f357ba6f32"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run time: ~1m\n",
- "\n",
- "# Setup\n",
- "box = Polygon.from_bounds(0, 0, 1024, 1024)\n",
- "labelled_annotations = copy.deepcopy(annotations)\n",
- "for n, annotation in enumerate(labelled_annotations):\n",
- " annotation.properties[\"class\"] = n % 4\n",
- " annotation.properties[\"n\"] = n\n",
- "\n",
- "predicate = \"(props['n'] > 1000) & (props['n'] % 4 == 0) & (props['class'] == \"\n",
- "targets = rng.integers(0, 4, size=100)\n",
- "stmt = \"for n in targets:\\n store.query(box, where=predicate + str(n) + ')')\"\n",
- "\n",
- "dict_store = DictionaryStore()\n",
- "sql_store = SQLiteStore()\n",
- "\n",
- "dict_store.append_many(labelled_annotations)\n",
- "sql_store.append_many(labelled_annotations)\n",
- "\n",
- "\n",
- "# Time dictionary store\n",
- "dict_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\n",
- " \"store\": dict_store,\n",
- " \"predicate\": predicate,\n",
- " \"targets\": targets,\n",
- " \"box\": box,\n",
- " },\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "dict_result = dict_store.query(box, where=predicate + \"0)\")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " stmt,\n",
- " globals={\n",
- " \"store\": sql_store,\n",
- " \"predicate\": predicate,\n",
- " \"targets\": targets,\n",
- " \"box\": box,\n",
- " },\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "sql_result = sql_store.query(box, where=predicate + \"0)\")\n",
- "\n",
- "\n",
- "# Check that the set difference of bounding boxes is empty i.e. all sets\n",
- "# of results contain polygons which produce the same set of bounding\n",
- "# boxes. This avoids being tripped up by slight variations in order or\n",
- "# coordinate order between the results.\n",
- "dict_set = {x.geometry.bounds for x in dict_result.values()}\n",
- "sql_set = {x.geometry.bounds for x in sql_result.values()}\n",
- "\n",
- "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n",
- "\n",
- "# Plot the results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"100 Queries with a Complex Predicate\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "CAT0KmS6pT5y"
- },
- "source": [
- "# Part 2: Large Scale Dataset Benchmarking\n",
- "\n",
- "Here we generate some sets of annotations with five million items each\n",
- "(in a 2237 x 2237 grid). One is a set of points, the other a set of\n",
- "generated cell boundaries.\n",
- "\n",
- "The code to generate and write out the annotations to various formats is\n",
- "included in the following cells. However, some of these take a very long\n",
- "time to run. A pre-generated dataset is downloaded and then read from\n",
- "disk instead to save time. However, you may uncomment the generation\n",
- "code to replicate the original.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "nwH5zYFupT5y"
- },
- "source": [
- "## 2.1) Points Dataset\n",
- "\n",
- "Here we generate a simple points dataset in a grid. The grid is 2237 x 2237\n",
- "and contains over 5 million points. We also write this to disk in\n",
- "various formats. Some formats take a long time and are commented out. A\n",
- "summary of times for a consumer laptop is shown in a table at the end.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2FjCL2jgpT5y"
- },
- "outputs": [],
- "source": [
- "# Generate some points with a little noise\n",
- "# Run time: ~5s\n",
- "points = np.array(\n",
- " [\n",
- " [x, y]\n",
- " for x in np.linspace(0, 75_000, 2237)\n",
- " for y in np.linspace(0, 75_000, 2237)\n",
- " ],\n",
- ")\n",
- "# Add some noise between -1 and 1\n",
- "rng_42 = np.random.default_rng(42)\n",
- "points += rng_42.uniform(-1, 1, size=(2237**2, 2))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DRWABSBVpT5z"
- },
- "source": [
- "### 2.1.1) Writing To Disk\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "x76WbSFdpT52"
- },
- "outputs": [],
- "source": [
- "# Save as a simple Numpy array (.npy)\n",
- "# Run time: <1s\n",
- "np.save(\"points.npy\", points)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "dkKtM-DKpT52"
- },
- "outputs": [],
- "source": [
- "# Save as compressed NumPy archive (.npz)\n",
- "# Run time: ~5s\n",
- "np.savez_compressed(\"points.npz\", points)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "rbHdEIbPpT52"
- },
- "source": [
- "Note that the above numpy format is missing the keys (UUIDs) of each point.\n",
- "This may not be required in all cases. However, for the sake of comparison\n",
- "we also generate a NumPy archive with keys included. We store the UUIDs\n",
- "as integers to save space and for a fair comparison where the optimal\n",
- "storage method is used in each case. Note however that UUIDs are too\n",
- "large to be a standard C type and therefore are stored as an object\n",
- "array.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DbLm4l5tpT52"
- },
- "outputs": [],
- "source": [
- "# Generate UUIDs\n",
- "# Run time: ~10s\n",
- "keys = np.array([uuid.uuid4().int for _ in range(len(points))])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "zXuAqw0KpT52"
- },
- "outputs": [],
- "source": [
- "# Generate some UUIDs as keys\n",
- "# Save in NumPy format (.npz)\n",
- "# Run time: <1s\n",
- "np.savez(\"uuid_points.npz\", keys=keys, coords=points)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "UAHAgPU4pT52"
- },
- "outputs": [],
- "source": [
- "# Save in compressed (zip) NumPy format (.npz)\n",
- "# Run time: ~10s\n",
- "np.savez_compressed(\"uuid_points_compressed.npz\", keys=keys, coords=points)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "j5wlDFYfpT52"
- },
- "outputs": [],
- "source": [
- "# Write to SQLite with SQLiteStore\n",
- "# Run time: ~10m\n",
- "points_sqlite_store = SQLiteStore(\"points.db\")\n",
- "_ = points_sqlite_store.append_many(\n",
- " annotations=(Annotation(Point(x, y)) for x, y in points),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "tUekiEqspT53"
- },
- "outputs": [],
- "source": [
- "# Load a DictionaryStore into memory by copying from the SQLiteStore\n",
- "# Run time: ~1m 30s\n",
- "points_dict_store = DictionaryStore(Path(\"points.ndjson\"))\n",
- "for key, value in points_sqlite_store.items():\n",
- " points_dict_store[key] = value"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Uynntjq7pT53"
- },
- "outputs": [],
- "source": [
- "# Save as GeoJSON\n",
- "# Run time: ~1m 30s\n",
- "points_sqlite_store.to_geojson(\"points.geojson\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "4YMuggcgpT53"
- },
- "outputs": [],
- "source": [
- "# Save as ndjson\n",
- "# Run time: ~1m 30s\n",
- "# Spec: https://github.com/ndjson/ndjson-spec\n",
- "points_sqlite_store.to_ndjson(\"points.ndjson\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "lW9NoCPwpT53"
- },
- "source": [
- "### 2.1.2) Points Dataset Statistics Summary\n",
- "\n",
- "| Format | Write Time | Size |\n",
- "| -----------------------------: | ---------: | -----: |\n",
- "| SQLiteStore (.db) | 6m 20s | 893MB |\n",
- "| ndjson | 1m 23s | 667 MB |\n",
- "| GeoJSON | 1m 42s | 500 MB |\n",
- "| NumPy + UUID (.npz) | 0.5s | 165 MB |\n",
- "| NumPy + UUID Compressed (.npz) | 31s | 136 MB |\n",
- "| NumPy (.npy) | 0.1s | 76 MB |\n",
- "| NumPy Compressed (.npz) | 3.3s | 66 MB |\n",
- "\n",
- "Note that the points SQLite database is significantly larger than the\n",
- "NumPy arrays on disk. The numpy array is much more storage efficient\n",
- "partly because there is no R Tree index or unique identifier (UUID)\n",
- "stored for each point. For a more fair comparison, another NumPy archive\n",
- "(.npz) is created where the keys are stored along with the coordinates.\n",
- "\n",
- "Also note that although the compressed NumPy representation is much\n",
- "smaller, it must be decompressed in memory before it can be used. The\n",
- "uncompressed versions may be memory mapped if their size exceeds the\n",
- "available memory.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "a_3Gz5Q0pT53"
- },
- "source": [
- "### 2.1.3) Simple Box Query\n",
- "\n",
- "Here we evaluate the performance of performing a simple box query on the\n",
- "data. All points which are in the area between 128 and 256 in the x and\n",
- "y coordinates are retrieved. It is assumed that the data is already in\n",
- "memory for the NumPy formats. In reality this would not be the case for\n",
- "the first query: all data would have to be read from disk, which is a\n",
- "significant overhead. However, this cost is amortised across many\n",
- "queries. To ensure the fairest possible comparison, it is assumed that\n",
- "many queries will be performed, and that this data loading cost is\n",
- "negligible.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "o9J0d6gdpT53"
- },
- "outputs": [],
- "source": [
- "box = Polygon.from_bounds(128, 128, 256, 256)\n",
- "\n",
- "# Time numpy\n",
- "numpy_runs = timeit.repeat(\n",
- " (\n",
- " \"where = np.all([\"\n",
- " \"points[:, 0] > 128,\"\n",
- " \"points[:, 0] < 256,\"\n",
- " \"points[:, 1] > 128,\"\n",
- " \"points[:, 1] < 256\"\n",
- " \"], 0)\\n\"\n",
- " \"uuids = keys[where]\\n\"\n",
- " \"result = points[where]\\n\"\n",
- " ),\n",
- " globals={\"keys\": keys, \"points\": points, \"np\": np},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time SQLiteStore\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"store.query(box)\",\n",
- " globals={\"store\": points_sqlite_store, \"box\": box},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time DictionaryStore\n",
- "dict_runs = timeit.repeat(\n",
- " \"store.query(box)\",\n",
- " globals={\"store\": points_dict_store, \"box\": box},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "eX1qqUIipT53",
- "outputId": "a4033a88-6b2d-4a55-f3f6-ba419ef748c0"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs, numpy_runs],\n",
- " title=\"Points Box Query (5 Million Points)\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"NumPy Array\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aNU6FP90pT53"
- },
- "source": [
- "Although the NumPy array is very space efficient on disk, it is not as\n",
- "fast to query as the `SQLiteStore`. The `SQLiteStore` is likely faster\n",
- "due to the use of the R tree index. Furthermore, the method used to\n",
- "store the points in a NumPy array is limited in that it does not use\n",
- "UUIDs, which makes merging two datasets more difficult as the indexes of\n",
- "points no longer uniquely identify them. Additionally, only homogeneous\n",
- "data such as two-dimensional coordinates can be practically stored in\n",
- "this way. If the user would like to store variable length data\n",
- "structures such as polygons, or even mix data types by storing both\n",
- "points and polygons, then using raw NumPy arrays in this way can become\n",
- "cumbersome and begins to offer little benefit in terms of storage\n",
- "efficiency or query performance.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "c766NXGPpT53"
- },
- "source": [
- "### 2.1.4) Polygon Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "6jiMpRnxpT53"
- },
- "outputs": [],
- "source": [
- "big_triangle = Polygon(\n",
- " shell=[ # noqa: S604\n",
- " (1024, 1024),\n",
- " (1024, 4096),\n",
- " (4096, 4096),\n",
- " (1024, 1024),\n",
- " ],\n",
- ")\n",
- "\n",
- "# Time SQLiteStore\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"store.query(polygon)\",\n",
- " globals={\"store\": points_sqlite_store, \"polygon\": big_triangle},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")\n",
- "\n",
- "# Time DictionaryStore\n",
- "dict_runs = timeit.repeat(\n",
- " \"store.query(polygon)\",\n",
- " globals={\"store\": points_dict_store, \"polygon\": big_triangle},\n",
- " number=1,\n",
- " repeat=10,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Es2OQ5OdpT53",
- "outputId": "b98176ee-7003-49f7-f5ca-62b08180b2ee"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"Polygon Query (5 Million Points)\",\n",
- " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HUBEmZDMpT53"
- },
- "source": [
- "## 2.2) Cell Boundary Polygons Dataset\n",
- "\n",
- "Here we generate a much larger and more complex polygon dataset. This\n",
- "consists of a grid of over 5 million generated cell-boundary-like\n",
- "polygons.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xhCr_TDVpT53",
- "outputId": "c02b7a20-6ab1-4cae-b6bb-fb5c6d94cd12"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 5004169/5004169 [10:04<00:00, 8277.35it/s] \n"
- ]
- }
- ],
- "source": [
- "# Generate a grid of 5 million cell boundary polygons (2237 x 2237)\n",
- "# Run time: ~10m\n",
- "rng_42 = np.random.default_rng(42)\n",
- "\n",
- "cell_polygons = [\n",
- " Annotation(geometry=polygon, properties={\"class\": rng_42.integers(0, 4)})\n",
- " for polygon in tqdm(cell_grid(size=(2237, 2237), spacing=35), total=2237**2)\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "21RgwKtgpT54"
- },
- "source": [
- "### 2.2.1) Write To Formats For Comparison\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "CDVLMRUtpT54"
- },
- "outputs": [],
- "source": [
- "# Write to an SQLiteStore on disk (SSD for recorded times here)\n",
- "# Run time: ~30m\n",
- "cell_sqlite_store = SQLiteStore(\"cells.db\")\n",
- "_ = cell_sqlite_store.append_many(annotations=cell_polygons)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "6Fb4tQHVpT54",
- "outputId": "fba12c47-e0cb-44fd-ca95-35c38454c9cc"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " \r"
- ]
- }
- ],
- "source": [
- "# Create a copy as an in memory DictionaryStore\n",
- "# Run time: ~5m\n",
- "cell_dict_store = DictionaryStore()\n",
- "for key, value in tqdm( # Show a nice progress bar\n",
- " cell_sqlite_store.items(),\n",
- " total=len(cell_sqlite_store),\n",
- " leave=False,\n",
- " position=0,\n",
- "):\n",
- " cell_dict_store[key] = value"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "wXOOuGWypT54",
- "outputId": "e2fb300e-e5b8-4459-b172-249cda363b50"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 5004169/5004169 [01:26<00:00, 58002.74it/s]\n"
- ]
- }
- ],
- "source": [
- "# Transform into a numpy array\n",
- "# Run Time: ~1m\n",
- "cell_polygons_np = np.array(\n",
- " [np.array(a.geometry.exterior.coords) for a in tqdm(cell_polygons)],\n",
- " dtype=object,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "yv9VgW9TpT54"
- },
- "outputs": [],
- "source": [
- "# Create an Nx4 index of (xmin, ymin, xmax, ymax) as a simple spatial\n",
- "# index to speed up the numpy query.\n",
- "# Run time: ~1m\n",
- "min_max_index = np.array(\n",
- " [(*np.min(coords, 0), *np.max(coords, 0)) for coords in cell_polygons_np],\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "nFmHxwBwpT54"
- },
- "outputs": [],
- "source": [
- "# Write to GeoJSON\n",
- "# Run time: ~10m\n",
- "\n",
- "cell_dict_store.to_geojson(\"cells.geojson\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "2UH6WdmipT54"
- },
- "outputs": [],
- "source": [
- "# Write to line delimited JSON (ndjson)\n",
- "# Run time: ~10m\n",
- "\n",
- "cell_dict_store.to_ndjson(\"cells.ndjson\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fw6wg5gapT54",
- "outputId": "61a32277-fb8d-4bdc-be28-b379cb0a23eb"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "cells.ndjson : 40.82% ( 8.82 GiB => 3.60 GiB, cells.ndjson.zstd) \n"
- ]
- }
- ],
- "source": [
- "# Zstandard compression of ndjson to demonstrate how well it compresses.\n",
- "# Gzip may also be used but is slower to compress.\n",
- "# Run time: ~1m\n",
- "! zstd -f -k cells.ndjson -o cells.ndjson.zstd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "rzGC65zhpT55",
- "outputId": "75ad772b-5641-4d64-ae16-7d50206e1b85"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "cells.db : 75.87% ( 4.87 GiB => 3.69 GiB, cells.db.zstd) \n"
- ]
- }
- ],
- "source": [
- "# Zstandard compression of sqlite to demonstrate how well it compresses.\n",
- "# Gzip may also be used but is slower to compress.\n",
- "# Run time: ~20s\n",
- "! zstd -f -k cells.db -o cells.db.zstd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xT0KZLxdpT55"
- },
- "outputs": [],
- "source": [
- "# Write as a pickle (list)\n",
- "# Run time: ~2m\n",
- "with Path(\"cells.pickle\").open(\"wb\") as fh:\n",
- " pickle.dump(cell_polygons, fh)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "-TAWGEu9pT55"
- },
- "outputs": [],
- "source": [
- "# Write as a pickle (dict)\n",
- "# Run time: ~15m\n",
- "with Path(\"cells-dict.pickle\").open(\"wb\") as fh:\n",
- " pickle.dump(cell_dict_store._rows, fh) # noqa: SLF001"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "I-W4o3GepT55"
- },
- "outputs": [],
- "source": [
- "# Write dictionary store to a pickle\n",
- "# Run time: ~20m\n",
- "with Path(\"cells.pickle\").open(\"wb\") as fh:\n",
- " pickle.dump(cell_dict_store, fh)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "dALe8k0BpT55"
- },
- "outputs": [],
- "source": [
- "# Write as numpy object array (similar to writing out with pickle),\n",
- "# Numpy cannot handle ragged arrays and therefore dtype must be object.\n",
- "# Run time: ~30m\n",
- "np.save(\"cells.npy\", np.asanyarray(cell_polygons_np, dtype=object))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "hOrGS0HgpT55"
- },
- "outputs": [],
- "source": [
- "# Create UUIDs, and get the class labels for each cell boundary\n",
- "# Run time: ~2m\n",
- "_uuids = [str(uuid.uuid4()) for _ in cell_polygons]\n",
- "_cls = [x.properties[\"class\"] for x in cell_polygons]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Fs2cz8lVpT55"
- },
- "outputs": [],
- "source": [
- "# Write as NumPy archive (.npz) with uuid and min_max_index\n",
- "# Run time: ~40m\n",
- "np.savez(\n",
- " \"cells.npz\",\n",
- " uuids=_uuids,\n",
- " polygons=cell_polygons_np,\n",
- " min_max_index=min_max_index,\n",
- " cls=_cls,\n",
- ")\n",
- "\n",
- "del _uuids, _cls"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4gOTqc03pT55"
- },
- "source": [
- "### 2.2.2) Time To Write Summary Statistics\n",
- "\n",
- "The following is a summary of the time required to write each format to\n",
- "disk and the total disk space occupied by the final output.\n",
- "\n",
- "Note that some of these formats, such as GeoJSON, compress well with\n",
- "schemes such as gzip and zstd, reducing the disk space by approximately\n",
- "half. Statistics for zstd compressed data are also reported below. It\n",
- "should be noted that the data must be decompressed to be usable.\n",
- "However, for gzip and zstd, this may be done in a streaming fashion from\n",
- "disk.\n",
- "\n",
- "| Format | Write Time | Size |\n",
- "| ----------------: | ---------: | -----: |\n",
- "| SQLiteStore (.db) | 33m 48.4s | 4.9 GB |\n",
- "| GeoJSON | 11m 32.9s | 8.9 GB |\n",
- "| ndjson | 9m 0.9s | 8.8 GB |\n",
- "| pickle | 1m 2.9s | 1.8 GB |\n",
- "| zstd (SQLite) | 18.2s | 3.7 GB |\n",
- "| zstd (ndjson) | 43.7s | 3.6 GB |\n",
- "| NumPy (.npy) | 50.3s | 1.8 GB |\n",
- "| NumPy (.npz) | 55.3s | 2.6 GB |\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "wS3sGpnWpT55"
- },
- "source": [
- "### 2.2.3) Box Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "MKvKfkyvpT55"
- },
- "outputs": [],
- "source": [
- "# Run time: ~5m\n",
- "\n",
- "# Setup\n",
- "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n",
- "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n",
- "\n",
- "\n",
- "# Time DictionaryStore\n",
- "dict_runs = timeit.repeat(\n",
- " \"store.query(box)\",\n",
- " globals={\"store\": cell_dict_store, \"box\": box},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"store.query(box)\",\n",
- " globals={\"store\": cell_sqlite_store, \"box\": box},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0Yo14C3kpT55",
- "outputId": "764bc28b-3072-4887-ea88-4c88ffcefb5f"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Plot results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"Box Query (5 Million Polygons)\",\n",
- " tick_label=[\n",
- " \"DictionaryStore\",\n",
- " \"SQLiteStore\",\n",
- " ],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ExF-fOGQpT56"
- },
- "source": [
- "### 2.2.4) Polygon Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PcxKapqNpT56"
- },
- "outputs": [],
- "source": [
- "# Run Time: 35s\n",
- "\n",
- "# Setup\n",
- "big_triangle = Polygon(\n",
- " shell=[ # noqa: S604\n",
- " (1024, 1024),\n",
- " (1024, 4096),\n",
- " (4096, 4096),\n",
- " (1024, 1024),\n",
- " ],\n",
- ")\n",
- "\n",
- "\n",
- "# Time DictionaryStore\n",
- "dict_runs = timeit.repeat(\n",
- " \"store.query(polygon)\",\n",
- " globals={\"store\": cell_dict_store, \"polygon\": big_triangle},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time SQLite store\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"store.query(polygon)\",\n",
- " globals={\"store\": cell_sqlite_store, \"polygon\": big_triangle},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "vqHA50DQpT56",
- "outputId": "7e837f4c-ada9-400f-b5f3-c59430b137f3"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Plot results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs],\n",
- " title=\"Polygon Query (5 Million Polygons)\",\n",
- " tick_label=[\n",
- " \"DictionaryStore\",\n",
- " \"SQLiteStore\",\n",
- " ],\n",
- ")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6m-E5AwapT56"
- },
- "source": [
- "### 2.2.5) Predicate Query\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "whEn34rOpT56"
- },
- "outputs": [],
- "source": [
- "# Run Time: ~10m\n",
- "\n",
- "# Setup\n",
- "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n",
- "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n",
- "predicate = \"props['class'] == 0\"\n",
- "\n",
- "# Time DictionaryStore\n",
- "dict_runs = timeit.repeat(\n",
- " \"store.query(box, predicate)\",\n",
- " globals={\"store\": cell_dict_store, \"box\": box, \"predicate\": predicate},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time SQLiteStore\n",
- "sqlite_runs = timeit.repeat(\n",
- " \"store.query(box, where=predicate)\",\n",
- " globals={\"store\": cell_sqlite_store, \"box\": box, \"predicate\": predicate},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "np_stmt = f\"\"\"\n",
- "polygons = [\n",
- " polygon\n",
- " for polygon in tqdm(cell_polygons_np)\n",
- " if np.all([\n",
- " np.max(polygon, 0) >= ({xmin}, {ymin}), np.min(polygon, 0) <= ({xmax}, {ymax})\n",
- " ])\n",
- "]\n",
- "\"\"\"\n",
- "\n",
- "# Time numpy\n",
- "numpy_runs = timeit.repeat(\n",
- " np_stmt,\n",
- " globals={\"cell_polygons_np\": cell_polygons_np, \"np\": np, \"tqdm\": lambda x: x},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time shapely\n",
- "shapely_runs = timeit.repeat(\n",
- " \"polygons = [box.intersects(ann.geometry) for ann in cell_polygons]\",\n",
- " globals={\"box\": box, \"cell_polygons\": cell_polygons},\n",
- " number=1,\n",
- " repeat=3,\n",
- ")\n",
- "\n",
- "# Time box indexed numpy\n",
- "numpy_index_runs = timeit.repeat(\n",
- " \"in_box = np.all(min_max_index[:, :2] <= (xmax, ymax), 1) \"\n",
- " \"& np.all(min_max_index[:, 2:] >= (xmin, ymin), 1)\\n\"\n",
- " \"polygons = [p for p, w in zip(cell_polygons, in_box) if w]\",\n",
- " globals={\n",
- " \"min_max_index\": min_max_index,\n",
- " \"xmin\": xmin,\n",
- " \"ymin\": ymin,\n",
- " \"xmax\": xmax,\n",
- " \"ymax\": ymax,\n",
- " \"np\": np,\n",
- " \"cell_polygons\": cell_polygons,\n",
- " },\n",
- " number=1,\n",
- " repeat=3,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "oRxJTg7BpT56",
- "outputId": "d235e51a-5109-486e-b779-fe39e5f6ee33"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Run Time: ~5s\n",
- "\n",
- "# Plot results\n",
- "plot_results(\n",
- " experiments=[dict_runs, sqlite_runs, numpy_runs, shapely_runs, numpy_index_runs],\n",
- " title=\"Box Query\",\n",
- " tick_label=[\n",
- " \"DictionaryStore\",\n",
- " \"SQLiteStore\",\n",
- " \"NumPy\\n(Simple Loop)\",\n",
- " \"Shapely\\n(Simple Loop)\",\n",
- " \"NumPy\\n(With Bounds Index)\",\n",
- " ],\n",
- ")\n",
- "plt.xticks(rotation=90)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "LJiGGkespT56"
- },
- "source": [
- "## 2.3) Size vs Approximate Lower Bound\n",
- "\n",
- "Here we calculate an estimated lower bound on file size by finding the\n",
- "the Shannon entropy of each file. This tells us the theoretical minimum\n",
- "number of bits per byte. The lowest lower bound is then used as an\n",
- "estimate of the minimum file size possible to store the annotation data.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0IO10faZpT56",
- "outputId": "033c2530-072a-4aa5-cf34-c2298e90d86f"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " "
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Approximate Lower Bound Size: 3.60 GB\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r"
- ]
- }
- ],
- "source": [
- "# Run Time: ~5m\n",
- "\n",
- "\n",
- "# Files to consider containing keys, geometry, and properties.\n",
- "# Files which are missing keys e.g. cells.pickle are excluded\n",
- "# for a fair comparison.\n",
- "file_names = [\n",
- " \"cells-dicionary-store.pickle\",\n",
- " \"cells-dict.pickle\",\n",
- " \"cells.db\",\n",
- " \"cells.db.zstd\",\n",
- " \"cells.geojson\",\n",
- " \"cells.ndjson\",\n",
- " \"cells.ndjson.zstd\",\n",
- "]\n",
- "\n",
- "\n",
- "def human_readible_bytes(byte_count: int) -> tuple[int, str]:\n",
- " \"\"\"Convert bytes to human readble size and suffix.\"\"\"\n",
- " byte_count_ref = 1024\n",
- " for suffix in [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]:\n",
- " if byte_count < byte_count_ref:\n",
- " return byte_count, suffix\n",
- " byte_count /= byte_count_ref\n",
- " return byte_count, \"PB\"\n",
- "\n",
- "\n",
- "def shannon_entropy(\n",
- " fp: Path,\n",
- " sample_size: int = 1e9, # 1GiB\n",
- " stride: int = 7,\n",
- " skip: int = 1e5, # 100KiB\n",
- ") -> float:\n",
- " \"\"\"Calculate the Shannon entropy of a file from a sample.\n",
- "\n",
- " The first `skip` bytes are skipped to avoid sampling low entropy\n",
- " (highly ordered) parts which commonly occur at the beginning e.g.\n",
- " headers.\n",
- "\n",
- " Args:\n",
- " fp: File path to calculate entropy of.\n",
- " sample_size: Number of bytes to sample from the file.\n",
- " stride: Number of bytes to skip between samples.\n",
- " skip: Number of bytes to skip before sampling.\n",
- " \"\"\"\n",
- " npmmap = np.memmap(Path(fp), dtype=np.uint8, mode=\"r\")\n",
- " values, counts = np.unique(\n",
- " npmmap[int(skip) : int(skip + (sample_size * stride)) : int(stride)],\n",
- " return_counts=True,\n",
- " )\n",
- " total = np.sum(counts)\n",
- " frequencies = {v: 0 for v in range(256)}\n",
- " for v, x in zip(values, counts):\n",
- " frequencies[v] = x / total\n",
- " frequency_array = np.array(list(frequencies.values()))\n",
- " epsilon = 1e-16\n",
- " return -np.sum(frequency_array * np.log2(frequency_array + epsilon))\n",
- "\n",
- "\n",
- "# Find the min across all of the representations for the lowest lower\n",
- "# bound.\n",
- "bytes_lower_bounds = {\n",
- " path: (\n",
- " shannon_entropy(Path(path)) / 8 * len(np.memmap(path, dtype=np.uint8, mode=\"r\"))\n",
- " )\n",
- " for path in tqdm(\n",
- " [Path.cwd() / name for name in file_names],\n",
- " position=0,\n",
- " leave=False,\n",
- " )\n",
- "}\n",
- "\n",
- "lowest_bytes_lower_bound = min(bytes_lower_bounds.values())\n",
- "\n",
- "size, suffix = human_readible_bytes(lowest_bytes_lower_bound)\n",
- "logger.info(\"Approximate Lower Bound Size: %2f %s\", size, suffix)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "chwB3zeupT56"
- },
- "source": [
- "### Plot Results\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "cu5jkrVppT56",
- "outputId": "bb36aea5-d5d7-4560-a853-d2a8afba0eac"
- },
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Get file sizes\n",
- "file_sizes = {\n",
- " path: path.stat().st_size for path in [Path.cwd() / name for name in file_names]\n",
- "}\n",
- "\n",
- "# Sort by size\n",
- "file_sizes = dict(sorted(file_sizes.items(), key=lambda x: x[1]))\n",
- "\n",
- "# Plot\n",
- "plt.bar(\n",
- " x=range(len(file_sizes)),\n",
- " height=file_sizes.values(),\n",
- " tick_label=[p.name for p in file_sizes],\n",
- " color=[f\"C{i}\" for i in range(len(file_sizes))],\n",
- ")\n",
- "plt.xlabel(\"File Name\")\n",
- "plt.ylabel(\"Bytes\")\n",
- "plt.xticks(rotation=90)\n",
- "plt.hlines(\n",
- " y=lowest_bytes_lower_bound,\n",
- " xmin=-0.5,\n",
- " xmax=len(file_sizes) - 0.5,\n",
- " linestyles=\"dashed\",\n",
- " color=\"black\",\n",
- " label=\"Approximate Bytes Lower Bound\",\n",
- ")\n",
- "plt.legend()\n",
- "plt.tight_layout()\n",
- "plt.title(\"Polygon Annotation File Sizes\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "gmuEWlImpT57"
- },
- "source": [
- "The SQLite representation (4.9GB) appears to be quite compact compared\n",
- "with GeoJSON and ndjson. Although not as compact as a dictionary pickle\n",
- "or Zstandard compressed ndjson, it offers a good compromise between\n",
- "compactness and read performance.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Yhe5rMXPpT57"
- },
- "source": [
- "# 3: Extra Bits\n",
- "\n",
- "## 3.1) Space Saving\n",
- "\n",
- "A lot of space can be saved by rounding the coordinates to the nearest\n",
- "integer when storing them. Below we make a copy of the dataset with all\n",
- "coordinates rounded.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "H2Jsc0repT57",
- "outputId": "d2ca9eff-b67d-4bfc-ad5a-57c87bc6a7da"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 10008338/10008338 [51:00<00:00, 3270.16it/s] \n"
- ]
- }
- ],
- "source": [
- "# Run Time: ~50m\n",
- "! rm integer-cells.db\n",
- "int_cell_sqlite_store = SQLiteStore(\"integer-cells.db\")\n",
- "\n",
- "# We use batches of 1000 to speed up appending\n",
- "batch = {}\n",
- "batch_size = 1000\n",
- "for key, annotation in tqdm(cell_sqlite_store.items(), total=len(cell_sqlite_store)):\n",
- " geometry = Polygon(np.array(annotation.geometry.exterior.coords).round())\n",
- " rounded_annotation = Annotation(geometry, annotation.properties)\n",
- " batch[key] = rounded_annotation\n",
- " if len(batch) >= batch_size:\n",
- " int_cell_sqlite_store.append_many(batch.values(), batch.keys())\n",
- " batch = {}\n",
- "_ = int_cell_sqlite_store.append_many(batch.values(), batch.keys())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "U6aooIROpT57"
- },
- "source": [
- "Here the database size is reduced to 2.9GB, down from 4.9GB.\n",
- "Additionally, when using integer coordinates, the database compresses\n",
- "much better. Zstandard can compress to approximately 60% of the\n",
- "original size (and 35% of the floating point coordinate\n",
- "database size). This may be done for archival purposes.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Q3TJ8XX4pT57",
- "outputId": "b99d1af7-4c68-4394-cf9a-8bb2b64471a0"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "integer-cells.db : 60.58% ( 2.86 GiB => 1.73 GiB, integer-cells.db.zstd) \n"
- ]
}
- ],
- "source": [
- "# Run time: ~15s\n",
- "! zstd -f -k integer-cells.db -o integer-cells.db.zstd"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "alFRiIAbpT57"
- },
- "source": [
- "With higher (slower) compression settings the space can be further\n",
- "reduced for long term storage.\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "nVFqovfPpT57",
- "outputId": "0948bbe6-4252-4c93-eab7-8e3be4e98235"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "integer-cells.db : 51.22% ( 2.86 GiB => 1.47 GiB, integer-cells.db.19.zstd) \n"
- ]
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "interpreter": {
+ "hash": "a3ed8fb525a8bde66cc7655a5df08d8d0f8699a69b9eb5ccab28dc0a7837eec6"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
}
- ],
- "source": [
- "# Run time: ~20m\n",
- "! zstd -f -k -19 --long integer-cells.db -o integer-cells.db.19.zstd"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "C3voJ43OpT57"
- },
- "source": [
- "## 3.2) Feature Comparison Summary\n",
- "\n",
- "Here we briefly summarise some of the positives and negatives of each format and construct a comparison matrix.\n",
- "\n",
- "**GeoJSON**\n",
- "\n",
- "*Positives*\n",
- "\n",
- "- Simple, based JSON which is well known.\n",
- "- Well defined with a public specification.\n",
- "- Popular format for geometry, many tools which work with it.\n",
- "- Fast to write.\n",
- "\n",
- "*Negatives*\n",
- "\n",
- "- Requires loading the whole file into memory for parsing. Some\n",
- " specialised parsers can, in some situations, reduce or avoid this but\n",
- " it is not possible in general.\n",
- "- Not a very compact representation.\n",
- "\n",
- "**ndjson (One GeoJSON Feature Per Line)**\n",
- "\n",
- "*Positives*\n",
- "\n",
- "- Simple.\n",
- "- Better to parse than JSON/GeoJSON. Each line can be parsed\n",
- " independently.\n",
- "- Many tools to parse JSON lines.\n",
- "- Fast to write.\n",
- "\n",
- "*Negatives*\n",
- "\n",
- "- Not a very compact representation.\n",
- "- Requires loading the whole dataset from disk before querying OR\n",
- " scanning through and reparsing each line for each query.\n",
- "- Amending annotations can be tricky. The easiest way is to blank out a\n",
- " line and append a modified copy each time. This could end up\n",
- " fragmenting the file and wasting a lot of space. More complex methods\n",
- " could be developed to reduce fragmenting the file.\n",
- "\n",
- "**pickle**\n",
- "\n",
- "*Positives*\n",
- "\n",
- "- Fast to write.\n",
- "\n",
- "*Negatives*\n",
- "\n",
- "- Vulnerable to arbitrary code execution when loading from disk.\n",
- "- Requires loading the whole dataset into memory for querying.\n",
- "\n",
- "**SQLite (SQLiteStore Flavour)**\n",
- "\n",
- "*Positives*\n",
- "\n",
- "- Very fast to query (uses an R-TREE index to accelerate\n",
- " spatial queries).\n",
- "- Does not require loading data into memory before querying.\n",
- "- Possible to index property lookups.\n",
- "\n",
- "*Negatives*\n",
- "\n",
- "- Not the most compact representation on disk.\n",
- "\n",
- "### Feature Matrix\n",
- "\n",
- "| Format | Size On-Disk | Size In-Memory | Partial Reads | Serialization | Query Performance |\n",
- "| ----------: | :----------- | :------------- | :------------ | :------------ | :---------------- |\n",
- "| SQLiteStore | Medium | Small | Yes | Slow | Fast |\n",
- "| GeoJSON | Large | Large | No | Fast | Slow |\n",
- "| ndjson | Large | Large | Yes | Fast | Medium |\n",
- "| pickle | Small | Medium | No | Medium | Slow |\n",
- "\n"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "provenance": []
- },
- "interpreter": {
- "hash": "a3ed8fb525a8bde66cc7655a5df08d8d0f8699a69b9eb5ccab28dc0a7837eec6"
- },
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat": 4,
+ "nbformat_minor": 0
}
diff --git a/benchmarks/annotation_store_alloc.py b/benchmarks/annotation_store_alloc.py
index 41b85043f..82c642ada 100644
--- a/benchmarks/annotation_store_alloc.py
+++ b/benchmarks/annotation_store_alloc.py
@@ -102,7 +102,7 @@
import warnings
from pathlib import Path
from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Any, Generator
+from typing import TYPE_CHECKING, Any
sys.path.append("../")
@@ -139,18 +139,19 @@ def __exit__(self: memray, *args: object) -> None:
# Intentionally blank.
-import numpy as np # noqa: E402
-import psutil # noqa: E402
-from shapely.geometry import Polygon # noqa: E402
-from tqdm import tqdm # noqa: E402
+import numpy as np
+import psutil
+from shapely.geometry import Polygon
+from tqdm import tqdm
-from tiatoolbox.annotation.storage import ( # noqa: E402
+from tiatoolbox.annotation.storage import (
Annotation,
DictionaryStore,
SQLiteStore,
)
if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Generator
from numbers import Number
diff --git a/docker/3.8/Debian/Dockerfile b/docker/3.11/Debian/Dockerfile
similarity index 91%
rename from docker/3.8/Debian/Dockerfile
rename to docker/3.11/Debian/Dockerfile
index 9c4e5ecc8..3b399ddac 100644
--- a/docker/3.8/Debian/Dockerfile
+++ b/docker/3.11/Debian/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8-slim-bullseye
+FROM python:3.11-slim-bullseye
#get linux packages
RUN apt-get -y update && apt-get -y install --no-install-recommends \
diff --git a/docker/3.11/Ubuntu/Dockerfile b/docker/3.11/Ubuntu/Dockerfile
new file mode 100644
index 000000000..72d7adee8
--- /dev/null
+++ b/docker/3.11/Ubuntu/Dockerfile
@@ -0,0 +1,30 @@
+FROM ubuntu:22.04 AS builder-image
+
+# To avoid tzdata blocking the build with frontend questions
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install python3.11 (venv, headers and build tools) from the deadsnakes PPA
+RUN apt-get update && \
+    apt-get install software-properties-common -y &&\
+    add-apt-repository ppa:deadsnakes/ppa -y && apt-get update &&\
+    apt-get install -y --no-install-recommends python3.11-venv &&\
+    apt-get install libpython3.11-dev -y &&\
+    apt-get install python3.11-dev -y &&\
+    apt-get install build-essential -y &&\
+    apt-get clean
+
+# Create the virtual environment and put it first on PATH
+RUN python3.11 -m venv /venv
+ENV PATH=/venv/bin:$PATH
+
+# install TIAToolbox and its requirements
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    libopenjp2-7-dev libopenjp2-tools \
+    openslide-tools \
+    libgl1 \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir tiatoolbox
+
+# Record the active virtual environment (must match the /venv created above)
+ENV VIRTUAL_ENV=/venv
+ENV PATH="/venv/bin:$PATH"
diff --git a/docker/3.12/Debian/Dockerfile b/docker/3.12/Debian/Dockerfile
new file mode 100644
index 000000000..412f8d015
--- /dev/null
+++ b/docker/3.12/Debian/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.12-slim-bullseye
+
+# Install system libraries (OpenJPEG, OpenSlide, libGL, build tools) and TIAToolbox from PyPI
+RUN apt-get -y update && apt-get -y install --no-install-recommends \
+ libopenjp2-7-dev libopenjp2-tools \
+ openslide-tools \
+ libgl1 \
+ build-essential \
+ && pip3 --no-cache-dir install tiatoolbox \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# set the entry point to bash
+ENTRYPOINT ["/bin/bash"]
diff --git a/docker/3.12/Ubuntu/Dockerfile b/docker/3.12/Ubuntu/Dockerfile
new file mode 100644
index 000000000..d99483d74
--- /dev/null
+++ b/docker/3.12/Ubuntu/Dockerfile
@@ -0,0 +1,30 @@
+FROM ubuntu:22.04 AS builder-image
+
+# To avoid tzdata blocking the build with frontend questions
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install python3.12 (venv, headers and build tools) from the deadsnakes PPA
+RUN apt-get update && \
+    apt-get install software-properties-common -y &&\
+    add-apt-repository ppa:deadsnakes/ppa -y && apt-get update &&\
+    apt-get install -y --no-install-recommends python3.12-venv &&\
+    apt-get install libpython3.12-dev -y &&\
+    apt-get install python3.12-dev -y &&\
+    apt-get install build-essential -y &&\
+    apt-get clean
+
+# Create the virtual environment and put it first on PATH
+RUN python3.12 -m venv /venv
+ENV PATH=/venv/bin:$PATH
+
+# install TIAToolbox and its requirements
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    libopenjp2-7-dev libopenjp2-tools \
+    openslide-tools \
+    libgl1 \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir tiatoolbox
+
+# Record the active virtual environment (must match the /venv created above)
+ENV VIRTUAL_ENV=/venv
+ENV PATH="/venv/bin:$PATH"
diff --git a/docs/installation.rst b/docs/installation.rst
index e8fe41478..80895e939 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -60,7 +60,7 @@ MacPorts
Installing Stable Release
=========================
-Please note that TIAToolbox is tested for python version 3.8, 3.9 and 3.10.
+Please note that TIAToolbox is tested for Python versions 3.9, 3.10, 3.11 and 3.12.
Recommended
-----------
diff --git a/examples/full-pipelines/slide-graph.ipynb b/examples/full-pipelines/slide-graph.ipynb
index de6f2b60f..8b10087e2 100644
--- a/examples/full-pipelines/slide-graph.ipynb
+++ b/examples/full-pipelines/slide-graph.ipynb
@@ -133,7 +133,7 @@
"import warnings\n",
"from collections import OrderedDict\n",
"from pathlib import Path\n",
- "from typing import Callable, Iterator\n",
+ "from typing import TYPE_CHECKING, Callable\n",
"\n",
"# Third party imports\n",
"import joblib\n",
@@ -191,6 +191,9 @@
" WSIReader,\n",
")\n",
"\n",
+ "if TYPE_CHECKING: # pragma: no cover\n",
+ " from collections.abc import Iterator\n",
+ "\n",
"warnings.filterwarnings(\"ignore\")\n",
"mpl.rcParams[\"figure.dpi\"] = 300 # for high resolution figure in notebook"
]
@@ -397,7 +400,7 @@
"# https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/\n",
"wsi_patient_codes = np.array([\"-\".join(v.split(\"-\")[:3]) for v in wsi_names])\n",
"wsi_labels = np.array(\n",
- " [clinical_info[v] if v in clinical_info else np.nan for v in wsi_patient_codes],\n",
+ " [clinical_info.get(v, np.nan) for v in wsi_patient_codes],\n",
")\n",
"\n",
"# * Filter the WSIs and paths that do not have labels\n",
diff --git a/pyproject.toml b/pyproject.toml
index dbf71f456..0662f9e65 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,7 +72,7 @@ search = 'TOOLBOX_VER: {current_version}'
replace = 'TOOLBOX_VER: {new_version}'
[tool.ruff]
-select = [
+lint.select = [
"A", # flake8-builtins
"B", # flake8-bugbear
"D", # pydocstyle, need to enable for docstrings check.
@@ -126,13 +126,13 @@ select = [
"SLOT", # flake8-slots
"ASYNC", # flake8-async
]
-ignore = []
+lint.ignore = []
# Allow Ruff to discover `*.ipynb` files.
include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
# Allow autofix for all enabled rules (when `--fix`) is provided.
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
-unfixable = []
+lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+lint.unfixable = []
# Exclude a variety of commonly ignored directories.
exclude = [
@@ -149,29 +149,29 @@ exclude = [
]
# Ignore `F401` (import violations) in all `__init__.py` files.
-per-file-ignores = {"__init__.py" = ["F401"], "tests/*" = ["T201", "PGH001", "SLF001", "S101", "PLR2004"], "benchmarks/*" = ["T201", "INP001"], "pre-commit/*" = ["T201", "INP001"], "tiatoolbox/cli/*" = ["PLR0913"]}
+lint.per-file-ignores = {"__init__.py" = ["F401"], "tests/*" = ["T201", "PGH001", "SLF001", "S101", "PLR2004"], "benchmarks/*" = ["T201", "INP001"], "pre-commit/*" = ["T201", "INP001"], "tiatoolbox/cli/*" = ["PLR0913"]}
# Same as Black.
line-length = 88
# Allow unused variables when underscore-prefixed.
-dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
-# Minimum Python version 3.8.
-target-version = "py38"
+# Minimum Python version 3.9.
+target-version = "py39"
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 14
# need to enable for docstrings check.
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
# Use Google-style docstrings.
convention = "google"
-[tool.ruff.pylint]
+[tool.ruff.lint.pylint]
max-args = 10
[tool.mypy]
ignore_missing_imports = true
-python_version = 3.8
+python_version = 3.9
diff --git a/requirements/requirements.conda.yml b/requirements/requirements.conda.yml
index 09be84a12..0d999ac35 100644
--- a/requirements/requirements.conda.yml
+++ b/requirements/requirements.conda.yml
@@ -9,6 +9,6 @@ dependencies:
- openslide
- pip>=20.0.2
- pixman>=0.39.0
- - python>=3.8, <=3.11
+ - python>=3.9, <3.13
- pip:
- -r requirements.txt
diff --git a/requirements/requirements.dev.conda.yml b/requirements/requirements.dev.conda.yml
index 494d5a0d3..4a743d837 100644
--- a/requirements/requirements.dev.conda.yml
+++ b/requirements/requirements.dev.conda.yml
@@ -9,6 +9,6 @@ dependencies:
- openslide
- pip>=20.0.2
- pixman>=0.39.0
- - python>=3.8, <=3.11
+ - python>=3.9, <3.13
- pip:
- -r requirements_dev.txt
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 11e61999d..f9a13c809 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -32,5 +32,5 @@ torch>=2.1.0
torchvision>=0.15.0
tqdm>=4.64.1
umap-learn>=0.5.3
-wsidicom>=0.7.0, <0.18.0 # newly released version is causing tests to fail for now
+wsidicom>=0.18.0
zarr>=2.13.3
diff --git a/requirements/requirements.win64.conda.yml b/requirements/requirements.win64.conda.yml
index f6386597f..1aeff0a7a 100644
--- a/requirements/requirements.win64.conda.yml
+++ b/requirements/requirements.win64.conda.yml
@@ -9,6 +9,6 @@ dependencies:
- openjpeg>=2.4.0
- pip>=20.0.2
- pixman>=0.39.0
- - python>=3.8, <=3.11
+ - python>=3.9, <3.13
- pip:
- -r requirements.txt
diff --git a/requirements/requirements.win64.dev.conda.yml b/requirements/requirements.win64.dev.conda.yml
index 078d75a38..64b4b07d1 100644
--- a/requirements/requirements.win64.dev.conda.yml
+++ b/requirements/requirements.win64.dev.conda.yml
@@ -9,6 +9,6 @@ dependencies:
- openjpeg>=2.4.0
- pip>=20.0.2
- pixman>=0.39.0
- - python>=3.8, <=3.11
+ - python>=3.9, <3.13
- pip:
- -r requirements_dev.txt
diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt
index 6911165c5..697d05d2a 100644
--- a/requirements/requirements_dev.txt
+++ b/requirements/requirements_dev.txt
@@ -12,7 +12,7 @@ pytest>=7.2.0
pytest-cov>=4.0.0
pytest-runner>=6.0
pytest-xdist[psutil]
-ruff==0.1.13 # This will be updated by pre-commit bot to latest version
+ruff==0.2.2 # This will be updated by pre-commit bot to latest version
toml>=0.10.2
twine>=4.0.1
wheel>=0.37.1
diff --git a/setup.py b/setup.py
index 92fe58e0b..efb7f20ec 100644
--- a/setup.py
+++ b/setup.py
@@ -34,16 +34,16 @@
setup(
author="TIA Centre",
author_email="tia@dcs.warwick.ac.uk",
- python_requires=">=3.8, <3.12",
+ python_requires=">=3.9, <3.13",
classifiers=[
"Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Developers",
"Natural Language :: English",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
],
description="Computational pathology toolbox developed by TIA Centre.",
dependency_links=dependency_links,
diff --git a/tests/test_annotation_stores.py b/tests/test_annotation_stores.py
index 562e9a8a1..cac3937ba 100644
--- a/tests/test_annotation_stores.py
+++ b/tests/test_annotation_stores.py
@@ -6,10 +6,11 @@
import pickle
import sqlite3
import sys
+from collections.abc import Generator
from itertools import repeat, zip_longest
from pathlib import Path
from timeit import timeit
-from typing import TYPE_CHECKING, Callable, ClassVar, Generator
+from typing import TYPE_CHECKING, Callable, ClassVar
import numpy as np
import pandas as pd
@@ -1801,13 +1802,13 @@ def test_load_cases_error(
store._load_cases(["foo"], lambda: None, lambda: None)
@staticmethod
- def test_py38_init(
+ def test_py39_init(
fill_store: Callable, # noqa: ARG004
store_cls: type[AnnotationStore],
monkeypatch: object,
) -> None:
- """Test that __init__ is compatible with Python 3.8."""
- py38_version = (3, 8, 0)
+ """Test that __init__ is compatible with Python 3.9."""
+ py39_version = (3, 9, 0)
class Connection(sqlite3.Connection):
"""Mock SQLite connection."""
@@ -1821,7 +1822,7 @@ def create_function(
"""Mock create_function without `deterministic` kwarg."""
return self.create_function(self, name, num_params)
- monkeypatch.setattr(sys, "version_info", py38_version)
+ monkeypatch.setattr(sys, "version_info", py39_version)
monkeypatch.setattr(sqlite3, "Connection", Connection)
_ = store_cls()
diff --git a/tests/test_app_bokeh.py b/tests/test_app_bokeh.py
index 3d072a919..b29d78188 100644
--- a/tests/test_app_bokeh.py
+++ b/tests/test_app_bokeh.py
@@ -2,24 +2,18 @@
from __future__ import annotations
+import importlib.resources as importlib_resources
import io
import json
import multiprocessing
import re
-import sys
import time
from pathlib import Path
-from typing import TYPE_CHECKING, Generator
+from typing import TYPE_CHECKING
import bokeh.models as bkmodels
import matplotlib.pyplot as plt
import numpy as np
-
-if sys.version_info >= (3, 9): # pragma: no cover
- import importlib.resources as importlib_resources
-else: # pragma: no cover
- # To support Python 3.8
- import importlib_resources # type: ignore[import-not-found]
import pytest
import requests
from bokeh.application import Application
@@ -35,7 +29,9 @@
from tiatoolbox.visualization.tileserver import TileServer
from tiatoolbox.visualization.ui_utils import get_level_by_extent
-if TYPE_CHECKING:
+if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Generator
+
from bokeh.document import Document
# constants
diff --git a/tests/test_docs.py b/tests/test_docs.py
index 020188797..ea446737a 100644
--- a/tests/test_docs.py
+++ b/tests/test_docs.py
@@ -9,10 +9,13 @@
import sys
from doctest import DocTest
from pathlib import Path
-from typing import Generator
+from typing import TYPE_CHECKING
import pytest
+if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Generator
+
@pytest.fixture()
def source_files(root_path: Path) -> Generator:
diff --git a/tests/test_dsl.py b/tests/test_dsl.py
index e09753556..1657db1b6 100644
--- a/tests/test_dsl.py
+++ b/tests/test_dsl.py
@@ -5,7 +5,7 @@
import json
import sqlite3
from numbers import Number
-from typing import Callable, ClassVar, Mapping
+from typing import TYPE_CHECKING, Callable, ClassVar
import pytest
@@ -19,6 +19,9 @@
py_regexp,
)
+if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Mapping
+
BINARY_OP_STRINGS = [
"+",
"-",
@@ -101,7 +104,7 @@ class TestSQLite:
@staticmethod
def test_prop_or_prop() -> None:
"""Test OR operator between two prop accesses."""
- query = eval( # skipcq: PYL-W0123 # noqa: S307
+ query = eval( # skipcq: PYL-W0123
"(props['int'] == 2) | (props['int'] == 3)",
SQL_GLOBALS,
{},
@@ -143,7 +146,7 @@ def test_number_binary_operations(
"""Check that binary operations between ints does not error."""
for op in BINARY_OP_STRINGS:
query = f"2 {op} 2"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -159,7 +162,7 @@ def test_property_binary_operations(
"""Check that binary operations between properties does not error."""
for op in BINARY_OP_STRINGS:
query = f"props['int'] {op} props['int']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -175,7 +178,7 @@ def test_r_binary_operations(
"""Test right hand binary operations between numbers and properties."""
for op in BINARY_OP_STRINGS:
query = f"2 {op} props['int']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -191,7 +194,7 @@ def test_number_prefix_operations(
"""Test prefix operations on numbers."""
for op in PREFIX_OP_STRINGS:
query = f"{op}1"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -207,7 +210,7 @@ def test_property_prefix_operations(
"""Test prefix operations on properties."""
for op in PREFIX_OP_STRINGS:
query = f"{op}props['int']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -222,7 +225,7 @@ def test_regex_nested_props(
) -> None:
"""Test regex on nested properties."""
query = "props['nesting']['fib'][4]"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -237,7 +240,7 @@ def test_regex_str_props(
) -> None:
"""Test regex on string properties."""
query = "regexp('Hello', props['string'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -252,7 +255,7 @@ def test_regex_str_str(
) -> None:
"""Test regex on string and string."""
query = "regexp('Hello', 'Hello world!')"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -267,7 +270,7 @@ def test_regex_props_str(
) -> None:
"""Test regex on property and string."""
query = "regexp(props['string'], 'Hello world!')"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -282,7 +285,7 @@ def test_regex_ignore_case(
) -> None:
"""Test regex with ignorecase flag."""
query = "regexp('hello', props['string'], re.IGNORECASE)"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -297,7 +300,7 @@ def test_regex_no_match(
) -> None:
"""Test regex with no match."""
query = "regexp('Yello', props['string'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -312,7 +315,7 @@ def test_has_key(
) -> None:
"""Test has_key function."""
query = "has_key(props, 'foo')"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -327,7 +330,7 @@ def test_is_none(
) -> None:
"""Test is_none function."""
query = "is_none(props['null'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -342,7 +345,7 @@ def test_is_not_none(
) -> None:
"""Test is_not_none function."""
query = "is_not_none(props['int'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -357,7 +360,7 @@ def test_nested_has_key(
) -> None:
"""Test nested has_key function."""
query = "has_key(props['dict'], 'a')"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -372,7 +375,7 @@ def test_list_sum(
) -> None:
"""Test sum function on a list."""
query = "sum(props['list'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -387,7 +390,7 @@ def test_abs(
) -> None:
"""Test abs function."""
query = "abs(props['neg'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -402,7 +405,7 @@ def test_not(
) -> None:
"""Test not operator."""
query = "not props['bool']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -417,7 +420,7 @@ def test_props_int_keys(
) -> None:
"""Test props with int keys."""
query = "props['list'][1]"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -432,7 +435,7 @@ def test_props_get(
) -> None:
"""Test props.get function."""
query = "is_none(props.get('foo'))"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -447,7 +450,7 @@ def test_props_get_default(
) -> None:
"""Test props.get function with default."""
query = "props.get('foo', 42)"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -462,7 +465,7 @@ def test_in_list(
) -> None:
"""Test in operator for list."""
query = "1 in props.get('list')"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -478,7 +481,7 @@ def test_has_key_exception(
"""Test has_key function with exception."""
query = "has_key(1, 'a')"
with pytest.raises(TypeError, match="(not iterable)|(Unsupported type)"):
- _ = eval( # skipcq: PYL-W0123 # noqa: S307
+ _ = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -492,7 +495,7 @@ def test_logical_and(
) -> None:
"""Test logical and operator."""
query = "props['bool'] & is_none(props['null'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -507,7 +510,7 @@ def test_logical_or(
) -> None:
"""Test logical or operator."""
query = "props['bool'] | (props['int'] < 2)"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -522,7 +525,7 @@ def test_nested_logic(
) -> None:
"""Test nested logical operators."""
query = "(props['bool'] | (props['int'] < 2)) & abs(props['neg'])"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -537,7 +540,7 @@ def test_contains_list(
) -> None:
"""Test contains operator for list."""
query = "1 in props['list']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -552,7 +555,7 @@ def test_contains_dict(
) -> None:
"""Test contains operator for dict."""
query = "'a' in props['dict']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -567,7 +570,7 @@ def test_contains_str(
) -> None:
"""Test contains operator for str."""
query = "'Hello' in props['string']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
@@ -582,7 +585,7 @@ def test_key_with_period(
) -> None:
"""Test key with period."""
query = "props['dot.key']"
- result = eval( # skipcq: PYL-W0123 # noqa: S307
+ result = eval( # skipcq: PYL-W0123
query,
eval_globals,
eval_locals,
diff --git a/tests/test_graph.py b/tests/test_graph.py
index a423064a6..99c7bdbe8 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -87,7 +87,7 @@ def test_affinity_to_edge_index_fuzz_output_shape() -> None:
for _ in range(1000):
# Generate some random square inputs
input_shape = [rng.integers(2, 10)] * 2
- affinity_matrix = np.random.sample(input_shape)
+ affinity_matrix = rng.random(input_shape)
threshold = rng.random()
# Convert to torch randomly
if rng.random() > 0.5:
@@ -108,7 +108,7 @@ def test_affinity_to_edge_index_invalid_fuzz_input_shape() -> None:
for _ in range(100):
input_shape = [rng.integers(2, 10)] * 2
input_shape[1] -= 1
- affinity_matrix = np.random.sample(input_shape)
+ affinity_matrix = rng.random(input_shape)
threshold = rng.random()
# Convert to torch randomly
if rng.random() > 0.5:
diff --git a/tests/test_wsireader.py b/tests/test_wsireader.py
index 390751b5d..8bdea210b 100644
--- a/tests/test_wsireader.py
+++ b/tests/test_wsireader.py
@@ -11,7 +11,7 @@
from pathlib import Path
# When no longer supporting Python <3.9 this should be collections.abc.Iterable
-from typing import TYPE_CHECKING, Callable, Iterable
+from typing import TYPE_CHECKING, Callable
import cv2
import glymur
@@ -46,7 +46,9 @@
is_zarr,
)
-if TYPE_CHECKING:
+if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Iterable
+
import requests
from openslide import OpenSlide
@@ -204,7 +206,7 @@ def read_bounds_level_consistency(wsi: WSIReader, bounds: IntBounds) -> None:
# from interpolation when calculating the downsampled levels. This
# adds some tolerance for the comparison.
blurred = [cv2.GaussianBlur(img, (5, 5), cv2.BORDER_REFLECT) for img in resized]
- as_float = [img.astype(np.float_) for img in blurred]
+ as_float = [img.astype(np.float64) for img in blurred]
# Pair-wise check resolutions for mean squared error
for i, a in enumerate(as_float):
@@ -2646,7 +2648,7 @@ def test_read_rect_level_consistency(wsi: WSIReader) -> None:
# from interpolation when calculating the downsampled levels. This
# adds some tolerance for the comparison.
blurred = [cv2.GaussianBlur(img, (5, 5), cv2.BORDER_REFLECT) for img in resized]
- as_float = [img.astype(np.float_) for img in blurred]
+ as_float = [img.astype(np.float64) for img in blurred]
# Pair-wise check resolutions for mean squared error
for i, a in enumerate(as_float):
diff --git a/tiatoolbox/__init__.py b/tiatoolbox/__init__.py
index aa8866c5c..5a42b9bed 100644
--- a/tiatoolbox/__init__.py
+++ b/tiatoolbox/__init__.py
@@ -2,16 +2,11 @@
from __future__ import annotations
+import importlib.resources as importlib_resources
import importlib.util
import sys
from pathlib import Path
-from typing import TYPE_CHECKING, Dict, TypedDict
-
-if sys.version_info >= (3, 9): # pragma: no cover
- import importlib.resources as importlib_resources
-else: # pragma: no cover
- # To support Python 3.8
- import importlib_resources # type: ignore[import-not-found]
+from typing import TYPE_CHECKING, TypedDict
import yaml
@@ -94,9 +89,8 @@ def read_registry_files(path_to_registry: str | Path) -> dict:
"""
- path_to_registry = str(path_to_registry) # To pass tests with Python 3.8
pretrained_files_registry_path = importlib_resources.as_file(
- importlib_resources.files("tiatoolbox") / path_to_registry,
+ importlib_resources.files("tiatoolbox") / str(path_to_registry),
)
with pretrained_files_registry_path as registry_file_path:
diff --git a/tiatoolbox/annotation/storage.py b/tiatoolbox/annotation/storage.py
index 9863e08d3..541e66a63 100644
--- a/tiatoolbox/annotation/storage.py
+++ b/tiatoolbox/annotation/storage.py
@@ -40,7 +40,7 @@
import zlib
from abc import ABC, abstractmethod
from collections import defaultdict
-from collections.abc import MutableMapping
+from collections.abc import Generator, Iterable, Iterator, MutableMapping
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
@@ -50,9 +50,6 @@
Any,
Callable,
ClassVar,
- Generator,
- Iterable,
- Iterator,
)
import numpy as np
@@ -2028,7 +2025,9 @@ def transform(
transformed_geoms = {
key: transform(annotation.geometry) for key, annotation in self.items()
}
- self.patch_many(transformed_geoms.keys(), transformed_geoms.values())
+ _keys = transformed_geoms.keys()
+ _values = transformed_geoms.values()
+ self.patch_many(_keys, _values)
def __del__(self: AnnotationStore) -> None:
"""Implements destructor method.
diff --git a/tiatoolbox/cli/visualize.py b/tiatoolbox/cli/visualize.py
index 7f5ed0ad5..86810954a 100644
--- a/tiatoolbox/cli/visualize.py
+++ b/tiatoolbox/cli/visualize.py
@@ -2,19 +2,13 @@
from __future__ import annotations
+import importlib.resources as importlib_resources
import os
import subprocess
-import sys
from pathlib import Path
from threading import Thread
import click
-
-if sys.version_info >= (3, 9): # pragma: no cover
- import importlib.resources as importlib_resources
-else: # pragma: no cover
- # To support Python 3.8
- import importlib_resources # type: ignore[import-not-found]
from flask_cors import CORS
from tiatoolbox.cli.common import tiatoolbox_cli
diff --git a/tiatoolbox/data/__init__.py b/tiatoolbox/data/__init__.py
index 1ac4e8e31..d7058493e 100644
--- a/tiatoolbox/data/__init__.py
+++ b/tiatoolbox/data/__init__.py
@@ -2,6 +2,7 @@
"""Package to define datasets available to download via TIAToolbox."""
from __future__ import annotations
+import importlib.resources as importlib_resources
import sys
import tempfile
import zipfile
@@ -9,11 +10,6 @@
from typing import TYPE_CHECKING
from urllib.parse import urlparse
-if sys.version_info >= (3, 9): # pragma: no cover
- import importlib.resources as importlib_resources
-else: # pragma: no cover
- import importlib_resources # To support Python 3.8
-
from tiatoolbox import logger, read_registry_files
if TYPE_CHECKING: # pragma: no cover
diff --git a/tiatoolbox/models/dataset/dataset_abc.py b/tiatoolbox/models/dataset/dataset_abc.py
index 31fb2bfd5..b60ecd66e 100644
--- a/tiatoolbox/models/dataset/dataset_abc.py
+++ b/tiatoolbox/models/dataset/dataset_abc.py
@@ -4,9 +4,11 @@
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Iterable, List, Union
+from typing import TYPE_CHECKING, Callable, Union
if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Iterable
+
try:
from typing import TypeGuard
except ImportError:
@@ -18,7 +20,7 @@
from tiatoolbox.utils import imread
-input_type = Union[List[Union[str, Path, np.ndarray]], np.ndarray]
+input_type = Union[list[Union[str, Path, np.ndarray]], np.ndarray]
class PatchDatasetABC(ABC, torch.utils.data.Dataset):
diff --git a/tiatoolbox/tools/graph.py b/tiatoolbox/tools/graph.py
index 6114b9b48..c3b138ddd 100644
--- a/tiatoolbox/tools/graph.py
+++ b/tiatoolbox/tools/graph.py
@@ -18,7 +18,7 @@
from numpy.typing import ArrayLike
-def delaunay_adjacency(points: ArrayLike, dthresh: Number) -> list:
+def delaunay_adjacency(points: ArrayLike, dthresh: float) -> list:
"""Create an adjacency matrix via Delaunay triangulation from a list of coordinates.
Points which are further apart than dthresh will not be connected.
@@ -28,7 +28,7 @@ def delaunay_adjacency(points: ArrayLike, dthresh: Number) -> list:
Args:
points (ArrayLike):
An nxm list of coordinates.
- dthresh (int):
+ dthresh (float):
Distance threshold for triangulation.
Returns:
@@ -57,6 +57,7 @@ def delaunay_adjacency(points: ArrayLike, dthresh: Number) -> list:
tessellation = Delaunay(points)
# Find all connected neighbours for each point in the set of
# triangles. Starting with an empty dictionary.
+ triangle_neighbours: defaultdict
triangle_neighbours = defaultdict(set)
# Iterate over each triplet of point indexes which denotes a
# triangle within the tessellation.
@@ -157,7 +158,7 @@ def edge_index_to_triangles(edge_index: ArrayLike) -> ArrayLike:
def affinity_to_edge_index(
affinity_matrix: torch.Tensor | ArrayLike,
- threshold: Number = 0.5,
+ threshold: float = 0.5,
) -> torch.tensor | ArrayLike:
"""Convert an affinity matrix (similarity matrix) to an edge index.
@@ -233,12 +234,12 @@ def _umap_reducer(graph: dict[str, ArrayLike]) -> ArrayLike:
def build(
points: ArrayLike,
features: ArrayLike,
- lambda_d: Number = 3.0e-3,
- lambda_f: Number = 1.0e-3,
- lambda_h: Number = 0.8,
- connectivity_distance: Number = 4000,
- neighbour_search_radius: Number = 2000,
- feature_range_thresh: Number | None = 1e-4,
+ lambda_d: float = 3.0e-3,
+ lambda_f: float = 1.0e-3,
+ lambda_h: float = 0.8,
+ connectivity_distance: int = 4000,
+ neighbour_search_radius: int = 2000,
+ feature_range_thresh: float | None = 1e-4,
) -> dict[str, ArrayLike]:
"""Build a graph via hybrid clustering in spatial and feature space.
@@ -416,7 +417,7 @@ def build(
@classmethod
def visualise(
- cls: SlideGraphConstructor,
+ cls: type[SlideGraphConstructor],
graph: dict[str, ArrayLike],
color: ArrayLike | str | Callable | None = None,
node_size: Number | ArrayLike | Callable = 25,
@@ -510,8 +511,8 @@ def visualise(
# Plot the nodes
plt.scatter(
*nodes.T,
- c=color(graph) if isinstance(color, Callable) else color,
- s=node_size(graph) if isinstance(node_size, Callable) else node_size,
+ c=color(graph) if callable(color) else color,
+ s=node_size(graph) if callable(node_size) else node_size,
zorder=2,
)
diff --git a/tiatoolbox/tools/pyramid.py b/tiatoolbox/tools/pyramid.py
index 1a797ebc3..cfbe55190 100644
--- a/tiatoolbox/tools/pyramid.py
+++ b/tiatoolbox/tools/pyramid.py
@@ -17,7 +17,7 @@
import zipfile
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, Iterator
+from typing import TYPE_CHECKING
import defusedxml
import numpy as np
@@ -28,6 +28,8 @@
from tiatoolbox.utils.visualization import AnnotationRenderer, random_colors
if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Iterator
+
from tiatoolbox.annotation import AnnotationStore
from tiatoolbox.wsicore.wsireader import WSIMeta, WSIReader
@@ -129,7 +131,7 @@ def level_count(self: TilePyramidGenerator) -> int:
total_level_count = super_level_count + 1 + self.sub_tile_level_count
return int(total_level_count)
- def get_thumb_tile(self: TilePyramidGenerator) -> Image:
+ def get_thumb_tile(self: TilePyramidGenerator) -> Image.Image:
"""Return a thumbnail which fits the whole slide in one tile.
The thumbnail output size has the longest edge equal to the tile
@@ -157,7 +159,7 @@ def get_tile(
pad_mode: str = "constant",
interpolation: str = "optimise",
transparent_value: int | None = None,
- ) -> Image:
+ ) -> Image.Image:
"""Get a tile at a given level and coordinate.
Note that levels are in the reverse order of those in WSIReader.
@@ -223,7 +225,7 @@ def get_tile(
)
output_size = np.repeat(output_size, 2).astype(int)
thumb = self.get_thumb_tile()
- thumb.thumbnail(output_size)
+ thumb.thumbnail((output_size[0], output_size[1]))
return thumb
slide_dimensions = np.array(self.wsi.info.slide_dimensions)
if all(slide_dimensions < [baseline_x, baseline_y]):
@@ -331,7 +333,7 @@ def save_tile(tile_path: Path, tile: Image.Image) -> None:
msg = "Unsupported compression for zip."
raise ValueError(msg)
- archive = zipfile.ZipFile(
+ zip_archive = zipfile.ZipFile(
path,
mode="w",
compression=compression2enum[compression],
@@ -343,7 +345,7 @@ def save_tile(tile_path: Path, tile: Image.Image) -> None:
tile.save(bio, format="jpeg")
bio.seek(0)
data = bio.read()
- archive.writestr(
+ zip_archive.writestr(
str(tile_path),
data,
compress_type=compression2enum[compression],
@@ -360,7 +362,7 @@ def save_tile(tile_path: Path, tile: Image.Image) -> None:
msg = "Unsupported compression for tar."
raise ValueError(msg)
- archive = tarfile.TarFile.open(path, mode=compression2mode[compression])
+ tar_archive = tarfile.TarFile.open(path, mode=compression2mode[compression])
def save_tile(tile_path: Path, tile: Image.Image) -> None:
"""Write the tile to the output zip."""
@@ -368,9 +370,9 @@ def save_tile(tile_path: Path, tile: Image.Image) -> None:
tile.save(bio, format="jpeg")
bio.seek(0)
tar_info = tarfile.TarInfo(name=str(tile_path))
- tar_info.mtime = time.time()
+ tar_info.mtime = int(time.time())
tar_info.size = bio.tell()
- archive.addfile(tarinfo=tar_info, fileobj=bio)
+ tar_archive.addfile(tarinfo=tar_info, fileobj=bio)
for level in range(self.level_count):
for x, y in np.ndindex(self.tile_grid_size(level)):
@@ -378,13 +380,17 @@ def save_tile(tile_path: Path, tile: Image.Image) -> None:
tile_path = self.tile_path(level, x, y)
save_tile(tile_path, tile)
- if container is not None:
- archive.close()
+ if container == "zip":
+ zip_archive.close()
+ if container == "tar":
+ tar_archive.close()
def __len__(self: TilePyramidGenerator) -> int:
"""Return length of instance attributes."""
- return sum(
- np.prod(self.tile_grid_size(level)) for level in range(self.level_count)
+ return int(
+ sum(
+ np.prod(self.tile_grid_size(level)) for level in range(self.level_count)
+ ),
)
def __iter__(self: TilePyramidGenerator) -> Iterator:
@@ -452,7 +458,7 @@ def tile_group(self: ZoomifyGenerator, level: int, x: int, y: int) -> int:
cumulative_sum = sum(np.prod(self.tile_grid_size(n)) for n in range(level))
index_in_level = np.ravel_multi_index((y, x), self.tile_grid_size(level)[::-1])
tile_index = cumulative_sum + index_in_level
- return tile_index // 256 # the tile group
+ return int(tile_index // 256) # the tile group
def tile_path(self: ZoomifyGenerator, level: int, x: int, y: int) -> Path:
"""Generate the Zoomify path for a specified tile.
@@ -537,7 +543,7 @@ def __init__(
mapper = {key: (*color, 1) for key, color in zip(types, colors)}
self.renderer.mapper = lambda x: mapper[x]
- def get_thumb_tile(self: AnnotationTileGenerator) -> Image:
+ def get_thumb_tile(self: AnnotationTileGenerator) -> Image.Image:
"""Return a thumbnail which fits the whole slide in one tile.
The thumbnail output size has the longest edge equal to the tile
@@ -587,7 +593,7 @@ def get_tile(
pad_mode: str | None = None,
interpolation: str | None = None,
transparent_value: int | None = None, # noqa: ARG002
- ) -> Image:
+ ) -> Image.Image:
"""Render a tile at a given level and coordinate.
Note that levels are in the reverse order of those in WSIReader.
@@ -646,20 +652,21 @@ def get_tile(
scale = self.level_downsample(level)
baseline_x = (x * self.tile_size * scale) - (self.overlap * scale)
baseline_y = (y * self.tile_size * scale) - (self.overlap * scale)
- coord = [baseline_x, baseline_y]
+ coord = (int(baseline_x), int(baseline_y))
if level < self.sub_tile_level_count:
output_size = self.output_tile_size // 2 ** (
self.sub_tile_level_count - level
)
output_size = np.repeat(output_size, 2).astype(int)
thumb = self.get_thumb_tile()
- thumb.thumbnail(output_size)
+ thumb.thumbnail((output_size[0], output_size[1]))
return thumb
slide_dimensions = np.array(self.info.slide_dimensions)
if all(slide_dimensions < [baseline_x, baseline_y]):
raise IndexError
- bounds = locsize2bounds(coord, [self.output_tile_size * scale] * 2)
+ size = [self.output_tile_size * scale] * 2
+ bounds = locsize2bounds(coord, (int(size[0]), int(size[1])))
tile = self.renderer.render_annotations(
self.store,
bounds,
diff --git a/tiatoolbox/tools/stainextract.py b/tiatoolbox/tools/stainextract.py
index 4126f7e55..cb2972ae2 100644
--- a/tiatoolbox/tools/stainextract.py
+++ b/tiatoolbox/tools/stainextract.py
@@ -2,22 +2,12 @@
from __future__ import annotations
-from typing import TYPE_CHECKING
-
import numpy as np
from sklearn.decomposition import DictionaryLearning
from tiatoolbox.utils.misc import get_luminosity_tissue_mask
from tiatoolbox.utils.transforms import rgb2od
-if TYPE_CHECKING: # pragma: no cover
- import sys
-
- if sys.version_info >= (3, 9):
- from typing import Self
- else: # pragma: no cover
- from typing_extensions import Self # To support Python 3.8
-
def vectors_in_correct_direction(e_vectors: np.ndarray) -> np.ndarray:
"""Points the eigen vectors in the right direction.
@@ -92,14 +82,14 @@ class CustomExtractor:
"""
- def __init__(self: Self, stain_matrix: np.ndarray) -> None:
+ def __init__(self: CustomExtractor, stain_matrix: np.ndarray) -> None:
"""Initialize :class:`CustomExtractor`."""
self.stain_matrix = stain_matrix
if self.stain_matrix.shape not in [(2, 3), (3, 3)]:
msg = "Stain matrix must have shape (2, 3) or (3, 3)."
raise ValueError(msg)
- def get_stain_matrix(self: Self, _: np.ndarray) -> np.ndarray:
+ def get_stain_matrix(self: CustomExtractor, _: np.ndarray) -> np.ndarray:
"""Get the user defined stain matrix.
Returns:
@@ -131,11 +121,11 @@ class RuifrokExtractor:
"""
- def __init__(self: Self) -> None:
+ def __init__(self: RuifrokExtractor) -> None:
"""Initialize :class:`RuifrokExtractor`."""
self.__stain_matrix = np.array([[0.65, 0.70, 0.29], [0.07, 0.99, 0.11]])
- def get_stain_matrix(self: Self, _: np.ndarray) -> np.ndarray:
+ def get_stain_matrix(self: RuifrokExtractor, _: np.ndarray) -> np.ndarray:
"""Get the pre-defined stain matrix.
Returns:
@@ -175,7 +165,7 @@ class MacenkoExtractor:
"""
def __init__(
- self: Self,
+ self: MacenkoExtractor,
luminosity_threshold: float = 0.8,
angular_percentile: float = 99,
) -> None:
@@ -183,7 +173,7 @@ def __init__(
self.__luminosity_threshold = luminosity_threshold
self.__angular_percentile = angular_percentile
- def get_stain_matrix(self: Self, img: np.ndarray) -> np.ndarray:
+ def get_stain_matrix(self: MacenkoExtractor, img: np.ndarray) -> np.ndarray:
"""Stain matrix estimation.
Args:
@@ -264,7 +254,7 @@ class VahadaneExtractor:
"""
def __init__(
- self: Self,
+ self: VahadaneExtractor,
luminosity_threshold: float = 0.8,
regularizer: float = 0.1,
) -> None:
@@ -272,7 +262,7 @@ def __init__(
self.__luminosity_threshold = luminosity_threshold
self.__regularizer = regularizer
- def get_stain_matrix(self: Self, img: np.ndarray) -> np.ndarray:
+ def get_stain_matrix(self: VahadaneExtractor, img: np.ndarray) -> np.ndarray:
"""Stain matrix estimation.
Args:
diff --git a/tiatoolbox/tools/tissuemask.py b/tiatoolbox/tools/tissuemask.py
index ac99490d8..c2ea74d80 100644
--- a/tiatoolbox/tools/tissuemask.py
+++ b/tiatoolbox/tools/tissuemask.py
@@ -18,11 +18,6 @@ class TissueMasker(ABC):
"""
- def __init__(self: TissueMasker) -> None:
- """Initialize :class:`TissueMasker`."""
- super().__init__()
- self.fitted = False
-
@abstractmethod
def fit(
self: TissueMasker,
@@ -55,9 +50,6 @@ def transform(self: TissueMasker, images: np.ndarray) -> np.ndarray:
e.g. regions of tissue vs background.
"""
- if not self.fitted:
- msg = "Fit must be called before transform."
- raise SyntaxError(msg)
def fit_transform(
self: TissueMasker,
@@ -76,7 +68,7 @@ def fit_transform(
**kwargs (dict):
Other key word arguments passed to fit.
"""
- self.fit(images, **kwargs)
+ self.fit(images, masks=None, **kwargs)
return self.transform(images)
@@ -97,13 +89,15 @@ class OtsuTissueMasker(TissueMasker):
"""
- def __init__(self: TissueMasker) -> None:
+ def __init__(self: OtsuTissueMasker) -> None:
"""Initialize :class:`OtsuTissueMasker`."""
- super().__init__()
+ self.threshold: float | None
+ self.fitted: bool
self.threshold = None
+ self.fitted = False
def fit(
- self: TissueMasker,
+ self: OtsuTissueMasker,
images: np.ndarray,
masks: np.ndarray | None = None, # noqa: ARG002
) -> None:
@@ -141,7 +135,7 @@ def fit(
self.fitted = True
- def transform(self: TissueMasker, images: np.ndarray) -> np.ndarray:
+ def transform(self: OtsuTissueMasker, images: np.ndarray) -> np.ndarray:
"""Create masks using the threshold found during :func:`fit`.
Args:
@@ -155,7 +149,9 @@ def transform(self: TissueMasker, images: np.ndarray) -> np.ndarray:
channels).
"""
- super().transform(images)
+ if not self.fitted:
+ msg = "Fit must be called before transform."
+ raise SyntaxError(msg)
masks = []
for image in images:
@@ -165,7 +161,7 @@ def transform(self: TissueMasker, images: np.ndarray) -> np.ndarray:
mask = (grey < self.threshold).astype(bool)
masks.append(mask)
- return masks
+ return np.array(masks)
class MorphologicalMasker(OtsuTissueMasker):
@@ -206,7 +202,7 @@ class MorphologicalMasker(OtsuTissueMasker):
"""
def __init__(
- self: TissueMasker,
+ self: MorphologicalMasker,
*,
mpp: float | tuple[float, float] | None = None,
power: float | tuple[float, float] | None = None,
@@ -250,18 +246,19 @@ def __init__(
# Convert MPP to an integer kernel_size
if mpp is not None:
- mpp = np.array(mpp)
- if mpp.size != 2: # noqa: PLR2004
- mpp = mpp.repeat(2)
- kernel_size = np.max([32 / mpp, [1, 1]], axis=0)
+ mpp_array = np.array(mpp)
+ if mpp_array.size != 2: # noqa: PLR2004
+ mpp_array = mpp_array.repeat(2)
+ kernel_size = np.max([32 / mpp_array, [1, 1]], axis=0)
# Ensure kernel_size is a length 2 numpy array
- kernel_size = np.array(kernel_size)
- if kernel_size.size != 2: # noqa: PLR2004
- kernel_size = kernel_size.repeat(2)
+ kernel_size_array = np.array(kernel_size)
+ if kernel_size_array.size != 2: # noqa: PLR2004
+ kernel_size_array = kernel_size_array.repeat(2)
# Convert to an integer double/ pair
- self.kernel_size = tuple(np.round(kernel_size).astype(int))
+ self.kernel_size: tuple[int, int]
+ self.kernel_size = tuple(np.round(kernel_size_array).astype(int))
# Create structuring element for morphological operations
self.kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, self.kernel_size)
@@ -270,7 +267,7 @@ def __init__(
if self.min_region_size is None:
self.min_region_size = np.sum(self.kernel)
- def transform(self: TissueMasker, images: np.ndarray) -> None:
+ def transform(self: MorphologicalMasker, images: np.ndarray) -> np.ndarray:
"""Create masks using the found threshold followed by morphological operations.
Args:
@@ -284,7 +281,9 @@ def transform(self: TissueMasker, images: np.ndarray) -> None:
channels).
"""
- super().transform(images)
+ if not self.fitted:
+ msg = "Fit must be called before transform."
+ raise SyntaxError(msg)
results = []
for image in images:
@@ -304,4 +303,4 @@ def transform(self: TissueMasker, images: np.ndarray) -> None:
mask = cv2.morphologyEx(mask, cv2.MORPH_DILATE, self.kernel)
results.append(mask.astype(bool))
- return results
+ return np.array(results)
diff --git a/tiatoolbox/typing.py b/tiatoolbox/typing.py
index c70dbf3e1..ea0299e12 100644
--- a/tiatoolbox/typing.py
+++ b/tiatoolbox/typing.py
@@ -2,7 +2,8 @@
from __future__ import annotations
-from typing import Callable, Dict, List, Literal, Sequence, SupportsFloat, Tuple, Union
+from collections.abc import Sequence
+from typing import Callable, Literal, SupportsFloat, Union
import numpy as np
from shapely.geometry import LineString, Point, Polygon # type: ignore[import-untyped]
@@ -10,15 +11,15 @@
# Proper type annotations for shapely is not yet available.
-JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None]
-NumPair = Tuple[SupportsFloat, SupportsFloat]
-IntPair = Tuple[int, int]
+JSON = Union[dict[str, "JSON"], list["JSON"], str, int, float, bool, None]
+NumPair = tuple[SupportsFloat, SupportsFloat]
+IntPair = tuple[int, int]
# WSIReader
Resolution = Union[SupportsFloat, NumPair, np.ndarray, Sequence[SupportsFloat]]
Units = Literal["mpp", "power", "baseline", "level"]
-Bounds = Tuple[SupportsFloat, SupportsFloat, SupportsFloat, SupportsFloat]
-IntBounds = Tuple[int, int, int, int]
+Bounds = tuple[SupportsFloat, SupportsFloat, SupportsFloat, SupportsFloat]
+IntBounds = tuple[int, int, int, int]
# Annotation Store
Geometry = Union[Point, LineString, Polygon]
diff --git a/tiatoolbox/utils/misc.py b/tiatoolbox/utils/misc.py
index 9d0c2de97..4d3d4b66b 100644
--- a/tiatoolbox/utils/misc.py
+++ b/tiatoolbox/utils/misc.py
@@ -983,7 +983,7 @@ def select_cv2_interpolation(scale_factor: float | npt.NDArray[np.float64]) -> s
interpolation type
"""
- if np.any(scale_factor > 1.0): # noqa: PLR2004
+ if np.any(scale_factor > 1.0):
return "cubic"
return "area"
@@ -1327,7 +1327,7 @@ def dict_to_zarr(
compressor = (
kwargs["compressor"] if "compressor" in kwargs else numcodecs.Zstd(level=1)
)
- chunks = kwargs["chunks"] if "chunks" in kwargs else 10000
+ chunks = kwargs.get("chunks", 10000)
# ensure proper zarr extension
save_path = save_path.parent.absolute() / (save_path.stem + ".zarr")
diff --git a/tiatoolbox/utils/transforms.py b/tiatoolbox/utils/transforms.py
index 36c43ec21..05396c798 100644
--- a/tiatoolbox/utils/transforms.py
+++ b/tiatoolbox/utils/transforms.py
@@ -141,7 +141,7 @@ def imresize(
scale_factor_array = img.shape[:2][::-1] / np.array(output_size_array)
# Return original if scale factor is 1
- if np.all(scale_factor_array == 1.0): # noqa: PLR2004
+ if np.all(scale_factor_array == 1.0):
return img
# Get appropriate cv2 interpolation enum
diff --git a/tiatoolbox/utils/visualization.py b/tiatoolbox/utils/visualization.py
index 3e7c9da46..ba26fe47f 100644
--- a/tiatoolbox/utils/visualization.py
+++ b/tiatoolbox/utils/visualization.py
@@ -119,7 +119,7 @@ def overlay_prediction_mask(
msg,
)
if np.issubdtype(img.dtype, np.floating):
- if not (img.max() <= 1.0 and img.min() >= 0): # noqa: PLR2004
+ if not (img.max() <= 1.0 and img.min() >= 0):
msg = "Not support float `img` outside [0, 1]."
raise ValueError(msg)
img = np.array(img * 255, dtype=np.uint8)
@@ -157,7 +157,7 @@ def overlay_prediction_mask(
cv2.addWeighted(rgb_prediction, alpha, overlay, 1 - alpha, 0, overlay)
overlay = overlay.astype(np.uint8)
- if min_val > 0.0: # noqa: PLR2004
+ if min_val > 0.0:
overlay[~prediction_sel] = img[~prediction_sel]
if ax is None and not return_ax:
@@ -310,7 +310,7 @@ def overlay_probability_map(
overlay[overlay > 255.0] = 255.0 # noqa: PLR2004
overlay = overlay.astype(np.uint8)
- if min_val > 0.0: # noqa: PLR2004
+ if min_val > 0.0:
overlay[~prediction_sel] = img[~prediction_sel]
if ax is None and not return_ax:
@@ -374,7 +374,7 @@ def _validate_overlay_probability_map(
msg,
)
- if prediction.max() > 1.0: # noqa: PLR2004
+ if prediction.max() > 1.0:
msg = "Not support float `prediction` outside [0, 1]."
raise ValueError(msg)
if prediction.min() < 0:
@@ -382,15 +382,15 @@ def _validate_overlay_probability_map(
raise ValueError(msg)
# if `min_val` is defined, only display the overlay for areas with prob > min_val
- if min_val < 0.0: # noqa: PLR2004
+ if min_val < 0.0:
msg = f"`min_val={min_val}` is not between [0, 1]."
raise ValueError(msg)
- if min_val > 1.0: # noqa: PLR2004
+ if min_val > 1.0:
msg = f"`min_val={min_val}` is not between [0, 1]."
raise ValueError(msg)
if np.issubdtype(img.dtype, np.floating):
- if img.max() > 1.0: # noqa: PLR2004
+ if img.max() > 1.0:
msg = "Not support float `img` outside [0, 1]."
raise ValueError(msg)
if img.min() < 0:
@@ -633,6 +633,7 @@ def __init__( # noqa: PLR0913
self.secondary_cmap = secondary_cmap
self.blur_radius = blur_radius
self.function_mapper = function_mapper
+ self.blur: ImageFilter.GaussianBlur | None
if blur_radius > 0:
self.blur = ImageFilter.GaussianBlur(blur_radius)
self.edge_thickness = 0
diff --git a/tiatoolbox/visualization/bokeh_app/main.py b/tiatoolbox/visualization/bokeh_app/main.py
index 608dc23a9..0f29a4aea 100644
--- a/tiatoolbox/visualization/bokeh_app/main.py
+++ b/tiatoolbox/visualization/bokeh_app/main.py
@@ -64,14 +64,14 @@
# GitHub actions seems unable to find TIAToolbox unless this is here
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
-from tiatoolbox import logger # noqa: E402
-from tiatoolbox.models.engine.nucleus_instance_segmentor import ( # noqa: E402
+from tiatoolbox import logger
+from tiatoolbox.models.engine.nucleus_instance_segmentor import (
NucleusInstanceSegmentor,
)
-from tiatoolbox.tools.pyramid import ZoomifyGenerator # noqa: E402
-from tiatoolbox.utils.visualization import random_colors # noqa: E402
-from tiatoolbox.visualization.ui_utils import get_level_by_extent # noqa: E402
-from tiatoolbox.wsicore.wsireader import WSIReader # noqa: E402
+from tiatoolbox.tools.pyramid import ZoomifyGenerator
+from tiatoolbox.utils.visualization import random_colors
+from tiatoolbox.visualization.ui_utils import get_level_by_extent
+from tiatoolbox.wsicore.wsireader import WSIReader
if TYPE_CHECKING: # pragma: no cover
from bokeh.document import Document
diff --git a/tiatoolbox/wsicore/wsimeta.py b/tiatoolbox/wsicore/wsimeta.py
index ac9200295..4a7ad0d9b 100644
--- a/tiatoolbox/wsicore/wsimeta.py
+++ b/tiatoolbox/wsicore/wsimeta.py
@@ -11,13 +11,15 @@
from numbers import Number
from pathlib import Path
-from typing import TYPE_CHECKING, Mapping, Sequence
+from typing import TYPE_CHECKING
import numpy as np
from tiatoolbox import logger
if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Mapping, Sequence
+
from tiatoolbox.typing import Resolution, Units
diff --git a/tiatoolbox/wsicore/wsireader.py b/tiatoolbox/wsicore/wsireader.py
index 7e3307189..f7e4cacf5 100644
--- a/tiatoolbox/wsicore/wsireader.py
+++ b/tiatoolbox/wsicore/wsireader.py
@@ -11,7 +11,7 @@
from datetime import datetime
from numbers import Number
from pathlib import Path
-from typing import TYPE_CHECKING, Iterable
+from typing import TYPE_CHECKING
import numpy as np
import openslide
@@ -31,6 +31,8 @@
from tiatoolbox.wsicore.wsimeta import WSIMeta
if TYPE_CHECKING: # pragma: no cover
+ from collections.abc import Iterable
+
import glymur
from tiatoolbox.typing import Bounds, IntBounds, IntPair, NumPair, Resolution, Units
@@ -97,8 +99,8 @@ def is_zarr(path: Path) -> bool:
_ = zarr.open(str(path), mode="r")
except Exception: # skipcq: PYL-W0703 # noqa: BLE001
return False
- else:
- return True
+
+ return True
def is_ngff( # noqa: PLR0911
@@ -404,10 +406,9 @@ def info(self: WSIReader) -> WSIMeta:
Returns:
WSIMeta:
- An object containing normalized slide metadata
+ An object containing normalized slide metadata.
"""
- # In Python>=3.8 this could be replaced with functools.cached_property
if self._m_info is not None:
return copy.deepcopy(self._m_info)
self._m_info = self._info()
@@ -1577,7 +1578,7 @@ def save_tiles(
# Rescale to the correct objective value
if rescale != 1:
- im = utils.transforms.imresize(img=im, scale_factor=(1 / rescale))
+ im = utils.transforms.imresize(img=im, scale_factor=1 / rescale)
img_save_name = (
"_".join(
@@ -5519,7 +5520,7 @@ def read_rect(
utils.transforms.background_composite(base_region, alpha=True),
)
im_region = Image.fromarray(im_region)
- if self.alpha < 1.0: # noqa: PLR2004
+ if self.alpha < 1.0:
im_region.putalpha(
im_region.getchannel("A").point(lambda i: i * self.alpha),
)
@@ -5712,7 +5713,7 @@ class docstrings for more information.
utils.transforms.background_composite(base_region, alpha=True),
)
im_region = Image.fromarray(im_region)
- if self.alpha < 1.0: # noqa: PLR2004
+ if self.alpha < 1.0:
im_region.putalpha(
im_region.getchannel("A").point(lambda i: i * self.alpha),
)