From 5c00381797a392204ae8f4501445e628034462fc Mon Sep 17 00:00:00 2001 From: Shan E Ahmed Raza <13048456+shaneahmed@users.noreply.github.com> Date: Tue, 20 Feb 2024 10:02:04 +0000 Subject: [PATCH] :pushpin: Update minimum Python version to `3.9` (#786) - Update minimum Python version to `3.9` ToDo: - [x] Fix all errors - [x] Update docker containers - [x] Use `functools.cachedtools` - [x] Test docker containers --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: mostafajahanifar <74412979+mostafajahanifar@users.noreply.github.com> --- .github/workflows/docker-publish.yml | 12 +- .github/workflows/mypy-type-check.yml | 2 +- .github/workflows/pip-install.yml | 2 +- .github/workflows/python-package.yml | 2 +- README.md | 2 +- benchmarks/annotation_store.ipynb | 5375 +++++++++-------- benchmarks/annotation_store_alloc.py | 3 +- docker/{3.8 => 3.11}/Debian/Dockerfile | 2 +- docker/3.11/Ubuntu/Dockerfile | 30 + docker/3.12/Debian/Dockerfile | 14 + docker/3.12/Ubuntu/Dockerfile | 30 + docs/installation.rst | 2 +- examples/full-pipelines/slide-graph.ipynb | 5 +- pyproject.toml | 6 +- requirements/requirements.conda.yml | 2 +- requirements/requirements.dev.conda.yml | 2 +- requirements/requirements.win64.conda.yml | 2 +- requirements/requirements.win64.dev.conda.yml | 2 +- setup.py | 4 +- tests/test_annotation_stores.py | 11 +- tests/test_app_bokeh.py | 14 +- tests/test_docs.py | 5 +- tests/test_dsl.py | 5 +- tests/test_wsireader.py | 6 +- tiatoolbox/__init__.py | 12 +- tiatoolbox/annotation/storage.py | 5 +- tiatoolbox/cli/visualize.py | 8 +- tiatoolbox/data/__init__.py | 6 +- tiatoolbox/models/dataset/dataset_abc.py | 6 +- tiatoolbox/tools/pyramid.py | 4 +- tiatoolbox/tools/stainextract.py | 26 +- tiatoolbox/typing.py | 13 +- tiatoolbox/wsicore/wsimeta.py | 4 +- tiatoolbox/wsicore/wsireader.py | 7 +- 34 files changed, 2850 insertions(+), 2781 deletions(-) rename docker/{3.8 => 3.11}/Debian/Dockerfile (91%) 
create mode 100644 docker/3.11/Ubuntu/Dockerfile create mode 100644 docker/3.12/Debian/Dockerfile create mode 100644 docker/3.12/Ubuntu/Dockerfile diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 4f63c729e..4d486766b 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -15,8 +15,6 @@ jobs: fail-fast: true matrix: include: - - dockerfile: ./docker/3.8/Debian/Dockerfile - mtag: py3.8-debian - dockerfile: ./docker/3.9/Debian/Dockerfile mtag: py3.9-debian - dockerfile: ./docker/3.9/Ubuntu/Dockerfile @@ -25,7 +23,15 @@ jobs: mtag: py3.10-debian - dockerfile: ./docker/3.10/Ubuntu/Dockerfile mtag: py3.10-ubuntu - - dockerfile: ./docker/3.10/Ubuntu/Dockerfile + - dockerfile: ./docker/3.11/Debian/Dockerfile + mtag: py3.11-debian + - dockerfile: ./docker/3.11/Ubuntu/Dockerfile + mtag: py3.11-ubuntu + - dockerfile: ./docker/3.12/Debian/Dockerfile + mtag: py3.12-debian + - dockerfile: ./docker/3.12/Ubuntu/Dockerfile + mtag: py3.12-ubuntu + - dockerfile: ./docker/3.12/Ubuntu/Dockerfile mtag: latest permissions: contents: read diff --git a/.github/workflows/mypy-type-check.yml b/.github/workflows/mypy-type-check.yml index a22f339c5..1c026da9e 100644 --- a/.github/workflows/mypy-type-check.yml +++ b/.github/workflows/mypy-type-check.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index abdb11527..ffa6961c9 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] os: [ubuntu-22.04, windows-latest, macos-latest] steps: - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/python-package.yml 
b/.github/workflows/python-package.yml index 321316040..9df1550c6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index 0c5de616d..da8c04f06 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ Prepare a computer as a convenient platform for further development of the Pytho 5. Create virtual environment for TIAToolbox using ```sh - $ conda create -n tiatoolbox-dev python=3.8 # select version of your choice + $ conda create -n tiatoolbox-dev python=3.9 # select version of your choice $ conda activate tiatoolbox-dev $ pip install -r requirements/requirements_dev.txt ``` diff --git a/benchmarks/annotation_store.ipynb b/benchmarks/annotation_store.ipynb index 6c8b83d65..882ab251c 100644 --- a/benchmarks/annotation_store.ipynb +++ b/benchmarks/annotation_store.ipynb @@ -1,2703 +1,2704 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aqPkpRk-pT5q" - }, - "source": [ - "# Benchmarking Annotation Storage\n", - "\n", - "Click to open in: \\[[GitHub](https://github.com/TissueImageAnalytics/tiatoolbox/tree/develop/benchmarks/annotation_store.ipynb)\\]\\[[Colab](https://colab.research.google.com/github/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\\[[Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BS0G58BPpT5s" - }, - "source": [ - "_In order to run this notebook on a Kaggle platform, 1) click the Kaggle URL 2) click on Settings on the right of the Kaggle screen, 3) log in to your Kaggle account, 4) tick \"Internet\" checkbox under Settings, to enable necessary 
downloads._\n", - "\n", - "**NOTE:** Some parts of this notebook require a lot of memory. Part 2 in particular may not run on memory constrained systems. The notebook will run well on an MacBook Air (M1, 2020) but will use a lot of swap. It may require >64GB of memory for second half to avoid using swap.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EjHQXjqrpT5s" - }, - "source": [ - "## About This Notebook\n", - "\n", - "Managing annotation, either created by hand or from model output, is a\n", - "common task in computational pathology. For a small number of\n", - "annotations this may be trivial. However, for large numbers of\n", - "annotations, it is often necessary to store the annotations in a more\n", - "structured format such as a database. This is because finding a desired\n", - "subset of annotations within a very large collection, for example over\n", - "one million cell boundary polygons derived from running HoVerNet on a\n", - "WSI, may be very slow if performed in a naive manner. In the toolbox, we\n", - "implement two storage method to make handling annotations easier:\n", - "`DictionaryStore` and `SQLiteStore`.\n", - "\n", - "### Storage Classes\n", - "\n", - "Both stores act as a key-value store where the key is the annotation ID\n", - "(as a string) and the value is the annotation. This follows the Python\n", - "[`MutableMapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping)\n", - "interface meaning that the stores can be used in the same way as a\n", - "regular Python dictionary (`dict`).\n", - "\n", - "The `DictionaryStore` is implemented internally using a Python\n", - "dictionary. It is a realtively simple class, operating with all\n", - "annotations in memory and using a simple scan method to search for\n", - "annotations. This works very well for a small number of annotations. 
In\n", - "contrast the `SQLiteStore` is implemented using a SQLite database\n", - "(either in memory or on disk), it is a more complex class making use of\n", - "an rtree index to efficiently spatially search for annotations. This is\n", - "much more suited to a very large number of annotations. However, they\n", - "both follow the same interface and can be used interchangeably for\n", - "almost all methods (`SQLiteStore` has some additional methods).\n", - "\n", - "### Provided Functionality (Mini Tutorial)\n", - "\n", - "The storage classes provide a lot of functionality including. This\n", - "includes all of the standard `MutableMapping` methods, as well as\n", - "some additional ones for querying the collection of annotations.\n", - "Below is a brief summary of the main functionality.\n", - "\n", - "#### Adding Annotations\n", - "\n", - "```python\n", - "from tiatoolbox.annotation.storage import Annotation, DictionaryStore, SQliteStore\n", - "from shapely.geometry import Polygon\n", - "\n", - "# Create a new store. If no path is given it is an in-memory store.\n", - "store = DictionaryStore()\n", - "\n", - "# An annotation is a shapely geometry and a JSON serializable dictionary\n", - "annotation = Annotation(Polygon.from_bounds(0, 0, 1, 1), {\"id\": \"1\"})\n", - "\n", - "# Add the annotation to the store in the same way as a dictionary\n", - "store[\"foo\"] = annotation\n", - "\n", - "# Bulk append is also supported. This will be faster in some contexts\n", - "# (e.g. for an SQLiteStore) than adding them one at a time.\n", - "# Here we add 100 simple box annotations.\n", - "# As we have not specified a set of keys to use, a new UUID is generated\n", - "# for each. 
The respective generated keys are also returned.\n", - "annotations = [\n", - " Annotation(Polygon.from_bounds(n, n, n + 1, n + 1), {\"id\": n}) for n in range(100)\n", - "]\n", - "keys = store.append_many(annotations)\n", - "```\n", - "\n", - "#### Removing Annotations\n", - "\n", - "```python\n", - "# Remove an annotation by key\n", - "del store[\"foo\"]\n", - "\n", - "# Bulk removal\n", - "keys = [\"1234-5676....\", \"...\"] # etc.\n", - "store.remove_many(keys)\n", - "```\n", - "\n", - "#### Querying Within a Region\n", - "\n", - "```python\n", - "# Find all annotations which intersect a polygon\n", - "search_region = Polygon.from_bounds(0, 0, 10, 10)\n", - "result = store.query(search_region)\n", - "\n", - "# Find all annotations which are contained within a polygon\n", - "search_region = Polygon.from_bounds(0, 0, 10, 10)\n", - "result = store.query(search_region, geometry_predicate=\"contains\")\n", - "```\n", - "\n", - "#### Querying Using A Predicate Statement\n", - "\n", - "```python\n", - "# 'props' is a provided shorthand to access the 'properties' dictionary\n", - "results = store.query(where=\"propd['id'] == 1\")\n", - "```\n", - "\n", - "#### Serializing and Deserializing\n", - "\n", - "```python\n", - "# Serialize the store to a GeoJSON string\n", - "json_string = store.to_geojson()\n", - "\n", - "# Serialize the store to a GeoJSON file\n", - "store.to_geojson(\"boxes.geojson\")\n", - "\n", - "# Deserialize a GeoJSON string into a store (even of a different type)\n", - "sqlitestore = SqliteStore.from_geojson(\"boxes.geojson\")\n", - "\n", - "# The above is an in-memory store. We can also now write this to disk\n", - "# as an SQLite database.\n", - "sqlitestore.dump(\"boxes.db\")\n", - "```\n", - "\n", - "### Benchmarking\n", - "\n", - "Here we evaluate the storage efficient and data querying performance of\n", - "the annotation store versus other common formats. 
We will evaluate some\n", - "common situations and use cases including:\n", - "\n", - "- Disk I/O (tested with an SSD)\n", - "- Querying the data for annotations within a box region\n", - "- Querying the data for annotations within a polygon region\n", - "- Querying the data with a predicate e.g. 'class=1'\n", - "\n", - "All saved output is from running this notebook on a 2020 M1 MacBook Air with 16GB RAM.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aov8ENq2pT5t" - }, - "source": [ - "## Imports\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UoMpbDXopT5t" - }, - "outputs": [], - "source": [ - "\"\"\"Import modules required to run the Jupyter notebook.\"\"\"\n", - "\n", - "from __future__ import annotations\n", - "\n", - "# Clear logger to use tiatoolbox.logger\n", - "import logging\n", - "\n", - "if logging.getLogger().hasHandlers():\n", - " logging.getLogger().handlers.clear()\n", - "\n", - "import copy\n", - "import pickle\n", - "import sys\n", - "import tempfile\n", - "import timeit\n", - "import uuid\n", - "from pathlib import Path\n", - "from typing import TYPE_CHECKING, Any, Generator\n", - "\n", - "import numpy as np\n", - "from IPython.display import display\n", - "from matplotlib import patheffects\n", - "from matplotlib import pyplot as plt\n", - "from shapely import affinity\n", - "from shapely.geometry import MultiPolygon, Point, Polygon\n", - "from tqdm.auto import tqdm\n", - "\n", - "if TYPE_CHECKING:\n", - " from numbers import Number\n", - "\n", - "sys.path.append(\"..\") # If running locally without pypi installed tiatoolbox\n", - "\n", - "from tiatoolbox import logger\n", - "from tiatoolbox.annotation.storage import (\n", - " Annotation,\n", - " DictionaryStore,\n", - " SQLiteStore,\n", - ")\n", - "\n", - "plt.style.use(\"ggplot\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nW-UyVQOpT5u" - }, - "source": [ - "## Data Generation & Utility 
Functions\n", - "\n", - "Here we define some useful functions to generate some artificial data\n", - "and visualise results.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "N5xNW64UpT5v" - }, - "outputs": [], - "source": [ - "def cell_polygon(\n", - " xy: tuple[Number, Number],\n", - " n_points: int = 20,\n", - " radius: Number = 8,\n", - " noise: Number = 0.01,\n", - " eccentricity: tuple[Number, Number] = (1, 3),\n", - " direction: str = \"CCW\",\n", - " seed: int = 0,\n", - " *,\n", - " repeat_first: bool = True,\n", - ") -> Polygon:\n", - " \"\"\"Generate a fake cell boundary polygon.\n", - "\n", - " Borrowed from tiatoolbox unit tests.\n", - "\n", - " Cell boundaries are generated an ellipsoids with randomised eccentricity,\n", - " added noise, and a random rotation.\n", - "\n", - " Args:\n", - " xy (tuple(int)): The x,y centre point to generate the cell boundary around.\n", - " n_points (int): Number of points in the boundary. Defaults to 20.\n", - " radius (float): Radius of the points from the centre. Defaults to 10.\n", - " noise (float): Noise to add to the point locations. Defaults to 1.\n", - " eccentricity (tuple(float)): Range of values (low, high) to use for\n", - " randomised eccentricity. Defaults to (1, 3).\n", - " repeat_first (bool): Enforce that the last point is equal to the first.\n", - " direction (str): Ordering of the points. Defaults to \"CCW\". Valid options\n", - " are: counter-clockwise \"CCW\", and clockwise \"CW\".\n", - " seed: Seed for the random number generator. 
Defaults to 0.\n", - "\n", - " \"\"\"\n", - " rand_state = np.random.default_rng().__getstate__()\n", - " rng_seed = np.random.default_rng(seed)\n", - "\n", - " if repeat_first:\n", - " n_points -= 1\n", - "\n", - " # Generate points about an ellipse with random eccentricity\n", - " x, y = xy\n", - " alpha = np.linspace(0, 2 * np.pi - (2 * np.pi / n_points), n_points)\n", - " rx = radius * (rng_seed.random() + 0.5)\n", - " ry = rng_seed.uniform(*eccentricity) * radius - 0.5 * rx\n", - " x = rx * np.cos(alpha) + x + (rng_seed.random(n_points) - 0.5) * noise\n", - " y = ry * np.sin(alpha) + y + (rng_seed.random(n_points) - 0.5) * noise\n", - " boundary_coords = np.stack([x, y], axis=1).astype(int).tolist()\n", - "\n", - " # Copy first coordinate to the end if required\n", - " if repeat_first:\n", - " boundary_coords = [*boundary_coords, boundary_coords[0]]\n", - "\n", - " # Swap direction\n", - " if direction.strip().lower() == \"cw\":\n", - " boundary_coords = boundary_coords[::-1]\n", - "\n", - " polygon = Polygon(boundary_coords)\n", - "\n", - " # Add random rotation\n", - " angle = rng_seed.random() * 360\n", - " polygon = affinity.rotate(polygon, angle, origin=\"centroid\")\n", - "\n", - " # Restore the random state\n", - " np.random.default_rng().__setstate__(rand_state)\n", - "\n", - " return polygon" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jyQEBhNIpT5v" - }, - "outputs": [], - "source": [ - "def cell_grid(\n", - " size: tuple[int, int] = (10, 10),\n", - " spacing: Number = 25,\n", - ") -> Generator[Polygon, None, None]:\n", - " \"\"\"Generate a grid of cell boundaries.\"\"\"\n", - " return (\n", - " cell_polygon(xy=np.multiply(ij, spacing), repeat_first=False, seed=n)\n", - " for n, ij in enumerate(np.ndindex(size))\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VVjSum_9pT5v" - }, - "outputs": [], - "source": [ - "def plot_results(\n", - " experiments: 
list[list[Number]],\n", - " title: str,\n", - " capsize: int = 5,\n", - " **kwargs: dict[str, Any],\n", - ") -> None:\n", - " \"\"\"Plot the results of a benchmark.\n", - "\n", - " Uses the min for the bar height (see See\n", - " https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat),\n", - " and plots a min-max error bar.\n", - "\n", - " \"\"\"\n", - " x = range(len(experiments))\n", - " color = [f\"C{x_i}\" for x_i in x]\n", - " plt.bar(\n", - " x=x,\n", - " height=[min(e) for e in experiments],\n", - " color=color,\n", - " yerr=[[0 for e in experiments], [max(e) - min(e) for e in experiments]],\n", - " capsize=capsize,\n", - " **kwargs,\n", - " )\n", - " for i, (runs, c) in enumerate(zip(experiments, color)):\n", - " plt.text(\n", - " i,\n", - " min(runs),\n", - " f\" {min(runs):.4f}s\",\n", - " ha=\"left\",\n", - " va=\"bottom\",\n", - " color=c,\n", - " zorder=10,\n", - " fontweight=\"bold\",\n", - " path_effects=[\n", - " patheffects.withStroke(linewidth=2, foreground=\"w\"),\n", - " ],\n", - " )\n", - " plt.title(title)\n", - " plt.hlines(\n", - " 0.5,\n", - " -0.5,\n", - " len(experiments) - 0.5,\n", - " linestyles=\"dashed\",\n", - " colors=\"black\",\n", - " alpha=0.5,\n", - " )\n", - " plt.yscale(\"log\")\n", - " plt.xlabel(\"Store Type\")\n", - " plt.ylabel(\"Time (s)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tHEUErSmpT5w" - }, - "source": [ - "## Display Some Generated Data\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YUQmgohbpT5w", - "outputId": "1a0cdee1-e32d-41e9-fb9d-26c5ee572880" - }, - "outputs": [ + "cells": [ { - "data": { - "image/svg+xml": "", - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "aqPkpRk-pT5q" + }, + "source": [ + "# Benchmarking Annotation Storage\n", + "\n", + "Click to open in: 
\\[[GitHub](https://github.com/TissueImageAnalytics/tiatoolbox/tree/develop/benchmarks/annotation_store.ipynb)\\]\\[[Colab](https://colab.research.google.com/github/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\\[[Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/TissueImageAnalytics/tiatoolbox/blob/develop/benchmarks/annotation_store.ipynb)\\]\n", + "\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/svg+xml": "", - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "BS0G58BPpT5s" + }, + "source": [ + "_In order to run this notebook on a Kaggle platform, 1) click the Kaggle URL 2) click on Settings on the right of the Kaggle screen, 3) log in to your Kaggle account, 4) tick \"Internet\" checkbox under Settings, to enable necessary downloads._\n", + "\n", + "**NOTE:** Some parts of this notebook require a lot of memory. Part 2 in particular may not run on memory constrained systems. The notebook will run well on an MacBook Air (M1, 2020) but will use a lot of swap. It may require >64GB of memory for second half to avoid using swap.\n", + "\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/svg+xml": "", - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "EjHQXjqrpT5s" + }, + "source": [ + "## About This Notebook\n", + "\n", + "Managing annotation, either created by hand or from model output, is a\n", + "common task in computational pathology. For a small number of\n", + "annotations this may be trivial. However, for large numbers of\n", + "annotations, it is often necessary to store the annotations in a more\n", + "structured format such as a database. This is because finding a desired\n", + "subset of annotations within a very large collection, for example over\n", + "one million cell boundary polygons derived from running HoVerNet on a\n", + "WSI, may be very slow if performed in a naive manner. 
In the toolbox, we\n", + "implement two storage method to make handling annotations easier:\n", + "`DictionaryStore` and `SQLiteStore`.\n", + "\n", + "### Storage Classes\n", + "\n", + "Both stores act as a key-value store where the key is the annotation ID\n", + "(as a string) and the value is the annotation. This follows the Python\n", + "[`MutableMapping`](https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping)\n", + "interface meaning that the stores can be used in the same way as a\n", + "regular Python dictionary (`dict`).\n", + "\n", + "The `DictionaryStore` is implemented internally using a Python\n", + "dictionary. It is a realtively simple class, operating with all\n", + "annotations in memory and using a simple scan method to search for\n", + "annotations. This works very well for a small number of annotations. In\n", + "contrast the `SQLiteStore` is implemented using a SQLite database\n", + "(either in memory or on disk), it is a more complex class making use of\n", + "an rtree index to efficiently spatially search for annotations. This is\n", + "much more suited to a very large number of annotations. However, they\n", + "both follow the same interface and can be used interchangeably for\n", + "almost all methods (`SQLiteStore` has some additional methods).\n", + "\n", + "### Provided Functionality (Mini Tutorial)\n", + "\n", + "The storage classes provide a lot of functionality including. This\n", + "includes all of the standard `MutableMapping` methods, as well as\n", + "some additional ones for querying the collection of annotations.\n", + "Below is a brief summary of the main functionality.\n", + "\n", + "#### Adding Annotations\n", + "\n", + "```python\n", + "from tiatoolbox.annotation.storage import Annotation, DictionaryStore, SQliteStore\n", + "from shapely.geometry import Polygon\n", + "\n", + "# Create a new store. 
If no path is given it is an in-memory store.\n", + "store = DictionaryStore()\n", + "\n", + "# An annotation is a shapely geometry and a JSON serializable dictionary\n", + "annotation = Annotation(Polygon.from_bounds(0, 0, 1, 1), {\"id\": \"1\"})\n", + "\n", + "# Add the annotation to the store in the same way as a dictionary\n", + "store[\"foo\"] = annotation\n", + "\n", + "# Bulk append is also supported. This will be faster in some contexts\n", + "# (e.g. for an SQLiteStore) than adding them one at a time.\n", + "# Here we add 100 simple box annotations.\n", + "# As we have not specified a set of keys to use, a new UUID is generated\n", + "# for each. The respective generated keys are also returned.\n", + "annotations = [\n", + " Annotation(Polygon.from_bounds(n, n, n + 1, n + 1), {\"id\": n}) for n in range(100)\n", + "]\n", + "keys = store.append_many(annotations)\n", + "```\n", + "\n", + "#### Removing Annotations\n", + "\n", + "```python\n", + "# Remove an annotation by key\n", + "del store[\"foo\"]\n", + "\n", + "# Bulk removal\n", + "keys = [\"1234-5676....\", \"...\"] # etc.\n", + "store.remove_many(keys)\n", + "```\n", + "\n", + "#### Querying Within a Region\n", + "\n", + "```python\n", + "# Find all annotations which intersect a polygon\n", + "search_region = Polygon.from_bounds(0, 0, 10, 10)\n", + "result = store.query(search_region)\n", + "\n", + "# Find all annotations which are contained within a polygon\n", + "search_region = Polygon.from_bounds(0, 0, 10, 10)\n", + "result = store.query(search_region, geometry_predicate=\"contains\")\n", + "```\n", + "\n", + "#### Querying Using A Predicate Statement\n", + "\n", + "```python\n", + "# 'props' is a provided shorthand to access the 'properties' dictionary\n", + "results = store.query(where=\"propd['id'] == 1\")\n", + "```\n", + "\n", + "#### Serializing and Deserializing\n", + "\n", + "```python\n", + "# Serialize the store to a GeoJSON string\n", + "json_string = store.to_geojson()\n", + "\n", + "# 
Serialize the store to a GeoJSON file\n", + "store.to_geojson(\"boxes.geojson\")\n", + "\n", + "# Deserialize a GeoJSON string into a store (even of a different type)\n", + "sqlitestore = SqliteStore.from_geojson(\"boxes.geojson\")\n", + "\n", + "# The above is an in-memory store. We can also now write this to disk\n", + "# as an SQLite database.\n", + "sqlitestore.dump(\"boxes.db\")\n", + "```\n", + "\n", + "### Benchmarking\n", + "\n", + "Here we evaluate the storage efficient and data querying performance of\n", + "the annotation store versus other common formats. We will evaluate some\n", + "common situations and use cases including:\n", + "\n", + "- Disk I/O (tested with an SSD)\n", + "- Querying the data for annotations within a box region\n", + "- Querying the data for annotations within a polygon region\n", + "- Querying the data with a predicate e.g. 'class=1'\n", + "\n", + "All saved output is from running this notebook on a 2020 M1 MacBook Air with 16GB RAM.\n", + "\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/svg+xml": "", - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "aov8ENq2pT5t" + }, + "source": [ + "## Imports\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UoMpbDXopT5t" + }, + "outputs": [], + "source": [ + "\"\"\"Import modules required to run the Jupyter notebook.\"\"\"\n", + "\n", + "from __future__ import annotations\n", + "\n", + "# Clear logger to use tiatoolbox.logger\n", + "import logging\n", + "\n", + "if logging.getLogger().hasHandlers():\n", + " logging.getLogger().handlers.clear()\n", + "\n", + "import copy\n", + "import pickle\n", + "import sys\n", + "import tempfile\n", + "import timeit\n", + "import uuid\n", + "from pathlib import Path\n", + "from typing import TYPE_CHECKING, Any\n", + "\n", + "import numpy as np\n", + "from IPython.display import display\n", + "from matplotlib import patheffects\n", + "from 
matplotlib import pyplot as plt\n", + "from shapely import affinity\n", + "from shapely.geometry import MultiPolygon, Point, Polygon\n", + "from tqdm.auto import tqdm\n", + "\n", + "if TYPE_CHECKING:\n", + " from collections.abc import Generator\n", + " from numbers import Number\n", + "\n", + "sys.path.append(\"..\") # If running locally without pypi installed tiatoolbox\n", + "\n", + "from tiatoolbox import logger\n", + "from tiatoolbox.annotation.storage import (\n", + " Annotation,\n", + " DictionaryStore,\n", + " SQLiteStore,\n", + ")\n", + "\n", + "plt.style.use(\"ggplot\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nW-UyVQOpT5u" + }, + "source": [ + "## Data Generation & Utility Functions\n", + "\n", + "Here we define some useful functions to generate some artificial data\n", + "and visualise results.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N5xNW64UpT5v" + }, + "outputs": [], + "source": [ + "def cell_polygon(\n", + " xy: tuple[Number, Number],\n", + " n_points: int = 20,\n", + " radius: Number = 8,\n", + " noise: Number = 0.01,\n", + " eccentricity: tuple[Number, Number] = (1, 3),\n", + " direction: str = \"CCW\",\n", + " seed: int = 0,\n", + " *,\n", + " repeat_first: bool = True,\n", + ") -> Polygon:\n", + " \"\"\"Generate a fake cell boundary polygon.\n", + "\n", + " Borrowed from tiatoolbox unit tests.\n", + "\n", + " Cell boundaries are generated an ellipsoids with randomised eccentricity,\n", + " added noise, and a random rotation.\n", + "\n", + " Args:\n", + " xy (tuple(int)): The x,y centre point to generate the cell boundary around.\n", + " n_points (int): Number of points in the boundary. Defaults to 20.\n", + " radius (float): Radius of the points from the centre. Defaults to 10.\n", + " noise (float): Noise to add to the point locations. Defaults to 1.\n", + " eccentricity (tuple(float)): Range of values (low, high) to use for\n", + " randomised eccentricity. 
Defaults to (1, 3).\n", + " repeat_first (bool): Enforce that the last point is equal to the first.\n", + " direction (str): Ordering of the points. Defaults to \"CCW\". Valid options\n", + " are: counter-clockwise \"CCW\", and clockwise \"CW\".\n", + " seed: Seed for the random number generator. Defaults to 0.\n", + "\n", + " \"\"\"\n", + " rand_state = np.random.default_rng().__getstate__()\n", + " rng_seed = np.random.default_rng(seed)\n", + "\n", + " if repeat_first:\n", + " n_points -= 1\n", + "\n", + " # Generate points about an ellipse with random eccentricity\n", + " x, y = xy\n", + " alpha = np.linspace(0, 2 * np.pi - (2 * np.pi / n_points), n_points)\n", + " rx = radius * (rng_seed.random() + 0.5)\n", + " ry = rng_seed.uniform(*eccentricity) * radius - 0.5 * rx\n", + " x = rx * np.cos(alpha) + x + (rng_seed.random(n_points) - 0.5) * noise\n", + " y = ry * np.sin(alpha) + y + (rng_seed.random(n_points) - 0.5) * noise\n", + " boundary_coords = np.stack([x, y], axis=1).astype(int).tolist()\n", + "\n", + " # Copy first coordinate to the end if required\n", + " if repeat_first:\n", + " boundary_coords = [*boundary_coords, boundary_coords[0]]\n", + "\n", + " # Swap direction\n", + " if direction.strip().lower() == \"cw\":\n", + " boundary_coords = boundary_coords[::-1]\n", + "\n", + " polygon = Polygon(boundary_coords)\n", + "\n", + " # Add random rotation\n", + " angle = rng_seed.random() * 360\n", + " polygon = affinity.rotate(polygon, angle, origin=\"centroid\")\n", + "\n", + " # Restore the random state\n", + " np.random.default_rng().__setstate__(rand_state)\n", + "\n", + " return polygon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jyQEBhNIpT5v" + }, + "outputs": [], + "source": [ + "def cell_grid(\n", + " size: tuple[int, int] = (10, 10),\n", + " spacing: Number = 25,\n", + ") -> Generator[Polygon, None, None]:\n", + " \"\"\"Generate a grid of cell boundaries.\"\"\"\n", + " return (\n", + " 
cell_polygon(xy=np.multiply(ij, spacing), repeat_first=False, seed=n)\n", + " for n, ij in enumerate(np.ndindex(size))\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VVjSum_9pT5v" + }, + "outputs": [], + "source": [ + "def plot_results(\n", + " experiments: list[list[Number]],\n", + " title: str,\n", + " capsize: int = 5,\n", + " **kwargs: dict[str, Any],\n", + ") -> None:\n", + " \"\"\"Plot the results of a benchmark.\n", + "\n", + " Uses the min for the bar height (see See\n", + " https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat),\n", + " and plots a min-max error bar.\n", + "\n", + " \"\"\"\n", + " x = range(len(experiments))\n", + " color = [f\"C{x_i}\" for x_i in x]\n", + " plt.bar(\n", + " x=x,\n", + " height=[min(e) for e in experiments],\n", + " color=color,\n", + " yerr=[[0 for e in experiments], [max(e) - min(e) for e in experiments]],\n", + " capsize=capsize,\n", + " **kwargs,\n", + " )\n", + " for i, (runs, c) in enumerate(zip(experiments, color)):\n", + " plt.text(\n", + " i,\n", + " min(runs),\n", + " f\" {min(runs):.4f}s\",\n", + " ha=\"left\",\n", + " va=\"bottom\",\n", + " color=c,\n", + " zorder=10,\n", + " fontweight=\"bold\",\n", + " path_effects=[\n", + " patheffects.withStroke(linewidth=2, foreground=\"w\"),\n", + " ],\n", + " )\n", + " plt.title(title)\n", + " plt.hlines(\n", + " 0.5,\n", + " -0.5,\n", + " len(experiments) - 0.5,\n", + " linestyles=\"dashed\",\n", + " colors=\"black\",\n", + " alpha=0.5,\n", + " )\n", + " plt.yscale(\"log\")\n", + " plt.xlabel(\"Store Type\")\n", + " plt.ylabel(\"Time (s)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tHEUErSmpT5w" + }, + "source": [ + "## Display Some Generated Data\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YUQmgohbpT5w", + "outputId": "1a0cdee1-e32d-41e9-fb9d-26c5ee572880" + }, + "outputs": [ + { + "data": { + "image/svg+xml": "", + 
"text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for n in range(4):\n", + " display(cell_polygon(xy=(0, 0), n_points=20, repeat_first=False, seed=n))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "APUNL2PtpT5w" + }, + "source": [ + "### Randomised Cell Boundaries\n", + "\n", + "Here we create a function to generate grid of cells for testing. It uses a fixed seed for reproducibility.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SOpBKM7IpT5w" + }, + "source": [ + "### A Sample 5×5 Grid\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2xA-oG4VpT5w", + "outputId": "caea51e4-8a27-4dd1-ed0d-c272b93d8bb7" + }, + "outputs": [ + { + "data": { + "image/svg+xml": "", + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MultiPolygon(polygons=list(cell_grid(size=(5, 5), spacing=35)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b6S8vzFipT5w" + }, + "source": [ + "# Part 1: Small Scale Benchmarking of Annotation Storage\n", + "\n", + "Using the already defined data generation functions (`cell_polygon` and\n", + "`cell_grid`), we create some simple artificial cell boundaries by\n", + "creating a circle of points, adding some noise, scaling to introduce\n", + "eccentricity, and then rotating. We use 20 points per cell, which is a\n", + "reasonably high value for cell annotation. 
However, this can be\n", + "adjusted.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UZMoLDvkpT5x" + }, + "source": [ + "## 1.1) Appending Annotations (In-Memory & Disk I/O)\n", + "\n", + "Here we test:\n", + "\n", + "1. A Python dictionary based in-memory store (`DictionaryStore`)\n", + "1. An SQLite database based in-memory store (`SQLiteStore`)\n", + "\n", + "Both of these stores may operate in memory. The `SQLiteStore` may also\n", + "be backed by an on-disk file for datasets which are too large to fit in\n", + "memory. The `DictionaryStore` class can serialise/deserialise itself\n", + "to/from disk in a line delimited GeoJSON format (each line separated\n", + "by `\\n` is a valid GeoJSON object)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DZBiw_EepT5x" + }, + "outputs": [], + "source": [ + "# Convert to annotations (a dataclass pairing a geometry and (optional)\n", + "# key-value properties)\n", + "# Run time: ~2s\n", + "annotations = [\n", + " Annotation(polygon) for polygon in cell_grid(size=(100, 100), spacing=35)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LUVa03F2pT5x" + }, + "source": [ + "### 1.1.1) In Memory Append\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7PzE7AhdpT5x", + "outputId": "974bb3d0-3290-4315-a6fc-3b7ca90072a6" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~5s\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " \"dict_store.append_many(annotations)\",\n", + " setup=\"dict_store = DictionaryStore()\",\n", + " globals={\"DictionaryStore\": DictionaryStore, \"annotations\": annotations},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " \"sql_store.append_many(annotations)\",\n", + " setup=\"sql_store = SQLiteStore()\",\n", + " globals={\"SQLiteStore\": SQLiteStore, \"annotations\": annotations},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"Time to Append 10,000 Annotations In Memory\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n", + "plt.xlim([-0.5, 1.5])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gU6PLE7wpT5x" + }, + "source": [ + "Note that inserting into the `SQLiteStore` is much slower than the\n", + "`DictionaryStore`. Appending to a `Dictionary` store simply requires\n", + "adding a memory reference to a dictionary. Therefore, this is a very\n", + "fast operation. On the other hand, for the `SQLiteStore`, the insertion\n", + "is slower because the data must be serialised for the database and the\n", + "R-Tree spatial index must also be updated. Updating the index is a\n", + "relatively expensive operation. However, this spatial index allows for\n", + "very fast queries of a very large set of annotations within a set of\n", + "spatial bounds.\n", + "\n", + "Insertion is typically only performed once for each\n", + "annotation, whereas queries may be performed many times on the\n", + "annotation set. 
Therefore, it makes sense to trade a more expensive\n", + "insertion for fast queries as the cost of insertion will be amortised\n", + "over a number of queries on the data. Additionally, data may be written\n", + "to the database from multiple threads or subprocesses (so long as a new\n", + "instance of `SQLiteStore` is created for each thread or subprocess to\n", + "attach to a database on disk) thus freeing up the main thread.\n", + "\n", + "For comparison, we also compare bulk insertion plus serialising to disk\n", + "as line-delimited GeoJSON from the `DictionaryStore` as this is the\n", + "default serialisation to disk method (`DictionaryStore.dump(file_path)`).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t2q9QTCfpT5x", + "outputId": "2202c328-ba48-476b-8efa-662678d75135" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~10s\n", + "\n", + "setup = \"fp.truncate(0)\\nstore = Store(fp)\" # Clear the file\n", + "\n", + "# Time dictionary store\n", + "with tempfile.NamedTemporaryFile(\"w+\") as fp:\n", + " dict_runs = timeit.repeat(\n", + " (\"store.append_many(annotations)\\nstore.commit()\"),\n", + " setup=setup,\n", + " globals={\"Store\": DictionaryStore, \"annotations\": annotations, \"fp\": fp},\n", + " number=1,\n", + " repeat=3,\n", + " )\n", + "\n", + "# Time SQLite store\n", + "with tempfile.NamedTemporaryFile(\"w+b\") as fp:\n", + " sqlite_runs = timeit.repeat(\n", + " (\"store.append_many(annotations)\\nstore.commit()\"),\n", + " setup=setup,\n", + " globals={\"Store\": SQLiteStore, \"annotations\": annotations, \"fp\": fp},\n", + " number=1,\n", + " repeat=3,\n", + " )\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"Time to Append & Serialise 10,000 Annotations To Disk\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n", + "plt.xlim([-0.5, 1.5])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LKr6FmctpT5x" + }, + "source": [ + "Here we can see that when we include the serialisation to disk in the\n", + "benchmark, the time to insert is much more similar.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V7WV8wNmpT5x" + }, + "source": [ + "## 1.2) Box Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eul4PYZPpT5x", + "outputId": "a0131a72-f527-48b1-8aac-8cbccfced2ed" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~20s\n", + "\n", + "# One time Setup\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "dict_store.append_many(annotations)\n", + "sql_store.append_many(annotations)\n", + "\n", + "rng = np.random.default_rng(123)\n", + "boxes = [\n", + " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n", + "]\n", + "stmt = \"for box in boxes:\\n _ = store.query(box)\"\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": dict_store, \"boxes\": boxes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": sql_store, \"boxes\": boxes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"100 Box Queries\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z9ntCgKapT5x" + }, + "source": [ + "Here we can see that the `SQLiteStore` is a bit faster. Addtionally,\n", + "difference in performance is more pronounced when there are more\n", + "annotations (as we will see later in this notebook) in the store or when\n", + "just returning keys:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vfGH6e4upT5x", + "outputId": "7cf8bf30-a4c9-4de5-9a5f-f9fd6cffc141" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~15s\n", + "\n", + "# One time Setup\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "dict_store.append_many(annotations)\n", + "sql_store.append_many(annotations)\n", + "\n", + "rng = np.random.default_rng(123)\n", + "boxes = [\n", + " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n", + "]\n", + "stmt = \"for box in boxes:\\n _ = store.iquery(box)\" # Just return the keys (uuids)\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": dict_store, \"boxes\": boxes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": sql_store, \"boxes\": boxes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"100 Box Queries (Key Lookup Only)\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xVQlsK1MpT5y" + }, + "source": [ + "## 1.3) Polygon Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fnkdnKWRpT5y", + "outputId": "03ccc35c-df96-4d68-9d53-72ac835a9088" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~15s\n", + "\n", + "# One time Setup\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "dict_store.append_many(annotations)\n", + "sql_store.append_many(annotations)\n", + "\n", + "rng = np.random.default_rng(123)\n", + "query_polygons = [\n", + " Polygon(\n", + " [\n", + " (x, y),\n", + " (x + 128, y),\n", + " (x + 128, y + 128),\n", + " (x, y),\n", + " ],\n", + " )\n", + " for x, y in rng.integers(0, 1000, size=(100, 2))\n", + "]\n", + "stmt = \"for polygon in query_polygons:\\n _ = store.query(polygon)\"\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": dict_store, \"query_polygons\": query_polygons},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": sql_store, \"query_polygons\": query_polygons},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"100 Polygon Queries\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1k1xOgB5pT5y" + }, + "source": [ + "Here we can see that performing queries within a polygon region is about\n", + "10x faster with the `SQLiteStore` than with the `DictionaryStore`.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iYFK95w1pT5y" + }, + "source": [ + "## 1.4) Predicate Query\n", + "\n", + "Here we query the whole annotation region but with a predicate to\n", + "select only annotations with the class label of 0. 
We also\n", + "demonstrate how creating a database index can dramatically improve\n", + "the performance of queries.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zNX4UG4BpT5y", + "outputId": "97444739-4aa5-42c7-bebc-84a022282ac7" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~2m\n", + "\n", + "# Setup\n", + "labelled_annotations = copy.deepcopy(annotations)\n", + "for n, annotation in enumerate(labelled_annotations):\n", + " annotation.properties[\"class\"] = n % 10\n", + " annotation.properties[\"vector\"] = rng.integers(1, 4, 10).tolist()\n", + "\n", + "predicate = \"(props['class'] == ?) & (3 in props['vector'])\"\n", + "classes = rng.integers(0, 10, size=100)\n", + "stmt = \"for n in classes:\\n store.query(where=predicate.replace('?', str(n)))\"\n", + "\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "\n", + "dict_store.append_many(labelled_annotations)\n", + "sql_store.append_many(labelled_annotations)\n", + "\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": dict_store, \"predicate\": predicate, \"classes\": classes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "dict_result = dict_store.query(where=predicate.replace(\"?\", \"0\"))\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "sql_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n", + "\n", + "\n", + "# Add an index\n", + "# Note: Indexes may not always speed up the query (sometimes they can\n", + "# actually slow it down), test to make sure.\n", + "sql_store.create_index(\"class_lookup\", \"props['class']\")\n", + "sql_store.create_index(\"has_3\", \"3 in props['vector']\")\n", + "\n", + "# Time SQLite store again\n", + "sqlite_index_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "sql_index_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n", + 
"\n", + "# # Validate the results against each other\n", + "# for a, b, c in zip(dict_result, sql_result, sql_index_result):\n", + "# assert a.geometry == b.geometry == c.geometry # noqa: ERA001\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs, sqlite_index_runs],\n", + " title=\"100 Queries with a Predicate\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"SQLiteStore\\n(with index)\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gp8mq1TNpT5y" + }, + "source": [ + "### Polygon & Predicate Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Eu0hGvhdpT5y", + "outputId": "0d89174e-01e0-4e71-a9c3-e063ed30ca38" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~10s\n", + "\n", + "# Setup\n", + "labelled_annotations = copy.deepcopy(annotations)\n", + "for n, annotation in enumerate(labelled_annotations):\n", + " annotation.properties[\"class\"] = n % 10\n", + "\n", + "predicate = \"props['class'] == \"\n", + "classes = rng.integers(0, 10, size=50)\n", + "query_polygons = [\n", + " Polygon(\n", + " [\n", + " (x, y),\n", + " (x + 128, y),\n", + " (x + 128, y + 128),\n", + " (x, y),\n", + " ],\n", + " )\n", + " for x, y in rng.integers(0, 1000, size=(100, 2))\n", + "]\n", + "stmt = (\n", + " \"for n, poly in zip(classes, query_polygons):\\n\"\n", + " \" store.query(poly, where=predicate + str(n))\"\n", + ")\n", + "\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "\n", + "dict_store.append_many(labelled_annotations)\n", + "sql_store.append_many(labelled_annotations)\n", + "\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\n", + " \"store\": dict_store,\n", + " \"predicate\": predicate,\n", + " \"classes\": classes,\n", + " \"query_polygons\": query_polygons,\n", + " },\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "dict_result = dict_store.query(query_polygons[0], where=predicate + \"0\")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\n", + " \"store\": sql_store,\n", + " \"predicate\": predicate,\n", + " \"classes\": classes,\n", + " \"query_polygons\": query_polygons,\n", + " },\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "sql_result = sql_store.query(query_polygons[0], where=predicate + \"0\")\n", + "\n", + "\n", + "# Check that the set difference of bounding boxes is empty i.e. all sets\n", + "# of results contain polygons which produce the same set of bounding\n", + "# boxes. 
This avoids being tripped up by slight variations in order or\n", + "# coordinate order between the results.\n", + "dict_set = {x.geometry.bounds for x in dict_result}\n", + "sql_set = {x.geometry.bounds for x in sql_result}\n", + "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"100 Queries with a Polygon and Predicate\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kJ8x5tJmpT5y" + }, + "source": [ + "### Complex Predicate Query\n", + "\n", + "Here we slightly increase the complexity of the predicate to show how\n", + "the complexity of a predicate can dramatically affect the performance\n", + "when handling many annotations.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VHb4PqbHpT5y", + "outputId": "343b44c7-741d-4e11-9dd2-85f357ba6f32" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run time: ~1m\n", + "\n", + "# Setup\n", + "box = Polygon.from_bounds(0, 0, 1024, 1024)\n", + "labelled_annotations = copy.deepcopy(annotations)\n", + "for n, annotation in enumerate(labelled_annotations):\n", + " annotation.properties[\"class\"] = n % 4\n", + " annotation.properties[\"n\"] = n\n", + "\n", + "predicate = \"(props['n'] > 1000) & (props['n'] % 4 == 0) & (props['class'] == \"\n", + "targets = rng.integers(0, 4, size=100)\n", + "stmt = \"for n in targets:\\n store.query(box, where=predicate + str(n) + ')')\"\n", + "\n", + "dict_store = DictionaryStore()\n", + "sql_store = SQLiteStore()\n", + "\n", + "dict_store.append_many(labelled_annotations)\n", + "sql_store.append_many(labelled_annotations)\n", + "\n", + "\n", + "# Time dictionary store\n", + "dict_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\n", + " \"store\": dict_store,\n", + " \"predicate\": predicate,\n", + " \"targets\": targets,\n", + " \"box\": box,\n", + " },\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "dict_result = dict_store.query(box, where=predicate + \"0)\")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " stmt,\n", + " globals={\n", + " \"store\": sql_store,\n", + " \"predicate\": predicate,\n", + " \"targets\": targets,\n", + " \"box\": box,\n", + " },\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "sql_result = sql_store.query(box, where=predicate + \"0)\")\n", + "\n", + "\n", + "# Check that the set difference of bounding boxes is empty i.e. all sets\n", + "# of results contain polygons which produce the same set of bounding\n", + "# boxes. 
This avoids being tripped up by slight variations in order or\n", + "# coordinate order between the results.\n", + "dict_set = {x.geometry.bounds for x in dict_result.values()}\n", + "sql_set = {x.geometry.bounds for x in sql_result.values()}\n", + "\n", + "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n", + "\n", + "# Plot the results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"100 Queries with a Complex Predicate\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CAT0KmS6pT5y" + }, + "source": [ + "# Part 2: Large Scale Dataset Benchmarking\n", + "\n", + "Here we generate some sets of annotations with five million items each\n", + "(in a 2237 x 2237 grid). One is a set of points, the other a set of\n", + "generated cell boundaries.\n", + "\n", + "The code to generate and write out the annotations to various formats is\n", + "included in the following cells. However, some of these take a very long\n", + "time to run. A pre-generated dataset is downloaded and then read from\n", + "disk instead to save time. However, you may uncomment the generation\n", + "code to replicate the original.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nwH5zYFupT5y" + }, + "source": [ + "## 2.1) Points Dataset\n", + "\n", + "Here we generate a simple points dataset in a grid. The grid is 2237 x 2237\n", + "and contains over 5 million points. We also write this to disk in\n", + "various formats. Some formats take a long time and are commented out. 
A\n", + "summary of times for a consumer laptop is shown in a table at the end.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2FjCL2jgpT5y" + }, + "outputs": [], + "source": [ + "# Generate some points with a little noise\n", + "# Run time: ~5s\n", + "points = np.array(\n", + " [\n", + " [x, y]\n", + " for x in np.linspace(0, 75_000, 2237)\n", + " for y in np.linspace(0, 75_000, 2237)\n", + " ],\n", + ")\n", + "# Add some noise between -1 and 1\n", + "rng_42 = np.random.default_rng(42)\n", + "points += rng_42.uniform(-1, 1, size=(2237**2, 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRWABSBVpT5z" + }, + "source": [ + "### 2.1.1) Writing To Disk\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x76WbSFdpT52" + }, + "outputs": [], + "source": [ + "# Save as a simple NumPy array (.npy)\n", + "# Run time: <1s\n", + "np.save(\"points.npy\", points)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dkKtM-DKpT52" + }, + "outputs": [], + "source": [ + "# Save as compressed NumPy archive (.npz)\n", + "# Run time: ~5s\n", + "np.savez_compressed(\"points.npz\", points)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rbHdEIbPpT52" + }, + "source": [ + "Note that the above NumPy format is missing the keys (UUIDs) of each point.\n", + "This may not be required in all cases. However, for the sake of comparison\n", + "we also generate a NumPy archive with keys included. We store the UUIDs\n", + "as integers to save space and for a fair comparison where the optimal\n", + "storage method is used in each case. 
Note however that UUIDs are too\n", + "large to be a standard C type and therefore are stored as an object\n", + "array.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DbLm4l5tpT52" + }, + "outputs": [], + "source": [ + "# Generate UUIDs\n", + "# Run time: ~10s\n", + "keys = np.array([uuid.uuid4().int for _ in range(len(points))])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zXuAqw0KpT52" + }, + "outputs": [], + "source": [ + "# Generate some UUIDs as keys\n", + "# Save in NumPy format (.npz)\n", + "# Run time: <1s\n", + "np.savez(\"uuid_points.npz\", keys=keys, coords=points)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UAHAgPU4pT52" + }, + "outputs": [], + "source": [ + "# Save in compressed (zip) NumPy format (.npz)\n", + "# Run time: ~10s\n", + "np.savez_compressed(\"uuid_points_compressed.npz\", keys=keys, coords=points)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j5wlDFYfpT52" + }, + "outputs": [], + "source": [ + "# Write to SQLite with SQLiteStore\n", + "# Run time: ~10m\n", + "points_sqlite_store = SQLiteStore(\"points.db\")\n", + "_ = points_sqlite_store.append_many(\n", + " annotations=(Annotation(Point(x, y)) for x, y in points),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tUekiEqspT53" + }, + "outputs": [], + "source": [ + "# Load a DictionaryStore into memory by copying from the SQLiteStore\n", + "# Run time: ~1m 30s\n", + "points_dict_store = DictionaryStore(Path(\"points.ndjson\"))\n", + "for key, value in points_sqlite_store.items():\n", + " points_dict_store[key] = value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Uynntjq7pT53" + }, + "outputs": [], + "source": [ + "# Save as GeoJSON\n", + "# Run time: ~1m 30s\n", + "points_sqlite_store.to_geojson(\"points.geojson\")" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4YMuggcgpT53" + }, + "outputs": [], + "source": [ + "# Save as ndjson\n", + "# Run time: ~1m 30s\n", + "# Spec: https://github.com/ndjson/ndjson-spec\n", + "points_sqlite_store.to_ndjson(\"points.ndjson\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lW9NoCPwpT53" + }, + "source": [ + "### 2.1.2) Points Dataset Statistics Summary\n", + "\n", + "| Format | Write Time | Size |\n", + "| -----------------------------: | ---------: | -----: |\n", + "| SQLiteStore (.db) | 6m 20s | 893 MB |\n", + "| ndjson | 1m 23s | 667 MB |\n", + "| GeoJSON | 1m 42s | 500 MB |\n", + "| NumPy + UUID (.npz) | 0.5s | 165 MB |\n", + "| NumPy + UUID Compressed (.npz) | 31s | 136 MB |\n", + "| NumPy (.npy) | 0.1s | 76 MB |\n", + "| NumPy Compressed (.npz) | 3.3s | 66 MB |\n", + "\n", + "Note that the points SQLite database is significantly larger than the\n", + "NumPy arrays on disk. The NumPy array is much more storage efficient\n", + "partly because there is no R Tree index or unique identifier (UUID)\n", + "stored for each point. For a more fair comparison, another NumPy archive\n", + "(.npz) is created where the keys are stored along with the coordinates.\n", + "\n", + "Also note that although the compressed NumPy representation is much\n", + "smaller, it must be decompressed in memory before it can be used. The\n", + "uncompressed versions may be memory mapped if their size exceeds the\n", + "available memory.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a_3Gz5Q0pT53" + }, + "source": [ + "### 2.1.3) Simple Box Query\n", + "\n", + "Here we evaluate the performance of performing a simple box query on the\n", + "data. All points which are in the area between 128 and 256 in the x and\n", + "y coordinates are retrieved. It is assumed that the data is already in\n", + "memory for the NumPy formats. 
In reality this would not be the case for\n", + "the first query; all data would have to be read from disk, which is a\n", + "significant overhead. However, this cost is amortised across many\n", + "queries. To ensure the fairest possible comparison, it is assumed that\n", + "many queries will be performed, and that this data loading cost is\n", + "negligible.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "o9J0d6gdpT53" + }, + "outputs": [], + "source": [ + "box = Polygon.from_bounds(128, 128, 256, 256)\n", + "\n", + "# Time numpy\n", + "numpy_runs = timeit.repeat(\n", + " (\n", + " \"where = np.all([\"\n", + " \"points[:, 0] > 128,\"\n", + " \"points[:, 0] < 256,\"\n", + " \"points[:, 1] > 128,\"\n", + " \"points[:, 1] < 256\"\n", + " \"], 0)\\n\"\n", + " \"uuids = keys[where]\\n\"\n", + " \"result = points[where]\\n\"\n", + " ),\n", + " globals={\"keys\": keys, \"points\": points, \"np\": np},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time SQLiteStore\n", + "sqlite_runs = timeit.repeat(\n", + " \"store.query(box)\",\n", + " globals={\"store\": points_sqlite_store, \"box\": box},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time DictionaryStore\n", + "dict_runs = timeit.repeat(\n", + " \"store.query(box)\",\n", + " globals={\"store\": points_dict_store, \"box\": box},\n", + " number=1,\n", + " repeat=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eX1qqUIipT53", + "outputId": "a4033a88-6b2d-4a55-f3f6-ba419ef748c0" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs, numpy_runs],\n", + " title=\"Points Box Query (5 Million Points)\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"NumPy Array\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aNU6FP90pT53" + }, + "source": [ + "Although the NumPy array is very space efficient on disk, it is not as\n", + "fast to query as the `SQLiteStore`. The `SQLiteStore` is likely faster\n", + "due to the use of the R tree index. Furthermore, the method used to\n", + "store the points in a NumPy array is limited in that it does not use\n", + "UUIDs, which makes merging two datasets more difficult as the indexes of\n", + "points no longer uniquely identify them. Additionally, only homogeneous\n", + "data such as two-dimentional coordinates can be practically stored in\n", + "this way. If the user would like to store variable length data\n", + "structures such as polygons, or even mix data types by storing both\n", + "points and polygons, then using raw NumPy arrays in this way can become\n", + "cumbersome and begins to offer little benefit in terms of storage\n", + "efficient or query performance.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c766NXGPpT53" + }, + "source": [ + "### 2.1.4) Polygon Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6jiMpRnxpT53" + }, + "outputs": [], + "source": [ + "big_triangle = Polygon(\n", + " shell=[ # noqa: S604\n", + " (1024, 1024),\n", + " (1024, 4096),\n", + " (4096, 4096),\n", + " (1024, 1024),\n", + " ],\n", + ")\n", + "\n", + "# Time SQLiteStore\n", + "sqlite_runs = timeit.repeat(\n", + " \"store.query(polygon)\",\n", + " globals={\"store\": points_sqlite_store, \"polygon\": big_triangle},\n", + " number=1,\n", + " repeat=10,\n", + ")\n", + "\n", + "# Time 
DictionaryStore\n", + "dict_runs = timeit.repeat(\n", + " \"store.query(polygon)\",\n", + " globals={\"store\": points_dict_store, \"polygon\": big_triangle},\n", + " number=1,\n", + " repeat=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Es2OQ5OdpT53", + "outputId": "b98176ee-7003-49f7-f5ca-62b08180b2ee" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"Polygon Query (5 Million Points)\",\n", + " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUBEmZDMpT53" + }, + "source": [ + "## 2.2) Cell Boundary Polygons Dataset\n", + "\n", + "Here we generate a much larger and more complex polygon dataset. This\n", + "consistes of a grid of over 5 million generated cell boundary like\n", + "polygons.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xhCr_TDVpT53", + "outputId": "c02b7a20-6ab1-4cae-b6bb-fb5c6d94cd12" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5004169/5004169 [10:04<00:00, 8277.35it/s] \n" + ] + } + ], + "source": [ + "# Generate a grid of 5 million cell boundary polygons (2237 x 2237)\n", + "# Run time: ~10m\n", + "rng_42 = np.random.default_rng(42)\n", + "\n", + "cell_polygons = [\n", + " Annotation(geometry=polygon, properties={\"class\": rng_42.integers(0, 4)})\n", + " for polygon in tqdm(cell_grid(size=(2237, 2237), spacing=35), total=2237**2)\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "21RgwKtgpT54" + }, + "source": [ + "### 2.2.1) Write To Formats For Comparison\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CDVLMRUtpT54" + }, + "outputs": [], + "source": [ + "# Write to an SQLiteStore on disk (SSD for recorded times here)\n", + "# Run time: ~30m\n", + "cell_sqlite_store = SQLiteStore(\"cells.db\")\n", + "_ = cell_sqlite_store.append_many(annotations=cell_polygons)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6Fb4tQHVpT54", + "outputId": "fba12c47-e0cb-44fd-ca95-35c38454c9cc" + }, + "outputs": [ + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "# Create a copy as an in memory DictionaryStore\n", + "# Run time: ~5m\n", + "cell_dict_store = DictionaryStore()\n", + "for key, value in tqdm( # Show a nice progress bar\n", + " cell_sqlite_store.items(),\n", + " total=len(cell_sqlite_store),\n", + " leave=False,\n", + " position=0,\n", + "):\n", + " cell_dict_store[key] = value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wXOOuGWypT54", + "outputId": "e2fb300e-e5b8-4459-b172-249cda363b50" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5004169/5004169 [01:26<00:00, 58002.74it/s]\n" + ] + } + ], + "source": [ + "# Transform into a numpy array\n", + "# Run Time: ~1m\n", + "cell_polygons_np = np.array(\n", + " [np.array(a.geometry.exterior.coords) for a in tqdm(cell_polygons)],\n", + " dtype=object,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yv9VgW9TpT54" + }, + "outputs": [], + "source": [ + "# Create an Nx4 index of (xmin, ymin, xmax, ymax) as a simple spatial\n", + "# index to speed up the numpy query.\n", + "# Run time: ~1m\n", + "min_max_index = np.array(\n", + " [(*np.min(coords, 0), *np.max(coords, 0)) for coords in cell_polygons_np],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nFmHxwBwpT54" + }, + "outputs": [], + "source": [ + "# Write to GeoJSON\n", + "# Run time: ~10m\n", + "\n", + "cell_dict_store.to_geojson(\"cells.geojson\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2UH6WdmipT54" + }, + "outputs": [], + "source": [ + "# Write to line delimited JSON (ndjson)\n", + "# Run time: ~10m\n", + "\n", + "cell_dict_store.to_ndjson(\"cells.ndjson\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fw6wg5gapT54", + "outputId": 
"61a32277-fb8d-4bdc-be28-b379cb0a23eb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cells.ndjson : 40.82% ( 8.82 GiB => 3.60 GiB, cells.ndjson.zstd) \n" + ] + } + ], + "source": [ + "# Zstandard compression of ndjson to demonstrate how well it compresses.\n", + "# Gzip may also be used but is slower to compress.\n", + "# Run time: ~1m\n", + "! zstd -f -k cells.ndjson -o cells.ndjson.zstd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rzGC65zhpT55", + "outputId": "75ad772b-5641-4d64-ae16-7d50206e1b85" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cells.db : 75.87% ( 4.87 GiB => 3.69 GiB, cells.db.zstd) \n" + ] + } + ], + "source": [ + "# Zstandard compression of sqlite to demonstrate how well it compresses.\n", + "# Gzip may also be used but is slower to compress.\n", + "# Run time: ~20s\n", + "! zstd -f -k cells.db -o cells.db.zstd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xT0KZLxdpT55" + }, + "outputs": [], + "source": [ + "# Write as a pickle (list)\n", + "# Run time: ~2m\n", + "with Path(\"cells.pickle\").open(\"wb\") as fh:\n", + " pickle.dump(cell_polygons, fh)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-TAWGEu9pT55" + }, + "outputs": [], + "source": [ + "# Write as a pickle (dict)\n", + "# Run time: ~15m\n", + "with Path(\"cells-dict.pickle\").openI(\"wb\") as fh:\n", + " pickle.dump(cell_dict_store._rows, fh) # noqa: SLF001" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I-W4o3GepT55" + }, + "outputs": [], + "source": [ + "# Write dictionary store to a pickle\n", + "# Run time: ~20m\n", + "with Path(\"cells.pickle\").open(\"wb\") as fh:\n", + " pickle.dump(cell_dict_store, fh)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dALe8k0BpT55" + }, + "outputs": [], + 
"source": [ + "# Write as numpy object array (similar to writing out with pickle),\n", + "# Numpy cannot handle ragged arrays and therefore dtype must be object.\n", + "# Run time: ~30m\n", + "np.save(\"cells.npy\", np.asanyarray(cell_polygons_np, dtype=object))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hOrGS0HgpT55" + }, + "outputs": [], + "source": [ + "# Create UUIDs, and get the class labels for each cell boundary\n", + "# Run time: ~2m\n", + "_uuids = [str(uuid.uuid4) for _ in cell_polygons]\n", + "_cls = [x.properties[\"class\"] for x in cell_polygons]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Fs2cz8lVpT55" + }, + "outputs": [], + "source": [ + "# Write as NumPy archive (.npz) with uuid and min_max_index\n", + "# Run time: ~40m\n", + "np.savez(\n", + " \"cells.npz\",\n", + " uuids=_uuids,\n", + " polygons=cell_polygons_np,\n", + " min_max_index=min_max_index,\n", + " cls=_cls,\n", + ")\n", + "\n", + "del _uuids, _cls" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4gOTqc03pT55" + }, + "source": [ + "### 2.2.2) Time To Write Summary Statistics\n", + "\n", + "The following is a summary of the time required to write each format to\n", + "disk and the total disk space occupied by the final output.\n", + "\n", + "Note that some of these formats, such as GeoJSON compress well with\n", + "schemes such as gzip and zstd, reducing the disk space by approximately\n", + "half. Statistics for zstd compressed data is also reported below. 
It\n", + "should be noted that the data must be decompressed to be usable.\n", + "However, for gzip and zstd, this may be done in a streaming fashion from\n", + "disk.\n", + "\n", + "| Format | Write Time | Size |\n", + "| ----------------: | ---------: | -----: |\n", + "| SQLiteStore (.db) | 33m 48.4s | 4.9 GB |\n", + "| GeoJSON | 11m 32.9s | 8.9 GB |\n", + "| ndjson | 9m 0.9s | 8.8 GB |\n", + "| pickle | 1m 2.9s | 1.8 GB |\n", + "| zstd (SQLite) | 18.2s | 3.7 GB |\n", + "| zstd (ndjson) | 43.7s | 3.6 GB |\n", + "| NumPy (.npy) | 50.3s | 1.8 GB |\n", + "| NumPy (.npz) | 55.3s | 2.6 GB |\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wS3sGpnWpT55" + }, + "source": [ + "### 2.2.3) Box Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MKvKfkyvpT55" + }, + "outputs": [], + "source": [ + "# Run time: ~5m\n", + "\n", + "# Setup\n", + "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n", + "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n", + "\n", + "\n", + "# Time DictionaryStore\n", + "dict_runs = timeit.repeat(\n", + " \"store.query(box)\",\n", + " globals={\"store\": cell_dict_store, \"box\": box},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " \"store.query(box)\",\n", + " globals={\"store\": cell_sqlite_store, \"box\": box},\n", + " number=1,\n", + " repeat=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0Yo14C3kpT55", + "outputId": "764bc28b-3072-4887-ea88-4c88ffcefb5f" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"Box Query (5 Million Polygons)\",\n", + " tick_label=[\n", + " \"DictionaryStore\",\n", + " \"SQLiteStore\",\n", + " ],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ExF-fOGQpT56" + }, + "source": [ + "### 2.2.4) Polygon Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PcxKapqNpT56" + }, + "outputs": [], + "source": [ + "# Run Time: 35s\n", + "\n", + "# Setup\n", + "big_triangle = Polygon(\n", + " shell=[ # noqa: S604\n", + " (1024, 1024),\n", + " (1024, 4096),\n", + " (4096, 4096),\n", + " (1024, 1024),\n", + " ],\n", + ")\n", + "\n", + "\n", + "# Time DictionaryStore\n", + "dict_runs = timeit.repeat(\n", + " \"store.query(polygon)\",\n", + " globals={\"store\": cell_dict_store, \"polygon\": big_triangle},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time SQLite store\n", + "sqlite_runs = timeit.repeat(\n", + " \"store.query(polygon)\",\n", + " globals={\"store\": cell_sqlite_store, \"polygon\": big_triangle},\n", + " number=1,\n", + " repeat=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vqHA50DQpT56", + "outputId": "7e837f4c-ada9-400f-b5f3-c59430b137f3" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs],\n", + " title=\"Polygon Query (5 Million Polygons)\",\n", + " tick_label=[\n", + " \"DictionaryStore\",\n", + " \"SQLiteStore\",\n", + " ],\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6m-E5AwapT56" + }, + "source": [ + "### 2.2.5) Predicate Query\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "whEn34rOpT56" + }, + "outputs": [], + "source": [ + "# Run Time: ~10m\n", + "\n", + "# Setup\n", + "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n", + "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n", + "predicate = \"props['class'] == 0\"\n", + "\n", + "# Time DictionaryStore\n", + "dict_runs = timeit.repeat(\n", + " \"store.query(box, predicate)\",\n", + " globals={\"store\": cell_dict_store, \"box\": box, \"predicate\": predicate},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time SQLiteStore\n", + "sqlite_runs = timeit.repeat(\n", + " \"store.query(box, where=predicate)\",\n", + " globals={\"store\": cell_sqlite_store, \"box\": box, \"predicate\": predicate},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "np_stmt = f\"\"\"\n", + "polygons = [\n", + " polygon\n", + " for polygon in tqdm(cell_polygons_np)\n", + " if np.all([\n", + " np.max(polygon, 0) >= ({xmin}, {ymin}), np.min(polygon, 0) <= ({xmax}, {ymax})\n", + " ])\n", + "]\n", + "\"\"\"\n", + "\n", + "# Time numpy\n", + "numpy_runs = timeit.repeat(\n", + " np_stmt,\n", + " globals={\"cell_polygons_np\": cell_polygons_np, \"np\": np, \"tqdm\": lambda x: x},\n", + " number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time shapely\n", + "shapely_runs = timeit.repeat(\n", + " \"polygons = [box.intersects(ann.geometry) for ann in cell_polygons]\",\n", + " globals={\"box\": box, \"cell_polygons\": cell_polygons},\n", + " 
number=1,\n", + " repeat=3,\n", + ")\n", + "\n", + "# Time box indexed numpy\n", + "numpy_index_runs = timeit.repeat(\n", + " \"in_box = np.all(min_max_index[:, :2] <= (xmax, ymax), 1) \"\n", + " \"& np.all(min_max_index[:, 2:] >= (xmin, ymin), 1)\\n\"\n", + " \"polygons = [p for p, w in zip(cell_polygons, in_box) if w]\",\n", + " globals={\n", + " \"min_max_index\": min_max_index,\n", + " \"xmin\": xmin,\n", + " \"ymin\": ymin,\n", + " \"xmax\": xmax,\n", + " \"ymax\": ymax,\n", + " \"np\": np,\n", + " \"cell_polygons\": cell_polygons,\n", + " },\n", + " number=1,\n", + " repeat=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oRxJTg7BpT56", + "outputId": "d235e51a-5109-486e-b779-fe39e5f6ee33" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Run Time: ~5s\n", + "\n", + "# Plot results\n", + "plot_results(\n", + " experiments=[dict_runs, sqlite_runs, numpy_runs, shapely_runs, numpy_index_runs],\n", + " title=\"Box Query\",\n", + " tick_label=[\n", + " \"DictionaryStore\",\n", + " \"SQLiteStore\",\n", + " \"NumPy\\n(Simple Loop)\",\n", + " \"Shapely\\n(Simple Loop)\",\n", + " \"NumPy\\n(With Bounds Index)\",\n", + " ],\n", + ")\n", + "plt.xticks(rotation=90)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LJiGGkespT56" + }, + "source": [ + "## 2.3) Size vs Approximate Lower Bound\n", + "\n", + "Here we calculate an estimated lower bound on file size by finding the\n", + "the Shannon entropy of each file. This tells us the theoretical minimum\n", + "number of bits per byte. The lowest lower bound is then used as an\n", + "estimate of the minimum file size possible to store the annotation data.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0IO10faZpT56", + "outputId": "033c2530-072a-4aa5-cf34-c2298e90d86f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Approximate Lower Bound Size: 3.60 GB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r" + ] + } + ], + "source": [ + "# Run Time: ~5m\n", + "\n", + "\n", + "# Files to consider containing keys, geometry, and properties.\n", + "# Files which are missing keys e.g. 
cells.pickle are excluded\n", + "# for a fair comparison.\n", + "file_names = [\n", + " \"cells-dicionary-store.pickle\",\n", + " \"cells-dict.pickle\",\n", + " \"cells.db\",\n", + " \"cells.db.zstd\",\n", + " \"cells.geojson\",\n", + " \"cells.ndjson\",\n", + " \"cells.ndjson.zstd\",\n", + "]\n", + "\n", + "\n", + "def human_readible_bytes(byte_count: int) -> tuple[int, str]:\n", + " \"\"\"Convert bytes to human readble size and suffix.\"\"\"\n", + " byte_count_ref = 1024\n", + " for suffix in [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]:\n", + " if byte_count < byte_count_ref:\n", + " return byte_count, suffix\n", + " byte_count /= byte_count_ref\n", + " return byte_count, \"PB\"\n", + "\n", + "\n", + "def shannon_entropy(\n", + " fp: Path,\n", + " sample_size: int = 1e9, # 1GiB\n", + " stride: int = 7,\n", + " skip: int = 1e5, # 100KiB\n", + ") -> float:\n", + " \"\"\"Calculate the Shannon entropy of a file from a sample.\n", + "\n", + " The first `skip` bytes are skipped to avoid sampling low entropy\n", + " (highly ordered) parts which commonly occur at the beginning e.g.\n", + " headers.\n", + "\n", + " Args:\n", + " fp: File path to calculate entropy of.\n", + " sample_size: Number of bytes to sample from the file.\n", + " stride: Number of bytes to skip between samples.\n", + " skip: Number of bytes to skip before sampling.\n", + " \"\"\"\n", + " npmmap = np.memmap(Path(fp), dtype=np.uint8, mode=\"r\")\n", + " values, counts = np.unique(\n", + " npmmap[int(skip) : int(skip + (sample_size * stride)) : int(stride)],\n", + " return_counts=True,\n", + " )\n", + " total = np.sum(counts)\n", + " frequencies = {v: 0 for v in range(256)}\n", + " for v, x in zip(values, counts):\n", + " frequencies[v] = x / total\n", + " frequency_array = np.array(list(frequencies.values()))\n", + " epsilon = 1e-16\n", + " return -np.sum(frequency_array * np.log2(frequency_array + epsilon))\n", + "\n", + "\n", + "# Find the min across all of the representations for the lowest lower\n", + 
"# bound.\n", + "bytes_lower_bounds = {\n", + " path: (\n", + " shannon_entropy(Path(path)) / 8 * len(np.memmap(path, dtype=np.uint8, mode=\"r\"))\n", + " )\n", + " for path in tqdm(\n", + " [Path.cwd() / name for name in file_names],\n", + " position=0,\n", + " leave=False,\n", + " )\n", + "}\n", + "\n", + "lowest_bytes_lower_bound = min(bytes_lower_bounds.values())\n", + "\n", + "size, suffix = human_readible_bytes(lowest_bytes_lower_bound)\n", + "logger.info(\"Approximate Lower Bound Size: %2f %s\", size, suffix)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "chwB3zeupT56" + }, + "source": [ + "### Plot Results\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cu5jkrVppT56", + "outputId": "bb36aea5-d5d7-4560-a853-d2a8afba0eac" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get file sizes\n", + "file_sizes = {\n", + " path: path.stat().st_size for path in [Path.cwd() / name for name in file_names]\n", + "}\n", + "\n", + "# Sort by size\n", + "file_sizes = dict(sorted(file_sizes.items(), key=lambda x: x[1]))\n", + "\n", + "# Plot\n", + "plt.bar(\n", + " x=range(len(file_sizes)),\n", + " height=file_sizes.values(),\n", + " tick_label=[p.name for p in file_sizes],\n", + " color=[f\"C{i}\" for i in range(len(file_sizes))],\n", + ")\n", + "plt.xlabel(\"File Name\")\n", + "plt.ylabel(\"Bytes\")\n", + "plt.xticks(rotation=90)\n", + "plt.hlines(\n", + " y=lowest_bytes_lower_bound,\n", + " xmin=-0.5,\n", + " xmax=len(file_sizes) - 0.5,\n", + " linestyles=\"dashed\",\n", + " color=\"black\",\n", + " label=\"Approximate Bytes Lower Bound\",\n", + ")\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.title(\"Polygon Annotation File Sizes\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gmuEWlImpT57" + }, + "source": [ + "The SQLite representation (4.9GB) appears to be quite compact compared\n", + "with GeoJSON and ndjson. Although not as compact as a dictionary pickle\n", + "or Zstandard compressed ndjson, it offers a good compromise between\n", + "compactness and read performance.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yhe5rMXPpT57" + }, + "source": [ + "# 3: Extra Bits\n", + "\n", + "## 3.1) Space Saving\n", + "\n", + "A lot of space can be saved by rounding the coordinates to the nearest\n", + "integer when storing them. 
Below we make a copy of the dataset with all\n", + "coordinates rounded.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H2Jsc0repT57", + "outputId": "d2ca9eff-b67d-4bfc-ad5a-57c87bc6a7da" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10008338/10008338 [51:00<00:00, 3270.16it/s] \n" + ] + } + ], + "source": [ + "# Run Time: ~50m\n", + "! rm integer-cells.db\n", + "int_cell_sqlite_store = SQLiteStore(\"integer-cells.db\")\n", + "\n", + "# We use batches of 1000 to speed up appending\n", + "batch = {}\n", + "batch_size = 1000\n", + "for key, annotation in tqdm(cell_sqlite_store.items(), total=len(cell_sqlite_store)):\n", + " geometry = Polygon(np.array(annotation.geometry.exterior.coords).round())\n", + " rounded_annotation = Annotation(geometry, annotation.properties)\n", + " batch[key] = rounded_annotation\n", + " if len(batch) >= batch_size:\n", + " int_cell_sqlite_store.append_many(batch.values(), batch.keys())\n", + " batch = {}\n", + "_ = int_cell_sqlite_store.append_many(batch.values(), batch.keys())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U6aooIROpT57" + }, + "source": [ + "Here the database size is reduced to 2.9GB, down from 4.9GB.\n", + "Additionally, when using integer coordinates, the database compresses\n", + "much better. Zstandard can compress to approximately 60% of the\n", + "original size (and 35% of the floating point coordinate\n", + "database size). This may be done for archival purposes.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q3TJ8XX4pT57", + "outputId": "b99d1af7-4c68-4394-cf9a-8bb2b64471a0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "integer-cells.db : 60.58% ( 2.86 GiB => 1.73 GiB, integer-cells.db.zstd) \n" + ] + } + ], + "source": [ + "# Run time: ~15s\n", + "! 
zstd -f -k integer-cells.db -o integer-cells.db.zstd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "alFRiIAbpT57" + }, + "source": [ + "With higher (slower) compression settings the space can be further\n", + "reduced for long term storage.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nVFqovfPpT57", + "outputId": "0948bbe6-4252-4c93-eab7-8e3be4e98235" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "integer-cells.db : 51.22% ( 2.86 GiB => 1.47 GiB, integer-cells.db.19.zstd) \n" + ] + } + ], + "source": [ + "# Run time: ~20m\n", + "! zstd -f -k -19 --long integer-cells.db -o integer-cells.db.19.zstd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C3voJ43OpT57" + }, + "source": [ + "## 3.2) Feature Comparison Summary\n", + "\n", + "Here we briefly summarise some of the positives and negatives of each format and construct a comparison matrix.\n", + "\n", + "**GeoJSON**\n", + "\n", + "*Positives*\n", + "\n", + "- Simple, based on JSON, which is well known.\n", + "- Well defined with a public specification.\n", + "- Popular format for geometry; many tools work with it.\n", + "- Fast to write.\n", + "\n", + "*Negatives*\n", + "\n", + "- Requires loading the whole file into memory for parsing. Some\n", + " specialised parsers can, in some situations, reduce or avoid this but\n", + " it is not possible in general.\n", + "- Not a very compact representation.\n", + "\n", + "**ndjson (One GeoJSON Feature Per Line)**\n", + "\n", + "*Positives*\n", + "\n", + "- Simple.\n", + "- Better to parse than JSON/GeoJSON.
Each line can be parsed\n", + " independently.\n", + "- Many tools to parse JSON lines.\n", + "- Fast to write.\n", + "\n", + "*Negatives*\n", + "\n", + "- Not a very compact representation.\n", + "- Requires loading the whole dataset from disk before querying OR\n", + " scanning through and reparsing each line for each query.\n", + "- Amending annotations can be tricky. The easiest way is to blank out a\n", + " line and append a modified copy each time. This could end up\n", + " fragmenting the file and wasting a lot of space. More complex methods\n", + " could be developed to reduce fragmenting the file.\n", + "\n", + "**pickle**\n", + "\n", + "*Positives*\n", + "\n", + "- Fast to write.\n", + "\n", + "*Negatives*\n", + "\n", + "- Vulnerable to arbitrary code execution when loading from disk.\n", + "- Requires loading the whole dataset into memory for querying.\n", + "\n", + "**SQLite (SQLiteStore Flavour)**\n", + "\n", + "*Positives*\n", + "\n", + "- Very fast to query (uses an R-TREE index to accelerate\n", + " spatial queries).\n", + "- Does not require loading data into memory before querying.\n", + "- Possible to index property lookups.\n", + "\n", + "*Negatives*\n", + "\n", + "- Not the most compact representation on disk.\n", + "\n", + "### Feature Matrix\n", + "\n", + "| Format | Size On-Disk | Size In-Memory | Partial Reads | Serialization | Query Performance |\n", + "| ----------: | :----------- | :------------- | :------------ | :------------ | :---------------- |\n", + "| SQLiteStore | Medium | Small | Yes | Slow | Fast |\n", + "| GeoJSON | Large | Large | No | Fast | Slow |\n", + "| ndjson | Large | Large | Yes | Fast | Medium |\n", + "| pickle | Small | Medium | No | Medium | Slow |\n", + "\n" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "for n in range(4):\n", - " display(cell_polygon(xy=(0, 0), n_points=20, repeat_first=False, seed=n))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"APUNL2PtpT5w" - }, - "source": [ - "### Randomised Cell Boundaries\n", - "\n", - "Here we create a function to generate grid of cells for testing. It uses a fixed seed for reproducibility.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SOpBKM7IpT5w" - }, - "source": [ - "### A Sample 5×5 Grid\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2xA-oG4VpT5w", - "outputId": "caea51e4-8a27-4dd1-ed0d-c272b93d8bb7" - }, - "outputs": [ - { - "data": { - "image/svg+xml": "", - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "MultiPolygon(polygons=list(cell_grid(size=(5, 5), spacing=35)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b6S8vzFipT5w" - }, - "source": [ - "# Part 1: Small Scale Benchmarking of Annotation Storage\n", - "\n", - "Using the already defined data generation functions (`cell_polygon` and\n", - "`cell_grid`), we create some simple artificial cell boundaries by\n", - "creating a circle of points, adding some noise, scaling to introduce\n", - "eccentricity, and then rotating. We use 20 points per cell, which is a\n", - "reasonably high value for cell annotation. However, this can be\n", - "adjusted.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UZMoLDvkpT5x" - }, - "source": [ - "## 1.1) Appending Annotations (In-Memory & Disk I/O)\n", - "\n", - "Here we test:\n", - "\n", - "1. A python dictionary based in-memory store (`DictionaryStore`)\n", - "1. An SQLite database based in-memory store (`SQLiteStore`)\n", - "\n", - "Both of these stores may operate in memory. The `SQLiteStore` may also\n", - "be backed by an on-disk file for datasets which are too large to fit in\n", - "memory. 
The `DictionaryStore` class can serialise/deserialise itself\n", - "to/from disk in a line delimited GeoJSON format (each line seperated\n", - "by `\\n` is a valid GeoJSON object)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DZBiw_EepT5x" - }, - "outputs": [], - "source": [ - "# Convert to annotations (a dataclass pairing a geometry and (optional)\n", - "# key-value properties)\n", - "# Run time: ~2s\n", - "annotations = [\n", - " Annotation(polygon) for polygon in cell_grid(size=(100, 100), spacing=35)\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LUVa03F2pT5x" - }, - "source": [ - "### 1.1.1) In Memory Append\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7PzE7AhdpT5x", - "outputId": "974bb3d0-3290-4315-a6fc-3b7ca90072a6" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~5s\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " \"dict_store.append_many(annotations)\",\n", - " setup=\"dict_store = DictionaryStore()\",\n", - " globals={\"DictionaryStore\": DictionaryStore, \"annotations\": annotations},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " \"sql_store.append_many(annotations)\",\n", - " setup=\"sql_store = SQLiteStore()\",\n", - " globals={\"SQLiteStore\": SQLiteStore, \"annotations\": annotations},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"Time to Append 10,000 Annotations In Memory\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n", - "plt.xlim([-0.5, 1.5])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gU6PLE7wpT5x" - }, - "source": [ - "Note that inserting into the `SQLiteStore` is much slower than the\n", - "`DictionaryStore`. Appending to a `Dictionary` store simply requires\n", - "adding a memory reference to a dictionary. Therefore, this is a very\n", - "fast operation. On the other hand, for the `SQLiteStore`, the insertion\n", - "is slower because the data must be serialised for the database and the\n", - "R-Tree spatial index must also be updated. Updating the index is a\n", - "relatively expensive operation. However, this spatial index allows for\n", - "very fast queries of a very large set of annotations within a set of\n", - "spatial bounds.\n", - "\n", - "Insertion is typically only performed once for each\n", - "annotation, whereas queries may be performed many times on the\n", - "annotation set. 
Therefore, it makes sense to trade a more expensive\n", - "insertion for fast queries as the cost of insertion will be amortised\n", - "over a number of queries on the data. Additionally, data may be written\n", - "to the database from multiple threads or subprocesses (so long as a new\n", - "instance of `SQLiteStore` is created for each thread or subprocess to\n", - "attach to a database on disk) thus freeing up the main thread.\n", - "\n", - "For comparison, we also compare bulk insertion plus seralising to disk\n", - "as line-delimited GeoJSON from the `DictionaryStore` as this is the\n", - "default serialisation to disk method (`DictionaryStore.dump(file_path`).\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t2q9QTCfpT5x", - "outputId": "2202c328-ba48-476b-8efa-662678d75135" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~10s\n", - "\n", - "setup = \"fp.truncate(0)\\nstore = Store(fp)\" # Clear the file\n", - "\n", - "# Time dictionary store\n", - "with tempfile.NamedTemporaryFile(\"w+\") as fp:\n", - " dict_runs = timeit.repeat(\n", - " (\"store.append_many(annotations)\\nstore.commit()\"),\n", - " setup=setup,\n", - " globals={\"Store\": DictionaryStore, \"annotations\": annotations, \"fp\": fp},\n", - " number=1,\n", - " repeat=3,\n", - " )\n", - "\n", - "# Time SQLite store\n", - "with tempfile.NamedTemporaryFile(\"w+b\") as fp:\n", - " sqlite_runs = timeit.repeat(\n", - " (\"store.append_many(annotations)\\nstore.commit()\"),\n", - " setup=setup,\n", - " globals={\"Store\": SQLiteStore, \"annotations\": annotations, \"fp\": fp},\n", - " number=1,\n", - " repeat=3,\n", - " )\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"Time to Append & Serialise 10,000 Annotations To Disk\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.hlines(0.5, -0.5, 1.5, linestyles=\"dashed\", color=\"k\")\n", - "plt.xlim([-0.5, 1.5])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LKr6FmctpT5x" - }, - "source": [ - "Here we can see that when we include the serialisation to disk in the\n", - "benchmark, the time to insert is much more similar.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V7WV8wNmpT5x" - }, - "source": [ - "## 1.2) Box Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eul4PYZPpT5x", - "outputId": "a0131a72-f527-48b1-8aac-8cbccfced2ed" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~20s\n", - "\n", - "# One time Setup\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "dict_store.append_many(annotations)\n", - "sql_store.append_many(annotations)\n", - "\n", - "rng = np.random.default_rng(123)\n", - "boxes = [\n", - " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n", - "]\n", - "stmt = \"for box in boxes:\\n _ = store.query(box)\"\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": dict_store, \"boxes\": boxes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": sql_store, \"boxes\": boxes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"100 Box Queries\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z9ntCgKapT5x" - }, - "source": [ - "Here we can see that the `SQLiteStore` is a bit faster. Addtionally,\n", - "difference in performance is more pronounced when there are more\n", - "annotations (as we will see later in this notebook) in the store or when\n", - "just returning keys:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vfGH6e4upT5x", - "outputId": "7cf8bf30-a4c9-4de5-9a5f-f9fd6cffc141" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~15s\n", - "\n", - "# One time Setup\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "dict_store.append_many(annotations)\n", - "sql_store.append_many(annotations)\n", - "\n", - "rng = np.random.default_rng(123)\n", - "boxes = [\n", - " Polygon.from_bounds(x, y, 128, 128) for x, y in rng.integers(0, 1000, size=(100, 2))\n", - "]\n", - "stmt = \"for box in boxes:\\n _ = store.iquery(box)\" # Just return the keys (uuids)\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": dict_store, \"boxes\": boxes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": sql_store, \"boxes\": boxes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"100 Box Queries (Key Lookup Only)\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xVQlsK1MpT5y" - }, - "source": [ - "## 1.3) Polygon Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fnkdnKWRpT5y", - "outputId": "03ccc35c-df96-4d68-9d53-72ac835a9088" - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY0AAAEaCAYAAADtxAsqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAw70lEQVR4nO3deXhT1b4+8Dc7Q5POQ+hImcpcQIbKDC2UIog/zgEVvCKCHCZx4oogRX3keMQDeMEDB0SEIiKgIIIg98gVZChQ5EIZKi1YBouFFjpR0jZJm2H9/uhtDrEDW22b0r6f5+HR7LWy9zch5M3aw9oKIYQAERGRDJKrCyAiogcHQ4OIiGRjaBARkWwMDSIiko2hQUREsjE0iIhINoYGUTViYmIwdepUV5fxQNq4cSNUKpWry6A6wNCgOpGYmIg//elPaNmyJRQKBd59990q+508eRL9+/eHVqtFSEgI4uPjYbPZnPqkp6fjkUcegbu7O/R6PWbOnImSkpIatz958mQoFAooFAqoVCq0bNkSM2fORH5+fq29xoaoqKgIb7zxBjp06AA3Nzf4+flh5MiROHz4cL3WMX78eNy8ebNet0n1g6FBdaK4uBidO3fG0qVLERwcXGWfzMxMxMXFoUOHDkhOTsaaNWuwdu1avPHGG07riY2NhUqlQlJSErZv3459+/bhL3/5y31rGDRoELKzs5GRkYGVK1fiq6++wrPPPltrr7GhMRgMGDBgALZt24Z3330X6enpOHToENq1a4fY2Fhs2LChzmsQQsBisUCn0yEoKKjOt0cuIIjqWMuWLcXf/va3Ssvj4+NFWFiYsNlsjmWrVq0S7u7uori4WAghxNq1a4VWqxWFhYWOPnv37hUAxLVr16rd5qRJk0RsbKzTsnfffVdIkiSMRqOw2+3i/fffF61btxZqtVq0adNGfPDBB079o6OjxV/+8hchhBAbNmwQPj4+oqSkxKnPwoULRatWrYTdbhdCCLF//37RpUsX4ebmJrp27SoOHz4sAIjPPvvM8ZxLly6JRx99VHh4eAgPDw/x2GOPicuXLzvaP/nkE6FUKsWxY8dEjx49hE6nE1FRUeL06dPVvl4hhHjppZeEVqsVGRkZldpmzpwptFqtuHnzptM27pWZmSkAiEOHDjmWXb58WYwdO1b4+PgIX19fERcXJ1JSUirVevDgQdG9e3ehVqvFN998U+X6T58+LeLi4oSHh4fQ6/VizJgxTrVmZmaKsWPHioCAAKHVakXr1q3F0qVLa3zNVP840iCXOX78OIYPHw5J+vfHcMSIETAajTh79qyjT79+/eDj4+PoU/Gc48eP/6bt6XQ62O12WK1WfPjhh3jrrbcwf/58pKamYu7cuZg/fz4SEhKqfO5TTz0FhUKBL7/80rHMbrfjk08+wdSpU6FQKHDz5k2MHj0affr0wZkzZ/DBBx/g1VdfdVqPyWTC8OHDYTabceTIERw5cgTFxcUYMWIEysrKnNYdHx+PFStW4MyZM/Dz88O4ceNgtVqrrE8IgS1btmDChAlo2bJlpfYFCxbAbDZjx44dst+v27dvY+DAgQgMDMTRo0fxww8/oEOHDoiJiUFubq5TrfPmzcOyZctw6dIl9OnTp9K60tLSEB0djX79+uH06dM4ePAglEol4uLiYDabAQCzZs3C3bt3ceDAAVy8eBEJCQlo3ry57Hqpnrg6tajxq26k0a5dOxEfH++0rLi4WAAQ27dvF0IIERcXJ/7jP/6j0nP1en2Nv0J/PdJITU0Vbdq0EX369BFCCNG8eXMxd+5cp+fMnj1btG7d2vH43pGGEOW/5AcMGOB4vG/fPqFSqURWVpYQQogFCxaIli1bCqvV6ujz7bffOo001q9fL3Q6ncjNzXX0uXXrltBqteLTTz8VQpT/egcgkpOTHX1OnDghAIhLly5V+Xpv374tAIjly5dX+554e3uLWbNmObZxv5HG22+/7Xi/KtjtdqdRWUWtiYmJTv1+vf5
JkyaJ8ePHO/Uxm81Cp9OJXbt2CSGE6Natm3j77berrZ8aBo40qEFRKBRO/5XTtzqHDx+Gp6cndDodunTpgjZt2mDr1q0wGAy4ceMGBg8e7NQ/OjoaGRkZMBqNVa5vxowZOH78ONLS0gAA69atw6hRoxASEgKg/Nf0ww8/DKVS6XhOv379nNaRmpqKzp07Q6/XO5YFBQWhQ4cOSE1NdXptDz30kONxWFgYgPJf/1URMuYdFUJArVbft1+FU6dOITk5GZ6eno4/Xl5eyMjIwOXLl536Pvzww/dd165du5zWFRAQALPZ7FjX7Nmz8d5776FPnz54/fXXkZiYKLtWqj88J45cJiQkBLdu3XJaVvG44uB5SEgIMjMznfpYLBYUFBRUe4C9Qp8+ffDpp59CpVIhJCQEbm5uAMoPGAOVQ+d+X7yRkZEYOHAg1q9fj/nz52PPnj34+uuvnfr8ep1VBVtVy4QQTsslSXIKn4o2u91eZW2BgYHw9/fHhQsXqmzPzMxEUVER2rdv71j/r1ksFqfHdrsdsbGxWLVqVaW+9+4uVCqV0Gq1VW733nVNnDgR8+fPr9QWEBAAAHjuuecwYsQI7Nu3D4cOHcLIkSMxZswYbN68ucZ1U/3iSINcZsCAAdi/f7/TF+G+ffvg7u6OHj16OPqcOHHC8UUPwPGcAQMG1Lh+nU6Htm3bolWrVo7AAABvb280b94cR44cceqfmJiI1q1bw93dvdp1zpgxA5s2bcLHH3+M4OBgjBgxwtHWuXNnnDp1yumU4RMnTjg9PzIyEqmpqcjLy3Msu337NtLT0xEZGVnj66mJQqHAhAkTsHXrVly/fr1S+3vvvQetVovx48cDKA8Zm83mNHI5c+aM03OioqKQmpqKsLAwtG3b1ulPs2bNflN9UVFRSElJQURERKV1+fn5OfqFhITgueeew6ZNm5CQkIAtW7Y4/d1TA+DavWPUWBUVFYmzZ8+Ks2fPipCQEPHCCy+Is2fPOp0l9MsvvwgvLy8xZcoUceHCBbF7927h7+8vXn/9daf1NG/eXIwaNUqcO3dOHDx4ULRq1arS/vFfq+rsqXutXr1aaLVa8fHHH4v09HTx0UcfCTc3N7F+/XpHn18f0xBCCJPJJAICAoRGoxELFy50artx44bQ6XRi2rRpIi0tTRw8eFD06tVLABCbN28WQghhNBpFixYtxNChQ0VycrI4ffq0iImJEREREaK0tFQIIf/Mpl8rLCwUXbt2FREREeLLL78U169fF+fOnRMvv/yykCRJbN261dE3Pz9feHl5icmTJ4v09HTx7bffim7dujlt49atWyIkJEQMHz5cJCYmip9//lkcPXpULFiwQBw/frzaWqtanpaWJjw9PcXTTz8tTp48Ka5duyYOHjwoXn75ZXH16lUhhBAvvPCC+O///m9x5coVceHCBfHkk0+K8PBwx5lp1DAwNKhOHDp0SACo9Cc6Otqp34kTJ0S/fv2Em5ubCAoKEvPnz3c6kCxE+SmqcXFxQqfTCX9/fzF9+nTHKbnVuV9o2O12sXTpUtGqVSuhUqlE69atazzl9l6zZ88WkiSJzMzMSm379+8XkZGRQqPRiK5duzoOhO/YscPp9YwcOdJxyu2oUaOqPOX2XnJCQwgh7t69K+Lj40Xbtm2FWq0WAISHh4c4efJkpb579+4VHTt2FFqtVvTv31/s27ev0jYyMjLE008/LfR6vdBoNKJFixZiwoQJjtOd5YaGEEKkpKSI0aNHC19fX6HVakVERISYNm2ayM/PF0IIMWvWLNGuXTuh1WqFv7+/ePTRR8WFCxdqfL1U/xRC8M59RL/FuHHjYDKZ8M0339y3b2JiIqKjo5GSkoKuXbvWQ3XOTp06hUceeQSjR4/Ghg0bqjyWQfRb8BNEJNOdO3ewZ88e7Nq1C3PmzKmyz5o1a5CUlISMjAz861//wrRp09CnTx+XBAZQflbTkSNH0KpVK5w/f94lNVDjwrOniGTq0aMH8vP
zMW/ePMTExFTZ5/r16/j73/+O27dvIzg4GHFxcViyZEn9FvorXbt2dVloUePD3VNERCQbd08REZFsDA0iIpKtSRzTyMrKcnUJTYJer3e6aO1egYGBMO7eitILZyFKiqHrPwSKYaMrXbglSRICPdxh2LYBZT+nAxYLvJ+ZAWPzNo6J7bRaLbyKC3Fn1d8BAP7zFiHfJuDh4QFN3i0Ubf8EZempUGjd4fmnp+AZNxrGYwdQ/N87YL2dBcndA7reg+Dx1F+QV3DHcXGhQqGQNR0HuUZNny+qXaGhodW2NYnQINdTKBQoTTkNhUoNc+pZqFu3Q1WzICkUCtiLDSi79hNEWSnKLqbAbih0tEuSBG9PD+T/bTbKrlwC7HbAUga1mzu0xmLceu0vkLx94TXmaUCI8j8AStPOQxUcBl2/GJQc+AaGbRsgefvCc9hoSJJUPg2G2QhotLAJgbt37zrNOktE5RgaVC9KSkrg99Zy2H9Oh+mHI9X2s9lssPgFIOBvq2Dc8znKLqY4tfv4+MD49eewlxRD1zcapqRDAMqnDCnZvRPCVALfVxdC8vSCKrQFVPpAmM1m+E55BaVWK4QQ8AkKRd47r8JyIwP+np4o2vslsj5dBWEsAZRKeD42Dh4TZzE0iKrA0KB6UVxcjNLSUvjcvyvu3LkDDw+PSgfcdDodlLdvIv+LdWj23kco/ma7o02j0aD4/wImf8kCQNgBAfg9PxdSzKO4lZsLlUoFPx9vFH63G5AkeMSMBAAU7dgIdau2CJi3CLY7+bAX3qmlV03U+PBAOLmcSqWCTqeDVqutcbpzT09PFH64BLo+0ZA8vGA3FgMArNk3INmsUKjLJyX0mfQCwrYfhtLPH4Ub/gk3jab8ftnuOtxZNA+mU8fg/+pfIdp2QllZGTSduqEs7Txuz54Ew2cfAZLCaYZZIvo3jjSoXri7u8PDwwOi8N93fNPpdFAqlXBTKmE6/j2UAc3gFtEJQghotVqY7nm+l5cX1Go1rHm3UZp6FsbE7xxtuW+9iKDVX0AV3go4AaiCwyBpdZB8A2Az3IWkAHwUArlvzIL1RgaaLVwBbc++sFqtUKlUCJi7CGWjxqHs6iUU7f4C+YvjEbrtUP29OUQPEIYG1QtPT08Uvv8mrDfLp+02Hv8eZdd+gt+s+UBgCAr+6y1oew+CT/wSKI3FyIt/Bbb88oC5+9kaSHu+QNDS9dC/tQywWv5v+UcwJych4M3/giq4OTxHPYnib7bh7uY1MJ85AcvVS/CIGw2FUoW7n30Ey+U0KDy8cOejpQAAbfc+8Jv1OvLeeRWqoFBIHp4QllJInt7gOVREVWNoUL0QQkDTrhNUwWHQ9urvWC65e0ChVsPriUlQN28Fu90OlVoDt87dnVfwf6fD3vX0BVB+QNwjdhTUrdvBLbI78gwGuLu7I2jFZhT/awfsRXfh92I8tEMfQ2lpKbQ9+0Dy8nZapbplGwCA+8BhKE07B0t2JjyiR8Bj5BiUlJTU5dtB9MBqEtOI8DqN+lHTefRqtbrKu7vZbDbY7XbHbUiNRiPUanWVtyUtKytDaWkpgPK7xVXcLMlqtcJkKt+ZpdFooNPpoFAoYLVaHbdu9fDwqLKu0tJSqNVqqFQqSJIEm80Gs9nMM6caIF6nUX94nQbVumXLlmH58uX37ffqq69izpw5sFgslW4neq+KC/cAOL64a2Kz2VBUVFRpeVlZWZVf+FX1vfc5RCQPQ6OO2KaNdnUJdcqeLm/0Zt/zOWyXqr8u40GnXLfH1SUQ1SuGBv0ur7YPxavtqx/CElHjxOs0iIhINoYGERHJxtAgIiLZGBpERCTbA3Ug3Gw2Y/369VCpVIiMjMSgQYNcXRIRUZPi8tD48MMPcebMGfj4+GDZsmWO5efOncMnn3wCu92O2NhY/PnPf8b//u//om/fvoiKisIHH3zA0CAiqmcu3z0VExODBQsWOC2z2+1ISEjAggUL8MEHH+D48eO4ceMG8vPzodfrAZT
fjIeIiOqXy0canTt3Rk5OjtOyK1euIDg4GEFBQQCA/v3749SpUwgICEB+fj5atWpV4205Dxw4gAMHDgAAFi9e7Aia+nS73rdIruCKz1ZTpVKp+H43AC4PjaoUFBQgICDA8TggIACXL1/GyJEjsWHDBpw5cwa9evWq9vnDhg3DsGHDHI85Xw3VFX626g/nnqo/D9zcU1WNIhQKBbRaLWbNmuWCioiICGgAxzSqUrEbqkJ+fj78/PxcWBEREQENNDQiIiKQnZ2NnJwcWK1WJCUlISoqytVlERE1eS7fPfWPf/wDaWlpKCoqwsyZMzFu3DgMHToUU6ZMwaJFi2C32zFkyBCEh4e7ulQioibP5aExe/bsKpf37NkTPXv2rN9iiIioRg1y9xQRETVMjTY0Tp8+jbVr17q6DCKiRsXlu6fqSlRUFA+eExHVskY70iAiotrH0CAiItkYGkREJBtDg4iIZGNoEBGRbAwNIiKSrdGGBq/TICKqfbxOg4iIZGu0Iw0iIqp9DA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhka7ShwSvCiYhqH68IJyIi2RrtSIOIiGofQ4OIiGRjaBARkWwMDSIiko2hQUREsjE0iIhINoYGERHJxtAgIiLZGBpERCRbow0NTiNCRFT7OI0IERHJ1mhHGkREVPsYGkREJBtDg4iIZGNoEBGRbAwNIiKSjaFBRESyMTSIiEg2hgYREcnG0CAiItkYGkREJFujnUakNmzbtq3Ssg4dOqB79+6wWCzYuXNnpfbIyEh06dIFJpsde7MLKrV38/FABy8dDBYb/uf2nUrtPX09EeGpRUGZFd/nFFZq7+3vhZbubsgpteBI7t1K7QMCvBGq0yDLVIbj+YZK7dHNfBDopsZ1Yyn+t6CoUntsoC/8NSpcLTbjTGFxpfZHgvzgrVbipyITUu6WVGp/LMQfOqWEVIMRaQZjpfY/hwZALSlwvrAE6cWmSu1PNtcDAE7fKcbPJWanNqVCgbFhAQCAH/KLkGkqdWp3kySMDvUHABzNM+CWucyp3VOlxMhgPwDA4dy7yC21OLX7qlWIC/IFAOy/XYhCi9WpvZmbGjHNfAAA3966g2KrDYp7PiMhISEYPHgwAGD37t0wm53rb9GiBfr16wcA+Oqrr2C1Oq+/TZs2ePjhhwH8wc+eyYQ9e/ZUan/ooYfQsWNHGAwGfPvtt5Xae/XqhbZt26KgoAD79++v1N63b1+0bNkSOTk5OHToUKX2gQMHIiwsDDdv3sSxY8cqtQ8ZMgSBgYG4fv06fvjhh0rtcXFx8Pf3x5UrV5CcnFyp/ZlnngEAXLp0CefPn6/UPnr0aOh0Oly4cAGpqamV2seOHQu1Wo1z587hp59+qtQ+fvx4AMCpU6dw7do1pzaVSoXHH38cAHDixAn88ssvTu1arRZ/+tOfAACJiYnIzs52avfy8sKjjz4KADh06BBycnKc2v38/DB8+HAAwHfffYc7d5y/GwIDAzFkyBAAwL/+9S8UFTn/263qs1fxempbox1pcMJCIqLapxBCCFcXUdeysrLqfZu2aaPrfZtU/5TrKv+ip7qh1+uRl5fn6jKahNDQ0GrbGu1Ig4iIah9Dg4iIZGNoEBGRbAwNIiKSjaFBRESyMTSIiEg2hgYREcnG0CAiItkYGkREJBtDg4iIZGNoEBGRbAwNIiKSjaFBRESyMTSIiEi2RhsavJ8GEVHtq/HOfQaDAYmJiThz5gyuX78Oo9EId3d3tGzZEt27d0dMTAy8vb3rq9bfJCoqClFRUa4ug4ioUak2NLZu3YqjR4+iR48eGDp0KMLCwqDT6WAymXDz5k2kpaXh9ddfx8CBAzFhwoT6rJmIiFyk2tDw8/PDypUroVarK7W1bt0aAwcORFlZGQ4ePFinBRIRUcNRbWiMHDnyvk/WaDQYMWJErRZEREQNV43HNCpcuHABgYGBCAwMxJ07d7BlyxZIkoSnn34avr6
+dVwiERE1FLLOnkpISIAklXfdtGkTbDYbFAoFz04iImpiZI00CgoKoNfrYbPZcP78eXz44YdQqVSYMWNGXddHREQNiKzQ0Ol0KCwsRGZmJpo3bw6tVgur1Qqr1VrX9RERUQMiKzRGjBiB+Ph4WK1WTJ48GQBw6dIlhIWF1WVtRETUwMgKjT//+c/o3bs3JElCcHAwAMDf3x8zZ86s0+KIiKhhkRUaABAaGlrjYyIiavyqPXsqPj4eJ06cqPa4hdVqRVJSEhYsWFBnxRERUcNS7UjjhRdewLZt27B+/Xq0bt0aoaGh0Gq1MJvNyM7OxrVr19ClSxfMmjWrPuslIiIXUgghRE0dCgsLkZKSgl9++QUlJSXw8PBAy5Yt0a1bN/j4+NRXnX9IVlZWvW/TNm10vW+T6p9y3R5Xl9Bk6PV65OXlubqMJqGmww/3Pabh6+uLwYMH12pBRET0YGq099MgIqLax9AgIiLZGBpERCQbQ4OIiGSTFRpCCBw4cAB//etf8dprrwEA0tLSkJSUVKfF/RG8RzgRUe2TFRrbtm3DoUOHMGzYMMcpbwEBAdi9e3edFvdHREVFcRZeIqJaJis0jhw5gtdffx0DBgyAQqEAAAQGBiInJ6dOiyMiooZFVmjY7XZotVqnZWazudIyIiJq3GSFRo8ePbBp0yZYLBYA5cc4tm3bhl69etVpcURE1LDICo1nn30WBQUFmDx5MoxGI5599lnk5uZiwoQJdV0fERE1ILKmRnd3d8e8efNQWFiIvLw86PV6+Pr61nFpRETU0Pym6zQ0Gg38/f1ht9tRUFCAgoKCuqqLiIgaIFkjjZSUFHz88cfIzc2t1LZt27ZaL4qIiBomWaHx0Ucf4fHHH8eAAQOg0WjquiYiImqgZIWGxWLBkCFDIEmcdYSIqCmTlQKjRo3C7t27cZ/7NRERUSMna6TRp08fLFq0CF9//TW8vLyc2latWlUnhRERUcMjKzSWL1+Ojh07ol+/fjymQUTUhMkKjZycHCxZsoTHNIiImjhZKRAVFYULFy7UdS1ERNTAyT57aunSpejUqRN8fHyc2l588cU6KYyIiBoeWaERHh6O8PDwuq6FiIgaOFmh8eSTT9Z1HURE9ACoNjTS0tLQuXNnAKjxeEaXLl1qvyoiImqQqg2NhIQELFu2DACwZs2aKvsoFApep0FE1IQoRA2XeR87dgwDBw6sz3rqRFZWVr1v0zZtdL1vk+qfct0eV5fQZOj1euTl5bm6jCYhNDS02rYaT7ldt25drRdDREQPrhpDg3NNERHRvWo8e8put9/3oj4eCCciajpqDA2LxYKPPvqo2hEHD4QTETUtNYaGVqtlKBARkQNnICQiItl4IJyIiGSrMTQ2bdpUX3XUutOnT2Pt2rWuLoOIqFGRNffUgygqKgpRUVGuLoOIqFHhMQ0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsDA0iIpKNoUFERLIxNIiISDaGBhERyaZydQG/xe3bt7Fz504YjUbMmTPH1eUQETU59TbS+PDDDzF16tRKX/bnzp3DK6+8gpdeeglff/11jesICgrC888/X4dVEhFRTeptpBETE4MRI0Zg9erVjmV2ux0JCQl48803ERAQgPj4eERFRcFut2Pr1q1Oz3/++efh4+NTX+USEVEV6i00OnfujJycHKdlV65cQXBwMIKCggAA/fv3x6lTpzBmzBjMnz//d2/rwIEDOHDgAAB
g8eLF0Ov1v7/w3+l2vW+RXMEVn62mSqVS8f1uAFx6TKOgoAABAQGOxwEBAbh8+XK1/YuKivD5558jIyMDu3btwpgxY6rsN2zYMAwbNszxOC8vr/aKJroHP1v1R6/X8/2uJ6GhodW2uTQ0hBCVlikUimr7e3l5Yfr06XVZEhER1cClp9wGBAQgPz/f8Tg/Px9+fn4urIiIiGri0tCIiIhAdnY2cnJyYLVakZSUhKioKFeWRERENai33VP/+Mc/kJaWhqKiIsycORPjxo3D0KFDMWXKFCxatAh2ux1DhgxBeHh4fZVERES/Ub2FxuzZs6tc3rNnT/Ts2bO+yiAioj+A04gQEZFsjTY0Tp8+jbVr17q6DCKiRuWBmnvqt4iKiuJBdSKiWtZoRxpERFT7GBpERCQbQ4OIiGRrtMc0iKjp0Wq1UKlUsNvtMJlMVU5VBJRPV6TT6SBJEqxWK8xms+P5VU1lZLFYYLVaoVKpoFKpoFAoIIRwPA8A1Go1NBoNFAoFbDYbzGZztdt/kDE0iOiBp1Qq4e/vj5TsYpy7mY9wX3dEt22GYsNdlJaWOvV1c3ODp7cPjlzJR2ahEQ+F+eKhkGYwm824ZVLgUk5RpfWP6BgIu82KHKMN528XwWy1IzLYG346BUwmE3x9fZFXCnx/JR8miw0t/NwxsE0zGArvwGKx1NfbUC8YGkT0wPPx8cHqYxnYcjoT4b46ZN01o2OwFz4e3x1lebmOX/wKhQJe3j6Yse0cLt4qQqiPFh8d+xlPR4XjP4e0w47UDHx49JrTupUKBUZ2CoIFSoxdfxxKSQGbXWDO0HYY2dYbbm5uyCyyYfLm0/DWqtDS3wMfHr2G2PbN8M7IDigsLIS7uzuUSiWEELBarTCZTLDZbK54q/6wRntMg9dpEDUNSqUSRVYFvjhzAz2a++KrqX0xuW9LpGYbcPRaAXQ6naOvVqvFsWsFuJBtwKS+LfHV1L7oFe6LbWduIKeoFJP7tMTJ14bg5GtDsGli+Sn7sR2awW6zQi0BX03ti4UjO1Xa/o9Zd2G1C7wY3RZrn+qBEG8tztwohEajgdrTF5+dy8E7+69i6eEM7L9W9EBPzNpoQyMqKgozZsxwdRlEVMdUKhXSc4phswt0DvaCzWZD52BvAEDaLQOUSqVT39RbBgBw9O0U7A2bXeBybhEKCgpwKzsblrIybDmdCQCY2LslioqKcCc/D75KK359yKOsrAyD2zZDK393bD71C9777ifkFpdiQlQLAMCKw1fwxelMtPDTwVurxsmMAkjSg/vV++BWTkQEQJIklJRZAQBqpQS73Q6NsvybvaTU5vQFXd7XVmXf4lIrJEmCWq3GnVKBA5dyENXCDxH+WpSWlsJut1e5S0mSJNjsdujUSpSUWnHLYIZaKaEiWyw2O+wQMFnsaBfoiflxHaBSPbhHBh7cyomIANhsNgR5aQEAhSYLNBoNCozlB5+DvN3g4eEBd3d3AOXHNIK83Mr7Gn/V10sLm80KT09PfHQiEzYhMLF3CxQXFwMoH6VoNBoA/z6wrlarodPpsPr4dVy8XYR1/9ET3Zv74oXtZ7Eq8SrGdg/DKzHt4OeuQUrWXXx17ia0agm7pvaFJJWH1oOGIw0ieqBZLBZ0CvRAmI8WiVdyceRKLr5OuQlJAQzrEAgAeDzhB4xd/wOA8mWSAth1/iYSr+Qh8UouQn206BzkCbvdDqtCha9TshCh90DvcB+YTCYA5TeN+/7aXST/UggAuJBtwIGrhbArlPDWlv/+PnX9Dn66XYTMOyZoVBLcVBJSsw2I6xiE9/5fJOI6BsJgtqLAZHlgd1FxpEFEDzQhBMwmI959LBJLD6TjtV0/ItDTDW+O6IRmuvLrMNzVStgFYLVa0Uwn4a0RnbDm6DXM2ZWCjkFemDesPcwmI3Q6Hf7n4m1oVBKe7d0SRqPRsR2FJOGfR64CAHx15ccmTmYUYGAbPcb3DMel28X45GQGPk76GSHeWrw9shPUSgk
Xsu9i+9kbMFvscFNJeKJ7GEK9NMjLveuqt+wPUYjGePXJr2RlZdX7Nm3TRtf7Nqn+KdftcXUJTYZer0deXl617R4eHvDw8IBVKKCWAJPJhKKiInh6ejp2TxmNRhQXF8PLyws6nQ4WO6BSCJSUlKCkpAQ6nQ7e3t6QJAkWiwX5+fmO03V9fX3h5uZWabsWiwXFxcXw9vaGSqWC1Q4oFQImkwmlpaXw8vKCSqWC2WqHViWhrKwMBoMBVqu1bt6oWhAaGlptG0caRNTgLFu2DMuXL79vv1dffRVz5swBAMcXf8XV2hWKiopQVOR8wZ7BYIDBYKjU12QyOXZH/VphYWGNtVQXaBUXFyoUChQ2gt/oDA2iB9CftlxydQl16mZK9SOKe32ekofERvxe7J7Q0dUlVMLQIKIGJ2z4JIQNn+TqMqgKD+bhexl4RTgRUe1rtCMN3rmPiKj2NdqRBhER1T6GBhERycbQICIi2RgaREQkG0ODiIhkY2gQEZFsTWLuKSIiqh0caVCtmT9/vqtLoEaMn6+GgaFBRESyMTSIiEg2hgbVmmHDhrm6BGrE+PlqGHggnIiIZONIg4iIZGNoEBGRbI12avTGbPz48WjRogVsNhuUSiWio6Px6KOPQpIkXL16FUeOHMGUKVOqff7OnTsxduxYx+M333wT7777bn2UXkl6ejo2btwIi8UCq9WKfv36Ydy4cUhNTYVKpUKHDh1cUhfJs3PnThw7dgySJEGhUGD69Olo3bo1Nm/ejOTkZABAWFgYpk6dCr1eDwCYOHEiPvvsM6f1fPfdd3Bzc0N0dDQOHz6Mbt26wd/fv8Zt87PjGgyNB5BGo8H7778PALh79y5WrlwJo9GIcePGISIiAhERETU+f9euXU6hUdeBURFuVVm9ejX+8z//E61atYLdbkdWVhYAIDU1FVqt9jf9w69pO1T70tPTkZycjCVLlkCtVsNgMMBqtWLr1q0wmUxYsWIFJEnCoUOHsHTpUixevBiSVPXOjeHDhzv+//DhwwgPD79vaPCz4xoMjQecj48Ppk+fjvj4eDz55JNIS0vDN998g/nz58NsNmPDhg24evUqFAoFnnjiCVy9ehVlZWWYO3cuwsPD8fLLLzt++QkhsHnzZpw7dw4A8Pjjj6N///5ITU3Fl19+CS8vL2RmZqJNmzZ46aWXoFAosGPHDiQnJ6OsrAzt27fH9OnToVAosHDhQrRv3x4//fQTunTpgsOHD2PFihVQqVQwGo2YO3cuVqxYAYPBAD8/PwCAJElo3rw5cnJysH//fkiShKNHj2LKlCnQ6/VYs2YNDAYDvL29MWvWLOj1eqxevRqenp7IyMhA69atMXz4cCQkJMBgMMDNzQ0zZsxAWFiYC/+GGq87d+7Ay8sLarUaAODt7Y3S0lIcPnwYq1atcgTEkCFDcOjQIfz444946KGHqlzX9u3bodVqERgYiKtXr2LlypXQaDRYtGgRbty4gU8//RRms9nxd+/n58fPjoswNBqBoKAgCCFw9+5dp+U7duyAu7s7li1bBgAoLi5G3759sW/fPsdI5V4nT55ERkYG3n//fRgMBsTHx6NTp04AgJ9//hnLly+Hn58f3nrrLfz000/o2LEjRowYgSeeeAIA8M9//hPJycmOOyYajUb89a9/BQDk5ubizJkz6N27N5KSktCnTx+oVCqMGjUKs2fPRufOndG9e3dER0cjMDAQcXFx0Gq1GD16NABg8eLFGDx4MGJiYnDw4EFs2LAB8+bNAwBkZ2fjrbfegiRJeOeddzBt2jSEhITg8uXLWL9+Pd5+++06eNfpoYcewo4dO/DKK6+ga9eu6N+/Pzw8PKDX6+Hu7u7Ut02bNrhx40a1oVGh4vM5ceJEREREwGq1Ov6uvb29kZSUhM8//xyzZs3iZ8dFGBqNRFVnTv/444+YPXu247Gnp2eN67h06RIGDBgASZLg6+uLzp074+rVq9DpdGjbti0CAgIAAK1atUJOTg46duy
ICxcuYM+ePSgtLUVxcTHCw8MdodG/f3/HuocOHYo9e/agd+/eOHToEGbMmAEAeOKJJzBw4ECkpKTg2LFjOH78OBYuXFiptsuXL+O1114DAAwePBhbtmxxtPXt2xeSJMFsNuOnn37C8uXLHW1Wq/U+7xz9XlqtFkuWLMHFixeRmpqKDz74AGPGjIFCoai1bWRlZSEzMxN/+9vfAAB2u90xuuBnxzUYGo3A7du3IUkSfHx8cPPmTae22voHXLELAijfFWC321FWVoaEhAT8/e9/h16vx/bt21FWVubo5+bm5vj/jh07IiEhAWlpabDb7WjRooWjLTg4GMHBwYiNjcXUqVNRVFT0m2rTarUAyr9QPDw8qhxFUd2QJAmRkZGIjIxEixYtsH//fuTm5sJkMkGn0zn6/fzzz+jbt+/v2kbz5s2xaNGiKtv42al/POX2AWcwGLBu3TqMGDGiUkB069YN+/btczwuLi4GAKhUqip/RXXq1AknTpyA3W6HwWDAxYsX0bZt22q3bbFYAJTvyzabzTh58mSNtQ4ePBgrVqzAkCFDHMvOnDnjGCVlZ2dDkiR4eHhAp9PBbDY7+rVv3x5JSUkAgGPHjqFjx46V1u/u7o7AwECcOHECQPnoKyMjo8aa6PfLyspCdna243FGRgZCQ0MRHR2NTz/9FHa7HQBw5MgRqNVq2QemtVotTCYTACA0NBQGgwHp6ekAyn/9Z2ZmAuBnx1U40ngAVRzIrjjjY9CgQXjssccq9Xv88cexfv16zJkzB5Ik4YknnkCfPn0QGxuLuXPnonXr1nj55Zcd/Xv37o309HTMnTsXAPDMM8/A19e30uilgoeHB2JjYzFnzhwEBgbe96ytQYMG4YsvvsCAAQMcyxITE/Hpp59Co9FAqVTipZdegiRJ6NWrF5YvX45Tp05hypQpeO6557BmzRrs2bPHcTCzKi+//DLWrVuHnTt3wmq1YsCAAWjVqtX93lL6HSpOtCgpKYFSqURwcDCmT58OnU6Hzz77DK+88grKysrg7e2NRYsWOX7UlJWVYebMmY71/PqzGxMTg3Xr1jkOhM+ZMweffPIJjEYjbDYbHn30UYSHh/Oz4yKcRoTqzQ8//IBTp07hpZdecnUpVE8KCwuxaNEiPPLII5w7qpFgaFC92LBhA86ePYv4+HiEhoa6uhwi+p0YGkREJBsPhBMRkWwMDSIiko2hQUREsjE0iIhINl6nQU3epUuXsHnzZmRmZjomvps0aRLatm2Lw4cP4/vvv3dMY1GXdu7ciV27dgEov0LZarVCo9EAAJo1a+Y0xQWRqzA0qEkzGo1YvHgxpk6div79+8NqteLixYtO06b8Eb9lyu2xY8c6pqyvz7Ai+i0YGtSkVUyDMXDgQADl9yqpmIn1xo0bWLduHaxWKyZOnAilUomNGzfCaDQ6rjtxc3NDbGwsxowZA0mSHF/2EREROHLkCB555BE8/vjj+Pzzz3HixAlYrVY8/PDDmDx5smMUcT979uxBenq6Y9I9oPy6F0mSMHnyZMc09D/++COysrIQGRmJWbNmOSaoTE9Px6ZNm3Djxg00a9YMkydPRmRkZG2+jdSE8JgGNWkhISGQJAmrVq3C2bNnHfNzAeUT5U2bNg3t27fHZ599ho0bNwIo/8I2Go1YtWoVFi5ciMTERBw+fNjxvMuXLyMoKAjr16/H2LFjsWXLFmRnZ+P999/HypUrUVBQgB07dsiucdCgQTh//jxKSkoAlI9ekpKSMHjwYEefI0eO4Pnnn8fatWshSRI2bNgAACgoKMDixYsxduxYbNiwARMnTsSyZctgMBj+wLtGTRlDg5o0d3d3vPPOO1AoFFi7di2mTp2KJUuWoLCwsMr+drsdSUlJePrpp6HT6RAYGIjHHnsMiYmJjj5+fn4YOXIklEol1Go1vv/+e0yaNAmenp7Q6XQYO3Ysjh8/LrtGPz8/x2SSAHDu3Dl4eXmhTZs2jj6DBw9GixYtoNVq8dRTTzk
mnkxMTESPHj3Qs2dPSJKEbt26ISIiAmfOnPl9bxg1edw9RU1e8+bN8cILLwAAbt68iX/+85/YuHGj071IKlTc0rTiftdA+UHqgoICx+N72wwGA0pLSzF//nzHMiGEYwZYuaKjo/Hdd99h2LBhOHr0qNMoA4DjXicV27fZbDAYDMjLy8MPP/zguF83UD5S4e4p+r0YGkT3CAsLQ0xMDPbv319lu7e3N5RKJfLy8tC8eXMAQF5eXrX3s/by8oJGo8Hy5cvve8/rmjz88MNYv349fvnlFyQnJ+OZZ55xas/Pz3f8f15eHpRKJby9vREQEIBBgwY5zSpL9Edw9xQ1aTdv3sQ333zj+NLNy8vD8ePH0a5dOwCAr68vCgoKHPcfkSQJ/fr1w+effw6TyYTc3Fzs3bsXgwYNqnL9kiQhNjYWGzdudNyOt6CgwHEfdrk0Gg369OmDlStXom3btk6jGQA4evQobty4gdLSUmzfvt1xR7pBgwYhOTkZ586dc9w4KzU11SlkiH4LjjSoSdPpdLh8+TL27t0Lo9EId3d39OrVy/FLvkuXLo4D4pIkISEhAVOmTMGGDRvw4osvQqPRIDY21unGUr82YcIE7NixA2+88QaKiorg7++PuLg4dO/e/TfVWnGP6+eff75S2+DBg7F69WpkZWWhU6dOjntG6PV6zJs3D5s3b8aKFSsgSRLatm2LadOm/aZtE1XgLLdED4i8vDzMnj0bH3/8Mdzd3R3LFy5ciEGDBiE2NtaF1VFTwd1TRA8Au92OvXv3on///k6BQVTfGBpEDZzZbMakSZOQkpKCcePGubocauK4e4qIiGTjSIOIiGRjaBARkWwMDSIiko2hQUREsjE0iIhItv8PjLre/Fk97CgAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~15s\n", - "\n", - "# One time Setup\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "dict_store.append_many(annotations)\n", - "sql_store.append_many(annotations)\n", - "\n", - "rng = np.random.default_rng(123)\n", - "query_polygons = [\n", - " Polygon(\n", - " [\n", - " (x, y),\n", - " (x + 128, y),\n", - " (x + 128, y + 128),\n", - " (x, y),\n", - " ],\n", - " )\n", - " for x, y in rng.integers(0, 1000, size=(100, 2))\n", - "]\n", - "stmt = \"for polygon in query_polygons:\\n _ = store.query(polygon)\"\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": dict_store, \"query_polygons\": query_polygons},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": sql_store, \"query_polygons\": query_polygons},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"100 Polygon Queries\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1k1xOgB5pT5y" - }, - "source": [ - "Here we can see that performing queries within a polygon region is about\n", - "10x faster with the `SQLiteStore` than with the `DictionaryStore`.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iYFK95w1pT5y" - }, - "source": [ - "## 1.4) Predicate Query\n", - "\n", - "Here we query the whole annotation region but with a predicate to\n", - "select only annotations with the class label of 0. 
We also,\n", - "demonstrate how creating a database index can dramatically improve\n", - "the performance of queries.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zNX4UG4BpT5y", - "outputId": "97444739-4aa5-42c7-bebc-84a022282ac7" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~2m\n", - "\n", - "# Setup\n", - "labelled_annotations = copy.deepcopy(annotations)\n", - "for n, annotation in enumerate(labelled_annotations):\n", - " annotation.properties[\"class\"] = n % 10\n", - " annotation.properties[\"vector\"] = rng.integers(1, 4, 10).tolist()\n", - "\n", - "predicate = \"(props['class'] == ?) & (3 in props['vector'])\"\n", - "classes = rng.integers(0, 10, size=100)\n", - "stmt = \"for n in classes:\\n store.query(where=predicate.replace('?', str(n)))\"\n", - "\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "\n", - "dict_store.append_many(labelled_annotations)\n", - "sql_store.append_many(labelled_annotations)\n", - "\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": dict_store, \"predicate\": predicate, \"classes\": classes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "dict_result = dict_store.query(where=predicate.replace(\"?\", \"0\"))\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "sql_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n", - "\n", - "\n", - "# Add an index\n", - "# Note: Indexes may not always speed up the query (sometimes they can\n", - "# actually slow it down), test to make sure.\n", - "sql_store.create_index(\"class_lookup\", \"props['class']\")\n", - "sql_store.create_index(\"has_3\", \"3 in props['vector']\")\n", - "\n", - "# Time SQLite store again\n", - "sqlite_index_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\"store\": sql_store, \"predicate\": predicate, \"classes\": classes},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "sql_index_result = sql_store.query(where=predicate.replace(\"?\", \"0\"))\n", - 
"\n", - "# # Validate the results against each other\n", - "# for a, b, c in zip(dict_result, sql_result, sql_index_result):\n", - "# assert a.geometry == b.geometry == c.geometry # noqa: ERA001\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs, sqlite_index_runs],\n", - " title=\"100 Queries with a Predicate\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"SQLiteStore\\n(with index)\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gp8mq1TNpT5y" - }, - "source": [ - "### Polygon & Predicate Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Eu0hGvhdpT5y", - "outputId": "0d89174e-01e0-4e71-a9c3-e063ed30ca38" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~10s\n", - "\n", - "# Setup\n", - "labelled_annotations = copy.deepcopy(annotations)\n", - "for n, annotation in enumerate(labelled_annotations):\n", - " annotation.properties[\"class\"] = n % 10\n", - "\n", - "predicate = \"props['class'] == \"\n", - "classes = rng.integers(0, 10, size=50)\n", - "query_polygons = [\n", - " Polygon(\n", - " [\n", - " (x, y),\n", - " (x + 128, y),\n", - " (x + 128, y + 128),\n", - " (x, y),\n", - " ],\n", - " )\n", - " for x, y in rng.integers(0, 1000, size=(100, 2))\n", - "]\n", - "stmt = (\n", - " \"for n, poly in zip(classes, query_polygons):\\n\"\n", - " \" store.query(poly, where=predicate + str(n))\"\n", - ")\n", - "\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "\n", - "dict_store.append_many(labelled_annotations)\n", - "sql_store.append_many(labelled_annotations)\n", - "\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\n", - " \"store\": dict_store,\n", - " \"predicate\": predicate,\n", - " \"classes\": classes,\n", - " \"query_polygons\": query_polygons,\n", - " },\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "dict_result = dict_store.query(query_polygons[0], where=predicate + \"0\")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\n", - " \"store\": sql_store,\n", - " \"predicate\": predicate,\n", - " \"classes\": classes,\n", - " \"query_polygons\": query_polygons,\n", - " },\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "sql_result = sql_store.query(query_polygons[0], where=predicate + \"0\")\n", - "\n", - "\n", - "# Check that the set difference of bounding boxes is empty i.e. all sets\n", - "# of results contain polygons which produce the same set of bounding\n", - "# boxes. 
This avoids being tripped up by slight varations in order or\n", - "# coordinate order between the results.\n", - "dict_set = {x.geometry.bounds for x in dict_result}\n", - "sql_set = {x.geometry.bounds for x in sql_result}\n", - "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"100 Queries with a Polygon and Predicate\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kJ8x5tJmpT5y" - }, - "source": [ - "### Complex Predicate Query\n", - "\n", - "Here we slightly increase the complexity of the predicate to show how\n", - "the complexity of a predicate can dramatically affect the performance\n", - "when handling many annotations.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VHb4PqbHpT5y", - "outputId": "343b44c7-741d-4e11-9dd2-85f357ba6f32" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run time: ~1m\n", - "\n", - "# Setup\n", - "box = Polygon.from_bounds(0, 0, 1024, 1024)\n", - "labelled_annotations = copy.deepcopy(annotations)\n", - "for n, annotation in enumerate(labelled_annotations):\n", - " annotation.properties[\"class\"] = n % 4\n", - " annotation.properties[\"n\"] = n\n", - "\n", - "predicate = \"(props['n'] > 1000) & (props['n'] % 4 == 0) & (props['class'] == \"\n", - "targets = rng.integers(0, 4, size=100)\n", - "stmt = \"for n in targets:\\n store.query(box, where=predicate + str(n) + ')')\"\n", - "\n", - "dict_store = DictionaryStore()\n", - "sql_store = SQLiteStore()\n", - "\n", - "dict_store.append_many(labelled_annotations)\n", - "sql_store.append_many(labelled_annotations)\n", - "\n", - "\n", - "# Time dictionary store\n", - "dict_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\n", - " \"store\": dict_store,\n", - " \"predicate\": predicate,\n", - " \"targets\": targets,\n", - " \"box\": box,\n", - " },\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "dict_result = dict_store.query(box, where=predicate + \"0)\")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " stmt,\n", - " globals={\n", - " \"store\": sql_store,\n", - " \"predicate\": predicate,\n", - " \"targets\": targets,\n", - " \"box\": box,\n", - " },\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "sql_result = sql_store.query(box, where=predicate + \"0)\")\n", - "\n", - "\n", - "# Check that the set difference of bounding boxes is empty i.e. all sets\n", - "# of results contain polygons which produce the same set of bounding\n", - "# boxes. 
This avoids being tripped up by slight varations in order or\n", - "# coordinate order between the results.\n", - "dict_set = {x.geometry.bounds for x in dict_result.values()}\n", - "sql_set = {x.geometry.bounds for x in sql_result.values()}\n", - "\n", - "assert len(dict_set.difference(sql_set)) == 0 # noqa: S101\n", - "\n", - "# Plot the results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"100 Queries with a Complex Predicate\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CAT0KmS6pT5y" - }, - "source": [ - "# Part 2: Large Scale Dataset Benchmarking\n", - "\n", - "Here we generate some sets of anntations with five million items each\n", - "(in a 2237 x 2237 grid). One is a set of points, the other a set of\n", - "generated cell boundaries.\n", - "\n", - "The code to generate and write out the annotations to various formats is\n", - "included in the following cells. However, some of these take a very long\n", - "time to run. A pre-generated dataset is downloaded and then read from\n", - "disk instead to save time. However, you may uncomment the generation\n", - "code to replicate the original.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nwH5zYFupT5y" - }, - "source": [ - "## 2.1) Points Dataset\n", - "\n", - "Here we generate a simple points data in a grid. The grid is 2237 x 2237\n", - "and contains over 5 million points. We also write this to disk in\n", - "various formats. Some formats take a long time and are commented out. 
A\n", - "summary of times for a consumer laptop are shown in a table at the end.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2FjCL2jgpT5y" - }, - "outputs": [], - "source": [ - "# Generate some points with a little noise\n", - "# Run time: ~5s\n", - "points = np.array(\n", - " [\n", - " [x, y]\n", - " for x in np.linspace(0, 75_000, 2237)\n", - " for y in np.linspace(0, 75_000, 2237)\n", - " ],\n", - ")\n", - "# Add some noise between -1 and 1\n", - "rng_42 = np.random.default_rng(42)\n", - "points += rng_42.uniform(-1, 1, size=(2237**2, 2))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRWABSBVpT5z" - }, - "source": [ - "### 2.1.1) Writing To Disk\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x76WbSFdpT52" - }, - "outputs": [], - "source": [ - "# Save as a simple Numpy array (.npy)\n", - "# Run time: <1s\n", - "np.save(\"points.npy\", points)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dkKtM-DKpT52" - }, - "outputs": [], - "source": [ - "# Save as compressed NumPy archive (.npz)\n", - "# Run time: ~5s\n", - "np.savez_compressed(\"points.npz\", points)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rbHdEIbPpT52" - }, - "source": [ - "Note that the above numpy format is missing the keys (UUIDs) of each point.\n", - "This may not be required in all cases. However, for the sake of comparison\n", - "we also generate a NumPy archive with keys included. We store the UUIDs\n", - "as integers to save space and for a fair comparison where the optimal\n", - "storage method is used in each case. 
Note however that UUIDs are too\n", - "large to be a standard C type and therefore are stored as an object\n", - "array.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DbLm4l5tpT52" - }, - "outputs": [], - "source": [ - "# Generate UUIDs\n", - "# Run time: ~10s\n", - "keys = np.array([uuid.uuid4().int for _ in range(len(points))])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zXuAqw0KpT52" - }, - "outputs": [], - "source": [ - "# Generate some UUIDs as keys\n", - "# Save in NumPy format (.npz)\n", - "# Run time: <1s\n", - "np.savez(\"uuid_points.npz\", keys=keys, coords=points)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UAHAgPU4pT52" - }, - "outputs": [], - "source": [ - "# Save in compressed (zip) NumPy format (.npz)\n", - "# Run time: ~10s\n", - "np.savez_compressed(\"uuid_points_compressed.npz\", keys=keys, coords=points)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j5wlDFYfpT52" - }, - "outputs": [], - "source": [ - "# Write to SQLite with SQLiteStore\n", - "# Run time: ~10m\n", - "points_sqlite_store = SQLiteStore(\"points.db\")\n", - "_ = points_sqlite_store.append_many(\n", - " annotations=(Annotation(Point(x, y)) for x, y in points),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tUekiEqspT53" - }, - "outputs": [], - "source": [ - "# Load a DictionaryStore into memory by copying from the SQLiteStore\n", - "# Run time: ~1m 30s\n", - "points_dict_store = DictionaryStore(Path(\"points.ndjson\"))\n", - "for key, value in points_sqlite_store.items():\n", - " points_dict_store[key] = value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Uynntjq7pT53" - }, - "outputs": [], - "source": [ - "# Save as GeoJSON\n", - "# Run time: ~1m 30s\n", - "points_sqlite_store.to_geojson(\"points.geojson\")" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4YMuggcgpT53" - }, - "outputs": [], - "source": [ - "# Save as ndjson\n", - "# Run time: ~1m 30s\n", - "# Spec: https://github.com/ndjson/ndjson-spec\n", - "points_sqlite_store.to_ndjson(\"points.ndjson\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lW9NoCPwpT53" - }, - "source": [ - "### 2.1.2) Points Dataset Statistics Summary\n", - "\n", - "| Format | Write Time | Size |\n", - "| -----------------------------: | ---------: | -----: |\n", - "| SQLiteStore (.db) | 6m 20s | 893MB |\n", - "| ndjson | 1m 23s | 667 MB |\n", - "| GeoJSON | 1m 42s | 500 MB |\n", - "| NumPy + UUID (.npz) | 0.5s | 165 MB |\n", - "| NumPy + UUID Compressed (.npz) | 31s | 136 MB |\n", - "| NumPy (.npy) | 0.1s | 76 MB |\n", - "| NumPy Compressed (.npz) | 3.3s | 66 MB |\n", - "\n", - "Note that the points SQLite database is significantly larger than the\n", - "NumPy arrays on disk. The numpy array is much more storage efficient\n", - "partly because there is no R Tree index or unique identifier (UUID)\n", - "stored for each point. For a more fair comparison, another NumPy archive\n", - "(.npz) is created where the keys are stored along with the coordinates.\n", - "\n", - "Also note that although the compressed NumPy representation is much\n", - "smaller, it must be decompressed in memeory before it can be used. The\n", - "uncompressed versions may be memory mapped if their size exceeds the\n", - "available memory.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a_3Gz5Q0pT53" - }, - "source": [ - "### 2.1.3) Simple Box Query\n", - "\n", - "Here we evaluate the performance of performing a simple box query on the\n", - "data. All points which are in the area between 128 and 256 in the x and\n", - "y coordinates are retrieved. It is assumed that the data is already in\n", - "memory for the NumPy formats. 
In reality this would not the be case for\n", - "the first query, all data would have to be read from disk, which is a\n", - "significan overhead. However, this cost is amortised across many\n", - "queries. To ensure the fairest possible comparison, it is assumed that\n", - "many queries will be performed, and that this data loading cost in\n", - "negligable.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "o9J0d6gdpT53" - }, - "outputs": [], - "source": [ - "box = Polygon.from_bounds(128, 128, 256, 256)\n", - "\n", - "# Time numpy\n", - "numpy_runs = timeit.repeat(\n", - " (\n", - " \"where = np.all([\"\n", - " \"points[:, 0] > 128,\"\n", - " \"points[:, 0] < 256,\"\n", - " \"points[:, 1] > 128,\"\n", - " \"points[:, 1] < 256\"\n", - " \"], 0)\\n\"\n", - " \"uuids = keys[where]\\n\"\n", - " \"result = points[where]\\n\"\n", - " ),\n", - " globals={\"keys\": keys, \"points\": points, \"np\": np},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time SQLiteStore\n", - "sqlite_runs = timeit.repeat(\n", - " \"store.query(box)\",\n", - " globals={\"store\": points_sqlite_store, \"box\": box},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time DictionaryStore\n", - "dict_runs = timeit.repeat(\n", - " \"store.query(box)\",\n", - " globals={\"store\": points_dict_store, \"box\": box},\n", - " number=1,\n", - " repeat=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eX1qqUIipT53", - "outputId": "a4033a88-6b2d-4a55-f3f6-ba419ef748c0" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs, numpy_runs],\n", - " title=\"Points Box Query (5 Million Points)\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\", \"NumPy Array\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aNU6FP90pT53" - }, - "source": [ - "Although the NumPy array is very space efficient on disk, it is not as\n", - "fast to query as the `SQLiteStore`. The `SQLiteStore` is likely faster\n", - "due to the use of the R tree index. Furthermore, the method used to\n", - "store the points in a NumPy array is limited in that it does not use\n", - "UUIDs, which makes merging two datasets more difficult as the indexes of\n", - "points no longer uniquely identify them. Additionally, only homogeneous\n", - "data such as two-dimentional coordinates can be practically stored in\n", - "this way. If the user would like to store variable length data\n", - "structures such as polygons, or even mix data types by storing both\n", - "points and polygons, then using raw NumPy arrays in this way can become\n", - "cumbersome and begins to offer little benefit in terms of storage\n", - "efficient or query performance.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c766NXGPpT53" - }, - "source": [ - "### 2.1.4) Polygon Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6jiMpRnxpT53" - }, - "outputs": [], - "source": [ - "big_triangle = Polygon(\n", - " shell=[ # noqa: S604\n", - " (1024, 1024),\n", - " (1024, 4096),\n", - " (4096, 4096),\n", - " (1024, 1024),\n", - " ],\n", - ")\n", - "\n", - "# Time SQLiteStore\n", - "sqlite_runs = timeit.repeat(\n", - " \"store.query(polygon)\",\n", - " globals={\"store\": points_sqlite_store, \"polygon\": big_triangle},\n", - " number=1,\n", - " repeat=10,\n", - ")\n", - "\n", - "# Time 
DictionaryStore\n", - "dict_runs = timeit.repeat(\n", - " \"store.query(polygon)\",\n", - " globals={\"store\": points_dict_store, \"polygon\": big_triangle},\n", - " number=1,\n", - " repeat=10,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Es2OQ5OdpT53", - "outputId": "b98176ee-7003-49f7-f5ca-62b08180b2ee" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"Polygon Query (5 Million Points)\",\n", - " tick_label=[\"DictionaryStore\", \"SQLiteStore\"],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HUBEmZDMpT53" - }, - "source": [ - "## 2.2) Cell Boundary Polygons Dataset\n", - "\n", - "Here we generate a much larger and more complex polygon dataset. This\n", - "consistes of a grid of over 5 million generated cell boundary like\n", - "polygons.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xhCr_TDVpT53", - "outputId": "c02b7a20-6ab1-4cae-b6bb-fb5c6d94cd12" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5004169/5004169 [10:04<00:00, 8277.35it/s] \n" - ] - } - ], - "source": [ - "# Generate a grid of 5 million cell boundary polygons (2237 x 2237)\n", - "# Run time: ~10m\n", - "rng_42 = np.random.default_rng(42)\n", - "\n", - "cell_polygons = [\n", - " Annotation(geometry=polygon, properties={\"class\": rng_42.integers(0, 4)})\n", - " for polygon in tqdm(cell_grid(size=(2237, 2237), spacing=35), total=2237**2)\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "21RgwKtgpT54" - }, - "source": [ - "### 2.2.1) Write To Formats For Comparison\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "CDVLMRUtpT54" - }, - "outputs": [], - "source": [ - "# Write to an SQLiteStore on disk (SSD for recorded times here)\n", - "# Run time: ~30m\n", - "cell_sqlite_store = SQLiteStore(\"cells.db\")\n", - "_ = cell_sqlite_store.append_many(annotations=cell_polygons)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6Fb4tQHVpT54", - "outputId": "fba12c47-e0cb-44fd-ca95-35c38454c9cc" - }, - "outputs": [ - { - "name": 
"stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "# Create a copy as an in memory DictionaryStore\n", - "# Run time: ~5m\n", - "cell_dict_store = DictionaryStore()\n", - "for key, value in tqdm( # Show a nice progress bar\n", - " cell_sqlite_store.items(),\n", - " total=len(cell_sqlite_store),\n", - " leave=False,\n", - " position=0,\n", - "):\n", - " cell_dict_store[key] = value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wXOOuGWypT54", - "outputId": "e2fb300e-e5b8-4459-b172-249cda363b50" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5004169/5004169 [01:26<00:00, 58002.74it/s]\n" - ] - } - ], - "source": [ - "# Transform into a numpy array\n", - "# Run Time: ~1m\n", - "cell_polygons_np = np.array(\n", - " [np.array(a.geometry.exterior.coords) for a in tqdm(cell_polygons)],\n", - " dtype=object,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yv9VgW9TpT54" - }, - "outputs": [], - "source": [ - "# Create an Nx4 index of (xmin, ymin, xmax, ymax) as a simple spatial\n", - "# index to speed up the numpy query.\n", - "# Run time: ~1m\n", - "min_max_index = np.array(\n", - " [(*np.min(coords, 0), *np.max(coords, 0)) for coords in cell_polygons_np],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nFmHxwBwpT54" - }, - "outputs": [], - "source": [ - "# Write to GeoJSON\n", - "# Run time: ~10m\n", - "\n", - "cell_dict_store.to_geojson(\"cells.geojson\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2UH6WdmipT54" - }, - "outputs": [], - "source": [ - "# Write to line delimited JSON (ndjson)\n", - "# Run time: ~10m\n", - "\n", - "cell_dict_store.to_ndjson(\"cells.ndjson\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fw6wg5gapT54", - "outputId": 
"61a32277-fb8d-4bdc-be28-b379cb0a23eb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cells.ndjson : 40.82% ( 8.82 GiB => 3.60 GiB, cells.ndjson.zstd) \n" - ] - } - ], - "source": [ - "# Zstandard compression of ndjson to demonstrate how well it compresses.\n", - "# Gzip may also be used but is slower to compress.\n", - "# Run time: ~1m\n", - "! zstd -f -k cells.ndjson -o cells.ndjson.zstd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rzGC65zhpT55", - "outputId": "75ad772b-5641-4d64-ae16-7d50206e1b85" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cells.db : 75.87% ( 4.87 GiB => 3.69 GiB, cells.db.zstd) \n" - ] - } - ], - "source": [ - "# Zstandard compression of sqlite to demonstrate how well it compresses.\n", - "# Gzip may also be used but is slower to compress.\n", - "# Run time: ~20s\n", - "! zstd -f -k cells.db -o cells.db.zstd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xT0KZLxdpT55" - }, - "outputs": [], - "source": [ - "# Write as a pickle (list)\n", - "# Run time: ~2m\n", - "with Path(\"cells.pickle\").open(\"wb\") as fh:\n", - " pickle.dump(cell_polygons, fh)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-TAWGEu9pT55" - }, - "outputs": [], - "source": [ - "# Write as a pickle (dict)\n", - "# Run time: ~15m\n", - "with Path(\"cells-dict.pickle\").openI(\"wb\") as fh:\n", - " pickle.dump(cell_dict_store._rows, fh) # noqa: SLF001" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "I-W4o3GepT55" - }, - "outputs": [], - "source": [ - "# Write dictionary store to a pickle\n", - "# Run time: ~20m\n", - "with Path(\"cells.pickle\").open(\"wb\") as fh:\n", - " pickle.dump(cell_dict_store, fh)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dALe8k0BpT55" - }, - "outputs": [], - 
"source": [ - "# Write as numpy object array (similar to writing out with pickle),\n", - "# Numpy cannot handle ragged arrays and therefore dtype must be object.\n", - "# Run time: ~30m\n", - "np.save(\"cells.npy\", np.asanyarray(cell_polygons_np, dtype=object))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hOrGS0HgpT55" - }, - "outputs": [], - "source": [ - "# Create UUIDs, and get the class labels for each cell boundary\n", - "# Run time: ~2m\n", - "_uuids = [str(uuid.uuid4) for _ in cell_polygons]\n", - "_cls = [x.properties[\"class\"] for x in cell_polygons]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fs2cz8lVpT55" - }, - "outputs": [], - "source": [ - "# Write as NumPy archive (.npz) with uuid and min_max_index\n", - "# Run time: ~40m\n", - "np.savez(\n", - " \"cells.npz\",\n", - " uuids=_uuids,\n", - " polygons=cell_polygons_np,\n", - " min_max_index=min_max_index,\n", - " cls=_cls,\n", - ")\n", - "\n", - "del _uuids, _cls" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4gOTqc03pT55" - }, - "source": [ - "### 2.2.2) Time To Write Summary Statistics\n", - "\n", - "The following is a summary of the time required to write each format to\n", - "disk and the total disk space occupied by the final output.\n", - "\n", - "Note that some of these formats, such as GeoJSON compress well with\n", - "schemes such as gzip and zstd, reducing the disk space by approximately\n", - "half. Statistics for zstd compressed data is also reported below. 
It\n", - "should be noted that the data must be decompressed to be usable.\n", - "However, for gzip and zstd, this may be done in a streaming fashion from\n", - "disk.\n", - "\n", - "| Format | Write Time | Size |\n", - "| ----------------: | ---------: | -----: |\n", - "| SQLiteStore (.db) | 33m 48.4s | 4.9 GB |\n", - "| GeoJSON | 11m 32.9s | 8.9 GB |\n", - "| ndjson | 9m 0.9s | 8.8 GB |\n", - "| pickle | 1m 2.9s | 1.8 GB |\n", - "| zstd (SQLite) | 18.2s | 3.7 GB |\n", - "| zstd (ndjson) | 43.7s | 3.6 GB |\n", - "| NumPy (.npy) | 50.3s | 1.8 GB |\n", - "| NumPy (.npz) | 55.3s | 2.6 GB |\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wS3sGpnWpT55" - }, - "source": [ - "### 2.2.3) Box Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MKvKfkyvpT55" - }, - "outputs": [], - "source": [ - "# Run time: ~5m\n", - "\n", - "# Setup\n", - "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n", - "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n", - "\n", - "\n", - "# Time DictionaryStore\n", - "dict_runs = timeit.repeat(\n", - " \"store.query(box)\",\n", - " globals={\"store\": cell_dict_store, \"box\": box},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " \"store.query(box)\",\n", - " globals={\"store\": cell_sqlite_store, \"box\": box},\n", - " number=1,\n", - " repeat=3,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0Yo14C3kpT55", - "outputId": "764bc28b-3072-4887-ea88-4c88ffcefb5f" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"Box Query (5 Million Polygons)\",\n", - " tick_label=[\n", - " \"DictionaryStore\",\n", - " \"SQLiteStore\",\n", - " ],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ExF-fOGQpT56" - }, - "source": [ - "### 2.2.4) Polygon Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PcxKapqNpT56" - }, - "outputs": [], - "source": [ - "# Run Time: 35s\n", - "\n", - "# Setup\n", - "big_triangle = Polygon(\n", - " shell=[ # noqa: S604\n", - " (1024, 1024),\n", - " (1024, 4096),\n", - " (4096, 4096),\n", - " (1024, 1024),\n", - " ],\n", - ")\n", - "\n", - "\n", - "# Time DictionaryStore\n", - "dict_runs = timeit.repeat(\n", - " \"store.query(polygon)\",\n", - " globals={\"store\": cell_dict_store, \"polygon\": big_triangle},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time SQLite store\n", - "sqlite_runs = timeit.repeat(\n", - " \"store.query(polygon)\",\n", - " globals={\"store\": cell_sqlite_store, \"polygon\": big_triangle},\n", - " number=1,\n", - " repeat=3,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vqHA50DQpT56", - "outputId": "7e837f4c-ada9-400f-b5f3-c59430b137f3" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Plot results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs],\n", - " title=\"Polygon Query (5 Million Polygons)\",\n", - " tick_label=[\n", - " \"DictionaryStore\",\n", - " \"SQLiteStore\",\n", - " ],\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6m-E5AwapT56" - }, - "source": [ - "### 2.2.5) Predicate Query\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "whEn34rOpT56" - }, - "outputs": [], - "source": [ - "# Run Time: ~10m\n", - "\n", - "# Setup\n", - "xmin, ymin, xmax, ymax = 128, 12, 256, 256\n", - "box = Polygon.from_bounds(xmin, ymin, xmax, ymax)\n", - "predicate = \"props['class'] == 0\"\n", - "\n", - "# Time DictionaryStore\n", - "dict_runs = timeit.repeat(\n", - " \"store.query(box, predicate)\",\n", - " globals={\"store\": cell_dict_store, \"box\": box, \"predicate\": predicate},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time SQLiteStore\n", - "sqlite_runs = timeit.repeat(\n", - " \"store.query(box, where=predicate)\",\n", - " globals={\"store\": cell_sqlite_store, \"box\": box, \"predicate\": predicate},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "np_stmt = f\"\"\"\n", - "polygons = [\n", - " polygon\n", - " for polygon in tqdm(cell_polygons_np)\n", - " if np.all([\n", - " np.max(polygon, 0) >= ({xmin}, {ymin}), np.min(polygon, 0) <= ({xmax}, {ymax})\n", - " ])\n", - "]\n", - "\"\"\"\n", - "\n", - "# Time numpy\n", - "numpy_runs = timeit.repeat(\n", - " np_stmt,\n", - " globals={\"cell_polygons_np\": cell_polygons_np, \"np\": np, \"tqdm\": lambda x: x},\n", - " number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time shapely\n", - "shapely_runs = timeit.repeat(\n", - " \"polygons = [box.intersects(ann.geometry) for ann in cell_polygons]\",\n", - " globals={\"box\": box, \"cell_polygons\": cell_polygons},\n", - " 
number=1,\n", - " repeat=3,\n", - ")\n", - "\n", - "# Time box indexed numpy\n", - "numpy_index_runs = timeit.repeat(\n", - " \"in_box = np.all(min_max_index[:, :2] <= (xmax, ymax), 1) \"\n", - " \"& np.all(min_max_index[:, 2:] >= (xmin, ymin), 1)\\n\"\n", - " \"polygons = [p for p, w in zip(cell_polygons, in_box) if w]\",\n", - " globals={\n", - " \"min_max_index\": min_max_index,\n", - " \"xmin\": xmin,\n", - " \"ymin\": ymin,\n", - " \"xmax\": xmax,\n", - " \"ymax\": ymax,\n", - " \"np\": np,\n", - " \"cell_polygons\": cell_polygons,\n", - " },\n", - " number=1,\n", - " repeat=3,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oRxJTg7BpT56", - "outputId": "d235e51a-5109-486e-b779-fe39e5f6ee33" - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Run Time: ~5s\n", - "\n", - "# Plot results\n", - "plot_results(\n", - " experiments=[dict_runs, sqlite_runs, numpy_runs, shapely_runs, numpy_index_runs],\n", - " title=\"Box Query\",\n", - " tick_label=[\n", - " \"DictionaryStore\",\n", - " \"SQLiteStore\",\n", - " \"NumPy\\n(Simple Loop)\",\n", - " \"Shapely\\n(Simple Loop)\",\n", - " \"NumPy\\n(With Bounds Index)\",\n", - " ],\n", - ")\n", - "plt.xticks(rotation=90)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LJiGGkespT56" - }, - "source": [ - "## 2.3) Size vs Approximate Lower Bound\n", - "\n", - "Here we calculate an estimated lower bound on file size by finding the\n", - "the Shannon entropy of each file. This tells us the theoretical minimum\n", - "number of bits per byte. The lowest lower bound is then used as an\n", - "estimate of the minimum file size possible to store the annotation data.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0IO10faZpT56", - "outputId": "033c2530-072a-4aa5-cf34-c2298e90d86f" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Approximate Lower Bound Size: 3.60 GB\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r" - ] - } - ], - "source": [ - "# Run Time: ~5m\n", - "\n", - "\n", - "# Files to consider containing keys, geometry, and properties.\n", - "# Files which are missing keys e.g. 
cells.pickle are excluded\n", - "# for a fair comparison.\n", - "file_names = [\n", - " \"cells-dicionary-store.pickle\",\n", - " \"cells-dict.pickle\",\n", - " \"cells.db\",\n", - " \"cells.db.zstd\",\n", - " \"cells.geojson\",\n", - " \"cells.ndjson\",\n", - " \"cells.ndjson.zstd\",\n", - "]\n", - "\n", - "\n", - "def human_readible_bytes(byte_count: int) -> tuple[int, str]:\n", - " \"\"\"Convert bytes to human readble size and suffix.\"\"\"\n", - " byte_count_ref = 1024\n", - " for suffix in [\"B\", \"KB\", \"MB\", \"GB\", \"TB\"]:\n", - " if byte_count < byte_count_ref:\n", - " return byte_count, suffix\n", - " byte_count /= byte_count_ref\n", - " return byte_count, \"PB\"\n", - "\n", - "\n", - "def shannon_entropy(\n", - " fp: Path,\n", - " sample_size: int = 1e9, # 1GiB\n", - " stride: int = 7,\n", - " skip: int = 1e5, # 100KiB\n", - ") -> float:\n", - " \"\"\"Calculate the Shannon entropy of a file from a sample.\n", - "\n", - " The first `skip` bytes are skipped to avoid sampling low entropy\n", - " (highly ordered) parts which commonly occur at the beginning e.g.\n", - " headers.\n", - "\n", - " Args:\n", - " fp: File path to calculate entropy of.\n", - " sample_size: Number of bytes to sample from the file.\n", - " stride: Number of bytes to skip between samples.\n", - " skip: Number of bytes to skip before sampling.\n", - " \"\"\"\n", - " npmmap = np.memmap(Path(fp), dtype=np.uint8, mode=\"r\")\n", - " values, counts = np.unique(\n", - " npmmap[int(skip) : int(skip + (sample_size * stride)) : int(stride)],\n", - " return_counts=True,\n", - " )\n", - " total = np.sum(counts)\n", - " frequencies = {v: 0 for v in range(256)}\n", - " for v, x in zip(values, counts):\n", - " frequencies[v] = x / total\n", - " frequency_array = np.array(list(frequencies.values()))\n", - " epsilon = 1e-16\n", - " return -np.sum(frequency_array * np.log2(frequency_array + epsilon))\n", - "\n", - "\n", - "# Find the min across all of the representations for the lowest lower\n", - 
"# bound.\n", - "bytes_lower_bounds = {\n", - " path: (\n", - " shannon_entropy(Path(path)) / 8 * len(np.memmap(path, dtype=np.uint8, mode=\"r\"))\n", - " )\n", - " for path in tqdm(\n", - " [Path.cwd() / name for name in file_names],\n", - " position=0,\n", - " leave=False,\n", - " )\n", - "}\n", - "\n", - "lowest_bytes_lower_bound = min(bytes_lower_bounds.values())\n", - "\n", - "size, suffix = human_readible_bytes(lowest_bytes_lower_bound)\n", - "logger.info(\"Approximate Lower Bound Size: %2f %s\", size, suffix)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "chwB3zeupT56" - }, - "source": [ - "### Plot Results\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cu5jkrVppT56", - "outputId": "bb36aea5-d5d7-4560-a853-d2a8afba0eac" - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAbIAAAEeCAYAAADvrZCJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABMKElEQVR4nO3deVyN6f8/8NdpLymU6pMoCS0YDMqWkHUaBolhqEEyzBgMsoxtbNnJkghpjGGMicZYMwiFssyQNhJDqUOS1Emdc/3+6Nf97WhPp/vcx/v5eHg8nHs77+vcnPe5lvu6RIwxBkIIIUSg1PgOgBBCCPkQlMgIIYQIGiUyQgghgkaJjBBCiKBRIiOEECJolMgIIYQIGiUyohAuLi6YNGkS32GQEry8vODq6sp3GLCyssKKFSu413X5b0UkEuHAgQN18l6k7lAiI6V4eXlBJBJBJBJBQ0MDlpaWmDJlCl6+fMl3aAqXn58PY2Nj6Orq4sWLF7zG4urqCi8vr2qfd+DAAYhEolLbt2zZgiNHjtRCZJUr/vdT8o+Ojg4AIDo6GjNnzqz197xy5Qr69++Pxo0bQ0dHB5aWlnB3d8fjx4+5Y9LS0uDu7l7r7034RYmMlKlnz55IS0tDSkoK/P39cfToUYwfP57vsBTu6NGjsLS0RO/evREcHMx3OLXK0NAQDRs2rLP327ZtG9LS0rg/xQmlcePGqFevXq2+V1xcHPr164eWLVsiPDwccXFxCA4OhpWVFbKzs7njzMzMuIRKVAgj5D2enp6sb9++cttWrFjB1NTUWG5uLpPJZGzdunWsefPmTFNTk1lbW7NNmzbJHd+rVy82ceJExhhje/fuZYaGhuzt27dyxyxdupRZWVkxmUzGGGPs3LlzrE2bNkxbW5u1bduWXbx4kQFgP//8M3dOfHw8Gzx4MKtXrx6rV68ec3NzY0lJSdz+ffv2MXV1dXblyhXWoUMHpquryzp16sRiYmKqVHZnZ2e2ZcsWdvjwYdaqVatS+4vL9dNPPzFTU1PWsGFD5unpyXJyckp9foGBgaxZs2asfv36bMiQISwjI0PuWsHBwczOzo5paWmxJk2asIULF7KCggLuGgDk/ly4cIExxtiCBQuYra0t09XVZRYWFszH
x4dlZWUxxhi7cOFCqfM8PT3l4ipWlftoaWnJFi1axKZPn84aNmzITExM2A8//MAKCwsr/Bzfv2/vX3P58uWlPtOS/P39WevWrZm2tjazsbFhK1as4D6bsmzatIkZGxtXGNP7cS1ZsqTUZ1Xy82KMsbNnz7Ju3boxHR0dZm5uzry8vNiLFy+4/ffu3WP9+/dnhoaGTE9Pj9na2rKQkJBK4yC1ixIZKaWsRLZhwwYGgGVnZ7Nt27YxHR0dFhgYyBITE1lAQADT1tZmQUFB3PElv5xyc3NZgwYNWHBwMLdfKpUyS0tLtmLFCsYYY0+fPmW6urps4sSJLDY2loWHh7OOHTvKffHk5uayZs2asT59+rCYmBgWExPDXFxcWIsWLVh+fj5jrCiRiUQi1rNnTxYREcHi4uJYv379mLW1dYVfhIwxFhcXx7S0tJhYLGYSiYQ1bNiQSx4ly2VoaMhmzJjB4uLi2KlTp5ihoSFbvHix3OdnYGDARo8eze7evcuuXr3KmjVrxsaPH88dc+LECaampsZWrVrFEhIS2KFDh1iDBg3Yjz/+yBhjLCsri/Xs2ZN5eHiwtLQ0lpaWxpVx+fLlLCIigj169IiFh4ez1q1bc9fOz89n27ZtYwC484qT3Pv3tSr30dLSkjVo0ICtXr2aJSYmskOHDjF1dXW2d+/eCj/LD0lkS5YsYc2aNWN//PEHS05OZn/99Rdr2rQp99mUpTiukydPVjmuN2/ecJ9RWloaCwsLYxoaGmzfvn2MMcbOnz/PdHV1mb+/P0tMTGQ3btxgLi4urGfPntyPr7Zt27Ivv/ySxcbGsocPH7KTJ0+yP//8s8IYSO2jREZKef8LLzY2lllbWzNHR0fGGGMWFhZszpw5cufMmDGDNW/enHv9/pfTd999x7p37869Pn36NNPQ0GCpqamMsaJahqWlpdwv/VOnTsl98QQFBTFdXV0mFou5Y54/f850dHTY/v37GWNFiQwAu3nzJndMVFQUA8Di4+MrLPeMGTPYF198wb3+5ptv2Jdffil3TK9evVjbtm3ltvn4+DAnJyfutaenJzM2NmYSiYTbtnr1amZmZsa97tGjBxs5cqTcdTZv3sx0dHS4hNW3b1+52kF5/vjjD6alpcWkUiljjLGff/6ZldXY8v59rcp9tLS0ZJ9//rncMQMGDGCjR4+uMCYATFtbm6s516tXj0v2FSWyt2/fMl1dXXbq1Cm56+3fv58ZGhqW+35SqZRNnDiRiUQi1qhRIzZgwADm5+fHnjx5UiqushLskydPmJmZmdzn0atXL+br6yt33OPHjxkAdvv2bcYYYwYGBlziI/wRfB/Zjh07MGnSJPzwww+VHisWi/HTTz9h9uzZWLp06UcxeKGmLl68CH19fejq6qJNmzawtrbGwYMHkZ2djadPn8LZ2Vnu+F69eiElJQW5ubllXs/HxwdXr17F/fv3AQC7d+/GZ599hv/9738AgPv376Nz585QV1fnzunatavcNWJjY2Fvbw9jY2Num6mpKVq3bo3Y2Fhum0gkwieffMK9btKkCQAgPT293PJKJBKEhITA09OT2+bl5YU//vij1L+T9u3by71u0qRJqWvb2dlBW1u73GNiY2PL/AwlEgkePnxYbpwA8Mcff8DZ2Rnm5ubQ19fH2LFj8e7dOzx//rzC80qqzn2sSnnLsnLlSty5c4f7M3369ErPiY2NRV5eHkaMGAF9fX3uj4+PD16/fg2xWFzmeWpqaggKCkJqaiq2bdsGe3t7BAYGws7ODhcvXqzwPXNycvD555+ja9eu8PPz47ZHR0dj8+bNcnHY29sDAJKSkgAAs2fPxqRJk+Di4oKlS5fi1q1blZaR1D7BJzIXFxcsWLCgSsf+/PPPcHZ2xvr16+Hu7o6DBw8qODrhcnR0xJ07dxAXF4e8vDycO3cO1tbW3P73R8WxShZRcHBwQI8ePRAUFISMjAyEhYVh8uTJcse8f82yRt6VtY0xJrddTU1NLiEW
75PJZOXG9/vvvyMzMxPu7u7Q0NCAhoYGunXrhvz8fOzfv1/uWC0trVIxvX/tso55/zMq7zMsq4zFrl+/jpEjR8LZ2RmhoaG4desWdu7cCQB49+5dueeVpyr3sSrlLYupqSlsbGy4P0ZGRpWeU3zdI0eOyCXBu3fvIikpCY0aNarwfDMzM3z55ZfYuHEj4uPjYWlpiWXLllX4fmPGjIGmpiYOHDgANTU1uX2+vr5ycdy5cwdJSUkYNGgQAGDRokVITEyEh4cH7t27BycnJ/z444+VlpPULsEnMnt7e+jr68tte/78OVauXAlfX18sXrwYz549AwA8ffoUbdu2BVD0xRoTE1Pn8QqFrq4ubGxsYGVlJVezMDAwgIWFBS5duiR3fEREBJo3bw49Pb1yr+nj44OQkBDs2rULZmZmGDhwILfP3t4e0dHRkEql3LaoqCi58x0cHBAbGys3LD49PR2JiYlwcHCocVkBIDAwEF5eXqW+tObOnYvdu3d/0LXL4uDgUOZnqKury/1g0NLSkvs8gKIh5sbGxlixYgUcHR3RqlUrPH36VO6Y4sTz/rklfch9VCQHBwfo6OggOTlZLgkW/yn5A6UyWlpasLa2RkZGRrnHzJ49G3fu3MGff/5ZqsydOnVCbGxsmXGU/M6xtrbG1KlT8fvvv+Onn35CQEBA9QtOPojgE1lZdu3ahQkTJmDNmjUYN24cgoKCAACWlpa4fv06AODGjRvIy8vDmzdv+AxVkObPn4+tW7di9+7dSEpKQmBgIAICAiqtGRc/v7N8+XJMnDhR7tfv1KlTkZ6ejm+++QZxcXG4cOECFi5cCOD/ag1jxoxB48aNMWrUKNy6dQs3b97E6NGj0aRJE4waNarG5bl//z6uXLmCCRMmoE2bNnJ/fHx8EB8fj4iIiBpfvyzz58/H0aNH4efnh8TERPz2229YunQpfvjhBy4RNW/eHDdv3sTDhw/x4sULFBQUoHXr1hCLxdizZw+Sk5MREhKCHTt2yF27efPmAICwsDCIxWLk5OSUG0NN7qMi6evrY8GCBViwYAG2bduGhIQExMbG4tChQ/D19S33vMDAQPj4+ODMmTN48OAB4uLisGbNGpw6dQrDhg0r85zg4GDs2LGD+354/vw5nj9/jtevXwMAfvrpJxw/fhwzZ87EnTt38PDhQ5w+fRoTJ05EXl4ecnJyMG3aNPz999949OgRbt++jdOnT3PNj6TuaPAdQG2TSCRISEjAxo0buW2FhYUAgHHjxmHv3r24ePEi7Ozs0KhRo2r9wiNFvvnmG7x9+xarVq3C1KlT0bRpU/j5+WHixIkVnqejo4Nx48bB39+/1LFNmjRBWFgYZsyYgf3796N169ZYu3YtBg0axD33o6uri7Nnz2LmzJlc346LiwtOnz5dqvmrOgIDA2Fubo4ePXqU2mdtbY1OnTph165dpfqTPsTgwYOxd+9e+Pn5YfHixWjcuDGmTp2KJUuWcMf88MMPuHv3Lj755BO8ffsWFy5cgJubGxYuXIgFCxYgJycHvXr1wrp16zBmzBjuvM6dO+P777/HlClTIBaLMX78+DKfiavpfVS0RYsWwdzcHFu3bsXs2bOhq6uLVq1aVfhweJcuXRAVFYVp06YhNTUV2trasLa2xubNmzF16tQyz7l48SLy8/MxYMAAue2enp4IDg5G79698ffff2PZsmXo2bMnZDIZmjVrhgEDBkBTUxMikQivXr3CxIkTkZaWBgMDA/Tu3Rvr16+vzY+DVIGIVda5IQAZGRlYs2YNNmzYgNzcXMyYMQO7du2q8ByJRIIZM2Zw/Qukbnh4eCAvLw9//vlnpcdGRESgV69e+Pfff7kmYUIIeZ/KNS3q6enBxMSE619hjCElJQVA0Uit4s7k0NBQ9O7dm68wPzqvXr1CWFgYQkNDyx1hGhAQgMjISKSkpODkyZPw9vaGo6MjJTFCSIUEXyPbvHkz7t+/jzdv3sDQ0BAeHh5o06YNdu/ejays
LBQWFqJ79+5wd3fHtWvXcPDgQYhEItjZ2WHixInQ1NTkuwgfBSsrK7x8+RLTp0/HypUryzxm3rx5OHjwINLT02FmZoZ+/fphzZo1VRrtRgj5eAk+kRFCCPm4qVzTIiGEkI8LJTJCCCGCJvjh96mpqXyHUCljY2Pe17aqDapQDiqD8lCFclAZ6o65uXm5+6hGRgghRNAokRFCCBE0SmSEEEIETfB9ZO9jjEEikUAmk1U4i3hdSk9PR35+Pt9hfDBVKMfHWAbGGNTU1KCjo6M0/yf4cjh2HN8hVGiUw89VOs78zt1afd/ye5+qL7V93U9goHKJTCKRQFNTExoaylM0DQ0NlZjTURXK8bGWobCwEBKJBLq6ugqKihD+qFzTokwmU6okRogy0NDQqNIaYoQIkcolso+96YSQ8tD/DaKqVC6REUII+bhQIlOQU6dOoUmTJnjw4AFvMTx//hze3t61cq3Tp08jISGhWuccPnwYbdu2Rb9+/dC7d294e3sjLy+v0nOeP3/+IaECACIjIzF+/PgPvs6HqEn5P5QylJuQukaJTEGOHTuGLl264Pjx47V2zeIFQqvKzMwMu3fvrpX3Pn36NBITE6t93pAhQ3Du3DlcuHABWlpaCAsLq/D4I0eOID09vaZh8qqs+1Pd8hNCqo8SmQK8ffsWMTExWL9+vVwii4yMxPDhwzFx4kS4uLjA19eX64Bv2bIlli1bhgEDBsDDwwMvX74EALi7u2P16tUYMWIEgoKCcPnyZfTv3x99+/bFrFmzkJ+fjzt37sDV1RUSiQS5ubno3bs34uPj8d9//6FPnz4AimoHEyZMgKenJ5ycnLBv3z4EBgaif//+cHNzw6tXrwAAv/zyCwYPHgxXV1euBhEdHY1z585h2bJl6NevH1JSUpCSkoKxY8di4MCBGDZsWKU1z8LCQuTm5sLQ0BA5OTlwcnJCQUEBAODNmzdwdHTEn3/+iX/++Qfffvst+vXrh7y8PPz7778YMWIEBg4ciDFjxnBJbs+ePXBxcYGrqyu++eabKt+bP/74A3379kWfPn245WTCwsKwdOlSAEBQUBC6du0KAEhJScEXX3wBAOXG8f79qUr5AeDp06fw8PCAq6srPDw88OzZMwDAjBkzcOLECe68li1bAij6t+Pu7g5vb290794d3377LYoXrrhw4QKcnZ3xxRdf4NSpU1X+LAhRFSo/vM/d3b3UNjc3N3h5eSEvLw/jxpV+rmTkyJEYNWoUMjMzMXnyZLl9v//+e6Xvefr0abi4uKBFixZo0KAB/v33X9jb2wMA7ty5gwsXLsDCwgJjx47FyZMn4ebmhtzcXLRt2xZLlizBpk2bsHHjRu6LNjs7G0ePHoVEIkGPHj1w+PBhtGjRAtOnT0dISAi8vb3Rr18/rF27FhKJBMOHD4etrS3+++8/ubgSEhJw5swZ5Ofno3v37liwYAHOnj2LJUuW4Pfff4e3tzcGDRqEsWPHAgDWrFmDX3/9FRMmTEC/fv0wYMAADBo0CEDRSs9+fn6wtrbGrVu3MH/+fBw5cqTUZxEWFoYbN24gIyMD1tbW6NevH9TV1dG1a1ecP38eAwcOxPHjxzF48GB8/vnn2L9/PxYtWoRPPvkEBQUF+PHHH7Fv3z4YGRnh+PHjWLNmDTZu3Ijt27cjKioK2traeP36daX3BChqal2xYgVOnToFQ0NDfPnllzh9+jScnJy4lcKvX7+Ohg0bIi0tDTdu3ICjo2OFcZS8P2Upq/wAsHDhQri7u8PDwwOHDh3CokWLsHfv3grjv3fvHv7++29YWFjgs88+Q3R0NNq1a4c5c+bgt99+Q/PmzTFlypQqfRaEqBKqkSnAsWPHMHToUADA0KFDERoayu1r3749LC0toa6uji+++AI3btwAAKipqWHIkCEAgOHDh3PbAXDbHz58iGbNmqFFixYAihLu9evXAQAzZ85EREQE/v33X0ydOrXMuLp16wZ9fX0YGRmhfv363JeqnZ0d
l/QSEhIwbNgw9O3bF6GhoWX2i719+xY3b96Ej48P+vXrB19fX2RkZJT5nsVNa3fu3IGtrS0CAgIAAGPGjMHhw4cBFNUWR40aVerchw8fIiEhAaNHj0a/fv3g7++PtLQ0LuZvv/0WR48erfLjFv/88w+6desGIyMjaGhoYPjw4bh27RpMTEzw9u1b5OTkIC0tDV988QWuX7+OGzduoEuXLhXGUVzG8pRX/ps3b2LYsGEAgBEjRsjd7/K0b98e5ubmUFNTg4ODA/777z88ePAAzZo1g7W1NUQiEUaMGFGlz4IQVaLyNbKKalC6uroV7m/UqFGVamAlZWZmIjIyEgkJCRCJRJBKpVBTU8OCBQsAlB4CXd6Q6JLb9fT0AAAVrYGalZWF3NxcFBYWIj8/nzunJC0tLe7vampq0NbW5t5LKpUCKEqIe/bsgYODAw4fPoyoqKhS15HJZDAwMMC5c+fKjaes8vTr1w/79u3Dt99+i86dO+O///5DVFQUZDIZbG1tS53DGEOrVq3w559/ltoXEhKCa9eu4ezZs9i8eTMuXLhQaUKr6PP79NNPcfjwYVhbW8PR0RGHDh3CzZs3sXjxYjx79qzcOACU+Vm/7/3yl7UfkH/eizHGNb8C8vdPXV2d65OjYfXkY0c1slr2119/cb+wr1+/jpiYGDRr1oz7xX3nzh08efIEMpkMYWFh6NKlC4Ci5PDXX38BAEJDQ7ntJdnY2OC///7Do0ePAABHjx6Fk5MTAGDu3LmYM2cOhg0bxjVJ1kROTg5MTU1RUFAgV5PU19dHTk4OAKB+/fpo2rQp98XOGENsbGyl175x4wYsLS251+7u7pg2bRo8PDy4bfXq1ePep0WLFsjMzERMTAwAoKCgAAkJCZDJZEhNTUX37t3x448/Ijs7G2/fvq30/Tt06ICoqChkZmZCKpXi2LFjXH+Yo6Mjdu7cCScnJ7Rp0waRkZHQ0tKCgYFBuXFUV8nyd+rUies//eOPP7j7bWFhgbt3i6YfOnPmjFwiK4uNjQ2ePHmClJQUAEWtAYR8bFS+RlbXjh8/jmnTpslt++yzzxAaGoohQ4agY8eOWLVqFeLj4+Ho6Mj1Oenp6SEhIQEDBw5E/fr1uT6bknR0dLBx40b4+PhAKpXik08+wbhx43DkyBFoaGhg2LBhkEqlGDp0KK5cuSKXNKpqzpw5cHNzg4WFBWxtbbmkMnToUMydOxe7d+/Grl27sG3bNsyfPx9btmxBYWEhhg4dCgcHh1LXK+4jYozhf//7HzZt2sTtGz58ONatW8cNqACK+t7mzZsHHR0dhIWFITAwEIsXL0Z2djakUikmTZoEa2trfPfdd3jz5g0YY/D29uYGUZR09epVfPrpp9zrwMBALFiwACNHjgRjDH369MGAAQMAFCWy1NRUODo6Ql1dHebm5rCxsQFQVBMqK47WrVtX+nmWV/7ly5dj1qxZ2LlzJxo1asRtHzt2LL7++mt89tln6NGjR6W1PR0dHaxduxbjx49Ho0aN0KVLF8THx1caFyGqRMQqam8RgPcX1szNza1SU09d0tDQQGFhISIjI7Fz506EhISUOqZly5ZISkriIbqqKy5HbTlx4gTOnDmDrVu31to1K1PbZeBDTcugbP83+FjQkSYNVjxFTRpc0cKaVCMjvPjxxx9x4cKFMpM6IYRUByWyOtStWzd069atzH3KXhurbStWrOA7BEKIilC5wR4CbyklRGHo/wZRVXVSI0tNTZXr5M/IyICHhwc+++wzbltsbCzWrl0LExMTAEWd72U9zFwZNTU1FBYW0lIuhJRQWFgINTWV+91KCIA6SmTm5uZYt24dgKJh5j4+PmUOL7ezs8O8efM+6L10dHQgkUiQn5+vNM/XaGtrC35VYkA1yvExlqHkCtGEqKI6r7bcvXsXZmZmaNy4sUKuLxKJlG4VXD5GZymCKpSDykCI6qnzRHb16lV07969zH2JiYmYM2cOGjZsiHHjxqFp06aljgkP
D0d4eDgAwM/PD8bGxgqNtzZoaGgIIs7KqEI5qAzKQ1XKUZtU4fPgowx1+hxZYWEhfHx8sGHDBjRo0EBuX25uLtf8cevWLQQHB8Pf37/Sa77/HJkyUpVf0KpQDiqD8qDnyEqj58jKV9FzZHXa+3v79m00b968VBIDima2KG7D79ixI6RSKbKzs+syPEIIIQJUp4msombFrKwsbnjwgwcPIJPJUL9+/boMjxBCiADVWR9Zfn4+/v33X7n1vc6ePQsA6N+/PzeTubq6OrS0tDBjxgylGXVICCFEedVZItPW1i61cGD//v25vw8cOBADBw6sq3AIIYSoCHpCkhBCiKBRIiOEECJolMgIIYQIGiUyQgghgkaJjBBCiKDVOJGlp6dDLBbXZiyEEEJItVU5kW3evBkJCQkAgAsXLmDWrFmYNWsW/v77b4UFRwghhFSmyons3r17aNGiBQDgxIkTWLRoEVatWoVjx44pKjZCCCGkUlV+ILp4scrMzEzk5OTA1tYWAPD69WuFBUcIIYRUpsqJzMrKCqGhoRCLxejYsSMAIDMzU+nW/iKEEPJxqXLT4pQpU/DkyRO8e/cOo0ePBlC0fliPHj0UFhwhhBBSmSrXyMzMzPD999/LbXNycoKTk1OtB0UIIYRUVZUTGWMM58+fR2RkJLKzs7F+/Xrcv38fWVlZ6NatmyJjJIQQQspV5abFw4cP48KFC+jbty+3qquRkRGOHz+usOAIIYSQylQ5kV26dAm+vr7o3r07t06YiYkJMjIyFBYcIYQQUpkqNy3KZDLo6OjIbZNIJKW2lWfatGnQ0dGBmpoa1NXV4efnJ7efMYZ9+/bh9u3b0NbWxtSpU2FtbV3V8AghhHykqpzIOnTogJCQEHh6egIoSjyHDx/Gp59+WuU3W7JkCQwMDMrcd/v2bTx//hz+/v5ISkpCUFAQVq1aVeVrE0IUy+TB/Nq94APApJYulWGzupauRISoyk2L48ePR2ZmJry8vJCbm4vx48dDLBZjzJgxtRJITEwMnJ2dIRKJ0KpVK7x9+xavXr2qlWsTQghRXVWukenp6WHu3Ll4/fo1xGIxjI2N0aBBg2q92cqVKwEA/fr1g6urq9y+zMxMGBsbc6+NjIyQmZmJhg0byh0XHh6O8PBwAICfn5/cOcpKQ0NDEHFWRhXKQWX4AA/q/i2rSuj3tJgqlIOPMlQ5kc2dOxdr166FoaEhDA0Nue3z5s0r1d9VluXLl6NRo0Z4/fo1VqxYAXNzc9jb23P7GWOlzikeVFKSq6urXBIsHkGpzIyNjQURZ2VUoRxUhpqrrWZARRD6PS1W1XKYKziOD6Goe2FuXn6pq9y0+Pz581LbGGNIT0+v0vmNGjUCABgaGqJz58548ED+552RkZHcB/Dy5ctStTFCCCHkfZXWyLZt2wagaNLg4r8XE4vFaNq0aaVvIpFIwBiDrq4uJBIJ/v33X7i7u8sd06lTJ5w+fRrdu3dHUlIS9PT0KJERQgipVKWJzNTUtMy/i0QitG7dGl27dq30TV6/fo3169cDAKRSKXr06IH27dvj7NmzAID+/fujQ4cOuHXrFqZPnw4tLS1MnTq12oUhhBDy8ak0kY0cORIA0LJlS7Rv375Gb2Jqaop169aV2t6/f3/u7yKRCJMmTarR9QkhhHy8qtxHdvDgQfz111+0/hghhBClUuVRiyNGjMDly5dx6NAh2NnZwdnZGV26dIGWlpYi4yMfuT8PZ9XyFWv3ep+PalDpMf7+/rX6nrVt+vTpfIdAyAepciJzdHSEo6MjcnJyEBkZiTNnziAoKAhdunSBs7Mz2rRpo8g4CSGEkDJVOZEV09fXR69evaCjo4OwsDBcv34dcXFxUFNTw8SJE9GuXTtFxEkIIYSUScTKehK5DDKZDP/++y8iIiJw69YttGrVSq558dq1a9izZw92796t6JjlpKamftD57z8GAABubm7w8vJCXl4exo0bV2r/yJEjMWrUKGRmZmLy5Mml
9o8bNw5Dhw7Fs2fP8P3330NTUxMFBQXc/smTJ6N///548OAB5s2bV+r86dOnw9nZGffu3cPSpUtL7ff19UXnzp0RHR2NNWvWlNq/dOlStGnTBr2X/oLU87+U2m81fCZ0TJoi634knkf8Xmq/9eh50Gpggsw7F5Bx7c9S+1uMWwLNeoZ4EXMaL2LOltrfcsIqqGvpICPyODL/vVRqv+2UjTg+1hY7d+7kZmkppqOjgwMHDgAANm3ahOOh8ufr6xnC5+stAIDQE5uQ/Pgfuf0NDU0x4auiz+S30NX4LzVBbr9pY0t85bEMAHDgtyVIFz+W29/UvDU8hhXNKbj3gC9evZZ/TtLa8hMMc5sJAAjc9z109N/K7e/evTtmziza/9VXX0EikeDp06fc/hYtWsDR0RFAUb9zqc/G1hYdO3ZEQUEBjhw5Ump/27Zt0bZtW+Tm5uLYsWOl9nfo0AF2dnbIzs7GiRMnSu3v0qULbGxs8PLlS5w5cwYAYGFhwe2v6N+eZl4ylnt3QLc2Joi8l4FFu2+Xuv6GbzujfctGOB+TilU/3y21f8cPTmjdzBAnrv6HTb/dL7U/eGEPNDWph9/+foTA44ml9h9e1gvGDXSw/9QDhJx+yG0v0C2aYPznn3+Grq4ugoODyyz/yGW6AIALR57g/nX5B3c1tdQweVV7AMDZA4+QdEd+ijw9A018vbgtAODEnod4HCc/XsDQWBtfzXMAAIQGJCL1YY7c/sZN9OAx0xYA8NumeIif5crtN2+hj0PbrgMAvvvuO6Slpcnt//TTTzF/ftG/ze9GeuDlm2y5/X0/aY9FX44FAAxa/CPy3uXL7Xfr7IjZI4q+71zmzcH7PHo4Y6rb58iVSDB46aJS+7369oNXv/548fo13FevKLX/m8FuGOXcC9GNG5VahPn330t/z1RXRQ9EV7lG5uPjAwMDAzg7O+Orr77iHnAu5uTkxP3HEDp2cCekV/+AVCoDS0gqvT84FdLwXyB9VwiW8LDUftmudZCe2ANZ3juwhEd49/7+bSsgPbINshwJWMLj0udvWgzpzwaQZeeCJfxXW8UihBCVVOUa2cOHD9GiRQsARc+FxcfHo0mTJnK/5vjwoTWyski9h9T6NWuT+u6wKh039Jd4BUdSc8fH2lbpuNof7FG7PqbBHrU++30tqurs94djS7ewKJNRDj9X6TjzO6Vru8oitX1bhVz3g2pkmZmZ2Lt3L54+fYpWrVrh888/x5IlS6Cmpoa3b9/i22+/Rffu3Ws1YEIIIaSqKn2ObNeuXahXrx48PT0hk8mwcuVKTJkyBUFBQZg1axZCQ0PrIk5CCCGkTJUmssTERHh7e6NDhw7w9vbG69ev0blzZwBA586dIRaLFR4kIYQQUp5KE5lUKoWGRlELpLa2NnR0dMpcXoUQQgjhQ6V9ZFKpFPfu3eNey2SyUq8JIYQQvlSayAwNDREQEMC91tfXl3ttYGCgmMgIIYSQKqg0kW3fvr0u4iCEEEJqpMqz3xNCCCHKqNpzLdbEixcvsH37dmRlZUEkEsHV1RWDBw+WOyY2NhZr166FiYkJgKJJisuaPooQQggpqU4Smbq6OsaNGwdra2vk5eVh3rx5aNeuXalZQezs7Mqce5AQQggpT500LTZs2BDW1kWTeurq6qJJkybIzMysi7cmhBCi4uqkRlZSRkYGHj16BBsbm1L7EhMTMWfOHDRs2BDjxo1D06ZN6zo8QgghAlOniUwikWDDhg3w8vKCnp6e3L7mzZtjx44d0NHRwa1bt7Bu3boyJ1sNDw/nlv7w8/ODsbFxrceZXvkhvFJEmeta1cuQpcgwPthHdS8eKDaOD6EK9wFQjXLwUYY6S2SFhYXYsGEDevbsya3HVFLJxNaxY0fs2bMH2dnZpZ5Tc3V1haurK/f6xQv5NYU+BqpQZlUoA6Aa5ahqGUwUHMeHUIX7AFS9HOXPA88/Rd2Lima/r5M+MsYYdu7ciSZNmsDNza3MY7KyslC8
osyDBw8gk8lQv379ugiPEEKIgNVJjSwhIQERERFo1qwZ5swpWpn0yy+/5DJ3//79ce3aNZw9exbq6urQ0tLCjBkzaE5HQgghlaqTRGZra4vffvutwmMGDhyIgQMH1kU4hBBCVAjN7EEIIUTQKJERQggRNEpkhBBCBI0SGSGEEEGjREYIIUTQKJERQggRNEpkhBBCBI0SGSGEEEGjREYIIUTQKJERQggRNEpkhBBCBI0SGSGEEEGjREYIIUTQKJERQggRNEpkhBBCBI0SGSGEEEGrk4U1AeDOnTvYt28fZDIZ+vbtiy+++EJuP2MM+/btw+3bt6GtrY2pU6fC2tq6rsIjhBAiUHVSI5PJZNizZw8WLFiATZs24erVq3j69KncMbdv38bz58/h7++PyZMnIygoqC5CI4QQInB1ksgePHgAMzMzmJqaQkNDA926dUN0dLTcMTExMXB2doZIJEKrVq3w9u1bvHr1qi7CI4QQImB10rSYmZkJIyMj7rWRkRGSkpJKHWNsbCx3TGZmJho2bCh3XHh4OMLDwwEAfn5+MDc3r/2A/4qp/WvyIHqOAj6bOuYzU/hl8PPz4zuE2mG+n+8IylXVfyUzzc8rNI46o4jvvVrCR2R1UiNjjJXaJhKJqn0MALi6usLPz09QXw7z5s3jO4RaoQrloDIoD1UoB5VBOdRJIjMyMsLLly+51y9fvixV0zIyMsKLFy8qPIYQQgh5X50kshYtWiAtLQ0ZGRkoLCxEZGQkOnXqJHdMp06dEBERAcYYEhMToaenR4mMEEJIpeqkj0xdXR0TJkzAypUrIZPJ0Lt3bzRt2hRnz54FAPTv3x8dOnTArVu3MH36dGhpaWHq1Kl1EVqdcHV15TuEWqEK5aAyKA9VKAeVQTmIWFmdU4QQQohA0MwehBBCBI0SGSGEEEGjREYIIUTQKJERQggRtDqbNPhjkZOTU+F+fX39Ooqk5k6cOFHhfjc3tzqK5MOlpqYiKCgIr1+/xoYNG/D48WPExMRgxIgRfIdWIxKJBDo6OnyHUSPp6enYt28fkpKSuKnoPD09YWpqyndo1ZKZmQmxWAypVMpts7e35zEiQomslvn6+kIkEoExhhcvXkBfXx+MMbx9+xbGxsbYvn073yFWKi8vD0BREnj48CH3zN/NmzdhZ2fHZ2jVFhgYiHHjxmHXrl0AAEtLS/j7+wsukSUkJGDnzp2QSCQICAhASkoKwsPDMWnSJL5DqzJ/f38MGDAAc+bMAQBcvXoVW7ZswapVq3iOrOoOHDiAqKgoWFhYcDMPiUQiwSWy69ev45dffsHr168BFM2sJBKJsH+/8k5DVhFKZLWsOFHt2rULnTp1QseOHQEUze5/9+5dPkOrspEjRwIAVqxYgTVr1kBXV5fbvnHjRj5Dq7Z3797BxsZGbpuamvBa1Pfv34+FCxdi7dq1AAArKyvExcXxHFX1MMbg7OzMvXZ2dsaZM2d4jKj6oqOjsXnzZmhqavIdygc5cOAAfH19YWFhwXcotUJ4/6MF4uHDh1wSA4AOHTrg/v37PEZUfS9evICGxv/91tHQ0IBYLOYxouqrX78+nj9/zv16vnbtmmBnjCk5qTYgnISck5ODnJwcODg44NixY8jIyIBYLMbx48fRoUMHvsOrFlNTU7kmRaFq0KCByiQxgGpkCmNgYICjR4+iZ8+eEIlEuHz5MurXr893WNXi7OyMBQsWoHPnzhCJRLhx44bcL2ohmDhxInbt2oVnz57Bx8cHJiYm+O677/gOq9qMjIyQkJAAkUiEwsJCnDx5Ek2aNOE7rCop2dwOAOfOneP2iUQiuLu78xVatWlpaWHOnDlo27at3I+8CRMm8BhV9VlbW2PTpk3o3LmzXO3S0dGRx6hqjmb2UJCcnBwcOXIEcXFxEIlEsLOzg7u7uyAGe5SUnJyM+Ph4AICdnR2aN2/Oc0Q1I5FIwBjjmkmFJjs7G8HBwbh79y4YY2jXrh2+/vprwf04ErqLFy+W
ud3FxaVO4/hQO3bsKHO7UKcGpESmIFFRUejatWul25TZ1q1bS9VeytqmjFRp5KXQXb9+vcL9QqsFFBYWIjU1FQBgbm4uVzMj/KA7oCDHjh0rlbTK2qbMnj59KvdaJpMhOTmZp2iqp3jkpdDt3bu3wv1CaNK6efMmAOD169dITEyEg4MDACA2NhYODg6CSmSxsbHYvn07GjduDKCoH3natGmCG7X48uVL7N27l2uubt26Nb7++mu5BZCFhBJZLbt9+zZu376NzMxMuS+hvLw8wXTOh4aGIjQ0FO/evYOnpyeAohFnGhoagpkpu3jkZU5OTqnm3IyMDD5CqhFra2u+Q/hgxc1Vfn5+2LhxIzfY5tWrV9izZw+foVVbSEgIfvzxR25l+tTUVGzZsgVr1qzhObLq2bFjB3r06IFZs2YBAC5fvowdO3Zg0aJFPEdWM5TIalnDhg1hbW2NmJgYuS8hXV1dLikou2HDhmHYsGE4ePAgxowZw3c4H2TNmjWYP38+9PT0ABTVMjdt2oQNGzbwHFnVFPe9ZGRkwMTERG7fgwcPeIio5sRisdyIUUNDQ6SlpfEYUfVJpVIuiQFFTYtCHMWYnZ2N3r17c69dXFzw119/8RjRh6FEVsusrKxgZWWFHj16cG3nOTk5ePnypeAGenTs2JGbSSIiIgKPHj3C4MGDuWYVIRg2bBiXzFJTU7Ft2zZMnz6d77CqbcOGDfD19UWjRo0AAPfv38eePXsEk5CBotkvVq5cie7duwMAIiMjuWZGobC2tkZAQAA3evfy5cuCrDUbGBggIiICPXr0AABcuXJF0AOHaLCHgixduhRz586FTCbDnDlzYGBgAHt7e8HUygBg9uzZWLduHR4/foxt27ahT58+uH79OpYtW8Z3aNVy48YNhIWFIS8vD7Nnz8b//vc/vkOqtgcPHmDPnj3w9fVFcnIyfv31V/j6+pZ6tkzZ3bhxg3ue0t7eHl26dOE5ouopKCjAmTNnEB8fD8YY7OzsMGDAAME9IP3ixQvs2bMHiYmJAMD1kQnpR2pJVCNTkNzcXOjp6eH8+fPo3bs3PDw8MHv2bL7DqhZ1dXWIRCLExMRg8ODB6NOnDy5dusR3WFXy/iCJvLw8mJiY4NSpUwCEMUiiJBsbG3z99ddYsWIFNDU1sWjRIhgYGPAdVrV16dJFcMmrJE1NTbi5ucHNzY1raRFaEgOKHq739fXlO4xaQ4lMQaRSKV69eoWoqCiMHj2a73BqREdHB6Ghobh8+TKWLVsGmUyGwsJCvsOqkvebe4TY/AMUDZAonpUEAPLz86Gnp4eAgAAAEPyXUWBgIHx8fPgOo8pUoaUFKJqiavjw4dDS0sKqVavw+PFjeHp6Cm7Cg2KUyBTE3d0dK1euhK2tLWxsbJCeng4zMzO+w6qWmTNn4sqVK5gyZQoaNGiAFy9eYMiQIXyHVSXFgyQkEgm0tLS4EaMymQwFBQU8RlY9Qvm8a6pfv358h1AtqtDSAgD//PMPvvrqK9y4cQONGjXCrFmzsGzZMsEmMmGMBxegli1bYv369dzs5KampoKaqRwAwsPD4eTkxM14b2xsLKgkAADLly/Hu3fvuNfv3r3D8uXLeYyoeuzt7WFvbw9jY2PY2Nhwr21sbATXP1aSTCZDbm6u4GrKJVtaSs6lKjTFIy1v3bqFHj16CG4g2vsokSnIt99+i82bN8t9ia5evZrHiKrv9OnTWLlyJe7du8dtKzlPnhC8e/dObv0uHR0d5Ofn8xhRzWzcuFHuOUQ1NTVs2rSJx4iqb8uWLcjNzYVEIsGsWbMwY8YMhIWF8R1WtRS3tJiZmQm2pQUAPv30U8yYMQPJyclo06YNsrOzBdnXV4wSmYI0a9YMdnZ2WLRoEZ4/fw4AENoA0UaNGmHhwoU4ePAg94UjtDLo6OjIzUaSnJwMLS0tHiOqGalUWmolAqH0VxZ7+vQp9PT0EB0djQ4dOmDHjh2IiIjgO6xq6dq1a6mWFiE2LY4d
OxYrVqyAn58fNDQ0oK2tjblz5/IdVo1RH5mCiEQiDBgwAJaWllizZg3Gjh0r12kvFMbGxli6dCmCgoKwceNGuRqmEHh6emLTpk1ys0nMnDmT56iqz8DAADExMdwip9HR0YJ77kcqlaKwsBDR0dEYOHAgNDQ0BPN/4vjx4xg6dGiZU4aJRCLo6+ujZ8+eSl87u3fvHtq0aVPm/JfF5bC1tRXMLETFKJEpSHHNxdbWFosXL8bmzZvx7NkznqOqnuL+Cy0tLUydOhWnT58WzFyLxWxsbLBp0ybBT/Lq7e2NrVu3clM6GRkZ4dtvv+U5qupxdXXFtGnTYGVlBTs7O4jFYsGsRlC8ZE55fXo5OTnYsGED1q1bV5dhVdv9+/fRpk0bbv7L97158wZHjx4V3FRV9EC0grx69UpuOh6pVIqEhATBTS4qVBX98gSEN+N6MaEvR/M+qVQKdXV1vsOoFefOnRPcKMyyBAQE4JtvvuE7jGoR3k9TgXh/FWJ1dXW5QQdC9dtvv8HDw4PvMCpV2S9PoSSyiIgIODs7l7ssjRCWo1GFJXXef57vfb6+voJKYrm5udx6iUDR6Fh3d3fo6ekJLokBlMjq1NmzZzFlyhS+w/ggQhkuXZxshbpQYLHiEZZCXpZGyLEXK36e7/r168jKykLPnj0BAFevXhXktE47duxAs2bNuP7iiIgI7NixQ5ADVwBqWiQq7s2bNzhy5AgSEhIAFPVZuru7C26gBFEOS5YsKTXXaFnblN2cOXNK9eeVtU0oqEamQJmZmRCLxXLLPAipjyw9PR379u1DUlISRCIRWrVqBU9PT5iamvIdWpVt3rwZdnZ2+OGHHwAUzVa+efNmwXVmC/leqMLioMWys7ORnp7Ofe4ZGRnIzs7mOarq09LSQnx8PGxtbQEA8fHxgnwspRglMgU5cOAAoqKiYGFhwbWti0QiQSUyf39/DBgwAHPmzAFQ1IyyZcsWrFq1iufIqi4nJwfu7u7c6xEjRiA6OprHiGpGyPdCKM3RVeHp6YmlS5dyiUwsFsPb25vnqKrP29sb27dvR25uLgCgXr16gm6Gp0SmINHR0di8ebOgn5ZnjMnNvebs7IwzZ87wGFH1OTg44OrVq+jatSsA4Nq1a4KcWkjI96J43stixWvcCVH79u3h7+/PPUrTpEkTQf4ft7Kywrp167hEVrzwrFBRH5mCrFq1CrNmzRLkf9icnBwARQ+B1qtXD926dYNIJEJkZCQKCgrkajjKbvz48cjPz5ebNFhbWxtAUQ15//79fIZXZb/88kuZ92LgwIEAIIi58hITExEQEACJRIKAgACkpKQgPDxccHOQJiQklOoy6NWrF48RVV9WVhZ+/fVXvHr1CgsWLMDTp0+RmJiIPn368B1ajVCNTEG0tLQwZ84ctG3bVu4BXCH0B/j6+kIkEnEPdZecX1EkEgkqkYWEhPAdQq2IjIwEUHquywsXLkAkEmHbtm18hFUtwcHBWLhwIdauXQugqFZQPPxbKLZu3Yr09HRYWVnJzX4htES2Y8cOuLi4IDQ0FADwv//9D5s2baJERuR16tSJm05IaLZv3853COQ9qnJP3p+xX2hTISUnJ2Pjxo2CmVqrPG/evEG3bt1w7NgxAEXPuQrtXpREiUxBXFxcUFhYKMipkcqbDaOYUB4mLs/cuXO5WoGQZWVloUGDBnyHUWVGRkZISEiASCRCYWEhTp48yU39JBRNmzZFVlZWqQkPhEZbWxtv3rzhEnJiYqKg+8moj0xBYmNjsX37du5hyRcvXmDatGmCGLW4Y8cOAMDr16+RmJgIBwcHAEVlcnBwEOxDk6pm9erVmD9/Pt9hVFl2djaCg4Nx9+5dMMbQrl07TJgwQRD9e8WWLVuGlJQU2NjYyP0wFdpK3cnJydi3bx+ePHmCZs2aITs7G7NmzYKlpSXfodUMIwoxd+5c9uzZM+71s2fP2Ny5c3mMqPpWr17NMjMzudeZmZls3bp1PEZUfT///HOV
thHF27p1K3vz5g33+s2bN2z79u08RlR9sbGxZf4RosLCQvbkyRP2+PFjVlBQwHc4H0QYbV0CJJVKYW5uzr02NzeXG+UkBGKxWK4JxdDQEGlpaTxGVH13794tte3OnTv46quveIjmw8THxyMtLQ29e/dGdnY2JBIJTExM+A6ryp48eSJX+9LX10dKSgp/AdWAEFpUquL97oO0tDTo6emhWbNmMDQ05CmqmqNEpiDW1tYICAjgnv25fPmy4B4Mtbe3x8qVK9G9e3cARSPnipsZld3Zs2dx5swZpKenyzWF5uXloXXr1jxGVjNHjhzBw4cPuURWWFiIrVu3Yvny5XyHVmWMMeTk5HDJLCcnR3A/7soSGBgIHx8fvsOolr///luu2+D+/fto2bIl0tLS4O7uLvfMohBQIlMQb29vnDlzBqdOnQJjDHZ2dhgwYADfYVXLxIkTcePGDdy/fx9A0XpSXbp04TmqqunRowfat2+PgwcPYuzYsdx2XV1dQfXJFLtx4wbWrl3L9cU0atRIcJPxurm5YdGiRXB0dIRIJEJUVBSGDx/Od1gfTEiz3hcTiUTYtGkTN1goKysLQUFBWLVqFZYsWUKJjBTR1NSEm5sb3NzckJOTg5cvXwpyBoAuXboIJnmVpKenBz09PQwePBj6+vrc+l15eXlISkpCy5YteY6weopXUy4eZSaRSHiOqPp69eqFFi1a4N69e2CMYfbs2bCwsOA7rBqTyWSQSCSCa2kBiroNSo54Le420NfXF+T6cMJ9cEDJLV26FLm5ucjJycGcOXOwY8cOwcwiUZHAwEC+Q6iWoKAgudlVtLW1ERQUxGNENdO1a1fs2rULb9++RXh4OJYvX46+ffvyHVa1WVhYYODAgRg0aJAgk9iWLVuQm5sLiUSCWbNmYcaMGQgLC+M7rGqzs7ODn58fLl68iIsXL2Lt2rWws7ODRCJBvXr1+A6v2qhGpiC5ubnQ09PD+fPn0bt3b3h4eKjEsHWhNaMwxuQeXlVTUxNcvwxjDN26dUNqaip0dXWRmpqKUaNGoV27dnyH9tF5+vQp9PT0cPnyZXTo0AFjx47FvHnzuPXKhGLixIm4fv064uPjARTVloubfJcsWcJzdNVHiUxBpFIpXr16haioKIwePZrvcD6YUJtRTE1NcfLkSfTv3x9A0SAQIY30A4r6M9atW4c1a9ZQ8uKZVCpFYWEhoqOjMXDgQK7JV2hEIhGcnJzg5OTEdyi1gpoWFcTd3R0rV66EmZkZbGxskJ6eDjMzM77DqhZVaEbx9vZGYmIipkyZgm+++QZJSUmCG2EGAC1btsSDBw/4DuOj5+rqimnTpiE/Px92dnYQi8Vc/6vQCa3boCSa2YOUq3jF2MuXLyM5OZlrRlm/fj3foX10Zs6cidTUVJiYmEBbW5trMqV7wT+pVCrIARLvS05OFlyLSzFqWqxlx48fx9ChQ8tcFVckEkFfXx89e/YURO1MyM0oFd0HQBirEJS0YMECvkP4qJ04caLC/W5ubnUUSe0TardBSZTIalnxJKjl/aPIycnBhg0bsG7duroMq0aKm1GsrKwE14xS2X0QmsaNGyMlJYXrnLe1tYWVlRW/QX1EhPbMXmW2bNkCb29vqKmpYd68ecjNzYWbm5vgBq0Uo6ZFHpw7d05wo/+KqUozitCcPHkS58+f557pu3HjBlxdXTFo0CCeIyNCpGrdBlQjq2V+fn4VNr/5+voqfRJThWaUqtwHIfn777+xcuVK7pm4oUOH4scff6REVkfKa6IuJrSmaiF3G5SFElktK66aX79+HVlZWejZsycA4OrVq9ySLspOFZpRVOE+lMQYk1v4UE1NDdSYUndUpYm6mJC7DcpCTYsKsmTJEixbtqzSbUSxVOU+nDhxApcuXULnzp0BANHR0ejVq5cgaseqSCKRyM0YowqE3G1ANTIFyc7ORnp6OkxNTQEAGRkZyM7O5jmqqlGlZhQh34eS3NzcYG9vzw32mDp1Kpo3
b85zVB+fxMREBAQEQCKRICAgACkpKQgPD8ekSZP4Dq1KVKHboCyUyBTE09MTS5cu5b5AxWIxvL29eY6qalSpGaWs+zB58mSeo6q+rVu34rvvvpO7N8XbSN0JDg7GwoULsXbtWgCAlZUV4uLieI6q6lSh26AslMgUpH379vD398ezZ88AFA0HF8rs9y4uLnKvhdyMIuT7UNLTp0/lXstkMiQnJ/MUzcfN2NhY7nXJvktlN3LkSL5DUAhKZAqUnJwMsVgMqVSKx48fAyianFMohN6MUkxTUxNWVlaCXAAxNDQUoaGhePfuHTw9PQEUDfzQ0NCAq6srz9F9fIyMjJCQkACRSITCwkKcPHmSe2ZRCFSp26AkSmQKsnXrVqSnp8PKykruF5uQEpnQm1HeJ8QazLBhwzBs2DAcPHgQY8aM4Tucj563tzeCg4ORmZmJKVOmoF27doL6YadK3QYlUSJTkOTkZGzcuFHQz2YAwm5GeZ+BgQHfIdRYx44duSbeiIgIPHr0CIMHDxbkowRCFhISggkTJnCrjOfk5CAkJARTp07lObKqUaVug5KE+62k5Jo2bYqsrCy+w/gg7zejhIWFCaoZ5X3z589Hbm4u32HUSFBQELS1tZGSkoKwsDA0btwY27Zt4zusj86TJ0+4JAYA+vr6SElJ4S+gGkpMTMTMmTMxc+ZMAEBKSoogF5wtRjUyBXnz5g1mzZoFGxsbaGj838cspBklhN6MAqjOnHLq6uoQiUSIiYnB4MGD0adPH1y6dInvsD46jDHk5OTI1ciEtlAroHrdBpTIFEQVRgcJvRkFUJ0VfXV0dBAaGorLly9j2bJlkMlkKCws5Dusj46bmxsWLVrEraYcFRWF4cOH8x1WjahStwElMgWxt7fnO4QPpgrNKKoyp9zMmTNx5coVTJkyBQ0aNMCLFy8El4xVQa9evdCiRQvcu3cPjDHMnj0bFhYWfIdVbUIfffk+4aZgARLaCqzFzSjFhNiMoior+jZo0ABubm6ws7PDzZs3YWxsLKgRsKrEwsICAwcOxKBBgwSZxICiboMzZ85w3QYpKSmC6zYoieZarENCW4H10qVLOHbsWKlmFGdnZ75D+yBCnlMOKOpnXbNmDd9hEAHbtm0bvLy8BN1tUBI1LdYBoa7AKuRmFFWdUw4AzXpPPpgqdBuURIlMQVRltJyFhYVgkldJqjannEwm4zrjhThXJFEuqjL6shglMgVRldFyQqUKo0ZL+u677+Dk5ITevXvDxsaG73CIwKnS6EuAEpnCqMpoOaFStTnl1q9fj6tXr2Lnzp1gjKF3797o1q0b9PT0+A6NCJCQuw3KQolMQVRtBVahEVp/ZGV0dXXh6uoKV1dX3L9/H1u2bMH+/fvh6OgId3d3mJmZ8R0iERihdhuUhUYt1iGhj5YTMqHPKSeTyXDr1i1cuHABYrEYzs7O6NGjB+Lj4/Hrr79iy5YtfIdICG+oRlbLVHm0nBCpylI006dPh4ODA4YMGYLWrVtz252cnHD//n0eIyOEf5TIapmqjZYTOlWYU04mk8HFxQXu7u5l7hdafx8htY0SWS1TtdFyqkDoc8qpqakhNja23ERGyMeOElktU7XRckKnKnPKtWrVCnv27EG3bt2gra3NbVe1QS2E1AQN9qhlFy9erHD/+wvbEcXKzs5GcHAw7t69C8YY2rVrJzejv1AsW7aszO1Lliyp40gIUT6UyBRM6KPlhE7V5pQjhJRGTYsKoiqj5YROleaUu3XrFv777z8UFBRw26jfjBBaxkVhikfL1a9fH4AwR8upAlVYigYAdu3ahcjISJw+fRqMMURFRUEsFvMdFiFKgWpkCiT00XKqQFXmlEtMTMT69esxe/ZsjBw5Ep9//jnWr1/Pd1iEKAVKZAqiKqPlhE5V5pTT0tICAGhrayMzMxP169dHRkYGz1ERohxosIeCqMpoOaIcfv/9dwwaNAh3797Fnj17IBKJ0KdPH4wePZrv0AjhHSUyBaHR
ckRRCgoKUFBQQDPfE/L/UdOigqjSaDmiHBISEiAWi+UGq/Tq1YvHiAhRDpTIFETVVmAl/Nq6dSvS09NhZWUlN2iIEhkhlMgURlVGyxHlkJycjI0bN9LirISUgRKZgqjKaDmiHJo2bYqsrCw0bNiQ71AIUTo02IMQAVi2bBlSUlJgY2MDDY3/+/3p6+vLY1SEKAeqkREiALQ8ECHloxoZIQKRlZWFhw8fAgBsbGxgaGjIc0SEKAdKZIQIQGRkJA4cOAB7e3sAQFxcHMaNGwcnJyeeIyOEf9S0SIgAhIaGYvXq1VwtLDs7G8uXL6dERgho9ntCBEEmk8k1Jerr60Mmk/EYESHKg2pkhAhA+/btsXLlSnTv3h1AUVNjhw4deI6KEOVAfWSECMS1a9eQkJAAxhjs7e3RpUsXvkMiRClQIiOEECJo1LRIiBJbtGgRli9fjvHjx8tNT8UYg0gkwv79+3mMjhDlQDUyQgghgkajFgkRgMTEROTl5XGvJRIJkpKSeIyIEOVBiYwQAQgKCoKOjg73WktLC0FBQTxGRIjyoERGiAAU94kVU1NTo/XtCPn/KJERIgCmpqY4efIkCgsLUVhYiJMnT8LExITvsAhRCjTYgxABeP36Nfbt24d79+5BJBKhTZs28PLyoomDCQElMkIIIQJHz5ERosSOHz+OoUOHYu/evWXunzBhQh1HRIjyoURGiBJr0qQJAMDa2prnSAhRXtS0SAghRNCoRkaIEvPz85Mbdv8+X1/fOoyGEOVEiYwQJTZkyBAAwPXr15GVlYWePXsCAK5evYrGjRvzGRohSoMSGSFKzN7eHgBw+PBhLFu2jNveqVMnLFmyhK+wCFEq9EA0IQKQnZ2N9PR07nVGRgays7N5jIgQ5UGDPQgRgDt37iAwMBCmpqYAALFYjMmTJ+OTTz7hOTJC+EeJjBCBKCgowLNnzwAUDcvX1NTkOSJClAM1LRIiEJqamrCyssKZM2coiRFSAiUyQgQmOTmZ7xAIUSqUyAgRGAMDA75DIESpUB8ZIQIjk8kgkUigp6fHdyiEKAWqkREiAFu2bEFubi4kEglmzZqFGTNmICwsjO+wCFEKlMgIEYCnT59CT08P0dHR6NChA3bs2IGIiAi+wyJEKVAiI0QApFIpCgsLER0djc6dO0NDQ6PCORgJ+ZhQIiNEAFxdXTFt2jTk5+fDzs4OYrEYurq6fIdFiFKgwR6ECJRUKoW6ujrfYRDCO5o0mBAlduLEiQr3u7m51VEkhCgvSmSEKLG8vDy+QyBE6VHTIiGEEEGjGhkhSmzv3r0V7p8wYUIdRUKI8qJERogSs7a25jsEQpQeNS0SIiASiQQ6Ojp8h0GIUqHnyAgRgMTERMycORMzZ84EAKSkpCAoKIjnqAhRDpTICBGA4OBgLFy4EPXr1wcAWFlZIS4ujueoCFEOlMgIEQhjY2O512pq9N+XEIAGexAiCEZGRkhISIBIJEJhYSFOnjyJJk2a8B0WIUqBBnsQIgDZ2dkIDg7G3bt3wRhDu3btMGHCBOjr6/MdGiG8oxoZIQIQEhIil7hycnIQEhKCqVOn8hwZIfyjRnZCBODJkydytS99fX2kpKTwFxAhSoQSGSECwBhDTk4O9zonJwdSqZTHiAhRHtS0SIgAuLm5YdGiRXB0dIRIJEJUVBSGDx/Od1iEKAUa7EGIQDx9+hT37t0DYwxt27aFhYUF3yERohQokRFCCBE06iMjhBAiaJTICCGECBolMkIUZNy4cUhPTwcAbN++HYcOHeI5IkJUE41aJOQDTZs2DVlZWXJzH27ZsgU///xzrVzfw8MDTZs2xbp167j3OHToEF6+fIlp06bVynsQImSUyAipBb6+vmjXrp3Crv/q1StERkaiR48eCnsPQoSKEhkhCuLh4QF/f3+YmZmV2nfz5k0cOnQIYrEYFhYW8Pb2hqWlZbnXGjJkCH777Td07doV6urqpfZv3LgRcXFxePfuHaysrDBp0iQ0bdoU
QFGzpra2NjIyMhAXFwcrKyv88MMPOHbsGC5dugRDQ0N8//33aN68OQAgMzMTe/fuRVxcHHR0dPDZZ59h8ODBtfSpEFL7qI+MkDqWnJyMgIAATJ48GXv37oWrqyvWrl2LgoKCcs9xdHSErq4uLl68WOb+9u3bw9/fH0FBQWjevDn8/f3l9kdFRWH06NHYs2cPNDQ0sHDhQjRv3hx79uyBk5MTQkJCAAAymQxr1qyBlZUVAgMDsXjxYpw8eRJ37typreITUusokRFSC9atWwcvLy94eXlh7dq1FR57/vx5uLq6omXLllBTU4OLiws0NDSQlJRU7jkikQijRo3C77//XmbC69OnD3R1daGpqYmRI0fi8ePHyM3N5fZ37twZ1tbW0NLSQpcuXaClpYVevXpBTU0N3bp1w6NHjwAADx8+RHZ2Ntzd3aGhoQFTU1P07dsXkZGRNfxkCFE8alokpBbMmTOnyn1kL168wKVLl3D69GluW2FhITIzMys8r2PHjjA2NkZ4eLjcdplMhl9//RXXrl1DdnY2RCIRgKKlX/T09AAADRo04I7X0tKCoaGh3GuJRAIAEIvFePXqFby8vOSub2dnV6WyEcIHSmSE1DEjIyMMHz68RnMljh49Gps3b5Yb9HHlyhXExMRg0aJFaNy4MXJzc/H111/XKDZjY2OYmJiUapokRJlR0yIhdaxv3744d+4ckpKSwBiDRCLBrVu3kJeXV+m5Dg4OaNasGS5dusRty8vLg4aGBvT19ZGfn49ff/21xrHZ2NhAV1cXx44dw7t37yCTyfDkyRM8ePCgxtckRNGoRkZIHWvRogV8fHywd+9epKWlQUtLC7a2tlVuvhs9ejQWLlzIve7Vqxf++ecfTJkyBfr6+hg1ahTOnj1bo9jU1NTg6+uLkJAQTJs2DYWFhTA3N8eoUaNqdD1C6gJNGkwIIUTQqGmREEKIoFEiI4QQImiUyAghhAgaJTJCCCGCRomMEEKIoFEiI4QQImiUyAghhAgaJTJCCCGC9v8AXTLRpHcIHbQAAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get file sizes\n", - "file_sizes = {\n", - " path: path.stat().st_size for path in [Path.cwd() / name for name in file_names]\n", - "}\n", - "\n", - "# Sort by size\n", - "file_sizes = dict(sorted(file_sizes.items(), key=lambda x: x[1]))\n", - "\n", - "# Plot\n", - "plt.bar(\n", - " x=range(len(file_sizes)),\n", - " height=file_sizes.values(),\n", - " tick_label=[p.name for p in file_sizes],\n", - " color=[f\"C{i}\" for i in range(len(file_sizes))],\n", - ")\n", - "plt.xlabel(\"File Name\")\n", - "plt.ylabel(\"Bytes\")\n", - "plt.xticks(rotation=90)\n", - "plt.hlines(\n", - " y=lowest_bytes_lower_bound,\n", - " xmin=-0.5,\n", - " xmax=len(file_sizes) - 0.5,\n", - " linestyles=\"dashed\",\n", - " color=\"black\",\n", - " label=\"Approximate Bytes Lower Bound\",\n", - ")\n", - "plt.legend()\n", - "plt.tight_layout()\n", - "plt.title(\"Polygon Annotation File Sizes\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gmuEWlImpT57" - }, - "source": [ - "The SQLite representation (4.9GB) appears to be quite compact compared\n", - "with GeoJSON and ndjson. Although not as compact as a dictionary pickle\n", - "or Zstandard compressed ndjson, it offers a good compromise between\n", - "compactness and read performance.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Yhe5rMXPpT57" - }, - "source": [ - "# 3: Extra Bits\n", - "\n", - "## 3.1) Space Saving\n", - "\n", - "A lot of space can be saved by rounding the coordinates to the nearest\n", - "integer when storing them. 
Below we make a copy of the dataset with all\n", - "coordinates rounded.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H2Jsc0repT57", - "outputId": "d2ca9eff-b67d-4bfc-ad5a-57c87bc6a7da" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 10008338/10008338 [51:00<00:00, 3270.16it/s] \n" - ] - } - ], - "source": [ - "# Run Time: ~50m\n", - "! rm integer-cells.db\n", - "int_cell_sqlite_store = SQLiteStore(\"integer-cells.db\")\n", - "\n", - "# We use batches of 1000 to speed up appending\n", - "batch = {}\n", - "batch_size = 1000\n", - "for key, annotation in tqdm(cell_sqlite_store.items(), total=len(cell_sqlite_store)):\n", - " geometry = Polygon(np.array(annotation.geometry.exterior.coords).round())\n", - " rounded_annotation = Annotation(geometry, annotation.properties)\n", - " batch[key] = rounded_annotation\n", - " if len(batch) >= batch_size:\n", - " int_cell_sqlite_store.append_many(batch.values(), batch.keys())\n", - " batch = {}\n", - "_ = int_cell_sqlite_store.append_many(batch.values(), batch.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U6aooIROpT57" - }, - "source": [ - "Here the database size is reduced to 2.9GB, down from 4.9GB.\n", - "Additionally, when using integer coordinates, the database compresses\n", - "much better. Zstandard can compress to approximately 60% of the\n", - "original size (and 35% of the floating point coordinate\n", - "database size). This may be done for archival purposes.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q3TJ8XX4pT57", - "outputId": "b99d1af7-4c68-4394-cf9a-8bb2b64471a0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "integer-cells.db : 60.58% ( 2.86 GiB => 1.73 GiB, integer-cells.db.zstd) \n" - ] } - ], - "source": [ - "# Run time: ~15s\n", - "! 
zstd -f -k integer-cells.db -o integer-cells.db.zstd" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "alFRiIAbpT57" - }, - "source": [ - "With higher (slower) compression settings the space can be further\n", - "reduced for long term storage.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nVFqovfPpT57", - "outputId": "0948bbe6-4252-4c93-eab7-8e3be4e98235" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "integer-cells.db : 51.22% ( 2.86 GiB => 1.47 GiB, integer-cells.db.19.zstd) \n" - ] + ], + "metadata": { + "colab": { + "provenance": [] + }, + "interpreter": { + "hash": "a3ed8fb525a8bde66cc7655a5df08d8d0f8699a69b9eb5ccab28dc0a7837eec6" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" } - ], - "source": [ - "# Run time: ~20m\n", - "! zstd -f -k -19 --long integer-cells.db -o integer-cells.db.19.zstd" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "C3voJ43OpT57" - }, - "source": [ - "## 3.2) Feature Comparison Summary\n", - "\n", - "Here we briefly summarise some of the positives and negatives of each format and construct a comparison matrix.\n", - "\n", - "**GeoJSON**\n", - "\n", - "*Positives*\n", - "\n", - "- Simple, based JSON which is well known.\n", - "- Well defined with a public specification.\n", - "- Popular format for geometry, many tools which work with it.\n", - "- Fast to write.\n", - "\n", - "*Negatives*\n", - "\n", - "- Requires loading the whole file into memory for parsing. 
Some\n", - " specialised parsers can, in some situations, reduce or avoid this but\n", - " it is not possible in general.\n", - "- Not a very compact representation.\n", - "\n", - "**ndjson (One GeoJSON Feature Per Line)**\n", - "\n", - "*Positives*\n", - "\n", - "- Simple.\n", - "- Better to parse than JSON/GeoJSON. Each line can be parsed\n", - " independently.\n", - "- Many tools to parse JSON lines.\n", - "- Fast to write.\n", - "\n", - "*Negatives*\n", - "\n", - "- Not a very compact representation.\n", - "- Requires loading the whole dataset from disk before querying OR\n", - " scanning through and reparsing each line for each query.\n", - "- Amending annotations can be tricky. The easiest way is to blank out a\n", - " line and append a modified copy each time. This could end up\n", - " fragmenting the file and wasting a lot of space. More complex methods\n", - " could be developed to reduce fragmenting the file.\n", - "\n", - "**pickle**\n", - "\n", - "*Positives*\n", - "\n", - "- Fast to write.\n", - "\n", - "*Negatives*\n", - "\n", - "- Vulnerable to arbitrary code execution when loading from disk.\n", - "- Requires loading the whole dataset into memory for querying.\n", - "\n", - "**SQLite (SQLiteStore Flavour)**\n", - "\n", - "*Positives*\n", - "\n", - "- Very fast to query (uses an R-TREE index to accelerate\n", - " spatial queries).\n", - "- Does not require loading data into memory before querying.\n", - "- Possible to index property lookups.\n", - "\n", - "*Negatives*\n", - "\n", - "- Not the most compact representation on disk.\n", - "\n", - "### Feature Matrix\n", - "\n", - "| Format | Size On-Disk | Size In-Memory | Partial Reads | Serialization | Query Performance |\n", - "| ----------: | :----------- | :------------- | :------------ | :------------ | :---------------- |\n", - "| SQLiteStore | Medium | Small | Yes | Slow | Fast |\n", - "| GeoJSON | Large | Large | No | Fast | Slow |\n", - "| ndjson | Large | Large | Yes | Fast | Medium |\n", - "| 
pickle | Small | Medium | No | Medium | Slow |\n", - "\n" - ] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "interpreter": { - "hash": "a3ed8fb525a8bde66cc7655a5df08d8d0f8699a69b9eb5ccab28dc0a7837eec6" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/benchmarks/annotation_store_alloc.py b/benchmarks/annotation_store_alloc.py index d5b6df9cb..82c642ada 100644 --- a/benchmarks/annotation_store_alloc.py +++ b/benchmarks/annotation_store_alloc.py @@ -102,7 +102,7 @@ import warnings from pathlib import Path from tempfile import NamedTemporaryFile -from typing import TYPE_CHECKING, Any, Generator +from typing import TYPE_CHECKING, Any sys.path.append("../") @@ -151,6 +151,7 @@ def __exit__(self: memray, *args: object) -> None: ) if TYPE_CHECKING: # pragma: no cover + from collections.abc import Generator from numbers import Number diff --git a/docker/3.8/Debian/Dockerfile b/docker/3.11/Debian/Dockerfile similarity index 91% rename from docker/3.8/Debian/Dockerfile rename to docker/3.11/Debian/Dockerfile index 9c4e5ecc8..3b399ddac 100644 --- a/docker/3.8/Debian/Dockerfile +++ b/docker/3.11/Debian/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8-slim-bullseye +FROM python:3.11-slim-bullseye #get linux packages RUN apt-get -y update && apt-get -y install --no-install-recommends \ diff --git a/docker/3.11/Ubuntu/Dockerfile b/docker/3.11/Ubuntu/Dockerfile new file mode 100644 index 000000000..72d7adee8 --- /dev/null +++ b/docker/3.11/Ubuntu/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:22.04 AS builder-image + +# To avoid tzdata blocking the build with 
frontend questions +ENV DEBIAN_FRONTEND=noninteractive + +# Install python3.11 +RUN apt-get update && \ + apt install software-properties-common -y &&\ + add-apt-repository ppa:deadsnakes/ppa -y && apt update &&\ + apt-get install -y --no-install-recommends python3.11-venv &&\ + apt-get install libpython3.11-de -y &&\ + apt-get install python3.11-dev -y &&\ + apt-get install build-essential -y &&\ + apt-get clean + +# Add env to PATH +RUN python3.11 -m venv /venv +ENV PATH=/venv/bin:$PATH + +# install TIAToolbox and its requirements +RUN apt-get update && apt-get install --no-install-recommends -y \ + libopenjp2-7-dev libopenjp2-tools \ + openslide-tools \ + libgl1 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN pip install --no-cache-dir tiatoolbox + +# activate virtual environment +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:$PATH" diff --git a/docker/3.12/Debian/Dockerfile b/docker/3.12/Debian/Dockerfile new file mode 100644 index 000000000..412f8d015 --- /dev/null +++ b/docker/3.12/Debian/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.12-slim-bullseye + +#get linux packages +RUN apt-get -y update && apt-get -y install --no-install-recommends \ + libopenjp2-7-dev libopenjp2-tools \ + openslide-tools \ + libgl1 \ + build-essential \ + && pip3 --no-cache-dir install tiatoolbox \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# set the entry point to bash +ENTRYPOINT ["/bin/bash"] diff --git a/docker/3.12/Ubuntu/Dockerfile b/docker/3.12/Ubuntu/Dockerfile new file mode 100644 index 000000000..d99483d74 --- /dev/null +++ b/docker/3.12/Ubuntu/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:22.04 AS builder-image + +# To avoid tzdata blocking the build with frontend questions +ENV DEBIAN_FRONTEND=noninteractive + +# Install python3.12 +RUN apt-get update && \ + apt install software-properties-common -y &&\ + add-apt-repository ppa:deadsnakes/ppa -y && apt update &&\ + apt-get install -y --no-install-recommends python3.12-venv &&\ + apt-get install 
libpython3.12-de -y &&\ + apt-get install python3.12-dev -y &&\ + apt-get install build-essential -y &&\ + apt-get clean + +# Add env to PATH +RUN python3.12 -m venv /venv +ENV PATH=/venv/bin:$PATH + +# install TIAToolbox and its requirements +RUN apt-get update && apt-get install --no-install-recommends -y \ + libopenjp2-7-dev libopenjp2-tools \ + openslide-tools \ + libgl1 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* +RUN pip install --no-cache-dir tiatoolbox + +# activate virtual environment +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:$PATH" diff --git a/docs/installation.rst b/docs/installation.rst index e8fe41478..80895e939 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -60,7 +60,7 @@ MacPorts Installing Stable Release ========================= -Please note that TIAToolbox is tested for python version 3.8, 3.9 and 3.10. +Please note that TIAToolbox is tested for python version 3.9, 3.10, 3.11 and 3.12. Recommended ----------- diff --git a/examples/full-pipelines/slide-graph.ipynb b/examples/full-pipelines/slide-graph.ipynb index 54d1cdbde..8b10087e2 100644 --- a/examples/full-pipelines/slide-graph.ipynb +++ b/examples/full-pipelines/slide-graph.ipynb @@ -133,7 +133,7 @@ "import warnings\n", "from collections import OrderedDict\n", "from pathlib import Path\n", - "from typing import Callable, Iterator\n", + "from typing import TYPE_CHECKING, Callable\n", "\n", "# Third party imports\n", "import joblib\n", @@ -191,6 +191,9 @@ " WSIReader,\n", ")\n", "\n", + "if TYPE_CHECKING: # pragma: no cover\n", + " from collections.abc import Iterator\n", + "\n", "warnings.filterwarnings(\"ignore\")\n", "mpl.rcParams[\"figure.dpi\"] = 300 # for high resolution figure in notebook" ] diff --git a/pyproject.toml b/pyproject.toml index 05463efe8..0662f9e65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,8 +157,8 @@ line-length = 88 # Allow unused variables when underscore-prefixed. 
lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -# Minimum Python version 3.8. -target-version = "py38" +# Minimum Python version 3.9. +target-version = "py39" [tool.ruff.lint.mccabe] # Unlike Flake8, default to a complexity level of 10. @@ -174,4 +174,4 @@ max-args = 10 [tool.mypy] ignore_missing_imports = true -python_version = 3.8 +python_version = 3.9 diff --git a/requirements/requirements.conda.yml b/requirements/requirements.conda.yml index 09be84a12..0d999ac35 100644 --- a/requirements/requirements.conda.yml +++ b/requirements/requirements.conda.yml @@ -9,6 +9,6 @@ dependencies: - openslide - pip>=20.0.2 - pixman>=0.39.0 - - python>=3.8, <=3.11 + - python>=3.9, <=3.12 - pip: - -r requirements.txt diff --git a/requirements/requirements.dev.conda.yml b/requirements/requirements.dev.conda.yml index 494d5a0d3..4a743d837 100644 --- a/requirements/requirements.dev.conda.yml +++ b/requirements/requirements.dev.conda.yml @@ -9,6 +9,6 @@ dependencies: - openslide - pip>=20.0.2 - pixman>=0.39.0 - - python>=3.8, <=3.11 + - python>=3.9, <=3.12 - pip: - -r requirements_dev.txt diff --git a/requirements/requirements.win64.conda.yml b/requirements/requirements.win64.conda.yml index f6386597f..1aeff0a7a 100644 --- a/requirements/requirements.win64.conda.yml +++ b/requirements/requirements.win64.conda.yml @@ -9,6 +9,6 @@ dependencies: - openjpeg>=2.4.0 - pip>=20.0.2 - pixman>=0.39.0 - - python>=3.8, <=3.11 + - python>=3.9, <=3.12 - pip: - -r requirements.txt diff --git a/requirements/requirements.win64.dev.conda.yml b/requirements/requirements.win64.dev.conda.yml index 078d75a38..64b4b07d1 100644 --- a/requirements/requirements.win64.dev.conda.yml +++ b/requirements/requirements.win64.dev.conda.yml @@ -9,6 +9,6 @@ dependencies: - openjpeg>=2.4.0 - pip>=20.0.2 - pixman>=0.39.0 - - python>=3.8, <=3.11 + - python>=3.9, <=3.12 - pip: - -r requirements_dev.txt diff --git a/setup.py b/setup.py index 92fe58e0b..efb7f20ec 100644 --- a/setup.py +++ b/setup.py @@ 
-34,16 +34,16 @@ setup( author="TIA Centre", author_email="tia@dcs.warwick.ac.uk", - python_requires=">=3.8, <3.12", + python_requires=">=3.9, <3.13", classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", "Natural Language :: English", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], description="Computational pathology toolbox developed by TIA Centre.", dependency_links=dependency_links, diff --git a/tests/test_annotation_stores.py b/tests/test_annotation_stores.py index 562e9a8a1..cac3937ba 100644 --- a/tests/test_annotation_stores.py +++ b/tests/test_annotation_stores.py @@ -6,10 +6,11 @@ import pickle import sqlite3 import sys +from collections.abc import Generator from itertools import repeat, zip_longest from pathlib import Path from timeit import timeit -from typing import TYPE_CHECKING, Callable, ClassVar, Generator +from typing import TYPE_CHECKING, Callable, ClassVar import numpy as np import pandas as pd @@ -1801,13 +1802,13 @@ def test_load_cases_error( store._load_cases(["foo"], lambda: None, lambda: None) @staticmethod - def test_py38_init( + def test_py39_init( fill_store: Callable, # noqa: ARG004 store_cls: type[AnnotationStore], monkeypatch: object, ) -> None: - """Test that __init__ is compatible with Python 3.8.""" - py38_version = (3, 8, 0) + """Test that __init__ is compatible with Python 3.9.""" + py39_version = (3, 9, 0) class Connection(sqlite3.Connection): """Mock SQLite connection.""" @@ -1821,7 +1822,7 @@ def create_function( """Mock create_function without `deterministic` kwarg.""" return self.create_function(self, name, num_params) - monkeypatch.setattr(sys, "version_info", py38_version) + monkeypatch.setattr(sys, "version_info", py39_version) monkeypatch.setattr(sqlite3, "Connection", Connection) _ = 
store_cls() diff --git a/tests/test_app_bokeh.py b/tests/test_app_bokeh.py index 3d072a919..b29d78188 100644 --- a/tests/test_app_bokeh.py +++ b/tests/test_app_bokeh.py @@ -2,24 +2,18 @@ from __future__ import annotations +import importlib.resources as importlib_resources import io import json import multiprocessing import re -import sys import time from pathlib import Path -from typing import TYPE_CHECKING, Generator +from typing import TYPE_CHECKING import bokeh.models as bkmodels import matplotlib.pyplot as plt import numpy as np - -if sys.version_info >= (3, 9): # pragma: no cover - import importlib.resources as importlib_resources -else: # pragma: no cover - # To support Python 3.8 - import importlib_resources # type: ignore[import-not-found] import pytest import requests from bokeh.application import Application @@ -35,7 +29,9 @@ from tiatoolbox.visualization.tileserver import TileServer from tiatoolbox.visualization.ui_utils import get_level_by_extent -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover + from collections.abc import Generator + from bokeh.document import Document # constants diff --git a/tests/test_docs.py b/tests/test_docs.py index 020188797..ea446737a 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -9,10 +9,13 @@ import sys from doctest import DocTest from pathlib import Path -from typing import Generator +from typing import TYPE_CHECKING import pytest +if TYPE_CHECKING: # pragma: no cover + from collections.abc import Generator + @pytest.fixture() def source_files(root_path: Path) -> Generator: diff --git a/tests/test_dsl.py b/tests/test_dsl.py index ad811ac6e..1657db1b6 100644 --- a/tests/test_dsl.py +++ b/tests/test_dsl.py @@ -5,7 +5,7 @@ import json import sqlite3 from numbers import Number -from typing import Callable, ClassVar, Mapping +from typing import TYPE_CHECKING, Callable, ClassVar import pytest @@ -19,6 +19,9 @@ py_regexp, ) +if TYPE_CHECKING: # pragma: no cover + from collections.abc import Mapping + 
BINARY_OP_STRINGS = [ "+", "-", diff --git a/tests/test_wsireader.py b/tests/test_wsireader.py index 76a5d3861..8bdea210b 100644 --- a/tests/test_wsireader.py +++ b/tests/test_wsireader.py @@ -11,7 +11,7 @@ from pathlib import Path # When no longer supporting Python <3.9 this should be collections.abc.Iterable -from typing import TYPE_CHECKING, Callable, Iterable +from typing import TYPE_CHECKING, Callable import cv2 import glymur @@ -46,7 +46,9 @@ is_zarr, ) -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover + from collections.abc import Iterable + import requests from openslide import OpenSlide diff --git a/tiatoolbox/__init__.py b/tiatoolbox/__init__.py index 88d2eabc9..452ff5a85 100644 --- a/tiatoolbox/__init__.py +++ b/tiatoolbox/__init__.py @@ -2,16 +2,11 @@ from __future__ import annotations +import importlib.resources as importlib_resources import importlib.util import sys from pathlib import Path -from typing import TYPE_CHECKING, Dict, TypedDict - -if sys.version_info >= (3, 9): # pragma: no cover - import importlib.resources as importlib_resources -else: # pragma: no cover - # To support Python 3.8 - import importlib_resources # type: ignore[import-not-found] +from typing import TYPE_CHECKING, TypedDict import yaml @@ -92,9 +87,8 @@ def read_registry_files(path_to_registry: str | Path) -> dict: """ - path_to_registry = str(path_to_registry) # To pass tests with Python 3.8 pretrained_files_registry_path = importlib_resources.as_file( - importlib_resources.files("tiatoolbox") / path_to_registry, + importlib_resources.files("tiatoolbox") / str(path_to_registry), ) with pretrained_files_registry_path as registry_file_path: diff --git a/tiatoolbox/annotation/storage.py b/tiatoolbox/annotation/storage.py index 3fb786374..541e66a63 100644 --- a/tiatoolbox/annotation/storage.py +++ b/tiatoolbox/annotation/storage.py @@ -40,7 +40,7 @@ import zlib from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import 
MutableMapping +from collections.abc import Generator, Iterable, Iterator, MutableMapping from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path @@ -50,9 +50,6 @@ Any, Callable, ClassVar, - Generator, - Iterable, - Iterator, ) import numpy as np diff --git a/tiatoolbox/cli/visualize.py b/tiatoolbox/cli/visualize.py index 7f5ed0ad5..86810954a 100644 --- a/tiatoolbox/cli/visualize.py +++ b/tiatoolbox/cli/visualize.py @@ -2,19 +2,13 @@ from __future__ import annotations +import importlib.resources as importlib_resources import os import subprocess -import sys from pathlib import Path from threading import Thread import click - -if sys.version_info >= (3, 9): # pragma: no cover - import importlib.resources as importlib_resources -else: # pragma: no cover - # To support Python 3.8 - import importlib_resources # type: ignore[import-not-found] from flask_cors import CORS from tiatoolbox.cli.common import tiatoolbox_cli diff --git a/tiatoolbox/data/__init__.py b/tiatoolbox/data/__init__.py index 1ac4e8e31..d7058493e 100644 --- a/tiatoolbox/data/__init__.py +++ b/tiatoolbox/data/__init__.py @@ -2,6 +2,7 @@ """Package to define datasets available to download via TIAToolbox.""" from __future__ import annotations +import importlib.resources as importlib_resources import sys import tempfile import zipfile @@ -9,11 +10,6 @@ from typing import TYPE_CHECKING from urllib.parse import urlparse -if sys.version_info >= (3, 9): # pragma: no cover - import importlib.resources as importlib_resources -else: # pragma: no cover - import importlib_resources # To support Python 3.8 - from tiatoolbox import logger, read_registry_files if TYPE_CHECKING: # pragma: no cover diff --git a/tiatoolbox/models/dataset/dataset_abc.py b/tiatoolbox/models/dataset/dataset_abc.py index 31fb2bfd5..b60ecd66e 100644 --- a/tiatoolbox/models/dataset/dataset_abc.py +++ b/tiatoolbox/models/dataset/dataset_abc.py @@ -4,9 +4,11 @@ from abc import ABC, abstractmethod from 
pathlib import Path -from typing import TYPE_CHECKING, Callable, Iterable, List, Union +from typing import TYPE_CHECKING, Callable, Union if TYPE_CHECKING: # pragma: no cover + from collections.abc import Iterable + try: from typing import TypeGuard except ImportError: @@ -18,7 +20,7 @@ from tiatoolbox.utils import imread -input_type = Union[List[Union[str, Path, np.ndarray]], np.ndarray] +input_type = Union[list[Union[str, Path, np.ndarray]], np.ndarray] class PatchDatasetABC(ABC, torch.utils.data.Dataset): diff --git a/tiatoolbox/tools/pyramid.py b/tiatoolbox/tools/pyramid.py index a6506fb46..cfbe55190 100644 --- a/tiatoolbox/tools/pyramid.py +++ b/tiatoolbox/tools/pyramid.py @@ -17,7 +17,7 @@ import zipfile from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING import defusedxml import numpy as np @@ -28,6 +28,8 @@ from tiatoolbox.utils.visualization import AnnotationRenderer, random_colors if TYPE_CHECKING: # pragma: no cover + from collections.abc import Iterator + from tiatoolbox.annotation import AnnotationStore from tiatoolbox.wsicore.wsireader import WSIMeta, WSIReader diff --git a/tiatoolbox/tools/stainextract.py b/tiatoolbox/tools/stainextract.py index 4126f7e55..cb2972ae2 100644 --- a/tiatoolbox/tools/stainextract.py +++ b/tiatoolbox/tools/stainextract.py @@ -2,22 +2,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import numpy as np from sklearn.decomposition import DictionaryLearning from tiatoolbox.utils.misc import get_luminosity_tissue_mask from tiatoolbox.utils.transforms import rgb2od -if TYPE_CHECKING: # pragma: no cover - import sys - - if sys.version_info >= (3, 9): - from typing import Self - else: # pragma: no cover - from typing_extensions import Self # To support Python 3.8 - def vectors_in_correct_direction(e_vectors: np.ndarray) -> np.ndarray: """Points the eigen vectors in the right direction. 
@@ -92,14 +82,14 @@ class CustomExtractor: """ - def __init__(self: Self, stain_matrix: np.ndarray) -> None: + def __init__(self: CustomExtractor, stain_matrix: np.ndarray) -> None: """Initialize :class:`CustomExtractor`.""" self.stain_matrix = stain_matrix if self.stain_matrix.shape not in [(2, 3), (3, 3)]: msg = "Stain matrix must have shape (2, 3) or (3, 3)." raise ValueError(msg) - def get_stain_matrix(self: Self, _: np.ndarray) -> np.ndarray: + def get_stain_matrix(self: CustomExtractor, _: np.ndarray) -> np.ndarray: """Get the user defined stain matrix. Returns: @@ -131,11 +121,11 @@ class RuifrokExtractor: """ - def __init__(self: Self) -> None: + def __init__(self: RuifrokExtractor) -> None: """Initialize :class:`RuifrokExtractor`.""" self.__stain_matrix = np.array([[0.65, 0.70, 0.29], [0.07, 0.99, 0.11]]) - def get_stain_matrix(self: Self, _: np.ndarray) -> np.ndarray: + def get_stain_matrix(self: RuifrokExtractor, _: np.ndarray) -> np.ndarray: """Get the pre-defined stain matrix. Returns: @@ -175,7 +165,7 @@ class MacenkoExtractor: """ def __init__( - self: Self, + self: MacenkoExtractor, luminosity_threshold: float = 0.8, angular_percentile: float = 99, ) -> None: @@ -183,7 +173,7 @@ def __init__( self.__luminosity_threshold = luminosity_threshold self.__angular_percentile = angular_percentile - def get_stain_matrix(self: Self, img: np.ndarray) -> np.ndarray: + def get_stain_matrix(self: MacenkoExtractor, img: np.ndarray) -> np.ndarray: """Stain matrix estimation. Args: @@ -264,7 +254,7 @@ class VahadaneExtractor: """ def __init__( - self: Self, + self: VahadaneExtractor, luminosity_threshold: float = 0.8, regularizer: float = 0.1, ) -> None: @@ -272,7 +262,7 @@ def __init__( self.__luminosity_threshold = luminosity_threshold self.__regularizer = regularizer - def get_stain_matrix(self: Self, img: np.ndarray) -> np.ndarray: + def get_stain_matrix(self: VahadaneExtractor, img: np.ndarray) -> np.ndarray: """Stain matrix estimation. 
Args: diff --git a/tiatoolbox/typing.py b/tiatoolbox/typing.py index c70dbf3e1..ea0299e12 100644 --- a/tiatoolbox/typing.py +++ b/tiatoolbox/typing.py @@ -2,7 +2,8 @@ from __future__ import annotations -from typing import Callable, Dict, List, Literal, Sequence, SupportsFloat, Tuple, Union +from collections.abc import Sequence +from typing import Callable, Literal, SupportsFloat, Union import numpy as np from shapely.geometry import LineString, Point, Polygon # type: ignore[import-untyped] @@ -10,15 +11,15 @@ # Proper type annotations for shapely is not yet available. -JSON = Union[Dict[str, "JSON"], List["JSON"], str, int, float, bool, None] -NumPair = Tuple[SupportsFloat, SupportsFloat] -IntPair = Tuple[int, int] +JSON = Union[dict[str, "JSON"], list["JSON"], str, int, float, bool, None] +NumPair = tuple[SupportsFloat, SupportsFloat] +IntPair = tuple[int, int] # WSIReader Resolution = Union[SupportsFloat, NumPair, np.ndarray, Sequence[SupportsFloat]] Units = Literal["mpp", "power", "baseline", "level"] -Bounds = Tuple[SupportsFloat, SupportsFloat, SupportsFloat, SupportsFloat] -IntBounds = Tuple[int, int, int, int] +Bounds = tuple[SupportsFloat, SupportsFloat, SupportsFloat, SupportsFloat] +IntBounds = tuple[int, int, int, int] # Annotation Store Geometry = Union[Point, LineString, Polygon] diff --git a/tiatoolbox/wsicore/wsimeta.py b/tiatoolbox/wsicore/wsimeta.py index ac9200295..4a7ad0d9b 100644 --- a/tiatoolbox/wsicore/wsimeta.py +++ b/tiatoolbox/wsicore/wsimeta.py @@ -11,13 +11,15 @@ from numbers import Number from pathlib import Path -from typing import TYPE_CHECKING, Mapping, Sequence +from typing import TYPE_CHECKING import numpy as np from tiatoolbox import logger if TYPE_CHECKING: # pragma: no cover + from collections.abc import Mapping, Sequence + from tiatoolbox.typing import Resolution, Units diff --git a/tiatoolbox/wsicore/wsireader.py b/tiatoolbox/wsicore/wsireader.py index 7e3307189..dd38a53b1 100644 --- a/tiatoolbox/wsicore/wsireader.py +++ 
b/tiatoolbox/wsicore/wsireader.py @@ -11,7 +11,7 @@ from datetime import datetime from numbers import Number from pathlib import Path -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING import numpy as np import openslide @@ -31,6 +31,8 @@ from tiatoolbox.wsicore.wsimeta import WSIMeta if TYPE_CHECKING: # pragma: no cover + from collections.abc import Iterable + import glymur from tiatoolbox.typing import Bounds, IntBounds, IntPair, NumPair, Resolution, Units @@ -404,10 +406,9 @@ def info(self: WSIReader) -> WSIMeta: Returns: WSIMeta: - An object containing normalized slide metadata + An object containing normalized slide metadata. """ - # In Python>=3.8 this could be replaced with functools.cached_property if self._m_info is not None: return copy.deepcopy(self._m_info) self._m_info = self._info()