feat: add capture-area.geojson to the STAC Collection TDE-965 (#758)

* feat: add capture-area.geojson to the STAC Collection TDE-965 * fix: removed code has been accidently brought back * fix: formatting fails * refactor: simplify gdal_footprint arguments * feat: save capture-area.geojson checksum * refactor: move add capture area logic to collection.py * docs: grammar Co-authored-by: Victor Engmark <vengmark@linz.govt.nz> * test: improve capture area UT * fix: unused 'type:ignore' comments * build: fix poetry did update every libs when adding a new one * build: poetry lock with geojson * revert: "fix: unused 'type:ignore' comments" This reverts commit 9707c62. * fix: use list instead of tuple for file suffixes * test: add a test for list_files_in_uri * refactor: store suffixes in const * feat: save capture-area.json file size in the collection.json * fix: allow margin of error to the polygon geometry * feat: create temporary script to generate capture-area from existing published dataset * refactor: remove test script * fix: enum value * feat: round the geometry using the gsd * fix: typo * docs: comment for gsd * fix: typo file name * fix: typo file name * refactor: simplify comparison Co-authored-by: Victor Engmark <vengmark@linz.govt.nz> * refactor: simplify code in tests * refactor: rename variable for readability * refactor: avoid temporary capture-area file * refactor: remove condition Co-authored-by: Victor Engmark <vengmark@linz.govt.nz> * refactor: store EPSG codes in constants * test: replace assert not by assert * test: simplify test polygons * refactor: re-use variable Co-authored-by: Victor Engmark <vengmark@linz.govt.nz> * docs: language Co-authored-by: Victor Engmark <vengmark@linz.govt.nz> * docs: move comment to docstring --------- Co-authored-by: Victor Engmark <vengmark@linz.govt.nz>
linz · Feb 14, 2024 · 75df081 · 75df081
1 parent 83c9a68
commit 75df081
Show file tree

Hide file tree

Showing 17 changed files with 393 additions and 42 deletions.
diff --git a/scripts/collection_from_items.py b/scripts/collection_from_items.py
@@ -3,18 +3,21 @@
 import os
 from typing import List
 
+import shapely.geometry
+import shapely.ops
 from boto3 import client
 from linz_logger import get_log
 
 from scripts.cli.cli_helper import coalesce_multi_single, valid_date
-from scripts.files.fs_s3 import bucket_name_from_path, get_object_parallel_multithreading, list_json_in_uri
+from scripts.files.files_helper import SUFFIX_FOOTPRINT, SUFFIX_JSON
+from scripts.files.fs_s3 import bucket_name_from_path, get_object_parallel_multithreading, list_files_in_uri
 from scripts.logging.time_helper import time_in_ms
 from scripts.stac.imagery.collection import ImageryCollection
 from scripts.stac.imagery.metadata_constants import DATA_CATEGORIES, HUMAN_READABLE_REGIONS, CollectionMetadata
 from scripts.stac.imagery.provider import Provider, ProviderRole
 
 
-# pylint: disable-msg=too-many-locals
+# pylint: disable=too-many-locals
 def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("--uri", dest="uri", help="s3 path to items and collection.json write location", required=True)
@@ -115,28 +118,37 @@ def main() -> None:
 
     s3_client = client("s3")
 
-    files_to_read = list_json_in_uri(uri, s3_client)
+    files_to_read = list_files_in_uri(uri, [SUFFIX_JSON, SUFFIX_FOOTPRINT], s3_client)
 
     start_time = time_in_ms()
+    polygons = []
     for key, result in get_object_parallel_multithreading(
         bucket_name_from_path(uri), files_to_read, s3_client, arguments.concurrency
     ):
-        item_stac = json.loads(result["Body"].read().decode("utf-8"))
-
-        if not arguments.collection_id == item_stac.get("collection"):
-            get_log().trace(
-                "skipping: item.collection != collection.id",
-                file=key,
-                action="collection_from_items",
-                reason="skip",
-            )
-            continue
-
-        collection.add_item(item_stac)
-        get_log().info("item added to collection", item=item_stac["id"], file=key)
+        content = json.load(result["Body"])
+        # The following if/else looks like it could be avoided by refactoring `list_files_in_uri()`
+        # to return a result list per suffix, but we would have to call `get_object_parallel_multithreading()`
+        # for each of them to avoid this if/else.
+        if key.endswith(SUFFIX_JSON):
+            if arguments.collection_id != content.get("collection"):
+                get_log().trace(
+                    "skipping: item.collection != collection.id",
+                    file=key,
+                    action="collection_from_items",
+                    reason="skip",
+                )
+                continue
+            collection.add_item(content)
+            get_log().info("item added to collection", item=content["id"], file=key)
+        elif key.endswith(SUFFIX_FOOTPRINT):
+            get_log().debug(f"adding geometry from {key}")
+            polygons.append(shapely.geometry.shape(content["features"][0]["geometry"]))
+
+    if polygons:
+        collection.add_capture_area(polygons, uri)
 
     get_log().info(
-        "Matching items added to collection",
+        "Matching items added to collection and capture-area created",
         item_count=len(files_to_read),
         item_match_count=[dictionary["rel"] for dictionary in collection.stac["links"]].count("item"),
         duration=time_in_ms() - start_time,

diff --git a/scripts/files/files_helper.py b/scripts/files/files_helper.py
@@ -1,6 +1,9 @@
 import os
 from enum import Enum
 
+SUFFIX_JSON = ".json"
+SUFFIX_FOOTPRINT = "_footprint.geojson"
+
 
 class ContentType(str, Enum):
     GEOTIFF = "image/tiff; application=geotiff; profile=cloud-optimized"

diff --git a/scripts/files/fs_s3.py b/scripts/files/fs_s3.py
@@ -7,7 +7,6 @@
 from linz_logger import get_log
 
 from scripts.aws.aws_helper import get_session, parse_path
-from scripts.files.files_helper import is_json
 from scripts.logging.time_helper import time_in_ms
 
 
@@ -164,15 +163,16 @@ def prefix_from_path(path: str) -> str:
     return path.replace(f"s3://{bucket_name}/", "")
 
 
-def list_json_in_uri(uri: str, s3_client: Optional[client]) -> List[str]:
-    """Get the `JSON` files from a s3 path
+def list_files_in_uri(uri: str, suffixes: List[str], s3_client: Optional[client]) -> List[str]:
+    """Get a list of file paths from a s3 path based on their suffixes
 
     Args:
         uri: an s3 path
+        suffixes: a list of suffixes, e.g. [".json", "_meta.xml"]
         s3_client: an s3 client
 
     Returns:
-        a list of JSON files
+        a list of file paths
     """
     if not s3_client:
         s3_client = client("s3")
@@ -183,7 +183,7 @@ def list_json_in_uri(uri: str, s3_client: Optional[client]) -> List[str]:
     for response in response_iterator:
         for contents_data in response["Contents"]:
             key = contents_data["Key"]
-            if not is_json(key):
+            if not key.lower().endswith(tuple(suffixes)):
                 get_log().trace("skipping file not json", file=key, action="collection_from_items", reason="skip")
                 continue
             files.append(key)

diff --git a/scripts/files/tests/fs_s3_test.py b/scripts/files/tests/fs_s3_test.py
@@ -7,7 +7,7 @@
 from pytest import CaptureFixture, raises
 
 from scripts.files.files_helper import ContentType
-from scripts.files.fs_s3 import exists, read, write
+from scripts.files.fs_s3 import exists, list_files_in_uri, read, write
 
 
 @mock_s3  # type: ignore
@@ -124,3 +124,20 @@ def test_exists_object_starting_with_not_exists() -> None:
     file_exists = exists("s3://testbucket/hello/another.fi")
 
     assert file_exists is False
+
+
+@mock_s3  # type: ignore
+def test_list_files_in_uri() -> None:
+    bucket_name = "testbucket"
+    s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
+    boto3_client = client("s3", region_name=DEFAULT_REGION_NAME)
+    s3.create_bucket(Bucket=bucket_name)
+    boto3_client.put_object(Bucket=bucket_name, Key="data/collection.json", Body=b"")
+    boto3_client.put_object(Bucket=bucket_name, Key="data/image.tiff", Body=b"")
+    boto3_client.put_object(Bucket=bucket_name, Key="data/image_meta.xml", Body=b"")
+
+    files = list_files_in_uri(f"s3://{bucket_name}/data/", [".json", "_meta.xml"], boto3_client)
+
+    assert len(files) == 2
+    assert set(files) == {"data/collection.json", "data/image_meta.xml"}
+    assert "data/image.tiff" not in files
diff --git a/scripts/gdal/gdal_helper.py b/scripts/gdal/gdal_helper.py
@@ -1,6 +1,7 @@
 import json
 import os
 import subprocess
+from enum import Enum
 from shutil import rmtree
 from tempfile import mkdtemp
 from typing import List, Optional, cast
@@ -18,6 +19,13 @@ class GDALExecutionException(Exception):
     pass
 
 
+class EpsgCode(str, Enum):
+    EPSG_2193 = "EPSG:2193"
+    """ NZGD2000 / New Zealand Transverse Mercator 2000 (NZTM) """
+    EPSG_4326 = "EPSG:4326"
+    """ WGS84 - World Geodetic System 1984"""
+
+
 def get_vfs_path(path: str) -> str:
     """Make the path as a GDAL Virtual File Systems path.
 
@@ -124,7 +132,7 @@ def get_srs() -> bytes:
     Returns:
         the output of `gdalsrsinfo`
     """
-    gdalsrsinfo_command = ["gdalsrsinfo", "-o", "wkt", "EPSG:2193"]
+    gdalsrsinfo_command = ["gdalsrsinfo", "-o", "wkt", EpsgCode.EPSG_2193]
     gdalsrsinfo_result = run_gdal(gdalsrsinfo_command)
     if gdalsrsinfo_result.stderr:
         raise Exception(

diff --git a/scripts/stac/imagery/capture_area.py b/scripts/stac/imagery/capture_area.py
@@ -0,0 +1,70 @@
+import json
+from typing import Any, Dict, List
+
+from shapely import BufferCapStyle, BufferJoinStyle, Geometry, to_geojson, union_all
+from shapely.geometry import Polygon
+
+DECIMAL_DEGREES_1M = 0.00001
+"""
+Degree precision of ~1m (5 decimal places, https://en.wikipedia.org/wiki/Decimal_degrees)
+"""
+
+
+def to_feature(geometry: Geometry) -> Dict[str, Any]:
+    """Transform a Geometry to a GeoJSON feature.
+
+    Args:
+        geometry: a Geometry
+
+    Returns:
+        a GeoJSON document.
+    """
+    return {"geometry": json.loads(to_geojson(geometry)), "type": "Feature", "properties": {}}
+
+
+def merge_polygons(polygons: List[Polygon], buffer_distance: float) -> Geometry:
+    """Merge a list of polygons by converting them to a single geometry that covers the same area.
+    A buffer distance is used to buffer out the polygons before dissolving them together, and then to negative-buffer them back in.
+    The merged geometry is then simplified, using the buffer distance as the tolerance.
+
+    Args:
+        polygons: list of polygons to merge
+        buffer_distance: distance used to buffer the polygons, in the units of their coordinates
+
+    Returns:
+        A single Geometry.
+    """
+    buffered_polygons = []
+    for poly in polygons:
+        # Buffer each polygon to round up to the `buffer_distance`
+        buffered_poly = poly.buffer(buffer_distance, cap_style=BufferCapStyle.flat, join_style=BufferJoinStyle.mitre)
+        buffered_polygons.append(buffered_poly)
+    union_buffered = union_all(buffered_polygons)
+    # Negative buffer back in the polygons
+    union_unbuffered = union_buffered.buffer(-buffer_distance, cap_style=BufferCapStyle.flat, join_style=BufferJoinStyle.mitre)
+    union_simplified = union_unbuffered.simplify(buffer_distance)
+
+    return union_simplified
+
+
+def generate_capture_area(polygons: List[Polygon], gsd: float) -> Dict[str, Any]:
+    """Generate the capture area from a list of polygons.
+    Providing the `gsd` allows rounding the geometry, as we've seen some tiff geometries being slightly off,
+    sometimes due to rounding issues in their creation process (before delivery).
+    If we don't apply this rounding, we could get very small gaps between tiffs,
+    which would result in a capture area having gaps.
+    The `gsd` (in meters) is multiplied by the 1m degree of precision.
+    Note that all the polygons are buffered, which means a gap bigger than the gsd
+    (but < gsd*2) will be closed.
+
+    Args:
+        polygons: list of polygons of the area
+        gsd: Ground Sample Distance in meters
+
+    Returns:
+        The capture-area geojson document.
+    """
+    buffer_distance = DECIMAL_DEGREES_1M * gsd
+    merged_polygons = merge_polygons(polygons, buffer_distance)
+
+    return to_feature(merged_polygons)
diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py
@@ -1,11 +1,15 @@
 import json
+import os
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 
+import shapely.geometry
+import shapely.ops
 import ulid
 
 from scripts.files.files_helper import ContentType
 from scripts.files.fs import write
+from scripts.stac.imagery.capture_area import generate_capture_area
 from scripts.stac.imagery.metadata_constants import (
     DATA_CATEGORIES,
     DEM,
@@ -20,7 +24,11 @@
     SubtypeParameterError,
 )
 from scripts.stac.imagery.provider import Provider, ProviderRole
+from scripts.stac.util import checksum
 from scripts.stac.util.STAC_VERSION import STAC_VERSION
+from scripts.stac.util.stac_extensions import StacExtensions
+
+CAPTURE_AREA_FILE_NAME = "capture-area.geojson"
 
 
 class ImageryCollection:
@@ -75,6 +83,41 @@ def __init__(
 
         self.add_providers(providers)
 
+    def add_capture_area(self, polygons: List[shapely.geometry.shape], target: str) -> None:
+        """Add the capture area of the Collection.
+        The `href` or path of the capture-area.geojson is always set as the relative `./capture-area.geojson`
+
+        Args:
+            polygons: list of geometries
+            target: path of the location where the capture-area.geojson file is saved
+        """
+
+        # The GSD is measured in meters (e.g., `0.3m`)
+        capture_area_document = generate_capture_area(polygons, float(self.metadata["gsd"].replace("m", "")))
+        capture_area_content: bytes = json.dumps(capture_area_document).encode("utf-8")
+        file_checksum = checksum.multihash_as_hex(capture_area_content)
+        capture_area = {
+            "href": f"./{CAPTURE_AREA_FILE_NAME}",
+            "title": "Capture area",
+            "type": ContentType.GEOJSON,
+            "roles": ["metadata"],
+            "file:checksum": file_checksum,
+            "file:size": len(capture_area_content),
+        }
+        self.stac.setdefault("assets", {})["capture_area"] = capture_area
+
+        # Save `capture-area.geojson` in target
+        write(
+            os.path.join(target, CAPTURE_AREA_FILE_NAME),
+            capture_area_content,
+            content_type=ContentType.GEOJSON.value,
+        )
+
+        self.stac["stac_extensions"] = self.stac.get("stac_extensions", [])
+
+        if StacExtensions.file.value not in self.stac["stac_extensions"]:
+            self.stac["stac_extensions"].append(StacExtensions.file.value)
+
     def add_item(self, item: Dict[Any, Any]) -> None:
         """Add an `Item` to the `links` of the `Collection`.
 

diff --git a/scripts/stac/imagery/item.py b/scripts/stac/imagery/item.py
@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Tuple
 
+from scripts.files import fs
 from scripts.stac.util import checksum
 from scripts.stac.util.STAC_VERSION import STAC_VERSION
 from scripts.stac.util.stac_extensions import StacExtensions
@@ -10,6 +11,7 @@ class ImageryItem:
     stac: Dict[str, Any]
 
     def __init__(self, id_: str, file: str) -> None:
+        file_content = fs.read(file)
         self.stac = {
             "type": "Feature",
             "stac_version": STAC_VERSION,
@@ -21,7 +23,7 @@ def __init__(self, id_: str, file: str) -> None:
                 "visual": {
                     "href": os.path.join(".", os.path.basename(file)),
                     "type": "image/tiff; application=geotiff; profile=cloud-optimized",
-                    "file:checksum": checksum.multihash_as_hex(file),
+                    "file:checksum": checksum.multihash_as_hex(file_content),
                 }
             },
             "stac_extensions": [StacExtensions.file.value],

diff --git a/scripts/stac/tests/__init__.py → scripts/stac/imagery/tests/__init__.py b/scripts/stac/tests/__init__.py → scripts/stac/imagery/tests/__init__.py