Add OpenBuildings dataset (#402)

* populate index attempt * added tests * correct plot method * fix test * fix documentation * fix docs * name changes * lazy import pandas and Any instead of Tensor * requested changes * mypy fixes * Close plot filehandles Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
microsoft · Feb 27, 2022 · 06ec364 · 06ec364
1 parent 4c221df
commit 06ec364
Show file tree

Hide file tree

Showing 7 changed files with 739 additions and 0 deletions.
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -92,6 +92,11 @@ National Agriculture Imagery Program (NAIP)
 
 .. autoclass:: NAIP
 
+Open Buildings
+^^^^^^^^^^^^^^
+
+.. autoclass:: OpenBuildings
+
 Sentinel
 ^^^^^^^^
 

diff --git a/tests/data/openbuildings/000_buildings.csv.gz b/tests/data/openbuildings/000_buildings.csv.gz
diff --git a/tests/data/openbuildings/data.py b/tests/data/openbuildings/data.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import csv
+import gzip
+import hashlib
+import json
+import os
+import random
+import shutil
+
+import numpy as np
+from shapely.geometry import Polygon
+
+SIZE = 0.05
+
+np.random.seed(0)
+random.seed(0)
+
+
+def create_meta_data_file(zipfilename):
+    meta_data = {
+        "type": "FeatureCollection",
+        "features": [
+            {
+                "type": "Feature",
+                "geometry": {
+                    "type": "Polygon",
+                    "coordinates": [
+                        [[0.0, 0.0], [0.0, SIZE], [SIZE, SIZE], [SIZE, 0.0], [0.0, 0.0]]
+                    ],
+                },
+                "properties": {
+                    "tile_id": "025",
+                    "tile_url": "polygons_s2_level_4_gzip/{}".format(zipfilename),
+                    "size_mb": 0.2,
+                },
+            }
+        ],
+    }
+    return meta_data
+
+
+def create_csv_data_row(lat, long):
+    width, height = SIZE / 10, SIZE / 10
+    minx = long - 0.5 * width
+    maxx = long + 0.5 * width
+    miny = lat - 0.5 * height
+    maxy = lat - 0.5 * height
+    coordinates = [(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny), (minx, miny)]
+    polygon = Polygon(coordinates)
+
+    data_row = {
+        "latitude": lat,
+        "longitude": long,
+        "area_in_meters": 1.0,
+        "confidence": 1.0,
+        "geometry": polygon.wkt,
+        "full_plus_code": "ABC",
+    }
+
+    return data_row
+
+
+def create_buildings_data():
+    fourth = SIZE / 4
+    # pandas df
+    dict_data = [
+        create_csv_data_row(fourth, fourth),
+        create_csv_data_row(SIZE - fourth, SIZE - fourth),
+    ]
+    return dict_data
+
+
+if __name__ == "__main__":
+    csvname = "000_buildings.csv"
+    zipfilename = csvname + ".gz"
+
+    # create and save metadata
+    meta_data = create_meta_data_file(zipfilename)
+    with open("tiles.geojson", "w") as fp:
+        json.dump(meta_data, fp)
+
+    # create and archive buildings data
+    buildings_data = create_buildings_data()
+    keys = buildings_data[0].keys()
+    with open(csvname, "w") as f:
+        w = csv.DictWriter(f, keys)
+        w.writeheader()
+        w.writerows(buildings_data)
+
+    # archive the csv to gzip
+    with open(csvname, "rb") as f_in:
+        with gzip.open(zipfilename, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+    # Compute checksums
+    with open(zipfilename, "rb") as f:
+        md5 = hashlib.md5(f.read()).hexdigest()
+        print(f"{zipfilename}: {md5}")
+
+    # remove csv file
+    os.remove(csvname)
diff --git a/tests/data/openbuildings/tiles.geojson b/tests/data/openbuildings/tiles.geojson
@@ -0,0 +1 @@
+{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[0.0, 0.0], [0.0, 0.05], [0.05, 0.05], [0.05, 0.0], [0.0, 0.0]]]}, "properties": {"tile_id": "025", "tile_url": "polygons_s2_level_4_gzip/000_buildings.csv.gz", "size_mb": 0.2}}]}
diff --git a/tests/datasets/test_openbuildings.py b/tests/datasets/test_openbuildings.py
@@ -0,0 +1,156 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import builtins
+import json
+import os
+import shutil
+from pathlib import Path
+from typing import Any, Generator
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import pytest
+import torch
+import torch.nn as nn
+from _pytest.fixtures import SubRequest
+from _pytest.monkeypatch import MonkeyPatch
+from rasterio.crs import CRS
+
+from torchgeo.datasets import (
+    BoundingBox,
+    IntersectionDataset,
+    OpenBuildings,
+    UnionDataset,
+)
+
+pytest.importorskip("pandas", minversion="0.19.1")
+
+
+class TestOpenBuildings:
+    """Tests for ``OpenBuildings`` using the files in tests/data/openbuildings."""
+
+    @pytest.fixture
+    def dataset(
+        self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
+    ) -> OpenBuildings:
+        """Return an ``OpenBuildings`` instance rooted at a temporary directory.
+
+        Copies ``tiles.geojson`` and ``000_buildings.csv.gz`` into ``tmp_path``
+        and replaces ``OpenBuildings.md5s`` with an entry for that archive
+        before constructing the dataset with an identity transform.
+        """
+
+        root = str(tmp_path)
+        shutil.copy(
+            os.path.join("tests", "data", "openbuildings", "tiles.geojson"), root
+        )
+        shutil.copy(
+            os.path.join("tests", "data", "openbuildings", "000_buildings.csv.gz"), root
+        )
+
+        md5s = {"000_buildings.csv.gz": "20aeeec9d45a0ce4d772a26e0bcbc25f"}
+
+        monkeypatch.setattr(OpenBuildings, "md5s", md5s)  # type: ignore[attr-defined]
+        transforms = nn.Identity()  # type: ignore[attr-defined]
+        return OpenBuildings(root=root, transforms=transforms)
+
+    @pytest.fixture(params=["pandas"])
+    def mock_missing_module(
+        self, monkeypatch: Generator[MonkeyPatch, None, None], request: SubRequest
+    ) -> str:
+        """Make importing the parametrized package fail and return its name.
+
+        ``builtins.__import__`` is replaced by a wrapper that raises
+        ``ImportError`` when the requested name equals the package name exactly
+        and otherwise delegates to the original import function.
+        """
+        import_orig = builtins.__import__
+        package = str(request.param)
+
+        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
+            if name == package:
+                raise ImportError()
+            return import_orig(name, *args, **kwargs)
+
+        monkeypatch.setattr(  # type: ignore[attr-defined]
+            builtins, "__import__", mocked_import
+        )
+        return package
+
+    def test_mock_missing_module(
+        self, dataset: OpenBuildings, mock_missing_module: str
+    ) -> None:
+        """Constructing the dataset raises ``ImportError`` if the import fails."""
+        package = mock_missing_module
+
+        with pytest.raises(
+            ImportError,
+            match=f"{package} is not installed and is required to use this dataset",
+        ):
+            OpenBuildings(root=dataset.root)
+
+    def test_no_shapes_to_rasterize(
+        self, dataset: OpenBuildings, tmp_path: Path
+    ) -> None:
+        """Indexing still returns a crs and a mask when the csv has no rows."""
+        # empty csv buildings file
+        path = os.path.join(tmp_path, "000_buildings.csv.gz")
+        df = pd.read_csv(path)
+        # keep the column names but drop every row
+        df = pd.DataFrame(columns=df.columns)
+        df.to_csv(path, compression="gzip")
+        x = dataset[dataset.bounds]
+        assert isinstance(x, dict)
+        assert isinstance(x["crs"], CRS)
+        assert isinstance(x["mask"], torch.Tensor)
+
+    def test_no_building_data_found(self, tmp_path: Path) -> None:
+        """A root holding only ``tiles.geojson`` raises ``RuntimeError``."""
+        false_root = os.path.join(tmp_path, "empty")
+        os.makedirs(false_root)
+        shutil.copy(
+            os.path.join("tests", "data", "openbuildings", "tiles.geojson"), false_root
+        )
+        with pytest.raises(
+            RuntimeError, match="have manually downloaded the dataset as suggested "
+        ):
+            OpenBuildings(root=false_root)
+
+    def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
+        """With ``checksum=True`` an overwritten archive raises ``RuntimeError``."""
+        # replace the archive contents with plain text
+        with open(os.path.join(tmp_path, "000_buildings.csv.gz"), "w") as f:
+            f.write("bad")
+        with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
+            OpenBuildings(dataset.root, checksum=True)
+
+    def test_no_meta_data_found(self, tmp_path: Path) -> None:
+        """An empty root directory raises ``FileNotFoundError``."""
+        false_root = os.path.join(tmp_path, "empty")
+        os.makedirs(false_root)
+        with pytest.raises(FileNotFoundError, match="Meta data file"):
+            OpenBuildings(root=false_root)
+
+    def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
+        """A ``tile_url`` that matches no file raises ``FileNotFoundError``."""
+        # change meta data to another 'tile_url' so that there is no match found
+        with open(os.path.join(tmp_path, "tiles.geojson"), "r") as f:
+            content = json.load(f)
+            content["features"][0]["properties"]["tile_url"] = "mismatch.csv.gz"
+
+        with open(os.path.join(tmp_path, "tiles.geojson"), "w") as f:
+            json.dump(content, f)
+
+        with pytest.raises(FileNotFoundError, match="data was found in"):
+            OpenBuildings(dataset.root)
+
+    def test_getitem(self, dataset: OpenBuildings) -> None:
+        """Indexing with the dataset bounds returns a dict with crs and mask."""
+        x = dataset[dataset.bounds]
+        assert isinstance(x, dict)
+        assert isinstance(x["crs"], CRS)
+        assert isinstance(x["mask"], torch.Tensor)
+
+    def test_and(self, dataset: OpenBuildings) -> None:
+        """``&`` of two datasets yields an ``IntersectionDataset``."""
+        ds = dataset & dataset
+        assert isinstance(ds, IntersectionDataset)
+
+    def test_or(self, dataset: OpenBuildings) -> None:
+        """``|`` of two datasets yields a ``UnionDataset``."""
+        ds = dataset | dataset
+        assert isinstance(ds, UnionDataset)
+
+    def test_invalid_query(self, dataset: OpenBuildings) -> None:
+        """The bounding box queried here raises ``IndexError``."""
+        query = BoundingBox(100, 100, 100, 100, 0, 0)
+        with pytest.raises(
+            IndexError, match="query: .* not found in index with bounds:"
+        ):
+            dataset[query]
+
+    def test_plot(self, dataset: OpenBuildings) -> None:
+        """Plotting a sample with a suptitle runs without raising."""
+        x = dataset[dataset.bounds]
+        dataset.plot(x, suptitle="test")
+        plt.close()
+
+    def test_plot_prediction(self, dataset: OpenBuildings) -> None:
+        """Plotting a sample that also has a ``prediction`` key does not raise."""
+        x = dataset[dataset.bounds]
+        # reuse a copy of the mask as the prediction
+        x["prediction"] = x["mask"].clone()
+        dataset.plot(x, suptitle="Prediction")
+        plt.close()
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -66,6 +66,7 @@
 from .naip import NAIP
 from .nasa_marine_debris import NASAMarineDebris
 from .nwpu import VHR10
+from .openbuildings import OpenBuildings
 from .oscd import OSCD
 from .patternnet import PatternNet
 from .potsdam import Potsdam2D
@@ -121,6 +122,7 @@
     "Landsat8",
     "Landsat9",
     "NAIP",
+    "OpenBuildings",
     "Sentinel",
     "Sentinel2",
     # VisionDataset