Skip to content

Commit

Permalink
Add OpenBuildings dataset (#402)
Browse files Browse the repository at this point in the history
* populate index attempt

* added tests

* correct plot method

* fix test

* fix documentation

* fix docs

* name changes

* lazy import pandas and Any instead of Tensor

* requested changes

* mypy fixes

* Close plot filehandles

Co-authored-by: Adam J. Stewart <ajstewart426@gmail.com>
  • Loading branch information
nilsleh and adamjstewart committed Feb 27, 2022
1 parent 4c221df commit 06ec364
Show file tree
Hide file tree
Showing 7 changed files with 739 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ National Agriculture Imagery Program (NAIP)

.. autoclass:: NAIP

Open Buildings
^^^^^^^^^^^^^^

.. autoclass:: OpenBuildings

Sentinel
^^^^^^^^

Expand Down
Binary file added tests/data/openbuildings/000_buildings.csv.gz
Binary file not shown.
105 changes: 105 additions & 0 deletions tests/data/openbuildings/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import csv
import gzip
import hashlib
import json
import os
import random
import shutil

import numpy as np
from shapely.geometry import Polygon

# Edge length of the square tile footprint written to tiles.geojson; each
# building footprint is SIZE / 10 wide. The values are used as lat/long
# coordinates, so the unit is presumably degrees — TODO confirm.
SIZE = 0.05

# Seed both RNGs for reproducibility.
# NOTE(review): none of the functions in this script appear to draw random
# numbers, so these seeds currently have no visible effect.
np.random.seed(0)
random.seed(0)


def create_meta_data_file(zipfilename):
    """Build the GeoJSON tile index describing the single synthetic tile.

    Args:
        zipfilename: file name of the gzipped buildings CSV that the tile's
            ``tile_url`` property should point to

    Returns:
        a GeoJSON ``FeatureCollection`` dict with one square tile feature
        spanning ``(0, 0)`` to ``(SIZE, SIZE)``
    """
    # Closed ring: the first vertex is repeated at the end.
    tile_ring = [[0.0, 0.0], [0.0, SIZE], [SIZE, SIZE], [SIZE, 0.0], [0.0, 0.0]]

    tile_feature = {
        "type": "Feature",
        "geometry": {"type": "Polygon", "coordinates": [tile_ring]},
        "properties": {
            "tile_id": "025",
            "tile_url": f"polygons_s2_level_4_gzip/{zipfilename}",
            "size_mb": 0.2,
        },
    }

    return {"type": "FeatureCollection", "features": [tile_feature]}


def create_csv_data_row(lat, long):
    """Create one buildings CSV record for a square building footprint.

    The footprint is an axis-aligned square with side ``SIZE / 10`` centred on
    (``long``, ``lat``) and is stored as WKT in the ``geometry`` column.

    Args:
        lat: latitude of the building centre (used as the y coordinate)
        long: longitude of the building centre (used as the x coordinate)

    Returns:
        dict mapping each CSV column name to its value for this building
    """
    width, height = SIZE / 10, SIZE / 10
    minx = long - 0.5 * width
    maxx = long + 0.5 * width
    miny = lat - 0.5 * height
    # The upper edge must lie above the centre. Subtracting here (as the
    # previous version did) makes miny == maxy and collapses the footprint
    # into a zero-area, degenerate polygon.
    maxy = lat + 0.5 * height
    # Closed ring: the first vertex is repeated at the end.
    coordinates = [(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny), (minx, miny)]
    polygon = Polygon(coordinates)

    data_row = {
        "latitude": lat,
        "longitude": long,
        "area_in_meters": 1.0,
        "confidence": 1.0,
        "geometry": polygon.wkt,
        "full_plus_code": "ABC",
    }

    return data_row


def create_buildings_data():
    """Create the CSV records for the two synthetic buildings in the tile.

    The buildings are centred a quarter of ``SIZE`` in from the lower-left and
    upper-right corners of the tile respectively.

    Returns:
        list of row dicts, one per building, as produced by
        ``create_csv_data_row``
    """
    quarter = SIZE / 4
    # (lat, long) centre of each building.
    centres = ((quarter, quarter), (SIZE - quarter, SIZE - quarter))
    return [create_csv_data_row(lat, long) for lat, long in centres]


if __name__ == "__main__":
    csvname = "000_buildings.csv"
    zipfilename = csvname + ".gz"

    # Create and save the tile index (metadata) next to the buildings archive.
    meta_data = create_meta_data_file(zipfilename)
    with open("tiles.geojson", "w") as fp:
        json.dump(meta_data, fp)

    # Write the buildings table to a temporary, uncompressed CSV file.
    buildings_data = create_buildings_data()
    keys = buildings_data[0].keys()
    # newline="" lets the csv module control line endings itself; without it
    # an extra "\r" is inserted on platforms that translate "\n" on write.
    with open(csvname, "w", newline="") as f:
        w = csv.DictWriter(f, keys)
        w.writeheader()
        w.writerows(buildings_data)

    # Compress the CSV into the gzip archive the dataset expects.
    with open(csvname, "rb") as f_in, gzip.open(zipfilename, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Print the archive checksum so it can be copied into the unit tests.
    with open(zipfilename, "rb") as f:
        md5 = hashlib.md5(f.read()).hexdigest()
    print(f"{zipfilename}: {md5}")

    # The uncompressed CSV was only an intermediate artefact.
    os.remove(csvname)
1 change: 1 addition & 0 deletions tests/data/openbuildings/tiles.geojson
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[0.0, 0.0], [0.0, 0.05], [0.05, 0.05], [0.05, 0.0], [0.0, 0.0]]]}, "properties": {"tile_id": "025", "tile_url": "polygons_s2_level_4_gzip/000_buildings.csv.gz", "size_mb": 0.2}}]}
156 changes: 156 additions & 0 deletions tests/datasets/test_openbuildings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import json
import os
import shutil
from pathlib import Path
from typing import Any, Generator

import matplotlib.pyplot as plt
import pandas as pd
import pytest
import torch
import torch.nn as nn
from _pytest.fixtures import SubRequest
from _pytest.monkeypatch import MonkeyPatch
from rasterio.crs import CRS

from torchgeo.datasets import (
BoundingBox,
IntersectionDataset,
OpenBuildings,
UnionDataset,
)

# Skip every test in this module when pandas is missing or older than 0.19.1.
# NOTE(review): `import pandas as pd` near the top of this module executes
# before this line, so a missing pandas would raise ImportError at that import
# instead of triggering the skip — confirm whether that import should move.
pytest.importorskip("pandas", minversion="0.19.1")


class TestOpenBuildings:
    """Unit tests for ``OpenBuildings`` backed by the synthetic test tile."""

    @staticmethod
    def _check_sample(sample: Any) -> None:
        """Assert that ``sample`` is a dict holding a CRS and a mask tensor."""
        assert isinstance(sample, dict)
        assert isinstance(sample["crs"], CRS)
        assert isinstance(sample["mask"], torch.Tensor)

    @pytest.fixture
    def dataset(
        self, monkeypatch: Generator[MonkeyPatch, None, None], tmp_path: Path
    ) -> OpenBuildings:
        """Return a dataset rooted in a temp dir populated with the test data."""
        root = str(tmp_path)
        data_dir = os.path.join("tests", "data", "openbuildings")
        for filename in ("tiles.geojson", "000_buildings.csv.gz"):
            shutil.copy(os.path.join(data_dir, filename), root)

        # Replace the class-level checksums with the one for the test archive.
        md5s = {"000_buildings.csv.gz": "20aeeec9d45a0ce4d772a26e0bcbc25f"}
        monkeypatch.setattr(OpenBuildings, "md5s", md5s)  # type: ignore[attr-defined]

        identity = nn.Identity()  # type: ignore[attr-defined]
        return OpenBuildings(root=root, transforms=identity)

    @pytest.fixture(params=["pandas"])
    def mock_missing_module(
        self, monkeypatch: Generator[MonkeyPatch, None, None], request: SubRequest
    ) -> str:
        """Make importing the parametrized package fail; return its name."""
        real_import = builtins.__import__
        blocked = str(request.param)

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            # Delegate every import except the one being simulated as missing.
            if name != blocked:
                return real_import(name, *args, **kwargs)
            raise ImportError()

        monkeypatch.setattr(  # type: ignore[attr-defined]
            builtins, "__import__", fake_import
        )
        return blocked

    def test_mock_missing_module(
        self, dataset: OpenBuildings, mock_missing_module: str
    ) -> None:
        """Constructing the dataset without the package raises ImportError."""
        expected = (
            f"{mock_missing_module} is not installed and is required to use this dataset"
        )
        with pytest.raises(ImportError, match=expected):
            OpenBuildings(root=dataset.root)

    def test_no_shapes_to_rasterize(
        self, dataset: OpenBuildings, tmp_path: Path
    ) -> None:
        """A buildings file with a header but no rows still yields a sample."""
        archive = os.path.join(tmp_path, "000_buildings.csv.gz")
        # Overwrite the archive with an empty table that keeps the columns.
        columns = pd.read_csv(archive).columns
        pd.DataFrame(columns=columns).to_csv(archive, compression="gzip")

        self._check_sample(dataset[dataset.bounds])

    def test_no_building_data_found(self, tmp_path: Path) -> None:
        """Metadata without any buildings archive raises RuntimeError."""
        empty_root = os.path.join(tmp_path, "empty")
        os.makedirs(empty_root)
        metadata = os.path.join("tests", "data", "openbuildings", "tiles.geojson")
        shutil.copy(metadata, empty_root)

        expected = "have manually downloaded the dataset as suggested "
        with pytest.raises(RuntimeError, match=expected):
            OpenBuildings(root=empty_root)

    def test_corrupted(self, dataset: OpenBuildings, tmp_path: Path) -> None:
        """An archive failing the checksum raises RuntimeError."""
        archive = os.path.join(tmp_path, "000_buildings.csv.gz")
        with open(archive, "w") as f:
            f.write("bad")

        with pytest.raises(RuntimeError, match="Dataset found, but corrupted."):
            OpenBuildings(dataset.root, checksum=True)

    def test_no_meta_data_found(self, tmp_path: Path) -> None:
        """A root directory without the metadata file raises FileNotFoundError."""
        empty_root = os.path.join(tmp_path, "empty")
        os.makedirs(empty_root)

        with pytest.raises(FileNotFoundError, match="Meta data file"):
            OpenBuildings(root=empty_root)

    def test_nothing_in_index(self, dataset: OpenBuildings, tmp_path: Path) -> None:
        """Metadata whose tile URL matches no file raises FileNotFoundError."""
        metadata_path = os.path.join(tmp_path, "tiles.geojson")

        # Point 'tile_url' at a file that does not exist so nothing matches.
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
        metadata["features"][0]["properties"]["tile_url"] = "mismatch.csv.gz"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f)

        with pytest.raises(FileNotFoundError, match="data was found in"):
            OpenBuildings(dataset.root)

    def test_getitem(self, dataset: OpenBuildings) -> None:
        """Querying the full bounds returns a well-formed sample."""
        self._check_sample(dataset[dataset.bounds])

    def test_and(self, dataset: OpenBuildings) -> None:
        """``&`` produces an IntersectionDataset."""
        assert isinstance(dataset & dataset, IntersectionDataset)

    def test_or(self, dataset: OpenBuildings) -> None:
        """``|`` produces a UnionDataset."""
        assert isinstance(dataset | dataset, UnionDataset)

    def test_invalid_query(self, dataset: OpenBuildings) -> None:
        """A query outside the index bounds raises IndexError."""
        outside = BoundingBox(100, 100, 100, 100, 0, 0)
        expected = "query: .* not found in index with bounds:"
        with pytest.raises(IndexError, match=expected):
            dataset[outside]

    def test_plot(self, dataset: OpenBuildings) -> None:
        """Plotting a sample runs; the figure is closed afterwards."""
        sample = dataset[dataset.bounds]
        dataset.plot(sample, suptitle="test")
        plt.close()

    def test_plot_prediction(self, dataset: OpenBuildings) -> None:
        """Plotting a sample that also carries a prediction runs."""
        sample = dataset[dataset.bounds]
        sample["prediction"] = sample["mask"].clone()
        dataset.plot(sample, suptitle="Prediction")
        plt.close()
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
from .naip import NAIP
from .nasa_marine_debris import NASAMarineDebris
from .nwpu import VHR10
from .openbuildings import OpenBuildings
from .oscd import OSCD
from .patternnet import PatternNet
from .potsdam import Potsdam2D
Expand Down Expand Up @@ -121,6 +122,7 @@
"Landsat8",
"Landsat9",
"NAIP",
"OpenBuildings",
"Sentinel",
"Sentinel2",
# VisionDataset
Expand Down
Loading

0 comments on commit 06ec364

Please sign in to comment.