Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add EDDMapS dataset #533

Merged
merged 3 commits into from
May 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ Cropland Data Layer (CDL)

.. autoclass:: CDL

EDDMapS
^^^^^^^

.. autoclass:: EDDMapS

EnviroAtlas
^^^^^^^^^^^

Expand Down
97 changes: 97 additions & 0 deletions tests/data/eddmaps/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pandas as pd

# Generate the synthetic EDDMapS fixture: build a small table with one list of
# `size` values per column and write it to `filename` as CSV.

# Output file, written relative to the current working directory.
filename = "mappings.csv"

# Number of rows in the generated table.
size = 3
# Column name -> per-row values. The dict's insertion order becomes the CSV
# column order. Empty strings are written as empty CSV fields.
data = {
"gbifID": [""] * size,
"decimalLatitude": [41.881832] * size,
# First row has no longitude; the remaining rows share one value.
"decimalLongitude": [""] + [-87.623177] * (size - 1),
"objectid": [""] * size,
"reporter": [""] * size,
"RecOwner": [""] * size,
"SciName": ["Homo sapiens"] * size,
"ComName": ["human"] * size,
"Nativity": ["Native"] * size,
"OccStatus": ["Detected"] * size,
"Status": ["Positive"] * size,
# Only the last row has an observation date. This list is written out by hand,
# so it must be kept at `size` entries if `size` changes.
"ObsDate": ["", "", "05-07-22"],
"DateEnt": ["05-07-22"] * size,
"DateUp": ["05-07-22"] * size,
"Location": ["Chicago, Illinois, United States"] * size,
"Latitude": [41.881832] * size,
# Same pattern as decimalLongitude: blank in the first row only.
# (presumably so a loader has a row without coordinates to handle -- confirm)
"Longitude": [""] + [-87.623177] * (size - 1),
"Datum": ["WGS84"] * size,
"Method": [""] * size,
"CoordAcc": [""] * size,
"DataType": [""] * size,
"Centroid": [""] * size,
"Abundance": [""] * size,
"InfestAcre": [""] * size,
"GrossAcre": [""] * size,
"Percentcov": [""] * size,
"Density": [""] * size,
"Quantity": [""] * size,
"QuantityU": [""] * size,
"APPXQuant": [""] * size,
"NumCollect": [""] * size,
"Smallest": [""] * size,
"Largest": [""] * size,
"Incidence": [""] * size,
"Severity": [""] * size,
"Host": [""] * size,
"Host_Name": [""] * size,
"HostPheno": [""] * size,
"HostDamage": [""] * size,
"ManageStat": ["Unknown"] * size,
"PopStat": [""] * size,
"Habitat": [""] * size,
"LocalOwner": [""] * size,
"Site": [""] * size,
"RecBasis": [""] * size,
"Museum": [""] * size,
"MuseumRec": [""] * size,
"Voucher": [""] * size,
"ObsIDer": [""] * size,
"CollectTme": [""] * size,
"UUID": [""] * size,
"OrgSrcID": [""] * size,
"OrigName": ["Homo sapiens"] * size,
"RecSrcTyp": ["Bulk Data"] * size,
"Surveyor": [""] * size,
"DateAcc": [""] * size,
"VisitType": [""] * size,
"DataMthd": [""] * size,
"TrapType": [""] * size,
"NumTraps": [""] * size,
"TargetName": [""] * size,
"TargetCnt": [""] * size,
"TargetRnge": [""] * size,
"Phenology": [""] * size,
"LifeStatus": [""] * size,
"Sex": [""] * size,
"PID": [""] * size,
"WaterName": [""] * size,
"WaterType": [""] * size,
"Substrate": [""] * size,
"TreatArea": [""] * size,
"PlantTreat": [""] * size,
"TreatComm": [""] * size,
"Reference": [""] * size,
"Locality": [""] * size,
"Comments": [""] * size,
"ReviewDate": ["05-07-22"] * size,
"Reviewer": ["Charles Darwin"] * size,
"VerifyMthd": ["Bulk Verified"] * size,
"Verified": ["Verified"] * size,
"IDCred": ["Credible"] * size,
"ReviewComm": [""] * size,
}

# Build the table and write it out; index=False omits the DataFrame row index
# so the file contains only the columns defined above.
df = pd.DataFrame(data)
df.to_csv(filename, index=False)
4 changes: 4 additions & 0 deletions tests/data/eddmaps/mappings.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
gbifID,decimalLatitude,decimalLongitude,objectid,reporter,RecOwner,SciName,ComName,Nativity,OccStatus,Status,ObsDate,DateEnt,DateUp,Location,Latitude,Longitude,Datum,Method,CoordAcc,DataType,Centroid,Abundance,InfestAcre,GrossAcre,Percentcov,Density,Quantity,QuantityU,APPXQuant,NumCollect,Smallest,Largest,Incidence,Severity,Host,Host_Name,HostPheno,HostDamage,ManageStat,PopStat,Habitat,LocalOwner,Site,RecBasis,Museum,MuseumRec,Voucher,ObsIDer,CollectTme,UUID,OrgSrcID,OrigName,RecSrcTyp,Surveyor,DateAcc,VisitType,DataMthd,TrapType,NumTraps,TargetName,TargetCnt,TargetRnge,Phenology,LifeStatus,Sex,PID,WaterName,WaterType,Substrate,TreatArea,PlantTreat,TreatComm,Reference,Locality,Comments,ReviewDate,Reviewer,VerifyMthd,Verified,IDCred,ReviewComm
,41.881832,,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
,41.881832,-87.623177,,,,Homo sapiens,human,Native,Detected,Positive,05-07-22,05-07-22,05-07-22,"Chicago, Illinois, United States",41.881832,-87.623177,WGS84,,,,,,,,,,,,,,,,,,,,,,Unknown,,,,,,,,,,,,,Homo sapiens,Bulk Data,,,,,,,,,,,,,,,,,,,,,,,05-07-22,Charles Darwin,Bulk Verified,Verified,Credible,
67 changes: 67 additions & 0 deletions tests/datasets/test_eddmaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import BoundingBox, EDDMapS, IntersectionDataset, UnionDataset

# Skip this whole test module when pandas cannot be imported or is older than
# the stated minimum version.
pytest.importorskip("pandas", minversion="0.23.2")


class TestEDDMapS:
    """Tests for the EDDMapS dataset, run against the CSV in tests/data/eddmaps."""

    @pytest.fixture(scope="class")
    def dataset(self) -> EDDMapS:
        """Return one EDDMapS instance shared by every test in this class."""
        return EDDMapS(os.path.join("tests", "data", "eddmaps"))

    def test_getitem(self, dataset: EDDMapS) -> None:
        """Indexing with the dataset's own bounds returns a dict sample."""
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: EDDMapS) -> None:
        """The dataset built from the fixture reports a length of 2."""
        assert len(dataset) == 2

    def test_and(self, dataset: EDDMapS) -> None:
        """``&`` combines two datasets into an IntersectionDataset."""
        combined = dataset & dataset
        assert isinstance(combined, IntersectionDataset)

    def test_or(self, dataset: EDDMapS) -> None:
        """``|`` combines two datasets into a UnionDataset."""
        combined = dataset | dataset
        assert isinstance(combined, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            EDDMapS(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Patch ``builtins.__import__`` so importing pandas raises ImportError."""
        real_import = builtins.__import__

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            # Every module other than pandas goes through the real import.
            if name != "pandas":
                return real_import(name, *args, **kwargs)
            raise ImportError()

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_mock_missing_module(
        self, dataset: EDDMapS, mock_missing_module: None
    ) -> None:
        """Constructing the dataset while pandas is unimportable raises ImportError."""
        expected = "pandas is not installed and is required to use this dataset"
        with pytest.raises(ImportError, match=expected):
            EDDMapS(dataset.root)

    def test_invalid_query(self, dataset: EDDMapS) -> None:
        """A query box that matches nothing raises IndexError."""
        query = BoundingBox(0, 0, 0, 0, 0, 0)
        expected = "query: .* not found in index with bounds:"
        with pytest.raises(IndexError, match=expected):
            dataset[query]
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .cv4a_kenya_crop_type import CV4AKenyaCropType
from .cyclone import TropicalCycloneWindEstimation
from .dfc2022 import DFC2022
from .eddmaps import EDDMapS
from .enviroatlas import EnviroAtlas
from .esri2020 import Esri2020
from .etci2021 import ETCI2021
Expand Down Expand Up @@ -117,6 +118,7 @@
"ChesapeakeWV",
"ChesapeakeCVPR",
"CMSGlobalMangroveCanopy",
"EDDMapS",
"Esri2020",
"EUDEM",
"GBIF",
Expand Down
116 changes: 116 additions & 0 deletions torchgeo/datasets/eddmaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for EDDMapS."""

import os
import sys
from typing import Any, Dict

import numpy as np
from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox, disambiguate_timestamp


class EDDMapS(GeoDataset):
    """Dataset for EDDMapS.

    `EDDMapS <https://www.eddmaps.org/>`_, Early Detection and Distribution Mapping
    System, is a web-based mapping system for documenting invasive species and pest
    distribution. Launched in 2005 by the Center for Invasive Species and Ecosystem
    Health at the University of Georgia, it was originally designed as a tool for
    state Exotic Pest Plant Councils to develop more complete distribution data of
    invasive species. Since then, the program has expanded to include the entire US
    and Canada as well as to document certain native pest species.

    EDDMapS query results can be downloaded in CSV, KML, or Shapefile format. This
    dataset currently only supports CSV files.

    If you use an EDDMapS dataset in your research, please cite it like so:

    * EDDMapS. *YEAR*. Early Detection & Distribution Mapping System. The University of
      Georgia - Center for Invasive Species and Ecosystem Health. Available online at
      http://www.eddmaps.org/; last accessed *DATE*.

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Reads ``mappings.csv`` from ``root`` and inserts one index entry per row
        that has both a latitude and a longitude.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if ``mappings.csv`` is not found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        filepath = os.path.join(root, "mappings.csv")
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd  # noqa: F401
        except ImportError:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            )

        # Read CSV file. ``usecols`` only selects columns: pandas returns them in
        # the order they appear in the file, not the order listed. The loop below
        # unpacks rows positionally, so reindex to pin the column order.
        columns = ["ObsDate", "Latitude", "Longitude"]
        data = pd.read_csv(filepath, engine="c", usecols=columns)[columns]

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if np.isnan(y) or np.isnan(x):
                continue

            if pd.isna(date):
                # No observation date: use the widest time range available
                mint, maxt = 0, sys.maxsize
            else:
                mint, maxt = disambiguate_timestamp(date, "%m-%d-%y")

            # A point observation: min and max are equal in both x and y
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            # Only inserted rows advance the id, so ids stay consecutive
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            dict with the dataset ``crs`` and a ``bbox`` list holding the bounding
            box of every index entry that intersects ``query``

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        # NOTE(review): rtree exposes both ``hit.bbox`` and ``hit.bounds`` and they
        # order coordinates differently; confirm ``bbox`` is the layout callers
        # expect here.
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample