forked from microsoft/torchgeo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add iNaturalist dataset (microsoft#532)
- Loading branch information
1 parent
25349ca
commit d517197
Showing 6 changed files with 265 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Writes a small fake iNaturalist CSV export into the current working directory.

import pandas as pd

filename = "observations-012345.csv"

# User can select which columns to export. The following are the default columns.
# Not all columns may exist in the actual dataset.
size = 4
data = {
    "id": [""] * size,
    "observed_on_string": [""] * size,
    # Rows differ in which timestamp fields are filled in: the first two rows have
    # neither, the third has only a date, the fourth has both a date and a time.
    "observed_on": ["", "", "2022-05-07", "2022-05-07"],
    "time_observed_at": ["", "", "", "2022-05-07 11:02:53 +0100"],
    "time_zone": ["Central Time (US & Canada)"] * size,
    "user_id": [123] * size,
    "user_login": ["darwin"] * size,
    "created_at": ["2022-05-07 11:02:53 +0100"] * size,
    "updated_at": ["2022-05-07 11:02:53 +0100"] * size,
    "quality_grade": ["research"] * size,
    "license": ["CCO"] * size,
    "url": ["https://inaturalist.org/observations/123"] * size,
    "image_url": [
        "https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg"
    ]
    * size,
    "sound_url": ["https://static.inaturalist.org/sounds/123.m4a?123"] * size,
    "tag_list": ["Chicago"] * size,
    "description": [""] * size,
    "num_identification_agreements": [1] * size,
    "num_identification_disagreements": [0] * size,
    "captive_cultivated": ["false"] * size,
    "oauth_application_id": [""] * size,
    "place_guess": ["Chicago"] * size,
    "latitude": [41.881832] * size,
    # The first row intentionally has no longitude; the rest share one location.
    "longitude": [""] + [-87.623177] * (size - 1),
    "positional_accuracy": [5] * size,
    "private_place_guess": [""] * size,
    "private_latitude": [""] * size,
    "private_longitude": [""] * size,
    "public_positional_accuracy": [5] * size,
    "geoprivacy": [""] * size,
    "taxon_geoprivacy": [""] * size,
    "coordinates_obscured": ["false"] * size,
    "positioning_method": ["gps"] * size,
    "positioning_device": ["gps"] * size,
    "species_guess": ["Homo sapiens"] * size,
    "scientific_name": ["Homo sapiens"] * size,
    "common_name": ["human"] * size,
    "iconic_taxon_name": ["Animalia"] * size,
    "taxon_id": [123] * size,
}

# index=False keeps the pandas row index out of the file so that the header
# consists only of the column names above.
df = pd.DataFrame(data)
df.to_csv(filename, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,updated_at,quality_grade,license,url,image_url,sound_url,tag_list,description,num_identification_agreements,num_identification_disagreements,captive_cultivated,oauth_application_id,place_guess,latitude,longitude,positional_accuracy,private_place_guess,private_latitude,private_longitude,public_positional_accuracy,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id | ||
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123 | ||
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123 | ||
,,2022-05-07,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123 | ||
,,2022-05-07,2022-05-07 11:02:53 +0100,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
|
||
import builtins | ||
import os | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
import pytest | ||
from _pytest.monkeypatch import MonkeyPatch | ||
|
||
from torchgeo.datasets import ( | ||
BoundingBox, | ||
INaturalist, | ||
IntersectionDataset, | ||
UnionDataset, | ||
) | ||
|
||
# Skip every test in this module when pandas is missing or older than 0.23.2.
pytest.importorskip("pandas", minversion="0.23.2")


class TestINaturalist:
    """Exercise INaturalist against the ``tests/data/inaturalist`` fixture."""

    @pytest.fixture(scope="class")
    def dataset(self) -> INaturalist:
        """Build one dataset instance shared by all tests in this class."""
        root = os.path.join("tests", "data", "inaturalist")
        return INaturalist(root)

    def test_getitem(self, dataset: INaturalist) -> None:
        """Querying with the dataset's own bounds returns a dict sample."""
        x = dataset[dataset.bounds]
        assert isinstance(x, dict)

    def test_len(self, dataset: INaturalist) -> None:
        """The dataset reports three entries for the fixture."""
        # The fixture CSV has four rows, one of which has no longitude.
        assert len(dataset) == 3

    def test_and(self, dataset: INaturalist) -> None:
        """``&`` of two datasets yields an IntersectionDataset."""
        ds = dataset & dataset
        assert isinstance(ds, IntersectionDataset)

    def test_or(self, dataset: INaturalist) -> None:
        """``|`` of two datasets yields a UnionDataset."""
        ds = dataset | dataset
        assert isinstance(ds, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            INaturalist(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Make ``import pandas`` fail while leaving all other imports intact."""
        import_orig = builtins.__import__

        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
            if name == "pandas":
                raise ImportError()
            return import_orig(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", mocked_import)

    def test_mock_missing_module(
        self, dataset: INaturalist, mock_missing_module: None
    ) -> None:
        """Constructing the dataset without pandas raises a helpful ImportError."""
        with pytest.raises(
            ImportError,
            match="pandas is not installed and is required to use this dataset",
        ):
            INaturalist(dataset.root)

    def test_invalid_query(self, dataset: INaturalist) -> None:
        """A query that hits nothing in the index raises IndexError."""
        query = BoundingBox(0, 0, 0, 0, 0, 0)
        with pytest.raises(
            IndexError, match="query: .* not found in index with bounds:"
        ):
            dataset[query]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. | ||
|
||
"""Dataset for iNaturalist.""" | ||
|
||
import glob | ||
import os | ||
import sys | ||
from typing import Any, Dict | ||
|
||
from rasterio.crs import CRS | ||
|
||
from .geo import GeoDataset | ||
from .utils import BoundingBox, disambiguate_timestamp | ||
|
||
|
||
class INaturalist(GeoDataset):
    """Dataset for iNaturalist.

    `iNaturalist <https://www.inaturalist.org/>`_ is a joint initiative of the
    California Academy of Sciences and the National Geographic Society. It allows
    citizen scientists to upload observations of organisms that can be downloaded by
    scientists and researchers.

    If you use an iNaturalist dataset in your research, please cite it according to:

    * https://www.inaturalist.org/pages/help#cite

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        files = glob.glob(os.path.join(root, "**.csv"))
        if not files:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        # pandas is an optional dependency, so import it lazily and turn a missing
        # install into an actionable message while keeping the original cause.
        try:
            import pandas as pd  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            ) from err

        # Read CSV file (only the first match; other CSVs in ``root`` are ignored)
        columns = ["observed_on", "time_observed_at", "latitude", "longitude"]
        data = pd.read_csv(files[0], engine="c", usecols=columns)

        # ``usecols`` ignores the order of its elements and keeps the file's column
        # order, so reindex explicitly before unpacking rows by position below.
        data = data[columns]

        # Dataset contains many possible timestamps:
        #
        # * observed_on_string: no consistent format (can't use)
        # * observed_on: day precision (better)
        # * time_observed_at: second precision (best)
        # * created_at: when observation was submitted (shouldn't use)
        # * updated_at: when submission was updated (shouldn't use)
        #
        # The created_at/updated_at timestamps can be years after the actual
        # observation, so they shouldn't be used, even if
        # observed_on/time_observed_at are missing.

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, time, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if pd.isna(y) or pd.isna(x):
                continue

            # Prefer the most precise timestamp that is present
            if not pd.isna(time):
                mint, maxt = disambiguate_timestamp(time, "%Y-%m-%d %H:%M:%S %z")
            elif not pd.isna(date):
                mint, maxt = disambiguate_timestamp(date, "%Y-%m-%d")
            else:
                # No usable timestamp: fall back to the range 0..sys.maxsize
                mint, maxt = 0, sys.maxsize

            # Each observation is a single point, so min and max coincide
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample