Skip to content

Commit

Permalink
use url for hashing instead of inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
ocefpaf committed Feb 26, 2024
1 parent 2fead37 commit 4eaaae7
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 10 deletions.
27 changes: 25 additions & 2 deletions erddapy/core/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import copy
import functools
import io
from collections import OrderedDict
from datetime import datetime
from typing import BinaryIO, Dict, List, Optional, Tuple, Union
from urllib.parse import quote_plus
from urllib import parse

import httpx
import pytz
Expand All @@ -15,6 +16,28 @@
OptionalStr = Optional[str]


def _sort_url(url):
"""
Returns a URL with sorted variables and constraints to ensure unique hash.
"""
parts = parse.urlparse(url)
if parts.query:
query = parts.query.split("&", maxsplit=1)
if len(query) == 1:
variables = parts.query
constraints = ""
else:
variables, constraints = parts.query.split("&", maxsplit=1)
sorted_variables = ",".join(sorted(variables.split(",")))
sorted_query = OrderedDict(sorted(dict(parse.parse_qsl(constraints)).items()))
sorted_query_str = parse.unquote(parse.urlencode(sorted_query))
sorted_url = f"{parts.scheme}://{parts.netloc}{parts.path}?{parts.params}{sorted_variables}&{sorted_query_str}{parts.fragment}"
else:
sorted_url = url
return sorted_url.strip("&")


@functools.lru_cache(maxsize=128)
def _urlopen(url: str, auth: Optional[tuple] = None, **kwargs: Dict) -> BinaryIO:
if "timeout" not in kwargs.keys():
Expand Down Expand Up @@ -207,7 +230,7 @@ def get_search_url(
"&maxTime={maxTime}"
)
if search_for:
search_for = quote_plus(search_for)
search_for = parse.quote_plus(search_for)
base += "&searchFor={searchFor}"

# Convert dates from datetime to `seconds since 1970-01-01T00:00:00Z`.
Expand Down
10 changes: 4 additions & 6 deletions erddapy/erddapy.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_distinct,
_format_constraints_url,
_quote_string_constraints,
_sort_url,
download_formats,
get_categorize_url,
get_download_url,
Expand Down Expand Up @@ -486,12 +487,9 @@ def download_file(
raise ValueError(
f"Requested filetype {file_type} not available on ERDDAP",
)
url = self.get_download_url(response=file_type)
constraints_str = str(dict(sorted(self.constraints.items()))) + str(
sorted(self.variables),
)
constraints_hash = hashlib.shake_256(constraints_str.encode()).hexdigest(5)
file_name = Path(f"{self.dataset_id}_{constraints_hash}.{file_type}")
url = _sort_url(self.get_download_url(response=file_type))
fname_hash = hashlib.shake_256(url.encode()).hexdigest(5)
file_name = Path(f"{self.dataset_id}_{fname_hash}.{file_type}")
if not file_name.exists():
urlretrieve(url, file_name)
return file_name
42 changes: 41 additions & 1 deletion tests/test_to_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,47 @@ def test_to_iris_griddap(dataset_griddap):

@pytest.mark.web
def test_download_file(dataset_tabledap):
    """Download a tabledap dataset that has both variables and constraints set.

    The file name must not change when the variables are listed in the
    reverse order.
    """
    first_path = dataset_tabledap.download_file("nc")
    dataset = xr.load_dataset(first_path)
    for name in ("time", "temperature"):
        assert dataset[name].name == name

    # Same request, variables reversed: the same file name is expected.
    dataset_tabledap.variables = dataset_tabledap.variables[::-1]
    second_path = dataset_tabledap.download_file("nc")
    assert second_path == first_path


@pytest.mark.web
def test_download_file_variables_only(dataset_tabledap):
    """Download a tabledap dataset with variables set and no constraints.

    The file name must not change when the variables are listed in the
    reverse order.
    """
    dataset_tabledap.constraints = {}
    first_path = dataset_tabledap.download_file("nc")
    dataset = xr.load_dataset(first_path)
    for name in ("time", "temperature"):
        assert dataset[name].name == name

    # Same request, variables reversed: the same file name is expected.
    dataset_tabledap.variables = dataset_tabledap.variables[::-1]
    second_path = dataset_tabledap.download_file("nc")
    assert second_path == first_path


@pytest.mark.web
def test_download_file_constraints_only(dataset_tabledap):
    """Download a tabledap dataset with constraints set and no variables.

    The download is repeated after reversing the (empty) variable list and
    must yield the same file name.
    """
    dataset_tabledap.variables = []
    first_path = dataset_tabledap.download_file("nc")
    dataset = xr.load_dataset(first_path)
    for name in ("time", "temperature"):
        assert dataset[name].name == name

    # Reversing an empty list is a no-op, so the file name must be stable.
    dataset_tabledap.variables = dataset_tabledap.variables[::-1]
    second_path = dataset_tabledap.download_file("nc")
    assert second_path == first_path


@pytest.mark.web
def test_download_file_undefined_query(dataset_tabledap):
"""Test direct download of tabledap dataset with undefined query."""
dataset_tabledap.variables = []
dataset_tabledap.constraints = {}
fn = dataset_tabledap.download_file("nc")
ds = xr.load_dataset(fn)
assert ds["time"].name == "time"
Expand Down
29 changes: 28 additions & 1 deletion tests/test_url_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import httpx
import pytest

from erddapy.core.url import check_url_response, urlopen
from erddapy.core.url import _sort_url, check_url_response, urlopen


@pytest.mark.web
Expand Down Expand Up @@ -39,3 +39,30 @@ def test_check_url_response():
)
with pytest.raises(httpx.HTTPError):
check_url_response(bad_request)


def test__sort_url():
    """Check _sort_url on a URL carrying both variables and constraints."""
    unsorted_url = "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?time,temperature&time>=1525737600.0&time<=1526245200.0&latitude>=36&latitude<=38&longitude>=-76&longitude<=-73"
    result = _sort_url(unsorted_url)
    # Variables and constraints are each expected in lexicographic order.
    assert result == "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?temperature,time&latitude<=38&latitude>=36&longitude<=-73&longitude>=-76&time<=1526245200.0&time>=1525737600.0"


def test__sort_url_variables_only():
    """Check _sort_url on a URL whose query has constraints but no variables."""
    # NOTE(review): the test name says "variables only", yet the query below
    # starts with "&" (empty variable list) and holds only constraints.
    # Confirm whether the inputs of this test and of
    # test__sort_url_constraints_only were swapped.
    unsorted_url = "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?&time>=1525737600.0&time<=1526245200.0&latitude>=36&latitude<=38&longitude>=-76&longitude<=-73"
    result = _sort_url(unsorted_url)
    assert result == "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?&latitude<=38&latitude>=36&longitude<=-73&longitude>=-76&time<=1526245200.0&time>=1525737600.0"


def test__sort_url_constraints_only():
    """Check _sort_url on a URL whose query has variables but no constraints."""
    # NOTE(review): the test name says "constraints only", yet the query
    # below holds only a variable list. Confirm whether the inputs of this
    # test and of test__sort_url_variables_only were swapped.
    unsorted_url = "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?time,temperature"
    result = _sort_url(unsorted_url)
    assert result == "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?temperature,time"


def test__sort_url_undefined_query():
    """Check that _sort_url returns a URL with an empty query unchanged."""
    bare_url = "https://erddap.sensors.ioos.us/erddap/tabledap/amelia_20180501t0000.nc?"
    result = _sort_url(bare_url)
    assert result == bare_url

0 comments on commit 4eaaae7

Please sign in to comment.