DNS resolver on ip check (#9150)
* changes

* changes

* add changeset

* changes

* changes

* changes

* changes

* changes

* add caching and whitelist

* remove hf.space

---------

Co-authored-by: Ali Abid <aliabid94@gmail.com>
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
4 people authored Aug 27, 2024
1 parent 21f117d commit 80c966a
Showing 3 changed files with 60 additions and 17 deletions.
5 changes: 5 additions & 0 deletions .changeset/solid-chicken-love.md
@@ -0,0 +1,5 @@
+---
+"gradio": minor
+---
+
+feat:DNS resolver on ip check
65 changes: 51 additions & 14 deletions gradio/processing_utils.py
@@ -10,11 +10,13 @@
 import socket
 import subprocess
 import tempfile
+import urllib.request
 import warnings
+from functools import lru_cache
 from io import BytesIO
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlunparse
 
 import aiofiles
 import httpx
@@ -271,36 +273,71 @@ def save_file_to_cache(file_path: str | Path, cache_dir: str) -> str:
     return full_temp_file_path
 
 
-def check_public_url(url: str):
+@lru_cache(maxsize=256)
+def resolve_with_google_dns(hostname: str) -> str | None:
+    url = f"https://dns.google/resolve?name={hostname}&type=A"
+
+    with urllib.request.urlopen(url) as response:
+        data = json.loads(response.read().decode())
+
+    if data.get("Status") == 0 and "Answer" in data:
+        for answer in data["Answer"]:
+            if answer["type"] == 1:
+                return answer["data"]
+
+
+# Always return these URLs as is, without checking to see if they resolve
+# to an internal IP address. This is because Hugging Face uses DNS splitting,
+# which means that requests from HF Spaces to HF Datasets or HF Models
+# may resolve to internal IP addresses even if they are publicly accessible.
+PUBLIC_URL_WHITELIST = ["hf.co", "huggingface.co"]
+
+
+def get_public_url(url: str) -> str:
     parsed_url = urlparse(url)
     if parsed_url.scheme not in ["http", "https"]:
-        raise httpx.RequestError(f"Invalid URL: {url}")
+        raise httpx.RequestError(f"Invalid scheme for URL: {url}")
     hostname = parsed_url.hostname
     if not hostname:
-        raise httpx.RequestError(f"Invalid URL: {url}")
+        raise httpx.RequestError(f"Invalid URL: {url}, missing hostname")
+    if hostname.lower() in PUBLIC_URL_WHITELIST:
+        return url
 
     try:
         addrinfo = socket.getaddrinfo(hostname, None)
-    except socket.gaierror:
-        raise httpx.RequestError(f"Cannot resolve hostname: {hostname}") from None
+    except socket.gaierror as e:
+        raise httpx.RequestError(
+            f"Cannot resolve URL with hostname: {hostname}, please download this file and use the path instead."
+        ) from e
 
     for family, _, _, _, sockaddr in addrinfo:
         ip = sockaddr[0]
         if family == socket.AF_INET6:
             ip = ip.split("%")[0]  # Remove scope ID if present
 
-        if not ipaddress.ip_address(ip).is_global:
-            raise httpx.RequestError(
-                f"Non-public IP address found: {ip} for URL: {url}"
-            )
+        if ipaddress.ip_address(ip).is_global:
+            return url
 
-    return True
+    google_resolved_ip = resolve_with_google_dns(hostname)
+    if google_resolved_ip and ipaddress.ip_address(google_resolved_ip).is_global:
+        if parsed_url.scheme == "https":
+            return url
+        new_parsed = parsed_url._replace(netloc=google_resolved_ip)
+        if parsed_url.port:
+            new_parsed = new_parsed._replace(
+                netloc=f"{google_resolved_ip}:{parsed_url.port}"
+            )
+        return urlunparse(new_parsed)
+
+    raise httpx.RequestError(
+        f"No public IP address found for URL: {url}, please download this file and use the path instead."
+    )
 
 
 def save_url_to_cache(url: str, cache_dir: str) -> str:
     """Downloads a file and makes a temporary file path for a copy if does not already
     exist. Otherwise returns the path to the existing temp file."""
-    check_public_url(url)
+    url = get_public_url(url)
 
     temp_dir = hash_url(url)
     temp_dir = Path(cache_dir) / temp_dir
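
Note: the Google-DNS fallback above can be tried in isolation. The sketch below, assuming outbound requests to dns.google are allowed, performs the same DNS-over-HTTPS lookup as resolve_with_google_dns (the helper name lookup_a_record is illustrative, not part of this patch); in Google's JSON API, Status 0 means NOERROR and an answer of type 1 is an A record:

import ipaddress
import json
import urllib.request

def lookup_a_record(hostname: str) -> str | None:
    # Illustrative standalone version of the patch's resolve_with_google_dns.
    url = f"https://dns.google/resolve?name={hostname}&type=A"
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read().decode())
    if data.get("Status") == 0 and "Answer" in data:
        for answer in data["Answer"]:
            if answer["type"] == 1:  # type 1 == A record
                return answer["data"]
    return None

ip = lookup_a_record("huggingface.co")
print(ip, ipaddress.ip_address(ip).is_global if ip else "unresolved")

Because the patch wraps the result in lru_cache, repeated checks of the same hostname hit dns.google only once per process.
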
@@ -314,7 +351,7 @@ def save_url_to_cache(url: str, cache_dir: str) -> str:
             open(full_temp_file_path, "wb") as f,
         ):
             for redirect in response.history:
-                check_public_url(str(redirect.url))
+                get_public_url(str(redirect.url))
 
             for chunk in response.iter_raw():
                 f.write(chunk)
@@ -325,7 +362,7 @@ def save_url_to_cache(url: str, cache_dir: str) -> str:
 async def async_save_url_to_cache(url: str, cache_dir: str) -> str:
     """Downloads a file and makes a temporary file path for a copy if does not already
     exist. Otherwise returns the path to the existing temp file. Uses async httpx."""
-    check_public_url(url)
+    url = get_public_url(url)
 
     temp_dir = hash_url(url)
     temp_dir = Path(cache_dir) / temp_dir
@@ -336,7 +373,7 @@ async def async_save_url_to_cache(url: str, cache_dir: str) -> str:
     if not Path(full_temp_file_path).exists():
         async with async_client.stream("GET", url, follow_redirects=True) as response:
             for redirect in response.history:
-                check_public_url(str(redirect.url))
+                get_public_url(str(redirect.url))
 
             async with aiofiles.open(full_temp_file_path, "wb") as f:
                 async for chunk in response.aiter_raw():
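
In summary, get_public_url accepts a URL as soon as any system-resolved address is globally routable, and only consults Google DNS as a fallback. A minimal sketch of that first phase (the helper name host_has_public_ip is illustrative, not part of this patch):

import ipaddress
import socket

def host_has_public_ip(hostname: str) -> bool:
    # Ask the system resolver; a hostname may map to several addresses.
    try:
        addrinfo = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    for family, _, _, _, sockaddr in addrinfo:
        ip = sockaddr[0]
        if family == socket.AF_INET6:
            ip = ip.split("%")[0]  # drop an IPv6 scope ID such as "%eth0"
        if ipaddress.ip_address(ip).is_global:
            return True
    return False

print(host_has_public_ip("google.com"))  # True on a typical network
print(host_has_public_ip("localhost"))   # False: loopback is not global

One design note on the fallback path: an https URL is returned unchanged rather than rewritten to the Google-resolved IP, since swapping the hostname for an IP would break TLS certificate validation; only plain http URLs get their netloc replaced with the resolved address.
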
7 changes: 4 additions & 3 deletions test/test_processing_utils.py
@@ -420,8 +420,8 @@ async def test_json_data_not_moved_to_cache():
     ],
 )
 def test_local_urls_fail(url):
-    with pytest.raises(httpx.RequestError, match="Non-public IP address found"):
-        processing_utils.check_public_url(url)
+    with pytest.raises(httpx.RequestError, match="No public IP address found for URL"):
+        processing_utils.get_public_url(url)
 
 
 @pytest.mark.parametrize(
@@ -430,7 +430,8 @@ def test_local_urls_fail(url):
"https://google.com",
"https://8.8.8.8/",
"http://93.184.215.14.nip.io/",
"https://huggingface.co/datasets/dylanebert/3dgs/resolve/main/luigi/luigi.ply",
],
)
def test_public_urls_pass(url):
assert processing_utils.check_public_url(url)
assert processing_utils.get_public_url(url)
