From 134063246160c8c7eaf9730f7398ad3b460bdf22 Mon Sep 17 00:00:00 2001 From: Marc Cenac <547446+mrcnc@users.noreply.github.com> Date: Thu, 14 Nov 2024 08:38:04 -0600 Subject: [PATCH] Support WASB scheme in ADLSFileIO (#11504) --- .../apache/iceberg/azure/AzureProperties.java | 13 +++++++++++- .../iceberg/azure/adlsv2/ADLSLocation.java | 20 +++++++++++++------ .../iceberg/azure/AzurePropertiesTest.java | 10 ++++++---- .../azure/adlsv2/ADLSLocationTest.java | 19 ++++++++++++++---- .../apache/iceberg/io/ResolvingFileIO.java | 4 +++- 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java index 2d363cbc5231..a7f9885a4726 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java +++ b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java @@ -77,6 +77,17 @@ public Optional adlsWriteBlockSize() { return Optional.ofNullable(adlsWriteBlockSize); } + /** + * Applies configuration to the {@link DataLakeFileSystemClientBuilder} to provide the endpoint + * and credentials required to create an instance of the client. + * + *

The default endpoint is constructed in the form {@code + * https://{account}.dfs.core.windows.net} and default credentials are provided via the {@link + * com.azure.identity.DefaultAzureCredential}. + * + * @param account the service account name + * @param builder the builder instance + */ public void applyClientConfiguration(String account, DataLakeFileSystemClientBuilder builder) { String sasToken = adlsSasTokens.get(account); if (sasToken != null && !sasToken.isEmpty()) { @@ -93,7 +104,7 @@ public void applyClientConfiguration(String account, DataLakeFileSystemClientBui if (connectionString != null && !connectionString.isEmpty()) { builder.endpoint(connectionString); } else { - builder.endpoint("https://" + account); + builder.endpoint("https://" + account + ".dfs.core.windows.net"); } } } diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java index 5af590628fe8..fb91c4cb3233 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java @@ -30,14 +30,21 @@ * *

Locations follow a URI like structure to identify resources * - *

<pre>{@code abfs[s]://[<container>@]<storage account host>/<file path>}</pre>
+ *
<pre>{@code abfs[s]://[<container>@]<storageAccount>.dfs.core.windows.net/<path>}</pre>
+ * + * or + * + *
<pre>{@code wasb[s]://<container>@<storageAccount>.blob.core.windows.net/<path>}</pre>
+ * + * For compatibility, locations using the wasb scheme are also accepted but will use the Azure Data + * Lake Storage Gen2 REST APIs instead of the Blob Storage REST APIs. * *

See Azure * Data Lake Storage URI */ class ADLSLocation { - private static final Pattern URI_PATTERN = Pattern.compile("^abfss?://([^/?#]+)(.*)?$"); + private static final Pattern URI_PATTERN = Pattern.compile("^(abfss?|wasbs?)://([^/?#]+)(.*)?$"); private final String storageAccount; private final String container; @@ -55,17 +62,18 @@ class ADLSLocation { ValidationException.check(matcher.matches(), "Invalid ADLS URI: %s", location); - String authority = matcher.group(1); + String authority = matcher.group(2); String[] parts = authority.split("@", -1); if (parts.length > 1) { this.container = parts[0]; - this.storageAccount = parts[1]; + String host = parts[1]; + this.storageAccount = host.split("\\.", -1)[0]; } else { this.container = null; - this.storageAccount = authority; + this.storageAccount = authority.split("\\.", -1)[0]; } - String uriPath = matcher.group(2); + String uriPath = matcher.group(3); this.path = uriPath == null ? "" : uriPath.startsWith("/") ? uriPath.substring(1) : uriPath; } diff --git a/azure/src/test/java/org/apache/iceberg/azure/AzurePropertiesTest.java b/azure/src/test/java/org/apache/iceberg/azure/AzurePropertiesTest.java index 6b8287c44e58..4f032d7ab125 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/AzurePropertiesTest.java +++ b/azure/src/test/java/org/apache/iceberg/azure/AzurePropertiesTest.java @@ -97,11 +97,13 @@ public void testNoSasToken() { @Test public void testWithConnectionString() { AzureProperties props = - new AzureProperties(ImmutableMap.of("adls.connection-string.account1", "http://endpoint")); + new AzureProperties( + ImmutableMap.of( + "adls.connection-string.account1", "https://account1.dfs.core.usgovcloudapi.net")); DataLakeFileSystemClientBuilder clientBuilder = mock(DataLakeFileSystemClientBuilder.class); props.applyClientConfiguration("account1", clientBuilder); - verify(clientBuilder).endpoint("http://endpoint"); + verify(clientBuilder).endpoint("https://account1.dfs.core.usgovcloudapi.net"); } @Test 
@@ -111,7 +113,7 @@ public void testNoMatchingConnectionString() { DataLakeFileSystemClientBuilder clientBuilder = mock(DataLakeFileSystemClientBuilder.class); props.applyClientConfiguration("account1", clientBuilder); - verify(clientBuilder).endpoint("https://account1"); + verify(clientBuilder).endpoint("https://account1.dfs.core.windows.net"); } @Test @@ -120,7 +122,7 @@ public void testNoConnectionString() { DataLakeFileSystemClientBuilder clientBuilder = mock(DataLakeFileSystemClientBuilder.class); props.applyClientConfiguration("account", clientBuilder); - verify(clientBuilder).endpoint("https://account"); + verify(clientBuilder).endpoint("https://account.dfs.core.windows.net"); } @Test diff --git a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java index 403886f4b28e..10b5e1877cca 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java +++ b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java @@ -33,7 +33,18 @@ public void testLocationParsing(String scheme) { String p1 = scheme + "://container@account.dfs.core.windows.net/path/to/file"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageAccount()).isEqualTo("account"); + assertThat(location.container().get()).isEqualTo("container"); + assertThat(location.path()).isEqualTo("path/to/file"); + } + + @ParameterizedTest + @ValueSource(strings = {"wasb", "wasbs"}) + public void testWasbLocatonParsing(String scheme) { + String p1 = scheme + "://container@account.blob.core.windows.net/path/to/file"; + ADLSLocation location = new ADLSLocation(p1); + + assertThat(location.storageAccount()).isEqualTo("account"); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo("path/to/file"); } @@ -43,7 +54,7 @@ public void 
testEncodedString() { String p1 = "abfs://container@account.dfs.core.windows.net/path%20to%20file"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageAccount()).isEqualTo("account"); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo("path%20to%20file"); } @@ -67,7 +78,7 @@ public void testNoContainer() { String p1 = "abfs://account.dfs.core.windows.net/path/to/file"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageAccount()).isEqualTo("account"); assertThat(location.container().isPresent()).isFalse(); assertThat(location.path()).isEqualTo("path/to/file"); } @@ -77,7 +88,7 @@ public void testNoPath() { String p1 = "abfs://container@account.dfs.core.windows.net"; ADLSLocation location = new ADLSLocation(p1); - assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.storageAccount()).isEqualTo("account"); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo(""); } diff --git a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java index a858045aab8b..a8adf979f85a 100644 --- a/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java +++ b/core/src/main/java/org/apache/iceberg/io/ResolvingFileIO.java @@ -62,7 +62,9 @@ public class ResolvingFileIO implements HadoopConfigurable, DelegateFileIO { "s3n", S3_FILE_IO_IMPL, "gs", GCS_FILE_IO_IMPL, "abfs", ADLS_FILE_IO_IMPL, - "abfss", ADLS_FILE_IO_IMPL); + "abfss", ADLS_FILE_IO_IMPL, + "wasb", ADLS_FILE_IO_IMPL, + "wasbs", ADLS_FILE_IO_IMPL); private final Map ioInstances = Maps.newConcurrentMap(); private final AtomicBoolean isClosed = new AtomicBoolean(false);