Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test Hive S3 datalake over proxy #11310

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import io.trino.plugin.hive.s3.S3HiveQueryRunner;
import io.trino.testing.AbstractTestQueryFramework;
import io.trino.testing.QueryRunner;
import org.testcontainers.containers.Network;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

Expand All @@ -46,30 +47,34 @@
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.joining;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.testcontainers.containers.Network.newNetwork;

public abstract class BaseTestHiveOnDataLake
extends AbstractTestQueryFramework
{
private static final String HIVE_TEST_SCHEMA = "hive_insert_overwrite";
protected final String bucketName;
protected final String schemaName;
protected final Network network;

private String bucketName;
private HiveMinioDataLake dockerizedS3DataLake;
private HiveMetastore metastoreClient;
protected HiveMinioDataLake dockerizedS3DataLake;
protected HiveMetastore metastoreClient;

private final String hiveHadoopImage;

public BaseTestHiveOnDataLake(String hiveHadoopImage)
{
this.hiveHadoopImage = requireNonNull(hiveHadoopImage, "hiveHadoopImage is null");
this.bucketName = "test-hive-data-lake-" + randomTableSuffix();
this.schemaName = "hive_data_lake_" + randomTableSuffix();
this.network = closeAfterClass(newNetwork());
}

@Override
protected QueryRunner createQueryRunner()
throws Exception
{
this.bucketName = "test-hive-insert-overwrite-" + randomTableSuffix();
this.dockerizedS3DataLake = closeAfterClass(
new HiveMinioDataLake(bucketName, ImmutableMap.of(), hiveHadoopImage));
new HiveMinioDataLake(bucketName, ImmutableMap.of(), hiveHadoopImage, Optional.of(network)));
this.dockerizedS3DataLake.start();
this.metastoreClient = new BridgingHiveMetastore(new ThriftHiveMetastore(
new TestingMetastoreLocator(
Expand All @@ -87,24 +92,26 @@ protected QueryRunner createQueryRunner()
new NoHdfsAuthentication()),
false),
HiveIdentity.none());
return S3HiveQueryRunner.create(
dockerizedS3DataLake,
ImmutableMap.<String, String>builder()
// This is required when using MinIO which requires path style access
.put("hive.insert-existing-partitions-behavior", "OVERWRITE")
.put("hive.non-managed-table-writes-enabled", "true")
// Below are required to enable caching on metastore
.put("hive.metastore-cache-ttl", "1d")
.put("hive.metastore-refresh-interval", "1d")
.buildOrThrow());
return S3HiveQueryRunner.create(dockerizedS3DataLake, getAdditionalHivePropertiesBuilder().buildOrThrow());
}

protected ImmutableMap.Builder<String, String> getAdditionalHivePropertiesBuilder()
{
return ImmutableMap.<String, String>builder()
// This is required when using MinIO which requires path style access
.put("hive.insert-existing-partitions-behavior", "OVERWRITE")
.put("hive.non-managed-table-writes-enabled", "true")
// Below are required to enable caching on metastore
.put("hive.metastore-cache-ttl", "1d")
.put("hive.metastore-refresh-interval", "1d");
}

@BeforeClass
public void setUp()
{
computeActual(format(
"CREATE SCHEMA hive.%1$s WITH (location='s3a://%2$s/%1$s')",
HIVE_TEST_SCHEMA,
schemaName,
bucketName));
}

Expand Down Expand Up @@ -190,7 +197,7 @@ public void testInsertOverwritePartitionedAndBucketedExternalTable()
"partitioned_by=ARRAY['regionkey']",
"bucketed_by = ARRAY['nationkey']",
"bucket_count = 3",
format("external_location = 's3a://%s/%s/%s/'", this.bucketName, HIVE_TEST_SCHEMA, testTable)));
format("external_location = 's3a://%s/%s/%s/'", this.bucketName, schemaName, testTable)));
copyTpchNationToTable(testTable);
assertOverwritePartition(externalTableName);
}
Expand Down Expand Up @@ -231,7 +238,7 @@ public void testFlushPartitionCache()
// Refresh cache for schema_name => 'dummy_schema', table_name => 'dummy_table', partition_column =>
getQueryRunner().execute(format(
"CALL system.flush_metadata_cache(schema_name => '%s', table_name => '%s', partition_column => ARRAY['%s'], partition_value => ARRAY['%s'])",
HIVE_TEST_SCHEMA,
schemaName,
tableName,
partitionColumn,
partitionValue1));
Expand All @@ -247,7 +254,7 @@ public void testFlushPartitionCache()
private void renamePartitionResourcesOutsideTrino(String tableName, String partitionColumn, String regionKey)
{
String partitionName = format("%s=%s", partitionColumn, regionKey);
String partitionS3KeyPrefix = format("%s/%s/%s", HIVE_TEST_SCHEMA, tableName, partitionName);
String partitionS3KeyPrefix = format("%s/%s/%s", schemaName, tableName, partitionName);
String renamedPartitionSuffix = "CP";

// Copy whole partition to new location
Expand All @@ -264,13 +271,13 @@ private void renamePartitionResourcesOutsideTrino(String tableName, String parti
});

// Delete old partition and update metadata to point to location of new copy
Table hiveTable = metastoreClient.getTable(HIVE_TEST_SCHEMA, tableName).get();
Table hiveTable = metastoreClient.getTable(schemaName, tableName).get();
Partition hivePartition = metastoreClient.getPartition(hiveTable, List.of(regionKey)).get();
Map<String, PartitionStatistics> partitionStatistics =
metastoreClient.getPartitionStatistics(hiveTable, List.of(hivePartition));

metastoreClient.dropPartition(HIVE_TEST_SCHEMA, tableName, List.of(regionKey), true);
metastoreClient.addPartitions(HIVE_TEST_SCHEMA, tableName, List.of(
metastoreClient.dropPartition(schemaName, tableName, List.of(regionKey), true);
metastoreClient.addPartitions(schemaName, tableName, List.of(
new PartitionWithStatistics(
Partition.builder(hivePartition)
.withStorage(builder -> builder.setLocation(
Expand Down Expand Up @@ -333,7 +340,7 @@ protected String getTestTableName()

protected String getTestTableName(String tableName)
{
return format("hive.%s.%s", HIVE_TEST_SCHEMA, tableName);
return format("hive.%s.%s", schemaName, tableName);
}

protected String getCreateTableStatement(String tableName, String... propertiesEntries)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive;

import com.google.common.collect.ImmutableMap;
import io.trino.plugin.hive.containers.HiveHadoop;
import io.trino.testing.containers.Httpd;
import io.trino.testing.containers.Minio;

import static io.trino.testing.containers.Httpd.builderForHttpProxy;

public class TestHive2OnDataLakeOverProxy
extends BaseTestHiveOnDataLake
Comment on lines +23 to +24
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How many, and which tests do we run with S3 HTTP proxy?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the first one in trino repository. And in this particular case all underlining tests in class will be using S3 (MinIO) over proxy.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand, but the tests actually invoked look like a random selection. For example flush procedure tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that's true. Cause behind my decision is that. I needed:

  • Hive Data Lake set up
  • test's utilising complex operations on S3 (MinIO).
    BaseTestHiveOnDataLake test seems to be perfect fit aside cache flush (which are pretty fast so shouldn't be a big deal).
    We could ofcourse sub-select some tests and either replicate them or make separate base class.
    But I'm not sure if it's worth. In current setup it's a natural place to add new use cases for hive data lake. And we will have a proof that all of those use cases would work over proxy as well (which i agree might be an overkill 🤷‍♂️ )

{
public TestHive2OnDataLakeOverProxy()
{
super(HiveHadoop.DEFAULT_IMAGE);
}

@Override
protected ImmutableMap.Builder<String, String> getAdditionalHivePropertiesBuilder()
{
Httpd httpd = closeAfterClass(builderForHttpProxy(8888)
.addModuleConfiguration("" +
"<VirtualHost *:8888>\n" +
" ErrorLog /var/log/http.log\n" +
" \n" +
" ProxyRequests Off\n" +
" ProxyVia Block\n" +
" ProxyPreserveHost On\n" +
" \n" +
" <Proxy *>\n" +
" Require all granted\n" +
" </Proxy>\n" +
" \n" +
" ProxyPass / http://" + Minio.DEFAULT_HOST_NAME + ":" + Minio.MINIO_API_PORT + "/\n" +
" ProxyPassReverse / http://" + Minio.DEFAULT_HOST_NAME + ":" + Minio.MINIO_API_PORT + "/\n" +
"</VirtualHost>")
.withNetwork(network)
.build());
httpd.start();
return super.getAdditionalHivePropertiesBuilder()
.put("hive.s3.ssl.enabled", "false")
.put("hive.s3.proxy.protocol", "http")
.put("hive.s3.proxy.host", "localhost")
.put("hive.s3.proxy.port", "" + httpd.getListenPort());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.testcontainers.containers.Network;

import java.util.Map;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkState;
import static java.util.Objects.requireNonNull;
Expand All @@ -42,6 +43,7 @@ public class HiveMinioDataLake
private final AutoCloseableCloser closer = AutoCloseableCloser.create();

private State state = State.INITIAL;
private Optional<Network> network;
private AmazonS3 s3Client;

public HiveMinioDataLake(String bucketName, Map<String, String> hiveHadoopFilesToMount)
Expand All @@ -51,25 +53,33 @@ public HiveMinioDataLake(String bucketName, Map<String, String> hiveHadoopFilesT

public HiveMinioDataLake(String bucketName, Map<String, String> hiveHadoopFilesToMount, String hiveHadoopImage)
{
this(bucketName, hiveHadoopFilesToMount, hiveHadoopImage, Optional.of(newNetwork()));
}

public HiveMinioDataLake(String bucketName, Map<String, String> hiveHadoopFilesToMount, String hiveHadoopImage, Optional<Network> network)
{
Minio.Builder minioBuilder = Minio.builder()
.withEnvVars(ImmutableMap.<String, String>builder()
.put("MINIO_ACCESS_KEY", ACCESS_KEY)
.put("MINIO_SECRET_KEY", SECRET_KEY)
.buildOrThrow());
HiveHadoop.Builder hiveHadoopBuilder = HiveHadoop.builder()
.withFilesToMount(ImmutableMap.<String, String>builder()
.put("hive_minio_datalake/hive-core-site.xml", "/etc/hadoop/conf/core-site.xml")
.putAll(hiveHadoopFilesToMount)
.buildOrThrow())
.withImage(hiveHadoopImage);

this.network = network;
this.network.ifPresent(value -> {
closer.register(value);
minioBuilder.withNetwork(value);
hiveHadoopBuilder.withNetwork(value);
});

this.bucketName = requireNonNull(bucketName, "bucketName is null");
Network network = closer.register(newNetwork());
this.minio = closer.register(
Minio.builder()
.withNetwork(network)
.withEnvVars(ImmutableMap.<String, String>builder()
.put("MINIO_ACCESS_KEY", ACCESS_KEY)
.put("MINIO_SECRET_KEY", SECRET_KEY)
.buildOrThrow())
.build());
this.hiveHadoop = closer.register(
HiveHadoop.builder()
.withFilesToMount(ImmutableMap.<String, String>builder()
.put("hive_minio_datalake/hive-core-site.xml", "/etc/hadoop/conf/core-site.xml")
.putAll(hiveHadoopFilesToMount)
.buildOrThrow())
.withImage(hiveHadoopImage)
.withNetwork(network)
.build());
this.minio = closer.register(minioBuilder.build());
this.hiveHadoop = closer.register(hiveHadoopBuilder.build());
}

public void start()
Expand All @@ -92,6 +102,11 @@ public void start()
state = State.STARTED;
}

public Optional<Network> getNetwork()
{
return network;
}

public AmazonS3 getS3Client()
{
checkState(state == State.STARTED, "Can't provide client when MinIO state is: %s", state);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,20 @@ protected void withRunCommand(List<String> runCommand)

protected void copyFileToContainer(String resourcePath, String dockerPath)
{
container.withCopyFileToContainer(
forHostPath(
forClasspathResource(resourcePath)
// Container fails to mount jar:file:/<host_path>!<resource_path> resources
// This assures that JAR resources are being copied out to tmp locations
// and mounted from there.
.getResolvedPath()),
copyHostFileToContainer(
forClasspathResource(resourcePath)
// Container fails to mount jar:file:/<host_path>!<resource_path> resources
// This assures that JAR resources are being copied out to tmp locations
// and mounted from there.
.getResolvedPath(),
dockerPath);
}

protected void copyHostFileToContainer(String hostPath, String dockerPath)
{
container.withCopyFileToContainer(forHostPath(hostPath), dockerPath);
}

protected HostAndPort getMappedHostAndPortForExposedPort(int exposedPort)
{
return fromParts(container.getHost(), container.getMappedPort(exposedPort));
Expand Down
Loading