diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c9241a7f3c3c..7c8fa4c7638c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -332,7 +332,9 @@ jobs: S3_BUCKET_ENDPOINT: "s3.${{ vars.TRINO_AWS_REGION }}.amazonaws.com" run: | if [ "${AWS_ACCESS_KEY_ID}" != "" ]; then - $MAVEN test ${MAVEN_TEST} -pl :trino-hive -P aws-tests + $MAVEN test ${MAVEN_TEST} -pl :trino-hive -P aws-tests \ + -Ds3.bucket="${S3_BUCKET}" \ + -Ds3.bucket-endpoint="${S3_BUCKET_ENDPOINT}" fi - name: Run Hive Azure ABFS Access Key Tests if: matrix.config != 'config-empty' # Hive 1.x does not support Azure storage diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index 902e45d4853e..ba0ebe735019 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -532,6 +532,7 @@ **/TestHiveGlueMetastore.java **/TestHiveS3AndGlueMetastoreTest.java **/TestTrinoS3FileSystemAwsS3.java + **/TestS3SelectQueries.java **/TestFullParquetReader.java **/Test*FailureRecoveryTest.java @@ -587,6 +588,7 @@ **/TestHiveGlueMetastore.java **/TestHiveS3AndGlueMetastoreTest.java **/TestTrinoS3FileSystemAwsS3.java + **/TestS3SelectQueries.java diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java index 37f3648164ff..2ecf45ddfd0a 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/IonSqlQueryBuilder.java @@ -19,7 +19,6 @@ import com.google.common.primitives.SignedBytes; import io.airlift.slice.Slice; import io.trino.plugin.hive.HiveColumnHandle; -import io.trino.spi.TrinoException; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.Range; import io.trino.spi.predicate.TupleDomain; @@ -33,11 +32,12 @@ import java.util.ArrayList; import java.util.List; +import java.util.Optional; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.Iterables.getOnlyElement; -import static io.trino.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; +import static io.trino.plugin.hive.s3select.S3SelectDataType.CSV; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.DateType.DATE; @@ -62,11 +62,27 @@ public class IonSqlQueryBuilder private static final String DATA_SOURCE = "S3Object s"; private final TypeManager typeManager; private final S3SelectDataType s3SelectDataType; + private final String nullPredicate; + private final String notNullPredicate; - public IonSqlQueryBuilder(TypeManager typeManager, S3SelectDataType s3SelectDataType) + public IonSqlQueryBuilder(TypeManager typeManager, S3SelectDataType s3SelectDataType, Optional optionalNullCharacterEncoding) { + if (optionalNullCharacterEncoding.isPresent()) { + checkArgument(s3SelectDataType == CSV, "Null character encoding should only be provided for CSV data"); + } + this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.s3SelectDataType = requireNonNull(s3SelectDataType, "s3SelectDataType is null"); + + String nullCharacterEncoding = optionalNullCharacterEncoding.orElse(""); + this.nullPredicate = switch (s3SelectDataType) { + case JSON -> "IS NULL"; + case CSV -> "= '%s'".formatted(nullCharacterEncoding); + }; + this.notNullPredicate = switch (s3SelectDataType) { + case JSON -> "IS NOT NULL"; + case CSV -> "!= '%s'".formatted(nullCharacterEncoding); + }; } public String buildSql(List columns, TupleDomain tupleDomain) @@ -105,14 +121,10 @@ public String buildSql(List columns, TupleDomain "s.%s".formatted(column.getBaseColumnName()); + case CSV -> "s._%d".formatted(column.getBaseHiveColumnIndex() + 1); + }; } private List toConjuncts(List columns, TupleDomain tupleDomain) @@ -149,7 +161,7 @@ private String toPredicate(Domain domain, Type type, HiveColumnHandle column) if (domain.getValues().isNone()) { if (domain.isNullAllowed()) { - return getFullyQualifiedColumnName(column) + " = '' "; + return getFullyQualifiedColumnName(column) + " " + nullPredicate; } return "FALSE"; } @@ -158,7 +170,7 @@ private String toPredicate(Domain domain, Type type, HiveColumnHandle column) if (domain.isNullAllowed()) { return "TRUE"; } - return getFullyQualifiedColumnName(column) + " <> '' "; + return getFullyQualifiedColumnName(column) + " " + notNullPredicate; } List disjuncts = new ArrayList<>(); @@ -178,12 +190,17 @@ private String toPredicate(Domain domain, Type type, HiveColumnHandle column) } // If rangeConjuncts is null, then the range was ALL, which should already have been checked for checkState(!rangeConjuncts.isEmpty()); - disjuncts.add("(" + Joiner.on(" AND ").join(rangeConjuncts) + ")"); + if (rangeConjuncts.size() == 1) { + disjuncts.add("%s %s AND %s".formatted(getFullyQualifiedColumnName(column), notNullPredicate, getOnlyElement(rangeConjuncts))); + } + else { + disjuncts.add("(%s %s AND %s)".formatted(getFullyQualifiedColumnName(column), notNullPredicate, Joiner.on(" AND ").join(rangeConjuncts))); + } } // Add back all of the possible single values either as an equality or an IN predicate if (singleValues.size() == 1) { - disjuncts.add(toPredicate("=", getOnlyElement(singleValues), type, column)); + disjuncts.add("%s %s AND %s".formatted(getFullyQualifiedColumnName(column), notNullPredicate, toPredicate("=", getOnlyElement(singleValues), type, column))); } else if (singleValues.size() > 1) { List values = new ArrayList<>(); @@ -191,13 +208,17 @@ else if (singleValues.size() > 1) { checkType(type); values.add(valueToQuery(type, value)); } - disjuncts.add(createColumn(type, column) + " IN (" + Joiner.on(",").join(values) + ")"); + disjuncts.add("%s %s AND %s IN (%s)".formatted( + getFullyQualifiedColumnName(column), + notNullPredicate, + createColumn(type, column), + Joiner.on(",").join(values))); } // Add nullability disjuncts checkState(!disjuncts.isEmpty()); if (domain.isNullAllowed()) { - disjuncts.add(getFullyQualifiedColumnName(column) + " = '' "); + disjuncts.add(getFullyQualifiedColumnName(column) + " " + nullPredicate); } return "(" + Joiner.on(" OR ").join(disjuncts) + ")"; @@ -233,7 +254,8 @@ private static String valueToQuery(Type type, Object value) return String.valueOf((boolean) value); } if (type.equals(DATE)) { - return "`" + FORMATTER.print(DAYS.toMillis((long) value)) + "`"; + // CAST('2007-04-05T14:30Z' AS TIMESTAMP) + return "'" + FORMATTER.print(DAYS.toMillis((long) value)) + "'"; } if (type.equals(VarcharType.VARCHAR)) { return "'" + ((Slice) value).toStringUtf8() + "'"; @@ -252,22 +274,11 @@ private String createColumn(Type type, HiveColumnHandle columnHandle) String column = getFullyQualifiedColumnName(columnHandle); if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { - return formatPredicate(column, "INT"); + return "CAST(" + column + " AS INT)"; } if (type.equals(BOOLEAN)) { - return formatPredicate(column, "BOOL"); - } - if (type.equals(DATE)) { - return formatPredicate(column, "TIMESTAMP"); - } - if (type instanceof DecimalType decimalType) { - return formatPredicate(column, format("DECIMAL(%s,%s)", decimalType.getPrecision(), decimalType.getScale())); + return "CAST(" + column + " AS BOOL)"; } return column; } - - private String formatPredicate(String column, String type) - { - return format("case %s when '' then null else CAST(%s AS %s) end", column, column, type); - } } diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java index 2927414e34be..16560502030f 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/S3SelectRecordCursorProvider.java @@ -21,6 +21,7 @@ import io.trino.plugin.hive.HiveColumnHandle; import io.trino.plugin.hive.HiveRecordCursorProvider; import io.trino.plugin.hive.ReaderColumns; +import io.trino.plugin.hive.s3select.csv.S3SelectCsvRecordReader; import io.trino.plugin.hive.type.TypeInfo; import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorSession; @@ -106,7 +107,11 @@ public Optional createRecordCursor( if (s3SelectDataTypeOptional.isPresent()) { S3SelectDataType s3SelectDataType = s3SelectDataTypeOptional.get(); - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, s3SelectDataType); + Optional nullCharacterEncoding = Optional.empty(); + if (s3SelectDataType == S3SelectDataType.CSV) { + nullCharacterEncoding = S3SelectCsvRecordReader.nullCharacterEncoding(schema); + } + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, s3SelectDataType, nullCharacterEncoding); String ionSqlQuery = queryBuilder.buildSql(readerColumns, effectivePredicate); Optional recordReader = S3SelectLineRecordReaderProvider.get(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory, s3SelectDataType); diff --git a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java index 47d63095d82c..89dc6d3c9102 100644 --- a/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java +++ b/plugin/trino-hive/src/main/java/io/trino/plugin/hive/s3select/csv/S3SelectCsvRecordReader.java @@ -20,9 +20,11 @@ import com.amazonaws.services.s3.model.OutputSerialization; import io.trino.plugin.hive.s3select.S3SelectLineRecordReader; import io.trino.plugin.hive.s3select.TrinoS3ClientFactory; +import io.trino.plugin.hive.util.SerdeConstants; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import java.util.Optional; import java.util.Properties; import static io.trino.plugin.hive.util.SerdeConstants.ESCAPE_CHAR; @@ -104,4 +106,9 @@ public boolean shouldEnableScanRange() buildInputSerialization().getCsv().getAllowQuotedRecordDelimiter()); return CompressionType.NONE.equals(getCompressionType()) && !isQuotedRecordDelimiterAllowed; } + + public static Optional nullCharacterEncoding(Properties schema) + { + return Optional.ofNullable(schema.getProperty(SerdeConstants.SERIALIZATION_NULL_FORMAT)); + } } diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java index 2037805dd26f..3b711f498d21 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/BaseTestHiveOnDataLake.java @@ -29,8 +29,11 @@ import io.trino.spi.predicate.NullableValue; import io.trino.spi.predicate.TupleDomain; import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; import io.trino.testing.minio.MinioClient; +import io.trino.testing.sql.TestTable; +import org.intellij.lang.annotations.Language; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -64,6 +67,7 @@ import static java.util.stream.Collectors.joining; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; public abstract class BaseTestHiveOnDataLake extends AbstractTestQueryFramework @@ -1788,6 +1792,124 @@ public void testPartitionedTableExternalLocationOnTopOfTheBucket() assertUpdate("DROP TABLE " + tableName); } + @Test(dataProvider = "s3SelectFileFormats") + public void testS3SelectPushdown(String tableProperties) + { + Session usingAppendInserts = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") + .build(); + List values = ImmutableList.of( + "1, true, 11, 111, 1111, 11111, 'one', 1.1, DATE '2020-01-01'", + "2, true, 22, 222, 2222, 22222, 'two', 2.2, DATE '2020-02-02'", + "3, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL", + "4, false, 44, 444, 4444, 44444, 'four', 4.4, DATE '2020-04-04'"); + try (TestTable table = new TestTable( + sql -> getQueryRunner().execute(usingAppendInserts, sql), + "hive.%s.test_s3_select_pushdown".formatted(HIVE_TEST_SCHEMA), + "(id INT, bool_t BOOLEAN, tiny_t TINYINT, small_t SMALLINT, int_t INT, big_t BIGINT, string_t VARCHAR, decimal_t DECIMAL(10, 5), date_t DATE) " + + "WITH (" + tableProperties + ")", values)) { + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = true", "VALUES 1, 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = false", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t != 22", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t > 22", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t >= 22", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22 OR tiny_t = 44", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL OR tiny_t >= 22", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t != 222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t > 222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t >= 222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222 OR small_t = 444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL OR small_t >= 222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t != 2222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t > 2222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t >= 2222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222 OR int_t = 4444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL OR int_t >= 2222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t != 22222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t > 22222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t >= 22222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222 OR big_t = 44444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL OR big_t >= 22222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two'", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t != 'two'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t < 'two'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t <= 'two'", "VALUES 1, 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two' OR string_t = 'four'", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL OR string_t >= 'two'", "VALUES 2, 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t = 2.2", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t != 2.2", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t < 2.2", "VALUES 1"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t <= 2.2", "VALUES 1, 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t = 2.2 OR decimal_t = 4.4", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NULL OR decimal_t >= 2.2", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE decimal_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02'", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t != DATE '2020-02-02'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t > DATE '2020-02-02'", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t <= DATE '2020-02-02'", "VALUES 1, 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02' OR date_t = DATE '2020-04-04'", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL OR date_t >= DATE '2020-02-02'", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NOT NULL", "VALUES 1, 2, 4"); + } + } + + private void assertS3SelectQuery(@Language("SQL") String query, @Language("SQL") String expectedValues) + { + Session withS3SelectPushdown = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") + .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") + .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") + .build(); + + MaterializedResult expectedResult = computeActual(expectedValues); + assertQueryStats( + withS3SelectPushdown, + query, + statsWithPushdown -> { + long inputPositionsWithPushdown = statsWithPushdown.getPhysicalInputPositions(); + assertQueryStats( + getSession(), + query, + statsWithoutPushdown -> assertThat(statsWithoutPushdown.getPhysicalInputPositions()).isGreaterThan(inputPositionsWithPushdown), + results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); + }, + results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); + } + + @DataProvider + public static Object[][] s3SelectFileFormats() + { + return new Object[][] { + {"format = 'JSON'"}, + {"format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~'"} + }; + } + private void renamePartitionResourcesOutsideTrino(String tableName, String partitionColumn, String regionKey) { String partitionName = format("%s=%s", partitionColumn, regionKey); diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java new file mode 100644 index 000000000000..93fbbf725c20 --- /dev/null +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3/TestS3SelectQueries.java @@ -0,0 +1,188 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.s3; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.Session; +import io.trino.plugin.hive.HiveQueryRunner; +import io.trino.plugin.hive.NodeVersion; +import io.trino.plugin.hive.metastore.HiveMetastoreConfig; +import io.trino.plugin.hive.metastore.file.FileHiveMetastore; +import io.trino.plugin.hive.metastore.file.FileHiveMetastoreConfig; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.MaterializedResult; +import io.trino.testing.QueryRunner; +import io.trino.testing.sql.TestTable; +import org.intellij.lang.annotations.Language; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Parameters; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.List; + +import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static java.util.Objects.requireNonNull; +import static org.assertj.core.api.Assertions.assertThat; +import static org.testng.Assert.assertEquals; + +// The test requires AWS credentials be provided via one of the ways used by the DefaultAWSCredentialsProviderChain. +public class TestS3SelectQueries + extends AbstractTestQueryFramework +{ + private final String bucket; + private final String bucketEndpoint; + + @Parameters({"s3.bucket", "s3.bucket-endpoint"}) + public TestS3SelectQueries(String bucket, String bucketEndpoint) + { + this.bucket = requireNonNull(bucket, "bucket is null"); + this.bucketEndpoint = requireNonNull(bucketEndpoint, "bucketEndpoint is null"); + } + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + ImmutableMap.Builder hiveProperties = ImmutableMap.builder(); + hiveProperties.put("hive.s3.endpoint", bucketEndpoint); + hiveProperties.put("hive.non-managed-table-writes-enabled", "true"); + return HiveQueryRunner.builder() + .setHiveProperties(hiveProperties.buildOrThrow()) + .setInitialTables(ImmutableList.of()) + .setMetastore(queryRunner -> { + File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("hive_data").toFile(); + return new FileHiveMetastore( + new NodeVersion("testversion"), + HDFS_ENVIRONMENT, + new HiveMetastoreConfig().isHideDeltaLakeTables(), + new FileHiveMetastoreConfig() + .setCatalogDirectory(baseDir.toURI().toString()) + .setMetastoreUser("test") + .setDisableLocationChecks(true)); + }) + .build(); + } + + @Test(dataProvider = "s3SelectFileFormats") + public void testS3SelectPushdown(String tableProperties) + { + Session usingAppendInserts = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "insert_existing_partitions_behavior", "APPEND") + .build(); + List values = ImmutableList.of( + "1, true, 11, 111, 1111, 11111, 'one', DATE '2020-01-01'", + "2, true, 22, 222, 2222, 22222, 'two', DATE '2020-02-02'", + "3, NULL, NULL, NULL, NULL, NULL, NULL, NULL", + "4, false, 44, 444, 4444, 44444, '', DATE '2020-04-04'"); + try (TestTable table = new TestTable( + sql -> getQueryRunner().execute(usingAppendInserts, sql), + "hive.%s.test_s3_select_pushdown".formatted(HiveQueryRunner.TPCH_SCHEMA), + "(id INT, bool_t BOOLEAN, tiny_t TINYINT, small_t SMALLINT, int_t INT, big_t BIGINT, string_t VARCHAR, date_t DATE) " + + "WITH (external_location = 's3://" + bucket + "/test_s3_select_pushdown/test_table_" + randomNameSuffix() + "', " + tableProperties + ")", values)) { + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = true", "VALUES 1, 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t = false", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE bool_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t != 22", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t > 22", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t >= 22", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t = 22 OR tiny_t = 44", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL OR tiny_t >= 22", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE tiny_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t != 222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t > 222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t >= 222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t = 222 OR small_t = 444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL OR small_t >= 222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE small_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t != 2222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t > 2222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t >= 2222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t = 2222 OR int_t = 4444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL OR int_t >= 2222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE int_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t != 22222", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t > 22222", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t >= 22222", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t = 22222 OR big_t = 44444", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL OR big_t >= 22222", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE big_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two'", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t != 'two'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t < 'two'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t <= 'two'", "VALUES 1, 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t = 'two' OR string_t = ''", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL OR string_t >= 'two'", "VALUES 2, 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE string_t IS NOT NULL", "VALUES 1, 2, 4"); + + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02'", "VALUES 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t != DATE '2020-02-02'", "VALUES 1, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t > DATE '2020-02-02'", "VALUES 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t <= DATE '2020-02-02'", "VALUES 1, 2"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t = DATE '2020-02-02' OR date_t = DATE '2020-04-04'", "VALUES 2, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL OR date_t >= DATE '2020-02-02'", "VALUES 2, 3, 4"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NULL", "VALUES 3"); + assertS3SelectQuery("SELECT id FROM " + table.getName() + " WHERE date_t IS NOT NULL", "VALUES 1, 2, 4"); + } + } + + private void assertS3SelectQuery(@Language("SQL") String query, @Language("SQL") String expectedValues) + { + Session withS3SelectPushdown = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "s3_select_pushdown_enabled", "true") + .setCatalogSessionProperty("hive", "json_native_reader_enabled", "false") + .setCatalogSessionProperty("hive", "text_file_native_reader_enabled", "false") + .build(); + + MaterializedResult expectedResult = computeActual(expectedValues); + assertQueryStats( + withS3SelectPushdown, + query, + statsWithPushdown -> { + long inputPositionsWithPushdown = statsWithPushdown.getPhysicalInputPositions(); + assertQueryStats( + getSession(), + query, + statsWithoutPushdown -> assertThat(statsWithoutPushdown.getPhysicalInputPositions()).isGreaterThan(inputPositionsWithPushdown), + results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); + }, + results -> assertEquals(results.getOnlyColumnAsSet(), expectedResult.getOnlyColumnAsSet())); + } + + @DataProvider + public static Object[][] s3SelectFileFormats() + { + return new Object[][] { + {"format = 'JSON'"}, + {"format = 'TEXTFILE', textfile_field_separator=',', textfile_field_separator_escape='|', null_format='~'"} + }; + } +} diff --git a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java index 243a26470e1c..56d1223173f6 100644 --- a/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java +++ b/plugin/trino-hive/src/test/java/io/trino/plugin/hive/s3select/TestIonSqlQueryBuilder.java @@ -60,33 +60,32 @@ public void testBuildSQL() createBaseColumn("n_regionkey", 2, HIVE_INT, INTEGER, REGULAR, Optional.empty())); // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV); + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); assertEquals(queryBuilder.buildSql(columns, TupleDomain.all()), "SELECT s._1, s._2, s._3 FROM S3Object s"); TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of( columns.get(2), Domain.create(SortedRangeSet.copyOf(BIGINT, ImmutableList.of(Range.equal(BIGINT, 3L))), false))); assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s._1, s._2, s._3 FROM S3Object s WHERE (case s._3 when '' then null else CAST(s._3 AS INT) end = 3)"); + "SELECT s._1, s._2, s._3 FROM S3Object s WHERE (s._3 != '' AND CAST(s._3 AS INT) = 3)"); // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON); + queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); assertEquals(queryBuilder.buildSql(columns, TupleDomain.all()), "SELECT s.n_nationkey, s.n_name, s.n_regionkey FROM S3Object s"); assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s.n_nationkey, s.n_name, s.n_regionkey FROM S3Object s " + - "WHERE (case s.n_regionkey when '' then null else CAST(s.n_regionkey AS INT) end = 3)"); + "SELECT s.n_nationkey, s.n_name, s.n_regionkey FROM S3Object s WHERE (s.n_regionkey IS NOT NULL AND CAST(s.n_regionkey AS INT) = 3)"); } @Test public void testEmptyColumns() { // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV); + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); assertEquals(queryBuilder.buildSql(ImmutableList.of(), TupleDomain.all()), "SELECT ' ' FROM S3Object s"); // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON); + queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); assertEquals(queryBuilder.buildSql(ImmutableList.of(), TupleDomain.all()), "SELECT ' ' FROM S3Object s"); } @@ -106,18 +105,20 @@ public void testDecimalColumns() columns.get(2), Domain.create(ofRanges(Range.range(decimalType, shortDecimal("0.0"), true, shortDecimal("0.02"), true)), false))); // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.CSV); + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.CSV, Optional.empty()); assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s._1, s._2, s._3 FROM S3Object s WHERE ((case s._1 when '' then null else CAST(s._1 AS DECIMAL(20,0)) end < 50)) AND " + - "(case s._2 when '' then null else CAST(s._2 AS DECIMAL(20,2)) end = 0.05) AND ((case s._3 when '' then null else CAST(s._3 AS DECIMAL(10,2)) " + - "end >= 0.00 AND case s._3 when '' then null else CAST(s._3 AS DECIMAL(10,2)) end <= 0.02))"); + "SELECT s._1, s._2, s._3 FROM S3Object s WHERE " + + "(s._1 != '' AND s._1 < 50) AND " + + "(s._2 != '' AND s._2 = 0.05) AND " + + "((s._3 != '' AND s._3 >= 0.00 AND s._3 <= 0.02))"); // JSON - queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.JSON); + queryBuilder = new IonSqlQueryBuilder(typeManager, S3SelectDataType.JSON, Optional.empty()); assertEquals(queryBuilder.buildSql(columns, tupleDomain), - "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s WHERE ((case s.quantity when '' then null else CAST(s.quantity AS DECIMAL(20,0)) end < 50)) AND " + - "(case s.extendedprice when '' then null else CAST(s.extendedprice AS DECIMAL(20,2)) end = 0.05) AND ((case s.discount when '' then null else CAST(s.discount AS DECIMAL(10,2)) " + - "end >= 0.00 AND case s.discount when '' then null else CAST(s.discount AS DECIMAL(10,2)) end <= 0.02))"); + "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s WHERE " + + "(s.quantity IS NOT NULL AND s.quantity < 50) AND " + + "(s.extendedprice IS NOT NULL AND s.extendedprice = 0.05) AND " + + "((s.discount IS NOT NULL AND s.discount >= 0.00 AND s.discount <= 0.02))"); } @Test @@ -130,12 +131,12 @@ public void testDateColumn() columns.get(1), Domain.create(SortedRangeSet.copyOf(DATE, ImmutableList.of(Range.equal(DATE, (long) DateTimeUtils.parseDate("2001-08-22")))), false))); // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2 FROM S3Object s WHERE (case s._2 when '' then null else CAST(s._2 AS TIMESTAMP) end = `2001-08-22`)"); + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); + assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2 FROM S3Object s WHERE (s._2 != '' AND s._2 = '2001-08-22')"); // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.t1, s.t2 FROM S3Object s WHERE (case s.t2 when '' then null else CAST(s.t2 AS TIMESTAMP) end = `2001-08-22`)"); + queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); + assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.t1, s.t2 FROM S3Object s WHERE (s.t2 IS NOT NULL AND s.t2 = '2001-08-22')"); } @Test @@ -152,11 +153,11 @@ public void testNotPushDoublePredicates() columns.get(2), Domain.create(ofRanges(Range.range(DOUBLE, 0.0, true, 0.02, true)), false))); // CSV - IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2, s._3 FROM S3Object s WHERE ((case s._1 when '' then null else CAST(s._1 AS INT) end < 50))"); + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.CSV, Optional.empty()); + assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s._1, s._2, s._3 FROM S3Object s WHERE (s._1 != '' AND CAST(s._1 AS INT) < 50)"); // JSON - queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON); - assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s WHERE ((case s.quantity when '' then null else CAST(s.quantity AS INT) end < 50))"); + queryBuilder = new IonSqlQueryBuilder(TESTING_TYPE_MANAGER, S3SelectDataType.JSON, Optional.empty()); + assertEquals(queryBuilder.buildSql(columns, tupleDomain), "SELECT s.quantity, s.extendedprice, s.discount FROM S3Object s WHERE (s.quantity IS NOT NULL AND CAST(s.quantity AS INT) < 50)"); } } diff --git a/testing/trino-testing-containers/src/main/java/io/trino/testing/containers/Minio.java b/testing/trino-testing-containers/src/main/java/io/trino/testing/containers/Minio.java index d253aebba031..624d7c5f9ac4 100644 --- a/testing/trino-testing-containers/src/main/java/io/trino/testing/containers/Minio.java +++ b/testing/trino-testing-containers/src/main/java/io/trino/testing/containers/Minio.java @@ -41,7 +41,7 @@ public class Minio { private static final Logger log = Logger.get(Minio.class); - public static final String DEFAULT_IMAGE = "minio/minio:RELEASE.2022-10-05T14-58-27Z"; + public static final String DEFAULT_IMAGE = "minio/minio:RELEASE.2023-05-18T00-05-36Z"; public static final String DEFAULT_HOST_NAME = "minio"; public static final int MINIO_API_PORT = 4566;