diff --git a/plugin/trino-delta-lake/pom.xml b/plugin/trino-delta-lake/pom.xml index a3325091311b..632020452934 100644 --- a/plugin/trino-delta-lake/pom.xml +++ b/plugin/trino-delta-lake/pom.xml @@ -303,6 +303,12 @@ test + + io.trino + trino-tpcds + test + + io.trino trino-tpch diff --git a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java index 82a7bd23001b..03de2ce67019 100644 --- a/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java +++ b/plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/TransactionLogParser.java @@ -86,8 +86,8 @@ public final class TransactionLogParser private static final Logger log = Logger.get(TransactionLogParser.class); // Before 1900, Java Time and Joda Time are not consistent with java.sql.Date and java.util.Calendar - // Since January 1, 1900 UTC is still December 31, 1899 in other zones, we are adding a 1 year margin. - public static final LocalDate START_OF_MODERN_ERA = LocalDate.of(1901, 1, 1); + // Since January 1, 1900 UTC is still December 31, 1899 in other zones, we are adding a 1 day margin. 
+ public static final LocalDate START_OF_MODERN_ERA = LocalDate.of(1900, 1, 2); public static final String LAST_CHECKPOINT_FILENAME = "_last_checkpoint"; diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/DeltaLakeQueryRunner.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/DeltaLakeQueryRunner.java index ba985cdc6e6f..509543dc6f61 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/DeltaLakeQueryRunner.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/DeltaLakeQueryRunner.java @@ -19,6 +19,7 @@ import io.trino.Session; import io.trino.plugin.hive.containers.HiveHadoop; import io.trino.plugin.hive.containers.HiveMinioDataLake; +import io.trino.plugin.tpcds.TpcdsPlugin; import io.trino.plugin.tpch.TpchPlugin; import io.trino.testing.DistributedQueryRunner; import io.trino.testing.QueryRunner; @@ -99,6 +100,9 @@ public DistributedQueryRunner build() queryRunner.installPlugin(new TpchPlugin()); queryRunner.createCatalog("tpch", "tpch"); + queryRunner.installPlugin(new TpcdsPlugin()); + queryRunner.createCatalog("tpcds", "tpcds"); + queryRunner.installPlugin(new TestingDeltaLakePlugin()); Map deltaProperties = new HashMap<>(this.deltaProperties.buildOrThrow()); if (!deltaProperties.containsKey("hive.metastore") && !deltaProperties.containsKey("hive.metastore.uri")) { diff --git a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAnalyze.java b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAnalyze.java index 409a8d7b5498..d83b600209cd 100644 --- a/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAnalyze.java +++ b/plugin/trino-delta-lake/src/test/java/io/trino/plugin/deltalake/TestDeltaLakeAnalyze.java @@ -458,6 +458,27 @@ public void testDropStatsAccessControl() } } + /** + * Verify that ANALYZE produces accurate column statistics for TPC-DS data. 
Note that the d_date column of + * date_dim goes back to 1900-01-02, and the test expects that exact value as the reported minimum. + */ + @Test + public void testStatsOnTpcDsData() + { + try (TestTable table = new TestTable( + getQueryRunner()::execute, + "test_old_date_stats", + "AS SELECT d_date FROM tpcds.tiny.date_dim")) { + runAnalyzeVerifySplitCount(table.getName(), 1); + // Accurate column stats on d_date are important for producing efficient query plans, e.g. on q72 + assertQuery( + "SHOW STATS FOR " + table.getName(), + "VALUES" + + "('d_date', null, 72713.0, 0.0, null, '1900-01-02', '2100-01-01')," + + "(null, null, null, null, 73049.0, null, null)"); + } + } + private void runAnalyzeVerifySplitCount(String tableName, long expectedSplitCount) { MaterializedResultWithQueryId analyzeResult = getDistributedQueryRunner().executeWithQueryId(getSession(), "ANALYZE " + tableName);