Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce margin for START_OF_MODERN_ERA in delta lake stats #15005

Merged
merged 1 commit into from
Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions plugin/trino-delta-lake/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,12 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-tpcds</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-tpch</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ public final class TransactionLogParser
private static final Logger log = Logger.get(TransactionLogParser.class);

// Before 1900, Java Time and Joda Time are not consistent with java.sql.Date and java.util.Calendar
// Since January 1, 1900 UTC is still December 31, 1899 in other zones, we are adding a 1 year margin.
public static final LocalDate START_OF_MODERN_ERA = LocalDate.of(1901, 1, 1);
// Since January 1, 1900 UTC is still December 31, 1899 in other zones, we are adding a 1 day margin.
public static final LocalDate START_OF_MODERN_ERA = LocalDate.of(1900, 1, 2);

public static final String LAST_CHECKPOINT_FILENAME = "_last_checkpoint";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import io.trino.Session;
import io.trino.plugin.hive.containers.HiveHadoop;
import io.trino.plugin.hive.containers.HiveMinioDataLake;
import io.trino.plugin.tpcds.TpcdsPlugin;
import io.trino.plugin.tpch.TpchPlugin;
import io.trino.testing.DistributedQueryRunner;
import io.trino.testing.QueryRunner;
Expand Down Expand Up @@ -99,6 +100,9 @@ public DistributedQueryRunner build()
queryRunner.installPlugin(new TpchPlugin());
queryRunner.createCatalog("tpch", "tpch");

queryRunner.installPlugin(new TpcdsPlugin());
queryRunner.createCatalog("tpcds", "tpcds");

queryRunner.installPlugin(new TestingDeltaLakePlugin());
Map<String, String> deltaProperties = new HashMap<>(this.deltaProperties.buildOrThrow());
if (!deltaProperties.containsKey("hive.metastore") && !deltaProperties.containsKey("hive.metastore.uri")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,27 @@ public void testDropStatsAccessControl()
}
}

/**
 * Checks the column statistics reported for a table populated from TPC-DS {@code date_dim}.
 * The {@code d_date} column contains dates as old as 1900-01-02, which may be problematic,
 * so the expected low and high values are asserted explicitly.
 */
@Test
public void testStatsOnTpcDsData()
{
    // Expected SHOW STATS output: one row for d_date followed by the table-level summary row
    String expectedStats = "VALUES"
            + "('d_date', null, 72713.0, 0.0, null, '1900-01-02', '2100-01-01'),"
            + "(null, null, null, null, 73049.0, null, null)";

    try (TestTable table = new TestTable(getQueryRunner()::execute, "test_old_date_stats", "AS SELECT d_date FROM tpcds.tiny.date_dim")) {
        String tableName = table.getName();
        runAnalyzeVerifySplitCount(tableName, 1);
        // Efficient query plans (e.g. for q72) depend on accurate column stats for d_date
        assertQuery("SHOW STATS FOR " + tableName, expectedStats);
    }
}

private void runAnalyzeVerifySplitCount(String tableName, long expectedSplitCount)
{
MaterializedResultWithQueryId analyzeResult = getDistributedQueryRunner().executeWithQueryId(getSession(), "ANALYZE " + tableName);
Expand Down