Fix NullPointerException in Iceberg stats reader
Fixes

```
java.lang.NullPointerException
	at io.trino.plugin.iceberg.TableStatisticsMaker.updatePartitionedStats(TableStatisticsMaker.java:316)
	at io.trino.plugin.iceberg.TableStatisticsMaker.updateSummaryMin(TableStatisticsMaker.java:298)
	at io.trino.plugin.iceberg.TableStatisticsMaker.makeTableStatistics(TableStatisticsMaker.java:155)
	at io.trino.plugin.iceberg.TableStatisticsMaker.getTableStatistics(TableStatisticsMaker.java:73)
	at io.trino.plugin.iceberg.IcebergMetadata.getTableStatistics(IcebergMetadata.java:693)
```

However, further fixes in that code area are due.
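
The failure mode is visible in the trace: for a data file that carries no column metrics, the per-file bounds map handed to `updatePartitionedStats` is `null`, and the unconditional `newStats.get(id)` lookup throws. The commit guards that case (and initializes `Partition`'s stat maps to empty collections instead of `null` for the first file), so the merge simply skips files without metrics. Below is a minimal sketch of the guard in isolation; the class and method names are illustrative, not Trino's actual API.

```java
import java.util.HashMap;
import java.util.Map;

class StatsMergeSketch
{
    // Running minimum per column id, analogous to Partition#minValues.
    private final Map<Integer, Long> minValues = new HashMap<>();

    void updateMin(Map<Integer, Long> newStats)
    {
        // Guard corresponding to the fix: a data file without column metrics
        // yields newStats == null; previously newStats.get(id) threw the NPE above.
        if (newStats == null) {
            return;
        }
        for (Map.Entry<Integer, Long> entry : newStats.entrySet()) {
            minValues.merge(entry.getKey(), entry.getValue(), Math::min);
        }
    }

    public static void main(String[] args)
    {
        StatsMergeSketch sketch = new StatsMergeSketch();
        sketch.updateMin(Map.of(1, 10L, 2, 5L)); // file with metrics
        sketch.updateMin(null);                  // file without metrics: no longer fatal
        System.out.println(sketch.minValues);    // {1=10, 2=5}
    }
}
```

As the commit message notes, this only removes the immediate crash; whether files missing metrics should instead invalidate the collected statistics is left to the follow-up issue referenced in the diff.
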
findepi committed Oct 25, 2021
1 parent 79ab945 commit 9466aa1
Showing 3 changed files with 171 additions and 21 deletions.
Changed file 1 of 3:

@@ -24,6 +24,7 @@
import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -66,11 +67,14 @@ public Partition(
this.fileCount = 1;
this.size = size;
if (minValues == null || maxValues == null || nullCounts == null) {
this.minValues = null;
this.maxValues = null;
this.nullCounts = null;
this.columnSizes = null;
corruptedStats = null;
// This class initialization is asymmetric with respect to first file
// TODO (https://github.com/trinodb/trino/issues/9716) rethink stats collection process to ensure results are correct, and in particular do not depend on ordering
this.minValues = new HashMap<>();
this.maxValues = new HashMap<>();
this.nullCounts = new HashMap<>();
this.columnSizes = new HashMap<>();
this.corruptedStats = new HashSet<>();
this.hasValidColumnMetrics = false;
}
else {
this.minValues = new HashMap<>(minValues);
Changed file 2 of 3:

@@ -152,6 +152,7 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons
summary.incrementFileCount();
summary.incrementRecordCount(dataFile.recordCount());
summary.incrementSize(dataFile.fileSizeInBytes());
// TODO (https://github.com/trinodb/trino/issues/9716) for partition fields we should extract values with IcebergUtil#getPartitionKeys
updateSummaryMin(summary, partitionFields, convertBounds(idToTypeMapping, dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
updateSummaryMax(summary, partitionFields, convertBounds(idToTypeMapping, dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
summary.updateNullCount(dataFile.nullValueCounts());
@@ -309,8 +310,13 @@ private void updatePartitionedStats(
List<PartitionField> partitionFields,
Map<Integer, Object> current,
Map<Integer, Object> newStats,
// TODO (https://github.com/trinodb/trino/issues/9716) replace with something like a comparator, or comparator factory
Predicate<Integer> predicate)
{
if (newStats == null) {
// TODO (https://github.com/trinodb/trino/issues/9716) if some/many files miss statistics, we should probably invalidate statistics collection, see Partition#hasValidColumnMetrics
return;
}
for (PartitionField field : partitionFields) {
int id = field.sourceId();
if (summary.getCorruptedStats().contains(id)) {
@@ -319,6 +325,7 @@

Object newValue = newStats.get(id);
if (newValue == null) {
// TODO (https://github.com/trinodb/trino/issues/9716) if some/many files miss statistics, we should probably invalidate statistics collection, see Partition#hasValidColumnMetrics
continue;
}

Changed file 3 of 3:

@@ -81,6 +81,7 @@
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Iterables.getOnlyElement;
import static com.google.common.collect.MoreCollectors.onlyElement;
import static com.google.common.collect.MoreCollectors.toOptional;
import static io.trino.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE;
import static io.trino.plugin.hive.HdfsEnvironment.HdfsContext;
import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT;
@@ -525,6 +526,31 @@ private void testSelectOrPartitionedByTimestampWithTimeZone(boolean partitioned)
instant3Utc));
}

// show stats
assertThat(query("SHOW STATS FOR " + tableName))
.skippingTypesCheck()
.matches("VALUES " +
"('_timestamptz', NULL, NULL, " + (format == ORC ? "NULL, NULL, NULL, NULL" : "0e0, NULL, '2021-10-31 00:30:00.005 UTC', '2021-10-31 00:30:00.007 UTC'") + "), " +
"(NULL, NULL, NULL, NULL, 3e0, NULL, NULL)");

if (partitioned) {
// show stats with predicate
assertThat(query("SHOW STATS FOR (SELECT * FROM " + tableName + " WHERE _timestamptz = " + instant1La + ")"))
.skippingTypesCheck()
.matches("VALUES " +
// TODO (https://github.com/trinodb/trino/issues/9716) the min/max values are off by 1 millisecond
"('_timestamptz', NULL, NULL, " + (format == ORC ? "NULL, NULL, NULL, NULL" : "0e0, NULL, '2021-10-31 00:30:00.005 UTC', '2021-10-31 00:30:00.005 UTC'") + "), " +
"(NULL, NULL, NULL, NULL, 1e0, NULL, NULL)");
}
else {
// show stats with predicate
assertThat(query("SHOW STATS FOR (SELECT * FROM " + tableName + " WHERE _timestamptz = " + instant1La + ")"))
.skippingTypesCheck()
.matches("VALUES " +
"('_timestamptz', NULL, NULL, NULL, NULL, NULL, NULL), " +
"(NULL, NULL, NULL, NULL, NULL, NULL, NULL)");
}

assertUpdate("DROP TABLE " + tableName);
}

@@ -683,15 +709,63 @@ public void testCreatePartitionedTable()

// SHOW STATS
if (format == ORC) {
// TODO (https://github.com/trinodb/trino/issues/9714, https://github.com/trinodb/trino/issues/9716) SHOW STATS fails with NullPointerException
assertThatThrownBy(() -> query("SHOW STATS FOR test_partitioned_table"))
.hasToString("java.lang.RuntimeException: java.lang.NullPointerException")
.hasStackTraceContaining("at io.trino.plugin.iceberg.TableStatisticsMaker.updatePartitionedStats");
assertThat(query("SHOW STATS FOR test_partitioned_table"))
.projected(0, 2, 3, 4, 5, 6) // ignore data size which is varying for Parquet (and not available for ORC)
.skippingTypesCheck()
.satisfies(result -> {
// TODO https://github.com/trinodb/trino/issues/9716 stats results are non-deterministic
// once fixed, replace with assertThat(query(...)).matches(...)
MaterializedRow aSampleColumnStatsRow = result.getMaterializedRows().stream()
.filter(row -> "a_boolean".equals(row.getField(0)))
.collect(toOptional()).orElseThrow();
if (aSampleColumnStatsRow.getField(2) == null) {
assertEqualsIgnoreOrder(result, computeActual("VALUES " +
" ('a_boolean', NULL, NULL, NULL, NULL, NULL), " +
" ('an_integer', NULL, NULL, NULL, '1', '1'), " +
" ('a_bigint', NULL, NULL, NULL, '1', '1'), " +
" ('a_real', NULL, NULL, NULL, '1.0', '1.0'), " +
" ('a_double', NULL, NULL, NULL, '1.0', '1.0'), " +
" ('a_short_decimal', NULL, NULL, NULL, '1.0', '1.0'), " +
" ('a_long_decimal', NULL, NULL, NULL, '11.0', '11.0'), " +
" ('a_varchar', NULL, NULL, NULL, NULL, NULL), " +
" ('a_varbinary', NULL, NULL, NULL, NULL, NULL), " +
" ('a_date', NULL, NULL, NULL, '2021-07-24', '2021-07-24'), " +
" ('a_time', NULL, NULL, NULL, NULL, NULL), " +
" ('a_timestamp', NULL, NULL, NULL, NULL, NULL), " +
" ('a_timestamptz', NULL, NULL, NULL, NULL, NULL), " +
" ('a_uuid', NULL, NULL, NULL, NULL, NULL), " +
" ('a_row', NULL, NULL, NULL, NULL, NULL), " +
" ('an_array', NULL, NULL, NULL, NULL, NULL), " +
" ('a_map', NULL, NULL, NULL, NULL, NULL), " +
" (NULL, NULL, NULL, 2e0, NULL, NULL)"));
}
else {
assertEqualsIgnoreOrder(result, computeActual("VALUES " +
" ('a_boolean', NULL, 0e0, NULL, NULL, NULL), " +
" ('an_integer', NULL, 0e0, NULL, '1', '1'), " +
" ('a_bigint', NULL, 0e0, NULL, '1', '1'), " +
" ('a_real', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_double', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_short_decimal', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_long_decimal', NULL, 0e0, NULL, '11.0', '11.0'), " +
" ('a_varchar', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_varbinary', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_date', NULL, 0e0, NULL, '2021-07-24', '2021-07-24'), " +
" ('a_time', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_timestamp', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_timestamptz', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_uuid', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_row', NULL, 0e0, NULL, NULL, NULL), " +
" ('an_array', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_map', NULL, 0e0, NULL, NULL, NULL), " +
" (NULL, NULL, NULL, 2e0, NULL, NULL)"));
}
});
}
else {
assertThat(query("SHOW STATS FOR test_partitioned_table"))
.skippingTypesCheck()
.projected(0, 2, 3, 4, 5, 6) // ignore data size which is varying for Parquet (and not available for ORC)
.skippingTypesCheck()
.matches("VALUES " +
" ('a_boolean', NULL, 0.5e0, NULL, 'true', 'true'), " +
" ('an_integer', NULL, 0.5e0, NULL, '1', '1'), " +
@@ -2459,18 +2533,83 @@ public void testAllAvailableTypes()
.matches(nullValues);

// SHOW STATS
try {
// TODO (https://github.com/trinodb/trino/issues/9714, https://github.com/trinodb/trino/issues/9716) SHOW STATS may fail with NullPointerException, depending which file is processed first
// assertThat(query("SHOW STATS FOR test_all_types"))
// .skippingTypesCheck()
// .matches("....");

computeActual("SHOW STATS FOR test_all_types");
if (format == ORC) {
assertThat(query("SHOW STATS FOR test_all_types"))
.projected(0, 2, 3, 4, 5, 6) // ignore data size which is varying for Parquet (and not available for ORC)
.skippingTypesCheck()
.satisfies(result -> {
// TODO https://github.com/trinodb/trino/issues/9716 stats results are non-deterministic
// once fixed, replace with assertThat(query(...)).matches(...)
MaterializedRow aSampleColumnStatsRow = result.getMaterializedRows().stream()
.filter(row -> "a_boolean".equals(row.getField(0)))
.collect(toOptional()).orElseThrow();
if (aSampleColumnStatsRow.getField(2) == null) {
assertEqualsIgnoreOrder(result, computeActual("VALUES " +
" ('a_boolean', NULL, NULL, NULL, NULL, NULL), " +
" ('an_integer', NULL, NULL, NULL, NULL, NULL), " +
" ('a_bigint', NULL, NULL, NULL, NULL, NULL), " +
" ('a_real', NULL, NULL, NULL, NULL, NULL), " +
" ('a_double', NULL, NULL, NULL, NULL, NULL), " +
" ('a_short_decimal', NULL, NULL, NULL, NULL, NULL), " +
" ('a_long_decimal', NULL, NULL, NULL, NULL, NULL), " +
" ('a_varchar', NULL, NULL, NULL, NULL, NULL), " +
" ('a_varbinary', NULL, NULL, NULL, NULL, NULL), " +
" ('a_date', NULL, NULL, NULL, NULL, NULL), " +
" ('a_time', NULL, NULL, NULL, NULL, NULL), " +
" ('a_timestamp', NULL, NULL, NULL, NULL, NULL), " +
" ('a_timestamptz', NULL, NULL, NULL, NULL, NULL), " +
" ('a_uuid', NULL, NULL, NULL, NULL, NULL), " +
" ('a_row', NULL, NULL, NULL, NULL, NULL), " +
" ('an_array', NULL, NULL, NULL, NULL, NULL), " +
" ('a_map', NULL, NULL, NULL, NULL, NULL), " +
" (NULL, NULL, NULL, 2e0, NULL, NULL)"));
}
else {
assertEqualsIgnoreOrder(result, computeActual("VALUES " +
" ('a_boolean', NULL, 0e0, NULL, NULL, NULL), " +
" ('an_integer', NULL, 0e0, NULL, '1', '1'), " +
" ('a_bigint', NULL, 0e0, NULL, '1', '1'), " +
" ('a_real', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_double', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_short_decimal', NULL, 0e0, NULL, '1.0', '1.0'), " +
" ('a_long_decimal', NULL, 0e0, NULL, '11.0', '11.0'), " +
" ('a_varchar', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_varbinary', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_date', NULL, 0e0, NULL, '2021-07-24', '2021-07-24'), " +
" ('a_time', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_timestamp', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_timestamptz', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_uuid', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_row', NULL, 0e0, NULL, NULL, NULL), " +
" ('an_array', NULL, 0e0, NULL, NULL, NULL), " +
" ('a_map', NULL, 0e0, NULL, NULL, NULL), " +
" (NULL, NULL, NULL, 2e0, NULL, NULL)"));
}
});
}
catch (RuntimeException sometimesExpected) {
assertThat(sometimesExpected)
.hasToString("java.lang.RuntimeException: java.lang.NullPointerException")
.hasStackTraceContaining("at io.trino.plugin.iceberg.TableStatisticsMaker.makeTableStatistics");
else {
assertThat(query("SHOW STATS FOR test_all_types"))
.projected(0, 2, 3, 4, 5, 6) // ignore data size which is varying for Parquet (and not available for ORC)
.skippingTypesCheck()
.matches("VALUES " +
" ('a_boolean', NULL, 0.5e0, NULL, 'true', 'true'), " +
" ('an_integer', NULL, 0.5e0, NULL, '1', '1'), " +
" ('a_bigint', NULL, 0.5e0, NULL, '1', '1'), " +
" ('a_real', NULL, 0.5e0, NULL, '1.0', '1.0'), " +
" ('a_double', NULL, 0.5e0, NULL, '1.0', '1.0'), " +
" ('a_short_decimal', NULL, 0.5e0, NULL, '1.0', '1.0'), " +
" ('a_long_decimal', NULL, 0.5e0, NULL, '11.0', '11.0'), " +
" ('a_varchar', NULL, 0.5e0, NULL, NULL, NULL), " +
" ('a_varbinary', NULL, 0.5e0, NULL, NULL, NULL), " +
" ('a_date', NULL, 0.5e0, NULL, '2021-07-24', '2021-07-24'), " +
" ('a_time', NULL, 0.5e0, NULL, NULL, NULL), " +
" ('a_timestamp', NULL, 0.5e0, NULL, '2021-07-24 03:43:57.987654', '2021-07-24 03:43:57.987654'), " +
" ('a_timestamptz', NULL, 0.5e0, NULL, '2021-07-24 04:43:57.987 UTC', '2021-07-24 04:43:57.987 UTC'), " +
" ('a_uuid', NULL, 0.5e0, NULL, NULL, NULL), " +
" ('a_row', NULL, NULL, NULL, NULL, NULL), " +
" ('an_array', NULL, NULL, NULL, NULL, NULL), " +
" ('a_map', NULL, NULL, NULL, NULL, NULL), " +
" (NULL, NULL, NULL, 2e0, NULL, NULL)");
}

// $partitions