Skip to content

Commit

Permalink
ORC-1482: Adaptation to read ORC files created by CUDF
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR adapts the reader to handle ORC files created by CUDF, which may be missing statistics for their DOUBLE/FLOAT columns.

### Why are the changes needed?

Official ORC readers can't read CUDF-created ORC files properly.

### How was this patch tested?

Added a unit test.

Closes apache#1598 from guiyanakuang/ORC-1482-to-1.8.

Authored-by: Yiqun Zhang <guiyanakuang@gmail.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
guiyanakuang authored and dongjoon-hyun committed Aug 25, 2023
1 parent ed0eb30 commit 012403e
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 2 deletions.
12 changes: 10 additions & 2 deletions java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -695,8 +695,8 @@ static TruthValue evaluatePredicateProto(OrcProto.ColumnStatistics statsProto,
" include ORC-517. Writer version: {}",
predicate.getColumnName(), writerVersion);
return TruthValue.YES_NO_NULL;
} else if (category == TypeDescription.Category.DOUBLE
|| category == TypeDescription.Category.FLOAT) {
} else if ((category == TypeDescription.Category.DOUBLE ||
category == TypeDescription.Category.FLOAT) && cs instanceof DoubleColumnStatistics) {
DoubleColumnStatistics dstas = (DoubleColumnStatistics) cs;
if (Double.isNaN(dstas.getSum())) {
LOG.debug("Not using predication pushdown on {} because stats contain NaN values",
Expand Down Expand Up @@ -1708,4 +1708,12 @@ public CompressionCodec getCompressionCodec() {
/**
 * Get the maximum disk range chunk limit configured for this record reader.
 *
 * @return the value of maxDiskRangeChunkLimit
 */
public int getMaxDiskRangeChunkLimit() {
  return maxDiskRangeChunkLimit;
}

/**
 * Expose the {@link SargApplier} used by this record reader so that tests can
 * inspect predicate-pushdown behavior (e.g. its exception counts).
 * Package-private: intended for testing only, not part of the public API.
 *
 * @return the sargApplier held by this record reader
 */
SargApplier getSargApp() {
  return sargApp;
}
}
42 changes: 42 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.DataReader;
import org.apache.orc.DoubleColumnStatistics;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
Expand Down Expand Up @@ -2474,6 +2475,47 @@ public void testWithoutStatistics() {
assertEquals(TruthValue.YES_NO_NULL, truthValue);
}

@Test
public void testDoubleColumnWithoutDoubleStatistics() throws Exception {
  // orc-file-no-double-statistic.orc is an ORC file created by CUDF with a schema of
  // struct<x:double>, one row and a value of null.
  // Test file source: https://issues.apache.org/jira/projects/ORC/issues/ORC-1482
  //
  // Use URL.toURI() rather than URL.getPath(): getPath() keeps URL-encoding
  // (e.g. a space in the build directory becomes "%20"), which would make the
  // file unopenable. Path(URI) decodes it correctly.
  Path filePath = new Path(
      ClassLoader.getSystemResource("orc-file-no-double-statistic.orc").toURI());

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  Reader reader = OrcFile.createReader(filePath,
      OrcFile.readerOptions(conf).filesystem(fs));

  TypeDescription schema = TypeDescription.fromString("struct<x:double>");

  assertEquals(schema, reader.getSchema());
  // The CUDF writer omitted the DOUBLE statistics, so the column statistics
  // must NOT be a DoubleColumnStatistics instance.
  assertFalse(reader.getStatistics()[0] instanceof DoubleColumnStatistics);

  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .isNull("x", PredicateLeaf.Type.FLOAT)
      .build();

  Reader.Options options = reader.options()
      .searchArgument(sarg, new String[] {"x"})
      .useSelected(true)
      .allowSARGToFilter(true);

  // Reading with a SARG over the statistics-less double column must succeed:
  // exactly one batch containing the single null row, with no SARG evaluation
  // exceptions recorded.
  VectorizedRowBatch batch = schema.createRowBatch();
  long rowCount = 0;
  try (RecordReader rr = reader.rows(options)) {
    assertTrue(rr.nextBatch(batch));
    rowCount += batch.size;
    assertFalse(rr.nextBatch(batch));
    if (rr instanceof RecordReaderImpl) {
      assertEquals(0, ((RecordReaderImpl) rr).getSargApp().getExceptionCount()[0]);
    }
  }
  assertEquals(1, rowCount);
}

@Test
public void testMissMinOrMaxInStatistics() {
OrcProto.ColumnEncoding encoding = OrcProto.ColumnEncoding.newBuilder()
Expand Down
Binary file not shown.

0 comments on commit 012403e

Please sign in to comment.