Skip to content

Commit

Permalink
fix: Fix readback of parquet files with missing dictionary page offsets (#5765)
Browse files Browse the repository at this point in the history

This fixes #5764
  • Loading branch information
abaranec authored Jul 12, 2024
1 parent 4fdad15 commit ae966a8
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -176,21 +176,14 @@ public Function<SeekableChannelContext, Dictionary> getDictionarySupplier() {

@NotNull
private Dictionary getDictionary(final SeekableChannelContext channelContext) {
    final ColumnMetaData chunkMeta = columnChunk.getMeta_data();
    // If the dictionary page offset is set, use it. Otherwise fall back to the data page offset:
    // some writers omit dictionary_page_offset even when the first data page is actually the
    // dictionary page, so readDictionary must inspect the page found there. Fallback inspired by
    // https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding
    final long dictionaryPageOffset = chunkMeta.isSetDictionary_page_offset()
            ? chunkMeta.getDictionary_page_offset()
            : chunkMeta.getData_page_offset();

    return readDictionary(dictionaryPageOffset, channelContext);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import io.deephaven.parquet.table.layout.ParquetKeyValuePartitionedLayout;
import io.deephaven.stringset.HashStringSet;
import io.deephaven.stringset.StringSet;
import io.deephaven.time.DateTimeUtils;
import io.deephaven.vector.*;
import junit.framework.TestCase;
import org.junit.*;
Expand All @@ -28,6 +29,7 @@
import java.io.IOException;
import java.lang.reflect.Proxy;
import java.nio.file.Files;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
Expand All @@ -39,7 +41,9 @@
import static io.deephaven.engine.testutil.TstUtils.assertTableEquals;
import static io.deephaven.engine.testutil.TstUtils.tableRangesAreEqual;
import static io.deephaven.engine.util.TableTools.*;
import static org.junit.Assert.*;
import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

/**
* Tests for {@link ParquetTools}.
Expand Down Expand Up @@ -527,6 +531,8 @@ private void testWriteRead(Table source, Function<Table, Table> transform) {
assertTableEquals(transform.apply(source), transform.apply(readBack));
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static DoubleVector generateDoubles(int howMany) {
final double[] yarr = new double[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -535,6 +541,8 @@ public static DoubleVector generateDoubles(int howMany) {
return new DoubleVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static FloatVector generateFloats(int howMany) {
final float[] yarr = new float[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -543,11 +551,33 @@ public static FloatVector generateFloats(int howMany) {
return new FloatVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static ObjectVector<String> makeSillyStringArray(int howMany) {
    // Produce {@code howMany} zero-padded four-digit strings: "0000", "0001", ...
    final String[] values = new String[howMany];
    int index = 0;
    while (index < howMany) {
        values[index] = String.format("%04d", index);
        ++index;
    }
    return new ObjectVectorDirect<>(values);
}

/**
 * This test checks that we can read old parquet files that don't properly have dictionary page offset set, and also
 * that we can read int96 columns with null values.
 */
@Test
public void testNoDictionaryOffset() {
    // The "dirty" file lacks dictionary page offsets; the "clean" file is the same data written correctly.
    final String dirtyPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395.parquet").getFile();
    final String cleanPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395_clean.parquet").getFile();
    final Table withNullsAndMissingOffsets = ParquetTools.readTable(dirtyPath);
    final Table clean = ParquetTools.readTable(cleanPath);

    // Dictionary-encoded string column must read back despite the missing dictionary page offset.
    assertEquals("D2442F78-AEFB-49F7-85A0-00506BBE74D4",
            withNullsAndMissingOffsets.getColumnSource("LISTID").get(0));
    // int96 columns containing nulls must read back as null, not garbage.
    assertNull(withNullsAndMissingOffsets.getColumnSource("TIMESTAMP").get(0));
    assertNull(withNullsAndMissingOffsets.getColumnSource("SETTLEMENT_DATE").get(0));
    assertEquals(1698672050703000000L,
            DateTimeUtils.epochNanos((Instant) withNullsAndMissingOffsets.getColumnSource("CREATE_DATE").get(0)));
    // Finally, the whole table must match the correctly-written reference file.
    assertTableEquals(withNullsAndMissingOffsets, clean);
}
}
Git LFS file not shown
Git LFS file not shown

0 comments on commit ae966a8

Please sign in to comment.