Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizations in parquet file page materialization #5582

Merged
merged 8 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ static PageMaterializerFactory factoryForType(@NotNull final PrimitiveType primi
// The column will store nanoseconds elapsed since epoch as long values
switch (timestampLogicalType.getUnit()) {
case MILLIS:
return TimestampNanosFromMillisMaterializer.Factory;
return InstantNanosFromMillisMaterializer.Factory;
case MICROS:
return TimestampNanosFromMicrosMaterializer.Factory;
return InstantNanosFromMicrosMaterializer.Factory;
case NANOS:
return LongMaterializer.Factory;
}
Expand All @@ -68,11 +68,11 @@ static PageMaterializerFactory factoryForType(@NotNull final PrimitiveType primi
// Ref:https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#local-semantics-timestamps-not-normalized-to-utc
switch (timestampLogicalType.getUnit()) {
case MILLIS:
return LocalDateTimeMaterializer.FromMillisFactory;
return LocalDateTimeFromMillisMaterializer.Factory;
case MICROS:
return LocalDateTimeMaterializer.FromMicrosFactory;
return LocalDateTimeFromMicrosMaterializer.Factory;
case NANOS:
return LocalDateTimeMaterializer.FromNanosFactory;
return LocalDateTimeFromNanosMaterializer.Factory;
}
}
} else if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.TimeLogicalTypeAnnotation) {
Expand Down Expand Up @@ -101,7 +101,7 @@ static PageMaterializerFactory factoryForType(@NotNull final PrimitiveType primi
if (logicalTypeAnnotation instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
return StringMaterializer.Factory;
}
case FIXED_LEN_BYTE_ARRAY:
case FIXED_LEN_BYTE_ARRAY: // fall through
return BlobMaterializer.Factory;
default:
throw new RuntimeException("Unexpected type name:" + primitiveTypeName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int num
};

/**
* {@link PageMaterializer} implementation for {@link Instant}s stored as Int96s representing an Impala format
* Timestamp (nanoseconds of day and Julian date encoded as 8 bytes and 4 bytes, respectively)
* {@link PageMaterializer} implementation for {@link Instant Instants} stored as Int96s representing an Impala
* format Timestamp (nanoseconds of day and Julian date encoded as 8 bytes and 4 bytes, respectively)
*/
private static final class InstantFromInt96PageMaterializer extends LongPageMaterializerBase
implements PageMaterializer {
Expand Down Expand Up @@ -68,11 +68,17 @@ private InstantFromInt96PageMaterializer(ValuesReader dataReader, long nullValue
*/
private static void setReferenceTimeZone(@NotNull final String timeZone) {
offset = DateTimeUtils.nanosOfDay(DateTimeUtils.parseInstant("1970-01-01T00:00:00 " + timeZone),
ZoneId.of("UTC"));
ZoneId.of("UTC"), false);
}

@Override
long readLong() {
public void fillValues(int startIndex, int endIndex) {
for (int ii = startIndex; ii < endIndex; ii++) {
data[ii] = readInstantNanos();
}
}

long readInstantNanos() {
final ByteBuffer resultBuffer = ByteBuffer.wrap(dataReader.readBytes().getBytesUnsafe());
resultBuffer.order(java.nio.ByteOrder.LITTLE_ENDIAN);
final long nanos = resultBuffer.getLong();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,39 @@
import io.deephaven.time.DateTimeUtils;
import org.apache.parquet.column.values.ValuesReader;

public class TimestampNanosFromMicrosMaterializer {
public class InstantNanosFromMicrosMaterializer {

public static final PageMaterializerFactory Factory = new PageMaterializerFactory() {
@Override
public PageMaterializer makeMaterializerWithNulls(ValuesReader dataReader, Object nullValue, int numValues) {
return new TimestampNanosFromMicrosPageMaterializer(dataReader, (long) nullValue, numValues);
return new InstantNanosFromMicrosPageMaterializer(dataReader, (long) nullValue, numValues);
}

@Override
public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int numValues) {
return new TimestampNanosFromMicrosPageMaterializer(dataReader, numValues);
return new InstantNanosFromMicrosPageMaterializer(dataReader, numValues);
}
};

private static final class TimestampNanosFromMicrosPageMaterializer extends LongPageMaterializerBase
private static final class InstantNanosFromMicrosPageMaterializer extends LongPageMaterializerBase
implements PageMaterializer {

final ValuesReader dataReader;

private TimestampNanosFromMicrosPageMaterializer(ValuesReader dataReader, int numValues) {
private InstantNanosFromMicrosPageMaterializer(ValuesReader dataReader, int numValues) {
this(dataReader, 0, numValues);
}

private TimestampNanosFromMicrosPageMaterializer(ValuesReader dataReader, long nullValue, int numValues) {
private InstantNanosFromMicrosPageMaterializer(ValuesReader dataReader, long nullValue, int numValues) {
super(nullValue, numValues);
this.dataReader = dataReader;
}

@Override
long readLong() {
return DateTimeUtils.microsToNanos(dataReader.readLong());
public void fillValues(int startIndex, int endIndex) {
for (int ii = startIndex; ii < endIndex; ii++) {
data[ii] = DateTimeUtils.microsToNanos(dataReader.readLong());
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
// ****** AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY
// ****** Edit TimestampNanosFromMicrosMaterializer and run "./gradlew replicatePageMaterializers" to regenerate
// ****** Edit InstantNanosFromMicrosMaterializer and run "./gradlew replicatePageMaterializers" to regenerate
//
// @formatter:off
package io.deephaven.parquet.base.materializers;
Expand All @@ -12,37 +12,39 @@
import io.deephaven.time.DateTimeUtils;
import org.apache.parquet.column.values.ValuesReader;

public class TimestampNanosFromMillisMaterializer {
public class InstantNanosFromMillisMaterializer {

public static final PageMaterializerFactory Factory = new PageMaterializerFactory() {
@Override
public PageMaterializer makeMaterializerWithNulls(ValuesReader dataReader, Object nullValue, int numValues) {
return new TimestampNanosFromMillisPageMaterializer(dataReader, (long) nullValue, numValues);
return new InstantNanosFromMillisPageMaterializer(dataReader, (long) nullValue, numValues);
}

@Override
public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int numValues) {
return new TimestampNanosFromMillisPageMaterializer(dataReader, numValues);
return new InstantNanosFromMillisPageMaterializer(dataReader, numValues);
}
};

private static final class TimestampNanosFromMillisPageMaterializer extends LongPageMaterializerBase
private static final class InstantNanosFromMillisPageMaterializer extends LongPageMaterializerBase
implements PageMaterializer {

final ValuesReader dataReader;

private TimestampNanosFromMillisPageMaterializer(ValuesReader dataReader, int numValues) {
private InstantNanosFromMillisPageMaterializer(ValuesReader dataReader, int numValues) {
this(dataReader, 0, numValues);
}

private TimestampNanosFromMillisPageMaterializer(ValuesReader dataReader, long nullValue, int numValues) {
private InstantNanosFromMillisPageMaterializer(ValuesReader dataReader, long nullValue, int numValues) {
super(nullValue, numValues);
this.dataReader = dataReader;
}

@Override
long readLong() {
return DateTimeUtils.millisToNanos(dataReader.readLong());
public void fillValues(int startIndex, int endIndex) {
for (int ii = startIndex; ii < endIndex; ii++) {
data[ii] = DateTimeUtils.millisToNanos(dataReader.readLong());
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
package io.deephaven.parquet.base.materializers;

import io.deephaven.parquet.base.PageMaterializer;
import io.deephaven.parquet.base.PageMaterializerFactory;
import io.deephaven.parquet.base.ParquetTimeUtils;
import org.apache.parquet.column.values.ValuesReader;

import java.time.LocalDateTime;

/**
 * Holder for the {@link PageMaterializerFactory} whose materializers fill a page with {@link LocalDateTime} values.
 * Each value is produced by reading one {@code long} from the page's {@link ValuesReader} and passing it to
 * {@code ParquetTimeUtils.epochMicrosToLocalDateTimeUTC}.
 */
public class LocalDateTimeFromMicrosMaterializer {

    /**
     * Factory creating the page materializers defined in this class, for both the with-nulls and non-null paths.
     */
    public static final PageMaterializerFactory Factory = new PageMaterializerFactory() {
        @Override
        public PageMaterializer makeMaterializerWithNulls(ValuesReader dataReader, Object nullValue, int numValues) {
            // The factory interface hands the null value over untyped; this materializer needs a LocalDateTime.
            final LocalDateTime typedNullValue = (LocalDateTime) nullValue;
            return new LocalDateTimeFromMicrosPageMaterializer(dataReader, typedNullValue, numValues);
        }

        @Override
        public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int numValues) {
            return new LocalDateTimeFromMicrosPageMaterializer(dataReader, numValues);
        }
    };

    /**
     * {@link PageMaterializer} that converts the longs supplied by a {@link ValuesReader} into {@link LocalDateTime}
     * values, one per filled slot.
     */
    private static final class LocalDateTimeFromMicrosPageMaterializer extends LocalDateTimePageMaterializerBase
            implements PageMaterializer {

        /** Source of the raw long values for the page being materialized. */
        final ValuesReader dataReader;

        /**
         * Creates a materializer for the non-null path; {@code null} is forwarded as the null value.
         */
        private LocalDateTimeFromMicrosPageMaterializer(ValuesReader dataReader, int numValues) {
            this(dataReader, null, numValues);
        }

        /**
         * Creates a materializer, forwarding {@code nullValue} and {@code numValues} to the base class and keeping
         * {@code dataReader} for later reads.
         */
        private LocalDateTimeFromMicrosPageMaterializer(ValuesReader dataReader, LocalDateTime nullValue,
                int numValues) {
            super(nullValue, numValues);
            this.dataReader = dataReader;
        }

        /**
         * Fills slots {@code startIndex} (inclusive) through {@code endIndex} (exclusive) of {@code data}, consuming
         * exactly one long from the reader per slot, in ascending slot order.
         */
        @Override
        public void fillValues(int startIndex, int endIndex) {
            for (int slot = startIndex; slot < endIndex; ++slot) {
                final long epochMicros = dataReader.readLong();
                data[slot] = ParquetTimeUtils.epochMicrosToLocalDateTimeUTC(epochMicros);
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
// ****** AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY
// ****** Edit LocalDateTimeFromMicrosMaterializer and run "./gradlew replicatePageMaterializers" to regenerate
//
// @formatter:off
package io.deephaven.parquet.base.materializers;

import io.deephaven.parquet.base.PageMaterializer;
import io.deephaven.parquet.base.PageMaterializerFactory;
import io.deephaven.parquet.base.ParquetTimeUtils;
import org.apache.parquet.column.values.ValuesReader;

import java.time.LocalDateTime;

/**
 * Holder for the {@link PageMaterializerFactory} whose materializers fill a page with {@link LocalDateTime} values.
 * Each value is produced by reading one {@code long} from the page's {@link ValuesReader} and passing it to
 * {@code ParquetTimeUtils.epochMillisToLocalDateTimeUTC}.
 * <p>
 * NOTE(review): the file header marks this class as generated from LocalDateTimeFromMicrosMaterializer; comments
 * added here would need to be mirrored in that template to survive regeneration.
 */
public class LocalDateTimeFromMillisMaterializer {

/**
 * Factory creating the page materializers defined in this class, for both the with-nulls and non-null paths.
 */
public static final PageMaterializerFactory Factory = new PageMaterializerFactory() {
@Override
public PageMaterializer makeMaterializerWithNulls(ValuesReader dataReader, Object nullValue, int numValues) {
// The null value arrives untyped from the factory interface and is cast to LocalDateTime here.
return new LocalDateTimeFromMillisPageMaterializer(dataReader, (LocalDateTime) nullValue, numValues);
}

@Override
public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int numValues) {
return new LocalDateTimeFromMillisPageMaterializer(dataReader, numValues);
}
};

/**
 * {@link PageMaterializer} that converts the longs supplied by a {@link ValuesReader} into {@link LocalDateTime}
 * values, one per filled slot.
 */
private static final class LocalDateTimeFromMillisPageMaterializer extends LocalDateTimePageMaterializerBase
implements PageMaterializer {

/** Source of the raw long values for the page being materialized. */
final ValuesReader dataReader;

/**
 * Creates a materializer for the non-null path; {@code null} is forwarded as the null value.
 */
private LocalDateTimeFromMillisPageMaterializer(ValuesReader dataReader, int numValues) {
this(dataReader, null, numValues);
}

/**
 * Creates a materializer, forwarding {@code nullValue} and {@code numValues} to the base class and keeping
 * {@code dataReader} for later reads.
 */
private LocalDateTimeFromMillisPageMaterializer(ValuesReader dataReader, LocalDateTime nullValue,
int numValues) {
super(nullValue, numValues);
this.dataReader = dataReader;
}

/**
 * Fills slots {@code startIndex} (inclusive) through {@code endIndex} (exclusive) of {@code data}, consuming
 * exactly one long from the reader per slot, in ascending slot order.
 */
@Override
public void fillValues(int startIndex, int endIndex) {
for (int ii = startIndex; ii < endIndex; ii++) {
// data is not declared in this class; presumably it is the output array owned by the base class.
data[ii] = ParquetTimeUtils.epochMillisToLocalDateTimeUTC(dataReader.readLong());
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
// ****** AUTO-GENERATED CLASS - DO NOT EDIT MANUALLY
// ****** Edit LocalDateTimeFromMicrosMaterializer and run "./gradlew replicatePageMaterializers" to regenerate
malhotrashivam marked this conversation as resolved.
Show resolved Hide resolved
//
// @formatter:off
package io.deephaven.parquet.base.materializers;

import io.deephaven.parquet.base.PageMaterializer;
import io.deephaven.parquet.base.PageMaterializerFactory;
import io.deephaven.parquet.base.ParquetTimeUtils;
import org.apache.parquet.column.values.ValuesReader;

import java.time.LocalDateTime;

public class LocalDateTimeFromNanosMaterializer {

public static final PageMaterializerFactory Factory = new PageMaterializerFactory() {
@Override
public PageMaterializer makeMaterializerWithNulls(ValuesReader dataReader, Object nullValue, int numValues) {
return new LocalDateTimeFromNanosPageMaterializer(dataReader, (LocalDateTime) nullValue, numValues);
}

@Override
public PageMaterializer makeMaterializerNonNull(ValuesReader dataReader, int numValues) {
return new LocalDateTimeFromNanosPageMaterializer(dataReader, numValues);
}
};

private static final class LocalDateTimeFromNanosPageMaterializer extends LocalDateTimePageMaterializerBase
malhotrashivam marked this conversation as resolved.
Show resolved Hide resolved
implements PageMaterializer {

final ValuesReader dataReader;

private LocalDateTimeFromNanosPageMaterializer(ValuesReader dataReader, int numValues) {
this(dataReader, null, numValues);
}

private LocalDateTimeFromNanosPageMaterializer(ValuesReader dataReader, LocalDateTime nullValue,
int numValues) {
super(nullValue, numValues);
this.dataReader = dataReader;
}

@Override
public void fillValues(int startIndex, int endIndex) {
for (int ii = startIndex; ii < endIndex; ii++) {
data[ii] = ParquetTimeUtils.epochNanosToLocalDateTimeUTC(dataReader.readLong());
}
}
}
}
Loading
Loading