diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml
index 3bd3a593eb..9f7479a6c1 100644
--- a/parquet-column/pom.xml
+++ b/parquet-column/pom.xml
@@ -58,6 +58,11 @@
<artifactId>fastutil</artifactId>
<version>${fastutil.version}</version>
</dependency>
+ <dependency>
+ <groupId>net.openhft</groupId>
+ <artifactId>zero-allocation-hashing</artifactId>
+ <version>${net.openhft.version}</version>
+ </dependency>
<dependency>
<groupId>com.carrotsearch</groupId>
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java
index 4595723cf2..d9238de27f 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java
@@ -18,20 +18,24 @@
*/
package org.apache.parquet.column;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
-
-import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt;
-
-import java.util.Objects;
-
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.impl.ColumnWriteStoreV2;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
import org.apache.parquet.column.values.factory.DefaultValuesWriterFactory;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
@@ -56,6 +60,7 @@ public class ParquetProperties {
public static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64;
public static final int DEFAULT_STATISTICS_TRUNCATE_LENGTH = Integer.MAX_VALUE;
public static final int DEFAULT_PAGE_ROW_COUNT_LIMIT = 20_000;
+ public static final int DEFAULT_MAX_BLOOM_FILTER_BYTES = 1024 * 1024;
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
@@ -96,6 +101,11 @@ public static WriterVersion fromString(String name) {
private final ValuesWriterFactory valuesWriterFactory;
private final int columnIndexTruncateLength;
private final int statisticsTruncateLength;
+
+ // Each entry maps a column name to its expected number of distinct values in a row group.
+ private final Map<String, Long> bloomFilterExpectedDistinctNumbers;
+ private final int maxBloomFilterBytes;
+ private final List<String> bloomFilterColumns;
private final int pageRowCountLimit;
private final boolean pageWriteChecksumEnabled;
private final boolean enableByteStreamSplit;
@@ -115,6 +125,9 @@ private ParquetProperties(Builder builder) {
this.valuesWriterFactory = builder.valuesWriterFactory;
this.columnIndexTruncateLength = builder.columnIndexTruncateLength;
this.statisticsTruncateLength = builder.statisticsTruncateLength;
+ this.bloomFilterExpectedDistinctNumbers = builder.bloomFilterColumnExpectedNDVs;
+ this.bloomFilterColumns = builder.bloomFilterColumns;
+ this.maxBloomFilterBytes = builder.maxBloomFilterBytes;
this.pageRowCountLimit = builder.pageRowCountLimit;
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
this.enableByteStreamSplit = builder.enableByteStreamSplit;
@@ -189,11 +202,24 @@ public ByteBufferAllocator getAllocator() {
public ColumnWriteStore newColumnWriteStore(MessageType schema,
PageWriteStore pageStore) {
+ switch (writerVersion) {
+ case PARQUET_1_0:
+ return new ColumnWriteStoreV1(schema, pageStore, this);
+ case PARQUET_2_0:
+ return new ColumnWriteStoreV2(schema, pageStore, this);
+ default:
+ throw new IllegalArgumentException("unknown version " + writerVersion);
+ }
+ }
+
+ public ColumnWriteStore newColumnWriteStore(MessageType schema,
+ PageWriteStore pageStore,
+ BloomFilterWriteStore bloomFilterWriteStore) {
switch (writerVersion) {
case PARQUET_1_0:
- return new ColumnWriteStoreV1(schema, pageStore, this);
+ return new ColumnWriteStoreV1(schema, pageStore, bloomFilterWriteStore, this);
case PARQUET_2_0:
- return new ColumnWriteStoreV2(schema, pageStore, this);
+ return new ColumnWriteStoreV2(schema, pageStore, bloomFilterWriteStore, this);
default:
throw new IllegalArgumentException("unknown version " + writerVersion);
}
@@ -231,6 +257,22 @@ public boolean getPageWriteChecksumEnabled() {
return pageWriteChecksumEnabled;
}
+ public Map<String, Long> getBloomFilterColumnExpectedNDVs() {
+ return bloomFilterExpectedDistinctNumbers;
+ }
+
+ public Set<String> getBloomFilterColumns() {
+ if (bloomFilterColumns != null && bloomFilterColumns.size() > 0) {
+ return new HashSet<>(bloomFilterColumns);
+ }
+
+ return bloomFilterExpectedDistinctNumbers.keySet();
+ }
+
+ public int getMaxBloomFilterBytes() {
+ return maxBloomFilterBytes;
+ }
+
public static Builder builder() {
return new Builder();
}
@@ -250,6 +292,9 @@ public String toString() {
+ "Max row count for page size check is: " + getMaxRowCountForPageSizeCheck() + '\n'
+ "Truncate length for column indexes is: " + getColumnIndexTruncateLength() + '\n'
+ "Truncate length for statistics min/max is: " + getStatisticsTruncateLength() + '\n'
+ + "Bloom filter enabled column names are: " + getBloomFilterColumns() + '\n'
+ + "Max Bloom filter size for a column is " + getMaxBloomFilterBytes() + '\n'
+ + "Bloom filter enabled column expected number of distinct values are: " + getBloomFilterColumnExpectedNDVs().values() + '\n'
+ "Page row count limit to " + getPageRowCountLimit() + '\n'
+ "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off");
}
@@ -266,6 +311,9 @@ public static class Builder {
private ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY;
private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH;
+ private Map<String, Long> bloomFilterColumnExpectedNDVs = new HashMap<>();
+ private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES;
+ private List<String> bloomFilterColumns = new ArrayList<>();
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
private boolean enableByteStreamSplit = DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED;
@@ -286,6 +334,9 @@ private Builder(ParquetProperties toCopy) {
this.allocator = toCopy.allocator;
this.pageRowCountLimit = toCopy.pageRowCountLimit;
this.pageWriteChecksumEnabled = toCopy.pageWriteChecksumEnabled;
+ this.bloomFilterColumnExpectedNDVs = toCopy.bloomFilterExpectedDistinctNumbers;
+ this.bloomFilterColumns = toCopy.bloomFilterColumns;
+ this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
this.enableByteStreamSplit = toCopy.enableByteStreamSplit;
}
@@ -396,6 +447,41 @@ public Builder withStatisticsTruncateLength(int length) {
return this;
}
+ /**
+ * Set the maximum number of bytes for a column's Bloom filter bitset.
+ *
+ * @param maxBloomFilterBytes the maximum number of bytes of a Bloom filter bitset for a column.
+ * @return this builder for method chaining
+ */
+ public Builder withMaxBloomFilterBytes(int maxBloomFilterBytes) {
+ this.maxBloomFilterBytes = maxBloomFilterBytes;
+ return this;
+ }
+
+ /**
+ * Set Bloom filter column names and expected NDVs.
+ *
+ * @param columnToNDVMap mapping from each Bloom-filter-enabled column name to its expected number of distinct values.
+ *
+ * @return this builder for method chaining
+ */
+ public Builder withBloomFilterColumnToNDVMap(Map<String, Long> columnToNDVMap) {
+ this.bloomFilterColumnExpectedNDVs = columnToNDVMap;
+ return this;
+ }
+
+ /**
+ * Set Bloom filter column names.
+ *
+ * @param columns the columns for which Bloom filters are enabled.
+ *
+ * @return this builder for method chaining
+ */
+ public Builder withBloomFilterColumnNames(List<String> columns) {
+ this.bloomFilterColumns = columns;
+ return this;
+ }
+
public Builder withPageRowCountLimit(int rowCount) {
Preconditions.checkArgument(rowCount > 0, "Invalid row count limit for pages: " + rowCount);
pageRowCountLimit = rowCount;
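
Reviewer note: a minimal usage sketch (not part of the patch; class name hypothetical) showing how the new builder knobs above fit together.

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ParquetProperties;

public class BloomFilterPropsSketch {
  public static void main(String[] args) {
    Map<String, Long> ndvs = new HashMap<>();
    ndvs.put("id", 1_000_000L); // expected distinct values of column "id" per row group

    ParquetProperties props = ParquetProperties.builder()
        .withBloomFilterColumnToNDVMap(ndvs)   // filters sized from the supplied NDVs
        .withMaxBloomFilterBytes(1024 * 1024)  // cap each bitset at 1 MiB (the default)
        .build();

    // With no explicit name list, the enabled columns default to the NDV map's keys.
    System.out.println(props.getBloomFilterColumns()); // [id]
  }
}
```

Columns without a known NDV can instead be enabled via withBloomFilterColumnNames(...); their filters are then sized by maxBloomFilterBytes, and a non-empty name list takes precedence over the NDV map's key set in getBloomFilterColumns().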
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java
index 2018c01f5e..8740099730 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreBase.java
@@ -34,6 +34,8 @@
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.schema.MessageType;
/**
@@ -74,7 +76,7 @@ private interface ColumnWriterProvider {
public ColumnWriter getColumnWriter(ColumnDescriptor path) {
ColumnWriterBase column = columns.get(path);
if (column == null) {
- column = createColumnWriter(path, pageWriteStore.getPageWriter(path), props);
+ column = createColumnWriter(path, pageWriteStore.getPageWriter(path), null, props);
columns.put(path, column);
}
return column;
@@ -91,7 +93,7 @@ public ColumnWriter getColumnWriter(ColumnDescriptor path) {
Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
for (ColumnDescriptor path : schema.getColumns()) {
PageWriter pageWriter = pageWriteStore.getPageWriter(path);
- mcolumns.put(path, createColumnWriter(path, pageWriter, props));
+ mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
}
this.columns = unmodifiableMap(mcolumns);
@@ -105,7 +107,38 @@ public ColumnWriter getColumnWriter(ColumnDescriptor path) {
};
}
- abstract ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props);
+ // The Bloom filter is written to a separate bitset instead of pages, so it needs a separate write store abstraction.
+ ColumnWriteStoreBase(
+ MessageType schema,
+ PageWriteStore pageWriteStore,
+ BloomFilterWriteStore bloomFilterWriteStore,
+ ParquetProperties props) {
+ this.props = props;
+ this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
+ Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
+ for (ColumnDescriptor path : schema.getColumns()) {
+ PageWriter pageWriter = pageWriteStore.getPageWriter(path);
+ if (props.getBloomFilterColumns() != null && props.getBloomFilterColumns().size() > 0) {
+ BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
+ mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props));
+ } else {
+ mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
+ }
+ }
+ this.columns = unmodifiableMap(mcolumns);
+
+ this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
+
+ columnWriterProvider = new ColumnWriterProvider() {
+ @Override
+ public ColumnWriter getColumnWriter(ColumnDescriptor path) {
+ return columns.get(path);
+ }
+ };
+ }
+
+ abstract ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter,
+ BloomFilterWriter bloomFilterWriter, ParquetProperties props);
@Override
public ColumnWriter getColumnWriter(ColumnDescriptor path) {
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java
index 7258423fb4..c4760d04f2 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV1.java
@@ -22,6 +22,8 @@
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.schema.MessageType;
public class ColumnWriteStoreV1 extends ColumnWriteStoreBase {
@@ -36,8 +38,15 @@ public ColumnWriteStoreV1(final PageWriteStore pageWriteStore,
super(pageWriteStore, props);
}
+ public ColumnWriteStoreV1(MessageType schema, PageWriteStore pageWriteStore,
+ BloomFilterWriteStore bloomFilterWriteStore,
+ ParquetProperties props) {
+ super(schema, pageWriteStore, bloomFilterWriteStore, props);
+ }
+
@Override
- ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) {
- return new ColumnWriterV1(path, pageWriter, props);
+ ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter,
+ BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
+ return new ColumnWriterV1(path, pageWriter, bloomFilterWriter, props);
}
}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java
index bf1090d0bc..590c3edcf2 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriteStoreV2.java
@@ -22,6 +22,8 @@
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.page.PageWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.schema.MessageType;
public class ColumnWriteStoreV2 extends ColumnWriteStoreBase {
@@ -30,8 +32,15 @@ public ColumnWriteStoreV2(MessageType schema, PageWriteStore pageWriteStore, Par
super(schema, pageWriteStore, props);
}
+ public ColumnWriteStoreV2(MessageType schema, PageWriteStore pageWriteStore,
+ BloomFilterWriteStore bloomFilterWriteStore,
+ ParquetProperties props) {
+ super(schema, pageWriteStore, bloomFilterWriteStore, props);
+ }
+
@Override
- ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) {
- return new ColumnWriterV2(path, pageWriter, props);
+ ColumnWriterBase createColumnWriter(ColumnDescriptor path, PageWriter pageWriter,
+ BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
+ return new ColumnWriterV2(path, pageWriter, bloomFilterWriter, props);
}
}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java
index 8fc7d31ba1..c46b26a283 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java
@@ -19,6 +19,8 @@
package org.apache.parquet.column.impl;
import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnWriter;
@@ -27,6 +29,9 @@
import org.apache.parquet.column.page.PageWriter;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.ValuesWriter;
+import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.Binary;
import org.slf4j.Logger;
@@ -53,10 +58,22 @@ abstract class ColumnWriterBase implements ColumnWriter {
private long rowsWrittenSoFar = 0;
private int pageRowCount;
+ private final BloomFilterWriter bloomFilterWriter;
+ private final BloomFilter bloomFilter;
+
ColumnWriterBase(
ColumnDescriptor path,
PageWriter pageWriter,
ParquetProperties props) {
+ this(path, pageWriter, null, props);
+ }
+
+ ColumnWriterBase(
+ ColumnDescriptor path,
+ PageWriter pageWriter,
+ BloomFilterWriter bloomFilterWriter,
+ ParquetProperties props
+ ) {
this.path = path;
this.pageWriter = pageWriter;
resetStatistics();
@@ -64,6 +81,30 @@ abstract class ColumnWriterBase implements ColumnWriter {
this.repetitionLevelColumn = createRLWriter(props, path);
this.definitionLevelColumn = createDLWriter(props, path);
this.dataColumn = props.newValuesWriter(path);
+
+ this.bloomFilterWriter = bloomFilterWriter;
+ Set<String> bloomFilterColumns = props.getBloomFilterColumns();
+ String column = String.join(".", path.getPath());
+ if (!bloomFilterColumns.contains(column)) {
+ this.bloomFilter = null;
+ return;
+ }
+ int maxBloomFilterSize = props.getMaxBloomFilterBytes();
+
+ Map<String, Long> bloomFilterColumnExpectedNDVs = props.getBloomFilterColumnExpectedNDVs();
+ if (bloomFilterColumnExpectedNDVs.size() > 0) {
+ // If the user specified this column's NDV, construct the Bloom filter from it.
+ if (bloomFilterColumnExpectedNDVs.containsKey(column)) {
+ int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(
+ bloomFilterColumnExpectedNDVs.get(column).intValue(), BlockSplitBloomFilter.DEFAULT_FPP);
+
+ this.bloomFilter = new BlockSplitBloomFilter(optimalNumOfBits / 8, maxBloomFilterSize);
+ } else {
+ this.bloomFilter = null;
+ }
+ } else {
+ this.bloomFilter = new BlockSplitBloomFilter(maxBloomFilterSize);
+ }
}
abstract ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path);
@@ -122,6 +163,36 @@ public long getBufferedSizeInMemory() {
+ pageWriter.getMemSize();
}
+ private void updateBloomFilter(int value) {
+ if (bloomFilter != null) {
+ bloomFilter.insertHash(bloomFilter.hash(value));
+ }
+ }
+
+ private void updateBloomFilter(long value) {
+ if (bloomFilter != null) {
+ bloomFilter.insertHash(bloomFilter.hash(value));
+ }
+ }
+
+ private void updateBloomFilter(double value) {
+ if (bloomFilter != null) {
+ bloomFilter.insertHash(bloomFilter.hash(value));
+ }
+ }
+
+ private void updateBloomFilter(float value) {
+ if (bloomFilter != null) {
+ bloomFilter.insertHash(bloomFilter.hash(value));
+ }
+ }
+
+ private void updateBloomFilter(Binary value) {
+ if (bloomFilter != null) {
+ bloomFilter.insertHash(bloomFilter.hash(value));
+ }
+ }
+
/**
* Writes the current value
*
@@ -137,6 +208,7 @@ public void write(double value, int repetitionLevel, int definitionLevel) {
definitionLevel(definitionLevel);
dataColumn.writeDouble(value);
statistics.updateStats(value);
+ updateBloomFilter(value);
++valueCount;
}
@@ -155,6 +227,7 @@ public void write(float value, int repetitionLevel, int definitionLevel) {
definitionLevel(definitionLevel);
dataColumn.writeFloat(value);
statistics.updateStats(value);
+ updateBloomFilter(value);
++valueCount;
}
@@ -173,6 +246,7 @@ public void write(Binary value, int repetitionLevel, int definitionLevel) {
definitionLevel(definitionLevel);
dataColumn.writeBytes(value);
statistics.updateStats(value);
+ updateBloomFilter(value);
++valueCount;
}
@@ -209,6 +283,7 @@ public void write(int value, int repetitionLevel, int definitionLevel) {
definitionLevel(definitionLevel);
dataColumn.writeInteger(value);
statistics.updateStats(value);
+ updateBloomFilter(value);
++valueCount;
}
@@ -227,6 +302,7 @@ public void write(long value, int repetitionLevel, int definitionLevel) {
definitionLevel(definitionLevel);
dataColumn.writeLong(value);
statistics.updateStats(value);
+ updateBloomFilter(value);
++valueCount;
}
@@ -246,6 +322,10 @@ void finalizeColumnChunk() {
}
dataColumn.resetDictionary();
}
+
+ if (bloomFilterWriter != null && bloomFilter != null) {
+ bloomFilterWriter.writeBloomFilter(bloomFilter);
+ }
}
/**
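
Reviewer note: for intuition on the sizing in the constructor above, a small hypothetical check (not part of the patch) of the optimalNumOfBits formula m = -8n / ln(1 - p^(1/8)) defined in BlockSplitBloomFilter below.

```java
public class BloomFilterSizingSketch {
  public static void main(String[] args) {
    // For NDV = 1,000,000 at the default FPP of 0.01, the formula yields roughly
    // 9.68 million bits, i.e. about 1.21 MB of bitset before rounding.
    long n = 1_000_000L;
    double p = 0.01;
    double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8));
    System.out.println((long) m / 8 + " bytes"); // ~1,210,000 bytes

    // That exceeds DEFAULT_MAX_BLOOM_FILTER_BYTES (1 MiB), so initBitset in
    // BlockSplitBloomFilter would clamp the bitset to maxBloomFilterBytes.
  }
}
```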
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java
index 646e31aa7e..752042480b 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java
@@ -27,6 +27,7 @@
import org.apache.parquet.column.page.PageWriter;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.ValuesWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
/**
* Writes (repetition level, definition level, value) triplets and deals with writing pages to the underlying layer.
@@ -37,6 +38,11 @@ final class ColumnWriterV1 extends ColumnWriterBase {
super(path, pageWriter, props);
}
+ public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter,
+ BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
+ super(path, pageWriter, bloomFilterWriter, props);
+ }
+
@Override
ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path) {
return props.newRepetitionLevelWriter(path);
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java
index e4e8563cb9..cc44e2d630 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -28,6 +28,7 @@
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
import org.apache.parquet.io.ParquetEncodingException;
@@ -59,6 +60,11 @@ public BytesInput getBytes() {
super(path, pageWriter, props);
}
+ ColumnWriterV2(ColumnDescriptor path, PageWriter pageWriter, BloomFilterWriter bloomFilterWriter,
+ ParquetProperties props) {
+ super(path, pageWriter, bloomFilterWriter, props);
+ }
+
@Override
ValuesWriter createRLWriter(ParquetProperties props, ColumnDescriptor path) {
return path.getMaxRepetitionLevel() == 0 ? NULL_WRITER : new RLEWriterForV2(props.newRepetitionLevelEncoder(path));
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java
new file mode 100644
index 0000000000..a74c4265e0
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java
@@ -0,0 +1,382 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.values.bloomfilter;
+
+import org.apache.parquet.Preconditions;
+import org.apache.parquet.io.api.Binary;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.IntBuffer;
+
+/*
+ * This Bloom filter is implemented using the block-based Bloom filter algorithm from Putze et al.'s
+ * "Cache-, Hash- and Space-Efficient Bloom Filters". The basic idea is to hash the item to a tiny
+ * Bloom filter whose size fits a single cache line or smaller. This implementation sets 8 bits in
+ * each tiny Bloom filter. Each tiny Bloom filter is 32 bytes, to take advantage of 32-byte SIMD
+ * instructions.
+ */
+public class BlockSplitBloomFilter implements BloomFilter {
+ // Bytes in a tiny Bloom filter block.
+ private static final int BYTES_PER_BLOCK = 32;
+
+ // Bits in a tiny Bloom filter block.
+ private static final int BITS_PER_BLOCK = 256;
+
+ // The lower bound of bloom filter size, set to the size of a tiny Bloom filter block.
+ public static final int LOWER_BOUND_BYTES = 32;
+
+ // The upper bound of bloom filter size, set to default row group size.
+ public static final int UPPER_BOUND_BYTES = 128 * 1024 * 1024;
+
+ // The number of bits to set in a tiny Bloom filter
+ private static final int BITS_SET_PER_BLOCK = 8;
+
+ // The metadata in the header of a serialized Bloom filter is four four-byte values: the number of bytes,
+ // the filter algorithm, the hash algorithm, and the compression.
+ public static final int HEADER_SIZE = 16;
+
+ // The default false positive probability value
+ public static final double DEFAULT_FPP = 0.01;
+
+ // The hash strategy used in this Bloom filter.
+ private final HashStrategy hashStrategy;
+
+ // The underlying byte array for Bloom filter bitset.
+ private byte[] bitset;
+
+ // An IntBuffer view of the underlying bitset that helps with setting bits.
+ private IntBuffer intBuffer;
+
+ // The hash function used to compute hashes for column values.
+ private HashFunction hashFunction;
+
+ private int maximumBytes = UPPER_BOUND_BYTES;
+ private int minimumBytes = LOWER_BOUND_BYTES;
+
+ // A cache used for hashing
+ private ByteBuffer cacheBuffer = ByteBuffer.allocate(Long.BYTES);
+
+ private int[] mask = new int[BITS_SET_PER_BLOCK];
+
+ // The block-based algorithm needs 8 odd SALT values to calculate eight indexes
+ // of bits to set, one per 32-bit word.
+ private static final int[] SALT = {0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d,
+ 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31};
+
+ /**
+ * Constructor of block-based Bloom filter.
+ *
+ * @param numBytes The number of bytes for the Bloom filter bitset. It should be within
+ * [LOWER_BOUND_BYTES, UPPER_BOUND_BYTES]; it is rounded up/down to the bound if
+ * out of range, and also rounded up to a power of 2. XXH64 is used as the
+ * default hash function.
+ */
+ public BlockSplitBloomFilter(int numBytes) {
+ this(numBytes, LOWER_BOUND_BYTES, UPPER_BOUND_BYTES, HashStrategy.XXH64);
+ }
+
+ /**
+ * Constructor of block-based Bloom filter.
+ *
+ * @param numBytes The number of bytes for the Bloom filter bitset. It should be within
+ * [LOWER_BOUND_BYTES, maximumBytes]; it is rounded up/down to the bound if
+ * out of range, and also rounded up to a power of 2. XXH64 is used as the
+ * default hash function.
+ * @param maximumBytes The maximum number of bytes for the Bloom filter.
+ */
+ public BlockSplitBloomFilter(int numBytes, int maximumBytes) {
+ this(numBytes, LOWER_BOUND_BYTES, maximumBytes, HashStrategy.XXH64);
+ }
+
+ /**
+ * Constructor of block-based Bloom filter.
+ *
+ * @param numBytes The number of bytes for Bloom filter bitset
+ * @param hashStrategy The hash strategy of Bloom filter.
+ */
+ private BlockSplitBloomFilter(int numBytes, HashStrategy hashStrategy) {
+ this(numBytes, LOWER_BOUND_BYTES, UPPER_BOUND_BYTES, hashStrategy);
+ }
+
+ /**
+ * Constructor of block-based Bloom filter.
+ *
+ * @param numBytes The number of bytes for the Bloom filter bitset. It should be within
+ * [minimumBytes, maximumBytes]; it is rounded up/down to the bound if out of
+ * range, and also rounded up to a power of 2.
+ * @param minimumBytes The minimum number of bytes for the Bloom filter.
+ * @param maximumBytes The maximum number of bytes for the Bloom filter.
+ * @param hashStrategy The adopted hash strategy of the Bloom filter.
+ */
+ public BlockSplitBloomFilter(int numBytes, int minimumBytes, int maximumBytes, HashStrategy hashStrategy) {
+ if (minimumBytes > maximumBytes) {
+ throw new IllegalArgumentException("the minimum bytes should be less than or equal to the maximum bytes");
+ }
+
+ if (minimumBytes > LOWER_BOUND_BYTES && minimumBytes < UPPER_BOUND_BYTES) {
+ this.minimumBytes = minimumBytes;
+ }
+
+ if (maximumBytes > LOWER_BOUND_BYTES && maximumBytes < UPPER_BOUND_BYTES) {
+ this.maximumBytes = maximumBytes;
+ }
+
+ initBitset(numBytes);
+
+ cacheBuffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ switch (hashStrategy) {
+ case XXH64:
+ this.hashStrategy = hashStrategy;
+ hashFunction = new XxHash();
+ break;
+ default:
+ throw new RuntimeException("Unsupported hash strategy");
+ }
+ }
+
+
+ /**
+ * Construct the Bloom filter with a given bitset. This is used when reconstructing
+ * a Bloom filter from a Parquet file. It uses XXH64 as its default hash
+ * function.
+ *
+ * @param bitset The given bitset to construct the Bloom filter.
+ */
+ public BlockSplitBloomFilter(byte[] bitset) {
+ this(bitset, HashStrategy.XXH64);
+ }
+
+ /**
+ * Construct the Bloom filter with a given bitset. This is used when reconstructing
+ * a Bloom filter from a Parquet file.
+ *
+ * @param bitset The given bitset to construct the Bloom filter.
+ * @param hashStrategy The hash strategy the Bloom filter applies.
+ */
+ private BlockSplitBloomFilter(byte[] bitset, HashStrategy hashStrategy) {
+ if (bitset == null) {
+ throw new RuntimeException("Given bitset is null");
+ }
+
+ cacheBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ this.bitset = bitset;
+ this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+ switch (hashStrategy) {
+ case XXH64:
+ this.hashStrategy = hashStrategy;
+ hashFunction = new XxHash();
+ break;
+ default:
+ throw new RuntimeException("Unsupported hash strategy");
+ }
+ }
+
+ /**
+ * Create a new bitset for the Bloom filter.
+ *
+ * @param numBytes The number of bytes for the Bloom filter bitset. It should be within
+ * [minimumBytes, maximumBytes]; it is rounded up/down to the bound if out of
+ * range, and also rounded up to a power of 2.
+ */
+ private void initBitset(int numBytes) {
+ if (numBytes < minimumBytes) {
+ numBytes = minimumBytes;
+ }
+ // Get next power of 2 if it is not power of 2.
+ if ((numBytes & (numBytes - 1)) != 0) {
+ numBytes = Integer.highestOneBit(numBytes) << 1;
+ }
+ if (numBytes > maximumBytes || numBytes < 0) {
+ numBytes = maximumBytes;
+ }
+ this.bitset = new byte[numBytes];
+ this.intBuffer = ByteBuffer.wrap(bitset).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer();
+ }
+
+ @Override
+ public void writeTo(OutputStream out) throws IOException {
+ out.write(bitset);
+ }
+
+ private int[] setMask(int key) {
+ // The following three loops are written separately so that they could be
+ // optimized for vectorization.
+ for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) {
+ mask[i] = key * SALT[i];
+ }
+
+ for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) {
+ mask[i] = mask[i] >>> 27;
+ }
+
+ for (int i = 0; i < BITS_SET_PER_BLOCK; ++i) {
+ mask[i] = 0x1 << mask[i];
+ }
+
+ return mask;
+ }
+
+ @Override
+ public void insertHash(long hash) {
+ long numBlocks = bitset.length / BYTES_PER_BLOCK;
+ long lowHash = hash >>> 32;
+ int blockIndex = (int)((lowHash * numBlocks) >> 32);
+ int key = (int)hash;
+
+ // Calculate mask for bucket.
+ int[] mask = setMask(key);
+ for (int i = 0; i < BITS_SET_PER_BLOCK; i++) {
+ int value = intBuffer.get(blockIndex * (BYTES_PER_BLOCK / 4) + i);
+ value |= mask[i];
+ intBuffer.put(blockIndex * (BYTES_PER_BLOCK / 4) + i, value);
+ }
+ }
+
+ @Override
+ public boolean findHash(long hash) {
+ long numBlocks = bitset.length / BYTES_PER_BLOCK;
+ long lowHash = hash >>> 32;
+ int blockIndex = (int)((lowHash * numBlocks) >> 32);
+ int key = (int)hash;
+
+ // Calculate mask for the tiny Bloom filter.
+ int[] mask = setMask(key);
+ for (int i = 0; i < BITS_SET_PER_BLOCK; i++) {
+ if (0 == (intBuffer.get(blockIndex * (BYTES_PER_BLOCK / 4) + i) & mask[i])) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Calculate the optimal bitset size according to the number of distinct values and the false positive probability.
+ *
+ * @param n The number of distinct values.
+ * @param p The false positive probability.
+ *
+ * @return the optimal number of bits for the given n and p.
+ */
+ public static int optimalNumOfBits(long n, double p) {
+ Preconditions.checkArgument((p > 0.0 && p < 1.0),
+ "FPP should be less than 1.0 and greater than 0.0");
+ final double m = -8 * n / Math.log(1 - Math.pow(p, 1.0 / 8));
+ int numBits = (int) m;
+
+ // Handle overflow.
+ if (numBits > UPPER_BOUND_BYTES << 3 || m < 0) {
+ numBits = UPPER_BOUND_BYTES << 3;
+ }
+
+ // Round numBits up to a multiple of BITS_PER_BLOCK.
+ numBits = (numBits + BITS_PER_BLOCK - 1) & ~(BITS_PER_BLOCK - 1);
+
+ if (numBits < (LOWER_BOUND_BYTES << 3)) {
+ numBits = LOWER_BOUND_BYTES << 3;
+ }
+
+ return numBits;
+ }
+
+ @Override
+ public int getBitsetSize() {
+ return this.bitset.length;
+ }
+
+ @Override
+ public long hash(Object value) {
+ if (value instanceof Binary) {
+ return hashFunction.hashBytes(((Binary) value).getBytes());
+ }
+
+ if (value instanceof Integer) {
+ cacheBuffer.putInt((Integer)value);
+ } else if (value instanceof Long) {
+ cacheBuffer.putLong((Long)value);
+ } else if (value instanceof Float) {
+ cacheBuffer.putFloat((Float)value);
+ } else if (value instanceof Double) {
+ cacheBuffer.putDouble((Double) value);
+ } else {
+ throw new RuntimeException("Parquet Bloom filter: Not supported type");
+ }
+
+ return doHash();
+ }
+
+ @Override
+ public HashStrategy getHashStrategy() {
+ return HashStrategy.XXH64;
+ }
+
+ @Override
+ public Algorithm getAlgorithm() {
+ return Algorithm.BLOCK;
+ }
+
+ @Override
+ public Compression getCompression() {
+ return Compression.UNCOMPRESSED;
+ }
+
+ private long doHash() {
+ cacheBuffer.flip();
+ long hashResult = hashFunction.hashByteBuffer(cacheBuffer);
+ cacheBuffer.clear();
+
+ return hashResult;
+ }
+
+ @Override
+ public long hash(int value) {
+ cacheBuffer.putInt(value);
+ return doHash();
+ }
+
+ @Override
+ public long hash(long value) {
+ cacheBuffer.putLong(value);
+ return doHash();
+ }
+
+ @Override
+ public long hash(double value) {
+ cacheBuffer.putDouble(value);
+ return doHash();
+ }
+
+ @Override
+ public long hash(float value) {
+ cacheBuffer.putFloat(value);
+ return doHash();
+ }
+
+ @Override
+ public long hash(Binary value) {
+ return hashFunction.hashBytes(value.getBytes());
+ }
+}
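
Reviewer note: a short usage sketch for the class above (not part of the patch; assumes parquet-column and zero-allocation-hashing on the classpath). It sizes a filter for 10,000 distinct values at the default FPP, inserts a few values through their hashes, then probes membership.

```java
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.io.api.Binary;

public class BlockSplitBloomFilterSketch {
  public static void main(String[] args) {
    // Size the bitset from the expected NDV, as ColumnWriterBase does above.
    int numBits = BlockSplitBloomFilter.optimalNumOfBits(10_000, BlockSplitBloomFilter.DEFAULT_FPP);
    BloomFilter filter = new BlockSplitBloomFilter(numBits / 8);

    // Values are always inserted and probed through their XXH64 hashes.
    filter.insertHash(filter.hash(42L));
    filter.insertHash(filter.hash(Binary.fromString("parquet")));

    System.out.println(filter.findHash(filter.hash(42L)));                       // true
    System.out.println(filter.findHash(filter.hash(Binary.fromString("orc")))); // false, with high probability
  }
}
```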
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java
new file mode 100644
index 0000000000..27926e0e2a
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilter.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.bloomfilter;
+
+import org.apache.parquet.io.api.Binary;
+
+import java.io.IOException;
+import java.io.OutputStream;
+
+/**
+ * A Bloom filter is a compact structure that indicates whether an item is not in a set or probably
+ * in a set. A Bloom filter usually consists of a bit set that represents a set of elements,
+ * a hash strategy, and a Bloom filter algorithm.
+ */
+public interface BloomFilter {
+ /* Bloom filter Hash strategy.
+ *
+ * xxHash is an extremely fast hash algorithm, running at RAM speed limits. It successfully
+ * completes the SMHasher test suite, which evaluates the collision, dispersion and randomness qualities
+ * of hash functions, and its benchmark results show a clear performance advantage
+ * (see https://github.com/Cyan4973/xxHash).
+ */
+ enum HashStrategy {
+ XXH64;
+
+ @Override
+ public String toString() {
+ return "xxhash";
+ }
+ }
+
+ // Bloom filter algorithm.
+ enum Algorithm {
+ BLOCK;
+
+ @Override
+ public String toString() {
+ return "block";
+ }
+ }
+
+ // Bloom filter compression.
+ enum Compression {
+ UNCOMPRESSED;
+
+ @Override
+ public String toString() {
+ return "uncompressed";
+ }
+ }
+
+ /**
+ * Write the Bloom filter to an output stream. It writes the Bloom filter header including the
+ * bitset's length in bytes, the hash strategy, the algorithm, and the bitset.
+ *
+ * @param out the output stream to write
+ */
+ void writeTo(OutputStream out) throws IOException;
+
+ /**
+ * Insert an element into the Bloom filter; the element's content is represented by
+ * the hash value of its plain encoding result.
+ *
+ * @param hash the hash result of the element.
+ */
+ void insertHash(long hash);
+
+ /**
+ * Determine whether an element is in the set or not.
+ *
+ * @param hash the hash value of the element's plain encoding result.
+ * @return false if the element is definitely not in the set, true if the element is probably in the set.
+ */
+ boolean findHash(long hash);
+
+ /**
+ * Get the number of bytes for bitset in this Bloom filter.
+ *
+ * @return The number of bytes for bitset in this Bloom filter.
+ */
+ int getBitsetSize();
+
+ /**
+ * Compute hash for int value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(int value);
+
+ /**
+ * Compute hash for long value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(long value);
+
+ /**
+ * Compute hash for double value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(double value);
+
+ /**
+ * Compute hash for float value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(float value);
+
+ /**
+ * Compute hash for Binary value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(Binary value);
+
+ /**
+ * Compute hash for Object value by using its plain encoding result.
+ *
+ * @param value the value to hash
+ * @return hash result
+ */
+ long hash(Object value);
+
+ // The boolean type is not supported because boolean type has only two values, while Bloom filter is
+ // suitable for high cardinality.
+ // long hash(Boolean value);
+
+ /**
+ * Return the hash strategy that the Bloom filter applies.
+ *
+ * @return the hash strategy that the Bloom filter applies
+ */
+ HashStrategy getHashStrategy();
+
+ /**
+ * Return the algorithm that the Bloom filter applies.
+ *
+ * @return the algorithm that the Bloom filter applies
+ */
+ Algorithm getAlgorithm();
+
+ /**
+ * Return the compression algorithm that the Bloom filter applies.
+ *
+ * @return the compression algorithm that the Bloom filter applies
+ */
+ Compression getCompression();
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java
new file mode 100644
index 0000000000..f7e28fdf2d
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriteStore.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.values.bloomfilter;
+
+import org.apache.parquet.column.ColumnDescriptor;
+
+/**
+ * Contains all writers for all columns of a row group
+ */
+public interface BloomFilterWriteStore {
+ /**
+ * Get bloom filter writer of a column
+ *
+ * @param path the descriptor for the column
+ * @return the corresponding Bloom filter writer
+ */
+ BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path);
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java
new file mode 100644
index 0000000000..e2504d8216
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BloomFilterWriter.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.values.bloomfilter;
+
+public interface BloomFilterWriter {
+ /**
+ * Write a Bloom filter
+ *
+ * @param bloomFilter the Bloom filter to write
+ *
+ */
+ void writeBloomFilter(BloomFilter bloomFilter);
+}
+
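
Reviewer note: both new interfaces are single-method, so a toy implementation is tiny. As a hedged illustration (class name hypothetical, not part of the patch), an in-memory write store that simply retains each column's finished filter could look like this; a real store would persist the filter via BloomFilter.writeTo instead.

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriter;

// ColumnWriterBase.finalizeColumnChunk() calls writeBloomFilter(...) once per
// column chunk; this sketch just keeps the result keyed by column.
public class InMemoryBloomFilterWriteStore implements BloomFilterWriteStore {
  private final Map<ColumnDescriptor, BloomFilter> filters = new HashMap<>();

  @Override
  public BloomFilterWriter getBloomFilterWriter(ColumnDescriptor path) {
    return bloomFilter -> filters.put(path, bloomFilter);
  }

  public BloomFilter getBloomFilter(ColumnDescriptor path) {
    return filters.get(path);
  }
}
```

Such a store plugs into the new overload added earlier: props.newColumnWriteStore(schema, pageStore, new InMemoryBloomFilterWriteStore()).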
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/HashFunction.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/HashFunction.java
new file mode 100644
index 0000000000..2043934fb2
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/HashFunction.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.bloomfilter;
+
+import java.nio.ByteBuffer;
+
+/**
+ * An interface for the hash functions used by the Bloom filter.
+ */
+public interface HashFunction {
+
+ /**
+ * Compute the hash value for a byte array.
+ * @param input the input byte array
+ * @return the hash value as a long.
+ */
+ long hashBytes(byte[] input);
+
+ /**
+ * Compute the hash value for a ByteBuffer.
+ * @param input the input ByteBuffer
+ * @return the hash value as a long.
+ */
+ long hashByteBuffer(ByteBuffer input);
+}
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/XxHash.java b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/XxHash.java
new file mode 100644
index 0000000000..6c52b3c987
--- /dev/null
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/XxHash.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.column.values.bloomfilter;
+
+import net.openhft.hashing.LongHashFunction;
+
+import java.nio.ByteBuffer;
+
+/**
+ * An implementation of the HashFunction interface that uses the XXH64 variant of
+ * xxHash with a seed of 0.
+ */
+public class XxHash implements HashFunction {
+ @Override
+ public long hashBytes(byte[] input) {
+ return LongHashFunction.xx(0).hashBytes(input);
+ }
+
+ @Override
+ public long hashByteBuffer(ByteBuffer input) {
+ return LongHashFunction.xx(0).hashBytes(input);
+ }
+}
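
Reviewer note: a quick hypothetical consistency check (not part of the patch). Both paths delegate to XXH64 with seed 0, so hashing the same bytes through byte[] and ByteBuffer should agree, assuming zero-allocation-hashing's hashBytes(ByteBuffer) hashes from position to limit.

```java
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.column.values.bloomfilter.XxHash;

public class XxHashSketch {
  public static void main(String[] args) {
    XxHash xxHash = new XxHash();
    byte[] bytes = "parquet".getBytes(StandardCharsets.UTF_8);

    long fromArray = xxHash.hashBytes(bytes);
    long fromBuffer = xxHash.hashByteBuffer(ByteBuffer.wrap(bytes));

    System.out.println(fromArray == fromBuffer); // expected: true
  }
}
```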
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java
new file mode 100644
index 0000000000..9d2aacc7f1
--- /dev/null
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bloomfilter/TestBlockSplitBloomFilter.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.bloomfilter;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import net.openhft.hashing.LongHashFunction;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.parquet.io.api.Binary;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TestBlockSplitBloomFilter {
+
+ @Test
+ public void testConstructor() {
+ BloomFilter bloomFilter1 = new BlockSplitBloomFilter(0);
+ assertEquals(bloomFilter1.getBitsetSize(), BlockSplitBloomFilter.LOWER_BOUND_BYTES);
+ BloomFilter bloomFilter2 = new BlockSplitBloomFilter(1000);
+ assertEquals(bloomFilter2.getBitsetSize(), 1024);
+ }
+
+ @Rule
+ public final TemporaryFolder temp = new TemporaryFolder();
+
+ /*
+ * This test covers the basic operations: inserting, finding, and
+ * serializing/de-serializing.
+ */
+ @Test
+ public void testBloomFilterForString() {
+ final int numValues = 1024 * 1024;
+ int numBytes = BlockSplitBloomFilter.optimalNumOfBits(numValues, 0.01) / 8;
+ BloomFilter bloomFilter = new BlockSplitBloomFilter(numBytes);
+
+ Set<String> testStrings = new HashSet<>();
+ for (int i = 0; i < numValues; i++) {
+ String str = RandomStringUtils.randomAlphabetic(1, 64);
+ bloomFilter.insertHash(bloomFilter.hash(Binary.fromString(str)));
+ testStrings.add(str);
+ }
+
+ for (String testString : testStrings) {
+ assertTrue(bloomFilter.findHash(bloomFilter.hash(Binary.fromString(testString))));
+ }
+ }
+
+ @Test
+ public void testBloomFilterForPrimitives() {
+ for (int i = 0; i < 4; i++) {
+ long seed = System.nanoTime();
+ testBloomFilterForPrimitives(seed);
+ }
+ }
+
+ private void testBloomFilterForPrimitives(long seed) {
+ Random random = new Random(seed);
+ final int numValues = 1024 * 1024;
+ final int numBytes = BlockSplitBloomFilter.optimalNumOfBits(numValues, random.nextDouble() / 10) / 8;
+ BloomFilter bloomFilter = new BlockSplitBloomFilter(numBytes);
+
+ Set