Skip to content

Commit

Permalink
[SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

SPARK-10399 introduced a performance regression in the hash computation for UTF8String.

The regression can be measured with the code attached to the JIRA ticket. On the master branch, that code takes about 120 µs per method on my laptop (MacBook Pro 2.5 GHz Intel Core i7, RAM 16 GB 1600 MHz DDR3), while on branch 2.3 it takes about 45 µs on the same machine. With this PR, the code takes about 45 µs on the master branch too.

## How was this patch tested?

By running the perf test attached to the JIRA ticket.

Closes #22338 from mgaido91/SPARK-25317.

Authored-by: Marco Gaido <marcogaido91@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
(cherry picked from commit 64c314e)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
mgaido91 authored and cloud-fan committed Sep 6, 2018
1 parent d749d03 commit b632e77
Showing 1 changed file with 14 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import com.google.common.primitives.Ints;

import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.types.UTF8String;

Expand Down Expand Up @@ -59,7 +60,7 @@ public static int hashUnsafeWordsBlock(MemoryBlock base, int seed) {
// This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` method.
int lengthInBytes = Ints.checkedCast(base.size());
assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 (word-aligned)";
int h1 = hashBytesByIntBlock(base, seed);
int h1 = hashBytesByIntBlock(base, lengthInBytes, seed);
return fmix(h1, lengthInBytes);
}

Expand All @@ -69,22 +70,27 @@ public static int hashUnsafeWords(Object base, long offset, int lengthInBytes, i
}

/**
 * Hashes every byte of {@code base} with the given seed by delegating to the
 * length-taking overload, passing {@code base.size()} narrowed to an int via
 * {@code Ints.checkedCast}.
 */
public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
return hashUnsafeBytesBlock(base, Ints.checkedCast(base.size()), seed);
}

// NOTE(review): this listing is a diff rendered without +/- markers, so removed and
// added lines appear side by side. The "old"/"new" tags below are inferred from the
// helper signatures and the page's additions/deletions summary — confirm against the commit.
/**
 * Hashes the first {@code lengthInBytes} bytes of {@code base} with the given seed.
 * The 4-byte-aligned prefix is hashed by {@code hashBytesByIntBlock}; each remaining
 * trailing byte is then mixed in individually before {@code fmix} finalizes the result.
 */
private static int hashUnsafeBytesBlock(MemoryBlock base, int lengthInBytes, int seed) {
// This is not compatible with original and another implementations.
// But remain it for backward compatibility for the components existing before 2.3.
// old (removed): the length used to be derived from the block itself; it is now a parameter.
int lengthInBytes = Ints.checkedCast(base.size());
assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
// Round down to a multiple of 4 so the prefix can be read as whole ints.
int lengthAligned = lengthInBytes - lengthInBytes % 4;
// old (removed): built a sub-block covering the aligned prefix before hashing it.
int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
// new (added): passes the aligned length instead of creating a sub-block.
int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
// new (added): read the base offset and base object once, outside the tail loop.
long offset = base.getBaseOffset();
Object o = base.getBaseObject();
// Mix in the 0-3 bytes that follow the aligned prefix, one byte per round.
for (int i = lengthAligned; i < lengthInBytes; i++) {
// old (removed): read the tail byte through the MemoryBlock accessor.
int halfWord = base.getByte(i);
// new (added): read the tail byte directly via Platform at offset + i.
int halfWord = Platform.getByte(o, offset + i);
int k1 = mixK1(halfWord);
h1 = mixH1(h1, k1);
}
return fmix(h1, lengthInBytes);
}

// NOTE(review): this listing is a diff rendered without +/- markers; the two return
// statements are the removed and added versions of the same line — confirm against the commit.
/** Hashes the bytes of {@code str} with the given seed via {@code hashUnsafeBytesBlock}. */
public static int hashUTF8String(UTF8String str, int seed) {
// old (removed): two-argument call, which takes the length from the memory block's size.
return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
// new (added): supplies the byte length explicitly from str.numBytes().
return hashUnsafeBytesBlock(str.getMemoryBlock(), str.numBytes(), seed);
}

public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
Expand All @@ -101,7 +107,7 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
int lengthInBytes = Ints.checkedCast(base.size());
assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
int lengthAligned = lengthInBytes - lengthInBytes % 4;
int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
int k1 = 0;
for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) {
k1 ^= (base.getByte(i) & 0xFF) << shift;
Expand All @@ -110,11 +116,10 @@ public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
return fmix(h1, lengthInBytes);
}

private static int hashBytesByIntBlock(MemoryBlock base, int seed) {
long lengthInBytes = base.size();
private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, int seed) {
assert (lengthInBytes % 4 == 0);
int h1 = seed;
for (long i = 0; i < lengthInBytes; i += 4) {
for (int i = 0; i < lengthInBytes; i += 4) {
int halfWord = base.getInt(i);
int k1 = mixK1(halfWord);
h1 = mixH1(h1, k1);
Expand Down

0 comments on commit b632e77

Please sign in to comment.