From 8f8dafe43ce2bdcfaae906930679f57a8b660126 Mon Sep 17 00:00:00 2001 From: Scott Sandre Date: Mon, 23 Sep 2024 16:06:13 -0700 Subject: [PATCH] Use more shards, reduce skew --- .github/workflows/spark_test.yaml | 4 ++-- project/TestParallelization.scala | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml index f521673f0f..3d0b61c3ab 100644 --- a/.github/workflows/spark_test.yaml +++ b/.github/workflows/spark_test.yaml @@ -9,11 +9,11 @@ jobs: # These Scala versions must match those in the build.sbt scala: [2.12.18, 2.13.13] # Important: This list of shards must be [0..NUM_SHARDS - 1] - shard: [0, 1, 2] + shard: [0, 1, 2, 3] env: SCALA_VERSION: ${{ matrix.scala }} # Important: This must be the same as the length of shards in matrix - NUM_SHARDS: 3 + NUM_SHARDS: 4 steps: - uses: actions/checkout@v3 - uses: technote-space/get-diff-action@v4 diff --git a/project/TestParallelization.scala b/project/TestParallelization.scala index aa973ae660..f76f8bfc71 100644 --- a/project/TestParallelization.scala +++ b/project/TestParallelization.scala @@ -1,3 +1,6 @@ +import scala.util.Random +import scala.util.hashing.MurmurHash3 + import sbt.Keys._ import sbt._ @@ -146,13 +149,14 @@ object TestParallelization { } val testIsAssignedToShard = - math.abs(testDefinition.name.hashCode % numShards.get) == shardId.get + math.abs(MurmurHash3.stringHash(testDefinition.name) % numShards.get) == shardId.get + if(!testIsAssignedToShard) { return new SimpleHashStrategy(groups, shardId) } } - val groupIdx = math.abs(testDefinition.name.hashCode % groupCount) + val groupIdx = Random.nextInt(groupCount) val currentGroup = groups(groupIdx) val updatedGroup = currentGroup.withTests( currentGroup.tests :+ testDefinition