Skip to content

Commit

Permalink
SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException
Browse files Browse the repository at this point in the history
Fix TimSort bug which causes a ArrayOutOfBoundsException.

Using the proposed fix here
http://envisage-project.eu/proving-android-java-and-python-sorting-algorithm-is-broken-and-how-to-fix-it/

Author: Evan Yu <ehotou@gmail.com>

Closes #4804 from hotou/SPARK-5984 and squashes the following commits:

3421b6c [Evan Yu] SPARK-5984: Add info to LICENSE
e61c6b8 [Evan Yu] SPARK-5984: Fix license and document
6ccc280 [Evan Yu] SPARK-5984: Add License header to file
e06c0d2 [Evan Yu] SPARK-5984: Add License header to file
4d95f75 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException
479a106 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException
  • Loading branch information
hotou authored and rxin committed Mar 1, 2015
1 parent 86fcdae commit 643300a
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 5 deletions.
16 changes: 16 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

========================================================================
For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java):
========================================================================
Copyright (C) 2015 Stijn de Gouw

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

========================================================================
For LimitedInputStream
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -425,15 +425,14 @@ private void pushRun(int runBase, int runLen) {
private void mergeCollapse() {
while (stackSize > 1) {
int n = stackSize - 2;
if (n > 0 && runLen[n-1] <= runLen[n] + runLen[n+1]) {
if ( (n >= 1 && runLen[n-1] <= runLen[n] + runLen[n+1])
|| (n >= 2 && runLen[n-2] <= runLen[n] + runLen[n-1])) {
if (runLen[n - 1] < runLen[n + 1])
n--;
mergeAt(n);
} else if (runLen[n] <= runLen[n + 1]) {
mergeAt(n);
} else {
} else if (runLen[n] > runLen[n + 1]) {
break; // Invariant is established
}
mergeAt(n);
}
}

Expand Down
134 changes: 134 additions & 0 deletions core/src/test/java/org/apache/spark/util/collection/TestTimSort.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/**
* Copyright 2015 Stijn de Gouw
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.spark.util.collection;

import java.util.*;

/**
* This codes generates a int array which fails the standard TimSort.
*
* The blog that reported the bug
* http://www.envisage-project.eu/timsort-specification-and-verification/
*
* This codes was originally wrote by Stijn de Gouw, modified by Evan Yu to adapt to
* our test suite.
*
* https://github.com/abstools/java-timsort-bug
* https://github.com/abstools/java-timsort-bug/blob/master/LICENSE
*/
public class TestTimSort {

private static final int MIN_MERGE = 32;

/**
* Returns an array of integers that demonstrate the bug in TimSort
*/
public static int[] getTimSortBugTestSet(int length) {
int minRun = minRunLength(length);
List<Long> runs = runsJDKWorstCase(minRun, length);
return createArray(runs, length);
}

private static int minRunLength(int n) {
int r = 0; // Becomes 1 if any 1 bits are shifted off
while (n >= MIN_MERGE) {
r |= (n & 1);
n >>= 1;
}
return n + r;
}

private static int[] createArray(List<Long> runs, int length) {
int[] a = new int[length];
Arrays.fill(a, 0);
int endRun = -1;
for (long len : runs) {
a[endRun += len] = 1;
}
a[length - 1] = 0;
return a;
}

/**
* Fills <code>runs</code> with a sequence of run lengths of the form<br>
* Y_n x_{n,1} x_{n,2} ... x_{n,l_n} <br>
* Y_{n-1} x_{n-1,1} x_{n-1,2} ... x_{n-1,l_{n-1}} <br>
* ... <br>
* Y_1 x_{1,1} x_{1,2} ... x_{1,l_1}<br>
* The Y_i's are chosen to satisfy the invariant throughout execution,
* but the x_{i,j}'s are merged (by <code>TimSort.mergeCollapse</code>)
* into an X_i that violates the invariant.
*
* @param length The sum of all run lengths that will be added to <code>runs</code>.
*/
private static List<Long> runsJDKWorstCase(int minRun, int length) {
List<Long> runs = new ArrayList<Long>();

long runningTotal = 0, Y = minRun + 4, X = minRun;

while (runningTotal + Y + X <= length) {
runningTotal += X + Y;
generateJDKWrongElem(runs, minRun, X);
runs.add(0, Y);
// X_{i+1} = Y_i + x_{i,1} + 1, since runs.get(1) = x_{i,1}
X = Y + runs.get(1) + 1;
// Y_{i+1} = X_{i+1} + Y_i + 1
Y += X + 1;
}

if (runningTotal + X <= length) {
runningTotal += X;
generateJDKWrongElem(runs, minRun, X);
}

runs.add(length - runningTotal);
return runs;
}

/**
* Adds a sequence x_1, ..., x_n of run lengths to <code>runs</code> such that:<br>
* 1. X = x_1 + ... + x_n <br>
* 2. x_j >= minRun for all j <br>
* 3. x_1 + ... + x_{j-2} < x_j < x_1 + ... + x_{j-1} for all j <br>
* These conditions guarantee that TimSort merges all x_j's one by one
* (resulting in X) using only merges on the second-to-last element.
*
* @param X The sum of the sequence that should be added to runs.
*/
private static void generateJDKWrongElem(List<Long> runs, int minRun, long X) {
for (long newTotal; X >= 2 * minRun + 1; X = newTotal) {
//Default strategy
newTotal = X / 2 + 1;
//Specialized strategies
if (3 * minRun + 3 <= X && X <= 4 * minRun + 1) {
// add x_1=MIN+1, x_2=MIN, x_3=X-newTotal to runs
newTotal = 2 * minRun + 1;
} else if (5 * minRun + 5 <= X && X <= 6 * minRun + 5) {
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=X-newTotal to runs
newTotal = 3 * minRun + 3;
} else if (8 * minRun + 9 <= X && X <= 10 * minRun + 9) {
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=X-newTotal to runs
newTotal = 5 * minRun + 5;
} else if (13 * minRun + 15 <= X && X <= 16 * minRun + 17) {
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=3MIN+4, x_6=X-newTotal to runs
newTotal = 8 * minRun + 9;
}
runs.add(0, X - newTotal);
}
runs.add(0, X);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ class SorterSuite extends FunSuite {
}
}

// http://www.envisage-project.eu/timsort-specification-and-verification/
test("SPARK-5984 TimSort bug") {
val data = TestTimSort.getTimSortBugTestSet(67108864)
new Sorter(new IntArraySortDataFormat).sort(data, 0, data.length, Ordering.Int)
(0 to data.length - 2).foreach(i => assert(data(i) <= data(i + 1)))
}

/** Runs an experiment several times. */
def runExperiment(name: String, skip: Boolean = false)(f: => Unit, prepare: () => Unit): Unit = {
if (skip) {
Expand Down

0 comments on commit 643300a

Please sign in to comment.