Merge branch 'master' of github.com:apache/spark into task-metrics-to-accums-followups

Conflicts:
	core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala
Andrew Or committed Feb 8, 2016
2 parents 3b1e414 + 37bc203 commit 854c71f
Showing 376 changed files with 11,834 additions and 3,691 deletions.
22 changes: 16 additions & 6 deletions .rat-excludes
@@ -25,6 +25,16 @@ graphlib-dot.min.js
sorttable.js
vis.min.js
vis.min.css
dataTables.bootstrap.css
dataTables.bootstrap.min.js
dataTables.rowsGroup.js
jquery.blockUI.min.js
jquery.cookies.2.2.0.min.js
jquery.dataTables.1.10.4.min.css
jquery.dataTables.1.10.4.min.js
jquery.mustache.js
jsonFormatter.min.css
jsonFormatter.min.js
.*avsc
.*txt
.*json
@@ -63,12 +73,12 @@ logs
.*dependency-reduced-pom.xml
known_translations
json_expectation
local-1422981759269/*
local-1422981780767/*
local-1425081759269/*
local-1426533911241/*
local-1426633911242/*
local-1430917381534/*
local-1422981759269
local-1422981780767
local-1425081759269
local-1426533911241
local-1426633911242
local-1430917381534
local-1430917381535_1
local-1430917381535_2
DESCRIPTION
6 changes: 6 additions & 0 deletions LICENSE
@@ -291,3 +291,9 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3)
(MIT License) sorttable (https://github.com/stuartlangridge/sorttable)
(MIT License) boto (https://github.com/boto/boto/blob/develop/LICENSE)
(MIT License) datatables (http://datatables.net/license)
(MIT License) mustache (https://github.com/mustache/mustache/blob/master/LICENSE)
(MIT License) cookies (http://code.google.com/p/cookies/wiki/License)
(MIT License) blockUI (http://jquery.malsup.com/block/)
(MIT License) RowsGroup (http://datatables.net/license/mit)
(MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
4 changes: 2 additions & 2 deletions assembly/pom.xml
@@ -20,13 +20,13 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>org.apache.spark</groupId>
<artifactId>spark-assembly_2.10</artifactId>
<artifactId>spark-assembly_2.11</artifactId>
<name>Spark Project Assembly</name>
<url>http://spark.apache.org/</url>
<packaging>pom</packaging>
11 changes: 9 additions & 2 deletions common/sketch/pom.xml
@@ -21,20 +21,27 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.10</artifactId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>org.apache.spark</groupId>
<artifactId>spark-sketch_2.10</artifactId>
<artifactId>spark-sketch_2.11</artifactId>
<packaging>jar</packaging>
<name>Spark Project Sketch</name>
<url>http://spark.apache.org/</url>
<properties>
<sbt.project.name>sketch</sbt.project.name>
</properties>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-test-tags_${scala.binary.version}</artifactId>
</dependency>
</dependencies>

<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
@@ -22,7 +22,7 @@
import java.io.IOException;
import java.util.Arrays;

public final class BitArray {
final class BitArray {
private final long[] data;
private long bitCount;

@@ -22,31 +22,28 @@
import java.io.OutputStream;

/**
* A Bloom filter is a space-efficient probabilistic data structure, that is used to test whether
* an element is a member of a set. It returns false when the element is definitely not in the
* set, returns true when the element is probably in the set.
*
* Internally a Bloom filter is initialized with 2 information: how many space to use(number of
* bits) and how many hash values to calculate for each record. To get as lower false positive
* probability as possible, user should call {@link BloomFilter#create} to automatically pick a
* best combination of these 2 parameters.
*
* Currently the following data types are supported:
* A Bloom filter is a space-efficient probabilistic data structure that offers an approximate
* containment test with one-sided error: if it claims that an item is contained in it, this
* might be in error, but if it claims that an item is <i>not</i> contained in it, then this is
* definitely true. Currently supported data types include:
* <ul>
* <li>{@link Byte}</li>
* <li>{@link Short}</li>
* <li>{@link Integer}</li>
* <li>{@link Long}</li>
* <li>{@link String}</li>
* </ul>
* The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that
* {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has
* not actually been put in the {@code BloomFilter}.
*
* The implementation is largely based on the {@code BloomFilter} class from guava.
* The implementation is largely based on the {@code BloomFilter} class from Guava.
*/
public abstract class BloomFilter {

public enum Version {
/**
* {@code BloomFilter} binary format version 1 (all values written in big-endian order):
* {@code BloomFilter} binary format version 1. All values written in big-endian order:
* <ul>
* <li>Version number, always 1 (32 bit)</li>
* <li>Number of hash functions (32 bit)</li>
@@ -68,14 +65,13 @@ int getVersionNumber() {
}

/**
* Returns the false positive probability, i.e. the probability that
* {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that
* has not actually been put in the {@code BloomFilter}.
* Returns the probability that {@linkplain #mightContain(Object)} erroneously returns {@code true}
* for an object that has not actually been put in the {@code BloomFilter}.
*
* <p>Ideally, this number should be close to the {@code fpp} parameter
* passed in to create this bloom filter, or smaller. If it is
* significantly higher, it is usually the case that too many elements (more than
* expected) have been put in the {@code BloomFilter}, degenerating it.
* Ideally, this number should be close to the {@code fpp} parameter passed in
* {@linkplain #create(long, double)}, or smaller. If it is significantly higher, it is usually
* the case that too many items (more than expected) have been put in the {@code BloomFilter},
* degenerating it.
*/
public abstract double expectedFpp();

@@ -85,8 +81,8 @@ int getVersionNumber() {
public abstract long bitSize();

/**
* Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
* {@link #mightContain(Object)} with the same element will always return {@code true}.
* Puts an item into this {@code BloomFilter}. Ensures that subsequent invocations of
* {@linkplain #mightContain(Object)} with the same item will always return {@code true}.
*
* @return true if the bloom filter's bits changed as a result of this operation. If the bits
* changed, this is <i>definitely</i> the first time {@code object} has been added to the
@@ -98,19 +94,19 @@ int getVersionNumber() {
public abstract boolean put(Object item);

/**
* A specialized variant of {@link #put(Object)}, that can only be used to put utf-8 string.
* A specialized variant of {@link #put(Object)} that only supports {@code String} items.
*/
public abstract boolean putString(String str);
public abstract boolean putString(String item);

/**
* A specialized variant of {@link #put(Object)}, that can only be used to put long.
* A specialized variant of {@link #put(Object)} that only supports {@code long} items.
*/
public abstract boolean putLong(long l);
public abstract boolean putLong(long item);

/**
* A specialized variant of {@link #put(Object)}, that can only be used to put byte array.
* A specialized variant of {@link #put(Object)} that only supports byte array items.
*/
public abstract boolean putBinary(byte[] bytes);
public abstract boolean putBinary(byte[] item);

/**
* Determines whether a given bloom filter is compatible with this bloom filter. For two
@@ -137,38 +133,36 @@ int getVersionNumber() {
public abstract boolean mightContain(Object item);

/**
* A specialized variant of {@link #mightContain(Object)}, that can only be used to test utf-8
* string.
* A specialized variant of {@link #mightContain(Object)} that only tests {@code String} items.
*/
public abstract boolean mightContainString(String str);
public abstract boolean mightContainString(String item);

/**
* A specialized variant of {@link #mightContain(Object)}, that can only be used to test long.
* A specialized variant of {@link #mightContain(Object)} that only tests {@code long} items.
*/
public abstract boolean mightContainLong(long l);
public abstract boolean mightContainLong(long item);

/**
* A specialized variant of {@link #mightContain(Object)}, that can only be used to test byte
* array.
* A specialized variant of {@link #mightContain(Object)} that only tests byte array items.
*/
public abstract boolean mightContainBinary(byte[] bytes);
public abstract boolean mightContainBinary(byte[] item);

/**
* Writes out this {@link BloomFilter} to an output stream in binary format.
* It is the caller's responsibility to close the stream.
* Writes out this {@link BloomFilter} to an output stream in binary format. It is the caller's
* responsibility to close the stream.
*/
public abstract void writeTo(OutputStream out) throws IOException;

/**
* Reads in a {@link BloomFilter} from an input stream.
* It is the caller's responsibility to close the stream.
* Reads in a {@link BloomFilter} from an input stream. It is the caller's responsibility to close
* the stream.
*/
public static BloomFilter readFrom(InputStream in) throws IOException {
return BloomFilterImpl.readFrom(in);
}

/**
* Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
* Computes the optimal k (number of hashes per item inserted in Bloom filter), given the
* expected insertions and total number of bits in the Bloom filter.
*
* See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
@@ -197,31 +191,46 @@ private static long optimalNumOfBits(long n, double p) {
static final double DEFAULT_FPP = 0.03;

/**
* Creates a {@link BloomFilter} with given {@code expectedNumItems} and the default {@code fpp}.
* Creates a {@link BloomFilter} with the expected number of insertions and a default expected
* false positive probability of 3%.
*
* Note that overflowing a {@code BloomFilter} with significantly more elements than specified
* will result in its saturation and a sharp deterioration of its false positive probability.
*/
public static BloomFilter create(long expectedNumItems) {
return create(expectedNumItems, DEFAULT_FPP);
}

/**
* Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code fpp}, it will pick
* an optimal {@code numBits} and {@code numHashFunctions} for the bloom filter.
* Creates a {@link BloomFilter} with the expected number of insertions and expected false
* positive probability.
*
* Note that overflowing a {@code BloomFilter} with significantly more elements than specified
* will result in its saturation and a sharp deterioration of its false positive probability.
*/
public static BloomFilter create(long expectedNumItems, double fpp) {
assert fpp > 0.0 : "False positive probability must be > 0.0";
assert fpp < 1.0 : "False positive probability must be < 1.0";
long numBits = optimalNumOfBits(expectedNumItems, fpp);
return create(expectedNumItems, numBits);
if (fpp <= 0D || fpp >= 1D) {
throw new IllegalArgumentException(
"False positive probability must be within range (0.0, 1.0)"
);
}

return create(expectedNumItems, optimalNumOfBits(expectedNumItems, fpp));
}

/**
* Creates a {@link BloomFilter} with given {@code expectedNumItems} and {@code numBits}, it will
* pick an optimal {@code numHashFunctions} which can minimize {@code fpp} for the bloom filter.
*/
public static BloomFilter create(long expectedNumItems, long numBits) {
assert expectedNumItems > 0 : "Expected insertions must be > 0";
assert numBits > 0 : "number of bits must be > 0";
int numHashFunctions = optimalNumOfHashFunctions(expectedNumItems, numBits);
return new BloomFilterImpl(numHashFunctions, numBits);
if (expectedNumItems <= 0) {
throw new IllegalArgumentException("Expected insertions must be positive");
}

if (numBits <= 0) {
throw new IllegalArgumentException("Number of bits must be positive");
}

return new BloomFilterImpl(optimalNumOfHashFunctions(expectedNumItems, numBits), numBits);
}
}
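
For reference, the Wikipedia link in the Javadoc above points at the standard Bloom filter sizing formulas, which are presumably the basis for optimalNumOfBits and optimalNumOfHashFunctions (their bodies are collapsed in this diff). A sketch in LaTeX, for n expected insertions and a target false positive probability p:

% m: number of bits, k: number of hash functions,
% for n expected insertions at target false positive probability p
m = \left\lceil \frac{-\, n \ln p}{(\ln 2)^2} \right\rceil,
\qquad
k = \max\left(1,\ \operatorname{round}\left(\frac{m}{n} \ln 2\right)\right)

This lines up with the call chain shown above: create(expectedNumItems, fpp) derives numBits from n and p, and create(expectedNumItems, numBits) then derives numHashFunctions from n and numBits.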
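
And a minimal usage sketch of the BloomFilter API shown above, assuming the class lives in the org.apache.spark.util.sketch package of the common/sketch module (the package declaration and file path are not visible in this excerpt):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.spark.util.sketch.BloomFilter;  // assumed package, not shown in the diff

public class BloomFilterExample {
  public static void main(String[] args) throws IOException {
    // Size the filter for ~1000 insertions with a 1% target false positive probability.
    BloomFilter filter = BloomFilter.create(1000, 0.01);

    // Insert items via the specialized put variants.
    filter.putString("spark");
    filter.putLong(42L);
    filter.putBinary(new byte[] {1, 2, 3});

    // One-sided error: a "false" answer is definite, a "true" answer is only probable.
    System.out.println(filter.mightContainString("spark"));  // true
    System.out.println(filter.mightContainLong(7L));         // false with high probability
    System.out.println(filter.expectedFpp());                // well below 0.01 while underfilled

    // Round-trip through the binary format; the caller is responsible for closing streams.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    filter.writeTo(out);
    out.close();
    BloomFilter copy = BloomFilter.readFrom(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(copy.mightContainString("spark"));    // true
  }
}

Note that with the rewritten create overloads, a non-positive expectedNumItems or an fpp outside (0.0, 1.0) now fails with an IllegalArgumentException instead of relying on assertions being enabled.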