Merge branch 'master' into fork-sparksession

apache · Mar 8, 2017 · 4c23e7a · 4c23e7a
2 parents 05abcf8 + e420fd4
commit 4c23e7a
Show file tree

Hide file tree

Showing 117 changed files with 3,394 additions and 1,053 deletions.
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
 #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
+#' Input SparkDataFrames can have different schemas (names and data types).
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",
 
 #' Union two or more SparkDataFrames
 #'
-#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
+#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
+#' requires that the input SparkDataFrames have the same column names.
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
 setMethod("rbind",
           signature(... = "SparkDataFrame"),
           function(x, ..., deparse.level = 1) {
+            nm <- lapply(list(x, ...), names)
+            if (length(unique(nm)) != 1) {
+              stop("Names of input data frames are different.")
+            }
             if (nargs() == 3) {
               union(x, ...)
             } else {

diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
   expect_equal(count(unioned2), 12)
   expect_equal(first(unioned2)$name, "Michael")
 
+  df3 <- df2
+  names(df3)[1] <- "newName"
+  expect_error(rbind(df, df3),
+               "Names of input data frames are different.")
+  expect_error(rbind(df, df2, df3),
+               "Names of input data frames are different.")
+
   excepted <- arrange(except(df, df2), desc(df$age))
   expect_is(unioned, "SparkDataFrame")
   expect_equal(count(excepted), 2)
@@ -2585,8 +2592,8 @@ test_that("coalesce, repartition, numPartitions", {
 
   df2 <- repartition(df1, 10)
   expect_equal(getNumPartitions(df2), 10)
-  expect_equal(getNumPartitions(coalesce(df2, 13)), 5)
-  expect_equal(getNumPartitions(coalesce(df2, 7)), 5)
+  expect_equal(getNumPartitions(coalesce(df2, 13)), 10)
+  expect_equal(getNumPartitions(coalesce(df2, 7)), 7)
   expect_equal(getNumPartitions(coalesce(df2, 3)), 3)
 })
 

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -850,26 +850,42 @@ public UTF8String translate(Map<Character, Character> dict) {
     return fromString(sb.toString());
   }
 
-  private int getDigit(byte b) {
-    if (b >= '0' && b <= '9') {
-      return b - '0';
-    }
-    throw new NumberFormatException(toString());
+  /**
+   * Wrapper over `long` to allow result of parsing long from string to be accessed via reference.
+   * This is done solely for better performance and is not expected to be used by end users.
+   */
+  public static class LongWrapper {
+    public long value = 0;
+  }
+
+  /**
+   * Wrapper over `int` to allow result of parsing integer from string to be accessed via reference.
+   * This is done solely for better performance and is not expected to be used by end users.
+   *
+   * {@link LongWrapper} could have been used here but using `int` directly save the extra cost of
+   * conversion from `long` -> `int`
+   */
+  public static class IntWrapper {
+    public int value = 0;
   }
 
   /**
    * Parses this UTF8String to long.
    *
    * Note that, in this method we accumulate the result in negative format, and convert it to
    * positive format at the end, if this string is not started with '-'. This is because min value
-   * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
-   * Integer.MIN_VALUE is '-2147483648'.
+   * is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
+   * Long.MIN_VALUE is '-9223372036854775808'.
    *
    * This code is mostly copied from LazyLong.parseLong in Hive.
+   *
+   * @param toLongResult If a valid `long` was parsed from this UTF8String, then its value would
+   *                     be set in `toLongResult`
+   * @return true if the parsing was successful else false
    */
-  public long toLong() {
+  public boolean toLong(LongWrapper toLongResult) {
     if (numBytes == 0) {
-      throw new NumberFormatException("Empty string");
+      return false;
     }
 
     byte b = getByte(0);
@@ -878,7 +894,7 @@ public long toLong() {
     if (negative || b == '+') {
       offset++;
       if (numBytes == 1) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
 
@@ -897,41 +913,48 @@ public long toLong() {
         break;
       }
 
-      int digit = getDigit(b);
+      int digit;
+      if (b >= '0' && b <= '9') {
+        digit = b - '0';
+      } else {
+        return false;
+      }
+
       // We are going to process the new digit and accumulate the result. However, before doing
       // this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
-      // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
+      // result * 10 will definitely be smaller than minValue, and we can stop.
       if (result < stopValue) {
-        throw new NumberFormatException(toString());
+        return false;
       }
 
       result = result * radix - digit;
       // Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
-      // can just use `result > 0` to check overflow. If result overflows, we should stop and throw
-      // exception.
+      // can just use `result > 0` to check overflow. If result overflows, we should stop.
       if (result > 0) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
 
     // This is the case when we've encountered a decimal separator. The fractional
     // part will not change the number, but we will verify that the fractional part
     // is well formed.
     while (offset < numBytes) {
-      if (getDigit(getByte(offset)) == -1) {
-        throw new NumberFormatException(toString());
+      byte currentByte = getByte(offset);
+      if (currentByte < '0' || currentByte > '9') {
+        return false;
       }
       offset++;
     }
 
     if (!negative) {
       result = -result;
       if (result < 0) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
 
-    return result;
+    toLongResult.value = result;
+    return true;
   }
 
   /**
@@ -946,10 +969,14 @@ public long toLong() {
    *
    * Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
    * reasons, like Hive does.
+   *
+   * @param intWrapper If a valid `int` was parsed from this UTF8String, then its value would
+   *                    be set in `intWrapper`
+   * @return true if the parsing was successful else false
    */
-  public int toInt() {
+  public boolean toInt(IntWrapper intWrapper) {
     if (numBytes == 0) {
-      throw new NumberFormatException("Empty string");
+      return false;
     }
 
     byte b = getByte(0);
@@ -958,7 +985,7 @@ public int toInt() {
     if (negative || b == '+') {
       offset++;
       if (numBytes == 1) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
 
@@ -977,61 +1004,69 @@ public int toInt() {
         break;
       }
 
-      int digit = getDigit(b);
+      int digit;
+      if (b >= '0' && b <= '9') {
+        digit = b - '0';
+      } else {
+        return false;
+      }
+
       // We are going to process the new digit and accumulate the result. However, before doing
       // this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
-      // result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
+      // result * 10 will definitely be smaller than minValue, and we can stop
       if (result < stopValue) {
-        throw new NumberFormatException(toString());
+        return false;
       }
 
       result = result * radix - digit;
       // Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
-      // we can just use `result > 0` to check overflow. If result overflows, we should stop and
-      // throw exception.
+      // we can just use `result > 0` to check overflow. If result overflows, we should stop
       if (result > 0) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
 
     // This is the case when we've encountered a decimal separator. The fractional
     // part will not change the number, but we will verify that the fractional part
     // is well formed.
     while (offset < numBytes) {
-      if (getDigit(getByte(offset)) == -1) {
-        throw new NumberFormatException(toString());
+      byte currentByte = getByte(offset);
+      if (currentByte < '0' || currentByte > '9') {
+        return false;
       }
       offset++;
     }
 
     if (!negative) {
       result = -result;
       if (result < 0) {
-        throw new NumberFormatException(toString());
+        return false;
       }
     }
-
-    return result;
+    intWrapper.value = result;
+    return true;
   }
 
-  public short toShort() {
-    int intValue = toInt();
-    short result = (short) intValue;
-    if (result != intValue) {
-      throw new NumberFormatException(toString());
+  public boolean toShort(IntWrapper intWrapper) {
+    if (toInt(intWrapper)) {
+      int intValue = intWrapper.value;
+      short result = (short) intValue;
+      if (result == intValue) {
+        return true;
+      }
     }
-
-    return result;
+    return false;
   }
 
-  public byte toByte() {
-    int intValue = toInt();
-    byte result = (byte) intValue;
-    if (result != intValue) {
-      throw new NumberFormatException(toString());
+  public boolean toByte(IntWrapper intWrapper) {
+    if (toInt(intWrapper)) {
+      int intValue = intWrapper.value;
+      byte result = (byte) intValue;
+      if (result == intValue) {
+        return true;
+      }
     }
-
-    return result;
+    return false;
   }
 
   @Override