Skip to content

Commit

Permalink
Merge branch 'master' into fork-sparksession
Browse files Browse the repository at this point in the history
  • Loading branch information
kunalkhamar authored Mar 8, 2017
2 parents 05abcf8 + e420fd4 commit 4c23e7a
Show file tree
Hide file tree
Showing 117 changed files with 3,394 additions and 1,053 deletions.
8 changes: 7 additions & 1 deletion R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
#' Input SparkDataFrames can have different schemas (names and data types).
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#'
Expand Down Expand Up @@ -2685,7 +2686,8 @@ setMethod("unionAll",

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
#' requires that the input SparkDataFrames have the same column names.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#'
Expand All @@ -2709,6 +2711,10 @@ setMethod("unionAll",
setMethod("rbind",
signature(... = "SparkDataFrame"),
function(x, ..., deparse.level = 1) {
nm <- lapply(list(x, ...), names)
if (length(unique(nm)) != 1) {
stop("Names of input data frames are different.")
}
if (nargs() == 3) {
union(x, ...)
} else {
Expand Down
11 changes: 9 additions & 2 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
expect_equal(count(unioned2), 12)
expect_equal(first(unioned2)$name, "Michael")

df3 <- df2
names(df3)[1] <- "newName"
expect_error(rbind(df, df3),
"Names of input data frames are different.")
expect_error(rbind(df, df2, df3),
"Names of input data frames are different.")

excepted <- arrange(except(df, df2), desc(df$age))
expect_is(unioned, "SparkDataFrame")
expect_equal(count(excepted), 2)
Expand Down Expand Up @@ -2585,8 +2592,8 @@ test_that("coalesce, repartition, numPartitions", {

df2 <- repartition(df1, 10)
expect_equal(getNumPartitions(df2), 10)
expect_equal(getNumPartitions(coalesce(df2, 13)), 5)
expect_equal(getNumPartitions(coalesce(df2, 7)), 5)
expect_equal(getNumPartitions(coalesce(df2, 13)), 10)
expect_equal(getNumPartitions(coalesce(df2, 7)), 7)
expect_equal(getNumPartitions(coalesce(df2, 3)), 3)
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -850,26 +850,42 @@ public UTF8String translate(Map<Character, Character> dict) {
return fromString(sb.toString());
}

private int getDigit(byte b) {
if (b >= '0' && b <= '9') {
return b - '0';
}
throw new NumberFormatException(toString());
/**
* Wrapper over `long` to allow result of parsing long from string to be accessed via reference.
* This is done solely for better performance and is not expected to be used by end users.
*/
public static class LongWrapper {
public long value = 0;
}

/**
* Wrapper over `int` to allow result of parsing integer from string to be accessed via reference.
* This is done solely for better performance and is not expected to be used by end users.
*
* {@link LongWrapper} could have been used here but using `int` directly save the extra cost of
* conversion from `long` -> `int`
*/
public static class IntWrapper {
public int value = 0;
}

/**
* Parses this UTF8String to long.
*
* Note that, in this method we accumulate the result in negative format, and convert it to
* positive format at the end, if this string is not started with '-'. This is because min value
* is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and
* Integer.MIN_VALUE is '-2147483648'.
* is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and
* Long.MIN_VALUE is '-9223372036854775808'.
*
* This code is mostly copied from LazyLong.parseLong in Hive.
*
* @param toLongResult If a valid `long` was parsed from this UTF8String, then its value would
* be set in `toLongResult`
* @return true if the parsing was successful else false
*/
public long toLong() {
public boolean toLong(LongWrapper toLongResult) {
if (numBytes == 0) {
throw new NumberFormatException("Empty string");
return false;
}

byte b = getByte(0);
Expand All @@ -878,7 +894,7 @@ public long toLong() {
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
throw new NumberFormatException(toString());
return false;
}
}

Expand All @@ -897,41 +913,48 @@ public long toLong() {
break;
}

int digit = getDigit(b);
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return false;
}

// We are going to process the new digit and accumulate the result. However, before doing
// this, if the result is already smaller than the stopValue(Long.MIN_VALUE / radix), then
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
// result * 10 will definitely be smaller than minValue, and we can stop.
if (result < stopValue) {
throw new NumberFormatException(toString());
return false;
}

result = result * radix - digit;
// Since the previous result is less than or equal to stopValue(Long.MIN_VALUE / radix), we
// can just use `result > 0` to check overflow. If result overflows, we should stop and throw
// exception.
// can just use `result > 0` to check overflow. If result overflows, we should stop.
if (result > 0) {
throw new NumberFormatException(toString());
return false;
}
}

// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
if (getDigit(getByte(offset)) == -1) {
throw new NumberFormatException(toString());
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
}
offset++;
}

if (!negative) {
result = -result;
if (result < 0) {
throw new NumberFormatException(toString());
return false;
}
}

return result;
toLongResult.value = result;
return true;
}

/**
Expand All @@ -946,10 +969,14 @@ public long toLong() {
*
* Note that, this method is almost same as `toLong`, but we leave it duplicated for performance
* reasons, like Hive does.
*
* @param intWrapper If a valid `int` was parsed from this UTF8String, then its value would
* be set in `intWrapper`
* @return true if the parsing was successful else false
*/
public int toInt() {
public boolean toInt(IntWrapper intWrapper) {
if (numBytes == 0) {
throw new NumberFormatException("Empty string");
return false;
}

byte b = getByte(0);
Expand All @@ -958,7 +985,7 @@ public int toInt() {
if (negative || b == '+') {
offset++;
if (numBytes == 1) {
throw new NumberFormatException(toString());
return false;
}
}

Expand All @@ -977,61 +1004,69 @@ public int toInt() {
break;
}

int digit = getDigit(b);
int digit;
if (b >= '0' && b <= '9') {
digit = b - '0';
} else {
return false;
}

// We are going to process the new digit and accumulate the result. However, before doing
// this, if the result is already smaller than the stopValue(Integer.MIN_VALUE / radix), then
// result * 10 will definitely be smaller than minValue, and we can stop and throw exception.
// result * 10 will definitely be smaller than minValue, and we can stop
if (result < stopValue) {
throw new NumberFormatException(toString());
return false;
}

result = result * radix - digit;
// Since the previous result is less than or equal to stopValue(Integer.MIN_VALUE / radix),
// we can just use `result > 0` to check overflow. If result overflows, we should stop and
// throw exception.
// we can just use `result > 0` to check overflow. If result overflows, we should stop
if (result > 0) {
throw new NumberFormatException(toString());
return false;
}
}

// This is the case when we've encountered a decimal separator. The fractional
// part will not change the number, but we will verify that the fractional part
// is well formed.
while (offset < numBytes) {
if (getDigit(getByte(offset)) == -1) {
throw new NumberFormatException(toString());
byte currentByte = getByte(offset);
if (currentByte < '0' || currentByte > '9') {
return false;
}
offset++;
}

if (!negative) {
result = -result;
if (result < 0) {
throw new NumberFormatException(toString());
return false;
}
}

return result;
intWrapper.value = result;
return true;
}

public short toShort() {
int intValue = toInt();
short result = (short) intValue;
if (result != intValue) {
throw new NumberFormatException(toString());
public boolean toShort(IntWrapper intWrapper) {
if (toInt(intWrapper)) {
int intValue = intWrapper.value;
short result = (short) intValue;
if (result == intValue) {
return true;
}
}

return result;
return false;
}

public byte toByte() {
int intValue = toInt();
byte result = (byte) intValue;
if (result != intValue) {
throw new NumberFormatException(toString());
public boolean toByte(IntWrapper intWrapper) {
if (toInt(intWrapper)) {
int intValue = intWrapper.value;
byte result = (byte) intValue;
if (result == intValue) {
return true;
}
}

return result;
return false;
}

@Override
Expand Down
Loading

0 comments on commit 4c23e7a

Please sign in to comment.