-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-11753][SQL][test-hadoop2.2] Make allowNonNumericNumbers option work #9759
Changes from all commits
2777677
b2a835d
186fa5e
dc9abc3
6d90b24
c74d715
6f668c3
1cfd1dc
4fca52c
0e473fc
025edea
af1e3a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -129,13 +129,15 @@ object JacksonParser extends Logging { | |
case (VALUE_STRING, FloatType) => | ||
// Special case handling for NaN and Infinity. | ||
val value = parser.getText | ||
val lowerCaseValue = value.toLowerCase() | ||
if (lowerCaseValue.equals("nan") || | ||
lowerCaseValue.equals("infinity") || | ||
lowerCaseValue.equals("-infinity") || | ||
lowerCaseValue.equals("inf") || | ||
lowerCaseValue.equals("-inf")) { | ||
if (value.equals("NaN") || | ||
value.equals("Infinity") || | ||
value.equals("+Infinity") || | ||
value.equals("-Infinity")) { | ||
value.toFloat | ||
} else if (value.equals("+INF") || value.equals("INF")) { | ||
Float.PositiveInfinity | ||
} else if (value.equals("-INF")) { | ||
Float.NegativeInfinity | ||
} else { | ||
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.") | ||
} | ||
|
@@ -146,13 +148,15 @@ object JacksonParser extends Logging { | |
case (VALUE_STRING, DoubleType) => | ||
// Special case handling for NaN and Infinity. | ||
val value = parser.getText | ||
val lowerCaseValue = value.toLowerCase() | ||
if (lowerCaseValue.equals("nan") || | ||
lowerCaseValue.equals("infinity") || | ||
lowerCaseValue.equals("-infinity") || | ||
lowerCaseValue.equals("inf") || | ||
lowerCaseValue.equals("-inf")) { | ||
if (value.equals("NaN") || | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "infinity".toDouble, "inf".toDouble are not legal. These non-numeric numbers are case-sensitive, both for Jackson and Scala. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should also allow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok. added. |
||
value.equals("Infinity") || | ||
value.equals("+Infinity") || | ||
value.equals("-Infinity")) { | ||
value.toDouble | ||
} else if (value.equals("+INF") || value.equals("INF")) { | ||
Double.PositiveInfinity | ||
} else if (value.equals("-INF")) { | ||
Double.NegativeInfinity | ||
} else { | ||
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.") | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.json | |
|
||
import org.apache.spark.sql.QueryTest | ||
import org.apache.spark.sql.test.SharedSQLContext | ||
import org.apache.spark.sql.types.{DoubleType, StructField, StructType} | ||
|
||
/** | ||
* Test cases for various [[JSONOptions]]. | ||
|
@@ -93,23 +94,51 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { | |
assert(df.first().getLong(0) == 18) | ||
} | ||
|
||
// The following two tests are not really working - need to look into Jackson's | ||
// JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. | ||
ignore("allowNonNumericNumbers off") { | ||
val str = """{"age": NaN}""" | ||
val rdd = spark.sparkContext.parallelize(Seq(str)) | ||
val df = spark.read.json(rdd) | ||
|
||
assert(df.schema.head.name == "_corrupt_record") | ||
test("allowNonNumericNumbers off") { | ||
// non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off. | ||
var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", | ||
"""{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""", | ||
"""{"age": +INF}""", """{"age": -INF}""") | ||
testCases.foreach { str => | ||
val rdd = spark.sparkContext.parallelize(Seq(str)) | ||
val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd) | ||
|
||
assert(df.schema.head.name == "_corrupt_record") | ||
} | ||
|
||
// quoted non-numeric numbers should still work even when allowNonNumericNumbers is off. | ||
testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""", | ||
"""{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""", | ||
"""{"age": "-INF"}""") | ||
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, | ||
_.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity) | ||
val schema = StructType(StructField("age", DoubleType, true) :: Nil) | ||
|
||
testCases.zipWithIndex.foreach { case (str, idx) => | ||
val rdd = spark.sparkContext.parallelize(Seq(str)) | ||
val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd) | ||
|
||
assert(df.schema.head.name == "age") | ||
assert(tests(idx)(df.first().getDouble(0))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why it's double type? Shouldn't it be string if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. from @rxin comment that we want to support quoted non-numeric numbers when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't make sense to me. What if users really want to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then I shouldn't change There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In other words, it should be number when the field is inferred as double/float type. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan I updated it. Non-quoted non-numeric numbers are parsed as double when the corresponding field is double/float type. This behavior is as same as before this patch. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did I say that anywhere? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea I don't think that's what I meant there. |
||
} | ||
} | ||
|
||
ignore("allowNonNumericNumbers on") { | ||
val str = """{"age": NaN}""" | ||
val rdd = spark.sparkContext.parallelize(Seq(str)) | ||
val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd) | ||
|
||
assert(df.schema.head.name == "age") | ||
assert(df.first().getDouble(0).isNaN) | ||
test("allowNonNumericNumbers on") { | ||
val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we still read them if they are quoted? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, so if we don't set |
||
"""{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""", | ||
"""{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""", | ||
"""{"age": "-Infinity"}""") | ||
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity, | ||
_.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, | ||
_.isNegInfinity, _.isPosInfinity, _.isNegInfinity) | ||
val schema = StructType(StructField("age", DoubleType, true) :: Nil) | ||
testCases.zipWithIndex.foreach { case (str, idx) => | ||
val rdd = spark.sparkContext.parallelize(Seq(str)) | ||
val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd) | ||
|
||
assert(df.schema.head.name == "age") | ||
assert(tests(idx)(df.first().getDouble(0))) | ||
} | ||
} | ||
|
||
test("allowBackslashEscapingAnyCharacter off") { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are we removing the special handling for float types here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea, we should revert it back. BTW, did we actually test "inf" and "-inf" before? Because "inf".toFloat is not legal.