Skip to content

Commit

Permalink
[SPARK-47309][SQL][XML] Add schema inference unit tests
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

As titled: add unit tests for XML schema inference, along with the XML fixture data they use.

### Why are the changes needed?

Fix a bug.

### Does this PR introduce _any_ user-facing change?

Yes

### How was this patch tested?

Unit tests

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#45411 from shujingyang-db/xml-inference-check.

Authored-by: Shujing Yang <shujing.yang@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
shujingyang-db authored and sweisdb committed Apr 1, 2024
1 parent 7a41a34 commit 97b0d97
Show file tree
Hide file tree
Showing 2 changed files with 606 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,314 @@ private[xml] trait TestXmlData {
f(dir)
fs.setVerifyChecksum(true)
}

// Fixture: four <ROW> records that all use the same six leaf fields
// (num_num_1, num_num_2, num_num_3, num_bool, num_str, str_bool) but put
// differently shaped text in them from record to record: small and large
// integers, decimals, booleans, plain strings and empty elements.
// For example num_bool is `true`, `12`, `false` and empty across the four
// records, and num_str is `13.1`, empty, `str1` and `92233720368547758070`.
// Every literal goes through stripMargin, so the leading `|` on each line is
// removed; the first record starts directly with <ROW>, the other three start
// with a newline.
def primitiveFieldValueTypeConflict: Seq[String] =
"""<ROW>
| <num_num_1>11</num_num_1>
| <num_num_2/>
| <num_num_3>1.1</num_num_3>
| <num_bool>true</num_bool>
| <num_str>13.1</num_str>
| <str_bool>str1</str_bool>
|</ROW>
|""".stripMargin ::
"""
|<ROW>
| <num_num_1/>
| <num_num_2>21474836470.9</num_num_2>
| <num_num_3/>
| <num_bool>12</num_bool>
| <num_str/>
| <str_bool>true</str_bool>
|</ROW>""".stripMargin ::
"""
|<ROW>
| <num_num_1>21474836470</num_num_1>
| <num_num_2>92233720368547758070</num_num_2>
| <num_num_3>100</num_num_3>
| <num_bool>false</num_bool>
| <num_str>str1</num_str>
| <str_bool>false</str_bool>
|</ROW>""".stripMargin ::
"""
|<ROW>
| <num_num_1>21474836570</num_num_1>
| <num_num_2>1.1</num_num_2>
| <num_num_3>21474836470</num_num_3>
| <num_bool/>
| <num_str>92233720368547758070</num_str>
| <str_bool/>
|</ROW>""".stripMargin :: Nil

// Fixture: four <ROW> records with the fields nullstr, ip and headers.
//  - nullstr is empty in every record (`<nullstr></nullstr>` in the first
//    three, self-closing `<nullstr/>` in the last).
//  - ip is the same value, 27.31.100.29, in every record.
//  - headers has children (Host, Charset) only in the first record; in the
//    remaining records it is empty, written both as `<headers/>` and as
//    `<headers></headers>`.
// Every literal goes through stripMargin.
def xmlNullStruct: Seq[String] =
"""<ROW>
| <nullstr></nullstr>
| <ip>27.31.100.29</ip>
| <headers>
| <Host>1.abc.com</Host>
| <Charset>UTF-8</Charset>
| </headers>
|</ROW>""".stripMargin ::
"""<ROW>
| <nullstr></nullstr>
| <ip>27.31.100.29</ip>
| <headers/>
|</ROW>""".stripMargin ::
"""<ROW>
| <nullstr></nullstr>
| <ip>27.31.100.29</ip>
| <headers></headers>
|</ROW>""".stripMargin ::
"""<ROW>
| <nullstr/>
| <ip>27.31.100.29</ip>
| <headers/>
|</ROW>""".stripMargin :: Nil

// Fixture: four <ROW> records in which the same field name appears with
// different structural shapes from record to record:
//  - num_struct: the text `11`, an element with a nested <field>, and empty.
//  - str_array: repeated elements (1, 2, 3), empty, a single `str`, and
//    repeated elements mixing strings with `33`.
//  - array: empty, empty, repeated 4/5/6, and a single `7`.
//  - struct_array: empty, empty, repeated 7/8/9, and one element with a
//    nested <field>.
//  - struct: empty, empty, a nested empty <field/>, and a nested
//    <field>str</field>.
// NOTE: these literals are NOT passed through stripMargin, so every line
// break and any leading whitespace inside them is part of the record text.
def complexFieldValueTypeConflict: Seq[String] =
"""<ROW>
<num_struct>11</num_struct>
<str_array>1</str_array>
<str_array>2</str_array>
<str_array>3</str_array>
<array></array>
<struct_array></struct_array>
<struct></struct>
</ROW>""" ::
"""<ROW>
<num_struct>
<field>false</field>
</num_struct>
<str_array/>
<array/>
<struct_array></struct_array>
<struct/>
</ROW>""" ::
"""<ROW>
<num_struct/>
<str_array>str</str_array>
<array>4</array>
<array>5</array>
<array>6</array>
<struct_array>7</struct_array>
<struct_array>8</struct_array>
<struct_array>9</struct_array>
<struct>
<field/>
</struct>
</ROW>""" ::
"""<ROW>
<num_struct></num_struct>
<str_array>str1</str_array>
<str_array>str2</str_array>
<str_array>33</str_array>
<array>7</array>
<struct_array>
<field>true</field>
</struct_array>
<struct>
<field>str</field>
</struct>
</ROW>""" :: Nil

// Fixture: three <ROW> records whose repeated elements hold children of
// differing shapes.
//  - Record 1: <array1> appears twice. Its <element> children carry `1`,
//    `1.1`, `true`, an empty element, and elements wrapping a nested <array>
//    or <object> (empty in the first <array1>, populated in the second).
//    <array2> appears twice with a nested <field> of `214748364700` and `1`.
//  - Record 2: <array3> appears twice, each with a nested <field> (`str`, `1`).
//  - Record 3: <array3> appears three times with plain text values 1, 2, 3,
//    i.e. the same name as in record 2 but without the nested <field>.
// Every literal goes through stripMargin and starts with a newline.
def arrayElementTypeConflict: Seq[String] =
"""
|<ROW>
| <array1>
| <element>1</element>
| <element>1.1</element>
| <element>true</element>
| <element/>
| <element>
| <array/>
| </element>
| <element>
| <object/>
| </element>
| </array1>
| <array1>
| <element>
| <array>
| <element>2</element>
| <element>3</element>
| <element>4</element>
| </array>
| </element>
| <element>
| <object>
| <field>str</field>
| </object>
| </element>
| </array1>
| <array2>
| <field>214748364700</field>
| </array2>
| <array2>
| <field>1</field>
| </array2>
|</ROW>
|""".stripMargin ::
"""
|<ROW>
| <array3>
| <field>str</field>
| </array3>
| <array3>
| <field>1</field>
| </array3>
|</ROW>
|""".stripMargin ::
"""
|<ROW>
| <array3>1</array3>
| <array3>2</array3>
| <array3>3</array3>
|</ROW>
|""".stripMargin :: Nil

// Fixture: five single-line <ROW> records, each containing a different field
// name, so every field is absent from four of the five records:
//  a = `true`, b = `21474836470`, c = repeated (`33`, `44`),
//  d = element with a nested <field>true</field>, e = `str`.
// NOTE: no stripMargin here; each literal begins and ends with a newline that
// is part of the record text.
def missingFields: Seq[String] =
"""
<ROW><a>true</a></ROW>
""" ::
"""
<ROW><b>21474836470</b></ROW>
""" ::
"""
<ROW><c>33</c><c>44</c></ROW>
""" ::
"""
<ROW><d><field>true</field></d></ROW>
""" ::
"""
<ROW><e>str</e></ROW>
""" :: Nil

// XML doesn't support array of arrays
// It only supports array of structs
//
// Fixture: a single <ROW> record (one-element list) combining many field
// shapes in one document:
//  - struct: nested field1 (`true`) and field2 (`92233720368547758070`).
//  - structWithArrayFields: repeated field1 (4, 5, 6) and repeated field2
//    (str1, str2) inside one parent element.
//  - arrayOfString / arrayOfInteger / arrayOfLong / arrayOfBigInteger /
//    arrayOfDouble / arrayOfBoolean: repeated leaf elements whose values match
//    the field name, including boundary values such as 2147483647,
//    -2147483648, 9223372036854775807 and -9223372036854775808.
//  - arrayOfNull: two empty elements.
//  - arrayOfStruct: three elements with differing child sets
//    (field1 + field2, field1 only, empty field3).
//  - arrayOfArray1 / arrayOfArray2: repeated elements that each wrap repeated
//    <item> children (numbers vs. strings, integers vs. decimals).
// The literal goes through stripMargin.
def complexFieldAndType1: Seq[String] =
"""
|<ROW>
| <struct>
| <field1>true</field1>
| <field2>92233720368547758070</field2>
| </struct>
| <structWithArrayFields>
| <field1>4</field1>
| <field1>5</field1>
| <field1>6</field1>
| <field2>str1</field2>
| <field2>str2</field2>
| </structWithArrayFields>
| <arrayOfString>str1</arrayOfString>
| <arrayOfString>str2</arrayOfString>
| <arrayOfInteger>1</arrayOfInteger>
| <arrayOfInteger>2147483647</arrayOfInteger>
| <arrayOfInteger>-2147483648</arrayOfInteger>
| <arrayOfLong>21474836470</arrayOfLong>
| <arrayOfLong>9223372036854775807</arrayOfLong>
| <arrayOfLong>-9223372036854775808</arrayOfLong>
| <arrayOfBigInteger>922337203685477580700</arrayOfBigInteger>
| <arrayOfBigInteger>-922337203685477580800</arrayOfBigInteger>
| <arrayOfDouble>1.2</arrayOfDouble>
| <arrayOfDouble>1.7976931348623157</arrayOfDouble>
| <arrayOfDouble>4.9E-324</arrayOfDouble>
| <arrayOfDouble>2.2250738585072014E-308</arrayOfDouble>
| <arrayOfBoolean>true</arrayOfBoolean>
| <arrayOfBoolean>false</arrayOfBoolean>
| <arrayOfBoolean>true</arrayOfBoolean>
| <arrayOfNull></arrayOfNull>
| <arrayOfNull></arrayOfNull>
| <arrayOfStruct>
| <field1>true</field1>
| <field2>str1</field2>
| </arrayOfStruct>
| <arrayOfStruct>
| <field1>false</field1>
| </arrayOfStruct>
| <arrayOfStruct>
| <field3/>
| </arrayOfStruct>
|<arrayOfArray1>
| <item>1</item><item>2</item><item>3</item>
|</arrayOfArray1>
|<arrayOfArray1>
| <item>str1</item><item>str2</item>
|</arrayOfArray1>
|<arrayOfArray2>
| <item>1</item><item>2</item><item>3</item>
|</arrayOfArray2>
|<arrayOfArray2>
| <item>1.1</item><item>2.1</item><item>3.1</item>
|</arrayOfArray2>
|</ROW>
|
|""".stripMargin :: Nil

// Fixture: a single <ROW> record (one-element list) with deeper nesting of
// repeated elements:
//  - arrayOfArray1 appears twice; each occurrence wraps one or more <array>
//    children, which in turn wrap repeated <item> leaves (5; 6, 7; 8).
//  - arrayOfArray2 appears three times; its <array> children (one of them an
//    empty <array/>) wrap <item> elements whose children vary between
//    occurrences: inner1, repeated inner2, inner2 + inner1, and inner3
//    (one with repeated inner4 children, one empty).
// The literal goes through stripMargin.
def complexFieldAndType2: Seq[String] =
"""
|<ROW>
| <arrayOfArray1>
| <array>
| <item>5</item>
| </array>
|</arrayOfArray1>
|<arrayOfArray1>
| <array>
| <item>6</item><item>7</item>
| </array>
| <array>
| <item>8</item>
| </array>
|</arrayOfArray1>
| <arrayOfArray2>
| <array>
| <item>
| <inner1>str1</inner1>
| </item>
| </array>
|</arrayOfArray2>
|<arrayOfArray2>
| <array/>
| <array>
| <item>
| <inner2>str3</inner2>
| <inner2>str33</inner2>
| </item>
| <item>
| <inner2>str4</inner2>
| <inner1>str11</inner1>
| </item>
| </array>
|</arrayOfArray2>
|<arrayOfArray2>
| <array>
| <item>
| <inner3>
| <inner4>2</inner4>
| <inner4>3</inner4>
| </inner3>
| <inner3/>
| </item>
| </array>
|</arrayOfArray2>
|</ROW>
|""".stripMargin :: Nil

// Fixture: three <ROW> records built almost entirely from empty elements:
//  - Record 1: <a> wrapping an empty <struct></struct>.
//  - Record 2: <a> wrapping <struct> -> <b> -> empty <c/>.
//  - Record 3: <b> with two <item> children, one wrapping <c> with an empty
//    <struct></struct> and one self-closing <item/>.
// Note that `b` is nested under a/struct in record 2 but is a top-level field
// in record 3.
// NOTE: no stripMargin here, so line breaks and any leading whitespace inside
// the literals are part of the record text.
def emptyRecords: Seq[String] =
"""<ROW>
<a><struct></struct></a>
</ROW>""" ::
"""<ROW>
<a>
<struct><b><c/></b></struct>
</a>
</ROW>""" ::
"""<ROW>
<b>
<item>
<c><struct></struct></c>
</item>
<item/>
</b>
</ROW>""" :: Nil
}
Loading

0 comments on commit 97b0d97

Please sign in to comment.