[SPARK-7319][SQL] Improve the output from DataFrame.show()
Author: 云峤 <chensong.cs@alibaba-inc.com>

Closes apache#5865 from kaka1992/df.show and squashes the following commits:

c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN
云峤 authored and nemccarthy committed Jun 19, 2015
1 parent a2f7615 commit 383b930
Showing 5 changed files with 112 additions and 44 deletions.
2 changes: 1 addition & 1 deletion R/pkg/R/DataFrame.R
@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })
 
 #' show
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/test_sparkSQL.R
@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {

 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "age name \nnull Michael\n30 Andy \n19 Justin ")
+  expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
 })
 
 test_that("isLocal()", {
105 changes: 69 additions & 36 deletions python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ def show(self, n=20):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))

@@ -591,12 +594,15 @@ def describe(self, *cols):
         given, this function computes statistics for all numerical columns.
         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
         :param subset: optional list of column names to consider.
         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
         then the non-string column is simply ignored.
         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+
         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1241,11 +1262,17 @@ def getItem(self, key):
         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]

@@ -1255,11 +1282,17 @@ def getField(self, name):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))

28 changes: 22 additions & 6 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.sql.DriverManager
 
+
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal

 import com.fasterxml.jackson.core.JsonFactory
 
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
    * @param numRows Number of rows to show
    */
   private[sql] def showString(numRows: Int): String = {
+    val sb = new StringBuilder
     val data = take(numRows)
     val numCols = schema.fieldNames.length
 
@@ -194,12 +197,25 @@ class DataFrame private[sql](
       }
     }
 
-    // Pad the cells
-    rows.map { row =>
-      row.zipWithIndex.map { case (cell, i) =>
-        String.format(s"%-${colWidths(i)}s", cell)
-      }.mkString(" ")
-    }.mkString("\n")
+    // Create SeparateLine
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      StringUtils.leftPad(cell.toString, colWidths(i))
+    }.addString(sb, "|", "|", "|\n")
+
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        StringUtils.leftPad(cell.toString, colWidths(i))
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    sb.append(sep)
+    sb.toString()
   }
 
   override def toString: String = {
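For readers who want the layout algorithm in isolation: below is a minimal standalone sketch of the grid rendering that the new showString performs, under the same scheme as the diff above (each column as wide as its widest cell, +---+ separator lines framing the header and the data, cells right-aligned by left-padding). It is illustrative only: renderGrid is a hypothetical name, plain string padding stands in for Commons Lang's StringUtils.leftPad, and none of it is Spark API.

object ShowStringSketch {
  // Render rows (first element = header) as a framed, right-aligned grid.
  def renderGrid(rows: Seq[Seq[String]]): String = {
    val numCols = rows.head.length
    // Each column is as wide as its widest cell.
    val colWidths = (0 until numCols).map(i => rows.map(_(i).length).max)
    // Separator line such as +---+-----+
    val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
    def line(row: Seq[String]): String =
      row.zipWithIndex.map { case (cell, i) =>
        " " * (colWidths(i) - cell.length) + cell  // left-pad to right-align
      }.mkString("|", "|", "|\n")
    val sb = new StringBuilder
    sb.append(sep).append(line(rows.head)).append(sep)  // framed header
    rows.tail.foreach(row => sb.append(line(row)))      // data rows, if any
    sb.append(sep).toString()
  }

  def main(args: Array[String]): Unit =
    print(renderGrid(Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))))
}

Running it prints the same framed table as the df.show() doctest above; with a header-only input (empty rows.tail) it degenerates to the header between two separator lines, which is exactly the empty-DataFrame shape asserted by the SPARK-7327 test below.
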
19 changes: 19 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }
 
+  test("SPARK-7319 showString") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           ||  1|    1|
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").showString(1) === expectedAnswer)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
+  }
+
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
     val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
