[SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.

To maintain consistency with the Scala API.

Author: Reynold Xin <rxin@databricks.com>

Closes apache#5284 from rxin/df-na-alias and squashes the following commits:

19f46b7 [Reynold Xin] Show DataFrameNaFunctions in docs.
6618118 [Reynold Xin] [SPARK-6623][SQL] Alias DataFrame.na.drop and DataFrame.na.fill in Python.
rxin committed Mar 31, 2015
1 parent f07e714 commit b80a030
Showing 2 changed files with 45 additions and 6 deletions.
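
In practice the change lets PySpark code mirror Scala's `df.na.drop()` / `df.na.fill(...)` style. A minimal before/after sketch (assuming a DataFrame `df` with nullable `age` and `name` columns; the snippet is illustrative, not part of the commit):

    # The snake_case forms existed before this commit:
    df.dropna(how='any').show()
    df.fillna({'age': 50}).show()

    # The aliases added here delegate to exactly the same code paths:
    df.na.drop(how='any').show()
    df.na.fill({'age': 50}).show()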
10 changes: 6 additions & 4 deletions python/pyspark/sql/__init__.py
@@ -22,22 +22,24 @@
     Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
       A distributed collection of data grouped into named columns.
-    - L{GroupedData}
-      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
       A column expression in a :class:`DataFrame`.
     - L{Row}
       A row of data in a :class:`DataFrame`.
     - L{HiveContext}
       Main entry point for accessing data stored in Apache Hive.
+    - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
+    - L{DataFrameNaFunctions}
+      Methods for handling missing data (null values).
     - L{functions}
       List of built-in functions available for :class:`DataFrame`.
 """
 
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.types import Row
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions
 
 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
 ]
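
With `DataFrameNaFunctions` exported above, the class itself becomes importable, e.g. for `isinstance` checks; a small sketch under the same assumptions as before:

    from pyspark.sql import DataFrameNaFunctions

    isinstance(df.na, DataFrameNaFunctions)  # True for any DataFrame `df`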
41 changes: 39 additions & 2 deletions python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
 
 
-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]
 
 
 class DataFrame(object):
@@ -86,6 +86,12 @@ def applySchema(it):
 
         return self._lazy_rdd
 
+    @property
+    def na(self):
+        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
+        """
+        return DataFrameNaFunctions(self)
+
     def toJSON(self, use_unicode=False):
         """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row.
@@ -693,6 +699,8 @@ def subtract(self, other):
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
+        This is an alias for `na.drop`.
+
         :param how: 'any' or 'all'.
             If 'any', drop a row if it contains any nulls.
             If 'all', drop a row only if all its values are null.
@@ -704,6 +712,10 @@ def dropna(self, how='any', thresh=None, subset=None):
         >>> df4.dropna().show()
         age height name
         10  80     Alice
+
+        >>> df4.na.drop().show()
+        age height name
+        10  80     Alice
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -723,7 +735,7 @@ def dropna(self, how='any', thresh=None, subset=None):
         return DataFrame(self._jdf.na().drop(thresh, cols), self.sql_ctx)
 
     def fillna(self, value, subset=None):
-        """Replace null values.
+        """Replace null values, alias for `na.fill`.
 
         :param value: int, long, float, string, or dict.
             Value to replace null values with.
@@ -748,6 +760,13 @@ def fillna(self, value, subset=None):
         5   null   Bob
         50  null   Tom
         50  null   unknown
+
+        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+        age height name
+        10  80     Alice
+        5   null   Bob
+        50  null   Tom
+        50  null   unknown
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1134,6 +1153,24 @@ def __repr__(self):
         return 'Column<%s>' % self._jc.toString().encode('utf8')
 
 
+class DataFrameNaFunctions(object):
+    """Functionality for working with missing data in :class:`DataFrame`.
+    """
+
+    def __init__(self, df):
+        self.df = df
+
+    def drop(self, how='any', thresh=None, subset=None):
+        return self.df.dropna(how=how, thresh=thresh, subset=subset)
+
+    drop.__doc__ = DataFrame.dropna.__doc__
+
+    def fill(self, value, subset=None):
+        return self.df.fillna(value=value, subset=subset)
+
+    fill.__doc__ = DataFrame.fillna.__doc__
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext
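
The `__doc__` assignments mean the aliases and the originals share a single docstring, so the doctests embedded in `dropna`/`fillna` also cover the `na.*` entry points, and `help()` reports the same documentation either way; a quick sketch:

    from pyspark.sql.dataframe import DataFrame, DataFrameNaFunctions

    assert DataFrameNaFunctions.drop.__doc__ == DataFrame.dropna.__doc__
    help(df.na.fill)  # prints the fillna documentation (`df` assumed as before)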
