diff --git a/python/pyspark/sql/__init__.py b/python/pyspark/sql/__init__.py
index 54a01631d8899..9d39e5d9c2449 100644
--- a/python/pyspark/sql/__init__.py
+++ b/python/pyspark/sql/__init__.py
@@ -22,22 +22,24 @@
       Main entry point for :class:`DataFrame` and SQL functionality.
     - L{DataFrame}
       A distributed collection of data grouped into named columns.
-    - L{GroupedData}
-      Aggregation methods, returned by :func:`DataFrame.groupBy`.
     - L{Column}
       A column expression in a :class:`DataFrame`.
     - L{Row}
       A row of data in a :class:`DataFrame`.
     - L{HiveContext}
       Main entry point for accessing data stored in Apache Hive.
+    - L{GroupedData}
+      Aggregation methods, returned by :func:`DataFrame.groupBy`.
+    - L{DataFrameNaFunctions}
+      Methods for handling missing data (null values).
     - L{functions}
       List of built-in functions available for :class:`DataFrame`.
 """
 
 from pyspark.sql.context import SQLContext, HiveContext
 from pyspark.sql.types import Row
-from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD
+from pyspark.sql.dataframe import DataFrame, GroupedData, Column, SchemaRDD, DataFrameNaFunctions
 
 __all__ = [
-    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
+    'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row', 'DataFrameNaFunctions'
 ]
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 4f174de811697..15508023326cc 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -31,7 +31,7 @@
 from pyspark.sql.types import _create_cls, _parse_datatype_json_string
 
 
-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]
 
 
 class DataFrame(object):
@@ -86,6 +86,12 @@ def applySchema(it):
 
         return self._lazy_rdd
 
+    @property
+    def na(self):
+        """Returns a :class:`DataFrameNaFunctions` for handling missing values.
+        """
+        return DataFrameNaFunctions(self)
+
     def toJSON(self, use_unicode=False):
         """Convert a :class:`DataFrame` into a MappedRDD of JSON documents; one document per row.
 
@@ -693,6 +699,8 @@ def subtract(self, other):
     def dropna(self, how='any', thresh=None, subset=None):
         """Returns a new :class:`DataFrame` omitting rows with null values.
 
+        This is an alias for `na.drop`.
+
        :param how: 'any' or 'all'.
            If 'any', drop a row if it contains any nulls.
            If 'all', drop a row only if all its values are null.
@@ -704,6 +712,10 @@ def dropna(self, how='any', thresh=None, subset=None):
         >>> df4.dropna().show()
         age height name
         10  80     Alice
+
+        >>> df4.na.drop().show()
+        age height name
+        10  80     Alice
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -723,7 +735,7 @@ def dropna(self, how='any', thresh=None, subset=None):
         return DataFrame(self._jdf.na().drop(thresh, cols), self.sql_ctx)
 
     def fillna(self, value, subset=None):
-        """Replace null values.
+        """Replace null values, alias for `na.fill`.
 
         :param value: int, long, float, string, or dict.
             Value to replace null values with.
@@ -748,6 +760,13 @@ def fillna(self, value, subset=None):
         5   null   Bob
         50  null   Tom
         50  null   unknown
+
+        >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
+        age height name
+        10  80     Alice
+        5   null   Bob
+        50  null   Tom
+        50  null   unknown
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1134,6 +1153,24 @@ def __repr__(self):
         return 'Column<%s>' % self._jc.toString().encode('utf8')
 
 
+class DataFrameNaFunctions(object):
+    """Functionality for working with missing data in :class:`DataFrame`.
+    """
+
+    def __init__(self, df):
+        self.df = df
+
+    def drop(self, how='any', thresh=None, subset=None):
+        return self.df.dropna(how=how, thresh=thresh, subset=subset)
+
+    drop.__doc__ = DataFrame.dropna.__doc__
+
+    def fill(self, value, subset=None):
+        return self.df.fillna(value=value, subset=subset)
+
+    fill.__doc__ = DataFrame.fillna.__doc__
+
+
 def _test():
     import doctest
     from pyspark.context import SparkContext
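For reference, a minimal usage sketch of the `na` property this patch introduces. The SparkContext/SQLContext setup and the sample rows are illustrative assumptions modelled on the `df4` doctest fixture, not part of the diff; only `df.na.drop()` and `df.na.fill(...)` come from the patch itself.

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext(appName="na_usage_sketch")   # hypothetical app name
sqlCtx = SQLContext(sc)

# Assumed sample data with missing values (mirrors the df4 doctest fixture).
rows = [Row(name='Alice', age=10, height=80),
        Row(name='Bob', age=5, height=None),
        Row(name='Tom', age=None, height=None),
        Row(name=None, age=None, height=None)]
df = sqlCtx.createDataFrame(sc.parallelize(rows))

# Equivalent to df.dropna(): drop any row containing a null.
df.na.drop().show()

# Equivalent to df.fillna({...}): fill nulls per column.
df.na.fill({'age': 50, 'name': 'unknown'}).show()

sc.stop()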