diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6620afab850a7..5140a96d2a09e 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -309,8 +309,9 @@ Bug Fixes Conversion ^^^^^^^^^^ -- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`) +- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) Indexing diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 33b41d61aa978..b367fda002b74 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ from distutils.version import LooseVersion from itertools import product import sys +import platform import types from unicodedata import east_asian_width import struct @@ -41,6 +42,7 @@ PY3 = (sys.version_info[0] >= 3) PY35 = (sys.version_info >= (3, 5)) PY36 = (sys.version_info >= (3, 6)) +PYPY = (platform.python_implementation() == 'PyPy') try: import __builtin__ as builtins diff --git a/pandas/core/base.py b/pandas/core/base.py index 8f21e3125a27e..4ae4736035793 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -15,6 +15,7 @@ import pandas.core.nanops as nanops import pandas._libs.lib as lib from pandas.compat.numpy import function as nv +from pandas.compat import PYPY from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError @@ -1061,7 +1062,7 @@ def memory_usage(self, deep=False): Notes ----- Memory usage does not include memory consumed by elements that - are not components of the array if deep=False + are not components of the array if deep=False or if used on PyPy See Also -------- @@ -1071,9 +1072,8 @@ def memory_usage(self, deep=False): return self.values.memory_usage(deep=deep) v = self.values.nbytes - if deep and is_object_dtype(self): + if deep and is_object_dtype(self) and not PYPY: v += lib.memory_usage_of_objects(self.values) - return v def factorize(self, sort=False, na_sentinel=-1): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 420788f9008cd..ea45b4700172f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -465,9 +465,13 @@ def _nbytes(self, deep=False): *this is in internal routine* """ + + # for implementations with no useful getsizeof (PyPy) + objsize = 24 + level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels)) label_nbytes = sum((i.nbytes for i in self.labels)) - names_nbytes = sum((getsizeof(i) for i in self.names)) + names_nbytes = sum((getsizeof(i, objsize) for i in self.names)) result = level_nbytes + label_nbytes + names_nbytes # include our engine hashtable diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5071b50bbebdf..ac4cc6986cace 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -194,8 +194,12 @@ def _format_data(self): @cache_readonly def nbytes(self): - """ return the number of bytes in the underlying data """ - return sum([getsizeof(getattr(self, v)) for v in + """ + Return the number of bytes in the underlying data + On implementations where this is undetermined (PyPy) + assume 24 bytes for each value + """ + return sum([getsizeof(getattr(self, v), 24) for v in ['_start', '_stop', '_step']]) def memory_usage(self, deep=False): diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index c317ad542659a..37f8c0cc85b23 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -11,7 +11,7 @@ import pytest from pandas import (DataFrame, compat, option_context) -from pandas.compat import StringIO, lrange, u +from pandas.compat import StringIO, lrange, u, PYPY import pandas.io.formats.format as fmt import pandas as pd @@ -323,23 +323,6 @@ def test_info_memory_usage(self): # excluded column with object dtype, so estimate is accurate assert not re.match(r"memory usage: [^+]+\+", res[-1]) - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) - df_with_object_index.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df_with_object_index.info(buf=buf, memory_usage='deep') - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+$", res[-1]) - - assert (df_with_object_index.memory_usage( - index=True, deep=True).sum() > df_with_object_index.memory_usage( - index=True).sum()) - - df_object = pd.DataFrame({'a': ['a']}) - assert (df_object.memory_usage(deep=True).sum() > - df_object.memory_usage().sum()) - # Test a DataFrame with duplicate columns dtypes = ['int64', 'int64', 'int64', 'float64'] data = {} @@ -349,6 +332,15 @@ def test_info_memory_usage(self): df = DataFrame(data) df.columns = dtypes + df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage='deep') + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + # Ensure df size is as expected # (cols * rows * bytes) + index size df_size = df.memory_usage().sum() @@ -377,9 +369,47 @@ def test_info_memory_usage(self): df.memory_usage(index=True) df.index.values.nbytes + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + @pytest.mark.skipif(PYPY, + reason="on PyPy deep=True doesn't change result") + def test_info_memory_usage_deep_not_pypy(self): + df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + assert (df_with_object_index.memory_usage( + index=True, deep=True).sum() > + df_with_object_index.memory_usage( + index=True).sum()) + + df_object = pd.DataFrame({'a': ['a']}) + assert (df_object.memory_usage(deep=True).sum() > + df_object.memory_usage().sum()) + + @pytest.mark.skipif(not PYPY, + reason="on PyPy deep=True does not change result") + def test_info_memory_usage_deep_pypy(self): + df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + assert (df_with_object_index.memory_usage( + index=True, deep=True).sum() == + df_with_object_index.memory_usage( + index=True).sum()) + + df_object = pd.DataFrame({'a': ['a']}) + assert (df_object.memory_usage(deep=True).sum() == + df_object.memory_usage().sum()) + + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") + def test_usage_via_getsizeof(self): + df = DataFrame( + data=1, + index=pd.MultiIndex.from_product( + [['a'], range(1000)]), + columns=['A'] + ) + mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with # deep=True, and add on some GC overhead - diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) + diff = mem - sys.getsizeof(df) assert abs(diff) < 100 def test_info_memory_usage_qualified(self): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 9af4a9edeb8b1..9e92c7cf1a9b8 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -15,7 +15,7 @@ import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta, IntervalIndex, Interval) -from pandas.compat import StringIO +from pandas.compat import StringIO, PYPY from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import PandasDelegate, NoNewAttributesMixin from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -144,6 +144,7 @@ def f(): pytest.raises(TypeError, f) + @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") def test_memory_usage(self): # Delegate does not implement memory_usage. # Check that we fall back to in-built `__sizeof__` @@ -941,6 +942,7 @@ def test_fillna(self): # check shallow_copied assert o is not result + @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") def test_memory_usage(self): for o in self.objs: res = o.memory_usage() diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index eecdd672095b0..a0b585a16ad9a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -24,7 +24,7 @@ period_range, PeriodIndex, timedelta_range, TimedeltaIndex, NaT, Interval, IntervalIndex) -from pandas.compat import range, lrange, u, PY3 +from pandas.compat import range, lrange, u, PY3, PYPY from pandas.core.config import option_context @@ -1448,10 +1448,11 @@ def test_memory_usage(self): cat = pd.Categorical(['foo', 'foo', 'bar']) assert cat.memory_usage(deep=True) > cat.nbytes - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) - assert abs(diff) < 100 + if not PYPY: + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) + assert abs(diff) < 100 def test_searchsorted(self): # https://github.com/pandas-dev/pandas/issues/8420 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a000e189dfaa9..5a17cb6d7dc47 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -56,7 +56,6 @@ K = 4 _RAISE_NETWORK_ERROR_DEFAULT = False - # set testing_mode _testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)