Skip to content

Commit

Permalink
COMPAT: avoid calling getsizeof() on PyPy
Browse files Browse the repository at this point in the history
closes pandas-dev#17228

Author: mattip <matti.picus@gmail.com>

Closes pandas-dev#17229 from mattip/getsizeof-unavailable and squashes the following commits:

d2623e4 [mattip] COMPAT: avoid calling getsizeof() on PyPy
  • Loading branch information
mattip authored and jowens committed Sep 20, 2017
1 parent 2e55156 commit b49446e
Show file tree
Hide file tree
Showing 9 changed files with 76 additions and 33 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,9 @@ Bug Fixes
Conversion
^^^^^^^^^^

- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`)
- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)


Indexing
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from distutils.version import LooseVersion
from itertools import product
import sys
import platform
import types
from unicodedata import east_asian_width
import struct
Expand All @@ -41,6 +42,7 @@
PY3 = (sys.version_info[0] >= 3)
PY35 = (sys.version_info >= (3, 5))
PY36 = (sys.version_info >= (3, 6))
PYPY = (platform.python_implementation() == 'PyPy')

try:
import __builtin__ as builtins
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pandas.core.nanops as nanops
import pandas._libs.lib as lib
from pandas.compat.numpy import function as nv
from pandas.compat import PYPY
from pandas.util._decorators import (Appender, cache_readonly,
deprecate_kwarg, Substitution)
from pandas.core.common import AbstractMethodError
Expand Down Expand Up @@ -1061,7 +1062,7 @@ def memory_usage(self, deep=False):
Notes
-----
Memory usage does not include memory consumed by elements that
are not components of the array if deep=False
are not components of the array if deep=False or if used on PyPy
See Also
--------
Expand All @@ -1071,9 +1072,8 @@ def memory_usage(self, deep=False):
return self.values.memory_usage(deep=deep)

v = self.values.nbytes
if deep and is_object_dtype(self):
if deep and is_object_dtype(self) and not PYPY:
v += lib.memory_usage_of_objects(self.values)

return v

def factorize(self, sort=False, na_sentinel=-1):
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,9 +465,13 @@ def _nbytes(self, deep=False):
*this is an internal routine*
"""

# for implementations with no useful getsizeof (PyPy)
objsize = 24

level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
label_nbytes = sum((i.nbytes for i in self.labels))
names_nbytes = sum((getsizeof(i) for i in self.names))
names_nbytes = sum((getsizeof(i, objsize) for i in self.names))
result = level_nbytes + label_nbytes + names_nbytes

# include our engine hashtable
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,12 @@ def _format_data(self):

@cache_readonly
def nbytes(self):
""" return the number of bytes in the underlying data """
return sum([getsizeof(getattr(self, v)) for v in
"""
Return the number of bytes in the underlying data.
On implementations where this is undetermined (PyPy),
assume 24 bytes for each value.
"""
return sum([getsizeof(getattr(self, v), 24) for v in
['_start', '_stop', '_step']])

def memory_usage(self, deep=False):
Expand Down
68 changes: 49 additions & 19 deletions pandas/tests/frame/test_repr_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pytest

from pandas import (DataFrame, compat, option_context)
from pandas.compat import StringIO, lrange, u
from pandas.compat import StringIO, lrange, u, PYPY
import pandas.io.formats.format as fmt
import pandas as pd

Expand Down Expand Up @@ -323,23 +323,6 @@ def test_info_memory_usage(self):
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])

assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() > df_with_object_index.memory_usage(
index=True).sum())

df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() >
df_object.memory_usage().sum())

# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
Expand All @@ -349,6 +332,15 @@ def test_info_memory_usage(self):
df = DataFrame(data)
df.columns = dtypes

df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])

# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
Expand Down Expand Up @@ -377,9 +369,47 @@ def test_info_memory_usage(self):
df.memory_usage(index=True)
df.index.values.nbytes

mem = df.memory_usage(deep=True).sum()
assert mem > 0

@pytest.mark.skipif(PYPY,
                    reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
    """Check that deep memory usage exceeds shallow usage for object data.

    Skipped on PyPy (see the skip reason on the decorator).
    """
    # Case 1: frame whose index holds a Python string.
    frame = pd.DataFrame({'a': [1]}, index=['foo'])
    deep_total = frame.memory_usage(index=True, deep=True).sum()
    shallow_total = frame.memory_usage(index=True).sum()
    assert deep_total > shallow_total

    # Case 2: frame whose column holds a Python string.
    frame = pd.DataFrame({'a': ['a']})
    deep_total = frame.memory_usage(deep=True).sum()
    shallow_total = frame.memory_usage().sum()
    assert deep_total > shallow_total

@pytest.mark.skipif(not PYPY,
                    reason="PyPy-only test: checks that deep=True "
                           "does not change result on PyPy")
def test_info_memory_usage_deep_pypy(self):
    """Check that deep and shallow memory usage are equal on PyPy.

    Runs only on PyPy; on other interpreters it is skipped, and the skip
    reason says so explicitly.
    """
    # Frame whose index holds a Python string.
    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    assert (df_with_object_index.memory_usage(
        index=True, deep=True).sum() ==
        df_with_object_index.memory_usage(
            index=True).sum())

    # Frame whose column holds a Python string.
    df_object = pd.DataFrame({'a': ['a']})
    assert (df_object.memory_usage(deep=True).sum() ==
            df_object.memory_usage().sum())

@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
    """Check that ``sys.getsizeof(df)`` tracks the deep memory usage total.

    Skipped on PyPy (see the skip reason on the decorator).
    """
    df = DataFrame(
        data=1,
        index=pd.MultiIndex.from_product(
            [['a'], range(1000)]),
        columns=['A']
    )
    # Compute the deep total once and reuse it below.
    mem = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = mem - sys.getsizeof(df)
    assert abs(diff) < 100

def test_info_memory_usage_qualified(self):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pandas.util.testing as tm
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
Timedelta, IntervalIndex, Interval)
from pandas.compat import StringIO
from pandas.compat import StringIO, PYPY
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.base import PandasDelegate, NoNewAttributesMixin
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
Expand Down Expand Up @@ -144,6 +144,7 @@ def f():

pytest.raises(TypeError, f)

@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(self):
# Delegate does not implement memory_usage.
# Check that we fall back to in-built `__sizeof__`
Expand Down Expand Up @@ -941,6 +942,7 @@ def test_fillna(self):
# check shallow_copied
assert o is not result

@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(self):
for o in self.objs:
res = o.memory_usage()
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
period_range, PeriodIndex,
timedelta_range, TimedeltaIndex, NaT,
Interval, IntervalIndex)
from pandas.compat import range, lrange, u, PY3
from pandas.compat import range, lrange, u, PY3, PYPY
from pandas.core.config import option_context


Expand Down Expand Up @@ -1448,10 +1448,11 @@ def test_memory_usage(self):
cat = pd.Categorical(['foo', 'foo', 'bar'])
assert cat.memory_usage(deep=True) > cat.nbytes

# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100

def test_searchsorted(self):
# https://github.com/pandas-dev/pandas/issues/8420
Expand Down
1 change: 0 additions & 1 deletion pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
K = 4
_RAISE_NETWORK_ERROR_DEFAULT = False


# set testing_mode
_testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)

Expand Down

0 comments on commit b49446e

Please sign in to comment.