test(python): Memory usage test infrastructure, plus a test for #15098 (#15285)

Co-authored-by: Itamar Turner-Trauring <itamar@pythonspeed.com>
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
pola-rs · Mar 28, 2024 · 9c46183 · 9c46183
1 parent 0061c92
commit 9c46183
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 3 deletions.
diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs
@@ -28,6 +28,7 @@ mod gil_once_cell;
 mod lazyframe;
 mod lazygroupby;
 mod map;
+mod memory;
 #[cfg(feature = "object")]
 mod object;
 #[cfg(feature = "object")]
@@ -62,16 +63,26 @@ use crate::expr::PyExpr;
 use crate::functions::PyStringCacheHolder;
 use crate::lazyframe::{PyInProcessQuery, PyLazyFrame};
 use crate::lazygroupby::PyLazyGroupBy;
+#[cfg(debug_assertions)]
+use crate::memory::TracemallocAllocator;
 use crate::series::PySeries;
 #[cfg(feature = "sql")]
 use crate::sql::PySQLContext;
 
+// On Windows tracemalloc does work. However, we build abi3 wheels, and the
+// relevant C APIs are not part of the limited stable CPython API. As a result,
+// linking breaks on Windows if we use tracemalloc C APIs. So we only use this
+// on Unix for now.
 #[global_allocator]
-#[cfg(all(target_family = "unix", not(use_mimalloc)))]
+#[cfg(all(target_family = "unix", debug_assertions))]
+static ALLOC: TracemallocAllocator<Jemalloc> = TracemallocAllocator::new(Jemalloc);
+
+#[global_allocator]
+#[cfg(all(target_family = "unix", not(use_mimalloc), not(debug_assertions)))]
 static ALLOC: Jemalloc = Jemalloc;
 
 #[global_allocator]
-#[cfg(any(not(target_family = "unix"), use_mimalloc))]
+#[cfg(all(any(not(target_family = "unix"), use_mimalloc), not(debug_assertions)))]
 static ALLOC: MiMalloc = MiMalloc;
 
 #[pymodule]

diff --git a/py-polars/src/memory.rs b/py-polars/src/memory.rs
@@ -0,0 +1,75 @@
+//! Utilities for dealing with memory allocations.
+
+use std::alloc::GlobalAlloc;
+
+use libc::{c_int, c_uint, size_t, uintptr_t};
+
+// When debug_assertions is enabled, use Python's tracemalloc to track memory
+// allocations. This is a useful feature for production use too, but has a
+// potential performance impact and so would need additional benchmarking. In
+// addition, these APIs are not part of the limited Python ABI Polars uses,
+// though they are unchanged between 3.7 and 3.12.
+// Raw bindings to CPython's tracemalloc C API, declared for every target
+// except Windows (Windows gets same-named stub functions instead).
+#[cfg(not(target_os = "windows"))]
+extern "C" {
+    /// Registers the memory block at `ptr` (`size` bytes) under `domain`.
+    /// NOTE(review): CPython documents 0 = success, -1 = error and
+    /// -2 = tracemalloc disabled; confirm for the targeted Python versions.
+    fn PyTraceMalloc_Track(domain: c_uint, ptr: uintptr_t, size: size_t) -> c_int;
+    /// Removes the trace for the memory block at `ptr` under `domain`.
+    /// NOTE(review): CPython documents -2 when tracemalloc is disabled and
+    /// 0 otherwise; confirm for the targeted Python versions.
+    fn PyTraceMalloc_Untrack(domain: c_uint, ptr: uintptr_t) -> c_int;
+}
+
+// Windows has issues linking to the tracemalloc APIs, so the functionality is
+// disabled. We have fake implementations just to make sure we don't have
+// issues building.
+/// Windows stand-in for the tracemalloc tracking call: ignores all of its
+/// arguments, records nothing and always returns -2.
+/// NOTE(review): -2 looks chosen to match the "tracemalloc is disabled" code
+/// of the real C API; confirm.
+#[cfg(target_os = "windows")]
+#[allow(non_snake_case)]
+fn PyTraceMalloc_Track(_domain: c_uint, _ptr: uintptr_t, _size: size_t) -> c_int {
+    -2
+}
+
+/// Windows stand-in for the tracemalloc untracking call: ignores all of its
+/// arguments, removes nothing and always returns -2.
+/// NOTE(review): -2 looks chosen to match the "tracemalloc is disabled" code
+/// of the real C API; confirm.
+#[cfg(target_os = "windows")]
+#[allow(non_snake_case)]
+fn PyTraceMalloc_Untrack(_domain: c_uint, _ptr: uintptr_t) -> c_int {
+    -2
+}
+
+/// Allocations require a domain to identify them when registering with
+/// tracemalloc. Following NumPy's lead, we just pick a random constant that is
+/// unlikely to clash with anyone else.
+const TRACEMALLOC_DOMAIN: c_uint = 36740582;
+
+/// Wrap an existing allocator, and register allocations and frees with Python's
+/// `tracemalloc`. Registration functionality is disabled on Windows.
+pub struct TracemallocAllocator<A: GlobalAlloc> {
+    // The allocator that actually services every allocation request.
+    wrapped_alloc: A,
+}
+
+impl<A: GlobalAlloc> TracemallocAllocator<A> {
+    /// Wrap the allocator such that allocations are registered with
+    /// tracemalloc.
+    ///
+    /// This is a `const fn`, so the wrapper can be built in a constant
+    /// context such as the initializer of a `static`.
+    pub const fn new(wrapped_alloc: A) -> Self {
+        Self { wrapped_alloc }
+    }
+}
+
+/// Forwards every request to the wrapped allocator and mirrors the resulting
+/// allocation or free into Python's `tracemalloc` under `TRACEMALLOC_DOMAIN`.
+///
+/// Tracking is best-effort: the status codes returned by the tracemalloc calls
+/// are ignored, because a bookkeeping failure must never fail an allocation.
+unsafe impl<A: GlobalAlloc> GlobalAlloc for TracemallocAllocator<A> {
+    /// Allocates via the wrapped allocator and registers the new block.
+    unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
+        let result = self.wrapped_alloc.alloc(layout);
+        // A null result means the allocation failed: there is no block to track.
+        if !result.is_null() {
+            PyTraceMalloc_Track(TRACEMALLOC_DOMAIN, result as uintptr_t, layout.size());
+        }
+        result
+    }
+
+    /// Unregisters the block and then frees it via the wrapped allocator.
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
+        // Untrack before freeing: once the block is freed another thread may
+        // receive (and track) the same address.
+        PyTraceMalloc_Untrack(TRACEMALLOC_DOMAIN, ptr as uintptr_t);
+        self.wrapped_alloc.dealloc(ptr, layout)
+    }
+
+    /// Allocates zeroed memory via the wrapped allocator and registers it.
+    unsafe fn alloc_zeroed(&self, layout: std::alloc::Layout) -> *mut u8 {
+        let result = self.wrapped_alloc.alloc_zeroed(layout);
+        // A null result means the allocation failed: there is no block to track.
+        if !result.is_null() {
+            PyTraceMalloc_Track(TRACEMALLOC_DOMAIN, result as uintptr_t, layout.size());
+        }
+        result
+    }
+
+    /// Resizes a block via the wrapped allocator, keeping exactly one trace
+    /// for it: the old address is unregistered and the resulting address is
+    /// registered with `new_size`.
+    unsafe fn realloc(&self, ptr: *mut u8, layout: std::alloc::Layout, new_size: usize) -> *mut u8 {
+        let old_addr = ptr as uintptr_t;
+        // Drop the old trace first. If the block moves, the old address is
+        // freed by the wrapped allocator and would otherwise stay registered
+        // forever, inflating the reported usage. Doing it before the call
+        // (as in `dealloc`) avoids racing with a thread that reuses the address.
+        PyTraceMalloc_Untrack(TRACEMALLOC_DOMAIN, old_addr);
+        let result = self.wrapped_alloc.realloc(ptr, layout, new_size);
+        if result.is_null() {
+            // Reallocation failed: the original block is still live and
+            // unchanged, so restore its trace with the original size.
+            PyTraceMalloc_Track(TRACEMALLOC_DOMAIN, old_addr, layout.size());
+        } else {
+            PyTraceMalloc_Track(TRACEMALLOC_DOMAIN, result as uintptr_t, new_size);
+        }
+        result
+    }
+}
diff --git a/py-polars/tests/unit/conftest.py b/py-polars/tests/unit/conftest.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
+import gc
 import random
 import string
-from typing import List, cast
+import sys
+import tracemalloc
+from typing import Any, Generator, List, cast
 
 import numpy as np
 import pytest
@@ -138,3 +141,62 @@ def iso8601_tz_aware_format_datetime(request: pytest.FixtureRequest) -> list[str
 @pytest.fixture(params=ISO8601_FORMATS_DATE)
 def iso8601_format_date(request: pytest.FixtureRequest) -> list[str]:
     return cast(List[str], request.param)
+
+
+class MemoryUsage:
+    """
+    Provide an API for measuring peak memory usage.
+
+    Memory from PyArrow is not tracked at the moment.
+    """
+
+    def reset_tracking(self) -> None:
+        """Reset tracking to zero."""
+        gc.collect()
+        tracemalloc.stop()
+        tracemalloc.start()
+        assert self.get_peak() < 100_000
+
+    def get_current(self) -> int:
+        """
+        Return currently allocated memory, in bytes.
+
+        This only tracks allocations since this object was created or
+        ``reset_tracking()`` was called, whichever is later.
+        """
+        return tracemalloc.get_traced_memory()[0]
+
+    def get_peak(self) -> int:
+        """
+        Return peak allocated memory, in bytes.
+
+        This returns peak allocations since this object was created or
+        ``reset_tracking()`` was called, whichever is later.
+        """
+        return tracemalloc.get_traced_memory()[1]
+
+
+@pytest.fixture()
+def memory_usage_without_pyarrow() -> Generator[MemoryUsage, Any, Any]:
+    """
+    Provide an API for measuring peak memory usage.
+
+    Not thread-safe: there should only be one instance of MemoryUsage at any
+    given time.
+
+    Memory usage from PyArrow is not tracked.
+    """
+    if not pl.build_info()["build"]["debug"]:
+        pytest.skip("Memory usage only available in debug/dev builds.")
+
+    if sys.platform == "win32":
+        # abi3 wheels don't have the tracemalloc C APIs, which breaks linking
+        # on Windows.
+        pytest.skip("Windows not supported at the moment.")
+
+    gc.collect()
+    tracemalloc.start()
+    try:
+        yield MemoryUsage()
+    finally:
+        tracemalloc.stop()
diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py
@@ -19,6 +19,7 @@
     from pathlib import Path
 
     from polars.type_aliases import ParquetCompression
+    from tests.unit.conftest import MemoryUsage
 
 
 def test_round_trip(df: pl.DataFrame) -> None:
@@ -788,3 +789,33 @@ def test_parquet_array_statistics(tmp_path: Path) -> None:
 
     result = pl.scan_parquet(file_path).filter(pl.col("a") != [1, 2, 3]).collect()
     assert result.to_dict(as_series=False) == {"a": [[4, 5, 6], [7, 8, 9]], "b": [2, 3]}
+
+
+@pytest.mark.write_disk()
+def test_read_parquet_only_loads_selected_columns_15098(
+    memory_usage_without_pyarrow: MemoryUsage, tmp_path: Path
+) -> None:
+    """Only requested columns are loaded by ``read_parquet()``."""
+    tmp_path.mkdir(exist_ok=True)
+
+    # Each column will be about 8MB of RAM
+    series = pl.arange(0, 1_000_000, dtype=pl.Int64, eager=True)
+
+    file_path = tmp_path / "multicolumn.parquet"
+    df = pl.DataFrame(
+        {
+            "a": series,
+            "b": series,
+        }
+    )
+    df.write_parquet(file_path)
+    del df, series
+
+    memory_usage_without_pyarrow.reset_tracking()
+
+    # Only load one column:
+    df = pl.read_parquet([file_path], columns=["b"], rechunk=False)
+    del df
+    # Only one column's worth of memory should be used; 2 columns would be
+    # 16_000_000 at least, but there's some overhead.
+    assert 8_000_000 < memory_usage_without_pyarrow.get_peak() < 13_000_000
diff --git a/py-polars/tests/unit/test_conftest.py b/py-polars/tests/unit/test_conftest.py
@@ -0,0 +1,39 @@
+"""Tests for the testing infrastructure."""
+
+import numpy as np
+
+import polars as pl
+from tests.unit.conftest import MemoryUsage
+
+
+def test_memory_usage(memory_usage_without_pyarrow: MemoryUsage) -> None:
+    """The ``memory_usage_without_pyarrow`` fixture gives somewhat accurate results."""
+    memory_usage = memory_usage_without_pyarrow
+    # Right after the fixture starts tracing, almost nothing has been recorded.
+    assert memory_usage.get_current() < 100_000
+    assert memory_usage.get_peak() < 100_000
+
+    # Memory from Python is tracked:
+    b = b"X" * 1_300_000
+    assert 1_300_000 <= memory_usage.get_current() <= 2_000_000
+    assert 1_300_000 <= memory_usage.get_peak() <= 2_000_000
+    del b
+    # Freeing the bytes lowers the current figure, while the peak is retained.
+    assert memory_usage.get_current() <= 500_000
+    assert 1_300_000 <= memory_usage.get_peak() <= 2_000_000
+    # Resetting brings both counters back down to (almost) zero.
+    memory_usage.reset_tracking()
+    assert memory_usage.get_current() < 100_000
+    assert memory_usage.get_peak() < 100_000
+
+    # Memory from Polars is tracked:
+    # (one million Int64 values, i.e. 8_000_000 bytes of column data)
+    df = pl.DataFrame({"x": pl.arange(0, 1_000_000, eager=True, dtype=pl.Int64)})
+    del df
+    peak_bytes = memory_usage.get_peak()
+    assert 8_000_000 <= peak_bytes < 8_500_000
+
+    memory_usage.reset_tracking()
+    # NOTE(review): this bound is 10x looser than the 100_000 used after the
+    # earlier reset; presumably deliberate, confirm.
+    assert memory_usage.get_peak() < 1_000_000
+
+    # Memory from NumPy is tracked:
+    # (1_400_000 uint8 values, i.e. 1_400_000 bytes of array data)
+    arr = np.ones((1_400_000,), dtype=np.uint8)
+    del arr
+    peak = memory_usage.get_peak()
+    assert 1_400_000 < peak < 1_500_000