diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py
new file mode 100644
index 0000000000000..27d9bf23fc1af
--- /dev/null
+++ b/doc/scripts/eval_performance.py
@@ -0,0 +1,127 @@
+"""Benchmark ``DataFrame.eval`` and ``DataFrame.query`` across engines.
+
+Running this file as a script regenerates ``eval-perf.png`` and
+``query-perf.png`` in ``doc/source/_static``.
+"""
+from timeit import repeat as timeit
+
+import numpy as np
+import seaborn as sns
+
+from pandas import DataFrame
+
+# Setup code run by timeit before each measurement: ``%d`` is the number of
+# rows and ``%s`` is the statement defining the expression string ``s``.
+setup_common = """from pandas import DataFrame
+from numpy.random import randn
+df = DataFrame(randn(%d, 3), columns=list('abc'))
+%s"""
+
+setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"
+
+
+def bench_with(n, times=10, repeat=3, engine="numexpr"):
+    """Return the per-call time of ``df.eval(s)`` for each of ``repeat`` runs."""
+    return (
+        np.array(
+            timeit(
+                "df.eval(s, engine=%r)" % engine,
+                setup=setup_common % (n, setup_with),
+                repeat=repeat,
+                number=times,
+            )
+        )
+        / times
+    )
+
+
+setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"
+
+
+def bench_subset(n, times=20, repeat=3, engine="numexpr"):
+    """Return the per-call time of ``df.query(s)`` for each of ``repeat`` runs."""
+    return (
+        np.array(
+            timeit(
+                "df.query(s, engine=%r)" % engine,
+                setup=setup_common % (n, setup_subset),
+                repeat=repeat,
+                number=times,
+            )
+        )
+        / times
+    )
+
+
+def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
+    """Time eval and query on frames of ``num`` log-spaced sizes.
+
+    Sizes run from ``10 ** mn`` to ``10 ** mx`` rows. Returns ``(ev, qu)``,
+    two DataFrames with one column of timings per engine plus a ``size``
+    column.
+    """
+    r = np.logspace(mn, mx, num=num).round().astype(int)
+
+    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
+    qu = ev.copy(deep=True)
+
+    ev["size"] = qu["size"] = r
+
+    for engine in engines:
+        for i, n in enumerate(r):
+            # Logical ``and`` (not bitwise ``&``) so any truthy ``verbose`` works.
+            if verbose and i % 10 == 0:
+                print("engine: %r, i == %d" % (engine, i))
+            ev_times = bench_with(n, times=1, repeat=1, engine=engine)
+            ev.loc[i, engine] = np.mean(ev_times)
+            qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
+            qu.loc[i, engine] = np.mean(qu_times)
+
+    return ev, qu
+
+
+def plot_perf(df, engines, title, filename=None):
+    """Plot time against frame size on log-log axes, one line per engine.
+
+    The figure is written to ``filename`` when one is given.
+    """
+    from matplotlib.pyplot import figure
+
+    sns.set()
+    sns.set_palette("Set2")
+
+    fig = figure(figsize=(4, 3), dpi=120)
+    ax = fig.add_subplot(111)
+
+    for engine in engines:
+        ax.loglog(df["size"], df[engine], label=engine, lw=2)
+
+    ax.set_xlabel("Number of Rows")
+    ax.set_ylabel("Time (s)")
+    ax.set_title(title)
+    ax.legend(loc="best")
+    ax.tick_params(top=False, right=False)
+
+    fig.tight_layout()
+
+    if filename is not None:
+        fig.savefig(filename)
+
+
+if __name__ == "__main__":
+    import os
+
+    pandas_dir = os.path.dirname(
+        os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    )
+    static_path = os.path.join(pandas_dir, "doc", "source", "_static")
+
+    def join(p):
+        return os.path.join(static_path, p)
+
+    engines = "python", "numexpr"
+
+    ev, qu = bench(engines=engines, verbose=True)
+
+    plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
+    plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png
deleted file mode 100644
index d86018363ffdc..0000000000000
Binary files a/doc/source/_static/eval-perf-small.png and /dev/null differ
diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png
index 14c69c1b85d9e..ed92337c1d995 100644
Binary files a/doc/source/_static/eval-perf.png and b/doc/source/_static/eval-perf.png differ
diff --git a/doc/source/_static/query-perf-small.png b/doc/source/_static/query-perf-small.png
deleted file mode 100644
index e14fa69db7fe8..0000000000000
Binary files a/doc/source/_static/query-perf-small.png and /dev/null differ
diff --git a/doc/source/_static/query-perf.png b/doc/source/_static/query-perf.png
index d96318df94357..c52849a0edd53 100644
Binary files a/doc/source/_static/query-perf.png and b/doc/source/_static/query-perf.png differ
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index 1a1229f95523b..9375bb066781b 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
    df["a"] = 1
    df
 
-The :class:`DataFrame.query` method has a ``inplace`` 
keyword which determines -whether the query modifies the original frame. - -.. ipython:: python - - df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.query("a > 2") - df.query("a > 2", inplace=True) - df - Local variables ~~~~~~~~~~~~~~~ You must *explicitly reference* any local variable that you want to use in an -expression by placing the ``@`` character in front of the name. For example, +expression by placing the ``@`` character in front of the name. This mechanism is +the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example, .. ipython:: python @@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of :func:`pandas.eval` as function of the size of the frame involved in the computation. The two lines are two different engines. +.. + The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py .. image:: ../_static/eval-perf.png - -.. note:: - - Operations with smallish objects (around 15k-20k rows) are faster using - plain Python: - - .. image:: ../_static/eval-perf-small.png - +You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows. This plot was created using a :class:`DataFrame` with 3 columns each containing floating point values generated using ``numpy.random.randn()``. diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index f939945fc6cda..dbd6d2757e1be 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name renaming your columns to something less ambiguous. +The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines +whether the query modifies the original frame. + +.. 
ipython:: python + + df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) + df.query("a > 2") + df.query("a > 2", inplace=True) + df + + :class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query` ``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for large frames. +.. + The query-perf.png figure below was generated with /doc/scripts/eval_performance.py + .. image:: ../_static/query-perf.png -.. note:: - You will only see the performance benefits of using the ``numexpr`` engine - with ``DataFrame.query()`` if your frame has more than approximately 200,000 - rows. - .. image:: ../_static/query-perf-small.png +You will only see the performance benefits of using the ``numexpr`` engine +with ``DataFrame.query()`` if your frame has more than approximately 100,000 +rows. + + This plot was created using a ``DataFrame`` with 3 columns each containing floating point values generated using ``numpy.random.randn()``.