From 20d03689b321c306b6cd899b38eb9e314e2ccd5e Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 11 Jan 2023 22:02:37 -0800 Subject: [PATCH] generate GC stats Port of Pablo's GH-100403 to Python 3.11.x. --- Modules/gcmodule.c | 186 +++++++++++++++++++++++ Tools/scripts/generate_gc_stats_plots.py | 177 +++++++++++++++++++++ 2 files changed, 363 insertions(+) create mode 100644 Tools/scripts/generate_gc_stats_plots.py diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index dcd46feff0cc48..829d6d9d0047ea 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -39,6 +39,22 @@ module gc [clinic start generated code]*/ /*[clinic end generated code: output=da39a3ee5e6b4b0d input=b5c9690ecc842d79]*/ +#define GC_STATS 1 + +#ifdef GC_STATS + +typedef struct _gc_stats { + size_t n_collections; + PyObject* generation_number; + PyObject* total_objects; + PyObject* uncollectable; + PyObject* collected_cycles; + PyObject* collection_time; +} PyGCStats; + +PyGCStats _pygc_stats_struct = { 0 }; +#endif + #ifdef Py_DEBUG # define GC_DEBUG @@ -135,6 +151,34 @@ get_gc_state(void) return &interp->gc; } +#ifdef GC_STATS + +static inline int +is_main_interpreter(void) +{ + return (PyInterpreterState_Get() == PyInterpreterState_Main()); +} + +static int +_PyInitGCStats() { + if (!is_main_interpreter()) { + return 0; + } +#define INIT_FIELD(field) \ + _pygc_stats_struct.field = PyList_New(0);\ + if (_pygc_stats_struct.field== NULL) {\ + return -1; \ + }\ + + INIT_FIELD(generation_number); + INIT_FIELD(total_objects); + INIT_FIELD(uncollectable); + INIT_FIELD(collected_cycles); + INIT_FIELD(collection_time); +#undef INIT_FIELD + return 0; +} +#endif void _PyGC_InitState(GCState *gcstate) @@ -171,6 +215,12 @@ _PyGC_Init(PyInterpreterState *interp) return _PyStatus_NO_MEMORY(); } +#ifdef GC_STATS + if(_PyInitGCStats()) { + Py_FatalError("Could not initialize GC stats"); + } +#endif + return _PyStatus_OK(); } @@ -1190,6 +1240,12 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyTime_t t1 = 0; /* initialize to prevent a compiler warning */ GCState *gcstate = &tstate->interp->gc; +#ifdef GC_STATS + _PyTime_t gc_t1 = 0; + _PyTime_t gc_t2 = 0; + gc_t1 = _PyTime_GetPerfCounter(); +#endif + // gc_collect_main() must not be called before _PyGC_Init // or after _PyGC_Fini() assert(gcstate->garbage != NULL); @@ -1223,6 +1279,11 @@ gc_collect_main(PyThreadState *tstate, int generation, old = young; validate_list(old, collecting_clear_unreachable_clear); +#if GC_STATS + Py_ssize_t t = 0; /* # total objects being collected */ + t = gc_list_size(young); +#endif + deduce_unreachable(young, &unreachable); untrack_tuples(young); @@ -1321,6 +1382,42 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyErr_WriteUnraisableMsg("in garbage collection", NULL); } } +#ifdef GC_STATS + gc_t2 = _PyTime_GetPerfCounter(); +#define ADD_ELEMENT(field, elem) \ + { \ + PyObject* item = PyLong_FromLong(elem); \ + if (!item) { \ + Py_FatalError("Arg!"); \ + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); \ + } \ + if (item && PyList_Append(_pygc_stats_struct.field, item) < 0) { \ + Py_FatalError("Arg2!"); \ + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); \ + } \ + Py_DECREF(item); \ + } \ + + if (_pygc_stats_struct.generation_number) { + _pygc_stats_struct.n_collections++; + ADD_ELEMENT(generation_number, generation); + ADD_ELEMENT(total_objects, t); + ADD_ELEMENT(uncollectable, n); + ADD_ELEMENT(collected_cycles, m); + double d = _PyTime_AsSecondsDouble(gc_t2 - gc_t1); + PyObject* item = PyFloat_FromDouble(d); + if (!item) { + Py_FatalError("Arg3!"); + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); + } + if (item && PyList_Append(_pygc_stats_struct.collection_time, item) < 0) { + Py_FatalError("Arg4!"); + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); + } + } +#undef ADD_ELEMENT + +#endif /* Update stats */ if (n_collected) { @@ -2167,9 +2264,98 @@ gc_fini_untrack(PyGC_Head *list) } + +#ifdef GC_STATS +static void +print_stats(FILE *out) { +#define WRITE_ITEM(collection, index) { \ + PyObject* item = PyList_GET_ITEM(_pygc_stats_struct.collection, index); \ + if (!item) { \ + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); \ + } \ + long num = PyLong_AsLong(item); \ + if (!num && PyErr_Occurred()) { \ + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); \ + } \ + fprintf(out, ",%zd", num); } \ + + fprintf(out, "collection,generation_number,total_objects,uncollectable,collected_cycles,collection_time\n"); + for (size_t i = 0; i < _pygc_stats_struct.n_collections; i++) { + fprintf(out, "%zd", i); + WRITE_ITEM(generation_number, i); + WRITE_ITEM(total_objects, i); + WRITE_ITEM(uncollectable, i); + WRITE_ITEM(collected_cycles, i); + { + PyObject* item = PyList_GET_ITEM(_pygc_stats_struct.collection_time, i); + if (!item) { + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); + } + double num = PyFloat_AS_DOUBLE(item); + if (!num && PyErr_Occurred()) { + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); + } + fprintf(out, ",%f", num); + } + fprintf(out, "\n"); + } +} + +void +_Py_PrintGCStats(int to_file) +{ + FILE *out = stderr; + if (to_file) { + /* Write to a file instead of stderr. */ +# ifdef MS_WINDOWS + const char *dirname = "c:\\temp\\py_stats\\"; +# else + const char *dirname = "/tmp/py_stats/"; +# endif + /* Use random 160 bit number as file name, + * to avoid both accidental collisions and + * symlink attacks. */ + unsigned char rand[20]; + char hex_name[41]; + _PyOS_URandomNonblock(rand, 20); + for (int i = 0; i < 20; i++) { + hex_name[2*i] = "0123456789abcdef"[rand[i]&15]; + hex_name[2*i+1] = "0123456789abcdef"[(rand[i]>>4)&15]; + } + hex_name[40] = '\0'; + char buf[64]; + assert(strlen(dirname) + 40 + strlen("_gc.txt") < 64); + sprintf(buf, "%s%s_gc.txt", dirname, hex_name); + FILE *fout = fopen(buf, "w"); + if (fout) { + out = fout; + } + } + else { + fprintf(out, "GC stats:\n"); + } + print_stats(out); + if (out != stderr) { + fclose(out); + } + + Py_CLEAR(_pygc_stats_struct.generation_number); + Py_CLEAR(_pygc_stats_struct.total_objects); + Py_CLEAR(_pygc_stats_struct.uncollectable); + Py_CLEAR(_pygc_stats_struct.collected_cycles); + Py_CLEAR(_pygc_stats_struct.collection_time); +# +} +#endif + void _PyGC_Fini(PyInterpreterState *interp) { +#ifdef GC_STATS + if (is_main_interpreter() && Py_GETENV("PYTHON_GC_STATS")) { + _Py_PrintGCStats(1); + } +#endif GCState *gcstate = &interp->gc; Py_CLEAR(gcstate->garbage); Py_CLEAR(gcstate->callbacks); diff --git a/Tools/scripts/generate_gc_stats_plots.py b/Tools/scripts/generate_gc_stats_plots.py new file mode 100644 index 00000000000000..82650b1a0a24a7 --- /dev/null +++ b/Tools/scripts/generate_gc_stats_plots.py @@ -0,0 +1,177 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import argparse + + +def get_obj_percentage_data(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collected_cycles"] / vals["total_objects"].values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gen_time_data(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collection_time"].values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gen_time_data_per_obj(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collection_time"] / (1+vals["collected_cycles"]).values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + +def get_gen_freq_per_us(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = 1.0 / (vals["collection_time"] / (1+vals["collected_cycles"])).values / 1.0e3 + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + +def get_gc_summary(df): + gen_totals = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + obj_collected = vals["collected_cycles"].sum() + total_time = vals["collection_time"].sum() + total_objs = vals["total_objects"].sum() + gen_totals.append((obj_collected, total_time, total_objs)) + total_gc_time = df["collection_time"].sum() + return gen_totals, total_gc_time + +def gen_plot(df, output_filename): + def violinplot_with_custom_formatting( + ax, data, yticks, ylim, xlabel, title, formatter + ): + violin = ax.violinplot( + data, + vert=False, + showmeans=True, + showmedians=False, + widths=1, + quantiles=[[0.1, 0.9]] * len(data), + ) + violin["cquantiles"].set_linestyle(":") + ax.set_yticks(np.arange(len(yticks)) + 1, yticks) + ax.set_ylim(ylim) + ax.tick_params(axis="x", bottom=True, top=True, labelbottom=True, labeltop=True) + ax.xaxis.set_major_formatter(formatter) + ax.grid() + ax.set_title(title) + + def barplot_with_custom_formatting(ax, data, title, formatter): + ax.bar(names, data) + ax.set_ylabel(title) + ax.set_title(title) + ax.yaxis.set_major_formatter(formatter) + ax.grid() + + names = ["First generation", "Second generation", "Third generation"] + # fig, (ax1, ax2, ax3, ax4) = plt.subplots( + # 5, 1, figsize=(8, (2 + len(names) * 0.3) * 4), layout="constrained" + # ) + fig = plt.figure(figsize=(10, 10), layout="constrained") + ax0 = plt.subplot(7, 2, 1) + ax1 = plt.subplot(7, 2, 2) + ax2 = plt.subplot(7, 1, 3) + ax3 = plt.subplot(7, 1, 4) + ax4 = plt.subplot(7, 1, 5) + ax5 = plt.subplot(7, 1, 6) + + figsize=(8, (2 + len(names) * 0.3) * 4) + + running_time_data = [x.sum() for x in get_gen_time_data(df)] + formatter = lambda val, pos: f"{val*1000:.0f}ms" + barplot_with_custom_formatting(ax0, running_time_data, "Total running time", formatter) + + object_number_data = [np.sum(df[df["generation_number"] == i]["collected_cycles"]/1000.0) for i in range(3)] + formatter = lambda val, pos: f"{int(val)}e3 obj" + barplot_with_custom_formatting(ax1, object_number_data, "Total object collected", formatter) + + obj_percentage_data = get_obj_percentage_data(df) + formatter = lambda val, pos: f"{val*100:.0f}%" + violinplot_with_custom_formatting( + ax2, + obj_percentage_data, + names, + (0, len(names) + 1), + "efficiency", + "Generation efficiency", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{val*1e3:.0f}ms" + gen_time_data = get_gen_time_data(df) + violinplot_with_custom_formatting( + ax3, + gen_time_data, + names, + (0, len(names) + 1), + "time", + "Generation Time", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{val*1e3:.0f}ms" + gen_time_data_per_obj = get_gen_time_data_per_obj(df) + violinplot_with_custom_formatting( + ax4, + gen_time_data_per_obj, + names, + (0, len(names) + 1), + "time", + "Time per obj collected", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{int(val)} obj/ms" + gen_freq_per_us = get_gen_freq_per_us(df) + violinplot_with_custom_formatting( + ax5, + gen_freq_per_us, + names, + (0, len(names) + 1), + "time", + "Objects collected per ms", + formatter=formatter, + ) + + plt.savefig(output_filename) + plt.close() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("file", help="The file to plot") + parser.add_argument("output_filename", help="The output filename") + args = parser.parse_args() + + df = pd.read_csv(args.file) + gen_plot(df, args.output_filename) + + gen_totals, total_gc_time = get_gc_summary(df) + for generation in range(3): + gen_data = gen_totals[generation] + print(f"Generation {generation}: {gen_data[0]} objects collected, {gen_data[1]:.2f} s, {gen_data[2]} total objects") + print("Total GC time: {:.2f} s".format(total_gc_time)) + +if __name__ == "__main__": + main()