diff --git a/acro/acro_tables.py b/acro/acro_tables.py
index c4e17da..c3ec6f6 100644
--- a/acro/acro_tables.py
+++ b/acro/acro_tables.py
@@ -527,6 +527,193 @@ def survival_plot(  # pylint: disable=too-many-arguments,too-many-locals
         )
         return plot
 
+    def hist(  # pylint: disable=too-many-arguments,too-many-locals
+        self,
+        data,
+        column,
+        by=None,
+        grid=True,
+        xlabelsize=None,
+        xrot=None,
+        ylabelsize=None,
+        yrot=None,
+        ax=None,
+        sharex=False,
+        sharey=False,
+        figsize=None,
+        layout=None,
+        bins=10,
+        backend=None,
+        legend=False,
+        filename="histogram.png",
+        **kwargs,
+    ):
+        """Create a histogram from a single column and record it as an output.
+
+        The dataset and the column's name are passed as parameters. Only one
+        column is supported; if a list is passed a message is logged and no
+        histogram is calculated.
+
+        Every bin frequency is compared with THRESHOLD. If any bin is below
+        it the output status is "fail", otherwise it is "review". A failing
+        histogram is only drawn when suppression is switched off; with
+        suppression on, a blank figure is saved instead.
+
+        The plot is saved in the acro_artifacts directory. If no filename is
+        specified 'histogram.png' is used. A number is appended automatically
+        to the filename to avoid overwriting existing files.
+
+        Parameters
+        ----------
+        data : DataFrame
+            The pandas object holding the data.
+        column : str
+            The column that will be used to plot the histogram.
+        by : object, optional
+            If passed, then used to form histograms for separate groups.
+        grid : bool, default True
+            Whether to show axis grid lines.
+        xlabelsize : int, default None
+            If specified changes the x-axis label size.
+        xrot : float, default None
+            Rotation of x axis labels. For example, a value of 90 displays
+            the x labels rotated 90 degrees clockwise.
+        ylabelsize : int, default None
+            If specified changes the y-axis label size.
+        yrot : float, default None
+            Rotation of y axis labels. For example, a value of 90 displays
+            the y labels rotated 90 degrees clockwise.
+        ax : Matplotlib axes object, default None
+            The axes to plot the histogram on.
+        sharex : bool, default False
+            In case subplots=True, share x axis and set some x axis labels to
+            invisible. Note that passing in both an ax and sharex=True will
+            alter all x axis labels for all subplots in a figure.
+        sharey : bool, default False
+            In case subplots=True, share y axis and set some y axis labels to
+            invisible.
+        figsize : tuple, optional
+            The size in inches of the figure to create.
+            Uses the value in matplotlib.rcParams by default.
+        layout : tuple, optional
+            Tuple of (rows, columns) for the layout of the histograms.
+        bins : int or sequence, default 10
+            Number of histogram bins to be used. If an integer is given,
+            bins + 1 bin edges are calculated. If bins is a sequence, gives
+            bin edges, including left edge of first bin and right edge of
+            last bin.
+        backend : str, default None
+            Backend to use instead of the backend specified in the option
+            plotting.backend. For instance, 'matplotlib'. Alternatively, to
+            specify the plotting.backend for the whole session, set
+            pd.options.plotting.backend.
+        legend : bool, default False
+            Whether to show the legend.
+        filename : str, default "histogram.png"
+            The name of the file where the plot will be saved. It must
+            include a file extension.
+        **kwargs
+            Additional keyword arguments forwarded to DataFrame.hist.
+
+        Returns
+        -------
+        None
+            The saved file is added to the results rather than returned.
+        """
+        logger.debug("hist()")
+        command: str = utils.get_command("hist()", stack())
+
+        if isinstance(data, list) or isinstance(column, list):  # pragma: no cover
+            logger.info(
+                "Calculating histogram for more than one columns is "
+                "not currently supported. Please do each column separately."
+            )
+            return
+
+        # validate the filename before plotting, so that a bad name does not
+        # leave a drawn but unsaved and unrecorded figure behind
+        stem, extension = os.path.splitext(filename)
+        if not extension:  # pragma: no cover
+            logger.info("Please provide a valid file extension")
+            return
+
+        min_value = data[column].min()
+        max_value = data[column].max()
+        freq, _ = np.histogram(  # pylint: disable=too-many-function-args
+            data[column], bins, range=(min_value, max_value)
+        )
+
+        # threshold check: a bin with fewer than THRESHOLD values is disclosive
+        # NOTE(review): the check uses the whole column; when `by` is passed
+        # the per-group histograms may hold smaller counts - confirm intent
+        disclosive = bool(np.any(freq < THRESHOLD))
+        status = "fail" if disclosive else "review"
+
+        # plot the histogram
+        if disclosive and self.suppress:
+            logger.warning(
+                "Histogram will not be shown as the %s column is disclosive.",
+                column,
+            )
+            # start a blank figure so that savefig() below cannot write out a
+            # previously drawn (possibly disclosive) current figure
+            plt.figure()
+        else:
+            data.hist(
+                column=column,
+                by=by,
+                grid=grid,
+                xlabelsize=xlabelsize,
+                xrot=xrot,
+                ylabelsize=ylabelsize,
+                yrot=yrot,
+                ax=ax,
+                sharex=sharex,
+                sharey=sharey,
+                figsize=figsize,
+                layout=layout,
+                bins=bins,
+                backend=backend,
+                legend=legend,
+                **kwargs,
+            )
+        logger.info("status: %s", status)
+
+        # create the summary
+        summary = (
+            f"Please check the minimum and the maximum values. "
+            f"The minimum value of the {column} column is: {min_value}. "
+            f"The maximum value of the {column} column is: {max_value}"
+        )
+
+        # create the acro_artifacts directory to save the plot in it
+        try:
+            os.makedirs("acro_artifacts")
+            logger.debug("Directory acro_artifacts created successfully")
+        except FileExistsError:  # pragma: no cover
+            logger.debug("Directory acro_artifacts already exists")
+
+        # create a unique filename with number to avoid overwrite
+        increment_number = 0
+        while os.path.exists(f"acro_artifacts/{stem}_{increment_number}{extension}"):
+            increment_number += 1
+        unique_filename = f"acro_artifacts/{stem}_{increment_number}{extension}"
+
+        # save the plot to the acro artifacts directory
+        plt.savefig(unique_filename)
+
+        # record output
+        self.results.add(
+            status=status,
+            output_type="histogram",
+            properties={"method": "histogram"},
+            sdc={},
+            command=command,
+            summary=summary,
+            outcome=pd.DataFrame(),
+            output=[os.path.normpath(unique_filename)],
+        )
+
 
 def create_crosstab_masks(  # pylint: disable=too-many-arguments,too-many-locals
     index,
diff --git a/acro/record.py b/acro/record.py
index ecdf4ff..c39a54b 100644
--- a/acro/record.py
+++ b/acro/record.py
@@ -175,9 +175,11 @@ def serialize_output(self, path: str = "outputs") -> list[str]:
                 if os.path.exists(filename):
                     shutil.copy(filename, path)
                     output.append(Path(filename).name)
-        if self.output_type == "survival plot":
+        if self.output_type in ["survival plot", "histogram"]:
             for filename in self.output:
-                output.append(Path(filename).name)
+                if os.path.exists(filename):
+                    output.append(Path(filename).name)
+                    shutil.copy(filename, path)
         return output
 
     def __str__(self) -> str:
@@ -446,9 +448,10 @@ def finalise(self, path: str, ext: str) -> None:
             self.finalise_excel(path)
         else:
             raise ValueError("Invalid file extension. 
Options: {json, xlsx}") - if os.path.exists("acro_artifacts"): - add_acro_artifacts(path) self.write_checksums(path) + # check if the directory acro_artifacts exists and delete it + if os.path.exists("acro_artifacts"): + shutil.rmtree("acro_artifacts") logger.info("outputs written to: %s", path) def finalise_json(self, path: str) -> None: @@ -565,20 +568,6 @@ def write_checksums(self, path: str) -> None: logger.debug("There is no file to do the checksums") # pragma: no cover -def add_acro_artifacts(path: str) -> None: - """Copy any file from the acro_artifacts directory to the output - directory then delete the directory. - - Parameters - ---------- - path : str - Name of the folder that files are to be written. - """ - for filename in os.listdir("acro_artifacts"): - shutil.copy(f"acro_artifacts/{filename}", path) - shutil.rmtree("acro_artifacts") - - def load_records(path: str) -> Records: """Loads outputs from a JSON file. diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb index 5b216be..3a30abe 100644 --- a/notebooks/test.ipynb +++ b/notebooks/test.ipynb @@ -643,7 +643,7 @@ " \n", " \n", " 2011\n", - " 8502247.0\n", + " 8502246.0\n", " 124013.859375\n", " 7716880.0\n", " NaN\n", @@ -666,7 +666,7 @@ " 2014\n", " 13748147.0\n", " 133198.250000\n", - " 8277525.5\n", + " 8277525.0\n", " NaN\n", " \n", " \n", @@ -684,10 +684,10 @@ "grant_type G N R R/G\n", "year \n", "2010 9921906.0 NaN 8402284.0 NaN\n", - "2011 8502247.0 124013.859375 7716880.0 NaN\n", + "2011 8502246.0 124013.859375 7716880.0 NaN\n", "2012 11458580.0 131859.062500 6958050.5 NaN\n", "2013 13557147.0 147937.796875 7202273.5 NaN\n", - "2014 13748147.0 133198.250000 8277525.5 NaN\n", + "2014 13748147.0 133198.250000 8277525.0 NaN\n", "2015 11133433.0 146572.187500 10812888.0 NaN" ] }, @@ -721,7 +721,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:[\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", + "INFO:acro:Empty columns: ('N', 'Dead in 
2015'), ('R/G', 'Dead in 2015') were deleted.\n", "INFO:acro:get_summary(): fail; threshold: 14 cells may need suppressing; p-ratio: 8 cells may need suppressing; nk-rule: 7 cells may need suppressing; \n", "INFO:acro:outcome_df:\n", "------------------------------------------------------------------------------------------------------------------------------------------------|\n", @@ -870,7 +870,7 @@ " \n", " \n", " 2011\n", - " 8502247.0\n", + " 8502246.0\n", " 124013.859375\n", " 7716880.0\n", " 16047500.0\n", @@ -893,7 +893,7 @@ " 2014\n", " 13748147.0\n", " 133198.250000\n", - " 8277525.5\n", + " 8277525.0\n", " 17845750.0\n", " \n", " \n", @@ -911,10 +911,10 @@ "grant_type G N R R/G\n", "year \n", "2010 9921906.0 0.000000 8402284.0 11636000.0\n", - "2011 8502247.0 124013.859375 7716880.0 16047500.0\n", + "2011 8502246.0 124013.859375 7716880.0 16047500.0\n", "2012 11458580.0 131859.062500 6958050.5 16810000.0\n", "2013 13557147.0 147937.796875 7202273.5 16765625.0\n", - "2014 13748147.0 133198.250000 8277525.5 17845750.0\n", + "2014 13748147.0 133198.250000 8277525.0 17845750.0\n", "2015 11133433.0 146572.187500 10812888.0 18278624.0" ] }, @@ -938,10 +938,224 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "fb7abfc9-e428-4b71-9066-01ac9a08d655", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:get_summary(): fail; threshold: 14 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 2 cells may need suppressing; \n", + "INFO:acro:outcome_df:\n", + "----------------------------------------------------------------------------------------------------------------------------------------------|\n", + " mean |std |\n", + "grant_type G N R R/G All |G N R R/G All|\n", + "year | |\n", + "----------------------------------------------------------------------------------------------------------------------------------------------|\n", + 
"2010 ok threshold; p-ratio; ok threshold; p-ratio; nk-rule; ok | ok threshold; p-ratio; ok threshold; p-ratio; nk-rule; ok|\n", + "2011 ok ok ok threshold; ok | ok ok ok threshold; ok|\n", + "2012 ok ok ok threshold; ok | ok ok ok threshold; ok|\n", + "2013 ok ok ok threshold; ok | ok ok ok threshold; ok|\n", + "2014 ok ok ok threshold; ok | ok ok ok threshold; ok|\n", + "2015 ok ok ok threshold; ok | ok ok ok threshold; ok|\n", + "All ok ok ok ok ok | ok ok ok ok ok|\n", + "----------------------------------------------------------------------------------------------------------------------------------------------|\n", + "\n", + "INFO:acro:records:add(): output_4\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanstd
grant_typeGNRR/GAllGNRR/GAll
year
20109921906.00.0000008402284.011636000.08308286.51.855055e+070.0000003.059557e+071.701088e+072.727398e+07
20118502246.0124013.8593757716880.016047500.05303808.51.688595e+07205959.4929032.954322e+071.561638e+072.137658e+07
201211458580.0131859.0625006958050.516810000.05259893.52.061090e+07210476.5391752.721184e+071.646449e+072.026400e+07
201313557147.0147937.7968757202273.516765625.05605045.52.486844e+07203747.4170172.989833e+071.671112e+072.251787e+07
201413748147.0133198.2500008277525.017845750.06117054.53.134559e+07181865.9255803.546348e+071.741251e+072.641722e+07
201511133433.0146572.18750010812888.018278624.06509989.52.553919e+07201602.8008324.130935e+071.730471e+072.784636e+07
All11412787.0134431.8906258098502.016648273.05997796.52.283220e+07198873.7266563.204495e+071.583532e+072.405324e+07
\n", + "
" + ], + "text/plain": [ + " mean \\\n", + "grant_type G N R R/G All \n", + "year \n", + "2010 9921906.0 0.000000 8402284.0 11636000.0 8308286.5 \n", + "2011 8502246.0 124013.859375 7716880.0 16047500.0 5303808.5 \n", + "2012 11458580.0 131859.062500 6958050.5 16810000.0 5259893.5 \n", + "2013 13557147.0 147937.796875 7202273.5 16765625.0 5605045.5 \n", + "2014 13748147.0 133198.250000 8277525.0 17845750.0 6117054.5 \n", + "2015 11133433.0 146572.187500 10812888.0 18278624.0 6509989.5 \n", + "All 11412787.0 134431.890625 8098502.0 16648273.0 5997796.5 \n", + "\n", + " std \\\n", + "grant_type G N R R/G \n", + "year \n", + "2010 1.855055e+07 0.000000 3.059557e+07 1.701088e+07 \n", + "2011 1.688595e+07 205959.492903 2.954322e+07 1.561638e+07 \n", + "2012 2.061090e+07 210476.539175 2.721184e+07 1.646449e+07 \n", + "2013 2.486844e+07 203747.417017 2.989833e+07 1.671112e+07 \n", + "2014 3.134559e+07 181865.925580 3.546348e+07 1.741251e+07 \n", + "2015 2.553919e+07 201602.800832 4.130935e+07 1.730471e+07 \n", + "All 2.283220e+07 198873.726656 3.204495e+07 1.583532e+07 \n", + "\n", + " \n", + "grant_type All \n", + "year \n", + "2010 2.727398e+07 \n", + "2011 2.137658e+07 \n", + "2012 2.026400e+07 \n", + "2013 2.251787e+07 \n", + "2014 2.641722e+07 \n", + "2015 2.784636e+07 \n", + "All 2.405324e+07 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "safe_table = acro.crosstab(\n", " df.year, df.grant_type, values=df.inc_grants, aggfunc=[\"mean\", \"std\"], margins=True\n", @@ -966,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "bf132239", "metadata": {}, "outputs": [ @@ -989,7 +1203,7 @@ "All | | | | | |\n", "-------------------------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_4\n" + "INFO:acro:records:add(): output_5\n" ] }, { @@ -1033,15 +1247,15 @@ " 2010\n", " 9921906.0\n", " 0.000000\n", - " 8420372.0\n", + " 8420373.0\n", " 
11636000.0\n", " 8320154.5\n", " \n", " \n", " 2011\n", - " 8502247.0\n", + " 8502246.0\n", " 125663.226562\n", - " 7689140.5\n", + " 7689140.0\n", " 16047500.0\n", " 5310392.0\n", " \n", @@ -1057,7 +1271,7 @@ " 2013\n", " 13557147.0\n", " 150488.453125\n", - " 7088096.0\n", + " 7088095.5\n", " 16765625.0\n", " 5578657.0\n", " \n", @@ -1065,7 +1279,7 @@ " 2014\n", " 13748147.0\n", " 135494.781250\n", - " 8118565.0\n", + " 8118565.5\n", " 17845750.0\n", " 6072600.0\n", " \n", @@ -1081,7 +1295,7 @@ " All\n", " 11412787.0\n", " 136158.859375\n", - " 8006360.5\n", + " 8006361.0\n", " 16648273.0\n", " 5968295.5\n", " \n", @@ -1092,16 +1306,16 @@ "text/plain": [ "grant_type G N R R/G All\n", "year \n", - "2010 9921906.0 0.000000 8420372.0 11636000.0 8320154.5\n", - "2011 8502247.0 125663.226562 7689140.5 16047500.0 5310392.0\n", + "2010 9921906.0 0.000000 8420373.0 11636000.0 8320154.5\n", + "2011 8502246.0 125663.226562 7689140.0 16047500.0 5310392.0\n", "2012 11458580.0 131859.062500 6896304.0 16810000.0 5220580.5\n", - "2013 13557147.0 150488.453125 7088096.0 16765625.0 5578657.0\n", - "2014 13748147.0 135494.781250 8118565.0 17845750.0 6072600.0\n", + "2013 13557147.0 150488.453125 7088095.5 16765625.0 5578657.0\n", + "2014 13748147.0 135494.781250 8118565.5 17845750.0 6072600.0\n", "2015 11133433.0 149143.625000 10596385.0 18278624.0 6442131.0\n", - "All 11412787.0 136158.859375 8006360.5 16648273.0 5968295.5" + "All 11412787.0 136158.859375 8006361.0 16648273.0 5968295.5" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1120,7 +1334,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "7cc417a0", "metadata": {}, "outputs": [], @@ -1138,7 +1352,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "15bcdc7c", "metadata": {}, "outputs": [ @@ -1160,7 +1374,7 @@ "2015 | | negative | negative | |\n", 
"----------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_5\n" + "INFO:acro:records:add(): output_6\n" ] }, { @@ -1207,7 +1421,7 @@ " \n", " \n", " 2011\n", - " 8502247.0\n", + " 8502246.0\n", " 123496.445312\n", " 7577703.5\n", " 16047500.0\n", @@ -1223,21 +1437,21 @@ " 2013\n", " 13557147.0\n", " 147937.625000\n", - " 6988263.5\n", + " 6988263.0\n", " 16765625.0\n", " \n", " \n", " 2014\n", " 13748147.0\n", " 133198.078125\n", - " 7997392.5\n", + " 7997392.0\n", " 17845750.0\n", " \n", " \n", " 2015\n", " 11133433.0\n", " 146572.015625\n", - " 10388613.0\n", + " 10388612.0\n", " 18278624.0\n", " \n", " \n", @@ -1248,14 +1462,14 @@ "grant_type G N R R/G\n", "year \n", "2010 9921906.0 0.000000 8280032.5 11636000.0\n", - "2011 8502247.0 123496.445312 7577703.5 16047500.0\n", + "2011 8502246.0 123496.445312 7577703.5 16047500.0\n", "2012 11458580.0 131859.062500 6796357.5 16810000.0\n", - "2013 13557147.0 147937.625000 6988263.5 16765625.0\n", - "2014 13748147.0 133198.078125 7997392.5 17845750.0\n", - "2015 11133433.0 146572.015625 10388613.0 18278624.0" + "2013 13557147.0 147937.625000 6988263.0 16765625.0\n", + "2014 13748147.0 133198.078125 7997392.0 17845750.0\n", + "2015 11133433.0 146572.015625 10388612.0 18278624.0" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1278,7 +1492,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "b13b5f7e", "metadata": {}, "outputs": [ @@ -1300,7 +1514,7 @@ "All |\n", "------------------------------------------------------------------|\n", "\n", - "INFO:acro:records:add(): output_6\n" + "INFO:acro:records:add(): output_7\n" ] }, { @@ -1375,9 +1589,9 @@ " \n", " R\n", " 504137056.0\n", - " 532464736.0\n", + " 532464704.0\n", " 480105472.0\n", - " 511361440.0\n", + " 511361408.0\n", " 554594176.0\n", " 551457280.0\n", " 3.134120e+09\n", @@ -1395,8 +1609,8 @@ " \n", " All\n", " 689587776.0\n", - " 
795571200.0\n", - " 794243840.0\n", + " 795571264.0\n", + " 794243904.0\n", " 857571968.0\n", " 911441088.0\n", " 839788672.0\n", @@ -1412,9 +1626,9 @@ "grant_type \n", "G 138906688.0 127533696.0 171878704.0 203357200.0 206222208.0 \n", "N 0.0 7192804.0 7779685.0 8728330.0 7858697.0 \n", - "R 504137056.0 532464736.0 480105472.0 511361440.0 554594176.0 \n", + "R 504137056.0 532464704.0 480105472.0 511361408.0 554594176.0 \n", "R/G 46544000.0 128380000.0 134480000.0 134125000.0 142766000.0 \n", - "All 689587776.0 795571200.0 794243840.0 857571968.0 911441088.0 \n", + "All 689587776.0 795571264.0 794243904.0 857571968.0 911441088.0 \n", "\n", " \n", "year 2015 All \n", @@ -1426,7 +1640,7 @@ "All 839788672.0 4.888204e+09 " ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1445,7 +1659,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "3f016823", "metadata": {}, "outputs": [ @@ -1513,9 +1727,9 @@ " \n", " R\n", " 504137056.0\n", - " 532464736.0\n", + " 532464704.0\n", " 480105472.0\n", - " 511361440.0\n", + " 511361408.0\n", " 554594176.0\n", " 551457280.0\n", " 3.134120e+09\n", @@ -1533,8 +1747,8 @@ " \n", " All\n", " 689587776.0\n", - " 795571200.0\n", - " 794243840.0\n", + " 795571264.0\n", + " 794243904.0\n", " 857571968.0\n", " 911441088.0\n", " 839788672.0\n", @@ -1549,9 +1763,9 @@ "grant_type \n", "G 138906688.0 127533696.0 171878704.0 203357200.0 206222208.0 \n", "N 0.0 7192804.0 7779685.0 8728330.0 7858697.0 \n", - "R 504137056.0 532464736.0 480105472.0 511361440.0 554594176.0 \n", + "R 504137056.0 532464704.0 480105472.0 511361408.0 554594176.0 \n", "R/G 46544000.0 128380000.0 134480000.0 134125000.0 142766000.0 \n", - "All 689587776.0 795571200.0 794243840.0 857571968.0 911441088.0 \n", + "All 689587776.0 795571264.0 794243904.0 857571968.0 911441088.0 \n", "\n", "year 2015 All \n", "grant_type \n", @@ -1562,7 +1776,7 @@ "All 839788672.0 4.888204e+09 " ] }, - 
"execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1580,7 +1794,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "6d4730c4", "metadata": {}, "outputs": [ @@ -1601,7 +1815,7 @@ "R/G missing | missing |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_7\n" + "INFO:acro:records:add(): output_8\n" ] }, { @@ -1678,7 +1892,7 @@ "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1700,7 +1914,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "f3a87c20", "metadata": {}, "outputs": [ @@ -1721,7 +1935,7 @@ "R/G missing | missing |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_8\n" + "INFO:acro:records:add(): output_9\n" ] }, { @@ -1776,7 +1990,7 @@ " \n", " \n", " R\n", - " 8.006360e+06\n", + " 8.006361e+06\n", " 3.228216e+07\n", " \n", " \n", @@ -1794,11 +2008,11 @@ "grant_type \n", "G 1.141279e+07 2.283220e+07\n", "N 1.364700e+05 1.999335e+05\n", - "R 8.006360e+06 3.228216e+07\n", + "R 8.006361e+06 3.228216e+07\n", "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1816,7 +2030,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "8b603548", "metadata": {}, "outputs": [], @@ -1834,7 +2048,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "de4266cd-b4d4-417b-ae44-5d972e8bfdde", "metadata": {}, "outputs": [ @@ -1855,7 +2069,7 @@ "R/G | |\n", "---------------------------------|\n", "\n", - "INFO:acro:records:add(): output_9\n" + "INFO:acro:records:add(): output_10\n" ] }, { @@ -1910,7 +2124,7 @@ " \n", " \n", " R\n", - " 7.882231e+06\n", + " 7.882230e+06\n", " 3.204558e+07\n", " \n", " \n", @@ -1928,11 +2142,11 @@ "grant_type \n", "G 
1.141279e+07 2.283220e+07\n", "N 1.341800e+05 1.990196e+05\n", - "R 7.882231e+06 3.204558e+07\n", + "R 7.882230e+06 3.204558e+07\n", "R/G 1.664827e+07 1.583532e+07" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1956,7 +2170,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "a521cb83", "metadata": {}, "outputs": [ @@ -1971,7 +2185,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:records:add(): output_10\n" + "INFO:acro:records:add(): output_11\n" ] }, { @@ -1989,10 +2203,10 @@ " Method: Least Squares F-statistic: 2261. \n", "\n", "\n", - " Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00 \n", + " Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.00 \n", "\n", "\n", - " Time: 18:04:09 Log-Likelihood: -14495. \n", + " Time: 10:45:22 Log-Likelihood: -14495. \n", "\n", "\n", " No. Observations: 811 AIC: 2.900e+04\n", @@ -2047,8 +2261,8 @@ "Dep. Variable: inc_activity R-squared: 0.894\n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.\n", - "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00\n", - "Time: 18:04:09 Log-Likelihood: -14495.\n", + "Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.00\n", + "Time: 10:45:22 Log-Likelihood: -14495.\n", "No. Observations: 811 AIC: 2.900e+04\n", "Df Residuals: 807 BIC: 2.902e+04\n", "Df Model: 3 \n", @@ -2074,7 +2288,7 @@ "\"\"\"" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2101,7 +2315,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "cc90f7c9", "metadata": {}, "outputs": [ @@ -2116,7 +2330,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:records:add(): output_11\n" + "INFO:acro:records:add(): output_12\n" ] }, { @@ -2134,10 +2348,10 @@ " Method: Least Squares F-statistic: 2261. 
\n", "\n", "\n", - " Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00 \n", + " Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.00 \n", "\n", "\n", - " Time: 18:04:09 Log-Likelihood: -14495. \n", + " Time: 10:45:22 Log-Likelihood: -14495. \n", "\n", "\n", " No. Observations: 811 AIC: 2.900e+04\n", @@ -2192,8 +2406,8 @@ "Dep. Variable: inc_activity R-squared: 0.894\n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.\n", - "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.00\n", - "Time: 18:04:09 Log-Likelihood: -14495.\n", + "Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.00\n", + "Time: 10:45:22 Log-Likelihood: -14495.\n", "No. Observations: 811 AIC: 2.900e+04\n", "Df Residuals: 807 BIC: 2.902e+04\n", "Df Model: 3 \n", @@ -2219,7 +2433,7 @@ "\"\"\"" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2241,7 +2455,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "5b1a1611", "metadata": {}, "outputs": [ @@ -2256,7 +2470,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:records:add(): output_12\n" + "INFO:acro:records:add(): output_13\n" ] }, { @@ -2283,10 +2497,10 @@ " Method: MLE Df Model: 4 \n", "\n", "\n", - " Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2140 \n", + " Date: Wed, 18 Oct 2023 Pseudo R-squ.: 0.2140 \n", "\n", "\n", - " Time: 18:04:09 Log-Likelihood: -400.46 \n", + " Time: 10:45:22 Log-Likelihood: -400.46 \n", "\n", "\n", " converged: True LL-Null: -509.50 \n", @@ -2324,8 +2538,8 @@ "Dep. Variable: survivor No. 
Observations: 811\n", "Model: Probit Df Residuals: 806\n", "Method: MLE Df Model: 4\n", - "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2140\n", - "Time: 18:04:09 Log-Likelihood: -400.46\n", + "Date: Wed, 18 Oct 2023 Pseudo R-squ.: 0.2140\n", + "Time: 10:45:22 Log-Likelihood: -400.46\n", "converged: True LL-Null: -509.50\n", "Covariance Type: nonrobust LLR p-value: 4.875e-46\n", "=================================================================================\n", @@ -2344,7 +2558,7 @@ "\"\"\"" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2372,7 +2586,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "dcf30f8f", "metadata": {}, "outputs": [ @@ -2380,14 +2594,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:logit() outcome: pass; dof=806.0 >= 10\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:acro:records:add(): output_13\n" + "INFO:acro:logit() outcome: pass; dof=806.0 >= 10\n", + "INFO:acro:records:add(): output_14\n" ] }, { @@ -2414,10 +2622,10 @@ " Method: MLE Df Model: 4 \n", "\n", "\n", - " Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2187 \n", + " Date: Wed, 18 Oct 2023 Pseudo R-squ.: 0.2187 \n", "\n", "\n", - " Time: 18:04:09 Log-Likelihood: -398.07 \n", + " Time: 10:45:22 Log-Likelihood: -398.07 \n", "\n", "\n", " converged: True LL-Null: -509.50 \n", @@ -2455,8 +2663,8 @@ "Dep. Variable: survivor No. 
Observations: 811\n", "Model: Logit Df Residuals: 806\n", "Method: MLE Df Model: 4\n", - "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 0.2187\n", - "Time: 18:04:09 Log-Likelihood: -398.07\n", + "Date: Wed, 18 Oct 2023 Pseudo R-squ.: 0.2187\n", + "Time: 10:45:22 Log-Likelihood: -398.07\n", "converged: True LL-Null: -509.50\n", "Covariance Type: nonrobust LLR p-value: 4.532e-47\n", "=================================================================================\n", @@ -2475,7 +2683,7 @@ "\"\"\"" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2485,6 +2693,97 @@ "results.summary()" ] }, + { + "cell_type": "markdown", + "id": "3631a59d", + "metadata": {}, + "source": [ + "### ACRO Histogram without suppression" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "af2f4313", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:status: fail\n", + "INFO:acro:records:add(): output_15\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "hist = acro.hist(df, \"inc_grants\")" + ] + }, + { + "cell_type": "markdown", + "id": "5faf9a98", + "metadata": {}, + "source": [ + "### ACRO Histogram with suppression" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "349d8a29", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:acro:Histogram will not be shown as the inc_grants column is disclosive.\n", + "INFO:acro:status: fail\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:add(): output_16\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "acro.suppress = True\n", + "hist = acro.hist(df, \"inc_grants\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ab0fe892", + "metadata": {}, + "outputs": [], + "source": [ + "acro.suppress = False" + ] + }, { "cell_type": "markdown", "id": "dc99fa71", @@ -2495,7 +2794,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 29, "id": "ec960039", "metadata": { "scrolled": true @@ -2528,7 +2827,7 @@ "2013 15 59 71 8\n", "2014 15 59 71 8\n", "2015 15 59 71 8]\n", - "timestamp: 2023-10-05T18:03:53.064163\n", + "timestamp: 2023-10-18T10:45:20.095974\n", "comments: []\n", "exception: \n", "\n", @@ -2550,12 +2849,12 @@ "output: [grant_type G N R R/G\n", "year \n", "2010 9921906.0 NaN 8402284.0 NaN\n", - "2011 8502247.0 124013.859375 7716880.0 NaN\n", + "2011 8502246.0 124013.859375 7716880.0 NaN\n", "2012 11458580.0 131859.062500 6958050.5 NaN\n", "2013 13557147.0 147937.796875 7202273.5 NaN\n", - "2014 13748147.0 133198.250000 8277525.5 NaN\n", + "2014 13748147.0 133198.250000 8277525.0 NaN\n", "2015 11133433.0 146572.187500 10812888.0 NaN]\n", - "timestamp: 2023-10-05T18:03:54.913352\n", + "timestamp: 2023-10-18T10:45:20.272862\n", "comments: []\n", "exception: \n", "\n", @@ -2608,7 +2907,7 @@ "2014 24 8 149 \n", "2015 23 8 129 \n", "All 139 44 815 ]\n", - "timestamp: 2023-10-05T18:03:56.973956\n", + "timestamp: 2023-10-18T10:45:20.500344\n", "comments: [\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", "exception: \n", "\n", @@ -2630,16 +2929,80 @@ "output: [grant_type G N R R/G\n", "year \n", "2010 9921906.0 0.000000 8402284.0 11636000.0\n", - "2011 8502247.0 124013.859375 7716880.0 16047500.0\n", + "2011 8502246.0 124013.859375 7716880.0 16047500.0\n", "2012 11458580.0 131859.062500 6958050.5 16810000.0\n", "2013 13557147.0 147937.796875 7202273.5 16765625.0\n", - "2014 13748147.0 133198.250000 8277525.5 
17845750.0\n", + "2014 13748147.0 133198.250000 8277525.0 17845750.0\n", "2015 11133433.0 146572.187500 10812888.0 18278624.0]\n", - "timestamp: 2023-10-05T18:04:01.751627\n", + "timestamp: 2023-10-18T10:45:20.652740\n", "comments: []\n", "exception: \n", "\n", "uid: output_4\n", + "status: fail\n", + "type: table\n", + "properties: {'method': 'crosstab'}\n", + "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 0, 'threshold': 14, 'p-ratio': 4, 'nk-rule': 2}, 'cells': {'negative': [], 'missing': [], 'threshold': [[0, 1], [0, 3], [0, 6], [0, 8], [1, 3], [1, 8], [2, 3], [2, 8], [3, 3], [3, 8], [4, 3], [4, 8], [5, 3], [5, 8]], 'p-ratio': [[0, 1], [0, 3], [0, 6], [0, 8]], 'nk-rule': [[0, 3], [0, 8]]}}\n", + "command: safe_table = acro.crosstab(\n", + "summary: fail; threshold: 14 cells may need suppressing; p-ratio: 4 cells may need suppressing; nk-rule: 2 cells may need suppressing; \n", + "outcome: mean \\\n", + "grant_type G N R R/G All \n", + "year \n", + "2010 ok threshold; p-ratio; ok threshold; p-ratio; nk-rule; ok \n", + "2011 ok ok ok threshold; ok \n", + "2012 ok ok ok threshold; ok \n", + "2013 ok ok ok threshold; ok \n", + "2014 ok ok ok threshold; ok \n", + "2015 ok ok ok threshold; ok \n", + "All ok ok ok ok ok \n", + "\n", + " std \n", + "grant_type G N R R/G All \n", + "year \n", + "2010 ok threshold; p-ratio; ok threshold; p-ratio; nk-rule; ok \n", + "2011 ok ok ok threshold; ok \n", + "2012 ok ok ok threshold; ok \n", + "2013 ok ok ok threshold; ok \n", + "2014 ok ok ok threshold; ok \n", + "2015 ok ok ok threshold; ok \n", + "All ok ok ok ok ok \n", + "output: [ mean \\\n", + "grant_type G N R R/G All \n", + "year \n", + "2010 9921906.0 0.000000 8402284.0 11636000.0 8308286.5 \n", + "2011 8502246.0 124013.859375 7716880.0 16047500.0 5303808.5 \n", + "2012 11458580.0 131859.062500 6958050.5 16810000.0 5259893.5 \n", + "2013 13557147.0 147937.796875 7202273.5 16765625.0 5605045.5 \n", + "2014 13748147.0 133198.250000 8277525.0 17845750.0 
6117054.5 \n", + "2015 11133433.0 146572.187500 10812888.0 18278624.0 6509989.5 \n", + "All 11412787.0 134431.890625 8098502.0 16648273.0 5997796.5 \n", + "\n", + " std \\\n", + "grant_type G N R R/G \n", + "year \n", + "2010 1.855055e+07 0.000000 3.059557e+07 1.701088e+07 \n", + "2011 1.688595e+07 205959.492903 2.954322e+07 1.561638e+07 \n", + "2012 2.061090e+07 210476.539175 2.721184e+07 1.646449e+07 \n", + "2013 2.486844e+07 203747.417017 2.989833e+07 1.671112e+07 \n", + "2014 3.134559e+07 181865.925580 3.546348e+07 1.741251e+07 \n", + "2015 2.553919e+07 201602.800832 4.130935e+07 1.730471e+07 \n", + "All 2.283220e+07 198873.726656 3.204495e+07 1.583532e+07 \n", + "\n", + " \n", + "grant_type All \n", + "year \n", + "2010 2.727398e+07 \n", + "2011 2.137658e+07 \n", + "2012 2.026400e+07 \n", + "2013 2.251787e+07 \n", + "2014 2.641722e+07 \n", + "2015 2.784636e+07 \n", + "All 2.405324e+07 ]\n", + "timestamp: 2023-10-18T10:45:20.938806\n", + "comments: []\n", + "exception: \n", + "\n", + "uid: output_5\n", "status: review\n", "type: table\n", "properties: {'method': 'crosstab'}\n", @@ -2657,18 +3020,18 @@ "All \n", "output: [grant_type G N R R/G All\n", "year \n", - "2010 9921906.0 0.000000 8420372.0 11636000.0 8320154.5\n", - "2011 8502247.0 125663.226562 7689140.5 16047500.0 5310392.0\n", + "2010 9921906.0 0.000000 8420373.0 11636000.0 8320154.5\n", + "2011 8502246.0 125663.226562 7689140.0 16047500.0 5310392.0\n", "2012 11458580.0 131859.062500 6896304.0 16810000.0 5220580.5\n", - "2013 13557147.0 150488.453125 7088096.0 16765625.0 5578657.0\n", - "2014 13748147.0 135494.781250 8118565.0 17845750.0 6072600.0\n", + "2013 13557147.0 150488.453125 7088095.5 16765625.0 5578657.0\n", + "2014 13748147.0 135494.781250 8118565.5 17845750.0 6072600.0\n", "2015 11133433.0 149143.625000 10596385.0 18278624.0 6442131.0\n", - "All 11412787.0 136158.859375 8006360.5 16648273.0 5968295.5]\n", - "timestamp: 2023-10-05T18:04:05.126101\n", + "All 11412787.0 136158.859375 
8006361.0 16648273.0 5968295.5]\n", + "timestamp: 2023-10-18T10:45:21.145552\n", "comments: []\n", "exception: \n", "\n", - "uid: output_5\n", + "uid: output_6\n", "status: review\n", "type: table\n", "properties: {'method': 'crosstab'}\n", @@ -2686,16 +3049,16 @@ "output: [grant_type G N R R/G\n", "year \n", "2010 9921906.0 0.000000 8280032.5 11636000.0\n", - "2011 8502247.0 123496.445312 7577703.5 16047500.0\n", + "2011 8502246.0 123496.445312 7577703.5 16047500.0\n", "2012 11458580.0 131859.062500 6796357.5 16810000.0\n", - "2013 13557147.0 147937.625000 6988263.5 16765625.0\n", - "2014 13748147.0 133198.078125 7997392.5 17845750.0\n", - "2015 11133433.0 146572.015625 10388613.0 18278624.0]\n", - "timestamp: 2023-10-05T18:04:08.961665\n", + "2013 13557147.0 147937.625000 6988263.0 16765625.0\n", + "2014 13748147.0 133198.078125 7997392.0 17845750.0\n", + "2015 11133433.0 146572.015625 10388612.0 18278624.0]\n", + "timestamp: 2023-10-18T10:45:21.301223\n", "comments: []\n", "exception: \n", "\n", - "uid: output_6\n", + "uid: output_7\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2715,9 +3078,9 @@ "grant_type \n", "G 138906688.0 127533696.0 171878704.0 203357200.0 206222208.0 \n", "N 0.0 7192804.0 7779685.0 8728330.0 7858697.0 \n", - "R 504137056.0 532464736.0 480105472.0 511361440.0 554594176.0 \n", + "R 504137056.0 532464704.0 480105472.0 511361408.0 554594176.0 \n", "R/G 46544000.0 128380000.0 134480000.0 134125000.0 142766000.0 \n", - "All 689587776.0 795571200.0 794243840.0 857571968.0 911441088.0 \n", + "All 689587776.0 795571264.0 794243904.0 857571968.0 911441088.0 \n", "\n", " \n", "year 2015 All \n", @@ -2727,11 +3090,11 @@ "R 551457280.0 3.134120e+09 \n", "R/G 146228992.0 7.325240e+08 \n", "All 839788672.0 4.888204e+09 ]\n", - "timestamp: 2023-10-05T18:04:09.105670\n", + "timestamp: 2023-10-18T10:45:21.498602\n", "comments: []\n", "exception: \n", "\n", - "uid: output_7\n", + "uid: output_8\n", "status: 
review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2752,11 +3115,11 @@ "N 1.344319e+05 1.988737e+05\n", "R 8.098502e+06 3.204495e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.203761\n", + "timestamp: 2023-10-18T10:45:21.682294\n", "comments: []\n", "exception: \n", "\n", - "uid: output_8\n", + "uid: output_9\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2775,13 +3138,13 @@ "grant_type \n", "G 1.141279e+07 2.283220e+07\n", "N 1.364700e+05 1.999335e+05\n", - "R 8.006360e+06 3.228216e+07\n", + "R 8.006361e+06 3.228216e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.264100\n", + "timestamp: 2023-10-18T10:45:21.802799\n", "comments: []\n", "exception: \n", "\n", - "uid: output_9\n", + "uid: output_10\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -2800,13 +3163,13 @@ "grant_type \n", "G 1.141279e+07 2.283220e+07\n", "N 1.341800e+05 1.990196e+05\n", - "R 7.882231e+06 3.204558e+07\n", + "R 7.882230e+06 3.204558e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.342995\n", + "timestamp: 2023-10-18T10:45:21.951148\n", "comments: []\n", "exception: \n", "\n", - "uid: output_10\n", + "uid: output_11\n", "status: pass\n", "type: regression\n", "properties: {'method': 'ols', 'dof': 807.0}\n", @@ -2820,8 +3183,8 @@ "Dep. Variable: \n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.000\n", - "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.000\n", - "Time: 18:04:09 Log-Likelihood: -14495.000\n", + "Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.000\n", + "Time: 10:45:22 Log-Likelihood: -14495.000\n", "No. Observations: 811 AIC: 29000.000\n", "Df Residuals: 807 BIC: 29020.000\n", "Df Model: 3 NaN NaN\n", @@ -2834,11 +3197,11 @@ "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1.253318e+06\n", "Skew: 9.899 Prob(JB): 0.000000e+00\n", "Kurtosis: 194.566 Cond. No. 
1.050000e+08]\n", - "timestamp: 2023-10-05T18:04:09.406745\n", + "timestamp: 2023-10-18T10:45:22.063254\n", "comments: []\n", "exception: \n", "\n", - "uid: output_11\n", + "uid: output_12\n", "status: pass\n", "type: regression\n", "properties: {'method': 'olsr', 'dof': 807.0}\n", @@ -2852,8 +3215,8 @@ "Dep. Variable: \n", "Model: OLS Adj. R-squared: 0.893\n", "Method: Least Squares F-statistic: 2261.000\n", - "Date: Thu, 05 Oct 2023 Prob (F-statistic): 0.000\n", - "Time: 18:04:09 Log-Likelihood: -14495.000\n", + "Date: Wed, 18 Oct 2023 Prob (F-statistic): 0.000\n", + "Time: 10:45:22 Log-Likelihood: -14495.000\n", "No. Observations: 811 AIC: 29000.000\n", "Df Residuals: 807 BIC: 29020.000\n", "Df Model: 3 NaN NaN\n", @@ -2866,11 +3229,11 @@ "Prob(Omnibus): 0.000 Jarque-Bera (JB): 1.253318e+06\n", "Skew: 9.899 Prob(JB): 0.000000e+00\n", "Kurtosis: 194.566 Cond. No. 1.050000e+08]\n", - "timestamp: 2023-10-05T18:04:09.449726\n", + "timestamp: 2023-10-18T10:45:22.159163\n", "comments: []\n", "exception: \n", "\n", - "uid: output_12\n", + "uid: output_13\n", "status: pass\n", "type: regression\n", "properties: {'method': 'probit', 'dof': 806.0}\n", @@ -2884,8 +3247,8 @@ "Dep. 
Variable: \n", "Model: Probit Df Residuals: 8.060000e+02\n", "Method: MLE Df Model: 4.000000e+00\n", - "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 2.140000e-01\n", - "Time: 18:04:09 Log-Likelihood: -4.004600e+02\n", + "Date: Wed, 18 Oct 2023 Pseudo R-squ.: 2.140000e-01\n", + "Time: 10:45:22 Log-Likelihood: -4.004600e+02\n", "converged: True LL-Null: -5.095000e+02\n", "Covariance Type: nonrobust LLR p-value: 4.875000e-46, coef std err z P>|z| [0.025 \\\n", "const 4.740000e-02 5.700000e-02 0.838 0.402 -6.300000e-02 \n", @@ -2900,11 +3263,11 @@ "inc_grants 1.620000e-07 \n", "inc_donations 3.300000e-07 \n", "total_costs -1.440000e-08 ]\n", - "timestamp: 2023-10-05T18:04:09.499724\n", + "timestamp: 2023-10-18T10:45:22.278068\n", "comments: []\n", "exception: \n", "\n", - "uid: output_13\n", + "uid: output_14\n", "status: pass\n", "type: regression\n", "properties: {'method': 'logit', 'dof': 806.0}\n", @@ -2918,8 +3281,8 @@ "Dep. Variable: \n", "Model: Logit Df Residuals: 8.060000e+02\n", "Method: MLE Df Model: 4.000000e+00\n", - "Date: Thu, 05 Oct 2023 Pseudo R-squ.: 2.187000e-01\n", - "Time: 18:04:09 Log-Likelihood: -3.980700e+02\n", + "Date: Wed, 18 Oct 2023 Pseudo R-squ.: 2.187000e-01\n", + "Time: 10:45:22 Log-Likelihood: -3.980700e+02\n", "converged: True LL-Null: -5.095000e+02\n", "Covariance Type: nonrobust LLR p-value: 4.532000e-47, coef std err z P>|z| [0.025 \\\n", "const 5.120000e-02 9.100000e-02 0.561 0.575 -1.280000e-01 \n", @@ -2934,7 +3297,37 @@ "inc_grants 2.660000e-07 \n", "inc_donations 7.160000e-07 \n", "total_costs -2.150000e-08 ]\n", - "timestamp: 2023-10-05T18:04:09.537725\n", + "timestamp: 2023-10-18T10:45:22.363104\n", + "comments: []\n", + "exception: \n", + "\n", + "uid: output_15\n", + "status: fail\n", + "type: histogram\n", + "properties: {'method': 'histogram'}\n", + "sdc: {}\n", + "command: hist = acro.hist(\n", + "summary: Please check the minimum and the maximum values. The minimum value of the inc_grants column is: -10.0. 
The maximum value of the inc_grants column is: 249327008.0\n", + "outcome: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "output: ['acro_artifacts\\\\histogram_0.png']\n", + "timestamp: 2023-10-18T10:45:22.607418\n", + "comments: []\n", + "exception: \n", + "\n", + "uid: output_16\n", + "status: fail\n", + "type: histogram\n", + "properties: {'method': 'histogram'}\n", + "sdc: {}\n", + "command: hist = acro.hist(\n", + "summary: Please check the minimum and the maximum values. The minimum value of the inc_grants column is: -10.0. The maximum value of the inc_grants column is: 249327008.0\n", + "outcome: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "output: ['acro_artifacts\\\\histogram_1.png']\n", + "timestamp: 2023-10-18T10:45:22.800147\n", "comments: []\n", "exception: \n", "\n", @@ -2956,7 +3349,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 30, "id": "b1f77749", "metadata": {}, "outputs": [ @@ -2984,7 +3377,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "id": "45ec04ef", "metadata": {}, "outputs": [ @@ -3010,7 +3403,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 32, "id": "0c826271", "metadata": {}, "outputs": [ @@ -3038,7 +3431,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 33, "id": "2816eac7", "metadata": {}, "outputs": [ @@ -3046,7 +3439,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:acro:records:add_custom(): output_14\n" + "INFO:acro:records:add_custom(): output_17\n" ] } ], @@ -3066,7 +3459,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "id": "f38b4334", "metadata": {}, "outputs": [ @@ -3098,7 +3491,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "id": "9e554eea", "metadata": {}, "outputs": [ @@ -3111,6 +3504,52 @@ "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", + "sdc: {'summary': 
{'suppressed': False, 'negative': 0, 'missing': 11, 'threshold': 7, 'p-ratio': 2, 'nk-rule': 1}, 'cells': {'negative': [], 'missing': [[0, 0], [0, 5], [1, 0], [1, 1], [1, 5], [2, 0], [2, 1], [2, 2], [2, 4], [2, 5], [3, 0]], 'threshold': [[1, 0], [3, 0], [3, 1], [3, 2], [3, 3], [3, 4], [3, 5]], 'p-ratio': [[1, 0], [3, 0]], 'nk-rule': [[3, 0]]}}\n", + "command: table = acro.pivot_table(\n", + "summary: review; missing values found\n", + "outcome: inc_grants \n", + "year 2010 2011 2012 2013 2014 2015 All\n", + "grant_type \n", + "G missing missing \n", + "N missing missing missing \n", + "R missing missing missing missing missing \n", + "R/G missing \n", + "All \n", + "output: [ inc_grants \\\n", + "year 2010 2011 2012 2013 2014 \n", + "grant_type \n", + "G 138906688.0 127533696.0 171878704.0 203357200.0 206222208.0 \n", + "N 0.0 7192804.0 7779685.0 8728330.0 7858697.0 \n", + "R 504137056.0 532464704.0 480105472.0 511361408.0 554594176.0 \n", + "R/G 46544000.0 128380000.0 134480000.0 134125000.0 142766000.0 \n", + "All 689587776.0 795571264.0 794243904.0 857571968.0 911441088.0 \n", + "\n", + " \n", + "year 2015 All \n", + "grant_type \n", + "G 133601200.0 9.814997e+08 \n", + "N 8501187.0 4.006070e+07 \n", + "R 551457280.0 3.134120e+09 \n", + "R/G 146228992.0 7.325240e+08 \n", + "All 839788672.0 4.888204e+09 ]\n", + "timestamp: 2023-10-18T10:45:21.498602\n", + "comments: []\n", + "exception: \n", + "\n", + "The status of the record above is: review.\n", + "Please explain why an exception should be granted.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:acro:records:\n", + "uid: output_8\n", + "status: review\n", + "type: table\n", + "properties: {'method': 'pivot_table'}\n", "sdc: {'summary': {'suppressed': False, 'negative': 0, 'missing': 8, 'threshold': 0, 'p-ratio': 0, 'nk-rule': 0}, 'cells': {'negative': [], 'missing': [[0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1]], 'threshold': [], 'p-ratio': [], 
'nk-rule': []}}\n", "command: table = acro.pivot_table(\n", "summary: review; missing values found\n", @@ -3128,7 +3567,7 @@ "N 1.344319e+05 1.988737e+05\n", "R 8.098502e+06 3.204495e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.203761\n", + "timestamp: 2023-10-18T10:45:21.682294\n", "comments: []\n", "exception: \n", "\n", @@ -3136,7 +3575,7 @@ "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", - "uid: output_8\n", + "uid: output_9\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -3155,9 +3594,9 @@ "grant_type \n", "G 1.141279e+07 2.283220e+07\n", "N 1.364700e+05 1.999335e+05\n", - "R 8.006360e+06 3.228216e+07\n", + "R 8.006361e+06 3.228216e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.264100\n", + "timestamp: 2023-10-18T10:45:21.802799\n", "comments: []\n", "exception: \n", "\n", @@ -3165,7 +3604,7 @@ "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", - "uid: output_9\n", + "uid: output_10\n", "status: review\n", "type: table\n", "properties: {'method': 'pivot_table'}\n", @@ -3184,9 +3623,9 @@ "grant_type \n", "G 1.141279e+07 2.283220e+07\n", "N 1.341800e+05 1.990196e+05\n", - "R 7.882231e+06 3.204558e+07\n", + "R 7.882230e+06 3.204558e+07\n", "R/G 1.664827e+07 1.583532e+07]\n", - "timestamp: 2023-10-05T18:04:09.342995\n", + "timestamp: 2023-10-18T10:45:21.951148\n", "comments: []\n", "exception: \n", "\n", @@ -3194,6 +3633,44 @@ "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", + "uid: output_15\n", + "status: fail\n", + "type: histogram\n", + "properties: {'method': 'histogram'}\n", + "sdc: {}\n", + "command: hist = acro.hist(\n", + "summary: Please check the minimum and the maximum values. The minimum value of the inc_grants column is: -10.0. 
The maximum value of the inc_grants column is: 249327008.0\n", + "outcome: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "output: ['acro_artifacts\\\\histogram_0.png']\n", + "timestamp: 2023-10-18T10:45:22.607418\n", + "comments: []\n", + "exception: \n", + "\n", + "The status of the record above is: fail.\n", + "Please explain why an exception should be granted.\n", + "\n", + "INFO:acro:records:\n", + "uid: output_16\n", + "status: fail\n", + "type: histogram\n", + "properties: {'method': 'histogram'}\n", + "sdc: {}\n", + "command: hist = acro.hist(\n", + "summary: Please check the minimum and the maximum values. The minimum value of the inc_grants column is: -10.0. The maximum value of the inc_grants column is: 249327008.0\n", + "outcome: Empty DataFrame\n", + "Columns: []\n", + "Index: []\n", + "output: ['acro_artifacts\\\\histogram_1.png']\n", + "timestamp: 2023-10-18T10:45:22.800147\n", + "comments: []\n", + "exception: \n", + "\n", + "The status of the record above is: fail.\n", + "Please explain why an exception should be granted.\n", + "\n", + "INFO:acro:records:\n", "uid: pivot_table\n", "status: fail\n", "type: table\n", @@ -3243,7 +3720,7 @@ "2014 24 8 149 \n", "2015 23 8 129 \n", "All 139 44 815 ]\n", - "timestamp: 2023-10-05T18:03:56.973956\n", + "timestamp: 2023-10-18T10:45:20.500344\n", "comments: [\"Empty columns: ('N', 'Dead in 2015'), ('R/G', 'Dead in 2015') were deleted.\"]\n", "exception: \n", "\n", @@ -3251,7 +3728,7 @@ "Please explain why an exception should be granted.\n", "\n", "INFO:acro:records:\n", - "uid: output_14\n", + "uid: output_17\n", "status: review\n", "type: custom\n", "properties: {}\n", @@ -3262,7 +3739,7 @@ "Columns: []\n", "Index: []\n", "output: ['XandY.jpeg']\n", - "timestamp: 2023-10-05T18:04:09.660560\n", + "timestamp: 2023-10-18T10:45:23.039087\n", "comments: ['This output is an image showing the relationship between X and Y']\n", "exception: \n", "\n", @@ -3290,7 +3767,7 @@ }, { "cell_type": "code", - 
"execution_count": 32, + "execution_count": 36, "id": "f78b5a08", "metadata": {}, "outputs": [ @@ -3300,17 +3777,20 @@ "text": [ "XandY.jpeg\n", "config.json\n", + "histogram_0.png\n", + "histogram_1.png\n", "output_0_0.csv\n", "output_10_0.csv\n", - "output_10_1.csv\n", - "output_10_2.csv\n", "output_11_0.csv\n", "output_11_1.csv\n", "output_11_2.csv\n", "output_12_0.csv\n", "output_12_1.csv\n", + "output_12_2.csv\n", "output_13_0.csv\n", "output_13_1.csv\n", + "output_14_0.csv\n", + "output_14_1.csv\n", "output_3_0.csv\n", "output_5_0.csv\n", "output_6_0.csv\n", @@ -3342,7 +3822,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "id": "df2a02e0", "metadata": {}, "outputs": [ @@ -3351,17 +3831,21 @@ "output_type": "stream", "text": [ "XandY.jpeg.txt\n", + "config.json.txt\n", + "histogram_0.png.txt\n", + "histogram_1.png.txt\n", "output_0_0.csv.txt\n", "output_10_0.csv.txt\n", - "output_10_1.csv.txt\n", - "output_10_2.csv.txt\n", "output_11_0.csv.txt\n", "output_11_1.csv.txt\n", "output_11_2.csv.txt\n", "output_12_0.csv.txt\n", "output_12_1.csv.txt\n", + "output_12_2.csv.txt\n", "output_13_0.csv.txt\n", "output_13_1.csv.txt\n", + "output_14_0.csv.txt\n", + "output_14_1.csv.txt\n", "output_3_0.csv.txt\n", "output_5_0.csv.txt\n", "output_6_0.csv.txt\n", @@ -3385,7 +3869,6 @@ ] }, { - "cell_type": "code", "execution_count": null, "id": "f241054a-c91e-4a91-bdc0-0395bbe084dd", diff --git a/test/test_initial.py b/test/test_initial.py index ef46863..12d2854 100644 --- a/test/test_initial.py +++ b/test/test_initial.py @@ -868,3 +868,31 @@ def test_crosstab_with_manual_totals_with_suppression_with_two_aggfunc( "We can not calculate the margins with a list of aggregation functions. 
" "Please create a table for each aggregation function" in caplog.text ) + + +def test_histogram_discolsive(data, acro, caplog): + """Test a discolsive histogram.""" + filename = os.path.normpath("acro_artifacts/histogram_0.png") + _ = acro.hist(data, "inc_grants") + assert os.path.exists(filename) + acro.add_exception("output_0", "Let me have it") + results: Records = acro.finalise(path=PATH) + output_0 = results.get_index(0) + assert output_0.output == [filename] + assert ( + "Histogram will not be shown as the inc_grants column is disclosive." + in caplog.text + ) + assert output_0.status == "fail" + + +def test_histogram_non_disclosive(data, acro): + """Test a non discolsive histogram.""" + filename = os.path.normpath("acro_artifacts/histogram_0.png") + _ = acro.hist(data, "inc_grants", bins=1) + assert os.path.exists(filename) + acro.add_exception("output_0", "Let me have it") + results: Records = acro.finalise(path=PATH) + output_0 = results.get_index(0) + assert output_0.output == [filename] + assert output_0.status == "review"