Skip to content

Commit

Permalink
histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
mahaalbashir committed Oct 18, 2023
1 parent 52d471f commit 8288474
Show file tree
Hide file tree
Showing 5 changed files with 892 additions and 204 deletions.
182 changes: 182 additions & 0 deletions acro/acro_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,188 @@ def survival_plot( # pylint: disable=too-many-arguments,too-many-locals
)
return plot

def hist(
    self,
    data,
    column,
    by=None,
    grid=True,
    xlabelsize=None,
    xrot=None,
    ylabelsize=None,
    yrot=None,
    ax=None,
    sharex=False,
    sharey=False,
    figsize=None,
    layout=None,
    bins=10,
    backend=None,
    legend=False,
    filename="histogram.png",
    **kwargs,
):
    """Create a histogram of one column and record it as an ACRO output.

    The column is binned with ``np.histogram`` over its own
    ``[min, max]`` range. If any bin count is below ``THRESHOLD`` the
    output status is ``"fail"``, otherwise it is ``"review"``. The plot is
    drawn with ``data.hist`` unless the status is ``"fail"`` and
    ``self.suppress`` is set. The current pyplot figure is then saved under
    ``acro_artifacts/`` using a numbered filename that does not already
    exist, and a record is added to ``self.results``.

    Parameters
    ----------
    data : DataFrame
        The pandas object holding the data.
    column : str
        Name of the single column of `data` to plot. A list of columns
        is not supported.
    by : object, optional
        If passed, then used to form histograms for separate groups.
    grid : bool, default True
        Whether to show axis grid lines.
    xlabelsize : int, default None
        If specified changes the x-axis label size.
    xrot : float, default None
        Rotation of x axis labels. For example, a value of 90 displays
        the x labels rotated 90 degrees clockwise.
    ylabelsize : int, default None
        If specified changes the y-axis label size.
    yrot : float, default None
        Rotation of y axis labels. For example, a value of 90 displays
        the y labels rotated 90 degrees clockwise.
    ax : Matplotlib axes object, default None
        The axes to plot the histogram on.
    sharex : bool, default False
        In case subplots=True, share x axis and set some x axis labels to
        invisible. Note that passing in both an ax and sharex=True will
        alter all x axis labels for all subplots in a figure.
    sharey : bool, default False
        In case subplots=True, share y axis and set some y axis labels to
        invisible.
    figsize : tuple, optional
        The size in inches of the figure to create.
        Uses the value in matplotlib.rcParams by default.
    layout : tuple, optional
        Tuple of (rows, columns) for the layout of the histograms.
    bins : int or sequence, default 10
        Number of histogram bins to be used. If an integer is given,
        bins + 1 bin edges are calculated. If bins is a sequence, gives
        bin edges, including left edge of first bin and right edge of
        last bin.
    backend : str, default None
        Backend to use instead of the backend specified in the option
        plotting.backend. For instance, 'matplotlib'. Alternatively, to
        specify the plotting.backend for the whole session, set
        pd.options.plotting.backend.
    legend : bool, default False
        Whether to show the legend.
    filename : str, default "histogram.png"
        The name of the file where the plot will be saved. It must
        include a file extension.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``data.hist``.

    Returns
    -------
    None
        The saved file is reported through the record added to
        ``self.results``. Nothing is saved or recorded when the input is
        rejected (list input, or `filename` without an extension).
    """
    logger.debug("hist()")
    command: str = utils.get_command("hist()", stack())

    # Only one column can be checked at a time: selecting with a list of
    # columns yields a DataFrame, whose min/max are Series and cannot be
    # used as the histogram range below.
    if isinstance(data, list) or isinstance(column, list):  # pragma: no cover
        logger.info(
            "Calculating histogram for more than one columns is "
            "not currently supported. Please do each column separately."
        )
        return

    # computed once: used for the binning range and for the summary text
    min_value = data[column].min()
    max_value = data[column].max()

    freq, _ = np.histogram(  # pylint: disable=too-many-function-args
        data[column], bins, range=(min_value, max_value)
    )

    # threshold check: any bin with fewer than THRESHOLD observations
    # makes the column disclosive
    disclosive = bool(np.any(freq < THRESHOLD))
    status = "fail" if disclosive else "review"

    # plot the histogram unless it is disclosive and suppression is on
    if disclosive and self.suppress:
        logger.warning(
            "Histogram will not be shown as the %s column is disclosive.", column
        )
    else:
        data.hist(
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
    logger.info("status: %s", status)

    # create the summary
    summary = (
        f"Please check the minimum and the maximum values. "
        f"The minimum value of the {column} column is: {min_value}. "
        f"The maximum value of the {column} column is: {max_value}"
    )

    # create the acro_artifacts directory to save the plot in it
    try:
        os.makedirs("acro_artifacts")
        logger.debug("Directory acro_artifacts created successfully")
    except FileExistsError:  # pragma: no cover
        logger.debug("Directory acro_artifacts already exists")

    # create a unique filename with number to avoid overwrite
    stem, extension = os.path.splitext(filename)
    if not extension:  # pragma: no cover
        logger.info("Please provide a valid file extension")
        return
    increment_number = 0
    while os.path.exists(f"acro_artifacts/{stem}_{increment_number}{extension}"):
        increment_number += 1
    unique_filename = f"acro_artifacts/{stem}_{increment_number}{extension}"

    # save the plot to the acro artifacts directory
    # NOTE(review): when the plot was suppressed above nothing new was
    # drawn, so this saves whatever pyplot's current figure is at this
    # point - confirm that this is the intended artefact for a "fail".
    plt.savefig(unique_filename)

    # record output
    self.results.add(
        status=status,
        output_type="histogram",
        properties={"method": "histogram"},
        sdc={},
        command=command,
        summary=summary,
        outcome=pd.DataFrame(),
        output=[os.path.normpath(unique_filename)],
    )


def create_crosstab_masks( # pylint: disable=too-many-arguments,too-many-locals
index,
Expand Down
25 changes: 7 additions & 18 deletions acro/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,11 @@ def serialize_output(self, path: str = "outputs") -> list[str]:
if os.path.exists(filename):
shutil.copy(filename, path)
output.append(Path(filename).name)
if self.output_type == "survival plot":
if self.output_type == "survival plot" or self.output_type == "histogram":
for filename in self.output:
output.append(Path(filename).name)
if os.path.exists(filename):
output.append(Path(filename).name)
shutil.copy(filename, path)
return output

def __str__(self) -> str:
Expand Down Expand Up @@ -446,9 +448,10 @@ def finalise(self, path: str, ext: str) -> None:
self.finalise_excel(path)
else:
raise ValueError("Invalid file extension. Options: {json, xlsx}")
if os.path.exists("acro_artifacts"):
add_acro_artifacts(path)
self.write_checksums(path)
# check if the directory acro_artifacts exists and delete it
if os.path.exists("acro_artifacts"):
shutil.rmtree("acro_artifacts")
logger.info("outputs written to: %s", path)

def finalise_json(self, path: str) -> None:
Expand Down Expand Up @@ -565,20 +568,6 @@ def write_checksums(self, path: str) -> None:
logger.debug("There is no file to do the checksums") # pragma: no cover


def add_acro_artifacts(path: str) -> None:
    """Move the contents of ``acro_artifacts`` into the output folder.

    Every entry listed in the ``acro_artifacts`` directory is copied into
    `path`; afterwards the ``acro_artifacts`` directory is removed.

    Parameters
    ----------
    path : str
        Name of the folder that files are to be written.
    """
    artifacts_dir = "acro_artifacts"
    entries = os.listdir(artifacts_dir)
    for entry in entries:
        source = f"{artifacts_dir}/{entry}"
        shutil.copy(source, path)
    # the staging directory is no longer needed once everything is copied
    shutil.rmtree(artifacts_dir)


def load_records(path: str) -> Records:
"""Loads outputs from a JSON file.
Expand Down
26 changes: 16 additions & 10 deletions notebooks/acro_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
from scipy.io.arff import loadarff

from acro import ACRO, add_constant, add_to_acro
from acro import ACRO

# Instantiate ACRO by making an acro object
print(
Expand All @@ -21,8 +21,10 @@

# Load test data
# The dataset used in this notebook is the nursery dataset from OpenML.
# - In this version, the data can be read directly from the local machine after it has been downloaded.
# - The code below reads the data from a folder called "data" which we assume is at the same level as the folder where you are working.
# - In this version, the data can be read directly from the local machine after
# it has been downloaded.
# - The code below reads the data from a folder called "data" which we assume is at
# the same level as the folder where you are working.
# - The path might need to be changed if the data has been downloaded and stored elsewhere.
# - for example use:
# path = os.path.join("data", "nursery.arff")
Expand All @@ -39,7 +41,8 @@
# Examples of producing tabular output
# We rely on the industry-standard package **pandas** for tabulating data.
# In the next few examples we show:
# - first, how a researcher would normally make a call in pandas, saving the results in a variable that they can view on screen (or save to file?)
# - first, how a researcher would normally make a call in pandas, saving the results
# in a variable that they can view on screen (or save to file?)
# - then how the call is identical in SACRO, except that:
# - "pd" is replaced by "acro"
# - the researcher immediately sees a copy of what the TRE output checker will see.
Expand Down Expand Up @@ -69,9 +72,11 @@
print(safe_table)

# ACRO crosstab with suppression
# - This is an example of crosstab with suppressing the cells that violate the disclosure tests.
# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command.
# - If you wish to continue the research while suppressing the outputs, leave the suppress variable as it is, otherwise turn it off.
# - This is an example of crosstab with suppressing the cells that violate
# the disclosure tests.
# - Note that you need to change the value of the suppress variable in the acro
# object to True. Then run the crosstab command.
# - If you wish to continue the research while suppressing the outputs,
# leave the suppress variable as it is, otherwise turn it off.

print("\nTurn on the suppression variable")
acro.suppress = True
Expand All @@ -96,7 +101,7 @@

# 2: Remove some ACRO outputs before finalising
# This is an example of deleting some of the ACRO outputs.
# The name of the output that needs to be removed should be passed to the function remove_output.
# The name of the output that needs to be removed should be passed to the
# function remove_output.
# - The output name can be taken from the outputs listed by the print_outputs function,
# - or by listing the results and choosing the specific output that needs to be removed

Expand All @@ -111,13 +116,14 @@

# 4: Add a comment to output
# This is an example to add a comment to outputs.
# It can be used to provide a description or to pass additional information to the output checkers.
# It can be used to provide a description or to pass information to the output checkers.

print("\nUsers can add comments which the output checkers will see.")
acro.add_comments("cross_tabulation", "Please let me have this data.")

# 5: (the big one) Finalise ACRO
# This is an example of the function _finalise()_ which the users must call at the end of each session.
# This is an example of the function _finalise()_
# which the users must call at the end of each session.
# - It takes each output and saves it to a CSV file.
# - It also saves the SDC analysis for each output to a json file or Excel file
# (depending on the extension of the name of the file provided as an input to the function)
Expand Down
Loading

0 comments on commit 8288474

Please sign in to comment.