histogram

AI-SDC · Oct 18, 2023 · 8288474 · 8288474
1 parent 52d471f
commit 8288474
Show file tree

Hide file tree

Showing 5 changed files with 892 additions and 204 deletions.
diff --git a/acro/acro_tables.py b/acro/acro_tables.py
@@ -527,6 +527,188 @@ def survival_plot(  # pylint: disable=too-many-arguments,too-many-locals
         )
         return plot
 
+    def hist(
+        self,
+        data,
+        column,
+        by=None,
+        grid=True,
+        xlabelsize=None,
+        xrot=None,
+        ylabelsize=None,
+        yrot=None,
+        ax=None,
+        sharex=False,
+        sharey=False,
+        figsize=None,
+        layout=None,
+        bins=10,
+        backend=None,
+        legend=False,
+        filename="histogram.png",
+        **kwargs,
+    ):
+        """Create a histogram of one column and record it as an output.
+
+        The bin frequencies of ``data[column]`` are computed with
+        ``numpy.histogram`` over the column's [min, max] range. If any bin
+        holds fewer than ``THRESHOLD`` observations the output status is
+        "fail", otherwise it is "review". The plot itself is drawn with
+        ``data.hist`` unless the output is disclosive and ``self.suppress``
+        is set. The current figure is then saved under a unique name in the
+        ``acro_artifacts`` directory and registered in ``self.results``.
+
+        Parameters
+        ----------
+        data : DataFrame
+            The pandas object holding the data.
+        column : str
+            The single column to plot. A list of columns is not supported.
+        by : object, optional
+            If passed, then used to form histograms for separate groups.
+        grid : bool, default True
+            Whether to show axis grid lines.
+        xlabelsize : int, default None
+            If specified changes the x-axis label size.
+        xrot : float, default None
+            Rotation of x axis labels. For example, a value of 90 displays
+            the x labels rotated 90 degrees clockwise.
+        ylabelsize : int, default None
+            If specified changes the y-axis label size.
+        yrot : float, default None
+            Rotation of y axis labels. For example, a value of 90 displays
+            the y labels rotated 90 degrees clockwise.
+        ax : Matplotlib axes object, default None
+            The axes to plot the histogram on.
+        sharex : bool, default False
+            In case subplots=True, share x axis and set some x axis labels to
+            invisible. Note that passing in both an ax and sharex=True will
+            alter all x axis labels for all subplots in a figure.
+        sharey : bool, default False
+            In case subplots=True, share y axis and set some y axis labels to
+            invisible.
+        figsize : tuple, optional
+            The size in inches of the figure to create.
+            Uses the value in matplotlib.rcParams by default.
+        layout : tuple, optional
+            Tuple of (rows, columns) for the layout of the histograms.
+        bins : int or sequence, default 10
+            Number of histogram bins to be used. If an integer is given,
+            bins + 1 bin edges are calculated. If bins is a sequence, gives
+            bin edges, including left edge of first bin and right edge of
+            last bin.
+        backend : str, default None
+            Backend to use instead of the backend specified in the option
+            plotting.backend. For instance, 'matplotlib'. Alternatively, to
+            specify the plotting.backend for the whole session, set
+            pd.options.plotting.backend.
+        legend : bool, default False
+            Whether to show the legend.
+        filename : str, default "histogram.png"
+            The name of the file where the plot will be saved. It must carry
+            a file extension; a numeric suffix is added to avoid overwriting
+            an existing file.
+        **kwargs
+            Any further keyword arguments are forwarded to ``data.hist``.
+
+        Returns
+        -------
+        None
+            The result is recorded in ``self.results``; nothing is returned.
+        """
+        logger.debug("hist()")
+        command: str = utils.get_command("hist()", stack())
+
+        # A list of columns cannot be reduced to one (min, max) range for the
+        # frequency check, so reject it here instead of failing inside numpy.
+        if isinstance(data, list) or isinstance(column, list):  # pragma: no cover
+            logger.info(
+                "Calculating histogram for more than one columns is "
+                "not currently supported. Please do each column separately."
+            )
+            return
+
+        # validate the target filename before drawing anything or touching
+        # the disk, so an unusable request has no side effects
+        stem, extension = os.path.splitext(filename)
+        if not extension:  # pragma: no cover
+            logger.info("Please provide a valid file extension")
+            return
+
+        # the column range is needed for both the bin counts and the summary
+        min_value = data[column].min()
+        max_value = data[column].max()
+        freq, _ = np.histogram(  # pylint: disable=too-many-function-args
+            data[column], bins, range=(min_value, max_value)
+        )
+
+        # threshold check: the column is disclosive if any bin is too small
+        disclosive = bool(np.any(freq < THRESHOLD))
+        status = "fail" if disclosive else "review"
+
+        # plot the histogram unless it is disclosive and suppression is on
+        if disclosive and self.suppress:
+            logger.warning(
+                "Histogram will not be shown as the %s column is disclosive.",
+                column,
+            )
+        else:
+            data.hist(
+                column=column,
+                by=by,
+                grid=grid,
+                xlabelsize=xlabelsize,
+                xrot=xrot,
+                ylabelsize=ylabelsize,
+                yrot=yrot,
+                ax=ax,
+                sharex=sharex,
+                sharey=sharey,
+                figsize=figsize,
+                layout=layout,
+                bins=bins,
+                backend=backend,
+                legend=legend,
+                **kwargs,
+            )
+        logger.info("status: %s", status)
+
+        # create the summary
+        summary = (
+            "Please check the minimum and the maximum values. "
+            f"The minimum value of the {column} column is: {min_value}. "
+            f"The maximum value of the {column} column is: {max_value}"
+        )
+
+        # make sure the acro_artifacts directory exists to save the plot in it
+        os.makedirs("acro_artifacts", exist_ok=True)
+
+        # create a unique filename with number to avoid overwrite
+        increment_number = 0
+        while os.path.exists(f"acro_artifacts/{stem}_{increment_number}{extension}"):
+            increment_number += 1
+        unique_filename = f"acro_artifacts/{stem}_{increment_number}{extension}"
+
+        # save the plot to the acro artifacts directory
+        # NOTE(review): when the plot was suppressed above nothing new has been
+        # drawn, so this saves whatever figure pyplot currently holds - confirm
+        # that this is the intended artefact for a suppressed histogram.
+        plt.savefig(unique_filename)
+
+        # record output
+        self.results.add(
+            status=status,
+            output_type="histogram",
+            properties={"method": "histogram"},
+            sdc={},
+            command=command,
+            summary=summary,
+            outcome=pd.DataFrame(),
+            output=[os.path.normpath(unique_filename)],
+        )
+
 
 def create_crosstab_masks(  # pylint: disable=too-many-arguments,too-many-locals
     index,

diff --git a/acro/record.py b/acro/record.py
@@ -175,9 +175,11 @@ def serialize_output(self, path: str = "outputs") -> list[str]:
                 if os.path.exists(filename):
                     shutil.copy(filename, path)
                     output.append(Path(filename).name)
-        if self.output_type == "survival plot":
+        if self.output_type == "survival plot" or self.output_type == "histogram":
             for filename in self.output:
-                output.append(Path(filename).name)
+                if os.path.exists(filename):
+                    output.append(Path(filename).name)
+                    shutil.copy(filename, path)
         return output
 
     def __str__(self) -> str:
@@ -446,9 +448,10 @@ def finalise(self, path: str, ext: str) -> None:
             self.finalise_excel(path)
         else:
             raise ValueError("Invalid file extension. Options: {json, xlsx}")
-        if os.path.exists("acro_artifacts"):
-            add_acro_artifacts(path)
         self.write_checksums(path)
+        # check if the directory acro_artifacts exists and delete it
+        if os.path.exists("acro_artifacts"):
+            shutil.rmtree("acro_artifacts")
         logger.info("outputs written to: %s", path)
 
     def finalise_json(self, path: str) -> None:
@@ -565,20 +568,6 @@ def write_checksums(self, path: str) -> None:
             logger.debug("There is no file to do the checksums")  # pragma: no cover
 
 
-def add_acro_artifacts(path: str) -> None:
-    """Copy the saved artifacts into the output folder, then clean up.
-
-    Every entry listed in the ``acro_artifacts`` directory is copied into
-    ``path``; afterwards the ``acro_artifacts`` directory is deleted.
-
-    Parameters
-    ----------
-    path : str
-        Name of the folder that files are to be written.
-    """
-    artifacts_dir = "acro_artifacts"
-    for entry in os.listdir(artifacts_dir):
-        source = f"{artifacts_dir}/{entry}"
-        shutil.copy(source, path)
-    shutil.rmtree(artifacts_dir)
-
-
 def load_records(path: str) -> Records:
     """Loads outputs from a JSON file.
 

diff --git a/notebooks/acro_demo.py b/notebooks/acro_demo.py
@@ -9,7 +9,7 @@
 import pandas as pd
 from scipy.io.arff import loadarff
 
-from acro import ACRO, add_constant, add_to_acro
+from acro import ACRO
 
 # Instantiate ACRO by making an acro object
 print(
@@ -21,8 +21,10 @@
 
 # Load test data
 # The dataset used in this notebook is the nursery dataset from OpenML.
-# - In this version, the data can be read directly from the local machine after it has been downloaded.
-# - The code below reads the data from a folder called "data" which we assume is at the same level as the folder where you are working.
+# - In this version, the data can be read directly from the local machine after
+#  it has been downloaded.
+# - The code below reads the data from a folder called "data" which we assume is at
+# the same level as the folder where you are working.
 # - The path might need to be changed if the data has been downloaded and stored elsewhere.
 #  - for example use:
 #     path = os.path.join("data", "nursery.arff")
@@ -39,7 +41,8 @@
 # Examples of producing tabular output
 # We rely on the industry-standard package **pandas** for tabulating data.
 # In the next few examples we show:
-# - first, how a researcher would normally make a call in pandas, saving the results in a variable that they can view on screen (or save to file?)
+# - first, how a researcher would normally make a call in pandas, saving the results
+# in a variable that they can view on screen (or save to file?)
 # - then how the call is identical in SACRO, except that:
 #   - "pd" is replaced by "acro"
 #   - the researcher immediately sees a copy of what the TRE output checker will see.
@@ -69,9 +72,11 @@
 print(safe_table)
 
 # ACRO crosstab with suppression
-# - This is an example of crosstab with suppressing the cells that violate the disclosure tests.
-# - Note that you need to change the value of the suppress variable in the acro object to True. Then run the crosstab command.
-# - If you wish to continue the research while suppressing the outputs, leave the suppress variable as it is, otherwise turn it off.
+# - This is an example of crosstab with suppressing the cells that violate the
+# disclosure tests.
+# - Note that you need to change the value of the suppress variable in the acro
+# object to True. Then run the crosstab command.
+# - If you wish to continue the research while suppressing the outputs,
+# leave the suppress variable as it is, otherwise turn it off.
 
 print("\nTurn on the suppression variable")
 acro.suppress = True
@@ -96,7 +101,7 @@
 
 # 2: Remove some ACRO outputs before finalising
 # This is an example of deleting some of the ACRO outputs.
-# The name of the output that needs to be removed should be passed to the function remove_output.
+# The name of the output that needs to be removed should be passed to the function
+# remove_output.
 # - The output name can be taken from the outputs listed by the print_outputs function,
 # - or by listing the results and choosing the specific output that needs to be removed
 
@@ -111,13 +116,14 @@
 
 # 4: Add a comment to output
 # This is an example to add a comment to outputs.
-# It can be used to provide a description or to pass additional information to the output checkers.
+# It can be used to provide a description or to pass information to the output checkers.
 
 print("\nUsers can add comments which the output checkers will see.")
 acro.add_comments("cross_tabulation", "Please let me have this data.")
 
 # 5: (the big one) Finalise ACRO
-# This is an example of the function _finalise()_ which the users must call at the end of each session.
+# This is an example of the function _finalise()_
+# which the users must call at the end of each session.
 # - It takes each output and saves it to a CSV file.
 # - It also saves the SDC analysis for each output to a json file or Excel file
 #   (depending on the extension of the name of the file provided as an input to the function)