From 90dab1544dacb9972df5d3aa184f7d84fad2452c Mon Sep 17 00:00:00 2001
From: Paul <paul.fleming@nrel.gov>
Date: Tue, 23 Jul 2024 22:28:45 -0700
Subject: [PATCH] docstrings for filtering

---
 flasc/data_processing/filtering.py | 241 +++++++++++++++++++----------
 1 file changed, 161 insertions(+), 80 deletions(-)

diff --git a/flasc/data_processing/filtering.py b/flasc/data_processing/filtering.py
index 75c02d79..db34022a 100644
--- a/flasc/data_processing/filtering.py
+++ b/flasc/data_processing/filtering.py
@@ -1,3 +1,6 @@
+"""Implement filtering class and functions for FLASC data."""
+
+
 import itertools
 
 import matplotlib.pyplot as plt
@@ -18,16 +21,47 @@
 
 
 def df_get_no_faulty_measurements(df, turbine):
+    """Get the number of faulty measurements for a specific turbine.
+
+    Args:
+        df (pd.DataFrame): Dataframe containing the turbine data,
+            formatted in the generic SCADA data format. Namely, the
+            dataframe should at the very least contain the columns:
+              * Time of each measurement: time
+              * Wind speed of each turbine: ws_000, ws_001, ...
+              * Power production of each turbine: pow_000, pow_001, ...
+        turbine (int): The turbine identifier for which the number of
+            faulty measurements should be counted.
+
+    Returns:
+        N_isnan (int): Number of faulty measurements for the turbine.
+    """
     if isinstance(turbine, str):
         turbine = int(turbine)
     entryisnan = np.isnan(df["pow_%03d" % turbine].astype(float))
-    # cols = [s for s in df.columns if s[-4::] == ('_%03d' % turbine)]
-    # entryisnan = (np.sum(np.isnan(df[cols]),axis=1) > 0)
     N_isnan = np.sum(entryisnan)
     return N_isnan
 
 
 def df_mark_turbdata_as_faulty(df, cond, turbine_list, exclude_columns=[]):
+    """Mark turbine data as faulty based on a condition.
+
+    Args:
+        df (pd.DataFrame): Dataframe containing the turbine data,
+            formatted in the generic SCADA data format.
+        cond (iterable): List or array-like variable with bool entries
+            depicting whether the condition is met or not. These should be
+            situations in which you classify the data as faulty. For example,
+            high wind speeds but low power productions, or NaNs, self-flagged
+            status variables.
+        turbine_list (int, list): Turbine identifier(s) for which the data
+            should be flagged as faulty when the condition is met.
+        exclude_columns (list, optional): List of columns that should not
+            be considered for the filtering. Defaults to [].
+
+    Returns:
+        pd.DataFrame: Dataframe with the faulty measurements marked as None.
+    """
     if isinstance(turbine_list, (np.integer, int)):
         turbine_list = [turbine_list]
 
@@ -39,7 +73,9 @@ def df_mark_turbdata_as_faulty(df, cond, turbine_list, exclude_columns=[]):
 
 
 class FlascFilter:
-    """This class allows a user to filter turbine data based on the
+    """Implement filtering class for SCADA data.
+
+    This class allows a user to filter turbine data based on the
     wind-speed power curve. This class includes several useful filtering
     methods:
         1. Filtering based on prespecified boxes/windows. Any data outside
@@ -56,14 +92,14 @@ def __init__(self, df, turbine_names=None):
         """Initializes the class.
 
         Args:
-            df ([pd.DataFrame]): Dataframe containing the turbine data,
+            df (pd.DataFrame): Dataframe containing the turbine data,
                 formatted in the generic SCADA data format. Namely, the
                 dataframe should at the very least contain the columns:
                   * Time of each measurement: time
                   * Wind speed of each turbine: ws_000, ws_001, ...
                   * Power production of each turbine: pow_000, pow_001, ...
+            turbine_names (list, optional): List of turbine names. Defaults to None.
         """
-
         # Write dataframe to self
         self._df_initial = df.copy()
         self.reset_filters()
@@ -73,7 +109,9 @@ def __init__(self, df, turbine_names=None):
 
     # Private methods
     def _get_all_unique_flags(self):
-        """Private function that grabs all the unique filter flags
+        """Returns all unique flags in the dataframe.
+
+        Private function that grabs all the unique filter flags
         that are available in self.df_filters and returns them
         as a list of strings. This is helpful when plotting the
         various filter sources in a scatter plot, for example.
@@ -108,13 +146,15 @@ def _reset_mean_power_curves(self, ws_bins=np.arange(0.0, 25.5, 0.5)):
         self.pw_curve_df = pw_curve_df
 
     def _get_mean_power_curves(self, df=None, turbine_subset=None):
-        """Calculates the mean power production in bins of the wind speed,
+        """Calculates the mean power production in bins of the wind speed.
+
+        Calculates the mean power production in bins of the wind speed,
         for all turbines in the wind farm.
 
         Args:
             ws_bins ([iteratible], optional): Wind speed bins. Defaults to
                 np.arange(0.0, 25.5, 0.5).
-            df ([pd.DataFrame]): Dataframe containing the turbine data,
+            df (pd.DataFrame): Dataframe containing the turbine data,
                 formatted in the generic SCADA data format. Namely, the
                 dataframe should at the very least contain the columns:
                   * Time of each measurement: time
@@ -123,12 +163,12 @@ def _get_mean_power_curves(self, df=None, turbine_subset=None):
             turbine_subset (list, optional): List of turbine indices to
                 calculate the mean power curve for. If None is specified,
                 defaults to calculating it for all turbines.
+
         Returns:
-            pw_curve_df ([pd.DataFrame]): Dataframe containing the wind
+            pd.DataFrame: Dataframe containing the wind
                 speed bins and the mean power production value for every
                 turbine.
         """
-
         # If df unspecified, use the locally filtered variable
         if df is None:
             df = self.df
@@ -173,7 +213,6 @@ def _get_mean_power_curves(self, df=None, turbine_subset=None):
     # Public methods
     def reset_filters(self):
         """Reset all filter variables and assume all data is clean."""
-
         # Copy the original, unfiltered dataframe from self
         df = self._df_initial
         self.df = df.reset_index(drop=("time" in df.columns))
@@ -200,7 +239,9 @@ def filter_by_condition(
         verbose: bool = True,
         apply_filters_to_df: bool = True,
     ):
-        """This is a generic method to filter the dataframe for any particular
+        """Filter the dataframe for a specific condition, for a specific turbine.
+
+        This is a generic method to filter the dataframe for any particular
         condition, for a specific turbine or specific set of turbines. This
         provides a platform for user-specific queries to filter and then inspect
         the data with. You can call this function multiple times and the filters
@@ -230,7 +271,7 @@ def filter_by_condition(
                 high wind speeds but low power productions, or NaNs, self-flagged
                 status variables.
             label (str): Name or description of the fault/condition that is flagged.
-            ti (int): Turbine indentifier, typically an integer, but may also be a
+            ti (int): Turbine identifier, typically an integer, but may also be a
                 list. This flags the measurements of all these turbines as faulty
                 for which condition==True.
             verbose (bool, optional): Print information to console. Defaults to True.
@@ -238,11 +279,10 @@ def filter_by_condition(
                 self.df directly as NaN. Defaults to True.
 
         Returns:
-            df_out: The filtered dataframe. All measurements that are flagged as faulty
+            pd.DataFrame: The filtered dataframe. All measurements that are flagged as faulty
                 are overwritten by "None"/"NaN". If apply_filters_to_df==True, then this
                 dataframe is equal to the internally filtered dataframe 'self.df'.
         """
-
         # Pour it into a list format
         if isinstance(ti, int):
             ti = [ti]
@@ -294,7 +334,9 @@ def filter_by_sensor_stuck_faults(
         plot: bool = False,
         verbose: bool = True,
     ):
-        """Filter the turbine measurements for sensor-stuck type of faults. This is
+        """Filter the turbine measurements for sensor-stuck type of faults.
+
+        This is
         the situation where a turbine measurement reads the exact same value for
         multiple consecutive timestamps. This typically indicates a "frozen" sensor
         rather than a true physical effect. This is particularly the case for
@@ -332,12 +374,11 @@ def filter_by_sensor_stuck_faults(
             verbose (bool, optional): Print information to console. Defaults to True.
 
         Returns:
-            self.df: Pandas DataFrame with the filtered data, in which faulty turbine
+            pd.DataFrame: Pandas DataFrame with the filtered data, in which faulty turbine
                 measurements are flagged as None/NaN. This is an aggregated filtering
                 variable, so it includes faulty-flagged measurements from filter
                 operations in previous steps.
         """
-
         # Filter sensor faults using the separate function call
         stuck_indices = find_sensor_stuck_faults(
             df=self.df,
@@ -375,37 +416,44 @@ def filter_by_power_curve(
         no_iterations=10,
         cutoff_ws=20.0,
     ):
-        """Filter the data by offset from the mean power curve in x-
-        directions. This is an iterative process because the estimated mean
+        """Filter the data by offset from the mean power curve in x-directions.
+
+        This is an iterative process because the estimated mean
         curve actually changes as data is filtered. This process typically
         converges within a couple iterations.
 
         Args:
             ti (int): The turbine identifier for which the data should be
-            filtered.
+                filtered.
             m_ws_lb (float, optional): Multiplier on the wind speed defining
-            the left bound for the power curve. Any data to the left of this
-            curve is considered faulty. Defaults to 0.95.
+                the left bound for the power curve. Any data to the left of this
+                curve is considered faulty. Defaults to 0.95.
             m_pow_lb (float, optional): Multiplier on the power defining
-            the left bound for the power curve. Any data to the left of this
-            curve is considered faulty. Defaults to 1.01.
+                the left bound for the power curve. Any data to the left of this
+                curve is considered faulty. Defaults to 1.01.
             m_ws_rb (float, optional): Multiplier on the wind speed defining
-            the right bound for the power curve. Any data to the right of this
-            curve is considered faulty. Defaults to 1.05.
+                the right bound for the power curve. Any data to the right of this
+                curve is considered faulty. Defaults to 1.05.
             m_pow_rb (float, optional): Multiplier on the power defining
-            the right bound for the power curve. Any data to the right of this
-            curve is considered faulty. Defaults to 0.99.
+                the right bound for the power curve. Any data to the right of this
+                curve is considered faulty. Defaults to 0.99.
+            ws_deadband (float, optional): Deadband in [m/s] around the median
+                power curve around which data is by default classified as valid.
+                Defaults to 0.50.
+            pow_deadband (float, optional): Deadband in [kW] around the median
+                power curve around which data is by default classified as valid.
+                Defaults to 20.0.
             no_iterations (int, optional): Number of iterations. The
-            solution typically converges in 2-3 steps, but as the process is
-            very fast, it's better to run a higher number of iterations.
-            Defaults to 10.
+                solution typically converges in 2-3 steps, but as the process is
+                very fast, it's better to run a higher number of iterations.
+                Defaults to 10.
             cutoff_ws (float, optional): Upper limit for the filtering to occur.
-            Typically, this is a value just below the cut-out wind speed. Namely,
-            issues arise if you put this wind speed above the cut-out wind speed,
-            because we effectively end up with two curves for the same power
-            production (one at region 2, one going down from cut-out wind speed).
-            This confuses the algorithm. Hence, suggested to put this somewhere
-            around 15-25 m/s. Defaults to 20 m/s.
+                Typically, this is a value just below the cut-out wind speed. Namely,
+                issues arise if you put this wind speed above the cut-out wind speed,
+                because we effectively end up with two curves for the same power
+                production (one at region 2, one going down from cut-out wind speed).
+                This confuses the algorithm. Hence, suggested to put this somewhere
+                around 15-25 m/s. Defaults to 20 m/s.
         """
         # Initialize the dataframe from self, as a starting point. Note
         # that in each iteration, we do not want to build upon the
@@ -545,36 +593,43 @@ def filter_by_floris_power_curve(
         pow_deadband=20.0,
         cutoff_ws=20.0,
     ):
-        """Filter the data by offset from the floris power curve in x-
-        directions.
+        """Filter the data by offset from the floris power curve.
 
         Args:
             fm (FlorisModel): The FlorisModel object for the farm
+            ti (int): The turbine identifier for which the data should be
+                filtered.
             m_ws_lb (float, optional): Multiplier on the wind speed defining
-            the left bound for the power curve. Any data to the left of this
-            curve is considered faulty. Defaults to 0.95.
+                the left bound for the power curve. Any data to the left of this
+                curve is considered faulty. Defaults to 0.95.
             m_pow_lb (float, optional): Multiplier on the power defining
-            the left bound for the power curve. Any data to the left of this
-            curve is considered faulty. Defaults to 1.01.
+                the left bound for the power curve. Any data to the left of this
+                curve is considered faulty. Defaults to 1.01.
             m_ws_rb (float, optional): Multiplier on the wind speed defining
-            the right bound for the power curve. Any data to the right of this
-            curve is considered faulty. Defaults to 1.05.
+                the right bound for the power curve. Any data to the right of this
+                curve is considered faulty. Defaults to 1.05.
             m_pow_rb (float, optional): Multiplier on the power defining
-            the right bound for the power curve. Any data to the right of this
-            curve is considered faulty. Defaults to 0.99.
+                the right bound for the power curve. Any data to the right of this
+                curve is considered faulty. Defaults to 0.99.
             ws_deadband (float, optional): Deadband in [m/s] around the median
-            power curve around which data is by default classified as valid.
-            Defaults to 0.50.
+                power curve around which data is by default classified as valid.
+                Defaults to 0.50.
             pow_deadband (float, optional): Deadband in [kW] around the median
-            power curve around which data is by default classified as valid.
-            Defaults to 20.0.
+                power curve around which data is by default classified as valid.
+                Defaults to 20.0.
             cutoff_ws (float, optional): Wind speed up to which the median
-            power curve is calculated and the data is filtered for. You should
-            make sure this variable is set to a value above the rated wind
-            speed and below the cut-out wind speed. If you are experiencing
-            problems with data filtering and your data points have a downward
-            trend near the high wind speeds, try decreasing this variable's
-            value to 15.0.
+                power curve is calculated and the data is filtered for. You should
+                make sure this variable is set to a value above the rated wind
+                speed and below the cut-out wind speed. If you are experiencing
+                problems with data filtering and your data points have a downward
+                trend near the high wind speeds, try decreasing this variable's
+                value to 15.0.
+
+        Returns:
+            pd.DataFrame: Pandas DataFrame with the filtered data, in which faulty turbine
+                measurements are flagged as None/NaN. This is an aggregated filtering
+                variable, so it includes faulty-flagged measurements from filter
+                operations in previous steps.
         """
         logger.info("Filtering data by deviations from the floris power curve...")
 
@@ -715,8 +770,9 @@ def get_power_curve(self, calculate_missing=True):
             calculate_missing (bool, optional): Calculate the median power
                 curves for the turbines for the turbines of which their
                 power curves were previously not yet calculated.
+
         Returns:
-            pw_curve_df ([pd.DataFrame]): Dataframe containing the wind
+            pw_curve_df (pd.DataFrame): Dataframe containing the wind
                 speed bins and the mean power production value for every
                 turbine.
             calculate_missing (bool, optional): Calculate the median power
@@ -734,15 +790,21 @@ def get_power_curve(self, calculate_missing=True):
         return self.pw_curve_df
 
     def plot_farm_mean_power_curve(self, fm=None):
-        """Plot all turbines' power curves in a single figure. Also estimate
+        """Plot all turbines' power curves in a single figure.
+
+        Also estimate
         and plot a mean turbine power curve.
 
         Args:
             fm (FlorisModel): The FlorisModel object for the farm. If
               specified by the user, then the farm-average turbine power curve
               from FLORIS will be plotted on top of the SCADA-based power curves.
-        """
 
+        Returns:
+            fig, ax: The figure and axis objects of the
+                plot. The user can further manipulate the plot
+                by calling methods on these objects.
+        """
         # Get mean power curves for the turbines that are not yet calculated
         if self.pw_curve_df.isna().all(axis=0).any():
             turbine_subset = np.where(
@@ -785,7 +847,9 @@ def plot_farm_mean_power_curve(self, fm=None):
     def plot_filters_custom_scatter(
         self, ti, x_col, y_col, xlabel="Wind speed (m/s)", ylabel="Power (kW)", ax=None
     ):
-        """Plot the filtered data in a scatter plot, categorized
+        """Plot the filtered data in a scatter plot.
+
+        Plot the filtered data in a scatter plot, categorized
         by the source of their filter/fault. This is a generic
         function that allows the user to plot various numeric
         variables on the x and y axis.
@@ -862,7 +926,9 @@ def plot_filters_custom_scatter_bokeh(
         ylabel="Power (kW)",
         p=None,
     ):
-        """Plot the filtered data in a scatter plot, categorized
+        """Plot the filtered data in a scatter plot.
+
+        Plot the filtered data in a scatter plot, categorized
         by the source of their filter/fault. This is a generic
         function that allows the user to plot various numeric
         variables on the x and y axis.
@@ -951,7 +1017,9 @@ def plot_filters_custom_scatter_bokeh(
         return p
 
     def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None):
-        """Plot the wind speed power curve and connect each faulty datapoint
+        """Plot faulty data in the wind speed power curve.
+
+        Plot the wind speed power curve and connect each faulty datapoint
         to the label it was classified as faulty with.
 
         Args:
@@ -960,8 +1028,10 @@ def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None):
             use this to plot the turbine power curves as implemented in floris.
             Defaults to None.
             ax (plt.Axis): Pyplot Axis object.
-        """
 
+        Returns:
+            ax: The figure axis in which the scatter plot is drawn.
+        """
         if ax is None:
             _, ax = plt.subplots(figsize=(10, 5))
 
@@ -1017,7 +1087,9 @@ def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None):
         return ax
 
     def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None):
-        """Plot the wind speed power curve and mark faulty data according to
+        """Plot the postprocessed data in the wind speed power curve.
+
+        Plot the wind speed power curve and mark faulty data according to
         their filters.
 
         Args:
@@ -1027,8 +1099,10 @@ def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None):
             Defaults to None.
             ax (Matplotlib.pyplot Axis, optional): Axis to plot in. If None is
                specified, creates a new figure and axis. Defaults to None.
-        """
 
+        Returns:
+            ax: The figure axis in which the scatter plot is drawn.
+        """
         if ax is None:
             _, ax = plt.subplots(figsize=(10, 5))
 
@@ -1093,7 +1167,9 @@ def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None):
         return ax
 
     def plot_filters_in_time(self, ti, ax=None):
-        """Generate bar plot where each week of data is gathered and its
+        """Plot the filtered data in time.
+
+        Generate bar plot where each week of data is gathered and its
         filtering results will be shown relative to the data size of each
         week. This plot can particularly be useful to investigate whether
         certain weeks/time periods show a particular high number of faulty
@@ -1135,7 +1211,9 @@ def plot_filters_in_time(self, ti, ax=None):
         return ax
 
     def plot_filters_in_time_bokeh(self, ti, p=None):
-        """Generate bar plot where each week of data is gathered and its
+        """Plot the filtered data in time.
+
+        Generate bar plot where each week of data is gathered and its
         filtering results will be shown relative to the data size of each
         week. This plot can particularly be useful to investigate whether
         certain weeks/time periods show a particular high number of faulty
@@ -1147,8 +1225,10 @@ def plot_filters_in_time_bokeh(self, ti, p=None):
             ti (int): Index of the turbine of interest.
             p (Bokeh Figure, optional): Figure to plot in. If None is
                specified, creates a new figure. Defaults to None.
-        """
 
+        Returns:
+            Bokeh Figure: The figure in which the bar plot is drawn.
+        """
         if p is None:
             p = figure(
                 title="Filters over time",
@@ -1200,19 +1280,21 @@ def plot_filters_in_time_bokeh(self, ti, p=None):
 
 
 def filter_df_by_faulty_impacting_turbines(df, ti, df_impacting_turbines, verbose=True):
-    """Assigns a turbine's measurement to NaN for each timestamp for which any of the turbines
+    """Assign faulty measurements based on upstream turbine faults.
+
+    Assigns a turbine's measurement to NaN for each timestamp for which any of the turbines
       that are shedding a wake on this turbine is reporting NaN measurements.
 
     Args:
         df (pd.DataFrame): Dataframe with SCADA data with measurements
-        formatted according to wd_000, wd_001, wd_002, pow_000, pow_001,
-        pow_002, and so on.
-        ti (integer): Turbine number for which we are filtering the data.
-        Basically, each turbine that impacts that power production of
-        turbine 'ti' by more than 0.1% is required to be reporting a
-        non-faulty measurement. If not, we classify the measurement of
-        turbine 'ti' as faulty because we cannot sufficiently know the
-        inflow conditions of this turbine.
+            formatted according to wd_000, wd_001, wd_002, pow_000, pow_001,
+            pow_002, and so on.
+        ti (int): Turbine number for which we are filtering the data.
+            Basically, each turbine that impacts the power production of
+            turbine 'ti' by more than 0.1% is required to be reporting a
+            non-faulty measurement. If not, we classify the measurement of
+            turbine 'ti' as faulty because we cannot sufficiently know the
+            inflow conditions of this turbine.
         df_impacting_turbines (pd.DataFrame): A Pandas DataFrame in the
         format of:
 
@@ -1238,7 +1320,6 @@ def filter_df_by_faulty_impacting_turbines(df, ti, df_impacting_turbines, verbos
         pd.DataFrame: The postprocessed dataframe for 'df', filtered for
         inter-turbine issues like curtailment and turbine downtime.
     """
-
     # Get number of turbines
     n_turbines = dfm.get_num_turbines(df)