From 90dab1544dacb9972df5d3aa184f7d84fad2452c Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 23 Jul 2024 22:28:45 -0700 Subject: [PATCH] docstrings for filtering --- flasc/data_processing/filtering.py | 241 +++++++++++++++++++---------- 1 file changed, 161 insertions(+), 80 deletions(-) diff --git a/flasc/data_processing/filtering.py b/flasc/data_processing/filtering.py index 75c02d79..db34022a 100644 --- a/flasc/data_processing/filtering.py +++ b/flasc/data_processing/filtering.py @@ -1,3 +1,6 @@ +"""Implement filtering class and functions for FLASC data.""" + + import itertools import matplotlib.pyplot as plt @@ -18,16 +21,47 @@ def df_get_no_faulty_measurements(df, turbine): + """Get the number of faulty measurements for a specific turbine. + + Args: + df (pd.DataFrame): Dataframe containing the turbine data, + formatted in the generic SCADA data format. Namely, the + dataframe should at the very least contain the columns: + * Time of each measurement: time + * Wind speed of each turbine: ws_000, ws_001, ... + * Power production of each turbine: pow_000, pow_001, ... + turbine (int): The turbine identifier for which the number of + faulty measurements should be counted. + + Returns: + N_isnan (int): Number of faulty measurements for the turbine. + """ if isinstance(turbine, str): turbine = int(turbine) entryisnan = np.isnan(df["pow_%03d" % turbine].astype(float)) - # cols = [s for s in df.columns if s[-4::] == ('_%03d' % turbine)] - # entryisnan = (np.sum(np.isnan(df[cols]),axis=1) > 0) N_isnan = np.sum(entryisnan) return N_isnan def df_mark_turbdata_as_faulty(df, cond, turbine_list, exclude_columns=[]): + """Mark turbine data as faulty based on a condition. + + Args: + df (pd.DataFrame): Dataframe containing the turbine data, + formatted in the generic SCADA data format. + cond (iterable): List or array-like variable with bool entries + depicting whether the condition is met or not. These should be + situations in which you classify the data as faulty. 
For example, + high wind speeds but low power productions, or NaNs, self-flagged + status variables. + turbine_list (int, list): Turbine identifier(s) for which the data + should be flagged as faulty when the condition is met. + exclude_columns (list, optional): List of columns that should not + be considered for the filtering. Defaults to []. + + Returns: + pd.DataFrame: Dataframe with the faulty measurements marked as None. + """ if isinstance(turbine_list, (np.integer, int)): turbine_list = [turbine_list] @@ -39,7 +73,9 @@ def df_mark_turbdata_as_faulty(df, cond, turbine_list, exclude_columns=[]): class FlascFilter: - """This class allows a user to filter turbine data based on the + """Implement filtering class for SCADA data. + + This class allows a user to filter turbine data based on the wind-speed power curve. This class includes several useful filtering methods: 1. Filtering based on prespecified boxes/windows. Any data outside @@ -56,14 +92,14 @@ def __init__(self, df, turbine_names=None): """Initializes the class. Args: - df ([pd.DataFrame]): Dataframe containing the turbine data, + df (pd.DataFrame): Dataframe containing the turbine data, formatted in the generic SCADA data format. Namely, the dataframe should at the very least contain the columns: * Time of each measurement: time * Wind speed of each turbine: ws_000, ws_001, ... * Power production of each turbine: pow_000, pow_001, ... + turbine_names (list, optional): List of turbine names. Defaults to None. """ - # Write dataframe to self self._df_initial = df.copy() self.reset_filters() @@ -73,7 +109,9 @@ def __init__(self, df, turbine_names=None): # Private methods def _get_all_unique_flags(self): - """Private function that grabs all the unique filter flags + """Returns all unique flags in the dataframe. + + Private function that grabs all the unique filter flags that are available in self.df_filters and returns them as a list of strings. 
This is helpful when plotting the various filter sources in a scatter plot, for example. @@ -108,13 +146,15 @@ def _reset_mean_power_curves(self, ws_bins=np.arange(0.0, 25.5, 0.5)): self.pw_curve_df = pw_curve_df def _get_mean_power_curves(self, df=None, turbine_subset=None): - """Calculates the mean power production in bins of the wind speed, + """Calculates the mean power production in bins of the wind speed. + + Calculates the mean power production in bins of the wind speed, for all turbines in the wind farm. Args: ws_bins ([iteratible], optional): Wind speed bins. Defaults to np.arange(0.0, 25.5, 0.5). - df ([pd.DataFrame]): Dataframe containing the turbine data, + df (pd.DataFrame): Dataframe containing the turbine data, formatted in the generic SCADA data format. Namely, the dataframe should at the very least contain the columns: * Time of each measurement: time @@ -123,12 +163,12 @@ def _get_mean_power_curves(self, df=None, turbine_subset=None): turbine_subset (list, optional): List of turbine indices to calculate the mean power curve for. If None is specified, defaults to calculating it for all turbines. + Returns: - pw_curve_df ([pd.DataFrame]): Dataframe containing the wind + pd.DataFrame: Dataframe containing the wind speed bins and the mean power production value for every turbine. """ - # If df unspecified, use the locally filtered variable if df is None: df = self.df @@ -173,7 +213,6 @@ def _get_mean_power_curves(self, df=None, turbine_subset=None): # Public methods def reset_filters(self): """Reset all filter variables and assume all data is clean.""" - # Copy the original, unfiltered dataframe from self df = self._df_initial self.df = df.reset_index(drop=("time" in df.columns)) @@ -200,7 +239,9 @@ def filter_by_condition( verbose: bool = True, apply_filters_to_df: bool = True, ): - """This is a generic method to filter the dataframe for any particular + """Filter the dataframe for a specific condition, for a specific turbine. 
+ + This is a generic method to filter the dataframe for any particular condition, for a specific turbine or specific set of turbines. This provides a platform for user-specific queries to filter and then inspect the data with. You can call this function multiple times and the filters @@ -230,7 +271,7 @@ def filter_by_condition( high wind speeds but low power productions, or NaNs, self-flagged status variables. label (str): Name or description of the fault/condition that is flagged. - ti (int): Turbine indentifier, typically an integer, but may also be a + ti (int): Turbine identifier, typically an integer, but may also be a list. This flags the measurements of all these turbines as faulty for which condition==True. verbose (bool, optional): Print information to console. Defaults to True. @@ -238,11 +279,10 @@ def filter_by_condition( self.df directly as NaN. Defaults to True. Returns: - df_out: The filtered dataframe. All measurements that are flagged as faulty + pd.DataFrame: The filtered dataframe. All measurements that are flagged as faulty are overwritten by "None"/"NaN". If apply_filters_to_df==True, then this dataframe is equal to the internally filtered dataframe 'self.df'. """ - # Pour it into a list format if isinstance(ti, int): ti = [ti] @@ -294,7 +334,9 @@ def filter_by_sensor_stuck_faults( plot: bool = False, verbose: bool = True, ): - """Filter the turbine measurements for sensor-stuck type of faults. This is + """Filter the turbine measurements for sensor-stuck type of faults. + + This is the situation where a turbine measurement reads the exact same value for multiple consecutive timestamps. This typically indicates a "frozen" sensor rather than a true physical effect. This is particularly the case for @@ -332,12 +374,11 @@ def filter_by_sensor_stuck_faults( verbose (bool, optional): Print information to console. Defaults to True. 
Returns: - self.df: Pandas DataFrame with the filtered data, in which faulty turbine + pd.DataFrame: Pandas DataFrame with the filtered data, in which faulty turbine measurements are flagged as None/NaN. This is an aggregated filtering variable, so it includes faulty-flagged measurements from filter operations in previous steps. """ - # Filter sensor faults using the separate function call stuck_indices = find_sensor_stuck_faults( df=self.df, @@ -375,37 +416,44 @@ def filter_by_power_curve( no_iterations=10, cutoff_ws=20.0, ): - """Filter the data by offset from the mean power curve in x- - directions. This is an iterative process because the estimated mean + """Filter the data by offset from the mean power curve in x-directions. + + This is an iterative process because the estimated mean curve actually changes as data is filtered. This process typically converges within a couple iterations. Args: ti (int): The turbine identifier for which the data should be - filtered. + filtered. m_ws_lb (float, optional): Multiplier on the wind speed defining - the left bound for the power curve. Any data to the left of this - curve is considered faulty. Defaults to 0.95. + the left bound for the power curve. Any data to the left of this + curve is considered faulty. Defaults to 0.95. m_pow_lb (float, optional): Multiplier on the power defining - the left bound for the power curve. Any data to the left of this - curve is considered faulty. Defaults to 1.01. + the left bound for the power curve. Any data to the left of this + curve is considered faulty. Defaults to 1.01. m_ws_rb (float, optional): Multiplier on the wind speed defining - the right bound for the power curve. Any data to the right of this - curve is considered faulty. Defaults to 1.05. + the right bound for the power curve. Any data to the right of this + curve is considered faulty. Defaults to 1.05. m_pow_rb (float, optional): Multiplier on the power defining - the right bound for the power curve. 
Any data to the right of this - curve is considered faulty. Defaults to 0.99. + the right bound for the power curve. Any data to the right of this + curve is considered faulty. Defaults to 0.99. + ws_deadband (float, optional): Deadband in [m/s] around the median + power curve around which data is by default classified as valid. + Defaults to 0.50. + pow_deadband (float, optional): Deadband in [kW] around the median + power curve around which data is by default classified as valid. + Defaults to 20.0. no_iterations (int, optional): Number of iterations. The - solution typically converges in 2-3 steps, but as the process is - very fast, it's better to run a higher number of iterations. - Defaults to 10. + solution typically converges in 2-3 steps, but as the process is + very fast, it's better to run a higher number of iterations. + Defaults to 10. cutoff_ws (float, optional): Upper limit for the filtering to occur. - Typically, this is a value just below the cut-out wind speed. Namely, - issues arise if you put this wind speed above the cut-out wind speed, - because we effectively end up with two curves for the same power - production (one at region 2, one going down from cut-out wind speed). - This confuses the algorithm. Hence, suggested to put this somewhere - around 15-25 m/s. Defaults to 20 m/s. + Typically, this is a value just below the cut-out wind speed. Namely, + issues arise if you put this wind speed above the cut-out wind speed, + because we effectively end up with two curves for the same power + production (one at region 2, one going down from cut-out wind speed). + This confuses the algorithm. Hence, suggested to put this somewhere + around 15-25 m/s. Defaults to 20 m/s. """ # Initialize the dataframe from self, as a starting point. 
Note # that in each iteration, we do not want to build upon the @@ -545,36 +593,43 @@ def filter_by_floris_power_curve( pow_deadband=20.0, cutoff_ws=20.0, ): - """Filter the data by offset from the floris power curve in x- - directions. + """Filter the data by offset from the floris power curve. Args: fm (FlorisModel): The FlorisModel object for the farm + ti (int): The turbine identifier for which the data should be + filtered. m_ws_lb (float, optional): Multiplier on the wind speed defining - the left bound for the power curve. Any data to the left of this - curve is considered faulty. Defaults to 0.95. + the left bound for the power curve. Any data to the left of this + curve is considered faulty. Defaults to 0.95. m_pow_lb (float, optional): Multiplier on the power defining - the left bound for the power curve. Any data to the left of this - curve is considered faulty. Defaults to 1.01. + the left bound for the power curve. Any data to the left of this + curve is considered faulty. Defaults to 1.01. m_ws_rb (float, optional): Multiplier on the wind speed defining - the right bound for the power curve. Any data to the right of this - curve is considered faulty. Defaults to 1.05. + the right bound for the power curve. Any data to the right of this + curve is considered faulty. Defaults to 1.05. m_pow_rb (float, optional): Multiplier on the power defining - the right bound for the power curve. Any data to the right of this - curve is considered faulty. Defaults to 0.99. + the right bound for the power curve. Any data to the right of this + curve is considered faulty. Defaults to 0.99. ws_deadband (float, optional): Deadband in [m/s] around the median - power curve around which data is by default classified as valid. - Defaults to 0.50. + power curve around which data is by default classified as valid. + Defaults to 0.50. pow_deadband (float, optional): Deadband in [kW] around the median - power curve around which data is by default classified as valid. 
- Defaults to 20.0. + power curve around which data is by default classified as valid. + Defaults to 20.0. cutoff_ws (float, optional): Wind speed up to which the median - power curve is calculated and the data is filtered for. You should - make sure this variable is set to a value above the rated wind - speed and below the cut-out wind speed. If you are experiencing - problems with data filtering and your data points have a downward - trend near the high wind speeds, try decreasing this variable's - value to 15.0. + power curve is calculated and the data is filtered for. You should + make sure this variable is set to a value above the rated wind + speed and below the cut-out wind speed. If you are experiencing + problems with data filtering and your data points have a downward + trend near the high wind speeds, try decreasing this variable's + value to 15.0. + + Returns: + pd.DataFrame: Pandas DataFrame with the filtered data, in which faulty turbine + measurements are flagged as None/NaN. This is an aggregated filtering + variable, so it includes faulty-flagged measurements from filter + operations in previous steps. """ logger.info("Filtering data by deviations from the floris power curve...") @@ -715,8 +770,9 @@ def get_power_curve(self, calculate_missing=True): calculate_missing (bool, optional): Calculate the median power curves for the turbines for the turbines of which their power curves were previously not yet calculated. + Returns: - pw_curve_df ([pd.DataFrame]): Dataframe containing the wind + pw_curve_df (pd.DataFrame): Dataframe containing the wind speed bins and the mean power production value for every turbine. calculate_missing (bool, optional): Calculate the median power @@ -734,15 +790,21 @@ def get_power_curve(self, calculate_missing=True): return self.pw_curve_df def plot_farm_mean_power_curve(self, fm=None): - """Plot all turbines' power curves in a single figure. Also estimate + """Plot all turbines' power curves in a single figure. 
+ + Also estimate and plot a mean turbine power curve. Args: fm (FlorisModel): The FlorisModel object for the farm. If specified by the user, then the farm-average turbine power curve from FLORIS will be plotted on top of the SCADA-based power curves. - """ + Returns: + fig, ax: The figure and axis objects of the + plot. The user can further manipulate the plot + by calling methods on these objects + """ # Get mean power curves for the turbines that are not yet calculated if self.pw_curve_df.isna().all(axis=0).any(): turbine_subset = np.where( @@ -785,7 +847,9 @@ def plot_farm_mean_power_curve(self, fm=None): def plot_filters_custom_scatter( self, ti, x_col, y_col, xlabel="Wind speed (m/s)", ylabel="Power (kW)", ax=None ): - """Plot the filtered data in a scatter plot, categorized + """Plot the filtered data in a scatter plot. + + Plot the filtered data in a scatter plot, categorized by the source of their filter/fault. This is a generic function that allows the user to plot various numeric variables on the x and y axis. @@ -862,7 +926,9 @@ def plot_filters_custom_scatter_bokeh( ylabel="Power (kW)", p=None, ): - """Plot the filtered data in a scatter plot, categorized + """Plot the filtered data in a scatter plot. + + Plot the filtered data in a scatter plot, categorized by the source of their filter/fault. This is a generic function that allows the user to plot various numeric variables on the x and y axis. @@ -951,7 +1017,9 @@ def plot_filters_custom_scatter_bokeh( return p def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None): - """Plot the wind speed power curve and connect each faulty datapoint + """Plot faulty data in the wind speed power curve. + + Plot the wind speed power curve and connect each faulty datapoint to the label it was classified as faulty with. Args: @@ -960,8 +1028,10 @@ def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None): use this to plot the turbine power curves as implemented in floris. Defaults to None. 
ax (plt.Axis): Pyplot Axis object. - """ + Returns: + ax: The figure axis in which the scatter plot is drawn. + """ if ax is None: _, ax = plt.subplots(figsize=(10, 5)) @@ -1017,7 +1087,9 @@ def plot_filters_in_ws_power_curve(self, ti, fm=None, ax=None): return ax def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None): - """Plot the wind speed power curve and mark faulty data according to + """Plot the postprocessed data in the wind speed power curve. + + Plot the wind speed power curve and mark faulty data according to their filters. Args: @@ -1027,8 +1099,10 @@ def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None): Defaults to None. ax (Matplotlib.pyplot Axis, optional): Axis to plot in. If None is specified, creates a new figure and axis. Defaults to None. - """ + Returns: + ax: The figure axis in which the scatter plot is drawn. + """ if ax is None: _, ax = plt.subplots(figsize=(10, 5)) @@ -1093,7 +1167,9 @@ def plot_postprocessed_in_ws_power_curve(self, ti, fm=None, ax=None): return ax def plot_filters_in_time(self, ti, ax=None): - """Generate bar plot where each week of data is gathered and its + """Plot the filtered data in time. + + Generate bar plot where each week of data is gathered and its filtering results will be shown relative to the data size of each week. This plot can particularly be useful to investigate whether certain weeks/time periods show a particular high number of faulty @@ -1135,7 +1211,9 @@ def plot_filters_in_time(self, ti, ax=None): return ax def plot_filters_in_time_bokeh(self, ti, p=None): - """Generate bar plot where each week of data is gathered and its + """Plot the filtered data in time. + + Generate bar plot where each week of data is gathered and its filtering results will be shown relative to the data size of each week. 
This plot can particularly be useful to investigate whether certain weeks/time periods show a particular high number of faulty @@ -1147,8 +1225,10 @@ def plot_filters_in_time_bokeh(self, ti, p=None): ti (int): Index of the turbine of interest. p (Bokeh Figure, optional): Figure to plot in. If None is specified, creates a new figure. Defaults to None. - """ + Returns: + p (Bokeh Figure): The figure in which the bar plot is drawn. + """ if p is None: p = figure( title="Filters over time", @@ -1200,19 +1280,21 @@ def plot_filters_in_time_bokeh(self, ti, p=None): def filter_df_by_faulty_impacting_turbines(df, ti, df_impacting_turbines, verbose=True): - """Assigns a turbine's measurement to NaN for each timestamp for which any of the turbines + """Assign faulty measurements based on upstream turbine faults. + + Assigns a turbine's measurement to NaN for each timestamp for which any of the turbines that are shedding a wake on this turbine is reporting NaN measurements. Args: df (pd.DataFrame): Dataframe with SCADA data with measurements - formatted according to wd_000, wd_001, wd_002, pow_000, pow_001, - pow_002, and so on. - ti (integer): Turbine number for which we are filtering the data. - Basically, each turbine that impacts that power production of - turbine 'ti' by more than 0.1% is required to be reporting a - non-faulty measurement. If not, we classify the measurement of - turbine 'ti' as faulty because we cannot sufficiently know the - inflow conditions of this turbine. + formatted according to wd_000, wd_001, wd_002, pow_000, pow_001, + pow_002, and so on. + ti (int): Turbine number for which we are filtering the data. + Basically, each turbine that impacts the power production of + turbine 'ti' by more than 0.1% is required to be reporting a + non-faulty measurement. If not, we classify the measurement of + turbine 'ti' as faulty because we cannot sufficiently know the + inflow conditions of this turbine. 
df_impacting_turbines (pd.DataFrame): A Pandas DataFrame in the format of: @@ -1238,7 +1320,6 @@ def filter_df_by_faulty_impacting_turbines(df, ti, df_impacting_turbines, verbos pd.DataFrame: The postprocessed dataframe for 'df', filtered for inter-turbine issues like curtailment and turbine downtime. """ - # Get number of turbines n_turbines = dfm.get_num_turbines(df)