Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/provide frequency #123

Merged
merged 24 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
b80c62e
Remove unused var and clean a little
paulf81 Sep 11, 2023
9969099
Split the energy ratio calculation a little
paulf81 Sep 11, 2023
bba7348
Clarify test comments
paulf81 Sep 11, 2023
2365990
Add check of df_freq and specify datatype binning
paulf81 Sep 11, 2023
e740463
Update test for passing in frequency
paulf81 Sep 11, 2023
34b6ea1
Allow passing in of frequency table
paulf81 Sep 11, 2023
b187337
Change count weight to weight
paulf81 Sep 18, 2023
500744c
Include details of missed bins
paulf81 Sep 18, 2023
ab27a41
Add enforcement all ws/wd appear in all dataframes
paulf81 Sep 19, 2023
c33fcbb
Redo tests and add for missing bin on one side
paulf81 Sep 19, 2023
06d66c6
Add more tests for df_freq
paulf81 Sep 19, 2023
db0f1f4
Add checks on ws/wd missing on df_freq
paulf81 Sep 19, 2023
13df246
Add tests
paulf81 Sep 19, 2023
abe8e86
Polars deprecating groupby to group_by
paulf81 Sep 19, 2023
6de33f4
Computing df_freq explicitly.
misi9170 Sep 26, 2023
9744500
Clearer about resampling on first bootstrap call.
misi9170 Sep 26, 2023
1e2bb52
df_freq now wired through to Output object.
misi9170 Sep 26, 2023
bd3c68c
Compute ws freqs alone, if needed.
misi9170 Sep 26, 2023
bd27ab9
Now handling uplift plots correctly.
misi9170 Sep 26, 2023
94a6c26
Option to overlay df_freq weighting.
misi9170 Sep 27, 2023
9372c2b
Renaming to frequency for clarity.
misi9170 Sep 27, 2023
65097ce
Remove commented code and nomralize weights
paulf81 Oct 13, 2023
bdb9e62
Update comment
paulf81 Oct 13, 2023
6626ec5
Rerun example notebook.
misi9170 Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 91 additions & 3 deletions examples_smarteole/05_baseline_energy_ratio_analysis.ipynb

Large diffs are not rendered by default.

98 changes: 48 additions & 50 deletions examples_smarteole/06_wake_steering_energy_ratio_analysis.ipynb

Large diffs are not rendered by default.

243 changes: 162 additions & 81 deletions flasc/energy_ratio/energy_ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
filter_all_nulls,
filter_any_nulls
)
from flasc.dataframe_operations.dataframe_manipulations import df_reduce_precision


# Internal version, returns a polars dataframe
Expand All @@ -40,6 +41,7 @@ def _compute_energy_ratio_single(df_,
ws_max = 50.0,
bin_cols_in = ['wd_bin','ws_bin'],
weight_by = 'min', #min, sum
df_freq_pl = None,
wd_bin_overlap_radius = 0.,
uplift_pairs = [],
uplift_names = [],
Expand All @@ -66,6 +68,7 @@ def _compute_energy_ratio_single(df_,
weight_by (str): How to weight the energy ratio, options are 'min', or 'sum'. 'min' means
the minimum count across the dataframes is used to weight the energy ratio. 'sum' means the sum of the counts
across the dataframes is used to weight the energy ratio. Defaults to 'min'.
df_freq_pl (pl.DataFrame): Polars dataframe of pre-provided per-bin weights
wd_bin_overlap_radius (float): The distance in degrees one wd bin overlaps into the next, must be
less or equal to half the value of wd_step
uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
Expand All @@ -80,9 +83,10 @@ def _compute_energy_ratio_single(df_,

Returns:
pl.DataFrame: A dataframe containing the energy ratio for each wind direction bin
pl.DataFrame: A dataframe containing the weights for each wind direction and wind speed bin
"""

# Identify the number of dataframes
# Get the number of dataframes
num_df = len(df_names)

# Filter df_ to remove null values
Expand All @@ -105,35 +109,61 @@ def _compute_energy_ratio_single(df_,
df_ = add_ws_bin(df_, ws_cols, ws_step, ws_min, ws_max, remove_all_nulls=remove_all_nulls)
df_ = add_wd_bin(df_, wd_cols, wd_step, wd_min, wd_max, remove_all_nulls=remove_all_nulls)



# Assign the reference and test power columns
df_ = add_power_ref(df_, ref_cols)
df_ = add_power_test(df_, test_cols)

bin_cols_without_df_name = [c for c in bin_cols_in if c != 'df_name']
bin_cols_with_df_name = bin_cols_without_df_name + ['df_name']



# Group df_
df_ = (df_
.filter(pl.all_horizontal(pl.col(bin_cols_with_df_name).is_not_null())) # Select for all bin cols present
.groupby(bin_cols_with_df_name, maintain_order=True)
.agg([pl.mean("pow_ref"), pl.mean("pow_test"),pl.count()])
.with_columns(
[
# Get the weighting by counts
pl.col('count').min().over(bin_cols_without_df_name).alias('count_weight') if weight_by == 'min' else
pl.col('count').sum().over(bin_cols_without_df_name).alias('count_weight')
]
.group_by(bin_cols_with_df_name, maintain_order=True)
.agg([pl.mean("pow_ref"), pl.mean("pow_test"),pl.count()])

# Enforce that each ws/wd bin combination has to appear in all dataframes
.filter(pl.count().over(bin_cols_without_df_name) == num_df)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not important to address now, but really we just want to make sure each ws/wd bin combination appears in both data frames for each uplift pair, right? Not in all data frames, in the case where there are more than two?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this implementation is the most general. Say you want to compare data partitioned into 3 stability types: this checks that each type has at least one point in the used bins. I think it's a fair qualifier.


)
# Determine the weighting of the ws/wd bins

if df_freq_pl is None:
# Determine the weights per bin as either the min or sum count
df_freq_pl = (df_
.select(bin_cols_without_df_name+['count'])
.group_by(bin_cols_without_df_name)
.agg([pl.min('count') if weight_by == 'min' else pl.sum('count')])
.rename({'count':'weight'})
)

df_ = (df_.join(df_freq_pl, on=['wd_bin','ws_bin'], how='left')
.with_columns(pl.col('weight'))
)

# Check if all the values in the weight column are null
if df_['weight'].is_null().all():
raise RuntimeError("None of the ws/wd bins in data appear in df_freq")

# Check if any of the values in the weight column are null
if df_['weight'].is_null().any():
warnings.warn('Some bins in data are not in df_freq and will get 0 weight')

# Fill the null values with zeros
df_= df_.with_columns(pl.col('weight').fill_null(strategy="zero"))

paulf81 marked this conversation as resolved.
Show resolved Hide resolved
# Normalize the weights
df_ = df_.with_columns(pl.col('weight').truediv(pl.col('weight').sum()))

# Calculate energy ratios
df_ = (df_
.with_columns(
[
pl.col('pow_ref').mul(pl.col('count_weight')).alias('ref_energy'), # Compute the reference energy
pl.col('pow_test').mul(pl.col('count_weight')).alias('test_energy'), # Compute the test energy
pl.col('pow_ref').mul(pl.col('weight')).alias('ref_energy'), # Compute the reference energy
pl.col('pow_test').mul(pl.col('weight')).alias('test_energy'), # Compute the test energy
]
)
.groupby(['wd_bin','df_name'], maintain_order=True)
.group_by(['wd_bin','df_name'], maintain_order=True)
.agg([pl.sum("ref_energy"), pl.sum("test_energy"),pl.sum("count")])
.with_columns(
energy_ratio = pl.col('test_energy') / pl.col('ref_energy')
Expand All @@ -146,14 +176,18 @@ def _compute_energy_ratio_single(df_,

# In the case of two turbines, compute an uplift column
for upp, upn in zip(uplift_pairs, uplift_names):
count_cols = ["count_"+upp[0], "count_"+upp[1]]
df_ = df_.with_columns(
(100 * (pl.col(upp[1]) - pl.col(upp[0])) / pl.col(upp[0])).alias(upn)
[(100 * (pl.col(upp[1]) - pl.col(upp[0])) / pl.col(upp[0])).alias(upn),
(pl.min_horizontal(count_cols) if weight_by == "min" else
pl.sum_horizontal(count_cols)).alias("count_"+upn)
]
)

# Enforce a column order
df_ = df_.select(['wd_bin'] + df_names + uplift_names + [f'count_{n}' for n in df_names])
df_ = df_.select(['wd_bin'] + df_names + uplift_names + [f'count_{n}' for n in df_names+uplift_names])

return(df_)
return df_, df_freq_pl

# Bootstrap function wraps the _compute_energy_ratio function
def _compute_energy_ratio_bootstrap(er_in,
Expand All @@ -169,6 +203,7 @@ def _compute_energy_ratio_bootstrap(er_in,
ws_max = 50.0,
bin_cols_in = ['wd_bin','ws_bin'],
weight_by = 'min', #min, sum
df_freq_pl = None,
wd_bin_overlap_radius = 0.,
uplift_pairs = [],
uplift_names = [],
Expand Down Expand Up @@ -196,6 +231,7 @@ def _compute_energy_ratio_bootstrap(er_in,
weight_by (str): How to weight the energy ratio, options are 'min', or 'sum'. 'min' means
the minimum count across the dataframes is used to weight the energy ratio. 'sum' means the sum of the counts
across the dataframes is used to weight the energy ratio.
df_freq_pl (pl.DataFrame): Polars dataframe of pre-provided per-bin weights
wd_bin_overlap_radius (float): The distance in degrees one wd bin overlaps into the next, must be
less or equal to half the value of wd_step
uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
Expand All @@ -218,38 +254,44 @@ def _compute_energy_ratio_bootstrap(er_in,
"""

# Otherwise run the function N times and concatenate the results to compute statistics

df_concat = pl.concat([_compute_energy_ratio_single(er_in.resample_energy_table(i),
er_in.df_names,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
remove_all_nulls
) for i in range(N)])
er_single_outs = [
_compute_energy_ratio_single(
er_in.resample_energy_table(perform_resample=(i != 0)),
er_in.df_names,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
df_freq_pl,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
remove_all_nulls
) for i in range(N)
]
df_concat = pl.concat([er_single_out[0] for er_single_out in er_single_outs])
# First output contains the original table; use that df_freq_pl
df_freq_pl = er_single_outs[0][1]

bound_names = er_in.df_names + uplift_names

return (df_concat
.groupby(['wd_bin'], maintain_order=True)
.group_by(['wd_bin'], maintain_order=True)
.agg([pl.first(n) for n in bound_names] +
[pl.quantile(n, percentiles[0]/100).alias(n + "_ub") for n in bound_names] +
[pl.quantile(n, percentiles[1]/100).alias(n + "_lb") for n in bound_names] +
[pl.first(f'count_{n}') for n in er_in.df_names]
[pl.first(f'count_{n}') for n in bound_names]
)
.sort('wd_bin')
)
), df_freq_pl

def compute_energy_ratio(er_in: EnergyRatioInput,
ref_turbines = None,
Expand All @@ -267,6 +309,7 @@ def compute_energy_ratio(er_in: EnergyRatioInput,
ws_max = 50.0,
bin_cols_in = ['wd_bin','ws_bin'],
weight_by = 'min', #min or sum
df_freq = None,
wd_bin_overlap_radius = 0.,
uplift_pairs = None,
uplift_names = None,
Expand Down Expand Up @@ -297,6 +340,13 @@ def compute_energy_ratio(er_in: EnergyRatioInput,
weight_by (str): How to weight the energy ratio, options are 'min' or 'sum'. 'min' means
the minimum count across the dataframes is used to weight the energy ratio. 'sum' means the sum of the counts
across the dataframes is used to weight the energy ratio.
df_freq (pd.DataFrame): A dataframe which specifies the frequency of the ws/wd bin combinations. Provides
a method to use an explicit or long-term weighting of bins. Dataframe should include
columns ws, wd and freq_val. ws and wd should correspond to the bin centers resulting from
the choices of the ws/wd_min / _max / _step. In the case that df_freq has extra bins that aren't included
in those given by ws/wd min, max, step, they will be ignored in the energy ratio calculation.
Any bins given by ws/wd min, max, step not present in df_freq will be assigned a frequency of zero.
Defaults to None.
wd_bin_overlap_radius (float): The distance in degrees one wd bin overlaps into the next, must be
less or equal to half the value of wd_step
uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
Expand Down Expand Up @@ -338,6 +388,7 @@ def compute_energy_ratio(er_in: EnergyRatioInput,
ws_max,
bin_cols_in,
weight_by,
df_freq,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
Expand Down Expand Up @@ -381,62 +432,92 @@ def compute_energy_ratio(er_in: EnergyRatioInput,
# Convert the numbered arrays to appropriate column names
test_cols = [f'pow_{i:03d}' for i in test_turbines]

# If df_freq is provided, confirm it is consistent with ws/wd min max and
# prepare a polars table of weights
if df_freq is not None:

# Maybe not test, not sure yet
paulf81 marked this conversation as resolved.
Show resolved Hide resolved
# ws_edges = np.arange(ws_min, ws_max+ws_step,ws_step)
# ws_labels = ws_edges[:-1] + np.diff(ws_edges)/2.0
# wd_edges = np.arange(wd_min, wd_max+wd_step,wd_step)
# wd_labels = wd_edges[:-1] + np.diff(wd_edges)/2.0

# Convert to polars dataframe
df_freq_pl = pl.from_pandas(df_reduce_precision(df_freq, allow_convert_to_integer=False))

# Rename the columns
df_freq_pl = df_freq_pl.rename({
'ws':'ws_bin',
'wd':'wd_bin',
'freq_val':'weight'
})

else:
df_freq_pl = None

# If N=1, don't use bootstrapping
if N == 1:
if percentiles is not None:
print("percentiles can only be used with bootstrapping (N > 1).")
# Compute the energy ratio
df_res = _compute_energy_ratio_single(df_,
er_in.df_names,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
remove_all_nulls
)
df_res, df_freq_pl = _compute_energy_ratio_single(
df_,
er_in.df_names,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
df_freq_pl,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
remove_all_nulls
)
else:
if percentiles is None:
percentiles = [5, 95]
elif not hasattr(percentiles, "__len__") or len(percentiles) != 2:
raise ValueError("percentiles should be a two element list of the "+\
"upper and lower desired percentiles.")

df_res = _compute_energy_ratio_bootstrap(er_in,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
N,
percentiles
)
df_res, df_freq_pl = _compute_energy_ratio_bootstrap(
er_in,
ref_cols,
test_cols,
wd_cols,
ws_cols,
wd_step,
wd_min,
wd_max,
ws_step,
ws_min,
ws_max,
bin_cols_in,
weight_by,
df_freq_pl,
wd_bin_overlap_radius,
uplift_pairs,
uplift_names,
N,
percentiles
)

# Return the df_freqs, handle as needed.

# Sort df_res by df_names, ws, wd

# Return the results as an EnergyRatioOutput object
return EnergyRatioOutput(df_res.to_pandas(),
er_in,
df_freq_pl.to_pandas(),
ref_cols,
test_cols,
wd_cols,
Expand Down Expand Up @@ -511,7 +592,7 @@ def compute_energy_ratio(er_in: EnergyRatioInput,
# df_ = (df_.with_columns(
# power_ratio = pl.col('pow_test') / pl.col('pow_ref'))
# .filter(pl.all_horizontal(pl.col(bin_cols_with_df_name).is_not_null())) # Select for all bin cols present
# .groupby(bin_cols_with_df_name, maintain_order=True)
# .group_by(bin_cols_with_df_name, maintain_order=True)
# .agg([pl.mean("pow_ref"), pl.mean("power_ratio"),pl.count()])
# .with_columns(
# [
Expand Down
Loading