diff --git a/scripts/demand/load.py b/scripts/demand/load.py index a1882811..f1b6d593 100644 --- a/scripts/demand/load.py +++ b/scripts/demand/load.py @@ -1,69 +1,111 @@ """Generate Calliope load time series.""" + import math -import pandas as pd import geopandas as gpd +import pandas as pd -def load(path_to_units, path_to_demand_per_unit, path_to_electricity_load, scaling_factor, path_to_result): +def load( + path_to_units, + path_to_demand_per_unit, + path_to_electricity_load, + scaling_factor, + path_to_result, +): """Generate load time series for every location.""" units = gpd.read_file(path_to_units).set_index("id") demand_per_unit = pd.read_csv(path_to_demand_per_unit, index_col=0) national_load = pd.read_csv(path_to_electricity_load, index_col=0, parse_dates=True) - if (len(units.index) == 1) and (units.index[0] == "EUR"): # special case for continental level + if (len(units.index) == 1) and ( + units.index[0] == "EUR" + ): # special case for continental level national_load = pd.DataFrame(national_load.sum(axis=1).rename("EUR")) # demand_per_unit.demand_twh_per_year is not necessarily the demand in the year # used here and thus its absolute value must be ignored. - units["industrial_demand"] = demand_per_unit.demand_twh_per_year * demand_per_unit.industrial_demand_fraction - units["residential_demand"] = demand_per_unit.demand_twh_per_year - units.industrial_demand - assert not units["industrial_demand"].isna().any() - units["fraction_of_national_industrial_load"] = units.groupby("country_code").industrial_demand.transform( - lambda x: x / x.sum() - ).fillna(0) # if national demand is 0, division by zero - units["fraction_of_national_residential_load"] = units.groupby("country_code").residential_demand.transform( - lambda x: x / x.sum() + units["industrial_demand"] = ( + demand_per_unit.demand_twh_per_year * demand_per_unit.industrial_demand_fraction + ) + units["residential_demand"] = ( + demand_per_unit.demand_twh_per_year - units.industrial_demand ) + assert not units["industrial_demand"].isna().any() + units["fraction_of_national_industrial_load"] = ( + units.groupby("country_code") + .industrial_demand.transform(lambda x: x / x.sum()) + .fillna(0) + ) # if national demand is 0, division by zero + units["fraction_of_national_residential_load"] = units.groupby( + "country_code" + ).residential_demand.transform(lambda x: x / x.sum()) - national_industrial_load, national_residential_load = split_national_load(national_load, units) + national_industrial_load, national_residential_load = split_national_load( + national_load, units + ) load_ts = pd.concat( - [unit_time_series(unit, unit_name, national_industrial_load, national_residential_load, scaling_factor) - for unit_name, unit in units.iterrows()], - axis=1 + [ + unit_time_series( + unit, + unit_name, + national_industrial_load, + national_residential_load, + scaling_factor, + ) + for unit_name, unit in units.iterrows() + ], + axis=1, ) assert math.isclose( load_ts.sum().sum() * (-1) / scaling_factor, - national_load.reindex(columns=units.country_code.unique()).sum().sum() + national_load.reindex(columns=units.country_code.unique()).sum().sum(), ) load_ts.tz_convert(None).to_csv(path_to_result) def split_national_load(national_load, units): - national_industrial_demand = units.groupby("country_code").industrial_demand.sum() * 1e6 # from TWh to MWh - industrial_load = pd.DataFrame( # ASSUME flat industry load profiles - index=national_load.index, - data=national_industrial_demand.to_dict() - ).div(len(national_load.index)).reindex(columns=national_load.columns, fill_value=0) + national_industrial_demand = ( + units.groupby("country_code").industrial_demand.sum() * 1e6 + ) # from TWh to MWh + industrial_load = ( + pd.DataFrame( # ASSUME flat industry load profiles + index=national_load.index, data=national_industrial_demand.to_dict() + ) + .div(len(national_load.index)) + .reindex(columns=national_load.columns, fill_value=0) + ) residential_load = national_load - industrial_load return industrial_load, residential_load -def unit_time_series(unit, unit_name, national_industrial_load, national_residential_load, scaling_factor): +def unit_time_series( + unit, unit_name, national_industrial_load, national_residential_load, scaling_factor +): country_code = unit.country_code multiplier = unit.fraction_of_national_industrial_load - unit_industrial_ts = national_industrial_load.loc[:, country_code].copy() * multiplier * (-1) * scaling_factor + unit_industrial_ts = ( + national_industrial_load.loc[:, country_code].copy() + * multiplier + * (-1) + * scaling_factor + ) multiplier = unit.fraction_of_national_residential_load - unit_residential_ts = national_residential_load.loc[:, country_code].copy() * multiplier * (-1) * scaling_factor + unit_residential_ts = ( + national_residential_load.loc[:, country_code].copy() + * multiplier + * (-1) + * scaling_factor + ) unit_ts = unit_industrial_ts + unit_residential_ts unit_ts.name = unit_name.replace(".", "-") return unit_ts -if __name__ == '__main__': +if __name__ == "__main__": load( path_to_units=snakemake.input.units, path_to_demand_per_unit=snakemake.input.demand_per_unit, path_to_electricity_load=snakemake.input.national_load, path_to_result=snakemake.output[0], - scaling_factor=snakemake.params.scaling_factor + scaling_factor=snakemake.params.scaling_factor, ) diff --git a/scripts/demand/national_load.py b/scripts/demand/national_load.py index b89fd90f..927b41a0 100644 --- a/scripts/demand/national_load.py +++ b/scripts/demand/national_load.py @@ -14,6 +14,7 @@ This process will fail if the 'most complete dataset' for any country has any remaining gaps. """ + import calendar import pandas as pd @@ -21,30 +22,46 @@ def national_load( - path_to_raw_load, first_year, final_year, data_quality_config, path_to_output, countries + path_to_raw_load, + first_year, + final_year, + data_quality_config, + path_to_output, + countries, ): """Extracts national load time series for all countries in a specified year.""" - load = clean_load_data(path_to_raw_load, first_year, final_year, data_quality_config, countries) + load = clean_load_data( + path_to_raw_load, first_year, final_year, data_quality_config, countries + ) load.to_csv(path_to_output, header=True) -def clean_load_data(path_to_raw_load, first_year, final_year, data_quality_config, countries): +def clean_load_data( + path_to_raw_load, first_year, final_year, data_quality_config, countries +): data_sources = data_quality_config["data-source-priority-order"] raw_load = read_load_profiles(path_to_raw_load, data_sources) filtered_load = filter_countries(raw_load, countries) filtered_load = filter_outliers(filtered_load, data_quality_config) gap_filled_load = pd.concat( fill_gaps_per_source(filtered_load, year, data_quality_config, source) - for source in data_sources for year in range(first_year, final_year + 1) + for source in data_sources + for year in range(first_year, final_year + 1) ).sort_index() return get_source_choice_per_country( filtered_load[ - (filtered_load.index.get_level_values("utc_timestamp") >= f"{first_year}-01-01 00:00") & - (filtered_load.index.get_level_values("utc_timestamp") <= f"{final_year}-12-31 23:00") + ( + filtered_load.index.get_level_values("utc_timestamp") + >= f"{first_year}-01-01 00:00" + ) + & ( + filtered_load.index.get_level_values("utc_timestamp") + <= f"{final_year}-12-31 23:00" + ) ], gap_filled_load, - data_sources + data_sources, ) @@ -52,8 +69,7 @@ def read_load_profiles(path_to_raw_load, entsoe_priority): """Reads national load data.""" data = pd.read_csv(path_to_raw_load, parse_dates=["utc_timestamp"]) load_by_attribute = ( - data - [(data.variable == "load") & (data.attribute.isin(entsoe_priority))] + data[(data.variable == "load") & (data.attribute.isin(entsoe_priority))] .set_index(["utc_timestamp", "attribute", "region"]) .loc[:, "data"] .unstack("region") @@ -67,13 +83,14 @@ def filter_countries(load, countries): # Other subsets of the UK exist in the data, with UKM being the only one to cover the whole country. load.rename(columns={"GB_UKM": "GB"}, inplace=True) country_codes = { - pycountry.countries.lookup(country).alpha_2: pycountry.countries.lookup(country).alpha_3 + pycountry.countries.lookup(country).alpha_2: pycountry.countries.lookup( + country + ).alpha_3 for country in countries } national = ( - load - .loc[:, country_codes.keys()] + load.loc[:, country_codes.keys()] .rename(columns=country_codes) .rename_axis(columns="country_code") ) @@ -84,8 +101,14 @@ def filter_countries(load, countries): def filter_outliers(load, data_quality_config): normed_load = load / load.mean() cleaned_load = load.where( - (normed_load >= data_quality_config["outlier-data-thresholds"]["relative-to-mean-min"]) & - (normed_load <= data_quality_config["outlier-data-thresholds"]["relative-to-mean-max"]) + ( + normed_load + >= data_quality_config["outlier-data-thresholds"]["relative-to-mean-min"] + ) + & ( + normed_load + <= data_quality_config["outlier-data-thresholds"]["relative-to-mean-max"] + ) ) return cleaned_load @@ -93,32 +116,44 @@ def filter_outliers(load, data_quality_config): def fill_gaps_per_source(all_load, model_year, data_quality_config, source): """For each valid data source, fills in all NaNs by interpolation or with data from other years""" source_specific_load = all_load.xs(source, level="attribute") - source_specific_load = _interpolate_gaps(source_specific_load, data_quality_config["max-interpolate-timesteps"]) + source_specific_load = _interpolate_gaps( + source_specific_load, data_quality_config["max-interpolate-timesteps"] + ) # If there is some data in given year, fill in the rest from other years if model_year in source_specific_load.index.year.unique(): source_specific_model_year_load = source_specific_load.loc[str(model_year)] source_specific_model_year_load = _fill_gaps_from_other_years( - source_specific_model_year_load, source_specific_load, data_quality_config, source, model_year + source_specific_model_year_load, + source_specific_load, + data_quality_config, + source, + model_year, ) # If there is no data in given year, return a series of NaNs spanning the whole year else: source_specific_model_year_load = source_specific_load.reindex( - pd.date_range(f"{model_year}-01-01", f"{model_year}-12-31", freq="H", tz="UTC") + pd.date_range( + f"{model_year}-01-01", f"{model_year}-12-31", freq="H", tz="UTC" + ) ) - return ( - source_specific_model_year_load - .assign(attribute=source) - .set_index("attribute", append=True) + return source_specific_model_year_load.assign(attribute=source).set_index( + "attribute", append=True ) -def _fill_gaps_from_other_years(model_year_load, all_load, data_quality_config, source, model_year): +def _fill_gaps_from_other_years( + model_year_load, all_load, data_quality_config, source, model_year +): missing_data_countries = _countries_with_missing_data_in_model_year(model_year_load) for country in missing_data_countries: - updated_country_series, N_missing_timesteps, fill_years = _fill_missing_data_in_country( - all_load.loc[:, country], model_year, data_quality_config["acceptable-year-diff-for-gap-filling"] + updated_country_series, N_missing_timesteps, fill_years = ( + _fill_missing_data_in_country( + all_load.loc[:, country], + model_year, + data_quality_config["acceptable-year-diff-for-gap-filling"], + ) ) if data_quality_config["fill-29th-feb-from-28th"]: updated_country_series = _fill_29th_feb(updated_country_series, model_year) @@ -133,27 +168,41 @@ def _fill_gaps_from_other_years(model_year_load, all_load, data_quality_config, return model_year_load -def _fill_missing_data_in_country(country_series, model_year, acceptable_year_diff_for_gap_filling): +def _fill_missing_data_in_country( + country_series, model_year, acceptable_year_diff_for_gap_filling +): all_missing_timesteps = _get_index_of_missing_data(country_series, model_year) fill_years = [] - years_with_data = country_series.dropna().index.year.drop(model_year, errors="ignore").unique() + years_with_data = ( + country_series.dropna().index.year.drop(model_year, errors="ignore").unique() + ) allowed_years_with_data = years_with_data[ - (years_with_data >= model_year - acceptable_year_diff_for_gap_filling) & - (years_with_data <= model_year + acceptable_year_diff_for_gap_filling) + (years_with_data >= model_year - acceptable_year_diff_for_gap_filling) + & (years_with_data <= model_year + acceptable_year_diff_for_gap_filling) ] order = (allowed_years_with_data.values - model_year).astype(float) - order[order < 0] -= 0.1 # negative years = older + order[order < 0] -= 0.1 # negative years = older - preferred_order_of_years_with_data = allowed_years_with_data.sort_values(key=lambda x: abs(order)) + preferred_order_of_years_with_data = allowed_years_with_data.sort_values( + key=lambda x: abs(order) + ) _missing_timesteps = all_missing_timesteps.copy() for next_avail_year in preferred_order_of_years_with_data: if len(_missing_timesteps) == 0: break _missing_timesteps = _get_index_of_missing_data(country_series, model_year) - _missing_timesteps = _ignore_feb_29th(model_year, next_avail_year, _missing_timesteps) - new_data = country_series.reindex(_missing_timesteps.map(lambda dt: dt.replace(year=next_avail_year))) + _missing_timesteps = _ignore_feb_29th( + model_year, next_avail_year, _missing_timesteps + ) + new_data = country_series.reindex( + _missing_timesteps.map( + lambda dt, next_avail_year=next_avail_year: dt.replace( + year=next_avail_year + ) + ) + ) new_data.index = _missing_timesteps country_series.update(new_data) fill_years.append(str(next_avail_year)) @@ -167,21 +216,22 @@ def _get_index_of_missing_data(series, model_year): def _ignore_feb_29th(this_year, next_avail_year, missing_timesteps): - """ Ignore February 29th if next available year doesn't have that data available.""" - if (calendar.isleap(this_year) + """Ignore February 29th if next available year doesn't have that data available.""" + if ( + calendar.isleap(this_year) and not calendar.isleap(next_avail_year) - and pd.to_datetime(f'{this_year}-02-29').date() in missing_timesteps.date + and pd.to_datetime(f"{this_year}-02-29").date() in missing_timesteps.date ): return missing_timesteps[ - missing_timesteps.date != pd.to_datetime(f'{this_year}-02-29').date() + missing_timesteps.date != pd.to_datetime(f"{this_year}-02-29").date() ] else: return missing_timesteps def _fill_29th_feb(load, year): - if f'{year}-02-29' in load.index.strftime("%Y-%m-%d"): - filler = load.loc[f'{year}-02-28'] + if f"{year}-02-29" in load.index.strftime("%Y-%m-%d"): + filler = load.loc[f"{year}-02-28"] filler.index = filler.index + pd.DateOffset(1) return load.fillna(filler) else: @@ -208,8 +258,7 @@ def get_source_choice_per_country(raw_load, gap_filled_load, entsoe_priority): """ source_choice = ( - gap_filled_load - .notnull() + gap_filled_load.notnull() .groupby(level="attribute") .sum() .loc[entsoe_priority] @@ -219,8 +268,9 @@ def get_source_choice_per_country(raw_load, gap_filled_load, entsoe_priority): ) print( - "Using the following data sources for national load:\n{}" - .format("\n".join([f"{idx[0]}: {idx[1]}" for idx in source_choice.index])) + "Using the following data sources for national load:\n{}".format( + "\n".join([f"{idx[0]}: {idx[1]}" for idx in source_choice.index]) + ) ) new_load = _select_load_by_source_priority(gap_filled_load, source_choice) raw_load_for_comparison = _select_load_by_source_priority(raw_load, source_choice) @@ -229,7 +279,10 @@ def get_source_choice_per_country(raw_load, gap_filled_load, entsoe_priority): bad_index_values = new_load.isnull().stack() error_msg = "Gap filling thresholds do not allow for a complete load dataset to be produced. " if bad_index_values[bad_index_values].size < 100: - error_msg = error_msg + f"Remaining empty data: {bad_index_values[bad_index_values].index.to_list()}" + error_msg = ( + error_msg + + f"Remaining empty data: {bad_index_values[bad_index_values].index.to_list()}" + ) raise AssertionError(error_msg) else: @@ -242,8 +295,7 @@ def get_source_choice_per_country(raw_load, gap_filled_load, entsoe_priority): def _select_load_by_source_priority(load, source_priority): load_filtered_by_priority_source = ( - load - .unstack("attribute") + load.unstack("attribute") .loc[:, source_priority.index] .droplevel("attribute", axis="columns") ) @@ -259,5 +311,5 @@ def _select_load_by_source_priority(load, source_priority): final_year=snakemake.params.final_year, data_quality_config=snakemake.params.data_quality_config, countries=snakemake.params.countries, - path_to_output=snakemake.output[0] + path_to_output=snakemake.output[0], )