-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathplot_scaling_convergence.py
139 lines (123 loc) · 7.62 KB
/
plot_scaling_convergence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import uproot
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import yaml
from glob import glob
import os
import sys
from collections import defaultdict
import click
sns.set_theme(context='notebook', font='sans-serif', style='white', palette=None, font_scale=1.5,
rc={"lines.linewidth": 2.5, "font.sans-serif": 'DejaVu Sans', "text.usetex": True})
def read_params_to_df(file_names, cone_type, normal_features):
mean_dict, std_dict = defaultdict(list), defaultdict(list)
for file_name in file_names:
with open(file_name) as f:
scalings = json.load(f)
for var_type in scalings.keys():
mean, std = {}, {}
for feature, feature_scaling in scalings[var_type].items():
if cone_type not in feature_scaling: continue
if feature not in normal_features[var_type]: continue
if (mean_ := feature_scaling[cone_type]['mean']) is not None:
mean[feature] = mean_
if (std_:=feature_scaling[cone_type]['std']) is not None:
std[feature] = std_
mean_dict[var_type].append(mean)
std_dict[var_type].append(std)
df_mean_dict = {var_type: pd.DataFrame(mean_dict[var_type]) for var_type in mean_dict}
df_std_dict = {var_type: pd.DataFrame(std_dict[var_type]) for var_type in std_dict}
return df_mean_dict, df_std_dict
def plot_running_diff(running_diff_mean, running_diff_std,
n_files_per_step, var_type, cone_type, savepath=None, close_plot=False):
fig, axs = plt.subplots(2, 1, figsize=(23,8))
assert len(running_diff_mean)==len(running_diff_std)
n_snapshots = len(running_diff_mean)
axs[0].boxplot(list(running_diff_mean.values), notch=True, bootstrap=10000, whis=(5, 95))
# axs[0].set_title(f'\text{var_type}{cone_type}')
axs[0].set_ylabel(r'$\frac{\Big|\mathrm{mean}[i] - \mathrm{mean}[i-1]\Big|}{\Big|\mathrm{mean}[i-1]\Big|}$')
axs[0].set_yscale('log')
axs[0].set_xlim(.3, axs[0].get_xlim()[1])
axs[0].set_xticks(list(range(1, n_snapshots+1)))
axs[0].set_xticklabels(list(range(1, n_snapshots))+['all'])
axs[0].hlines(.1, axs[0].get_xlim()[0], axs[0].get_xlim()[1]-1, ls=':', linewidth=3, label='10\% level')
axs[0].vlines(49.5, *axs[0].get_ylim(), 'black', linewidth=1.5)
axs[0].set_title(f"{var_type.replace('_', '-')}, {cone_type.replace('_', '-')}")
#
axs[1].boxplot(list(running_diff_std.values), notch=True, bootstrap=10000, whis=(5, 95))
axs[1].set_ylabel(r'$\frac{\Big|\mathrm{std}[i] - \mathrm{std}[i-1]\Big|}{\Big|\mathrm{std}[i-1]\Big|}$')
axs[1].set_yscale('log')
axs[1].set_xlabel(f'snapshot i, increment of {n_files_per_step} files per step')
axs[1].set_xlim(.3, axs[1].get_xlim()[1])
axs[1].set_xticks(list(range(1, n_snapshots+1)))
axs[1].set_xticklabels(list(range(1, n_snapshots))+['all'])
axs[1].hlines(.1, axs[1].get_xlim()[0], axs[1].get_xlim()[1]-1, ls=':', linewidth=3, label='10\% level')
axs[1].vlines(49.5, *axs[1].get_ylim(), 'black', linewidth=1.5)
# plt.legend(loc='upper center')
if not close_plot: plt.show()
if savepath is not None:
fig.savefig(f'{savepath}/{var_type}_{cone_type}.png')
if close_plot:
plt.close()
@click.command(
help="Plot the convergence of scaling parameters' computation as a running difference between consequent snapshots."
)
@click.option("--train-cfg", type=str, default='../configs/training_v1.yaml', help="Path to yaml configuration file used for training", show_default=True)
@click.option('-p', '--path-to-snapshots', type=str, help='Path to the directory with json snapshot files')
@click.option('-pr', '--snapshot-prefix', type=str, help='Prefix of the name of scaling json files')
@click.option('-n', '--nfiles-step', 'N_FILES_PER_STEP', type=int, help='Number of processed files per log step, as it was set while running the scaling computation')
@click.option('-t', '--var-types', 'VAR_TYPES', multiple=True, default=['-1'], help='Variables\' types for which scaling parameters are to be plotted, -1 to run on all of them', show_default=True)
@click.option('-c', '--cone-types', 'CONE_TYPES', multiple=True, default=['global', 'inner', 'outer'], help='Cone types for which scaling parameters are to be plotted', show_default=True)
@click.option('-o', "--output-folder", type=str, default='scaling_plots/convergence', help='Output directory to save plots', show_default=True)
def main(
train_cfg, path_to_snapshots, snapshot_prefix, N_FILES_PER_STEP,
VAR_TYPES, CONE_TYPES, output_folder
):
if not os.path.isdir(output_folder):
os.makedirs(output_folder)
# open the main training cfg
with open(train_cfg) as f:
training_cfg = yaml.safe_load(f)
# scaling json doesn't store scaling_type, so read it from the scaling cfg
normal_features = defaultdict(list) # interested only in scaling type "normal", where means/stds were computed
for var_type, var_list in training_cfg['Features_all'].items():
for var_dict in var_list:
var_name = list(var_dict.keys())[0]
if var_dict[var_name][2] == 'normal':
normal_features[var_type].append(var_name)
if len(VAR_TYPES)==1 and VAR_TYPES[0]=='-1':
VAR_TYPES = list(normal_features.keys())
# read json file names into list
n_snapshots = len(glob(f'{path_to_snapshots}/{snapshot_prefix}_log_*.json'))
sorted_file_names = [f'{path_to_snapshots}/{snapshot_prefix}_log_{i}.json' for i in range(n_snapshots)] # sorting in ascending order
sorted_file_names += [f'{path_to_snapshots}/{snapshot_prefix}.json'] # append final snapshot to the end
for cone_type in CONE_TYPES:
# read scaling params into pandas DataFrame
df_mean_dict, df_std_dict = read_params_to_df(sorted_file_names, cone_type, normal_features)
for var_type in VAR_TYPES:
df_mean = df_mean_dict[var_type]
df_std = df_std_dict[var_type]
if len(df_mean.columns)==0 and len(df_std.columns)==0:
print(f"[INFO] Found no variables for var_type ({var_type}) with scaling type (normal) and cone type ({cone_type}), skipping it")
continue
# drop nans and remove inf columns
not_inf_columns = list(df_mean.columns[~np.isinf(df_mean).any()])
df_mean = df_mean[not_inf_columns].dropna(axis=1, how='any')
if np.any(np.isinf(df_mean)) > 0:
print("Found columns with inf in df_mean: ", df_mean.columns[np.isinf(df_mean)])
not_inf_columns = list(df_std.columns[~np.isinf(df_std).any()])
df_std = df_std[not_inf_columns].dropna(axis=1, how='any')
if np.any(np.isinf(df_std)) > 0:
print("Found columns with inf in df_std: ", df_std.columns[np.isinf(df_std)])
# running diffs = difference of mean/std value between two consecutive snapshots, normalised by the value
running_diff_mean = abs(df_mean.iloc[1:].values - (df_mean.iloc[0:-1].values)) / abs(df_mean.iloc[0:-1])
running_diff_std = abs(df_std.iloc[1:].values - (df_std.iloc[0:-1].values)) / abs(df_std.iloc[0:-1])
# append to the end difference between 1st and final snapshots
running_diff_mean.iloc[-1] = abs(df_mean.iloc[-1].values - (df_mean.iloc[0].values)) / abs(df_mean.iloc[0])
running_diff_std.iloc[-1] = abs(df_std.iloc[-1].values - (df_std.iloc[0].values)) / abs(df_std.iloc[0])
plot_running_diff(running_diff_mean, running_diff_std, N_FILES_PER_STEP, var_type, cone_type, savepath=output_folder, close_plot=True)
if __name__ == '__main__':
main()