diff --git a/.github/workflows/pip.yaml b/.github/workflows/pip.yaml index 2576246..e4b58a7 100644 --- a/.github/workflows/pip.yaml +++ b/.github/workflows/pip.yaml @@ -14,12 +14,12 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 - name: Set up Python - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: test auto-update-conda: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b36dccb..6ccc466 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,12 +13,12 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 - name: Set up Python - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: test auto-update-conda: true diff --git a/CompStats/.DS_Store b/CompStats/.DS_Store new file mode 100644 index 0000000..1a19dca Binary files /dev/null and b/CompStats/.DS_Store differ diff --git a/CompStats/__init__.py b/CompStats/__init__.py index 4f12734..cf0d0ad 100644 --- a/CompStats/__init__.py +++ b/CompStats/__init__.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.0.6' +__version__ = '0.1.0' from CompStats.bootstrap import StatisticSamples -from CompStats.measurements import CI, difference_p_value +from CompStats.measurements import CI, SE, difference_p_value from CompStats.performance import performance, difference, all_differences, plot_performance, plot_difference +from CompStats.performance import performance_multiple_metrics, difference_multiple, plot_performance_multiple, plot_difference_multiple +from CompStats.performance import all_differences_multiple, plot_performance2, plot_difference2, plot_scatter_matrix diff --git a/CompStats/bootstrap.py b/CompStats/bootstrap.py index 6fcbc57..1648edf 100644 --- a/CompStats/bootstrap.py +++ b/CompStats/bootstrap.py @@ -46,10 +46,12 @@ class StatisticSamples: def __init__(self, statistic: Callable[[np.ndarray], float]=np.mean, num_samples: int=500, - n_jobs: int=1): + n_jobs: int=1, + BiB: bool=True): self.statistic = statistic self.num_samples = num_samples self.n_jobs = n_jobs + self.BiB = BiB # Guardar el parámetro BiB self._samples = None self._calls = {} self._info = {} @@ -67,7 +69,8 @@ def get_params(self): """Parameters""" return dict(statistic=self.statistic, num_samples=self.num_samples, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + BiB=self.BiB) # Añadir BiB a los parámetros def __sklearn_clone__(self): klass = self.__class__ diff --git a/CompStats/measurements.py b/CompStats/measurements.py index b3eacc6..7cbf06d 100644 --- a/CompStats/measurements.py +++ b/CompStats/measurements.py @@ -34,13 +34,38 @@ def CI(samples: np.ndarray, alpha=0.05): (0.6, 1.0) """ if isinstance(samples, StatisticSamples): - return {k: CI(v) for k, v in samples.calls.items()} + return {k: CI(v, alpha=alpha) for k, v in samples.calls.items()} alpha = alpha / 2 return (np.percentile(samples, alpha * 100, axis=0), np.percentile(samples, (1 - alpha) * 100, axis=0)) + +def SE(samples: np.ndarray): + """Compute the 
Standard Error of a statistic using bootstrap. + + >>> from CompStats import StatisticSamples, SE + >>> from sklearn.metrics import accuracy_score + >>> import numpy as np + >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] + >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] + >>> bootstrap = StatisticSamples(statistic=accuracy_score) + >>> samples = bootstrap(labels, pred) + >>> SE(samples) + """ + if isinstance(samples, StatisticSamples): + return {k: SE(v) for k, v in samples.calls.items()} + return np.std(samples, axis=0) + -def difference_p_value(statistic_samples: StatisticSamples): +def difference_p_value(samples: np.ndarray, BiB: bool = True): """Compute the difference p-value""" - return {k: (v > 2 * np.mean(v)).mean() - for k, v in statistic_samples.calls.items()} \ No newline at end of file + if isinstance(samples, StatisticSamples): + if samples.BiB: + return {k: (v > 2 * np.mean(v)).mean() for k, v in samples.calls.items()} + else: + return {k: (v < 2 * np.mean(v)).mean() for k, v in samples.calls.items()} + else: + if BiB: + return np.mean(samples > 2 * np.mean(samples, axis=0), axis=0) + else: + return np.mean(samples < 2 * np.mean(samples, axis=0), axis=0) \ No newline at end of file diff --git a/CompStats/performance.py b/CompStats/performance.py index 1a09f6f..5763a0e 100644 --- a/CompStats/performance.py +++ b/CompStats/performance.py @@ -13,13 +13,16 @@ # limitations under the License. from sklearn.metrics import accuracy_score from sklearn.base import clone -from typing import Callable +from typing import List, Callable import pandas as pd import numpy as np import seaborn as sns +import math from CompStats.bootstrap import StatisticSamples from CompStats.utils import progress_bar from CompStats import measurements +import matplotlib.pyplot as plt +from statsmodels.stats.multitest import multipletests def performance(data: pd.DataFrame, @@ -27,23 +30,72 @@ def performance(data: pd.DataFrame, score: Callable[[np.ndarray, np.ndarray], float]=accuracy_score, num_samples: int=500, n_jobs: int=-1, + BiB: bool=True, statistic_samples: StatisticSamples=None) -> StatisticSamples: - """Bootstrap samples of a performance score""" + """ + Calculate bootstrap samples of a performance score for a given dataset. + + Parameters: + data (pd.DataFrame): Input dataset. + gold (str, optional): Column name of the ground truth or target variable. Defaults to 'y'. + score (Callable, optional): Performance score function. Defaults to accuracy_score. + num_samples (int, optional): Number of bootstrap samples. Defaults to 500. + n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1. + BiB (bool, optional): Whether the metric is Bigger is Better. Defaults to True. + statistic_samples (StatisticSamples, optional): Pre-initialized StatisticSamples object. Defaults to None. + + Returns: + StatisticSamples: Object containing the bootstrap samples of the performance score. 
+ + Example usage: + >>> from sklearn.metrics import accuracy_score + >>> import pandas as pd + >>> from CompStats import performance + >>> df = pd.read_csv('path/to/data.csv') + >>> perf = performance(df, gold='y', score=accuracy_score, num_samples=1000) + """ if statistic_samples is None: statistic_samples = StatisticSamples(statistic=score, num_samples=num_samples, - n_jobs=n_jobs) + n_jobs=n_jobs, BiB=BiB) columns = data.columns y = data[gold] for column in progress_bar(columns): if column == gold: continue statistic_samples(y, data[column], name=column) + return statistic_samples -def difference(statistic_samples: StatisticSamples, best_index: int=-1): - """Bootstrap samples of a difference in performnace""" +def difference(statistic_samples: StatisticSamples): #, best_index: int=-1): + """ + Computes the difference in performance between the best performing algorithm and others using bootstrap samples. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + + Returns: + StatisticSamples: A new instance of StatisticSamples with the computed differences and information about the best algorithm. + + The function works as follows: + 1. Determines the index of the best performing algorithm based on the BiB attribute. + 2. Extracts and calculates the mean performance for each algorithm. + 3. Sorts the algorithms by their mean performance. + 4. Identifies the best performing algorithm. + 5. Computes the difference in performance between the best algorithm and each other algorithm. + 6. Returns a new StatisticSamples instance with the computed differences and the name of the best performing algorithm. + Example usage: + >>> from CompStats import performance, difference + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff = difference(perf) + """ + best_index = -1 if statistic_samples.BiB else 0 items = list(statistic_samples.calls.items()) perf = [(k, v, np.mean(v)) for k, v in items] perf.sort(key=lambda x: x[-1]) @@ -59,13 +111,37 @@ def difference(statistic_samples: StatisticSamples, best_index: int=-1): return output -def all_differences(statistic_samples: StatisticSamples, reverse: bool=True): - """Calculates all possible differences in performance among algorithms and sorts by average performance""" - +def all_differences(statistic_samples: StatisticSamples): + """ + Calculates all possible differences in performance among algorithms and sorts them by average performance. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + + Returns: + StatisticSamples: A new instance of StatisticSamples with the computed performance differences among all algorithms. + + The function works as follows: + 1. Extracts the performance data for each algorithm. + 2. Calculates the mean performance for each algorithm and sorts the algorithms based on their mean performance. + 3. Iterates over all possible pairs of algorithms. + 4. Computes the difference in performance for each pair and stores it in a dictionary. + 5. Returns a new StatisticSamples instance with the computed differences. 
+ + Example usage: + >>> from CompStats import performance, all_differences + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> all_diff = all_differences(perf) + """ items = list(statistic_samples.calls.items()) # Calculamos el rendimiento medio y ordenamos los algoritmos basándonos en este perf = [(k, v, np.mean(v)) for k, v in items] - perf.sort(key=lambda x: x[2], reverse=reverse) # Orden descendente por rendimiento medio + perf.sort(key=lambda x: x[2], reverse=statistic_samples.BiB) # Orden por rendimiento medio diffs = {} # Diccionario para guardar las diferencias @@ -87,7 +163,30 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05, var_name='Algorithm', value_name='Score', capsize=0.2, linestyle='none', kind='point', sharex=False, **kwargs): - """Plot the performance with the confidence intervals + """ + Plots the performance of algorithms with confidence intervals. + + Parameters: + statistic_samples (StatisticSamples or pd.DataFrame): An instance of StatisticSamples containing the performance data, + or a DataFrame in long format. + CI (float): Confidence interval level (default is 0.05). + var_name (str): Variable name for algorithms (default is 'Algorithm'). + value_name (str): Variable name for scores (default is 'Score'). + capsize (float): Size of the caps on error bars (default is 0.2). + linestyle (str): Line style for the plot (default is 'none'). + kind (str): Type of plot (default is 'point'). + sharex (bool): Whether to share the x-axis among subplots (default is False). + **kwargs: Additional keyword arguments passed to seaborn's catplot function. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. If statistic_samples is an instance of StatisticSamples, it extracts and sorts the performance data. + 2. Converts the data into a long format DataFrame. + 3. Computes the confidence intervals if CI is provided as a float. + 4. Plots the performance data with confidence intervals using seaborn's catplot. + >>> from CompStats import performance, plot_performance >>> from CompStats.tests.test_performance import DATA @@ -100,7 +199,9 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05, """ if isinstance(statistic_samples, StatisticSamples): - df2 = pd.DataFrame(statistic_samples.calls).melt(var_name=var_name, + lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, value_name=value_name) else: df2 = statistic_samples @@ -117,7 +218,28 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, set_refline=True, set_title=True, hue='Significant', palette=None, **kwargs): - """Plot the difference in performance with its confidence intervals + """ + Plot the difference in performance with its confidence intervals. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + CI (float, optional): Confidence interval level. Defaults to 0.05. + var_name (str, optional): Variable name for the comparisons. Defaults to 'Comparison'. 
+ value_name (str, optional): Variable name for the differences. Defaults to 'Difference'. + set_refline (bool, optional): Whether to set a reference line at x=0. Defaults to True. + set_title (bool, optional): Whether to set the title of the plot with the best performing algorithm. Defaults to True. + hue (str or None, optional): Column name for hue encoding. Defaults to 'Significant'. + palette (list or None, optional): Colors to use for different hue levels. Defaults to None. + **kwargs: Additional keyword arguments passed to the plot_performance function. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. Converts the differences stored in statistic_samples into a long format DataFrame. + 2. Adds a 'Significant' column to indicate whether the confidence interval includes zero. + 3. Plots the differences with confidence intervals using the plot_performance function. + 4. Optionally sets a reference line at x=0 and a title indicating the best performing algorithm. >>> from CompStats import performance, difference, plot_difference >>> from CompStats.tests.test_performance import DATA @@ -129,20 +251,24 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, >>> diff = difference(perf) >>> ins = plot_difference(diff) """ - - df2 = pd.DataFrame(statistic_samples.calls).melt(var_name=var_name, - value_name=value_name) + if isinstance(statistic_samples, StatisticSamples): + lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) if hue is not None: df2[hue] = True at_least_one = False - for key, (left, _) in measurements.CI(statistic_samples, alpha=CI).items(): - if left < 0: + for key, (left, right) in measurements.CI(statistic_samples, alpha=CI).items(): + if left < 0 < right: rows = df2[var_name] == key df2.loc[rows, hue] = False at_least_one = True if at_least_one and palette is None: palette = ['r', 'b'] - f_grid = plot_performance(df2, var_name=var_name, + else: + palette = ['b'] + f_grid = plot_performance(df2, CI=CI, var_name=var_name, value_name=value_name, hue=hue, palette=palette, **kwargs) @@ -152,3 +278,395 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, best = statistic_samples.info['best'] f_grid.facet_axis(0, 0).set_title(f'Best: {best}') return f_grid + +def performance_multiple_metrics(data: pd.DataFrame, gold: str, + scores: List[dict], + num_samples: int = 500, n_jobs: int = -1): + """ + Calculate bootstrap samples of multiple performance metrics for a given dataset. + + Parameters: + data (pd.DataFrame): Input dataset. + gold (str): Column name of the ground truth or target variable. + scores (List[dict]): A list of dictionaries, each containing: + - "func": The performance score function. + - "args" (optional): Arguments to pass to the score function. + - "BiB": Whether the metric is Bigger is Better. + num_samples (int, optional): Number of bootstrap samples. Defaults to 500. + n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1. + + Returns: + dict: A dictionary containing the results for each metric, including: + - 'samples': Bootstrap samples of the performance scores. + - 'performance': Calculated performance scores for each algorithm. 
+ - 'compg': General performance comparison metrics, including: + - 'n': Number of samples. + - 'm': Number of algorithms. + - 'cv': Coefficient of variation for each metric. + - 'dist': Distance metric for each metric. + - 'PPI': Performance potential index for each metric. + - 'BiB': Whether each metric is Bigger is Better. + + The function works as follows: + 1. Defines auxiliary functions for calculating additional performance metrics. + 2. Iterates over the list of score functions and their respective arguments. + 3. Initializes a StatisticSamples object for each score function. + 4. Calculates the performance scores for each column in the dataset (excluding the ground truth column). + 5. Computes additional performance metrics (CV, distance, PPI) for each score function. + 6. Compiles the results into a dictionary and returns it. + + Example usage: + >>> from sklearn.metrics import accuracy_score, f1_score + >>> import pandas as pd + >>> from CompStats import performance_multiple_metrics + >>> df = pd.read_csv('path/to/data.csv') + >>> scores = [ + >>> {"func": accuracy_score, "BiB": True}, + >>> {"func": f1_score, "args": {"average": "weighted"}, "BiB": True} + >>> ] + >>> results = performance_multiple_metrics(df, gold='target', scores=scores, num_samples=1000) + """ + results, performance_dict, perfo, dist, ccv, cppi, compg, cBiB = {}, {}, {}, {}, {}, {}, {}, {} + n,m = data.shape + # definimos las funciones para las metricas + cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 + dista = lambda x: np.abs(np.max(x) - np.median(x)) + ppi = lambda x: (1 - np.max(x)) * 100 + for score_info in scores: + score_func = score_info["func"] + score_args = score_info.get("args", {}) + score_BiB = score_info.get("BiB", True) # Default to True if not specified + # Prepara el StatisticSamples con los argumentos específicos para esta métrica + statistic_samples = StatisticSamples(num_samples=num_samples, n_jobs=n_jobs, BiB=score_BiB) + # Calcula la métrica para cada muestra + statistic_samples.statistic = statistic = lambda y_true, y_pred: score_func(y_true, y_pred, **score_args) + # metric_name = score_func.__name__ + "_" + "_".join([f"{key}={value}" for key, value in score_args.items()]) + metric_name = score_func.__name__ + ("" if not score_args else "_" + "_".join([f"{key}={value}" for key, value in score_args.items()])) + results[metric_name] = {} + perfo[metric_name] = {} + for column in data.columns: + if column == gold: + continue + results[metric_name][column] = statistic_samples(data[gold], data[column]) + perfo[metric_name][column] = statistic(data[gold], data[column]) + ccv[metric_name] = cv(np.array(list(perfo[metric_name].values()))) + dist[metric_name] = dista(np.array(list(perfo[metric_name].values()))) + cppi[metric_name] = ppi(np.array(list(perfo[metric_name].values()))) + cBiB[metric_name] = score_BiB + compg = {'n' : n, + 'm' : m-1, + 'cv' : ccv, + 'dist' : dist, + 'PPI' : cppi} + performance_dict = {'samples' : results, + 'performance' : perfo, + 'compg' : compg, + 'BiB': cBiB} + return performance_dict + +def plot_performance2(results: dict, CI: float=0.05, + var_name='Algorithm', value_name='Score', + capsize=0.2, linestyle='none', kind='point', + sharex=False, **kwargs): + """ + Plot the performance with confidence intervals. This function is used by plot_difference_multiple + + Parameters: + results (dict): A dictionary where keys are algorithm names and values are lists of performance scores. + CI (float, optional): Confidence interval level for error bars. 
Defaults to 0.05. + var_name (str, optional): Variable name for the algorithms. Defaults to 'Algorithm'. + value_name (str, optional): Variable name for the scores. Defaults to 'Score'. + capsize (float, optional): Cap size for error bars. Defaults to 0.2. + linestyle (str, optional): Line style for the plot. Defaults to 'none'. + kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'. + sharex (bool, optional): Whether to share the x-axis among subplots. Defaults to False. + **kwargs: Additional keyword arguments for seaborn.catplot. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. If results is a dictionary, it sorts the algorithms by their mean performance scores. + 2. Converts the sorted data into a long format DataFrame. + 3. Computes the confidence intervals if CI is provided as a float. + 4. Uses seaborn's catplot to create and display the performance plot with confidence intervals. + """ + if isinstance(results, dict): + lista_ordenada = sorted(results.items(), key=lambda x: np.mean(x[1]), reverse=True) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) + + if isinstance(CI, float): + ci = lambda x: measurements.CI(x, alpha=CI) + f_grid = sns.catplot(df2, x=value_name, y=var_name, + capsize=capsize, linestyle=linestyle, + kind=kind, errorbar=ci, sharex=sharex, **kwargs) + return f_grid + + + + +def difference_multiple(results_dict, CI: float=0.05,): + """ + Calculate performance differences for multiple metrics, excluding the comparison of the best + with itself. Additionally, identify the best performing algorithm for each metric. + + Parameters: + results_dict (dict): A dictionary where keys are metric names and values are dictionaries. + Each sub-dictionary has algorithm names as keys and lists of performance scores as values. + CI (float, optional): Confidence interval level. Defaults to 0.05. + + Returns: + dict: A dictionary with the same structure, but where the scores for each algorithm are replaced + by their differences to the scores of the best performing algorithm for that metric, + excluding the best performing algorithm comparing with itself. + Also includes the best algorithm name for each metric. + + The function works as follows: + 1. Iterates over each metric in the results dictionary. + 2. Converts performance scores to numpy arrays for efficient computations. + 3. Identifies the best performing algorithm for each metric based on the mean performance scores. + 4. Calculates the differences in performance scores relative to the best performing algorithm. + 5. Computes confidence intervals and p-values for these differences. + 6. Stores the differences, confidence intervals, p-values, and the best algorithm for each metric. + 7. Returns a dictionary with these calculated differences and additional information. 
+ + Example usage: + >>> from CompStats import performance, difference_multiple + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff_mult = difference_multiple(perf, CI=0.05) + """ + differences_dict = results_dict.copy() + winner = {} + alpha = CI + for metric, results in results_dict['samples'].items(): + # Convert scores to arrays for vectorized operations + scores_arrays = {alg: np.array(scores) for alg, scores in results.items()} + # Identify the best performing algorithm (highest mean score) + if results_dict['BiB'][metric]: + best_alg = max(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg])) + else: + best_alg = min(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg])) + best_scores = scores_arrays[best_alg] + + # Calculate differences to the best performing algorithm, excluding the best from comparing with itself + differences = {alg: best_scores - scores for alg, scores in scores_arrays.items() if alg != best_alg} + + # Calculate Confidence interval for differences to the bet performing algorithm. + CI_differences = {alg: measurements.CI(np.array(scores), alpha=CI) for alg, scores in differences.items()} + p_value_differences = {alg: measurements.difference_p_value(np.array(scores), BiB= results_dict['BiB'][metric]) for alg, scores in differences.items()} + + + # Store the differences and the best algorithm under the current metric + winner[metric] = {'best': best_alg, 'diff': differences,'CI':CI_differences, + 'p_value': p_value_differences, + 'none': sum(valor > alpha for valor in p_value_differences.values()), + 'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha), + 'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha), + 'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha) } + differences_dict['winner'] = winner + return differences_dict + + +def plot_difference2(diff_dictionary: dict, CI: float = 0.05, + var_name='Comparison', value_name='Difference', + set_refline=True, set_title=True, + hue='Significant', palette=None, BiB: bool=True, + **kwargs): + """Plot the difference in performance with its confidence intervals + + >>> from CompStats import performance, difference, plot_difference + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff = difference(perf) + >>> ins = plot_difference(diff) + """ + if isinstance(diff_dictionary, dict): + lista_ordenada = sorted(diff_dictionary['diff'].items(), key=lambda x: np.mean(x[1]), reverse=BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) + if hue is not None: + df2[hue] = True + at_least_one = False + for key, (left, right) in diff_dictionary['CI'].items(): + if left < 0 < right: + rows = df2[var_name] == key + df2.loc[rows, hue] = False + at_least_one = True + if at_least_one and palette is None: + palette = ['r', 'b'] + else: + palette = ['b'] + f_grid = plot_performance(df2, CI=CI, var_name=var_name, + value_name=value_name, hue=hue, + 
palette=palette,
+                               **kwargs)
+    if set_refline:
+        f_grid.refline(x=0)
+    if set_title:
+        best = diff_dictionary['best']
+        f_grid.facet_axis(0, 0).set_title(f'Best: {best}')
+    return f_grid
+
+def plot_performance_multiple(results_dict: dict, CI: float = 0.05, capsize: float = 0.2,
+                              linestyle: str = 'none', kind: str = 'point', **kwargs):
+    """
+    Create multiple performance plots, one for each performance metric in the results dictionary.
+
+    Parameters:
+    results_dict (dict): A dictionary where keys are metric names and values are dictionaries
+                         with algorithm names as keys and lists of performance scores as values.
+    CI (float, optional): Confidence interval level for error bars. Defaults to 0.05.
+    capsize (float, optional): Cap size for error bars. Defaults to 0.2.
+    linestyle (str, optional): Line style for the plot. Defaults to 'none'.
+    kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'.
+    **kwargs: Additional keyword arguments for seaborn.catplot.
+
+    Returns:
+    None: The function creates and displays plots.
+
+    The function works as follows:
+    1. Iterates over each metric in the results dictionary.
+    2. Converts each metric's samples to long format and plots them with seaborn's catplot.
+    3. Sets the title of each plot to the metric name.
+
+    Example usage:
+    >>> from CompStats import plot_performance_multiple
+    >>> results = {
+    >>>     'samples': {
+    >>>         'accuracy': {'alg1': [0.1, 0.2, 0.15], 'alg2': [0.05, 0.1, 0.07]},
+    >>>         'f1_score': {'alg1': [0.3, 0.25, 0.2], 'alg2': [0.2, 0.15, 0.1]}
+    >>>     },
+    >>>     'BiB': {'accuracy': True, 'f1_score': True}
+    >>> }
+    >>> plot_performance_multiple(results, CI=0.05)
+    """
+
+    for metric_name, metric_results in results_dict['samples'].items():
+        BiB = results_dict['BiB'].get(metric_name, True)
+        # Convert results to long format DataFrame
+        if isinstance(metric_results, dict):
+            lista_ordenada = sorted(metric_results.items(), key=lambda x: np.mean(x[1]), reverse=BiB)
+            diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
+            df2 = pd.DataFrame(diccionario_ordenado).melt(var_name='Algorithm',
+                                                          value_name='Score')
+
+        # Define the confidence interval function
+        if isinstance(CI, float):
+            ci = lambda x: measurements.CI(x, alpha=CI)
+
+        # Create the plot
+        g = sns.catplot(df2, x='Score', y='Algorithm', capsize=capsize, linestyle=linestyle,
+                        kind=kind, errorbar=ci, **kwargs)
+
+        # Set the title of the plot
+        g.figure.suptitle(metric_name)
+
+        # Display the plot
+        plt.show()
+
+
+def plot_difference_multiple(results_dict, CI=0.05, capsize=0.2, linestyle='none', kind='point', **kwargs):
+    """
+    Create multiple difference plots, one for each performance metric in the results dictionary.
+
+    :param results_dict: Output of difference_multiple; a dictionary containing 'winner' and 'BiB' entries.
+    :param CI: Confidence interval level for error bars.
+    :param capsize: Cap size for error bars.
+    :param linestyle: Line style for the plot.
+    :param kind: Type of the plot, e.g., 'point', 'bar'.
+    :param kwargs: Additional keyword arguments for seaborn.catplot.
+    """
+    for metric_name, metric_results in results_dict['winner'].items():
+        BiB = results_dict['BiB'].get(metric_name, True)
+        # Use plot_difference2 to create and display the plot for this metric
+        g = plot_difference2(metric_results, BiB=BiB, CI=CI)
+        g.figure.suptitle(metric_name)
+        # plt.show()
+
+
+
+
+### Not used for now.
+def plot_scatter_matrix(perf): + """ + Generate a scatter plot matrix comparing the performance of the same algorithm + across different metrics contained in the 'perf' dictionary. + + :param perf: A dictionary where keys are metric names and values are dictionaries with algorithm names as keys + and lists of performance scores as values. + """ + # Convertir 'perf' en un DataFrame de pandas para facilitar la manipulación + df_long = pd.DataFrame([ + {"Metric": metric, "Algorithm": alg, "Score": score, "Indice": i} + for metric, alg_scores in perf['samples'].items() + for alg, scores in alg_scores.items() + for i, (score) in enumerate(scores) + ]) + df_wide = df_long.pivot(index=['Algorithm','Indice'],columns='Metric',values='Score') + df_wide = df_wide.reset_index(level=[0]) + sns.pairplot(df_wide, diag_kind='kde',hue="Algorithm", corner=True) + plt.suptitle('Scatter Plot Matrix of Algorithms Performance Across Different Metrics', y=1.02) + plt.show() + + + +def all_differences_multiple(results_dict, alpha: float=0.05): + """ + Calculate performance differences for unique pairs of algorithms for multiple metrics. + Also, calculates the confidence interval for the differences. + + :param results_dict: A dictionary where keys are metric names and values are dictionaries. + Each sub-dictionary has algorithm names as keys and lists of performance scores as values. + :return: A dictionary where each metric name maps to another dictionary. + This dictionary contains keys for unique pairs of algorithms and their performance differences, + including the confidence interval for these differences. + """ + differences_dict = results_dict.copy() + all = {} + for metric, results in results_dict['samples'].items(): + # Convert scores to arrays for vectorized operations + scores_arrays = {alg: np.array(scores) for alg, scores in results.items()} + scores_arrays = dict(sorted(scores_arrays.items(), key=lambda item: np.mean(item[1]), reverse=results_dict['BiB'][metric])) + + + differences = {} + p_value_differences = {} + + algorithms = list(scores_arrays.keys()) + # Calculate differences for unique pairs of algorithms + for i, alg_a in enumerate(algorithms): + for alg_b in algorithms[i+1:]: # Start from the next algorithm to avoid duplicate comparisons + # Calculate the difference between alg_a and alg_b + diff = scores_arrays[alg_a] - scores_arrays[alg_b] + differences[f"{alg_a} vs {alg_b}"] = diff + + # Placeholder for confidence interval calculation + # Replace the string with an actual call to your CI calculation function + p_value_differences[f"{alg_a} vs {alg_b}"] = measurements.difference_p_value(diff, BiB=results_dict['BiB'][metric]) + # For example: + # CI_differences[f"{alg_a} vs {alg_b}"] = measurements.CI(diff, alpha=CI) + + # Store the differences under the current metric + all[metric] = {'diff': differences, 'p_value': p_value_differences, + 'none': sum(valor > alpha for valor in p_value_differences.values()), + 'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha), + 'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha), + 'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha) } + differences_dict['all'] = all + return differences_dict + diff --git a/CompStats/tests/test_measurements.py b/CompStats/tests/test_measurements.py index f40d247..bc8c9b7 100644 --- a/CompStats/tests/test_measurements.py +++ b/CompStats/tests/test_measurements.py @@ -15,7 +15,7 @@ import pandas as pd 
import os from sklearn.metrics import f1_score -from CompStats.measurements import CI, difference_p_value +from CompStats.measurements import CI, SE, difference_p_value from CompStats.bootstrap import StatisticSamples from CompStats.performance import performance, difference @@ -38,4 +38,15 @@ def test_difference_p_value(): perf = performance(df, score=lambda y, hy: f1_score(y, hy, average='weighted')) res = difference(perf) p_value = difference_p_value(res) - assert p_value['BoW'] > 0.2 \ No newline at end of file + assert p_value['BoW'] > 0.2 + + +def test_SE(): + """Test confidence interval""" + + statistic = StatisticSamples(num_samples=26, n_jobs=-1) + pop = np.r_[3, 4, 5, 2, 4] + samples = statistic(pop, name='test') + se = SE(samples) + se2 = SE(statistic) + assert se2['test'] == se diff --git a/CompStats/tests/test_performance.py b/CompStats/tests/test_performance.py index 1a2a177..3941011 100644 --- a/CompStats/tests/test_performance.py +++ b/CompStats/tests/test_performance.py @@ -14,9 +14,13 @@ import numpy as np import pandas as pd import os -from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error import seaborn as sns -from CompStats.performance import performance, plot_performance, difference, plot_difference, all_differences +from CompStats import performance, plot_performance, difference, plot_difference, all_differences +from CompStats import performance_multiple_metrics, plot_performance2 +from CompStats import difference_multiple, plot_scatter_matrix, all_differences_multiple, plot_performance_multiple +from CompStats import plot_difference2, plot_difference_multiple + DATA = os.path.join(os.path.dirname(__file__), 'data.csv') @@ -55,5 +59,55 @@ def test_all_differences(): """Test all_differences""" df = pd.read_csv(DATA) perf = performance(df, score=lambda y, hy: f1_score(y, hy, average='weighted')) - res = all_differences(perf) - assert 'INGEOTEC - BoW' in res.calls \ No newline at end of file + resa = all_differences(perf) + assert 'INGEOTEC - BoW' in resa.calls + + +def test_performance_multiple_metrics(): + """Test performance_multiple_metrics""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": precision_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": mean_absolute_error, 'BiB': False} + ] + perf = performance_multiple_metrics(df, "y", metrics) + ins = plot_performance_multiple(perf) + assert 'accuracy_score' in perf['samples'] + assert 'y' not in perf['samples']['accuracy_score'] + assert 'INGEOTEC' in perf['samples']['accuracy_score'] + + +def test_difference_multiple(): + """Test difference_multiple""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": precision_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": mean_absolute_error, 'BiB': False} + ] + perf = performance_multiple_metrics(df, "y", metrics) + diff = difference_multiple(perf) + ins = plot_difference_multiple(diff) + assert diff['winner']['accuracy_score']['best'] == 'BoW' + assert 'BoW' not in diff['winner']['accuracy_score']['diff'].keys() + # assert isinstance(ins, sns.FacetGrid) + + +def test_difference_summary(): + """Test difference_summary""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 
'BiB': True},
+        {"func": precision_score, "args": {"average": "macro"}, 'BiB': True},
+        {"func": mean_absolute_error, 'BiB': False}
+    ]
+    perf = performance_multiple_metrics(df, "y", metrics)
+    diff = difference_multiple(perf)
+    all_dif = all_differences_multiple(diff)
+    assert diff['winner']['accuracy_score']['best'] == 'BoW'
+    assert 'BoW' not in diff['winner']['accuracy_score']['diff'].keys()
+    assert all_dif['all']['accuracy_score']['none'] == 6
diff --git a/pyproject.toml b/pyproject.toml
index ce9be89..8108be5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,7 @@
 [project]
 name = 'CompStats'
+description = 'CompStats implements an evaluation methodology for statistically analyzing competition results and competitions'
+readme = "README.rst"
 dependencies = [
     'numpy',
     'scikit-learn>=1.3.0',
@@ -8,5 +10,23 @@ dependencies = [
 ]
 dynamic = ['version']
 
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Information Analysis"
+]
+
 [tool.setuptools.dynamic]
-version = {attr = 'CompStats.__version__'}
\ No newline at end of file
+version = {attr = 'CompStats.__version__'}
+
+[project.urls]
+Homepage = "https://compstats.readthedocs.io"
+Repository = "https://github.com/INGEOTEC/CompStats"
+Issues = "https://github.com/INGEOTEC/CompStats/issues"
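
Note (not part of the patch): the diff above introduces a multi-metric workflow built around performance_multiple_metrics, difference_multiple, all_differences_multiple, and the new plotting helpers. The sketch below strings these together the way the new tests in CompStats/tests/test_performance.py do; it is a minimal illustration only, and the CSV path plus the assumed table layout (a gold column 'y' and one prediction column per system) are hypothetical.

# Sketch of the multi-metric workflow added in this changeset (illustrative only).
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
from CompStats import (performance_multiple_metrics, difference_multiple,
                       all_differences_multiple, plot_difference_multiple)

df = pd.read_csv('path/to/predictions.csv')  # hypothetical path; 'y' plus one column per system
metrics = [
    {"func": accuracy_score, "BiB": True},                           # bigger is better
    {"func": f1_score, "args": {"average": "macro"}, "BiB": True},
    {"func": mean_absolute_error, "BiB": False},                     # smaller is better
]
# Bootstrap samples of every metric for every system column (all columns except 'y').
perf = performance_multiple_metrics(df, "y", metrics, num_samples=500)
# Per metric: differences of each system against the best one, with CIs and p-values.
diff = difference_multiple(perf)
# All pairwise differences plus multiple-testing corrections (Bonferroni, Holm, FDR-BH).
all_diff = all_differences_multiple(diff)
# One difference plot per metric, ordered according to each metric's BiB flag.
plot_difference_multiple(diff)

The BiB flag is what lets metrics where smaller is better, such as mean_absolute_error, be ranked and compared in the same run as the bigger-is-better scores.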