diff --git a/.github/workflows/pip.yaml b/.github/workflows/pip.yaml index 2576246..e4b58a7 100644 --- a/.github/workflows/pip.yaml +++ b/.github/workflows/pip.yaml @@ -14,12 +14,12 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 - name: Set up Python - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: test auto-update-conda: true diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b36dccb..6ccc466 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,12 +13,12 @@ jobs: shell: bash -l {0} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v2 - name: Set up Python - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: activate-environment: test auto-update-conda: true diff --git a/CompStats/.DS_Store b/CompStats/.DS_Store new file mode 100644 index 0000000..1a19dca Binary files /dev/null and b/CompStats/.DS_Store differ diff --git a/CompStats/__init__.py b/CompStats/__init__.py index 4f12734..cf0d0ad 100644 --- a/CompStats/__init__.py +++ b/CompStats/__init__.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.0.6' +__version__ = '0.1.0' from CompStats.bootstrap import StatisticSamples -from CompStats.measurements import CI, difference_p_value +from CompStats.measurements import CI, SE, difference_p_value from CompStats.performance import performance, difference, all_differences, plot_performance, plot_difference +from CompStats.performance import performance_multiple_metrics, difference_multiple, plot_performance_multiple, plot_difference_multiple +from CompStats.performance import all_differences_multiple, plot_performance2, plot_difference2, plot_scatter_matrix diff --git a/CompStats/bootstrap.py b/CompStats/bootstrap.py index 6fcbc57..1648edf 100644 --- a/CompStats/bootstrap.py +++ b/CompStats/bootstrap.py @@ -46,10 +46,12 @@ class StatisticSamples: def __init__(self, statistic: Callable[[np.ndarray], float]=np.mean, num_samples: int=500, - n_jobs: int=1): + n_jobs: int=1, + BiB: bool=True): self.statistic = statistic self.num_samples = num_samples self.n_jobs = n_jobs + self.BiB = BiB # Guardar el parámetro BiB self._samples = None self._calls = {} self._info = {} @@ -67,7 +69,8 @@ def get_params(self): """Parameters""" return dict(statistic=self.statistic, num_samples=self.num_samples, - n_jobs=self.n_jobs) + n_jobs=self.n_jobs, + BiB=self.BiB) # Añadir BiB a los parámetros def __sklearn_clone__(self): klass = self.__class__ diff --git a/CompStats/measurements.py b/CompStats/measurements.py index b3eacc6..7cbf06d 100644 --- a/CompStats/measurements.py +++ b/CompStats/measurements.py @@ -34,13 +34,38 @@ def CI(samples: np.ndarray, alpha=0.05): (0.6, 1.0) """ if isinstance(samples, StatisticSamples): - return {k: CI(v) for k, v in samples.calls.items()} + return {k: CI(v, alpha=alpha) for k, v in samples.calls.items()} alpha = alpha / 2 return (np.percentile(samples, alpha * 100, axis=0), np.percentile(samples, (1 - alpha) * 100, axis=0)) + +def SE(samples: np.ndarray): + """Compute the 
Standard Error of a statistic using bootstrap. + + >>> from CompStats import StatisticSamples, SE + >>> from sklearn.metrics import accuracy_score + >>> import numpy as np + >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] + >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] + >>> bootstrap = StatisticSamples(statistic=accuracy_score) + >>> samples = bootstrap(labels, pred) + >>> SE(samples) + """ + if isinstance(samples, StatisticSamples): + return {k: SE(v) for k, v in samples.calls.items()} + return np.std(samples, axis=0) + -def difference_p_value(statistic_samples: StatisticSamples): +def difference_p_value(samples: np.ndarray, BiB: bool = True): """Compute the difference p-value""" - return {k: (v > 2 * np.mean(v)).mean() - for k, v in statistic_samples.calls.items()} \ No newline at end of file + if isinstance(samples, StatisticSamples): + if samples.BiB: + return {k: (v > 2 * np.mean(v)).mean() for k, v in samples.calls.items()} + else: + return {k: (v < 2 * np.mean(v)).mean() for k, v in samples.calls.items()} + else: + if BiB: + return np.mean(samples > 2 * np.mean(samples, axis=0), axis=0) + else: + return np.mean(samples < 2 * np.mean(samples, axis=0), axis=0) \ No newline at end of file diff --git a/CompStats/performance.py b/CompStats/performance.py index 1a09f6f..5763a0e 100644 --- a/CompStats/performance.py +++ b/CompStats/performance.py @@ -13,13 +13,16 @@ # limitations under the License. from sklearn.metrics import accuracy_score from sklearn.base import clone -from typing import Callable +from typing import List, Callable import pandas as pd import numpy as np import seaborn as sns +import math from CompStats.bootstrap import StatisticSamples from CompStats.utils import progress_bar from CompStats import measurements +import matplotlib.pyplot as plt +from statsmodels.stats.multitest import multipletests def performance(data: pd.DataFrame, @@ -27,23 +30,72 @@ def performance(data: pd.DataFrame, score: Callable[[np.ndarray, np.ndarray], float]=accuracy_score, num_samples: int=500, n_jobs: int=-1, + BiB: bool=True, statistic_samples: StatisticSamples=None) -> StatisticSamples: - """Bootstrap samples of a performance score""" + """ + Calculate bootstrap samples of a performance score for a given dataset. + + Parameters: + data (pd.DataFrame): Input dataset. + gold (str, optional): Column name of the ground truth or target variable. Defaults to 'y'. + score (Callable, optional): Performance score function. Defaults to accuracy_score. + num_samples (int, optional): Number of bootstrap samples. Defaults to 500. + n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1. + BiB (bool, optional): Whether the metric is Bigger is Better. Defaults to True. + statistic_samples (StatisticSamples, optional): Pre-initialized StatisticSamples object. Defaults to None. + + Returns: + StatisticSamples: Object containing the bootstrap samples of the performance score. 
+ + Example usage: + >>> from sklearn.metrics import accuracy_score + >>> import pandas as pd + >>> from CompStats import performance + >>> df = pd.read_csv('path/to/data.csv') + >>> perf = performance(df, gold='y', score=accuracy_score, num_samples=1000) + """ if statistic_samples is None: statistic_samples = StatisticSamples(statistic=score, num_samples=num_samples, - n_jobs=n_jobs) + n_jobs=n_jobs, BiB=BiB) columns = data.columns y = data[gold] for column in progress_bar(columns): if column == gold: continue statistic_samples(y, data[column], name=column) + return statistic_samples -def difference(statistic_samples: StatisticSamples, best_index: int=-1): - """Bootstrap samples of a difference in performnace""" +def difference(statistic_samples: StatisticSamples): #, best_index: int=-1): + """ + Computes the difference in performance between the best performing algorithm and others using bootstrap samples. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + + Returns: + StatisticSamples: A new instance of StatisticSamples with the computed differences and information about the best algorithm. + + The function works as follows: + 1. Determines the index of the best performing algorithm based on the BiB attribute. + 2. Extracts and calculates the mean performance for each algorithm. + 3. Sorts the algorithms by their mean performance. + 4. Identifies the best performing algorithm. + 5. Computes the difference in performance between the best algorithm and each other algorithm. + 6. Returns a new StatisticSamples instance with the computed differences and the name of the best performing algorithm. + Example usage: + >>> from CompStats import performance, difference + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff = difference(perf) + """ + best_index = -1 if statistic_samples.BiB else 0 items = list(statistic_samples.calls.items()) perf = [(k, v, np.mean(v)) for k, v in items] perf.sort(key=lambda x: x[-1]) @@ -59,13 +111,37 @@ def difference(statistic_samples: StatisticSamples, best_index: int=-1): return output -def all_differences(statistic_samples: StatisticSamples, reverse: bool=True): - """Calculates all possible differences in performance among algorithms and sorts by average performance""" - +def all_differences(statistic_samples: StatisticSamples): + """ + Calculates all possible differences in performance among algorithms and sorts them by average performance. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + + Returns: + StatisticSamples: A new instance of StatisticSamples with the computed performance differences among all algorithms. + + The function works as follows: + 1. Extracts the performance data for each algorithm. + 2. Calculates the mean performance for each algorithm and sorts the algorithms based on their mean performance. + 3. Iterates over all possible pairs of algorithms. + 4. Computes the difference in performance for each pair and stores it in a dictionary. + 5. Returns a new StatisticSamples instance with the computed differences. 
+ + Example usage: + >>> from CompStats import performance, all_differences + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> all_diff = all_differences(perf) + """ items = list(statistic_samples.calls.items()) # Calculamos el rendimiento medio y ordenamos los algoritmos basándonos en este perf = [(k, v, np.mean(v)) for k, v in items] - perf.sort(key=lambda x: x[2], reverse=reverse) # Orden descendente por rendimiento medio + perf.sort(key=lambda x: x[2], reverse=statistic_samples.BiB) # Orden por rendimiento medio diffs = {} # Diccionario para guardar las diferencias @@ -87,7 +163,30 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05, var_name='Algorithm', value_name='Score', capsize=0.2, linestyle='none', kind='point', sharex=False, **kwargs): - """Plot the performance with the confidence intervals + """ + Plots the performance of algorithms with confidence intervals. + + Parameters: + statistic_samples (StatisticSamples or pd.DataFrame): An instance of StatisticSamples containing the performance data, + or a DataFrame in long format. + CI (float): Confidence interval level (default is 0.05). + var_name (str): Variable name for algorithms (default is 'Algorithm'). + value_name (str): Variable name for scores (default is 'Score'). + capsize (float): Size of the caps on error bars (default is 0.2). + linestyle (str): Line style for the plot (default is 'none'). + kind (str): Type of plot (default is 'point'). + sharex (bool): Whether to share the x-axis among subplots (default is False). + **kwargs: Additional keyword arguments passed to seaborn's catplot function. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. If statistic_samples is an instance of StatisticSamples, it extracts and sorts the performance data. + 2. Converts the data into a long format DataFrame. + 3. Computes the confidence intervals if CI is provided as a float. + 4. Plots the performance data with confidence intervals using seaborn's catplot. + >>> from CompStats import performance, plot_performance >>> from CompStats.tests.test_performance import DATA @@ -100,7 +199,9 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05, """ if isinstance(statistic_samples, StatisticSamples): - df2 = pd.DataFrame(statistic_samples.calls).melt(var_name=var_name, + lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, value_name=value_name) else: df2 = statistic_samples @@ -117,7 +218,28 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, set_refline=True, set_title=True, hue='Significant', palette=None, **kwargs): - """Plot the difference in performance with its confidence intervals + """ + Plot the difference in performance with its confidence intervals. + + Parameters: + statistic_samples (StatisticSamples): An instance of StatisticSamples containing the performance data. + CI (float, optional): Confidence interval level. Defaults to 0.05. + var_name (str, optional): Variable name for the comparisons. Defaults to 'Comparison'. 
+ value_name (str, optional): Variable name for the differences. Defaults to 'Difference'. + set_refline (bool, optional): Whether to set a reference line at x=0. Defaults to True. + set_title (bool, optional): Whether to set the title of the plot with the best performing algorithm. Defaults to True. + hue (str or None, optional): Column name for hue encoding. Defaults to 'Significant'. + palette (list or None, optional): Colors to use for different hue levels. Defaults to None. + **kwargs: Additional keyword arguments passed to the plot_performance function. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. Converts the differences stored in statistic_samples into a long format DataFrame. + 2. Adds a 'Significant' column to indicate whether the confidence interval includes zero. + 3. Plots the differences with confidence intervals using the plot_performance function. + 4. Optionally sets a reference line at x=0 and a title indicating the best performing algorithm. >>> from CompStats import performance, difference, plot_difference >>> from CompStats.tests.test_performance import DATA @@ -129,20 +251,24 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, >>> diff = difference(perf) >>> ins = plot_difference(diff) """ - - df2 = pd.DataFrame(statistic_samples.calls).melt(var_name=var_name, - value_name=value_name) + if isinstance(statistic_samples, StatisticSamples): + lista_ordenada = sorted(statistic_samples.calls.items(), key=lambda x: np.mean(x[1]), reverse=statistic_samples.BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) if hue is not None: df2[hue] = True at_least_one = False - for key, (left, _) in measurements.CI(statistic_samples, alpha=CI).items(): - if left < 0: + for key, (left, right) in measurements.CI(statistic_samples, alpha=CI).items(): + if left < 0 < right: rows = df2[var_name] == key df2.loc[rows, hue] = False at_least_one = True if at_least_one and palette is None: palette = ['r', 'b'] - f_grid = plot_performance(df2, var_name=var_name, + else: + palette = ['b'] + f_grid = plot_performance(df2, CI=CI, var_name=var_name, value_name=value_name, hue=hue, palette=palette, **kwargs) @@ -152,3 +278,395 @@ def plot_difference(statistic_samples: StatisticSamples, CI: float=0.05, best = statistic_samples.info['best'] f_grid.facet_axis(0, 0).set_title(f'Best: {best}') return f_grid + +def performance_multiple_metrics(data: pd.DataFrame, gold: str, + scores: List[dict], + num_samples: int = 500, n_jobs: int = -1): + """ + Calculate bootstrap samples of multiple performance metrics for a given dataset. + + Parameters: + data (pd.DataFrame): Input dataset. + gold (str): Column name of the ground truth or target variable. + scores (List[dict]): A list of dictionaries, each containing: + - "func": The performance score function. + - "args" (optional): Arguments to pass to the score function. + - "BiB": Whether the metric is Bigger is Better. + num_samples (int, optional): Number of bootstrap samples. Defaults to 500. + n_jobs (int, optional): Number of jobs to run in parallel. Defaults to -1. + + Returns: + dict: A dictionary containing the results for each metric, including: + - 'samples': Bootstrap samples of the performance scores. + - 'performance': Calculated performance scores for each algorithm. 
+ - 'compg': General performance comparison metrics, including: + - 'n': Number of samples. + - 'm': Number of algorithms. + - 'cv': Coefficient of variation for each metric. + - 'dist': Distance metric for each metric. + - 'PPI': Performance potential index for each metric. + - 'BiB': Whether each metric is Bigger is Better. + + The function works as follows: + 1. Defines auxiliary functions for calculating additional performance metrics. + 2. Iterates over the list of score functions and their respective arguments. + 3. Initializes a StatisticSamples object for each score function. + 4. Calculates the performance scores for each column in the dataset (excluding the ground truth column). + 5. Computes additional performance metrics (CV, distance, PPI) for each score function. + 6. Compiles the results into a dictionary and returns it. + + Example usage: + >>> from sklearn.metrics import accuracy_score, f1_score + >>> import pandas as pd + >>> from CompStats import performance_multiple_metrics + >>> df = pd.read_csv('path/to/data.csv') + >>> scores = [ + >>> {"func": accuracy_score, "BiB": True}, + >>> {"func": f1_score, "args": {"average": "weighted"}, "BiB": True} + >>> ] + >>> results = performance_multiple_metrics(df, gold='target', scores=scores, num_samples=1000) + """ + results, performance_dict, perfo, dist, ccv, cppi, compg, cBiB = {}, {}, {}, {}, {}, {}, {}, {} + n,m = data.shape + # definimos las funciones para las metricas + cv = lambda x: np.std(x, ddof=1) / np.mean(x) * 100 + dista = lambda x: np.abs(np.max(x) - np.median(x)) + ppi = lambda x: (1 - np.max(x)) * 100 + for score_info in scores: + score_func = score_info["func"] + score_args = score_info.get("args", {}) + score_BiB = score_info.get("BiB", True) # Default to True if not specified + # Prepara el StatisticSamples con los argumentos específicos para esta métrica + statistic_samples = StatisticSamples(num_samples=num_samples, n_jobs=n_jobs, BiB=score_BiB) + # Calcula la métrica para cada muestra + statistic_samples.statistic = statistic = lambda y_true, y_pred: score_func(y_true, y_pred, **score_args) + # metric_name = score_func.__name__ + "_" + "_".join([f"{key}={value}" for key, value in score_args.items()]) + metric_name = score_func.__name__ + ("" if not score_args else "_" + "_".join([f"{key}={value}" for key, value in score_args.items()])) + results[metric_name] = {} + perfo[metric_name] = {} + for column in data.columns: + if column == gold: + continue + results[metric_name][column] = statistic_samples(data[gold], data[column]) + perfo[metric_name][column] = statistic(data[gold], data[column]) + ccv[metric_name] = cv(np.array(list(perfo[metric_name].values()))) + dist[metric_name] = dista(np.array(list(perfo[metric_name].values()))) + cppi[metric_name] = ppi(np.array(list(perfo[metric_name].values()))) + cBiB[metric_name] = score_BiB + compg = {'n' : n, + 'm' : m-1, + 'cv' : ccv, + 'dist' : dist, + 'PPI' : cppi} + performance_dict = {'samples' : results, + 'performance' : perfo, + 'compg' : compg, + 'BiB': cBiB} + return performance_dict + +def plot_performance2(results: dict, CI: float=0.05, + var_name='Algorithm', value_name='Score', + capsize=0.2, linestyle='none', kind='point', + sharex=False, **kwargs): + """ + Plot the performance with confidence intervals. This function is used by plot_difference_multiple + + Parameters: + results (dict): A dictionary where keys are algorithm names and values are lists of performance scores. + CI (float, optional): Confidence interval level for error bars. 
Defaults to 0.05. + var_name (str, optional): Variable name for the algorithms. Defaults to 'Algorithm'. + value_name (str, optional): Variable name for the scores. Defaults to 'Score'. + capsize (float, optional): Cap size for error bars. Defaults to 0.2. + linestyle (str, optional): Line style for the plot. Defaults to 'none'. + kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'. + sharex (bool, optional): Whether to share the x-axis among subplots. Defaults to False. + **kwargs: Additional keyword arguments for seaborn.catplot. + + Returns: + sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot. + + The function works as follows: + 1. If results is a dictionary, it sorts the algorithms by their mean performance scores. + 2. Converts the sorted data into a long format DataFrame. + 3. Computes the confidence intervals if CI is provided as a float. + 4. Uses seaborn's catplot to create and display the performance plot with confidence intervals. + """ + if isinstance(results, dict): + lista_ordenada = sorted(results.items(), key=lambda x: np.mean(x[1]), reverse=True) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) + + if isinstance(CI, float): + ci = lambda x: measurements.CI(x, alpha=CI) + f_grid = sns.catplot(df2, x=value_name, y=var_name, + capsize=capsize, linestyle=linestyle, + kind=kind, errorbar=ci, sharex=sharex, **kwargs) + return f_grid + + + + +def difference_multiple(results_dict, CI: float=0.05,): + """ + Calculate performance differences for multiple metrics, excluding the comparison of the best + with itself. Additionally, identify the best performing algorithm for each metric. + + Parameters: + results_dict (dict): A dictionary where keys are metric names and values are dictionaries. + Each sub-dictionary has algorithm names as keys and lists of performance scores as values. + CI (float, optional): Confidence interval level. Defaults to 0.05. + + Returns: + dict: A dictionary with the same structure, but where the scores for each algorithm are replaced + by their differences to the scores of the best performing algorithm for that metric, + excluding the best performing algorithm comparing with itself. + Also includes the best algorithm name for each metric. + + The function works as follows: + 1. Iterates over each metric in the results dictionary. + 2. Converts performance scores to numpy arrays for efficient computations. + 3. Identifies the best performing algorithm for each metric based on the mean performance scores. + 4. Calculates the differences in performance scores relative to the best performing algorithm. + 5. Computes confidence intervals and p-values for these differences. + 6. Stores the differences, confidence intervals, p-values, and the best algorithm for each metric. + 7. Returns a dictionary with these calculated differences and additional information. 
+ + Example usage: + >>> from CompStats import performance, difference_multiple + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff_mult = difference_multiple(perf, CI=0.05) + """ + differences_dict = results_dict.copy() + winner = {} + alpha = CI + for metric, results in results_dict['samples'].items(): + # Convert scores to arrays for vectorized operations + scores_arrays = {alg: np.array(scores) for alg, scores in results.items()} + # Identify the best performing algorithm (highest mean score) + if results_dict['BiB'][metric]: + best_alg = max(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg])) + else: + best_alg = min(scores_arrays, key=lambda alg: np.mean(scores_arrays[alg])) + best_scores = scores_arrays[best_alg] + + # Calculate differences to the best performing algorithm, excluding the best from comparing with itself + differences = {alg: best_scores - scores for alg, scores in scores_arrays.items() if alg != best_alg} + + # Calculate Confidence interval for differences to the bet performing algorithm. + CI_differences = {alg: measurements.CI(np.array(scores), alpha=CI) for alg, scores in differences.items()} + p_value_differences = {alg: measurements.difference_p_value(np.array(scores), BiB= results_dict['BiB'][metric]) for alg, scores in differences.items()} + + + # Store the differences and the best algorithm under the current metric + winner[metric] = {'best': best_alg, 'diff': differences,'CI':CI_differences, + 'p_value': p_value_differences, + 'none': sum(valor > alpha for valor in p_value_differences.values()), + 'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha), + 'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha), + 'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha) } + differences_dict['winner'] = winner + return differences_dict + + +def plot_difference2(diff_dictionary: dict, CI: float = 0.05, + var_name='Comparison', value_name='Difference', + set_refline=True, set_title=True, + hue='Significant', palette=None, BiB: bool=True, + **kwargs): + """Plot the difference in performance with its confidence intervals + + >>> from CompStats import performance, difference, plot_difference + >>> from CompStats.tests.test_performance import DATA + >>> from sklearn.metrics import f1_score + >>> import pandas as pd + >>> df = pd.read_csv(DATA) + >>> score = lambda y, hy: f1_score(y, hy, average='weighted') + >>> perf = performance(df, score=score) + >>> diff = difference(perf) + >>> ins = plot_difference(diff) + """ + if isinstance(diff_dictionary, dict): + lista_ordenada = sorted(diff_dictionary['diff'].items(), key=lambda x: np.mean(x[1]), reverse=BiB) + diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada} + df2 = pd.DataFrame(diccionario_ordenado).melt(var_name=var_name, + value_name=value_name) + if hue is not None: + df2[hue] = True + at_least_one = False + for key, (left, right) in diff_dictionary['CI'].items(): + if left < 0 < right: + rows = df2[var_name] == key + df2.loc[rows, hue] = False + at_least_one = True + if at_least_one and palette is None: + palette = ['r', 'b'] + else: + palette = ['b'] + f_grid = plot_performance(df2, CI=CI, var_name=var_name, + value_name=value_name, hue=hue, + 
palette=palette,
+                               **kwargs)
+    if set_refline:
+        f_grid.refline(x=0)
+    if set_title:
+        best = diff_dictionary['best']
+        f_grid.facet_axis(0, 0).set_title(f'Best: {best}')
+    return f_grid
+
+def plot_performance_multiple(results_dict: dict, CI: float = 0.05, capsize: float = 0.2,
+                              linestyle: str = 'none', kind: str = 'point', **kwargs):
+    """
+    Create multiple performance plots, one for each performance metric in the results dictionary.
+
+    Parameters:
+    results_dict (dict): A dictionary where keys are metric names and values are dictionaries
+                         with algorithm names as keys and lists of performance scores as values.
+    CI (float, optional): Confidence interval level for error bars. Defaults to 0.05.
+    capsize (float, optional): Cap size for error bars. Defaults to 0.2.
+    linestyle (str, optional): Line style for the plot. Defaults to 'none'.
+    kind (str, optional): Type of the plot, e.g., 'point', 'bar'. Defaults to 'point'.
+    **kwargs: Additional keyword arguments for seaborn.catplot.
+
+    Returns:
+    None: The function creates and displays plots.
+
+    The function works as follows:
+    1. Iterates over each metric in the results dictionary.
+    2. Converts each metric's samples to long format and plots them with seaborn's catplot.
+    3. Sets the title of each plot to the metric name.
+
+    Example usage:
+    >>> from CompStats import plot_performance_multiple
+    >>> results = {
+    >>>     'samples': {
+    >>>         'accuracy': {'alg1': [0.1, 0.2, 0.15], 'alg2': [0.05, 0.1, 0.07]},
+    >>>         'f1_score': {'alg1': [0.3, 0.25, 0.2], 'alg2': [0.2, 0.15, 0.1]}
+    >>>     },
+    >>>     'BiB': {'accuracy': True, 'f1_score': True}
+    >>> }
+    >>> plot_performance_multiple(results, CI=0.05)
+    """
+
+    for metric_name, metric_results in results_dict['samples'].items():
+        BiB = results_dict['BiB'].get(metric_name, True)
+        # Convert results to long format DataFrame
+        if isinstance(metric_results, dict):
+            lista_ordenada = sorted(metric_results.items(), key=lambda x: np.mean(x[1]), reverse=BiB)
+            diccionario_ordenado = {nombre: muestras for nombre, muestras in lista_ordenada}
+            df2 = pd.DataFrame(diccionario_ordenado).melt(var_name='Algorithm',
+                                                          value_name='Score')
+
+        # Define the confidence interval function
+        if isinstance(CI, float):
+            ci = lambda x: measurements.CI(x, alpha=CI)
+
+        # Create the plot
+        g = sns.catplot(df2, x='Score', y='Algorithm', capsize=capsize, linestyle=linestyle,
+                        kind=kind, errorbar=ci, **kwargs)
+
+        # Set the title of the plot
+        g.figure.suptitle(metric_name)
+
+        # Display the plot
+        plt.show()
+
+
+def plot_difference_multiple(results_dict, CI=0.05, capsize=0.2, linestyle='none', kind='point', **kwargs):
+    """
+    Create multiple difference plots, one for each performance metric in the results dictionary.
+
+    :param results_dict: Output of difference_multiple; a dictionary containing 'winner' and 'BiB' entries.
+    :param CI: Confidence interval level for error bars.
+    :param capsize: Cap size for error bars.
+    :param linestyle: Line style for the plot.
+    :param kind: Type of the plot, e.g., 'point', 'bar'.
+    :param kwargs: Additional keyword arguments for seaborn.catplot.
+    """
+    for metric_name, metric_results in results_dict['winner'].items():
+        BiB = results_dict['BiB'].get(metric_name, True)
+        # Use plot_difference2 to create and display the plot for this metric
+        g = plot_difference2(metric_results, BiB=BiB, CI=CI)
+        g.figure.suptitle(metric_name)
+        # plt.show()
+
+
+
+
+### Not used for now.
+def plot_scatter_matrix(perf): + """ + Generate a scatter plot matrix comparing the performance of the same algorithm + across different metrics contained in the 'perf' dictionary. + + :param perf: A dictionary where keys are metric names and values are dictionaries with algorithm names as keys + and lists of performance scores as values. + """ + # Convertir 'perf' en un DataFrame de pandas para facilitar la manipulación + df_long = pd.DataFrame([ + {"Metric": metric, "Algorithm": alg, "Score": score, "Indice": i} + for metric, alg_scores in perf['samples'].items() + for alg, scores in alg_scores.items() + for i, (score) in enumerate(scores) + ]) + df_wide = df_long.pivot(index=['Algorithm','Indice'],columns='Metric',values='Score') + df_wide = df_wide.reset_index(level=[0]) + sns.pairplot(df_wide, diag_kind='kde',hue="Algorithm", corner=True) + plt.suptitle('Scatter Plot Matrix of Algorithms Performance Across Different Metrics', y=1.02) + plt.show() + + + +def all_differences_multiple(results_dict, alpha: float=0.05): + """ + Calculate performance differences for unique pairs of algorithms for multiple metrics. + Also, calculates the confidence interval for the differences. + + :param results_dict: A dictionary where keys are metric names and values are dictionaries. + Each sub-dictionary has algorithm names as keys and lists of performance scores as values. + :return: A dictionary where each metric name maps to another dictionary. + This dictionary contains keys for unique pairs of algorithms and their performance differences, + including the confidence interval for these differences. + """ + differences_dict = results_dict.copy() + all = {} + for metric, results in results_dict['samples'].items(): + # Convert scores to arrays for vectorized operations + scores_arrays = {alg: np.array(scores) for alg, scores in results.items()} + scores_arrays = dict(sorted(scores_arrays.items(), key=lambda item: np.mean(item[1]), reverse=results_dict['BiB'][metric])) + + + differences = {} + p_value_differences = {} + + algorithms = list(scores_arrays.keys()) + # Calculate differences for unique pairs of algorithms + for i, alg_a in enumerate(algorithms): + for alg_b in algorithms[i+1:]: # Start from the next algorithm to avoid duplicate comparisons + # Calculate the difference between alg_a and alg_b + diff = scores_arrays[alg_a] - scores_arrays[alg_b] + differences[f"{alg_a} vs {alg_b}"] = diff + + # Placeholder for confidence interval calculation + # Replace the string with an actual call to your CI calculation function + p_value_differences[f"{alg_a} vs {alg_b}"] = measurements.difference_p_value(diff, BiB=results_dict['BiB'][metric]) + # For example: + # CI_differences[f"{alg_a} vs {alg_b}"] = measurements.CI(diff, alpha=CI) + + # Store the differences under the current metric + all[metric] = {'diff': differences, 'p_value': p_value_differences, + 'none': sum(valor > alpha for valor in p_value_differences.values()), + 'bonferroni': sum(multipletests(list(p_value_differences.values()), method='bonferroni')[1] > alpha), + 'holm': sum(multipletests(list(p_value_differences.values()), method='holm')[1] > alpha), + 'HB': sum(multipletests(list(p_value_differences.values()), method='fdr_bh')[1] > alpha) } + differences_dict['all'] = all + return differences_dict + diff --git a/CompStats/tests/test_measurements.py b/CompStats/tests/test_measurements.py index f40d247..bc8c9b7 100644 --- a/CompStats/tests/test_measurements.py +++ b/CompStats/tests/test_measurements.py @@ -15,7 +15,7 @@ import pandas as pd 
import os from sklearn.metrics import f1_score -from CompStats.measurements import CI, difference_p_value +from CompStats.measurements import CI, SE, difference_p_value from CompStats.bootstrap import StatisticSamples from CompStats.performance import performance, difference @@ -38,4 +38,15 @@ def test_difference_p_value(): perf = performance(df, score=lambda y, hy: f1_score(y, hy, average='weighted')) res = difference(perf) p_value = difference_p_value(res) - assert p_value['BoW'] > 0.2 \ No newline at end of file + assert p_value['BoW'] > 0.2 + + +def test_SE(): + """Test confidence interval""" + + statistic = StatisticSamples(num_samples=26, n_jobs=-1) + pop = np.r_[3, 4, 5, 2, 4] + samples = statistic(pop, name='test') + se = SE(samples) + se2 = SE(statistic) + assert se2['test'] == se diff --git a/CompStats/tests/test_performance.py b/CompStats/tests/test_performance.py index 1a2a177..3941011 100644 --- a/CompStats/tests/test_performance.py +++ b/CompStats/tests/test_performance.py @@ -14,9 +14,13 @@ import numpy as np import pandas as pd import os -from sklearn.metrics import f1_score +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error import seaborn as sns -from CompStats.performance import performance, plot_performance, difference, plot_difference, all_differences +from CompStats import performance, plot_performance, difference, plot_difference, all_differences +from CompStats import performance_multiple_metrics, plot_performance2 +from CompStats import difference_multiple, plot_scatter_matrix, all_differences_multiple, plot_performance_multiple +from CompStats import plot_difference2, plot_difference_multiple + DATA = os.path.join(os.path.dirname(__file__), 'data.csv') @@ -55,5 +59,55 @@ def test_all_differences(): """Test all_differences""" df = pd.read_csv(DATA) perf = performance(df, score=lambda y, hy: f1_score(y, hy, average='weighted')) - res = all_differences(perf) - assert 'INGEOTEC - BoW' in res.calls \ No newline at end of file + resa = all_differences(perf) + assert 'INGEOTEC - BoW' in resa.calls + + +def test_performance_multiple_metrics(): + """Test performance_multiple_metrics""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": precision_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": mean_absolute_error, 'BiB': False} + ] + perf = performance_multiple_metrics(df, "y", metrics) + ins = plot_performance_multiple(perf) + assert 'accuracy_score' in perf['samples'] + assert 'y' not in perf['samples']['accuracy_score'] + assert 'INGEOTEC' in perf['samples']['accuracy_score'] + + +def test_difference_multiple(): + """Test difference_multiple""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": precision_score, "args": {"average": "macro"}, 'BiB': True}, + {"func": mean_absolute_error, 'BiB': False} + ] + perf = performance_multiple_metrics(df, "y", metrics) + diff = difference_multiple(perf) + ins = plot_difference_multiple(diff) + assert diff['winner']['accuracy_score']['best'] == 'BoW' + assert 'BoW' not in diff['winner']['accuracy_score']['diff'].keys() + # assert isinstance(ins, sns.FacetGrid) + + +def test_difference_summary(): + """Test difference_summary""" + df = pd.read_csv(DATA) + metrics = [ + {"func": accuracy_score, 'BiB': True}, + {"func": f1_score, "args": {"average": "macro"}, 
'BiB': True},
+        {"func": precision_score, "args": {"average": "macro"}, 'BiB': True},
+        {"func": mean_absolute_error, 'BiB': False}
+    ]
+    perf = performance_multiple_metrics(df, "y", metrics)
+    diff = difference_multiple(perf)
+    all_dif = all_differences_multiple(diff)
+    assert diff['winner']['accuracy_score']['best'] == 'BoW'
+    assert 'BoW' not in diff['winner']['accuracy_score']['diff'].keys()
+    assert all_dif['all']['accuracy_score']['none'] == 6
diff --git a/pyproject.toml b/pyproject.toml
index ce9be89..8108be5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,7 @@
 [project]
 name = 'CompStats'
+description = 'CompStats implements an evaluation methodology for statistically analyzing competition results and competitions'
+readme = "README.rst"
 dependencies = [
     'numpy',
     'scikit-learn>=1.3.0',
@@ -8,5 +10,23 @@ dependencies = [
 ]
 dynamic = ['version']
 
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Information Technology",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Information Analysis"
+]
+
 [tool.setuptools.dynamic]
-version = {attr = 'CompStats.__version__'}
\ No newline at end of file
+version = {attr = 'CompStats.__version__'}
+
+[project.urls]
+Homepage = "https://compstats.readthedocs.io"
+Repository = "https://github.com/INGEOTEC/CompStats"
+Issues = "https://github.com/INGEOTEC/CompStats/issues"
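
Note (not part of the patch): the diff above introduces a multi-metric workflow built around performance_multiple_metrics, difference_multiple, all_differences_multiple, and the new plotting helpers. The sketch below strings these together the way the new tests in CompStats/tests/test_performance.py do; it is a minimal illustration only, and the CSV path plus the assumed table layout (a gold column 'y' and one prediction column per system) are hypothetical.

# Sketch of the multi-metric workflow added in this changeset (illustrative only).
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
from CompStats import (performance_multiple_metrics, difference_multiple,
                       all_differences_multiple, plot_difference_multiple)

df = pd.read_csv('path/to/predictions.csv')  # hypothetical path; 'y' plus one column per system
metrics = [
    {"func": accuracy_score, "BiB": True},                           # bigger is better
    {"func": f1_score, "args": {"average": "macro"}, "BiB": True},
    {"func": mean_absolute_error, "BiB": False},                     # smaller is better
]
# Bootstrap samples of every metric for every system column (all columns except 'y').
perf = performance_multiple_metrics(df, "y", metrics, num_samples=500)
# Per metric: differences of each system against the best one, with CIs and p-values.
diff = difference_multiple(perf)
# All pairwise differences plus multiple-testing corrections (Bonferroni, Holm, FDR-BH).
all_diff = all_differences_multiple(diff)
# One difference plot per metric, ordered according to each metric's BiB flag.
plot_difference_multiple(diff)

The BiB flag is what lets metrics where smaller is better, such as mean_absolute_error, be ranked and compared in the same run as the bigger-is-better scores.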