From 592f718216c43850ca73d44156c4b1dbd000185e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Czy=C5=BC?=
Date: Wed, 13 Mar 2024 17:24:45 +0100
Subject: [PATCH] Clean up redundant files (#24)

* Remove redundant scripts.

* Update README
---
 .gitignore                              |   2 +
 README.md                               |   2 -
 requirements.txt                        |   3 +-
 scripts/design_categorical.py           | 156 ---------------
 scripts/experiment1/1-1.py              |  32 ---
 scripts/experiment1/1-2.py              |  33 ---
 scripts/experiment1/1-3.py              |  33 ---
 scripts/experiment1/plot_figure.py      |  97 ---------
 scripts/experiment_external_dataset.py  | 134 -------------
 scripts/experiment_external_dataset2.py | 179 -----------------
 scripts/experiment_gaussian.py          | 144 -------------
 scripts/run_categorical.py              | 256 ------------------------
 12 files changed, 4 insertions(+), 1067 deletions(-)
 delete mode 100644 scripts/design_categorical.py
 delete mode 100644 scripts/experiment1/1-1.py
 delete mode 100644 scripts/experiment1/1-2.py
 delete mode 100644 scripts/experiment1/1-3.py
 delete mode 100644 scripts/experiment1/plot_figure.py
 delete mode 100644 scripts/experiment_external_dataset.py
 delete mode 100644 scripts/experiment_external_dataset2.py
 delete mode 100644 scripts/experiment_gaussian.py
 delete mode 100644 scripts/run_categorical.py

diff --git a/.gitignore b/.gitignore
index 5b6e898..260a381 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 *.nb
 *.ipynb
 
+data/Darmanis
+
 # Directories for local files
 local/
 private/
diff --git a/README.md b/README.md
index 975b049..6a3a324 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-![build](https://github.com/labelshift/labelshift/actions/workflows/build.yml/badge.svg)
-
 # Label Shift
 
 Python library for *quantification* (estimating the class prevalence in an unlabeled data set) under the prior probability shift assumption.
diff --git a/requirements.txt b/requirements.txt
index 0aa9708..4446c8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
 arviz
+matplotlib
 numpy
-petname
 pydantic
 scikit-learn
 scipy
+subplots_from_axsize
 # Code quality tools
 black
 flake8
diff --git a/scripts/design_categorical.py b/scripts/design_categorical.py
deleted file mode 100644
index 6e0ce90..0000000
--- a/scripts/design_categorical.py
+++ /dev/null
@@ -1,156 +0,0 @@
-"""Experimental design for the categorical experiment.
-
-Use it to generate a list of commands to be run."""
-from pathlib import Path
-from typing import Optional
-
-DIRECTORY = Path("data/generated/categorical_experiment")
-
-ESTIMATOR_CONFIGURATIONS = {
-    "MAP-1": "--algorithm MAP --bayesian-alpha 1",
-    "MAP-2": "--algorithm MAP --bayesian-alpha 2",
-    "CC": "--algorithm CC",
-    "IR": "--algorithm IR --restricted true",
-    "BBSE": "--algorithm BBSE",
-}
-
-N_SEEDS: int = 2
-
-N_LABELED: int = 1_000
-N_UNLABELED: int = 500
-QUALITY_LABELED: float = 0.85
-PI_UNLABELED: float = 0.7
-L: int = 5
-K: int = 5
-
-
-def command(
-    estimator_key: str,
-    seed: int,
-    output_dir: Path,
-    n_y: int = L,
-    n_c: int = K,
-    n_labeled: int = N_LABELED,
-    n_unlabeled: int = N_UNLABELED,
-    quality_labeled: float = QUALITY_LABELED,
-    quality_unlabeled: Optional[float] = None,
-    pi_unlabeled: float = PI_UNLABELED,
-) -> str:
-    estimator_args = ESTIMATOR_CONFIGURATIONS[estimator_key]
-
-    quality_unlabeled_str = (
-        "" if quality_unlabeled is None else f"--quality-unlabeled {quality_unlabeled}"
-    )
-
-    print(
-        f"python scripts/run_categorical.py "
-        f"--n-labeled {n_labeled} --n-unlabeled {n_unlabeled} "
-        f"--quality {quality_labeled} {quality_unlabeled_str} "
-        f"--prevalence-unlabeled {pi_unlabeled} "
-        f"--seed {seed} "
-        f"--output-dir {output_dir} "
-        f"--K {n_y} --L {n_c} "
-        f"--tag {estimator_key} {estimator_args}"
-    )
-
-
-def experiment_change_prevalence() -> None:
-    """Fix L = K = 5 and change pi'_1."""
-    for seed in range(N_SEEDS):
-        for pi_unlabeled in [0.5, 0.6, 0.7, 0.8, 0.9]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = (
-                    DIRECTORY / "change_prevalence" / f"{algorithm}-{pi_unlabeled}"
-                )
-                command(
-                    output_dir=output_dir,
-                    pi_unlabeled=pi_unlabeled,
-                    seed=seed,
-                    estimator_key=algorithm,
-                )
-
-
-def experiment_change_n_unlabeled() -> None:
-    """Change N'."""
-    for seed in range(N_SEEDS):
-        for n_unlabeled in [10, 50, 100, 500, 1000, 10000]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = (
-                    DIRECTORY / "change_n_unlabeled" / f"{algorithm}-{n_unlabeled}"
-                )
-                command(
-                    n_unlabeled=n_unlabeled,
-                    seed=seed,
-                    estimator_key=algorithm,
-                    output_dir=output_dir,
-                )
-
-
-def experiment_change_k() -> None:
-    """Change K, keeping L fixed."""
-    for seed in range(N_SEEDS):
-        for n_c in [2, 3, 5, 7, 9]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = DIRECTORY / "change_k" / f"{algorithm}-{n_c}"
-                command(
-                    seed=seed,
-                    output_dir=output_dir,
-                    estimator_key=algorithm,
-                    n_c=n_c,
-                )
-
-
-def experiment_change_jointly_l_and_k() -> None:
-    """Jointly change L = K."""
-    for seed in range(N_SEEDS):
-        for lk in [2, 3, 5, 7, 9, 10]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = DIRECTORY / "change_jointly_lk" / f"{algorithm}-{lk}"
-                command(
-                    seed=seed,
-                    estimator_key=algorithm,
-                    output_dir=output_dir,
-                    n_c=lk,
-                    n_y=lk,
-                )
-
-
-def experiment_change_quality() -> None:
-    """Change quality."""
-    for seed in range(N_SEEDS):
-        for quality in [0.55, 0.65, 0.75, 0.85, 0.95]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = DIRECTORY / "change_quality" / f"{algorithm}-{quality}"
-                command(
-                    quality_labeled=quality,
-                    seed=seed,
-                    estimator_key=algorithm,
-                    output_dir=output_dir,
-                )
-
-
-def experiment_misspecified() -> None:
-    """Change quality in the unlabeled population, so that the model is misspecified."""
-    for seed in range(N_SEEDS):
-        for quality_prime in [0.45, 0.55, 0.65, 0.75, 0.80, 0.85, 0.90, 0.95]:
-            for algorithm in ESTIMATOR_CONFIGURATIONS.keys():
-                output_dir = DIRECTORY / "misspecified" / f"{algorithm}-{quality_prime}"
-                command(
-                    quality_unlabeled=quality_prime,
-                    seed=seed,
-                    output_dir=output_dir,
-                    estimator_key=algorithm,
-                )
-
-
-def main() -> None:
-    experiment_change_prevalence()
-    experiment_change_n_unlabeled()
-    experiment_change_quality()
-    experiment_change_jointly_l_and_k()
-    experiment_change_k()
-    experiment_misspecified()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment1/1-1.py b/scripts/experiment1/1-1.py
deleted file mode 100644
index faf0d71..0000000
--- a/scripts/experiment1/1-1.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
-Fixed q = 0.85, N' = 500 and changed the prevalence π'1 in range {0.5, 0.6, 0.7, 0.8, 0.9}.
-"""
-algorithms = [
-    "ClassifyAndCount",
-    "RatioEstimator",
-    "BlackBoxShiftEstimator",
-    "BayesianMAP",
-]
-
-
-def main() -> None:
-    n_labeled = 1000
-    n_unlabeled = 500
-    quality = 0.85
-    n_seeds = 30
-
-    pi_labeled = 0.5
-
-    for pi_unlabeled in [0.5, 0.6, 0.7, 0.8, 0.9]:
-        for seed in range(n_seeds):
-            for algorithm in algorithms:
-                try:
-                    output_dir = f"experiment1-1/{algorithm}"
-                    command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}"
-                    print(command)
-                except Exception as e:
-                    print(e)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment1/1-2.py b/scripts/experiment1/1-2.py
deleted file mode 100644
index 9a924d0..0000000
--- a/scripts/experiment1/1-2.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""
-Fixed q = 0.85, π′1 = 0.7 and changed N' in range
-{10, 50, 100, 500, 1000, 10000}
-"""
-algorithms = [
-    "ClassifyAndCount",
-    "RatioEstimator",
-    "BlackBoxShiftEstimator",
-    "BayesianMAP",
-]
-
-
-def main() -> None:
-    n_labeled = 1000
-    quality = 0.85
-    n_seeds = 30
-
-    pi_unlabeled = 0.7
-    pi_labeled = 0.5
-
-    for n_unlabeled in [10, 50, 100, 500, 1000, 10000]:
-        for seed in range(n_seeds):
-            for algorithm in algorithms:
-                try:
-                    output_dir = f"experiment1-2/{algorithm}"
-                    command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}"
-                    print(command)
-                except Exception as e:
-                    print(e)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment1/1-3.py b/scripts/experiment1/1-3.py
deleted file mode 100644
index 3389f8b..0000000
--- a/scripts/experiment1/1-3.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""
-Fixed π′1 = 0.7, N'= 500 and changed q in range
-{0.55, 0.65, 0.75, 0.85, 0.95}
-"""
-algorithms = [
-    "ClassifyAndCount",
-    "RatioEstimator",
-    "BlackBoxShiftEstimator",
-    "BayesianMAP",
-]
-
-
-def main() -> None:
-    n_labeled = 1000
-    n_unlabeled = 500
-    n_seeds = 30
-
-    pi_unlabeled = 0.7
-    pi_labeled = 0.5
-
-    for quality in [0.55, 0.65, 0.75, 0.85, 0.95]:
-        for seed in range(n_seeds):
-            for algorithm in algorithms:
-                try:
-                    output_dir = f"experiment1-3/{algorithm}"
-                    command = f"python scripts/experiment_categorical.py --n-labeled {n_labeled} --n-unlabeled {n_unlabeled} --quality {quality} --pi-labeled {pi_labeled} --pi-unlabeled {pi_unlabeled} --seed {seed} --algorithm {algorithm} --output-dir {output_dir}"
-                    print(command)
-                except Exception as e:
-                    print(e)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment1/plot_figure.py b/scripts/experiment1/plot_figure.py
deleted file mode 100644
index d0aa63a..0000000
--- a/scripts/experiment1/plot_figure.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import json
-import string
-from pathlib import Path
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-import pandas as pd
-
-
-rename_dict = {
-    "ClassifyAndCount": "CC",
-    "RatioEstimator": "IR",
-    "BlackBoxShiftEstimator": "BBSE",
-    "BayesianMAP": "MAP",
-}
-
-hue_order = [
-    "CC",
-    "IR",
-    "BBSE",
-    "MAP",
-]
-
-
-def file_to_row(file):
-    with open(file) as f:
-        x = json.load(f)
-    return {
-        "Algorithm": rename_dict[x["algorithm"]],
-        "true": x["true"][0],
-        "estimated": x["estimated"][0],
-        "quality": x["sampler"]["p_c_cond_y"][0][0],
-        "n_labeled": x["sampler"]["n_labeled"],
-        "n_unlabeled": x["sampler"]["n_unlabeled"],
-    }
-
-
-def experiment_directory_to_dataframe(experiment_directory) -> pd.DataFrame:
-    files = list(
-        Path(experiment_directory).rglob(
-            "*.json",
-        )
-    )
-    df = pd.DataFrame([file_to_row(f) for f in files])
-    df["error"] = df["estimated"] - df["true"]
-    return df
-
-
-def main() -> None:
-    fig, axs = plt.subplots(3, 1, figsize=(4, 12), sharey=False)
-
-    experiment1 = "experiment1-1"
-    df1 = experiment_directory_to_dataframe(experiment1)
-    sns.boxplot(
-        df1, x="true", y="error", hue="Algorithm", ax=axs[0], hue_order=hue_order
-    )
-    axs[0].set_xlabel(r"Prevalence $\pi'_1$")
-    axs[0].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$")
-
-    experiment2 = "experiment1-2"
-    df2 = experiment_directory_to_dataframe(experiment2)
-    sns.boxplot(
-        df2, x="n_unlabeled", y="error", hue="Algorithm", ax=axs[1], hue_order=hue_order
-    )
-
-    axs[1].set_xlabel(r"Unlabeled data set size $N'$")
-    axs[1].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$")
-    axs[1].legend([], [], frameon=False)
-
-    experiment3 = "experiment1-3"
-    df3 = experiment_directory_to_dataframe(experiment3)
-    sns.boxplot(
-        df3, x="quality", y="error", hue="Algorithm", ax=axs[2], hue_order=hue_order
-    )
-
-    axs[2].set_xlabel(r"Classifier quality $q$")
-    axs[2].set_ylabel(r"Signed difference $\hat \pi'_1 - \pi'_1$")
-    axs[2].legend([], [], frameon=False)
-
-    for n, ax in enumerate(axs):
-        ax.text(
-            -0.1,
-            1.1,
-            string.ascii_uppercase[n],
-            transform=ax.transAxes,
-            size=20,
-            weight="bold",
-        )
-
-    sns.move_legend(axs[0], "lower left")  # , bbox_to_anchor=(1, 1))
-
-    fig.tight_layout()
-    fig.savefig("experiment1.pdf")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment_external_dataset.py b/scripts/experiment_external_dataset.py
deleted file mode 100644
index a5a892c..0000000
--- a/scripts/experiment_external_dataset.py
+++ /dev/null
@@ -1,134 +0,0 @@
-import enum
-
-import numpy as np
-import sklearn.datasets
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-
-import labelshift.datasets.split as split
-import labelshift.summary_statistic as summ
-
-import labelshift.algorithms.api as algos
-import labelshift.algorithms.ratio_estimator as re
-from labelshift.algorithms.expectation_maximization import expectation_maximization
-
-
-class Algorithm(enum.Enum):
-    EM = "ExpectationMaximization"
-    CC = "ClassifyAndCount"
-    BBSE_HARD = "BBSE-Hard"
-    RATIO_HARD = "InvariantRatio-Hard"
-    BAYESIAN = "Bayesian-MAP"
-    RATIO_SOFT = "InvariantRatio-Soft"
-
-
-def get_estimate(
-    algorithm: Algorithm,
-    n_y_c_labeled: np.ndarray,
-    n_c_unlabeled: np.ndarray,
-    y_labeled: np.ndarray,
-    prob_c_labeled: np.ndarray,
-    prob_c_unlabeled: np.ndarray,
-    labeled_prevalence: np.ndarray,
-) -> np.ndarray:
-    """Function running the (point) prevalence estimator.
-
-    Args:
-        algorithm: estimator
-        n_y_c_labeled: matrix with counts of predictions and true values, shape (L, K)
-        n_c_unlabeled: vector with prediction counts on unlabeled data set, shape (K,)
-        y_labeled: true labels in the labeled data set, shape (N,)
-        prob_c_labeled: predictions of the classifier on the labeled data set, shape (N, K)
-        prob_c_unlabeled: predictions of the classifier on the unlabeled data set, shape (N', K)
-        labeled_prevalence: prevalence vector on the labeled distribution, shape (L,)
-    """
-    summary_statistic = algos.SummaryStatistic(
-        n_y_labeled=None, n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled
-    )
-
-    if algorithm == Algorithm.EM:
-        return expectation_maximization(
-            predictions=prob_c_unlabeled, training_prevalences=labeled_prevalence
-        )
-    elif algorithm == Algorithm.CC:
-        return algos.ClassifyAndCount().estimate_from_summary_statistic(
-            summary_statistic
-        )
-    elif algorithm == Algorithm.BBSE_HARD:
-        return algos.BlackBoxShiftEstimator(
-            p_y_labeled=labeled_prevalence
-        ).estimate_from_summary_statistic(summary_statistic)
-    elif algorithm == Algorithm.RATIO_HARD:
-        return algos.InvariantRatioEstimator(
-            restricted=True
-        ).estimate_from_summary_statistic(summary_statistic)
-    elif algorithm == Algorithm.BAYESIAN:
-        return algos.DiscreteCategoricalMAPEstimator().estimate_from_summary_statistic(
-            summary_statistic
-        )
-    elif algorithm == Algorithm.RATIO_SOFT:
-        return re.calculate_vector_and_matrix_from_predictions(
-            unlabeled_predictions=prob_c_unlabeled,
-            labeled_predictions=prob_c_labeled,
-            labeled_ground_truth=y_labeled,
-        )
-    else:
-        raise ValueError(f"Algorithm {algorithm} not recognized.")
-
-
-def main() -> None:
-    L = 2
-    K = L
-    dataset = sklearn.datasets.load_breast_cancer()
-    print(len(dataset.target))
-
-    random_seed: int = 22
-    n_training_examples: int = 200
-    n_labeled_examples: int = 100
-    n_unlabeled_examples: int = 150
-    prevalence_labeled: np.ndarray = np.ones(2) / 2
-    prevalence_unlabeled: np.ndarray = np.asarray([0.3, 0.7])
-
-    specification = split.SplitSpecification(
-        train=np.asarray(prevalence_labeled * n_training_examples, dtype=int).tolist(),
-        valid=np.asarray(prevalence_labeled * n_labeled_examples, dtype=int).tolist(),
-        test=np.asarray(
-            prevalence_unlabeled * n_unlabeled_examples, dtype=int
-        ).tolist(),
-    )
-
-    datasets = split.split_dataset(
-        dataset=dataset, specification=specification, random_seed=random_seed
-    )
-
-    classifier = DecisionTreeClassifier(random_state=random_seed + 1)
-    classifier = RandomForestClassifier(random_state=random_seed + 1)
-    classifier = LogisticRegression(random_state=random_seed + 1)
-    classifier.fit(datasets.train_x, datasets.train_y)
-
-    # The count values
-    n_y_c_labeled = summ.count_values_joint(
-        L, K, datasets.valid_y, classifier.predict(datasets.valid_x)
-    )
-    n_c_unlabeled = summ.count_values(K, classifier.predict(datasets.test_x))
-
-    labeled_probabilities = classifier.predict_proba(datasets.valid_x)
-    unlabeled_probabilities = classifier.predict_proba(datasets.test_x)
-
-    for alg in Algorithm:
-        print(alg)
-        estimate = get_estimate(
-            algorithm=alg,
-            n_y_c_labeled=n_y_c_labeled,
-            n_c_unlabeled=n_c_unlabeled,
-            y_labeled=datasets.valid_y,
-            prob_c_labeled=labeled_probabilities,
-            prob_c_unlabeled=unlabeled_probabilities,
-            labeled_prevalence=prevalence_labeled,
-        )
-        print(estimate)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment_external_dataset2.py b/scripts/experiment_external_dataset2.py
deleted file mode 100644
index e143fef..0000000
--- a/scripts/experiment_external_dataset2.py
+++ /dev/null
@@ -1,179 +0,0 @@
-import enum
-
-import arviz as az
-import matplotlib.pyplot as plt
-import numpy as np
-import pymc as pm
-import sklearn.datasets
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.linear_model import LogisticRegression
-
-import labelshift.datasets.split as split
-import labelshift.summary_statistic as summ
-
-import labelshift.algorithms.api as algos
-import labelshift.algorithms.ratio_estimator as re
-import labelshift.algorithms.bayesian_discrete as bay
-from labelshift.algorithms.expectation_maximization import expectation_maximization
-
-plt.rcParams.update({"font.size": 14})
-
-
-class Algorithm(enum.Enum):
-    EM = "EM"
-    CC = "CC"
-    BBSE_HARD = "BBSE"
-    RATIO_HARD = "IR: hard"
-    RATIO_SOFT = "IR: soft"
-
-
-def get_estimate(
-    algorithm: Algorithm,
-    n_y_c_labeled: np.ndarray,
-    n_c_unlabeled: np.ndarray,
-    y_labeled: np.ndarray,
-    prob_c_labeled: np.ndarray,
-    prob_c_unlabeled: np.ndarray,
-    labeled_prevalence: np.ndarray,
-) -> np.ndarray:
-    """Function running the (point) prevalence estimator.
-
-    Args:
-        algorithm: estimator
-        n_y_c_labeled: matrix with counts of predictions and true values, shape (L, K)
-        n_c_unlabeled: vector with prediction counts on unlabeled data set, shape (K,)
-        y_labeled: true labels in the labeled data set, shape (N,)
-        prob_c_labeled: predictions of the classifier on the labeled data set, shape (N, K)
-        prob_c_unlabeled: predictions of the classifier on the unlabeled data set, shape (N', K)
-        labeled_prevalence: prevalence vector on the labeled distribution, shape (L,)
-    """
-    summary_statistic = algos.SummaryStatistic(
-        n_y_labeled=None, n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled
-    )
-
-    if algorithm == Algorithm.EM:
-        return expectation_maximization(
-            predictions=prob_c_unlabeled, training_prevalences=labeled_prevalence
-        )
-    elif algorithm == Algorithm.CC:
-        return algos.ClassifyAndCount().estimate_from_summary_statistic(
-            summary_statistic
-        )
-    elif algorithm == Algorithm.BBSE_HARD:
-        return algos.BlackBoxShiftEstimator(
-            p_y_labeled=labeled_prevalence
-        ).estimate_from_summary_statistic(summary_statistic)
-    elif algorithm == Algorithm.RATIO_HARD:
-        return algos.InvariantRatioEstimator(
-            restricted=True
-        ).estimate_from_summary_statistic(summary_statistic)
-    elif algorithm == Algorithm.RATIO_SOFT:
-        return re.calculate_vector_and_matrix_from_predictions(
-            unlabeled_predictions=prob_c_unlabeled,
-            labeled_predictions=prob_c_labeled,
-            labeled_ground_truth=y_labeled,
-        )
-    else:
-        raise ValueError(f"Algorithm {algorithm} not recognized.")
-
-
-def main() -> None:
-    L = 2
-    K = L
-    dataset = sklearn.datasets.load_breast_cancer()
-    print(len(dataset.target))
-
-    ymax: float = 7.0
-    random_seed: int = 20
-    n_training_examples: int = 200
-    n_labeled_examples: int = 100
-    n_unlabeled_examples: int = 150
-    prevalence_labeled: np.ndarray = np.ones(2) / 2
-    prevalence_unlabeled: np.ndarray = np.asarray([0.3, 0.7])
-
-    specification = split.SplitSpecification(
-        train=np.asarray(prevalence_labeled * n_training_examples, dtype=int).tolist(),
-        valid=np.asarray(prevalence_labeled * n_labeled_examples, dtype=int).tolist(),
-        test=np.asarray(
-            prevalence_unlabeled * n_unlabeled_examples, dtype=int
-        ).tolist(),
-    )
-
-    datasets = split.split_dataset(
-        dataset=dataset, specification=specification, random_seed=random_seed
-    )
-
-    # classifier = DecisionTreeClassifier(random_state=random_seed + 1)
-    classifier = RandomForestClassifier(random_state=random_seed + 1)
-    # classifier = LogisticRegression(random_state=random_seed + 1)
-    classifier.fit(datasets.train_x, datasets.train_y)
-
-    # The count values
-    n_y_c_labeled = summ.count_values_joint(
-        L, K, datasets.valid_y, classifier.predict(datasets.valid_x)
-    )
-    n_c_unlabeled = summ.count_values(K, classifier.predict(datasets.test_x))
-
-    labeled_probabilities = classifier.predict_proba(datasets.valid_x)
-    unlabeled_probabilities = classifier.predict_proba(datasets.test_x)
-
-    with bay.build_model(
-        n_y_and_c_labeled=n_y_c_labeled,
-        n_c_unlabeled=n_c_unlabeled,
-    ):
-        idata = pm.sample()
-
-    fig, ax = plt.subplots(figsize=(6, 4))
-    _, ax_trash = plt.subplots()
-
-    az.plot_posterior(idata, ax=[ax, ax_trash], var_names=bay.P_TEST_Y)
-    ax.set_title(r"$\pi'_1$ posterior")
-
-    ax.vlines(
-        x=prevalence_unlabeled[0],
-        ymin=0,
-        ymax=ymax,
-        label="Ground truth",
-        colors=["k"],
-        linestyles=["--"],
-    )
-
-    linestyles = [
-        "dashdot",
-        (0, (1, 1)),
-        "solid",
-        "dashed",
-        (0, (3, 10, 1, 10)),
-    ]
-
-    for i, alg in enumerate(Algorithm):
-        print(alg)
-        estimate = get_estimate(
-            algorithm=alg,
-            n_y_c_labeled=n_y_c_labeled,
-            n_c_unlabeled=n_c_unlabeled,
-            y_labeled=datasets.valid_y,
-            prob_c_labeled=labeled_probabilities,
-            prob_c_unlabeled=unlabeled_probabilities,
-            labeled_prevalence=prevalence_labeled,
-        )
-
-        ax.vlines(
-            estimate[0],
-            ymin=0,
-            ymax=ymax,
-            label=alg.value,
-            colors=[f"C{i+2}"],
-            linestyles=[linestyles[i]],
-        )
-
-        print(estimate)
-
-    fig.legend()
-    fig.tight_layout()
-    fig.savefig("plot_cancer.pdf")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/experiment_gaussian.py b/scripts/experiment_gaussian.py
deleted file mode 100644
index 285980d..0000000
--- a/scripts/experiment_gaussian.py
+++ /dev/null
@@ -1,144 +0,0 @@
-"""This experiment plots the posterior in the Gaussian mixture model as well
-as a discretized version of that.
-"""
-import string
-from typing import List
-
-import arviz as az
-import matplotlib.pyplot as plt
-import numpy as np
-import pymc as pm
-import seaborn as sns
-
-import labelshift.partition as part
-import labelshift.summary_statistic as summ
-import labelshift.algorithms.bayesian_discrete as discrete
-
-
-plt.rcParams.update({"font.size": 22})
-
-
-def plot_distributions(
-    ax: plt.Axes,
-    X: np.ndarray,
-    X1: np.ndarray,
-    breakpoints: np.ndarray,
-    height: float = 1.0,
-) -> None:
-    """
-
-    Args:
-        ax: axes where to draw the plot
-        X: points from the labeled distribution, shape (n_labeled,)
-        X1: points from the unlabeled distribution, shape (n_unlabeled,)
-        breakpoints: breakpoints to be plotted, shape (n_breakpoints,)
-    """
-    sns.kdeplot(data=np.hstack(X), ax=ax)
-    sns.kdeplot(data=np.hstack(X1), ax=ax)
-
-    for bp in breakpoints:
-        ax.axvline(bp, ymax=height, linestyle="--", c="k", alpha=0.5)
-
-
-def gaussian_model(
-    labeled_data: List[np.ndarray], unlabeled_data: np.ndarray
-) -> pm.Model:
-    """
-    Args:
-        labeled_data: list of samples attributed to each Y:
-            [
-                [a1, ..., a_n0],
-                [b1, ..., b_n1]
-            ]
-        unlabeled_data: array of shape (n_unlabeled,)
-    """
-    with pm.Model() as model:
-        mu = pm.Normal("mu", mu=0, sigma=1, shape=2)
-        sigma = pm.HalfNormal("sigma", sigma=1, shape=2)
-
-        for i in range(2):
-            pm.Normal(
-                f"X_labeled{i}", mu=mu[i], sigma=sigma[i], observed=labeled_data[i]
-            )
-
-        weights = pm.Dirichlet("P_unlabeled(Y)", np.ones(2))
-
-        pm.NormalMixture(
-            "X_unlabeled", w=weights, mu=mu, sigma=sigma, observed=unlabeled_data
-        )
-
-    return model
-
-
-def main() -> None:
-    """The main method."""
-    mus = [0.0, 1.0]
-    sigmas = [0.3, 0.4]
-    ns = [500, 500]
-    ns_ = [200, 800]
-    K = 7
-    L = 2
-
-    partition = part.RealLinePartition(np.linspace(-0.5, 1.5, K - 1))
-    print(partition.breakpoints)
-
-    assert len(partition) == K
-
-    rng = np.random.default_rng(42)
-
-    X_stratified = [
-        rng.normal(loc=mu, scale=sigma, size=n) for mu, sigma, n in zip(mus, sigmas, ns)
-    ]
-    X = np.hstack(X_stratified)
-    Y = np.hstack([[i] * n for i, n in enumerate(ns)])
-
-    C = partition.predict(X)
-
-    X1_stratified = [
-        rng.normal(loc=mu, scale=sigma, size=n_)
-        for mu, sigma, n_ in zip(mus, sigmas, ns_)
-    ]
-    X1 = np.hstack(X1_stratified)
-    C1 = partition.predict(X1)
-
-    n_c_unlabeled = summ.count_values(K, C1)
-    n_y_c_labeled = summ.count_values_joint(L, K, Y, C)
-
-    print(n_c_unlabeled)
-    print(n_y_c_labeled)
-
-    fig, axs = plt.subplots(3, figsize=(6, 9))
-    plot_distributions(ax=axs[0], X=X, X1=X1, breakpoints=partition.breakpoints)
-
-    with gaussian_model(labeled_data=X_stratified, unlabeled_data=X1):
-        gaussian_data = pm.sample()
-
-    _, ax_trash = plt.subplots()
-
-    az.plot_posterior(gaussian_data, ax=[axs[1], ax_trash], var_names="P_unlabeled(Y)")
-    axs[1].set_title(r"$\pi'_1$ (Gaussian)")
-
-    with discrete.build_model(
-        n_y_and_c_labeled=n_y_c_labeled, n_c_unlabeled=n_c_unlabeled
-    ):
-        discrete_data = pm.sample()
-
-    az.plot_posterior(discrete_data, ax=[axs[2], ax_trash], var_names=discrete.P_TEST_Y)
-    axs[2].set_title(r"$\pi'_1$ (Discrete)")
-
-    for n, ax in enumerate(axs):
-        ax.text(
-            -0.1,
-            1.1,
-            string.ascii_uppercase[n],
-            transform=ax.transAxes,
-            size=20,
-            weight="bold",
-        )
-
-    fig.tight_layout()
-    fig.savefig("plot.pdf")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/run_categorical.py b/scripts/run_categorical.py
deleted file mode 100644
index 496f58c..0000000
--- a/scripts/run_categorical.py
+++ /dev/null
@@ -1,256 +0,0 @@
-"""Sample data directly from P(C|Y)
-distribution and run specified quantification estimator."""
-import argparse
-import enum
-from pathlib import Path
-from typing import List
-
-import pydantic
-
-import labelshift.interfaces.point_estimators as pe
-import labelshift.datasets.discrete_categorical as dc
-import labelshift.algorithms.api as algo
-import labelshift.experiments.api as exp
-
-
-class Algorithm(enum.Enum):
-    CLASSIFY_AND_COUNT = "CC"
-    RATIO_ESTIMATOR = "IR"
-    BBSE = "BBSE"
-    BAYESIAN = "MAP"
-
-
-def create_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--n-labeled", type=int, default=1_000, help="Number of labeled examples."
-    )
-    parser.add_argument(
-        "--n-unlabeled", type=int, default=1_000, help="Number of unlabeled examples."
-    )
-    parser.add_argument(
-        "--quality",
-        type=float,
-        default=0.85,
-        help="Quality of the classifier on the labeled data.",
-    )
-    parser.add_argument(
-        "--quality-unlabeled",
-        type=float,
-        default=None,
-        help="Quality of the classifier on the unlabeled data."
-        "Can be used to assess model misspecification. "
-        "If None, the quality will be the same for both labeled"
-        "and unlabeled data set (no misspecification).",
-    )
-    parser.add_argument("--L", type=int, default=2, help="Number of classes L.")
-    parser.add_argument(
-        "--K",
-        type=int,
-        default=None,
-        help="Number of available predictions. Default: the same as L.",
-    )
-    parser.add_argument(
-        "--prevalence-labeled",
-        type=float,
-        default=None,
-        help="Prevalence of the first class in the labeled data set. Default: 1/L (uniform).",
-    )
-    parser.add_argument(
-        "--prevalence-unlabeled",
-        type=float,
-        default=None,
-        help="Prevalence of the first class in the unlabeled data set. Default: 1/L (uniform).",
-    )
-    parser.add_argument(
-        "--seed", type=int, default=1, help="Random seed to sample the data."
-    )
-    parser.add_argument("--algorithm", type=Algorithm, default=Algorithm.BAYESIAN)
-    parser.add_argument(
-        "--output", type=Path, default=Path(f"{exp.generate_name()}.json")
-    )
-    parser.add_argument("--output-dir", type=Path, default=None)
-
-    parser.add_argument(
-        "--bayesian-alpha",
-        type=float,
-        default=1.0,
-        help="Dirichlet prior specification for the Bayesian quantification.",
-    )
-    parser.add_argument(
-        "--restricted",
-        type=bool,
-        default=True,
-        help="Whether to use restricted invariant ratio estimator.",
-    )
-
-    parser.add_argument(
-        "--tag", type=str, default="", help="Can be used to tag the run."
-    )
-
-    parser.add_argument("--dry-run", action="store_true")
-
-    return parser
-
-
-class EstimatorArguments(pydantic.BaseModel):
-    bayesian_alpha: float
-    restricted: bool
-
-
-class Arguments(pydantic.BaseModel):
-    p_y_labeled: pydantic.confloat(gt=0, lt=1)
-    p_y_unlabeled: pydantic.confloat(gt=0, lt=1)
-
-    quality_labeled: pydantic.confloat(ge=0, le=1)
-    quality_unlabeled: pydantic.confloat(ge=0, le=1)
-
-    n_y: pydantic.PositiveInt = pydantic.Field(description="Number of labels, L.")
-    n_c: pydantic.PositiveInt = pydantic.Field(description="Number of predictions, K.")
-
-    n_labeled: pydantic.PositiveInt
-    n_unlabeled: pydantic.PositiveInt
-
-    seed: int
-
-    algorithm: Algorithm
-    tag: str
-    estimator_arguments: EstimatorArguments
-
-
-def parse_args(args) -> Arguments:
-    n_y = args.L
-    n_c = exp.calculate_value(overwrite=args.K, default=n_y)
-
-    quality_unlabeled = exp.calculate_value(
-        overwrite=args.quality_unlabeled, default=args.quality
-    )
-
-    p_y_labeled = exp.calculate_value(
-        overwrite=args.prevalence_labeled, default=1 / n_y
-    )
-    p_y_unlabeled = exp.calculate_value(
-        overwrite=args.prevalence_unlabeled, default=1 / n_y
-    )
-
-    return Arguments(
-        p_y_labeled=p_y_labeled,
-        p_y_unlabeled=p_y_unlabeled,
-        quality_labeled=args.quality,
-        quality_unlabeled=quality_unlabeled,
-        n_y=n_y,
-        n_c=n_c,
-        seed=args.seed,
-        n_labeled=args.n_labeled,
-        n_unlabeled=args.n_unlabeled,
-        algorithm=args.algorithm,
-        tag=args.tag,
-        estimator_arguments=EstimatorArguments(
-            bayesian_alpha=args.bayesian_alpha,
-            restricted=args.restricted,
-        ),
-    )
-
-
-def create_sampler(args: Arguments) -> dc.DiscreteSampler:
-    L = args.n_y
-    p_y_labeled = dc.almost_eye(L, L, diagonal=args.p_y_labeled)[0, :]
-    p_y_unlabeled = dc.almost_eye(L, L, diagonal=args.p_y_unlabeled)[0, :]
-
-    p_c_cond_y_labeled = dc.almost_eye(
-        y=L,
-        c=args.n_c,
-        diagonal=args.quality_labeled,
-    )
-    p_c_cond_y_unlabeled = dc.almost_eye(
-        y=L,
-        c=args.n_c,
-        diagonal=args.quality_unlabeled,
-    )
-
-    return dc.discrete_sampler_factory(
-        p_y_labeled=p_y_labeled,
-        p_y_unlabeled=p_y_unlabeled,
-        p_c_cond_y_labeled=p_c_cond_y_labeled,
-        p_c_cond_y_unlabeled=p_c_cond_y_unlabeled,
-    )
-
-
-def get_estimator(args: Arguments) -> pe.SummaryStatisticPrevalenceEstimator:
-    if args.algorithm == Algorithm.CLASSIFY_AND_COUNT:
-        if args.n_c != args.n_y:
-            raise ValueError("For classify and count you need K = L.")
-        return algo.ClassifyAndCount()
-    elif args.algorithm == Algorithm.RATIO_ESTIMATOR:
-        return algo.InvariantRatioEstimator(
-            restricted=args.estimator_arguments.restricted, enforce_square=False
-        )
-    elif args.algorithm == Algorithm.BBSE:
-        return algo.BlackBoxShiftEstimator(enforce_square=False)
-    elif args.algorithm == Algorithm.BAYESIAN:
-        return algo.DiscreteCategoricalMAPEstimator(
-            alpha_unlabeled=args.estimator_arguments.bayesian_alpha
-        )
-    else:
-        raise ValueError(f"Algorithm {args.algorithm} not recognized.")
-
-
-class Result(pydantic.BaseModel):
-    p_y_unlabeled_true: List[float]
-    p_y_unlabeled_estimate: List[float]
-    time: float
-    algorithm: Algorithm
-
-    input_arguments: Arguments
-
-
-def dry_run(args: Arguments) -> None:
-    print("-- Dry run --\nUsed settings:")
-    print(args)
-    print("Exiting...")
-
-
-def main() -> None:
-    """The main function of the experiment."""
-    raw_args = create_parser().parse_args()
-    args: Arguments = parse_args(raw_args)
-
-    if raw_args.dry_run:
-        dry_run(args)
-        return
-
-    sampler = create_sampler(args)
-
-    summary_statistic = sampler.sample_summary_statistic(
-        n_labeled=args.n_labeled,
-        n_unlabeled=args.n_unlabeled,
-        seed=args.seed,
-    )
-
-    estimator = get_estimator(args)
-    timer = exp.Timer()
-    estimate = estimator.estimate_from_summary_statistic(summary_statistic)
-    elapsed_time = timer.check()
-
-    result = Result(
-        algorithm=args.algorithm,
-        time=elapsed_time,
-        p_y_unlabeled_true=sampler.unlabeled.p_y.tolist(),
-        p_y_unlabeled_estimate=estimate.tolist(),
-        input_arguments=args,
-    )
-
-    if raw_args.output_dir is not None:
-        raw_args.output_dir.mkdir(exist_ok=True, parents=True)
-        output_path = raw_args.output_dir / raw_args.output
-    else:
-        output_path = raw_args.output
-
-    with open(output_path, "w") as f:
-        f.write(result.json())
-
-    print(result)
-    print("Finished.")
-
-
-if __name__ == "__main__":
-    main()