From cf8e9119a77e3e2d2d45020d6785912dda3854c9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 9 Sep 2023 22:01:18 -0400 Subject: [PATCH] Adds benchmarking scripts and pipeline (#124) Signed-off-by: Adam Li --- .gitignore | 6 + asv.conf.json | 180 +++++++++++ benchmarks/__init__.py | 1 + benchmarks/common.py | 282 ++++++++++++++++++ benchmarks/config.json | 33 ++ benchmarks/datasets.py | 161 ++++++++++ benchmarks/ensemble_supervised.py | 74 +++++ benchmarks/utils.py | 46 +++ benchmarks_nonasv/README.md | 2 + benchmarks_nonasv/bench_mnist.py | 185 ++++++++++++ benchmarks_nonasv/bench_oblique_tree.py | 169 +++++++++++ .../bench_plot_urf.py | 0 12 files changed, 1139 insertions(+) create mode 100644 asv.conf.json create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/common.py create mode 100644 benchmarks/config.json create mode 100644 benchmarks/datasets.py create mode 100644 benchmarks/ensemble_supervised.py create mode 100644 benchmarks/utils.py create mode 100644 benchmarks_nonasv/README.md create mode 100644 benchmarks_nonasv/bench_mnist.py create mode 100644 benchmarks_nonasv/bench_oblique_tree.py rename {benchmarks => benchmarks_nonasv}/bench_plot_urf.py (100%) diff --git a/.gitignore b/.gitignore index b05462bf4..a1ecb917d 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,12 @@ doc/samples cover examples/*.jpg +env/ +html/ +results/ +scikit-learn/ +benchmarks/cache/ + # Pycharm .idea/ diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 000000000..2dad0808a --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,180 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "scikit-tree", + + // The project's homepage + "project_url": "https://docs.neurodata.io/scikit-tree/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // export ASV_ENV_DIR=/Users/adam2392/miniforge3 + "install_command": [ + // "source /Users/adam2392/miniforge3/etc/profile.d/conda.sh", + // "conda activate base" + "spin build -j 6 --clean", + "pip install --no-build-isolation --editable ." + ], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": [ + "spin build -j 6 --clean", + "pip install --no-build-isolation --editable ." + ], + + // List of branches to benchmark. If not provided, defaults to "master + // (for git) or "default" (for mercurial). + "branches": ["main"], + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. 
+ "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/neurodata/scikit-tree/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["/Users/adam2392/miniforge3/envs/sktree/bin/python"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"] + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + "matrix": { + "numpy": [], + "scipy": [], + "cython": ["0.29.36"], + "joblib": [], + "threadpoolctl": [], + "pandas": [], + "meson": [], + "meson-python": [], + "scikit-learn": [], + "spin": [], + "click": [], + "rich-click": [], + "doit": [], + "pydevtool": [], + "build": [] + }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "six": null}, // don't run without six on conda + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "numpy": "1.8"}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + "results_dir": ".asv/results", + "html_dir": ".asv/html" + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + // "results_dir": "results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". 
+ // "html_dir": "html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} \ No newline at end of file diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..be81f6c14 --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Benchmark suite for scikit-tree using ASV""" diff --git a/benchmarks/common.py b/benchmarks/common.py new file mode 100644 index 000000000..16c9dd724 --- /dev/null +++ b/benchmarks/common.py @@ -0,0 +1,282 @@ +import itertools +import json +import os +import pickle +import timeit +from abc import ABC, abstractmethod +from multiprocessing import cpu_count +from pathlib import Path + +import numpy as np + + +def get_from_config(): + """Get benchmarks configuration from the config.json file""" + current_path = Path(__file__).resolve().parent + + config_path = current_path / "config.json" + with open(config_path, "r") as config_file: + config_file = "".join(line for line in config_file if line and "//" not in line) + config = json.loads(config_file) + + profile = os.getenv("SKLBENCH_PROFILE", config["profile"]) + + n_jobs_vals_env = os.getenv("SKLBENCH_NJOBS") + if n_jobs_vals_env: + n_jobs_vals = eval(n_jobs_vals_env) + else: + n_jobs_vals = config["n_jobs_vals"] + if not n_jobs_vals: + n_jobs_vals = list(range(1, 1 + cpu_count())) + + cache_path = current_path / "cache" + cache_path.mkdir(exist_ok=True) + (cache_path / "estimators").mkdir(exist_ok=True) + (cache_path / "tmp").mkdir(exist_ok=True) + + save_estimators = os.getenv("SKLBENCH_SAVE_ESTIMATORS", config["save_estimators"]) + save_dir = os.getenv("ASV_COMMIT", "new")[:8] + + if save_estimators: + (cache_path / "estimators" / save_dir).mkdir(exist_ok=True) + + base_commit = os.getenv("SKLBENCH_BASE_COMMIT", config["base_commit"]) + + bench_predict = os.getenv("SKLBENCH_PREDICT", config["bench_predict"]) + bench_transform = os.getenv("SKLBENCH_TRANSFORM", config["bench_transform"]) + + return ( + profile, + n_jobs_vals, + save_estimators, + save_dir, + base_commit, + bench_predict, + bench_transform, + ) + + +def get_estimator_path(benchmark, directory, params, save=False): + """Get path of pickled fitted estimator""" + path = 
Path(__file__).resolve().parent / "cache" + path = (path / "estimators" / directory) if save else (path / "tmp") + + filename = ( + benchmark.__class__.__name__ + "_estimator_" + "_".join(list(map(str, params))) + ".pkl" + ) + + return path / filename + + +def clear_tmp(): + """Clean the tmp directory""" + path = Path(__file__).resolve().parent / "cache" / "tmp" + for child in path.iterdir(): + child.unlink() + + +class Benchmark(ABC): + """Abstract base class for all the benchmarks""" + + timer = timeit.default_timer # wall time + processes = 1 + timeout = 500 + + ( + profile, + n_jobs_vals, + save_estimators, + save_dir, + base_commit, + bench_predict, + bench_transform, + ) = get_from_config() + + if profile == "fast": + warmup_time = 0 + repeat = 1 + number = 1 + min_run_count = 1 + data_size = "small" + elif profile == "regular": + warmup_time = 1 + repeat = (3, 100, 30) + data_size = "small" + elif profile == "large_scale": + warmup_time = 1 + repeat = 3 + number = 1 + data_size = "large" + + @property + @abstractmethod + def params(self): + pass + + +class Estimator(ABC): + """Abstract base class for all benchmarks of estimators""" + + @abstractmethod + def make_data(self, params): + """Return the dataset for a combination of parameters""" + # The datasets are cached using joblib.Memory so it's fast and can be + # called for each repeat + pass + + @abstractmethod + def make_estimator(self, params): + """Return an instance of the estimator for a combination of parameters""" + pass + + def skip(self, params): + """Return True if the benchmark should be skipped for these params""" + return False + + def setup_cache(self): + """Pickle a fitted estimator for all combinations of parameters""" + # This is run once per benchmark class. + + clear_tmp() + + param_grid = list(itertools.product(*self.params)) + + for params in param_grid: + if self.skip(params): + continue + + estimator = self.make_estimator(params) + X, _, y, _ = self.make_data(params) + + estimator.fit(X, y) + + est_path = get_estimator_path( + self, Benchmark.save_dir, params, Benchmark.save_estimators + ) + with est_path.open(mode="wb") as f: + pickle.dump(estimator, f) + + def setup(self, *params): + """Generate dataset and load the fitted estimator""" + # This is run once per combination of parameters and per repeat so we + # need to avoid doing expensive operations there. 
+ + if self.skip(params): + raise NotImplementedError + + self.X, self.X_val, self.y, self.y_val = self.make_data(params) + + est_path = get_estimator_path(self, Benchmark.save_dir, params, Benchmark.save_estimators) + with est_path.open(mode="rb") as f: + self.estimator = pickle.load(f) + + self.make_scorers() + + def time_fit(self, *args): + self.estimator.fit(self.X, self.y) + + def peakmem_fit(self, *args): + self.estimator.fit(self.X, self.y) + + def track_train_score(self, *args): + if hasattr(self.estimator, "predict"): + y_pred = self.estimator.predict(self.X) + else: + y_pred = None + return float(self.train_scorer(self.y, y_pred)) + + def track_test_score(self, *args): + if hasattr(self.estimator, "predict"): + y_val_pred = self.estimator.predict(self.X_val) + else: + y_val_pred = None + return float(self.test_scorer(self.y_val, y_val_pred)) + + +class Predictor(ABC): + """Abstract base class for benchmarks of estimators implementing predict""" + + if Benchmark.bench_predict: + + def time_predict(self, *args): + self.estimator.predict(self.X) + + def peakmem_predict(self, *args): + self.estimator.predict(self.X) + + if Benchmark.base_commit is not None: + + def track_same_prediction(self, *args): + est_path = get_estimator_path(self, Benchmark.base_commit, args, True) + with est_path.open(mode="rb") as f: + estimator_base = pickle.load(f) + + y_val_pred_base = estimator_base.predict(self.X_val) + y_val_pred = self.estimator.predict(self.X_val) + + return np.allclose(y_val_pred_base, y_val_pred) + + @property + @abstractmethod + def params(self): + pass + + +class Transformer(ABC): + """Abstract base class for benchmarks of estimators implementing transform""" + + if Benchmark.bench_transform: + + def time_transform(self, *args): + self.estimator.transform(self.X) + + def peakmem_transform(self, *args): + self.estimator.transform(self.X) + + if Benchmark.base_commit is not None: + + def track_same_transform(self, *args): + est_path = get_estimator_path(self, Benchmark.base_commit, args, True) + with est_path.open(mode="rb") as f: + estimator_base = pickle.load(f) + + X_val_t_base = estimator_base.transform(self.X_val) + X_val_t = self.estimator.transform(self.X_val) + + return np.allclose(X_val_t_base, X_val_t) + + @property + @abstractmethod + def params(self): + pass + + +def get_mem_info(): + """Get information about available memory""" + import psutil + + vm = psutil.virtual_memory() + return { + "memtotal": vm.total, + "memavailable": vm.available, + } + + +def set_mem_rlimit(max_mem=None): + """ + Set address space rlimit + """ + import resource + + if max_mem is None: + mem_info = get_mem_info() + max_mem = int(mem_info["memtotal"] * 0.7) + cur_limit = resource.getrlimit(resource.RLIMIT_AS) + if cur_limit[0] > 0: + max_mem = min(max_mem, cur_limit[0]) + + try: + resource.setrlimit(resource.RLIMIT_AS, (max_mem, cur_limit[1])) + except ValueError: + # on macOS may raise: current limit exceeds maximum limit + pass diff --git a/benchmarks/config.json b/benchmarks/config.json new file mode 100644 index 000000000..d1e12dcc3 --- /dev/null +++ b/benchmarks/config.json @@ -0,0 +1,33 @@ +{ + // "regular": Bencharks are run on small to medium datasets. Each benchmark + // is run multiple times and averaged. + // "fast": Benchmarks are run on small to medium datasets. Each benchmark + // is run only once. May provide unstable benchmarks. + // "large_scale": Benchmarks are run on large datasets. Each benchmark is + // run multiple times and averaged. 
This profile is meant to + // benchmark scalability and will take hours on single core. + // Can be overridden by environment variable SKLBENCH_PROFILE. + "profile": "regular", + + // List of values of n_jobs to use for estimators which accept this + // parameter (-1 means all cores). An empty list means all values from 1 to + // the maximum number of available cores. + // Can be overridden by environment variable SKLBENCH_NJOBS. + "n_jobs_vals": [1], + + // If true, fitted estimators are saved in ./cache/estimators/ + // Can be overridden by environment variable SKLBENCH_SAVE_ESTIMATORS. + "save_estimators": false, + + // Commit hash to compare estimator predictions with. + // If null, predictions are not compared. + // Can be overridden by environment variable SKLBENCH_BASE_COMMIT. + "base_commit": null, + + // If false, the predict (resp. transform) method of the estimators won't + // be benchmarked. + // Can be overridden by environment variables SKLBENCH_PREDICT and + // SKLBENCH_TRANSFORM. + "bench_predict": true, + "bench_transform": true +} \ No newline at end of file diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py new file mode 100644 index 000000000..4139fb020 --- /dev/null +++ b/benchmarks/datasets.py @@ -0,0 +1,161 @@ +from pathlib import Path + +import numpy as np +import scipy.sparse as sp +from joblib import Memory +from sklearn.datasets import ( + fetch_20newsgroups, + fetch_olivetti_faces, + fetch_openml, + load_digits, + make_blobs, + make_classification, + make_regression, +) +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MaxAbsScaler, StandardScaler + +# memory location for caching datasets +M = Memory(location=str(Path(__file__).resolve().parent / "cache")) + + +@M.cache +def _blobs_dataset(n_samples=500000, n_features=3, n_clusters=100, dtype=np.float32): + X, _ = make_blobs( + n_samples=n_samples, n_features=n_features, centers=n_clusters, random_state=0 + ) + X = X.astype(dtype, copy=False) + + X, X_val = train_test_split(X, test_size=0.1, random_state=0) + return X, X_val, None, None + + +@M.cache +def _20newsgroups_highdim_dataset(n_samples=None, ngrams=(1, 1), dtype=np.float32): + newsgroups = fetch_20newsgroups(random_state=0) + vectorizer = TfidfVectorizer(ngram_range=ngrams, dtype=dtype) + X = vectorizer.fit_transform(newsgroups.data[:n_samples]) + y = newsgroups.target[:n_samples] + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float32): + newsgroups = fetch_20newsgroups() + vectorizer = TfidfVectorizer(ngram_range=ngrams) + X = vectorizer.fit_transform(newsgroups.data) + X = X.astype(dtype, copy=False) + svd = TruncatedSVD(n_components=n_components) + X = svd.fit_transform(X) + y = newsgroups.target + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _mnist_dataset(dtype=np.float32): + X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas") + X = X.astype(dtype, copy=False) + X = MaxAbsScaler().fit_transform(X) + + X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0) + return X, X_val, y, y_val + + +@M.cache +def _digits_dataset(n_samples=None, dtype=np.float32): + X, y = load_digits(return_X_y=True) + X = 
X.astype(dtype, copy=False)
+ X = MaxAbsScaler().fit_transform(X)
+ X = X[:n_samples]
+ y = y[:n_samples]
+
+ X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
+ return X, X_val, y, y_val
+
+
+@M.cache
+def _synth_regression_dataset(n_samples=100000, n_features=100, dtype=np.float32):
+ X, y = make_regression(
+ n_samples=n_samples,
+ n_features=n_features,
+ n_informative=n_features // 10,
+ noise=50,
+ random_state=0,
+ )
+ X = X.astype(dtype, copy=False)
+ X = StandardScaler().fit_transform(X)
+
+ X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
+ return X, X_val, y, y_val
+
+
+@M.cache
+def _synth_regression_sparse_dataset(
+ n_samples=10000, n_features=10000, density=0.01, dtype=np.float32
+):
+ X = sp.random(m=n_samples, n=n_features, density=density, format="csr", random_state=0)
+ X.data = np.random.RandomState(0).randn(X.getnnz())
+ X = X.astype(dtype, copy=False)
+ coefs = sp.random(m=n_features, n=1, density=0.5, random_state=0)
+ coefs.data = np.random.RandomState(0).randn(coefs.getnnz())
+ y = X.dot(coefs.toarray()).reshape(-1)
+ y += 0.2 * y.std() * np.random.randn(n_samples)
+
+ X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
+ return X, X_val, y, y_val
+
+
+@M.cache
+def _synth_classification_dataset(n_samples=1000, n_features=10000, n_classes=2, dtype=np.float32):
+ X, y = make_classification(
+ n_samples=n_samples,
+ n_features=n_features,
+ n_classes=n_classes,
+ random_state=0,
+ n_informative=n_features,
+ n_redundant=0,
+ )
+ X = X.astype(dtype, copy=False)
+ X = StandardScaler().fit_transform(X)
+
+ X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
+ return X, X_val, y, y_val
+
+
+@M.cache
+def _olivetti_faces_dataset():
+ dataset = fetch_olivetti_faces(shuffle=True, random_state=42)
+ faces = dataset.data
+ n_samples, n_features = faces.shape
+ faces_centered = faces - faces.mean(axis=0)
+ # local centering
+ faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)
+ X = faces_centered
+
+ X, X_val = train_test_split(X, test_size=0.1, random_state=0)
+ return X, X_val, None, None
+
+
+@M.cache
+def _random_dataset(n_samples=1000, n_features=1000, representation="dense", dtype=np.float32):
+ if representation == "dense":
+ X = np.random.RandomState(0).random_sample((n_samples, n_features))
+ X = X.astype(dtype, copy=False)
+ else:
+ X = sp.random(
+ n_samples,
+ n_features,
+ density=0.05,
+ format="csr",
+ dtype=dtype,
+ random_state=0,
+ )
+
+ X, X_val = train_test_split(X, test_size=0.1, random_state=0)
+ return X, X_val, None, None
diff --git a/benchmarks/ensemble_supervised.py b/benchmarks/ensemble_supervised.py
new file mode 100644
index 000000000..cba1e8189
--- /dev/null
+++ b/benchmarks/ensemble_supervised.py
@@ -0,0 +1,74 @@
+from sktree.ensemble import ObliqueRandomForestClassifier
+
+from .common import Benchmark, Estimator, Predictor
+from .datasets import (
+ _20newsgroups_highdim_dataset,
+ _20newsgroups_lowdim_dataset,
+ _synth_classification_dataset,
+)
+from .utils import make_gen_classif_scorers
+
+
+class ObliqueRandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
+ """
+ Benchmarks for ObliqueRandomForestClassifier.
+ """ + + param_names = ["representation", "n_jobs"] + params = (["dense", "sparse"], Benchmark.n_jobs_vals) + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + representation, n_jobs = params + + if representation == "sparse": + data = _20newsgroups_highdim_dataset() + else: + data = _20newsgroups_lowdim_dataset() + + return data + + def make_estimator(self, params): + representation, n_jobs = params + + n_estimators = 500 if Benchmark.data_size == "large" else 100 + + estimator = ObliqueRandomForestClassifier( + n_estimators=n_estimators, + min_samples_split=10, + max_features="log2", + n_jobs=n_jobs, + random_state=0, + ) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) + + +class ObliqueRandomForestClassifierBenchmarkSynth(Predictor, Estimator, Benchmark): + """ + Benchmarks for Oblique RF Classifier using synthetic classification data. + """ + + param_names = [] + params = () + + def setup_cache(self): + super().setup_cache() + + def make_data(self, params): + data = _synth_classification_dataset(n_samples=10000, n_features=100, n_classes=5) + + return data + + def make_estimator(self, params): + estimator = ObliqueRandomForestClassifier(max_leaf_nodes=15, random_state=0) + + return estimator + + def make_scorers(self): + make_gen_classif_scorers(self) diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 000000000..f52be99cb --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,46 @@ +import numpy as np +from sklearn.metrics import balanced_accuracy_score, r2_score + + +def neg_mean_inertia(X, labels, centers): + return -(np.asarray(X - centers[labels]) ** 2).sum(axis=1).mean() + + +def make_gen_classif_scorers(caller): + caller.train_scorer = balanced_accuracy_score + caller.test_scorer = balanced_accuracy_score + + +def make_gen_reg_scorers(caller): + caller.test_scorer = r2_score + caller.train_scorer = r2_score + + +def neg_mean_data_error(X, U, V): + return -np.sqrt(((X - U.dot(V)) ** 2).mean()) + + +def make_dict_learning_scorers(caller): + caller.train_scorer = lambda _, __: ( + neg_mean_data_error( + caller.X, caller.estimator.transform(caller.X), caller.estimator.components_ + ) + ) + caller.test_scorer = lambda _, __: ( + neg_mean_data_error( + caller.X_val, + caller.estimator.transform(caller.X_val), + caller.estimator.components_, + ) + ) + + +def explained_variance_ratio(Xt, X): + return np.var(Xt, axis=0).sum() / np.var(X, axis=0).sum() + + +def make_pca_scorers(caller): + caller.train_scorer = lambda _, __: caller.estimator.explained_variance_ratio_.sum() + caller.test_scorer = lambda _, __: ( + explained_variance_ratio(caller.estimator.transform(caller.X_val), caller.X_val) + ) diff --git a/benchmarks_nonasv/README.md b/benchmarks_nonasv/README.md new file mode 100644 index 000000000..e735c0741 --- /dev/null +++ b/benchmarks_nonasv/README.md @@ -0,0 +1,2 @@ +A set of scripts that can be run to analyze runtime and performance of scikit-tree +estimators. diff --git a/benchmarks_nonasv/bench_mnist.py b/benchmarks_nonasv/bench_mnist.py new file mode 100644 index 000000000..0e9adec99 --- /dev/null +++ b/benchmarks_nonasv/bench_mnist.py @@ -0,0 +1,185 @@ +""" +======================= +MNIST dataset benchmark +======================= + +Benchmark on the MNIST dataset. The dataset comprises 70,000 samples +and 784 features. Here, we consider the task of predicting +10 classes - digits from 0 to 9 from their raw images. 
By contrast to the +covertype dataset, the feature space is homogeneous. + +Example of output : + [..] + + Classification performance: + =========================== + Classifier train-time test-time error-rate + ------------------------------------------------------------ + ExtraTrees 42.99s 0.57s 0.0294 + RandomForest 42.70s 0.49s 0.0318 + ObliqueRandomForest 135.81s 0.56s 0.0486 + PatchObliqueRandomForest 16.67s 0.06s 0.0824 + ExtraObliqueRandomForest 20.69s 0.02s 0.1219 + dummy 0.00s 0.01s 0.8973 +""" + +# License: BSD 3 clause + +import argparse +import os +from time import time + +import numpy as np +from joblib import Memory +from sklearn.datasets import fetch_openml, get_data_home +from sklearn.dummy import DummyClassifier +from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier +from sklearn.metrics import zero_one_loss +from sklearn.utils import check_array + +from sktree import ObliqueRandomForestClassifier, PatchObliqueRandomForestClassifier + +# Memoize the data extraction and memory map the resulting +# train / test splits in readonly mode +memory = Memory(os.path.join(get_data_home(), "mnist_benchmark_data"), mmap_mode="r") + + +@memory.cache +def load_data(dtype=np.float32, order="F"): + """Load the data, then cache and memmap the train/test split""" + ###################################################################### + # Load dataset + print("Loading dataset...") + data = fetch_openml("mnist_784", as_frame=True, parser="pandas") + X = check_array(data["data"], dtype=dtype, order=order) + y = data["target"] + + # Normalize features + X = X / 255 + + # Create train-test split (as [Joachims, 2006]) + print("Creating train-test split...") + n_train = 60000 + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + return X_train, X_test, y_train, y_test + + +ESTIMATORS = { + "dummy": DummyClassifier(), + "ExtraTrees": ExtraTreesClassifier(), + "RandomForest": RandomForestClassifier(), + "ObliqueRandomForest": ObliqueRandomForestClassifier(), + "PatchObliqueRandomForest": PatchObliqueRandomForestClassifier(), +} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--classifiers", + nargs="+", + choices=ESTIMATORS, + type=str, + default=["ExtraTrees"], + help="list of classifiers to benchmark.", + ) + parser.add_argument( + "--n-jobs", + nargs="?", + default=1, + type=int, + help=("Number of concurrently running workers for " "models that support parallelism."), + ) + parser.add_argument( + "--order", + nargs="?", + default="C", + type=str, + choices=["F", "C"], + help="Allow to choose between fortran and C ordered data", + ) + parser.add_argument( + "--random-seed", + nargs="?", + default=0, + type=int, + help="Common seed used by random number generator.", + ) + args = vars(parser.parse_args()) + + print(__doc__) + + X_train, X_test, y_train, y_test = load_data(order=args["order"]) + + print("") + print("Dataset statistics:") + print("===================") + print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) + print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) + print("%s %s" % ("data type:".ljust(25), X_train.dtype)) + print( + "%s %d (size=%dMB)" + % ( + "number of train samples:".ljust(25), + X_train.shape[0], + int(X_train.nbytes / 1e6), + ) + ) + print( + "%s %d (size=%dMB)" + % ( + "number of test samples:".ljust(25), + X_test.shape[0], + int(X_test.nbytes / 1e6), + ) + ) + + print() + print("Training Classifiers") + 
print("====================") + error, train_time, test_time = {}, {}, {} + for name in sorted(args["classifiers"]): + print("Training %s ... " % name, end="") + estimator = ESTIMATORS[name] + estimator_params = estimator.get_params() + + estimator.set_params( + **{p: args["random_seed"] for p in estimator_params if p.endswith("random_state")} + ) + + if "n_jobs" in estimator_params: + estimator.set_params(n_jobs=args["n_jobs"]) + + time_start = time() + estimator.fit(X_train, y_train) + train_time[name] = time() - time_start + + time_start = time() + y_pred = estimator.predict(X_test) + test_time[name] = time() - time_start + + error[name] = zero_one_loss(y_test, y_pred) + + print("done") + + print() + print("Classification performance:") + print("===========================") + print( + "{0: <24} {1: >10} {2: >11} {3: >12}".format( + "Classifier ", "train-time", "test-time", "error-rate" + ) + ) + print("-" * 60) + for name in sorted(args["classifiers"], key=error.get): + print( + "{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}".format( + name, train_time[name], test_time[name], error[name] + ) + ) + + print() diff --git a/benchmarks_nonasv/bench_oblique_tree.py b/benchmarks_nonasv/bench_oblique_tree.py new file mode 100644 index 000000000..246972646 --- /dev/null +++ b/benchmarks_nonasv/bench_oblique_tree.py @@ -0,0 +1,169 @@ +""" +To run this, you'll need to have installed. + + * scikit-learn + * scikit-tree + +Does two benchmarks + +First, we fix a training set, increase the number of +samples to classify and plot number of classified samples as a +function of time. + +In the second benchmark, we increase the number of dimensions of the +training set, classify a sample and plot the time taken as a function +of the number of dimensions. +""" +import gc +from datetime import datetime + +import matplotlib.pyplot as plt +import numpy as np + +# to store the results +scikit_classifier_results = [] +scikit_regressor_results = [] +sklearn_classifier_results = [] +sklearn_regressor_results = [] + +mu_second = 0.0 + 10**6 # number of microseconds in a second + + +def bench_scikitlearn_tree_classifier(X, Y): + """Benchmark with scikit-learn decision tree classifier""" + + from sklearn.tree import DecisionTreeClassifier + + gc.collect() + + # start time + tstart = datetime.now() + clf = DecisionTreeClassifier() + clf.fit(X, Y).predict(X) + delta = datetime.now() - tstart + # stop time + + sklearn_classifier_results.append(delta.seconds + delta.microseconds / mu_second) + + +def bench_scikitlearn_tree_regressor(X, Y): + """Benchmark with scikit-learn decision tree regressor""" + + from sklearn.tree import DecisionTreeRegressor + + gc.collect() + + # start time + tstart = datetime.now() + clf = DecisionTreeRegressor() + clf.fit(X, Y).predict(X) + delta = datetime.now() - tstart + # stop time + + sklearn_regressor_results.append(delta.seconds + delta.microseconds / mu_second) + + +def bench_oblique_tree_classifier(X, Y): + """Benchmark with scikit-learn decision tree classifier""" + + from sktree.tree import ObliqueDecisionTreeClassifier + + gc.collect() + + # start time + tstart = datetime.now() + clf = ObliqueDecisionTreeClassifier() + clf.fit(X, Y).predict(X) + delta = datetime.now() - tstart + # stop time + + scikit_classifier_results.append(delta.seconds + delta.microseconds / mu_second) + + +def bench_oblique_tree_regressor(X, Y): + """Benchmark with scikit-learn decision tree regressor""" + + from sktree.tree import ObliqueDecisionTreeRegressor + + gc.collect() + + # start time + tstart = 
datetime.now() + clf = ObliqueDecisionTreeRegressor() + clf.fit(X, Y).predict(X) + delta = datetime.now() - tstart + # stop time + + scikit_regressor_results.append(delta.seconds + delta.microseconds / mu_second) + + +if __name__ == "__main__": + print("============================================") + print("Warning: this is going to take a looong time") + print("============================================") + + n = 10 + step = 10000 + n_samples = 10000 + dim = 10 + n_classes = 10 + for i in range(n): + print("============================================") + print("Entering iteration %s of %s" % (i, n)) + print("============================================") + n_samples += step + X = np.random.randn(n_samples, dim) + Y = np.random.randint(0, n_classes, (n_samples,)) + bench_oblique_tree_classifier(X, Y) + bench_scikitlearn_tree_classifier(X, Y) + Y = np.random.randn(n_samples) + bench_oblique_tree_regressor(X, Y) + bench_scikitlearn_tree_regressor(X, Y) + + xx = range(0, n * step, step) + plt.figure("scikit-tree oblique tree benchmark results") + plt.subplot(211) + plt.title("Learning with varying number of samples") + plt.plot(xx, scikit_classifier_results, "g-", label="classification") + plt.plot(xx, scikit_regressor_results, "r-", label="regression") + plt.plot(xx, sklearn_regressor_results, "b--", label="sklearn-regression") + plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification") + plt.legend(loc="upper left") + plt.xlabel("number of samples") + plt.ylabel("Time (s)") + + scikit_classifier_results = [] + scikit_regressor_results = [] + sklearn_classifier_results = [] + sklearn_regressor_results = [] + n = 10 + step = 500 + start_dim = 500 + n_classes = 10 + + dim = start_dim + for i in range(0, n): + print("============================================") + print("Entering iteration %s of %s" % (i, n)) + print("============================================") + dim += step + X = np.random.randn(100, dim) + Y = np.random.randint(0, n_classes, (100,)) + bench_oblique_tree_classifier(X, Y) + bench_scikitlearn_tree_classifier(X, Y) + Y = np.random.randn(100) + bench_oblique_tree_regressor(X, Y) + bench_scikitlearn_tree_regressor(X, Y) + + xx = np.arange(start_dim, start_dim + n * step, step) + plt.subplot(212) + plt.title("Learning in high dimensional spaces") + plt.plot(xx, scikit_classifier_results, "g-", label="classification") + plt.plot(xx, scikit_regressor_results, "r-", label="regression") + plt.plot(xx, sklearn_regressor_results, "b--", label="sklearn-regression") + plt.plot(xx, sklearn_classifier_results, "o--", label="sklearn-classification") + plt.legend(loc="upper left") + plt.xlabel("number of dimensions") + plt.ylabel("Time (s)") + plt.axis("tight") + plt.show() diff --git a/benchmarks/bench_plot_urf.py b/benchmarks_nonasv/bench_plot_urf.py similarity index 100% rename from benchmarks/bench_plot_urf.py rename to benchmarks_nonasv/bench_plot_urf.py
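
A minimal sketch of how a further ASV benchmark could be added on top of this patch: a new class subclasses the Benchmark/Estimator/Predictor helpers from benchmarks/common.py and supplies make_data, make_estimator, and make_scorers. The class name, hyperparameter values, and use of PatchObliqueRandomForestClassifier below are illustrative assumptions (mirroring ObliqueRandomForestClassifierBenchmarkSynth above and the import used in bench_mnist.py); they are not part of this changeset.

# Hypothetical example -- not included in the patch above.
from sktree import PatchObliqueRandomForestClassifier

from .common import Benchmark, Estimator, Predictor
from .datasets import _synth_classification_dataset
from .utils import make_gen_classif_scorers


class PatchObliqueRandomForestClassifierBenchmarkSynth(Predictor, Estimator, Benchmark):
    """Benchmarks for PatchObliqueRandomForestClassifier on synthetic classification data."""

    param_names = []
    params = ()

    def setup_cache(self):
        # Fit and pickle the estimator once per benchmark class (see common.Estimator).
        super().setup_cache()

    def make_data(self, params):
        # Dataset helpers are cached with joblib.Memory, so repeated setup() calls stay cheap.
        return _synth_classification_dataset(n_samples=10000, n_features=100, n_classes=5)

    def make_estimator(self, params):
        return PatchObliqueRandomForestClassifier(max_leaf_nodes=15, random_state=0)

    def make_scorers(self):
        make_gen_classif_scorers(self)

Because Predictor and Estimator already define the generic time_*, peakmem_*, and track_* methods, only the three factory methods above are benchmark-specific; ASV discovers the class automatically as long as the module lives in the default benchmarks/ directory referenced by asv.conf.json.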