From b95069fc1f6be3a4c8d3bd8ece37daf0c949f5a5 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 16 Feb 2024 19:02:12 -0600 Subject: [PATCH 1/5] basic configuration --- CompStats/__init__.py | 14 ++++++++++++++ CompStats/tests/__init__.py | 13 +++++++++++++ environment.yml | 9 +++++++++ pyproject.toml | 11 +++++++++++ setup.py | 3 +++ 5 files changed, 50 insertions(+) create mode 100644 CompStats/__init__.py create mode 100644 CompStats/tests/__init__.py create mode 100644 environment.yml create mode 100644 pyproject.toml create mode 100644 setup.py diff --git a/CompStats/__init__.py b/CompStats/__init__.py new file mode 100644 index 0000000..41a8488 --- /dev/null +++ b/CompStats/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +__version__ = '0.0.1' \ No newline at end of file diff --git a/CompStats/tests/__init__.py b/CompStats/tests/__init__.py new file mode 100644 index 0000000..2f1fa6e --- /dev/null +++ b/CompStats/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..caa8388 --- /dev/null +++ b/environment.yml @@ -0,0 +1,9 @@ +name: CompStats +channels: + - conda-forge +dependencies: + - python + - pip + - scikit-learn + - pandas + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3539c9c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = 'CompStats' +dependencies = [ + 'numpy', + 'scikit-learn>=1.3.0', + 'pandas' +] +dynamic = ['version'] + +[tool.setuptools.dynamic] +version = {attr = 'CompStats.__version__'} \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..fc1f76c --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup() \ No newline at end of file From dc72aef1d472b7851e4074bac15c6a85212d6a20 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 16 Feb 2024 19:04:16 -0600 Subject: [PATCH 2/5] devcontainer --- .devcontainer/devcontainer.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..bc8d8c1 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,4 @@ +{ + "image":"mcr.microsoft.com/devcontainers/universal:2-linux", + "postCreateCommand": "conda install -c conda-forge --yes numpy scipy scikit-learn nose pandas" +} From 9de10f7b7c1802cbad06653077bbca4144c5010d Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Fri, 16 Feb 2024 19:13:43 -0600 Subject: [PATCH 3/5] test action --- .github/workflows/test.yaml | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/test.yaml diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..a0acb95 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,53 @@ +name: Tests + +on: + push: + branches: + - develop + +jobs: + build: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: test + auto-update-conda: true + python-version: ${{ matrix.python-version }} + channels: conda-forge + allow-softlinks: true + channel-priority: flexible + show-channel-urls: true + - name: Install dependencies + run: | + conda install --yes pip + pip install coverage + pip install coveralls + conda install --yes numpy scipy scikit-learn nose pandas + python setup.py build_ext --inplace + - name: Tests on Linux + if: ${{ runner.os == 'Linux' }} + run: | + which python + python --version + which coverage + nosetests --verbose --with-coverage --cover-package=CompStats CompStats/tests + - name: Tests on macOS and Windows + if: ${{ runner.os != 'Linux' }} + run: | + nosetests --verbose CompStats/tests + - name: coveralls + if: ${{ runner.os == 'Linux' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + coveralls --service=github \ No newline at end of file From 9773f754f154ba9efab2f87fe39111b04225d2ac Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Sat, 17 Feb 2024 06:23:48 -0600 Subject: [PATCH 4/5] Bootstrap base class --- .coveragerc | 3 + CompStats/bootstrap.py | 279 ++++++++++++++++++++++++++++++ CompStats/tests/test_bootstrap.py | 94 ++++++++++ 3 files changed, 376 insertions(+) create mode 100644 .coveragerc create mode 100644 CompStats/bootstrap.py create mode 100644 CompStats/tests/test_bootstrap.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..9c734d2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[run] +omit = + */tests* \ No newline at end of file diff --git a/CompStats/bootstrap.py b/CompStats/bootstrap.py new file mode 100644 index 0000000..b74ffa7 --- /dev/null +++ b/CompStats/bootstrap.py @@ -0,0 +1,279 @@ +# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable +from joblib import delayed, Parallel +import numpy as np + + +class StatisticSamples(object): + """Apply the statistic to `num_samples` samples taken with replacement from the population (arguments). + + :param statistic: Statistic. + :type statistic: Callable + :param num_samples: Number of bootstrap samples, default=500. + :type num_samples: int + :param n_jobs: Number of jobs to run in parallel, default=1. + :type n_jobs: int + + + >>> from IngeoML import StatisticSamples + >>> from sklearn.metrics import accuracy_score + >>> import numpy as np + >>> statistic = StatisticSamples(num_samples=10, statistic=np.mean) + >>> empirical_distribution = np.r_[[3, 4, 5, 2, 4]] + >>> statistic(empirical_distribution) + array([2.8, 3.6, 3.6, 3.6, 2.6, 4. , 2.8, 3. , 3.8, 3.6]) + >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] + >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] + >>> acc = StatisticSamples(num_samples=15, statistic=accuracy_score) + >>> acc(labels, pred) + array([0.9, 0.8, 0.7, 1. , 0.6, 1. , 0.7, 0.9, 0.9, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8]) + """ + + def __init__(self, + statistic: Callable[[np.ndarray], float]=np.mean, + num_samples: int=500, + n_jobs: int=1): + self.statistic = statistic + self.num_samples = num_samples + self.n_jobs = n_jobs + self._samples = None + + @property + def n_jobs(self): + """Number of jobs to do in parallel""" + return self._n_jobs + + @n_jobs.setter + def n_jobs(self, value): + self._n_jobs = value + + @property + def statistic(self): + """Statistic function.""" + return self._statistic + + @statistic.setter + def statistic(self, value): + self._statistic = value + + @property + def num_samples(self): + """Number of bootstrap samples.""" + return self._num_samples + + @num_samples.setter + def num_samples(self, value): + self._num_samples = value + + @property + def statistic_samples(self): + """It contains the statistic samples of the latest call.""" + assert hasattr(self, '_statistic_samples') + return self._statistic_samples + + @statistic_samples.setter + def statistic_samples(self, value): + self._statistic_samples = value + + def samples(self, N): + """Samples. + + :param N: Population size. + :type N: int + """ + def inner(N): + _ = np.random.randint(N, size=(self.num_samples, N)) + self._samples = _ + return self._samples + try: + if self._samples.shape[1] == N: + return self._samples + else: + return inner(N) + except AttributeError: + return inner(N) + + def __call__(self, *args: np.ndarray) -> np.ndarray: + """Population where the bootstrap process will be performed. + + :param *args: Population + :type *args: np.ndarray + """ + def inner(s): + _ = [arg[s] for arg in args] + return self.statistic(*_) + + B = [] + # statistic = self.statistic + B = Parallel(n_jobs=self.n_jobs)(delayed(inner)(s) + for s in self.samples(args[0].shape[0])) + # for s in self.samples(args[0].shape[0]): + #  _ = [arg[s] for arg in args] + #  B.append(statistic(*_)) + self.statistic_samples = np.array(B) + return self.statistic_samples + + +class CI(StatisticSamples): + """Compute the Confidence Interval of a statistic using bootstrap. + + :param alpha: :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`. + :type alpha: float + + >>> from IngeoML import CI + >>> from sklearn.metrics import accuracy_score + >>> import numpy as np + >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] + >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] + >>> acc = CI(statistic=accuracy_score) + >>> acc(labels, pred) + (0.7, 1.0) + """ + def __init__(self, alpha: float=0.05, + **kwargs): + super().__init__(**kwargs) + self.alpha = alpha + + @property + def alpha(self): + """The interval is computed for :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`. + """ + return self._alpha + + @alpha.setter + def alpha(self, value): + self._alpha = value / 2 + + def __call__(self, *args: np.ndarray) -> np.ndarray: + B = super().__call__(*args) + alpha = self.alpha + return (np.percentile(B, alpha * 100, axis=0), + np.percentile(B, (1 - alpha) * 100, axis=0)) + + +# class SE(StatisticSamples): +# """Compute the Standard Error of a statistic using bootstrap. + +# >>> from IngeoML import SE +# >>> from sklearn.metrics import accuracy_score +# >>> import numpy as np +# >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] +# >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] +# >>> se = SE(statistic=accuracy_score) +# >>> se(labels, pred) +# 0.11949493713124419 +# """ + +# def __call__(self, *args: np.ndarray) -> float: +# B = super().__call__(*args) +# return np.std(B, axis=0) + + +# class Difference(CI): +# def __init__(self, y: np.ndarray, +# algorithms: dict={}, +# performance: Callable[[np.ndarray, np.ndarray], float]=lambda y, hy: f1_score(y, hy, average='macro'), +# **kwargs) -> None: +# super(Difference, self).__init__(populations=algorithms, statistic=performance) +# self.y = y +# self._dist = dict() +# self._delta = dict() +# self._pvalue_r = dict() +# self._pvalue_l = dict() + +# @property +# def y(self): +# return self._y + +# @y.setter +# def y(self, value): +# self._y = value + +# @property +# def best(self): +# try: +# return self._best +# except AttributeError: +# y = self.y +# best = (None, -np.inf) +# for k, v in self.populations.items(): +# perf = self.statistic(y, v) +# if perf > best[1]: +# best = (k, perf) +# self._best = best[0] +# return self._best + +# def delta(self, key): +# assert key != self.best +# if key in self._delta: +# return self._delta[key] +# y = self.y +# algs = self.populations +# perf = self.statistic +# delta = perf(y, algs[self.best]) - perf(y, algs[key]) +# self._delta[key] = delta +# return delta + +# def samples(self, key): +# if key in self.statistic_samples: +# return self.statistic_samples[key] +# data = self.populations[key] +# y = self.y +# output = np.array([self.statistic(y[s], data[s]) +# for s in self.bootstrap]) +# self.statistic_samples[key] = output +# return output + +# @property +# def best_performance(self): +# return self.samples(self.best) + +# def distribution(self, key): +# best = self.best +# assert key != best +# if key in self._dist: +# return self._dist[key] +# output = self.best_performance - self.samples(key) +# self._dist[key] = output +# return output + +# def pvalue(self, key, side='right'): +# assert side in ['left', 'right'] +# assert key != self.best +# if side == 'right': +# if key in self._pvalue_r: +# return self._pvalue_r[key] +# elif key in self._pvalue_l: +# return self._pvalue_l[key] +# c = 0 +# delta_2 = 2 * self.delta(key) +# delta_i = self.distribution(key) +# if side == 'right': +# c = (delta_i >= delta_2).mean() +# else: +# c = (delta_i < 0).mean() +# if side == 'right': +# self._pvalue_r[key] = c +# else: +# self._pvalue_l[key] = c +# return c + +# def sort(self, side='right'): +# best = self.best +# algs = [(k, self.pvalue(k, side=side)) +# for k in self.populations if k != best] +# algs.sort(key=lambda x: x[1], reverse=True) +# return [k for k, _ in algs] + \ No newline at end of file diff --git a/CompStats/tests/test_bootstrap.py b/CompStats/tests/test_bootstrap.py new file mode 100644 index 0000000..b239864 --- /dev/null +++ b/CompStats/tests/test_bootstrap.py @@ -0,0 +1,94 @@ +# Copyright 2023 Mario Graff Guerrero + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from CompStats.bootstrap import StatisticSamples, CI + + +def problem_algorithms(): + labels = [0, 0, 0, 0, 0, + 1, 1, 1, 1, 1] + a = [0, 0, 0, 0, 0, + 1, 1, 1, 1, 0] + b = [0, 0, 1, 0, 0, + 1, 1, 1, 1, 0] + c = [0, 0, 0, 1, 0, + 1, 1, 0, 1, 0] + return (np.array(labels), + dict(a=np.array(a), + b=np.array(b), + c=np.array(c))) + + +def test_StatisticSample(): + """Test StatisticSamples""" + statistic = StatisticSamples(num_samples=26, n_jobs=-1) + samples = statistic(np.r_[[3, 4, 5, 2, 4]]) + assert samples.shape[0] == 26 + + +def test_CI(): + """Test CI""" + statistic = CI() + ci = statistic(np.r_[[3, 4, 5, 2, 4]]) + assert len(ci) == 2 + + +def test_CI2D(): + """Test CI with two values""" + from sklearn.metrics import f1_score + labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0]] + pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0]] + ci = CI(statistic=lambda y, hy: f1_score(y, hy, average=None)) + a = ci(labels, pred) + assert a[0].shape[0] == 2 and a[1].shape[0] == 2 + + +# def test_se(): +# labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] +# pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] +# se = SE(statistic=accuracy_score) +# res = se(labels, pred) +# assert res > 0 and isinstance(res, float) + +# def test_Difference_ci(): +# labels, algs = problem_algorithms() +# diff = Difference(labels, algs) +# a = diff.confidence_interval('a') +# assert a[0] > 0.6 and a[1] <= 1.0 + + +# def test_Difference_best(): +# labels, algs = problem_algorithms() +# diff = Difference(labels, algs) +# assert diff.best == 'a' + + +# def test_Difference_delta(): +# labels, algs = problem_algorithms() +# diff = Difference(labels, algs) +# assert diff.delta('b') > 0 and diff.delta('c') > 0 + + +# def test_Difference(): +# labels, algs = problem_algorithms() +# diff = Difference(labels, algs) +# assert diff.best == 'a' +# assert diff.pvalue('b') > diff.pvalue('c') + + +# def test_Difference_sort(): +# labels, algs = problem_algorithms() +# diff = Difference(labels, algs) +# for x, r in zip(diff.sort(), ['b', 'c']): +# assert x == r \ No newline at end of file From 6fd3394a65cd37e9a0150201ef0c58e834751286 Mon Sep 17 00:00:00 2001 From: Mario Graff Date: Sat, 17 Feb 2024 06:34:13 -0600 Subject: [PATCH 5/5] pip --- .github/workflows/pip.yaml | 48 ++++++++++++++++++++++++++++++++++++++ CompStats/bootstrap.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pip.yaml diff --git a/.github/workflows/pip.yaml b/.github/workflows/pip.yaml new file mode 100644 index 0000000..f208c32 --- /dev/null +++ b/.github/workflows/pip.yaml @@ -0,0 +1,48 @@ +name: Pip + +on: + workflow_dispatch: + push: + tags: + - v* + +jobs: + build: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: test + auto-update-conda: true + python-version: ${{ matrix.python-version }} + channels: conda-forge + allow-softlinks: true + channel-priority: flexible + show-channel-urls: true + - name: Install dependencies + run: | + conda install --yes pip + pip install twine build + conda install --yes numpy scipy scikit-learn nose pandas + python -m build + - name: Pip + if: ${{ runner.os == 'Linux' }} + env: + TWINE: ${{ secrets.TWINE }} + run: | + twine upload --skip-existing -u __token__ -p $TWINE dist/*.tar.gz; + - name: Wheel + if: ${{ runner.os != 'Linux' }} + env: + TWINE: ${{ secrets.TWINE }} + run: | + twine upload --skip-existing -u __token__ -p $TWINE dist/*.whl; diff --git a/CompStats/bootstrap.py b/CompStats/bootstrap.py index b74ffa7..1d4ed1a 100644 --- a/CompStats/bootstrap.py +++ b/CompStats/bootstrap.py @@ -16,7 +16,7 @@ import numpy as np -class StatisticSamples(object): +class StatisticSamples: """Apply the statistic to `num_samples` samples taken with replacement from the population (arguments). :param statistic: Statistic.