diff --git a/README.md b/README.md index 36c4644e..40eac2b8 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,10 @@ Python wrapper for MOA to allow efficient use of existing algorithms with a more modern API > [!IMPORTANT] -> * **[How to Install CapyMOA](docs/installation.md)** -> * **[How to Contribute Tests](docs/testing.md)** -> * **[How to Contribute Documentation](docs/README.md)** +> * **[How to install CapyMOA](docs/installation.md)** +> * **[How to add documentation](docs/contributing/docs.md)** +> * **[How to add tests](docs/contributing/tests.md)** +> * **[How to add new algorithms or methods](docs/contributing/learners.md)** # Functionality diff --git a/docs/api/classifiers.rst b/docs/api/classifiers.rst index d58707eb..cefeff58 100644 --- a/docs/api/classifiers.rst +++ b/docs/api/classifiers.rst @@ -1,9 +1,8 @@ Classifiers =========== -Classifiers implement the :class:`capymoa.learner.learners.Classifier` interface. +Classifiers implement the :class:`capymoa.base.Classifier` interface. -.. automodule:: capymoa.learner.classifier +.. automodule:: capymoa.classifier :members: :undoc-members: :show-inheritance: - :inherited-members: diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index b8efe4ec..6e213949 100644 --- a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -8,3 +8,9 @@ and used being downloaded the first time you use them. :undoc-members: :show-inheritance: :inherited-members: + +.. automodule:: capymoa.datasets.downloader + :members: + :undoc-members: + :show-inheritance: + :inherited-members: \ No newline at end of file diff --git a/docs/api/api.rst b/docs/api/index.rst similarity index 74% rename from docs/api/api.rst rename to docs/api/index.rst index a025fb2f..818a4962 100644 --- a/docs/api/api.rst +++ b/docs/api/index.rst @@ -16,12 +16,18 @@ with the :ref:`tutorials`. datasets instance +.. toctree:: + :maxdepth: 1 + :caption: Interfaces + + learner + moa_learner + .. toctree:: :maxdepth: 1 :caption: Learners regressor - learners ssl classifiers @@ -30,3 +36,10 @@ with the :ref:`tutorials`. :caption: Evaluation evaluation + + +.. toctree:: + :maxdepth: 1 + :caption: Other + + splitcriteria diff --git a/docs/api/instance.rst b/docs/api/instance.rst index 7e93f96f..26a113f6 100644 --- a/docs/api/instance.rst +++ b/docs/api/instance.rst @@ -2,7 +2,7 @@ Instance ======== Instances are the basic unit of data in CapyMOA. -.. automodule:: capymoa.stream.instance +.. automodule:: capymoa.instance :members: :undoc-members: :show-inheritance: diff --git a/docs/api/learners.rst b/docs/api/learner.rst similarity index 64% rename from docs/api/learners.rst rename to docs/api/learner.rst index 643dd941..6472ad92 100644 --- a/docs/api/learners.rst +++ b/docs/api/learner.rst @@ -3,17 +3,17 @@ Learners CapyMOA defines different interfaces for learners performing different machine learning tasks. -.. autoclass:: capymoa.learner.learners.Classifier +.. autoclass:: capymoa.base.Classifier :members: :undoc-members: :inherited-members: -.. autoclass:: capymoa.learner.learners.Regressor +.. autoclass:: capymoa.base.Regressor :members: :undoc-members: :inherited-members: -.. autoclass:: capymoa.learner.learners.ClassifierSSL +.. 
autoclass:: capymoa.base.ClassifierSSL
    :members:
    :undoc-members:
    :inherited-members:
\ No newline at end of file
diff --git a/docs/api/moa_learner.rst b/docs/api/moa_learner.rst
new file mode 100644
index 00000000..e10e9965
--- /dev/null
+++ b/docs/api/moa_learner.rst
@@ -0,0 +1,14 @@
+MOA Learners
+============
+Interfaces for objects that wrap MOA functionality.
+
+.. autoclass:: capymoa.base.MOAClassifier
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+.. autoclass:: capymoa.base.MOARegressor
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
diff --git a/docs/api/regressor.rst b/docs/api/regressor.rst
index 79a64286..93c01821 100644
--- a/docs/api/regressor.rst
+++ b/docs/api/regressor.rst
@@ -1,10 +1,9 @@
 Regressors
 ==========
-Regressors implement the :class:`capymoa.learner.learners.Regressor` interface.
+Regressors implement the :class:`capymoa.base.Regressor` interface.
 
-.. automodule:: capymoa.learner.regressor
+.. automodule:: capymoa.regressor
    :members:
    :undoc-members:
    :show-inheritance:
-   :inherited-members:
diff --git a/docs/api/splitcriteria.rst b/docs/api/splitcriteria.rst
new file mode 100644
index 00000000..9416fd37
--- /dev/null
+++ b/docs/api/splitcriteria.rst
@@ -0,0 +1,10 @@
+Split Criteria
+==============
+Decision trees are built by splitting the data into groups based on a split
+criterion. The split criterion is a function that measures the quality of a
+split.
+
+.. automodule:: capymoa.splitcriteria
+   :members:
+   :undoc-members:
+   :inherited-members:
\ No newline at end of file
diff --git a/docs/api/ssl.rst b/docs/api/ssl.rst
index 752d76f6..2719af93 100644
--- a/docs/api/ssl.rst
+++ b/docs/api/ssl.rst
@@ -1,8 +1,8 @@
-Semi-Supervised Classifiers
-===========================
-Semi-Supervised classifiers implement the :class:`capymoa.learner.learners.ClassifierSSL` interface.
+Semi-Supervised Learners (SSL)
+==============================
+Semi-supervised classifiers implement the :class:`capymoa.base.ClassifierSSL` interface.
 
-.. automodule:: capymoa.learner.ssl.classifier
+.. automodule:: capymoa.ssl.classifier
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/docs/conf.py b/docs/conf.py
index 30d2a854..51933aa2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -25,6 +25,18 @@
     "myst_parser",
 ]
 
+nitpick_ignore_regex = [
+    ('py:class', r'sklearn\..*'),
+    ('py:class', r'numpy\..*'),
+    ('py:class', r'pathlib\..*'),
+    ('py:class', r'abc\..*'),
+    ('py:class', r'moa\..*'),
+    ('py:class', r'com\..*'),
+    ('py:class', r'java\..*'),
+    ('py:class', r'org\..*'),
+    ('py:class', r'torch\..*'),
+]
 bibtex_bibfiles = ['references.bib']
 
 autoclass_content = 'class'
@@ -45,11 +57,13 @@
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
-html_theme = "sphinx_book_theme"
+html_theme = "pydata_sphinx_theme"
 html_static_path = ['_static']
 
 # Setup symbolic links for notebooks
+python_maximum_signature_line_length = 88
+
 notebooks = Path("../notebooks")
 notebook_doc_source = Path("notebooks")
 if not notebook_doc_source.exists():
diff --git a/docs/README.md b/docs/contributing/docs.md
similarity index 100%
rename from docs/README.md
rename to docs/contributing/docs.md
diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst
new file mode 100644
index 00000000..8b552ca7
--- /dev/null
+++ b/docs/contributing/index.rst
@@ -0,0 +1,10 @@
+Contributing
+============
+This part of the documentation is for developers and contributors.
+
+.. toctree::
+   :maxdepth: 2
+
+   learners
+   tests
+   docs
diff --git a/docs/contributing/learners.md b/docs/contributing/learners.md
new file mode 100644
index 00000000..29cc5628
--- /dev/null
+++ b/docs/contributing/learners.md
@@ -0,0 +1,76 @@
+# Adding Learners
+This document describes adding a new classifier, regressor, or
+other learner to CapyMOA. Before doing this, you should have read the
+[installation guide](../installation.md) to set up your development environment.
+
+## Where does my new learner go?
+You should add your new learner to the appropriate directory:
+- Classifiers go in `src/capymoa/classifier`.
+- Regressors go in `src/capymoa/regressor`.
+- Semi-supervised classifiers go in `src/capymoa/ssl/classifier`.
+
+Each standalone learner should be in its own file, prefixed with `_` to indicate that it is not meant to be imported directly. Instead, learners are imported by an `__init__.py` file. The `__init__.py` file is a special file that tells Python to treat the directory as a package.
+
+For example, to add a new classifier class called `MyNewLearner`, you should implement it in `src/capymoa/classifier/_my_new_learner.py` and add it to the `src/capymoa/classifier/__init__.py` file. The `__init__.py` will look like this:
+```python
+from ._my_new_learner import MyNewLearner
+...
+__all__ = [
+    'MyNewLearner',
+    ...
+]
+```
+
+The prefix and init files allow users to import all classifiers, regressors,
+or semi-supervised classifiers from one package while splitting the code into multiple files. You can, for example, import your new learner with the following:
+```python
+from capymoa.classifier import MyNewLearner
+```
+
+## What does a learner implement?
+
+A learner should implement the appropriate interface:
+* `capymoa.base.Classifier` for classifiers.
+* `capymoa.base.Regressor` for regressors.
+* `capymoa.base.ClassifierSSL` for semi-supervised classifiers.
+
+If your method is a wrapper around a MOA learner, you should instead use the
+appropriate base class (see the sketch after this list):
+* `capymoa.base.MOAClassifier` for classifiers.
+* `capymoa.base.MOARegressor` for regressors.
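+
+As an illustrative sketch (not an actual CapyMOA learner), a minimal
+MOA-wrapping classifier only needs to hand a MOA object to the base class and
+override `__str__`; the import alias below is an assumption that follows the
+conventions used elsewhere in the codebase:
+```python
+from capymoa.base import MOAClassifier
+from moa.classifiers.bayes import NaiveBayes as _MOA_NaiveBayes
+
+
+class MyNewLearner(MOAClassifier):
+    """A hypothetical learner that wraps MOA's NaiveBayes."""
+
+    def __init__(self, schema=None, random_seed=1):
+        # The base class handles training and prediction via the MOA object.
+        super().__init__(
+            schema=schema, random_seed=random_seed, moa_learner=_MOA_NaiveBayes()
+        )
+
+    def __str__(self):
+        # Overrides the default class name reported by MOA.
+        return "MyNewLearner"
+```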
+
+## How do I test my new learner?
+You should add a test to ensure your learner achieves, and continues to achieve,
+the expected performance in future versions. CapyMOA provides parametrized
+tests for classifiers, regressors, and semi-supervised classifiers. You should
+not need to write any new test code. Instead, you should add your test's
+parameters to the appropriate test file:
+- `tests/test_classifiers.py` for classifiers.
+- `tests/test_regressors.py` for regressors.
+- `tests/test_ssl_classifiers.py` for semi-supervised classifiers.
+
+To run your tests, use the following command:
+```bash
+python -m pytest -k MyNewLearner
+```
+The `-k MyNewLearner` flag tells PyTest to run only the tests containing `MyNewLearner` in the test ID.
+
+* If you want to add documented example usage of your learner, you can add doctests.
+See the [testing guide](tests.md) for more information.
+
+* If you need custom test code for your learner, you can add a new test file in
+`tests`.
+
+## How do I document my new learner?
+You should add a docstring to your learner that describes the learner and its
+parameters. The docstring should be in the Sphinx format. Check the
+[documentation guide](docs.md) for more information and an example.
+
+## How to debug failed GitHub Actions?
+Before submitting your pull request, you may wish to run all tests to
+ensure your changes will succeed in GitHub Actions. You can run all tests with:
+```bash
+invoke test
+```
+If you run into issues with GitHub Actions failing to build the documentation,
+follow the instructions in the [documentation guide](docs.md) to build the
+documentation locally. The documentation build settings are intentionally
+strict to ensure the documentation builds correctly.
diff --git a/docs/testing.md b/docs/contributing/tests.md
similarity index 95%
rename from docs/testing.md
rename to docs/contributing/tests.md
index add6b1fd..1eacf348 100644
--- a/docs/testing.md
+++ b/docs/contributing/tests.md
@@ -1,6 +1,6 @@
 # Adding Tests
 Ensure you have installed the development dependencies by following the instructions
-in the [installation guide](installation.md). To run all tests, use the following command:
+in the [installation guide](../installation.md). To run all tests, use the following command:
 ```bash
 invoke test
 ```
diff --git a/docs/index.rst b/docs/index.rst
index a1b6f0a5..2a40b3f1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -33,7 +33,7 @@ and modules.
 .. toctree::
    :maxdepth: 2
 
-   api/api
+   api/index
 
 Contributing
 ------------
@@ -41,10 +41,8 @@ This part of the documentation is for developers and contributors.
 .. toctree::
    :maxdepth: 2
-   :caption: Contributing
 
-   testing
-   README
+   contributing/index
 
 Indices and tables
 ==================
diff --git a/invoke.yml b/invoke.yml
index 7421975d..913af84c 100644
--- a/invoke.yml
+++ b/invoke.yml
@@ -5,8 +5,14 @@ moa_url: "https://homepages.ecs.vuw.ac.nz/~antonlee/capymoa/versions/240412_moa.
 # What notebooks to skip when running them as tests.
 test_skip_notebooks:
-  - notebooks/04_drift_streams.ipynb
-  - notebooks/02_learners_api_examples.ipynb
-  - notebooks/Basic_Classification_Examples.ipynb
   - notebooks/00_getting_started.ipynb
+  - notebooks/01_evaluation_and_data_reading.ipynb
+  - notebooks/02_learners_api_examples.ipynb
   - notebooks/03_using_sklearn_pytorch.ipynb
+  - notebooks/04_drift_streams.ipynb
+  - notebooks/Basic_Classification_Examples.ipynb
+  - notebooks/Creating_new_classifier.ipynb
+  - notebooks/Data_Reading.ipynb
+  - notebooks/Preprocessing.ipynb
+  - notebooks/SSL_example.ipynb
+
diff --git a/pyproject.toml b/pyproject.toml
index 805811b0..30f2ba08 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,14 +33,15 @@ dev=[
    "jupyter",
    "nbmake",
    "pytest-xdist",
-    "invoke"
+    "invoke",
+    "wget"
 ]
 doc=[
    # Documentation generator
    "sphinx",
    # Theme for the documentation
-    "sphinx-book-theme",
+    "pydata-sphinx-theme",
    # Allows to include Jupyter notebooks in the documentation
    "sphinx-autobuild",
    # Allows to include Jupyter notebooks in the documentation
diff --git a/src/capymoa/__init__.py b/src/capymoa/__init__.py
index 4f3a3a77..757193c7 100644
--- a/src/capymoa/__init__.py
+++ b/src/capymoa/__init__.py
@@ -1,6 +1,4 @@
 from .prepare_jpype import _start_jpype
-
+# It is important that this is called before importing any other module
 _start_jpype()
-
-"""Whenever capymoa is imported, start jpype.
-"""
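+
+# A sketch of why this ordering matters (illustrative): once the JVM is up,
+# Java-backed modules resolve on import, e.g.
+#   import capymoa                                          # boots the JVM via jpype
+#   from moa.classifiers.meta import AdaptiveRandomForest   # now importable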
diff --git a/src/capymoa/_utils.py b/src/capymoa/_utils.py
index 0a5f9609..e663904b 100644
--- a/src/capymoa/_utils.py
+++ b/src/capymoa/_utils.py
@@ -37,7 +37,7 @@ def _get_moa_creation_CLI(moa_object):
    >>> from moa.streams import ConceptDriftStream
    ...
-    >>> stream = ConceptDriftStream()
+    >>> stream = ConceptDriftStream()
    >>> _get_moa_creation_CLI(stream)
    'streams.ConceptDriftStream'
    """
diff --git a/src/capymoa/learner/learners.py b/src/capymoa/base.py
similarity index 97%
rename from src/capymoa/learner/learners.py
rename to src/capymoa/base.py
index b368e6db..1fa86019 100644
--- a/src/capymoa/learner/learners.py
+++ b/src/capymoa/base.py
@@ -5,9 +5,8 @@
 from moa.classifiers import Classifier as MOA_Classifier_Interface
 from moa.core import Utils
 
-from capymoa.stream.instance import (Instance, LabeledInstance,
-                                     RegressionInstance)
-from capymoa.stream.stream import Schema
+from capymoa.instance import Instance, LabeledInstance, RegressionInstance
+from capymoa.stream._stream import Schema
 from capymoa.type_alias import LabelIndex, LabelProbabilities, TargetValue
 
 ##############################################################
@@ -153,7 +152,9 @@ def train(self, instance):
        self.moa_learner.trainOnInstance(instance.java_instance)
 
    def predict(self, instance):
-        return Utils.maxIndex(self.moa_learner.getVotesForInstance(instance.java_instance))
+        return Utils.maxIndex(
+            self.moa_learner.getVotesForInstance(instance.java_instance)
+        )
 
    def predict_proba(self, instance):
        return self.moa_learner.getVotesForInstance(instance.java_instance)
diff --git a/src/capymoa/classifier/__init__.py b/src/capymoa/classifier/__init__.py
new file mode 100644
index 00000000..3fd5388d
--- /dev/null
+++ b/src/capymoa/classifier/__init__.py
@@ -0,0 +1,15 @@
+from ._adaptive_random_forest import AdaptiveRandomForest
+from ._efdt import EFDT
+from ._hoeffding_tree import HoeffdingTree
+from ._naive_bayes import NaiveBayes
+from ._online_bagging import OnlineBagging
+from ._passive_aggressive_classifier import PassiveAggressiveClassifier
+
+__all__ = [
+    "AdaptiveRandomForest",
+    "EFDT",
+    "HoeffdingTree",
+    "NaiveBayes",
+    "OnlineBagging",
+    "PassiveAggressiveClassifier",
+]
diff --git a/src/capymoa/learner/classifier/classifiers.py b/src/capymoa/classifier/_adaptive_random_forest.py
similarity index 71%
rename from src/capymoa/learner/classifier/classifiers.py
rename to src/capymoa/classifier/_adaptive_random_forest.py
index 8fe9cf95..50ed0549 100644
--- a/src/capymoa/learner/classifier/classifiers.py
+++ b/src/capymoa/classifier/_adaptive_random_forest.py
@@ -1,18 +1,9 @@
-# Library imports
-from capymoa.learner.learners import (
+from capymoa.base import (
     MOAClassifier,
-    MOARegressor,
-    _get_moa_creation_CLI,
     _extract_moa_learner_CLI,
 )
 
-# MOA/Java imports
-from moa.classifiers import Classifier
-from moa.classifiers.meta import AdaptiveRandomForest as MOA_AdaptiveRandomForest
-from moa.classifiers.meta import OzaBag as MOA_OzaBag
-from moa.classifiers.meta import (
-    AdaptiveRandomForestRegressor as MOA_AdaptiveRandomForestRegressor,
-)
+from moa.classifiers.meta import AdaptiveRandomForest as _MOA_AdaptiveRandomForest
 
 # TODO: replace the m_features_mode logic such that we can infer from m_features_per_tree_size, e.g.
if value is double between 0.0 and 1.0 = percentage @@ -85,29 +76,5 @@ def __init__( schema=schema, CLI=CLI, random_seed=random_seed, - moa_learner=MOA_AdaptiveRandomForest(), - ) - - -class OnlineBagging(MOAClassifier): - def __init__( - self, schema=None, CLI=None, random_seed=1, base_learner=None, ensemble_size=100 - ): - # This method basically configures the CLI, object creation is delegated to MOAClassifier (the super class, through super().__init___())) - # Initialize instance attributes with default values, if the CLI was not set. - if CLI is None: - self.base_learner = ( - "trees.HoeffdingTree" - if base_learner is None - else _extract_moa_learner_CLI(base_learner) - ) - self.ensemble_size = ensemble_size - CLI = f"-l {self.base_learner} -s {self.ensemble_size}" - - super().__init__( - schema=schema, CLI=CLI, random_seed=random_seed, moa_learner=MOA_OzaBag() + moa_learner=_MOA_AdaptiveRandomForest(), ) - - def __str__(self): - # Overrides the default class name from MOA (OzaBag) - return "OnlineBagging" diff --git a/src/capymoa/learner/classifier/efdt.py b/src/capymoa/classifier/_efdt.py similarity index 97% rename from src/capymoa/learner/classifier/efdt.py rename to src/capymoa/classifier/_efdt.py index 28eb13b4..ba399790 100644 --- a/src/capymoa/learner/classifier/efdt.py +++ b/src/capymoa/classifier/_efdt.py @@ -1,8 +1,8 @@ from __future__ import annotations from typing import Union -from capymoa.learner import MOAClassifier -from capymoa.learner.splitcriteria import SplitCriterion, _split_criterion_to_cli_str +from capymoa.base import MOAClassifier +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str from capymoa.stream import Schema from capymoa._utils import build_cli_str_from_mapping_and_locals diff --git a/src/capymoa/learner/classifier/hoeffding_tree.py b/src/capymoa/classifier/_hoeffding_tree.py similarity index 96% rename from src/capymoa/learner/classifier/hoeffding_tree.py rename to src/capymoa/classifier/_hoeffding_tree.py index 3e7656f8..0fc0fe5b 100644 --- a/src/capymoa/learner/classifier/hoeffding_tree.py +++ b/src/capymoa/classifier/_hoeffding_tree.py @@ -1,8 +1,8 @@ from __future__ import annotations from typing import Union -from capymoa.learner import MOAClassifier -from capymoa.learner.splitcriteria import SplitCriterion, _split_criterion_to_cli_str +from capymoa.base import MOAClassifier +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str from capymoa.stream import Schema from capymoa._utils import build_cli_str_from_mapping_and_locals diff --git a/src/capymoa/learner/classifier/naive_bayes.py b/src/capymoa/classifier/_naive_bayes.py similarity index 74% rename from src/capymoa/learner/classifier/naive_bayes.py rename to src/capymoa/classifier/_naive_bayes.py index 4320a4a0..f18bf68f 100644 --- a/src/capymoa/learner/classifier/naive_bayes.py +++ b/src/capymoa/classifier/_naive_bayes.py @@ -1,7 +1,7 @@ from __future__ import annotations import typing -from capymoa.learner import MOAClassifier +from capymoa.base import MOAClassifier from capymoa.stream import Schema import moa.classifiers.bayes as moa_bayes @@ -12,18 +12,14 @@ class NaiveBayes(MOAClassifier): Performs classic Bayesian prediction while making the naive assumption that all inputs are independent. Naive Bayes is a classifier algorithm known for its simplicity and low computational cost. Given n different classes, the trained Naive Bayes classifier predicts, for every unlabeled instance I, the class C to which it belongs with high accuracy. 
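+
+    A minimal usage sketch (illustrative; assumes the bundled ElectricityTiny
+    dataset is available for download):
+
+    >>> from capymoa.datasets import ElectricityTiny
+    >>> from capymoa.classifier import NaiveBayes
+    >>> stream = ElectricityTiny()
+    >>> learner = NaiveBayes(schema=stream.get_schema())
+    >>> learner.train(stream.next_instance())
+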
:param schema: The schema of the stream, defaults to None. - :type schema: object, optional :param random_seed: The random seed passed to the MOA learner, defaults to 0. - :type random_seed: int, optional """ - def __init__(self, schema: typing.Union[Schema, None] = None, random_seed: int = 0): - super(NaiveBayes, self).__init__(moa_learner=moa_bayes.NaiveBayes(), - schema=schema, - random_seed=random_seed) + super(NaiveBayes, self).__init__( + moa_learner=moa_bayes.NaiveBayes(), schema=schema, random_seed=random_seed + ) def __str__(self): # Overrides the default class name from MOA (OzaBag) return "Naive Bayes CapyMOA Classifier" - diff --git a/src/capymoa/classifier/_online_bagging.py b/src/capymoa/classifier/_online_bagging.py new file mode 100644 index 00000000..f27c1b34 --- /dev/null +++ b/src/capymoa/classifier/_online_bagging.py @@ -0,0 +1,29 @@ +from capymoa.base import ( + MOAClassifier, + _extract_moa_learner_CLI, +) + +from moa.classifiers.meta import OzaBag as _MOA_OzaBag + +class OnlineBagging(MOAClassifier): + def __init__( + self, schema=None, CLI=None, random_seed=1, base_learner=None, ensemble_size=100 + ): + # This method basically configures the CLI, object creation is delegated to MOAClassifier (the super class, through super().__init___())) + # Initialize instance attributes with default values, if the CLI was not set. + if CLI is None: + self.base_learner = ( + "trees.HoeffdingTree" + if base_learner is None + else _extract_moa_learner_CLI(base_learner) + ) + self.ensemble_size = ensemble_size + CLI = f"-l {self.base_learner} -s {self.ensemble_size}" + + super().__init__( + schema=schema, CLI=CLI, random_seed=random_seed, moa_learner=_MOA_OzaBag() + ) + + def __str__(self): + # Overrides the default class name from MOA (OzaBag) + return "OnlineBagging" diff --git a/src/capymoa/learner/classifier/sklearn.py b/src/capymoa/classifier/_passive_aggressive_classifier.py similarity index 95% rename from src/capymoa/learner/classifier/sklearn.py rename to src/capymoa/classifier/_passive_aggressive_classifier.py index 07acb964..30826e63 100644 --- a/src/capymoa/learner/classifier/sklearn.py +++ b/src/capymoa/classifier/_passive_aggressive_classifier.py @@ -1,10 +1,10 @@ from typing import Optional, Dict, Union, Literal -from capymoa.learner.learners import Classifier +from capymoa.base import Classifier from sklearn.linear_model import ( PassiveAggressiveClassifier as skPassiveAggressiveClassifier, ) -from capymoa.stream.instance import Instance, LabeledInstance -from capymoa.stream.stream import Schema +from capymoa.instance import Instance, LabeledInstance +from capymoa.stream._stream import Schema from capymoa.type_alias import LabelIndex, LabelProbabilities import numpy as np @@ -21,7 +21,7 @@ class PassiveAggressiveClassifier(Classifier): `_ >>> from capymoa.datasets import ElectricityTiny - >>> from capymoa.learner.classifier import PassiveAggressiveClassifier + >>> from capymoa.classifier import PassiveAggressiveClassifier >>> from capymoa.evaluation import prequential_evaluation >>> stream = ElectricityTiny() >>> schema = stream.get_schema() diff --git a/src/capymoa/datasets/__init__.py b/src/capymoa/datasets/__init__.py index c188cb33..cc1ee4cc 100644 --- a/src/capymoa/datasets/__init__.py +++ b/src/capymoa/datasets/__init__.py @@ -1,4 +1,4 @@ -from .datasets import ( +from ._datasets import ( CovtFD, Covtype, RBFm_100k, @@ -6,11 +6,9 @@ Hyper100k, Sensor, ElectricityTiny, - Fried -) -from .downloader import ( - get_download_dir + Fried, ) +from .downloader import 
get_download_dir __all__ = [ "Hyper100k", @@ -21,5 +19,5 @@ "Sensor", "ElectricityTiny", "Fried", - "get_download_dir" + "get_download_dir", ] diff --git a/src/capymoa/datasets/datasets.py b/src/capymoa/datasets/_datasets.py similarity index 99% rename from src/capymoa/datasets/datasets.py rename to src/capymoa/datasets/_datasets.py index 1d7d90e8..e006ca4e 100644 --- a/src/capymoa/datasets/datasets.py +++ b/src/capymoa/datasets/_datasets.py @@ -51,6 +51,7 @@ class ElectricityTiny(DownloadARFFGzip): filename = "electricity_tiny.arff" remote_url = ROOT_URL + class CovtypeTiny(DownloadARFFGzip): """A truncated version of the Covtype dataset with 1000 instances.""" diff --git a/src/capymoa/datasets/downloader.py b/src/capymoa/datasets/downloader.py index 83e5d94b..07c2d8e9 100644 --- a/src/capymoa/datasets/downloader.py +++ b/src/capymoa/datasets/downloader.py @@ -5,12 +5,12 @@ from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional -import shutil import wget from moa.streams import ArffFileStream -from capymoa.stream.stream import Stream +from capymoa.stream._stream import Stream + def get_download_dir(): """A default directory to store datasets in. Defaults to `./data` when the @@ -18,6 +18,7 @@ def get_download_dir(): """ return environ.get("CAPYMOA_DATASETS_DIR", "data") + class DownloadableDataset(ABC, Stream): filename: str = None """Name of the dataset in the capymoa dataset directory""" @@ -54,7 +55,7 @@ def _resolve_dataset(self, auto_download: bool, directory: Path): ) return stream - + def get_path(self): return self._path diff --git a/src/capymoa/evaluation/__init__.py b/src/capymoa/evaluation/__init__.py index 80e9dbe2..d87cba53 100644 --- a/src/capymoa/evaluation/__init__.py +++ b/src/capymoa/evaluation/__init__.py @@ -10,7 +10,7 @@ RegressionEvaluator, ) -__ALL__ = [ +__all__ = [ "prequential_evaluation", "prequential_SSL_evaluation", "test_then_train_evaluation", diff --git a/src/capymoa/evaluation/evaluation.py b/src/capymoa/evaluation/evaluation.py index 2732e536..a989e0ea 100644 --- a/src/capymoa/evaluation/evaluation.py +++ b/src/capymoa/evaluation/evaluation.py @@ -5,8 +5,8 @@ import warnings import random -from capymoa.stream.stream import Schema, Stream -from capymoa.learner.learners import ClassifierSSL +from capymoa.stream import Schema, Stream +from capymoa.base import ClassifierSSL from com.yahoo.labs.samoa.instances import Instances, Attribute, DenseInstance from moa.core import InstanceExample @@ -106,13 +106,19 @@ def update(self, y_target_index: int, y_pred_index: Optional[int]): :raises ValueError: If the values are not valid indexes in the schema. """ if not isinstance(y_target_index, (np.integer, int)): - raise ValueError(f"y_target_index must be an integer, not {type(y_target_index)}") + raise ValueError( + f"y_target_index must be an integer, not {type(y_target_index)}" + ) if not (y_pred_index is None or isinstance(y_pred_index, (np.integer, int))): - raise ValueError(f"y_pred_index must be an integer, not {type(y_pred_index)}") + raise ValueError( + f"y_pred_index must be an integer, not {type(y_pred_index)}" + ) - # If the prediction is invalid, it could mean the classifier is abstaining from making a prediction; + # If the prediction is invalid, it could mean the classifier is abstaining from making a prediction; # thus, it is allowed to continue (unless parameterized differently). 
- if y_pred_index is not None and not self.schema.is_y_index_in_range(y_pred_index): + if y_pred_index is not None and not self.schema.is_y_index_in_range( + y_pred_index + ): if self.allow_abstaining: y_pred_index = None else: @@ -131,7 +137,7 @@ def update(self, y_target_index: int, y_pred_index: Optional[int]): # if y_pred is None, it indicates the learner did not produce a prediction for this instance, # count as an error if y_pred_index is None: - # TODO: I'm not sure what the actual logic should be here, but for + # TODO: I'm not sure what the actual logic should be here, but for # now I'm just setting the prediction to the first class since this # does not break the tests. y_pred_index = 0 @@ -153,9 +159,7 @@ def update(self, y_target_index: int, y_pred_index: Optional[int]): # If the window_size is set, then check if it should record the intermediary results. if self.window_size is not None and self.instances_seen % self.window_size == 0: - performance_values = ( - self.metrics() - ) + performance_values = self.metrics() self.result_windows.append(performance_values) def metrics_header(self): @@ -172,7 +176,10 @@ def metrics(self): ] def metrics_dict(self): - return {header: value for header, value in zip(self.metrics_header(), self.metrics())} + return { + header: value + for header, value in zip(self.metrics_header(), self.metrics()) + } def metrics_per_window(self): return pd.DataFrame(self.result_windows, columns=self.metrics_header()) @@ -435,8 +442,8 @@ def test_then_train_evaluation( "cumulative": evaluator, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, } return results @@ -713,8 +720,8 @@ def prequential_SSL_evaluation( "windowed": evaluator_windowed, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, "unlabeled": unlabeled_counter, "unlabeled_ratio": unlabeled_counter / instancesProcessed, } @@ -788,8 +795,8 @@ def _test_then_train_evaluation_fast( "cumulative": evaluator, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, } return results @@ -855,8 +862,8 @@ def _prequential_evaluation_fast(stream, learner, max_instances=None, window_siz "windowed": windowed_evaluator, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, } return results @@ -942,8 +949,8 @@ def test_then_train_SSL_evaluation_fast( "cumulative": evaluator, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, } for measure in moa_results.otherMeasurements.keySet(): @@ -1019,8 +1026,8 @@ def prequential_SSL_evaluation_fast( "windowed": windowed_evaluator, "wallclock": elapsed_wallclock_time, "cpu_time": elapsed_cpu_time, - "max_instances":max_instances, - "stream":stream, + "max_instances": max_instances, + "stream": stream, "other_measurements": dict(moa_results.otherMeasurements), } @@ -1087,7 +1094,6 @@ def prequential_evaluation_multiple_learners( else: y = instance.y_value - results[learner_name]["cumulative"].update(y, prediction) if window_size is not None: results[learner_name]["windowed"].update(y, 
prediction) diff --git a/src/capymoa/evaluation/visualization.py b/src/capymoa/evaluation/visualization.py index ce0b9966..a49b849c 100644 --- a/src/capymoa/evaluation/visualization.py +++ b/src/capymoa/evaluation/visualization.py @@ -4,10 +4,18 @@ from com.yahoo.labs.samoa.instances import InstancesHeader -def plot_windowed_results(*results, metric="classifications correct (percent)", - plot_title=None, xlabel=None, ylabel=None, - figure_path="./", figure_name=None, save_only=True - ): +def plot_windowed_results( + *results, + metric="classifications correct (percent)", + plot_title=None, + xlabel=None, + ylabel=None, + figure_path="./", + figure_name=None, + save_only=True, + # , + # drift_locations=None, gradual_drift_window_lengths=None +): """ Plot a comparison of values from multiple evaluators based on a selected column using line plots. It assumes the results contain windowed results ('windowed') which often originate from metrics_per_window() @@ -17,34 +25,36 @@ def plot_windowed_results(*results, metric="classifications correct (percent)", """ dfs = [] labels = [] - - num_instances = results[0].get('max_instances', None) - stream = results[0].get('stream', None) + + num_instances = results[0].get("max_instances", None) + stream = results[0].get("stream", None) if num_instances is not None: - window_size = results[0]['windowed'].window_size - num_windows = results[0]['windowed'].metrics_per_window().shape[0] + window_size = results[0]["windowed"].window_size + num_windows = results[0]["windowed"].metrics_per_window().shape[0] x_values = [] - for i in range(1, num_windows+1): + for i in range(1, num_windows + 1): x_values.append(i * window_size) # print(f'x_values: {x_values}') # Check if the given metric exists in all DataFrames for result in results: - df = result['windowed'].metrics_per_window() + df = result["windowed"].metrics_per_window() if metric not in df.columns: - print(f"Column '{metric}' not found in metrics DataFrame for {result['learner']}. Skipping.") + print( + f"Column '{metric}' not found in metrics DataFrame for {result['learner']}. Skipping." 
+ ) else: dfs.append(df) - if 'experiment_id' in result: - labels.append(result['experiment_id']) + if "experiment_id" in result: + labels.append(result["experiment_id"]) else: - labels.append(result['learner']) - + labels.append(result["learner"]) + if not dfs: print("No valid DataFrames to plot.") return - + # Create a figure plt.figure(figsize=(12, 5)) @@ -52,10 +62,23 @@ def plot_windowed_results(*results, metric="classifications correct (percent)", for i, df in enumerate(dfs): # print(f'df.index: {df.index}') if num_instances is not None: - plt.plot(x_values, df[metric], label=labels[i], marker='o', linestyle='-', markersize=5) + plt.plot( + x_values, + df[metric], + label=labels[i], + marker="o", + linestyle="-", + markersize=5, + ) else: - plt.plot(df.index, df[metric], label=labels[i], marker='o', linestyle='-', markersize=5) - + plt.plot( + df.index, + df[metric], + label=labels[i], + marker="o", + linestyle="-", + markersize=5, + ) if stream is not None and isinstance(stream, DriftStream): drifts = stream.get_drifts() @@ -66,43 +89,52 @@ def plot_windowed_results(*results, metric="classifications correct (percent)", # Add vertical lines at drift locations if drift_locations: for location in drift_locations: - plt.axvline(location, color='red', linestyle='-') - + plt.axvline(location, color="red", linestyle="-") + # Add gradual drift windows as 70% transparent rectangles if gradual_drift_window_lengths: if not drift_locations: - print("Error: gradual_drift_window_lengths is provided, but drift_locations is not.") + print( + "Error: gradual_drift_window_lengths is provided, but drift_locations is not." + ) return - + if len(drift_locations) != len(gradual_drift_window_lengths): - print("Error: drift_locations and gradual_drift_window_lengths must have the same length.") + print( + "Error: drift_locations and gradual_drift_window_lengths must have the same length." + ) return - + for i in range(len(drift_locations)): location = drift_locations[i] window_length = gradual_drift_window_lengths[i] - + # Plot the 70% transparent rectangle - plt.axvspan(location - window_length / 2, location + window_length / 2, alpha=0.2, color='red') - + plt.axvspan( + location - window_length / 2, + location + window_length / 2, + alpha=0.2, + color="red", + ) + # Add labels and title - xlabel = xlabel if xlabel is not None else '# Instances' + xlabel = xlabel if xlabel is not None else "# Instances" plt.xlabel(xlabel) ylabel = ylabel if ylabel is not None else metric plt.ylabel(ylabel) plot_title = plot_title if plot_title is not None else metric plt.title(plot_title) - + # Add legend plt.legend() plt.grid(True) - + # Show the plot or save it to the specified path if save_only == False: plt.show() elif figure_path is not None: if figure_name is None: - figure_name = result['learner'] + "_" + ylabel.replace(' ', '') + figure_name = result["learner"] + "_" + ylabel.replace(" ", "") plt.savefig(figure_path + figure_name) diff --git a/src/capymoa/stream/instance.py b/src/capymoa/instance.py similarity index 97% rename from src/capymoa/stream/instance.py rename to src/capymoa/instance.py index f33b8091..d582919d 100644 --- a/src/capymoa/stream/instance.py +++ b/src/capymoa/instance.py @@ -62,7 +62,7 @@ def from_array(cls, schema: "Schema", instance: FeatureVector) -> "Instance": >>> from capymoa.stream import Schema ... - >>> from capymoa.stream.instance import Instance + >>> from capymoa.instance import Instance >>> import numpy as np >>> schema = Schema.from_custom( ... 
["f1", "f2"], @@ -146,7 +146,7 @@ class LabeledInstance(Instance): >>> from capymoa.datasets import ElectricityTiny ... - >>> from capymoa.stream.instance import LabeledInstance + >>> from capymoa.instance import LabeledInstance >>> stream = ElectricityTiny() >>> instance: LabeledInstance = stream.next_instance() >>> instance.y_label @@ -182,7 +182,7 @@ def from_array( >>> from capymoa.stream import Schema ... - >>> from capymoa.stream.instance import LabeledInstance + >>> from capymoa.instance import LabeledInstance >>> import numpy as np >>> schema = Schema.from_custom( ... ["f1", "f2"], @@ -253,7 +253,7 @@ class RegressionInstance(Instance): >>> from capymoa.datasets import Fried ... - >>> from capymoa.stream.instance import RegressionInstance + >>> from capymoa.instance import RegressionInstance >>> stream = Fried() >>> instance: RegressionInstance = stream.next_instance() >>> instance.y_value @@ -286,7 +286,7 @@ def from_array( >>> from capymoa.stream import Schema ... - >>> from capymoa.stream.instance import LabeledInstance + >>> from capymoa.instance import LabeledInstance >>> import numpy as np >>> schema = Schema.from_custom( ... ["f1", "f2"], diff --git a/src/capymoa/learner/__init__.py b/src/capymoa/learner/__init__.py deleted file mode 100644 index e8d2355e..00000000 --- a/src/capymoa/learner/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from .learners import ( - Classifier, - MOAClassifier, - ClassifierSSL, - MOAClassifierSSL, - Regressor, - MOARegressor, - SKClassifier, -) - - -__ALL__ = [ - "Classifier", - "MOAClassifier", - "ClassifierSSL", - "MOAClassifierSSL", - "Regressor", - "MOARegressor", - "SKClassifier", -] diff --git a/src/capymoa/learner/classifier/__init__.py b/src/capymoa/learner/classifier/__init__.py deleted file mode 100644 index 51c8b531..00000000 --- a/src/capymoa/learner/classifier/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .classifiers import AdaptiveRandomForest, OnlineBagging, AdaptiveRandomForest -from .efdt import EFDT -from .sklearn import PassiveAggressiveClassifier -from .hoeffding_tree import HoeffdingTree -from .naive_bayes import NaiveBayes - -__all__ = [ - "AdaptiveRandomForest", - "OnlineBagging", - "AdaptiveRandomForest", - "EFDT", - "HoeffdingTree", - "NaiveBayes", - "PassiveAggressiveClassifier", -] diff --git a/src/capymoa/learner/regressor/__init__.py b/src/capymoa/learner/regressor/__init__.py deleted file mode 100644 index cead1fa7..00000000 --- a/src/capymoa/learner/regressor/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .regressors import ( - KNNRegressor, - AdaptiveRandomForestRegressor, - FIMTDD, - ARFFIMTDD, - ORTO, - SOKNLBT, - SOKNL, -) - -__all__ = [ - "KNNRegressor", - "AdaptiveRandomForestRegressor", - "FIMTDD", - "ARFFIMTDD", - "ORTO", - "SOKNLBT", - "SOKNL", -] diff --git a/src/capymoa/learner/regressor/regressors.py b/src/capymoa/learner/regressor/regressors.py deleted file mode 100644 index d717ab48..00000000 --- a/src/capymoa/learner/regressor/regressors.py +++ /dev/null @@ -1,493 +0,0 @@ -# Library imports -from typing import Optional, Union - -from capymoa.learner.learners import ( - MOARegressor, -) - -from capymoa.learner.splitcriteria import SplitCriterion, _split_criterion_to_cli_str -from capymoa.stream.stream import Schema -from moa.classifiers.lazy import kNN as MOA_kNN -from moa.classifiers.meta import ( - AdaptiveRandomForestRegressor as MOA_AdaptiveRandomForestRegressor, - SelfOptimisingKNearestLeaves as MOA_SOKNL, -) -from moa.classifiers.trees import ( - FIMTDD as _MOA_FIMTDD, - ARFFIMTDD as 
_MOA_ARFFIMTDD, - ORTO as _MOA_ORTO, - SelfOptimisingBaseTree as _MOA_SelfOptimisingBaseTree, -) - - -######################## -######### TREES ######## -######################## -class FIMTDD(MOARegressor): - """Implementation of the FIMT-DD tree as described by Ikonomovska et al.""" - - def __init__( - self, - schema: Schema, - split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", - grace_period: int = 200, - split_confidence: float = 1.0e-7, - tie_threshold: float = 0.05, - page_hinckley_alpha: float = 0.005, - page_hinckley_threshold: int = 50, - alternate_tree_fading_factor: float = 0.995, - alternate_tree_t_min: int = 150, - alternate_tree_time: int = 1500, - regression_tree: bool = False, - learning_ratio: float = 0.02, - learning_ratio_decay_factor: float = 0.001, - learning_ratio_const: bool = False, - random_seed: Optional[int] = None, - ) -> None: - """ - Construct FIMTDD. - - :param split_criterion: Split criterion to use. - :param grace_period: Number of instances a leaf should observe between split attempts. - :param split_confidence: Allowed error in split decision, values close to 0 will take long to decide. - :param tie_threshold: Threshold below which a split will be forced to break ties. - :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. - :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. - :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. - :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. - :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. - :param regression_tree: Build a regression tree instead of a model tree. - :param learning_ratio: Learning ratio to used for training the Perceptrons in the leaves. - :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). - :param learning_ratio_const: Keep learning rate constant instead of decaying. 
- """ - cli = [] - - cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") - cli.append(f"-g {grace_period}") - cli.append(f"-c {split_confidence}") - cli.append(f"-t {tie_threshold}") - cli.append(f"-a {page_hinckley_alpha}") - cli.append(f"-h {page_hinckley_threshold}") - cli.append(f"-f {alternate_tree_fading_factor}") - cli.append(f"-y {alternate_tree_t_min}") - cli.append(f"-u {alternate_tree_time}") - cli.append("-e") if regression_tree else None - cli.append(f"-l {learning_ratio}") - cli.append(f"-d {learning_ratio_decay_factor}") - cli.append("-p") if learning_ratio_const else None - - self.moa_learner = _MOA_FIMTDD() - - super().__init__( - schema=schema, - CLI=" ".join(cli), - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - -class ARFFIMTDD(MOARegressor): - """Modified Fast Incremental Model Tree with Drift Detection for basic - learner for ARF-Regas described by Ikonomovska et al.""" - - def __init__( - self, - schema: Schema, - subspace_size_size: int = 2, - split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", - grace_period: int = 200, - split_confidence: float = 1.0e-7, - tie_threshold: float = 0.05, - page_hinckley_alpha: float = 0.005, - page_hinckley_threshold: int = 50, - alternate_tree_fading_factor: float = 0.995, - alternate_tree_t_min: int = 150, - alternate_tree_time: int = 1500, - learning_ratio: float = 0.02, - learning_ratio_decay_factor: float = 0.001, - learning_ratio_const: bool = False, - random_seed: Optional[int] = None, - ) -> None: - """ - Construct ARFFIMTDD. - - :param subspace_size_size: Number of features per subset for each node split. Negative values = #features - k - :param split_criterion: Split criterion to use. - :param grace_period: Number of instances a leaf should observe between split attempts. - :param split_confidence: Allowed error in split decision, values close to 0 will take long to decide. - :param tie_threshold: Threshold below which a split will be forced to break ties. - :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. - :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. - :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. - :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. - :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. - :param learning_ratio: Learning ratio to used for training the Perceptrons in the leaves. - :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). - :param learning_ratio_const: Keep learning rate constant instead of decaying. 
- """ - cli = [] - - cli.append(f"-k {subspace_size_size}") - cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") - cli.append(f"-g {grace_period}") - cli.append(f"-c {split_confidence}") - cli.append(f"-t {tie_threshold}") - cli.append(f"-a {page_hinckley_alpha}") - cli.append(f"-h {page_hinckley_threshold}") - cli.append(f"-f {alternate_tree_fading_factor}") - cli.append(f"-y {alternate_tree_t_min}") - cli.append(f"-u {alternate_tree_time}") - cli.append(f"-l {learning_ratio}") - cli.append(f"-d {learning_ratio_decay_factor}") - cli.append("-p") if learning_ratio_const else None - - self.moa_learner = _MOA_ARFFIMTDD() - - super().__init__( - schema=schema, - CLI=" ".join(cli), - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - -class ORTO(MOARegressor): - """Implementation of the ORTO tree as described by Ikonomovska et al.""" - - def __init__( - self, - schema: Schema, - max_trees: int = 10, - max_option_level: int = 10, - option_decay_factor: float = 0.9, - option_fading_factor: float = 0.9995, - split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", - grace_period: int = 200, - split_confidence: float = 1.0e-7, - tie_threshold: float = 0.05, - page_hinckley_alpha: float = 0.005, - page_hinckley_threshold: int = 50, - alternate_tree_fading_factor: float = 0.995, - alternate_tree_t_min: int = 150, - alternate_tree_time: int = 1500, - regression_tree: bool = False, - learning_ratio: float = 0.02, - learning_ratio_decay_factor: float = 0.001, - learning_ratio_const: bool = False, - random_seed: Optional[int] = None, - ) -> None: - """ - Construct ORTO. - - :param max_trees: The maximum number of trees contained in the option tree. - :param max_option_level: The maximal depth at which option nodes can be created. - :param option_decay_factor: The option decay factor that determines how many options can be selected at a given level. - :param option_fading_factor: The fading factor used for comparing subtrees of an option node. - :param split_criterion: Split criterion to use. - :param grace_period: Number of instances a leaf should observe between split attempts. - :param split_confidence: Allowed error in split decision, values close to 0 will take long to decide. - :param tie_threshold: Threshold below which a split will be forced to break ties. - :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. - :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. - :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. - :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. - :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. - :param regression_tree: Build a regression tree instead of a model tree. - :param learning_ratio: Learning ratio to used for training the Perceptrons in the leaves. - :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). - :param learning_ratio_const: Keep learning rate constant instead of decaying. 
- """ - cli = [] - - cli.append(f"-m {max_trees}") - cli.append(f"-x {max_option_level}") - cli.append(f"-z {option_decay_factor}") - cli.append(f"-q {option_fading_factor}") - cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") - cli.append(f"-g {grace_period}") - cli.append(f"-c {split_confidence}") - cli.append(f"-t {tie_threshold}") - cli.append(f"-a {page_hinckley_alpha}") - cli.append(f"-h {page_hinckley_threshold}") - cli.append(f"-f {alternate_tree_fading_factor}") - cli.append(f"-y {alternate_tree_t_min}") - cli.append(f"-u {alternate_tree_time}") - cli.append("-e") if regression_tree else None - cli.append(f"-l {learning_ratio}") - cli.append(f"-d {learning_ratio_decay_factor}") - cli.append("-p") if learning_ratio_const else None - - self.moa_learner = _MOA_ORTO() - - super().__init__( - schema=schema, - CLI=" ".join(cli), - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - -class SOKNLBT(MOARegressor): - """Implementation of the FIMT-DD tree as described by Ikonomovska et al.""" - - def __init__( - self, - schema: Schema, - subspace_size_size: int = 2, - split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", - grace_period: int = 200, - split_confidence: float = 1.0e-7, - tie_threshold: float = 0.05, - page_hinckley_alpha: float = 0.005, - page_hinckley_threshold: int = 50, - alternate_tree_fading_factor: float = 0.995, - alternate_tree_t_min: int = 150, - alternate_tree_time: int = 1500, - learning_ratio: float = 0.02, - learning_ratio_decay_factor: float = 0.001, - learning_ratio_const: bool = False, - random_seed: Optional[int] = None, - ) -> None: - """ - Construct SelfOptimisingBaseTree. - - :param subspace_size_size: Number of features per subset for each node split. Negative values = #features - k - :param split_criterion: Split criterion to use. - :param grace_period: Number of instances a leaf should observe between split attempts. - :param split_confidence: Allowed error in split decision, values close to 0 will take long to decide. - :param tie_threshold: Threshold below which a split will be forced to break ties. - :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. - :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. - :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. - :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. - :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. - :param learning_ratio: Learning ratio to used for training the Perceptrons in the leaves. - :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). - :param learning_ratio_const: Keep learning rate constant instead of decaying. 
- """ - cli = [] - - cli.append(f"-k {subspace_size_size}") - cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") - cli.append(f"-g {grace_period}") - cli.append(f"-c {split_confidence}") - cli.append(f"-t {tie_threshold}") - cli.append(f"-a {page_hinckley_alpha}") - cli.append(f"-h {page_hinckley_threshold}") - cli.append(f"-f {alternate_tree_fading_factor}") - cli.append(f"-y {alternate_tree_t_min}") - cli.append(f"-u {alternate_tree_time}") - cli.append(f"-l {learning_ratio}") - cli.append(f"-d {learning_ratio_decay_factor}") - cli.append("-p") if learning_ratio_const else None - - self.moa_learner = _MOA_SelfOptimisingBaseTree() - - super().__init__( - schema=schema, - CLI=" ".join(cli), - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - -######################## -######### LAZY ######### -######################## - - -class KNNRegressor(MOARegressor): - """ - The default number of neighbors (k) is set to 3 instead of 10 (as in MOA) - """ - - def __init__( - self, schema=None, CLI=None, random_seed=1, k=3, median=False, window_size=1000 - ): - # Important, should create the MOA object before invoking the super class __init__ - self.moa_learner = MOA_kNN() - super().__init__( - schema=schema, - CLI=CLI, - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - # Initialize instance attributes with default values, CLI was not set. - if self.CLI is None: - self.k = k - self.median = median - self.window_size = window_size - self.moa_learner.getOptions().setViaCLIString( - f"-k {self.k} {'-m' if self.median else ''} -w \ - {self.window_size}" - ) - self.moa_learner.prepareForUse() - self.moa_learner.resetLearning() - - def __str__(self): - # Overrides the default class name from MOA - return "kNNRegressor" - - -######################## -####### ENSEMBLES ###### -######################## - - -# TODO: replace the m_features_mode logic such that we can infer from m_features_per_tree_size, e.g. if value is double between 0.0 and 1.0 = percentage -class AdaptiveRandomForestRegressor(MOARegressor): - def __init__( - self, - schema=None, - CLI=None, - random_seed=1, - tree_learner=None, - ensemble_size=100, - max_features=0.6, - lambda_param=6.0, # m_features_mode=None, m_features_per_tree_size=60, - drift_detection_method=None, - warning_detection_method=None, - disable_drift_detection=False, - disable_background_learner=False, - ): - # Important: must create the MOA object before invoking the super class __init__ - self.moa_learner = MOA_AdaptiveRandomForestRegressor() - super().__init__( - schema=schema, - CLI=CLI, - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - # Initialize instance attributes with default values, CLI was not set. 
- if self.CLI is None: - self.tree_learner = ( - ARFFIMTDD(schema, grace_period=50, split_confidence=0.01) - if tree_learner is None - else tree_learner - ) - self.ensemble_size = ensemble_size - - self.max_features = max_features - if isinstance(self.max_features, float) and 0.0 <= self.max_features <= 1.0: - self.m_features_mode = "(Percentage (M * (m / 100)))" - self.m_features_per_tree_size = int(self.max_features * 100) - elif isinstance(self.max_features, int): - self.m_features_mode = "(Specified m (integer value))" - self.m_features_per_tree_size = max_features - elif self.max_features in ["sqrt"]: - self.m_features_mode = "(sqrt(M)+1)" - self.m_features_per_tree_size = -1 # or leave it unchanged - elif self.max_features is None: - self.m_features_mode = "(Percentage (M * (m / 100)))" - self.m_features_per_tree_size = 60 - else: - # Handle other cases or raise an exception if needed - raise ValueError("Invalid value for max_features") - - # self.m_features_mode = "(Percentage (M * (m / 100)))" if m_features_mode is None else m_features_mode - # self.m_features_per_tree_size = m_features_per_tree_size - self.lambda_param = lambda_param - self.drift_detection_method = ( - "(ADWINChangeDetector -a 1.0E-3)" - if drift_detection_method is None - else drift_detection_method - ) - self.warning_detection_method = ( - "(ADWINChangeDetector -a 1.0E-2)" - if warning_detection_method is None - else warning_detection_method - ) - self.disable_drift_detection = disable_drift_detection - self.disable_background_learner = disable_background_learner - - self.moa_learner.getOptions().setViaCLIString( - f"-l {self.tree_learner} -s {self.ensemble_size} -o {self.m_features_mode} -m \ - {self.m_features_per_tree_size} -a {self.lambda_param} -x {self.drift_detection_method} -p \ - {self.warning_detection_method} {'-u' if self.disable_drift_detection else ''} {'-q' if self.disable_background_learner else ''}" - ) - self.moa_learner.prepareForUse() - self.moa_learner.resetLearning() - - -class SOKNL(MOARegressor): - def __init__( - self, - schema=None, - CLI=None, - random_seed=1, - tree_learner=None, - ensemble_size=100, - max_features=0.6, - lambda_param=6.0, # m_features_mode=None, m_features_per_tree_size=60, - drift_detection_method=None, - warning_detection_method=None, - disable_drift_detection=False, - disable_background_learner=False, - self_optimising=True, - k_value=10, - ): - # Important: must create the MOA object before invoking the super class __init__ - self.moa_learner = MOA_SOKNL() - super().__init__( - schema=schema, - CLI=CLI, - random_seed=random_seed, - moa_learner=self.moa_learner, - ) - - # Initialize instance attributes with default values, CLI was not set. 
- if self.CLI is None: - self.tree_learner = ( - # "(SelfOptimisingBaseTree -s VarianceReductionSplitCriterion -g 50 -c 0.01)" - SOKNLBT(schema, grace_period=50, split_confidence=0.01) - if tree_learner is None - else tree_learner - ) - self.ensemble_size = ensemble_size - - self.max_features = max_features - if isinstance(self.max_features, float) and 0.0 <= self.max_features <= 1.0: - self.m_features_mode = "(Percentage (M * (m / 100)))" - self.m_features_per_tree_size = int(self.max_features * 100) - elif isinstance(self.max_features, int): - self.m_features_mode = "(Specified m (integer value))" - self.m_features_per_tree_size = max_features - elif self.max_features in ["sqrt"]: - self.m_features_mode = "(sqrt(M)+1)" - self.m_features_per_tree_size = -1 # or leave it unchanged - elif self.max_features is None: - self.m_features_mode = "(Percentage (M * (m / 100)))" - self.m_features_per_tree_size = 60 - else: - # Handle other cases or raise an exception if needed - raise ValueError("Invalid value for max_features") - - # self.m_features_mode = "(Percentage (M * (m / 100)))" if m_features_mode is None else m_features_mode - # self.m_features_per_tree_size = m_features_per_tree_size - self.lambda_param = lambda_param - self.drift_detection_method = ( - "(ADWINChangeDetector -a 1.0E-3)" - if drift_detection_method is None - else drift_detection_method - ) - self.warning_detection_method = ( - "(ADWINChangeDetector -a 1.0E-2)" - if warning_detection_method is None - else warning_detection_method - ) - self.disable_drift_detection = disable_drift_detection - self.disable_background_learner = disable_background_learner - - self.self_optimising = self_optimising - self.k_value = k_value - - self.moa_learner.getOptions().setViaCLIString( - f"-l {self.tree_learner} -s {self.ensemble_size} {'-f' if self.self_optimising else ''} -k {self.k_value} -o {self.m_features_mode} -m \ - {self.m_features_per_tree_size} -a {self.lambda_param} -x {self.drift_detection_method} -p \ - {self.warning_detection_method} {'-u' if self.disable_drift_detection else ''} {'-q' if self.disable_background_learner else ''}" - ) - self.moa_learner.prepareForUse() - self.moa_learner.resetLearning() diff --git a/src/capymoa/learner/ssl/classifier/__init__.py b/src/capymoa/learner/ssl/classifier/__init__.py deleted file mode 100644 index 42d00b44..00000000 --- a/src/capymoa/learner/ssl/classifier/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .CPSSDS import CPSSDS -from .OSNN import OSNN -from .batch import BatchClassifierSSL - -__all__ = ["BatchClassifierSSL", "CPSSDS", "OSNN"] diff --git a/src/capymoa/prepare_jpype.py b/src/capymoa/prepare_jpype.py index dccfddf4..af257429 100644 --- a/src/capymoa/prepare_jpype.py +++ b/src/capymoa/prepare_jpype.py @@ -1,9 +1,7 @@ # Python imports -import subprocess import configparser import jpype import jpype.imports -from jpype.types import * import os from pathlib import Path diff --git a/src/capymoa/regressor/__init__.py b/src/capymoa/regressor/__init__.py new file mode 100644 index 00000000..8ea9dd43 --- /dev/null +++ b/src/capymoa/regressor/__init__.py @@ -0,0 +1,16 @@ +from ._soknl import SOKNL, SOKNLBT +from ._orto import ORTO +from ._knn import KNNRegressor +from ._fimtdd import FIMTDD +from ._arffimtdd import ARFFIMTDD +from ._adaptive_random_forrest import AdaptiveRandomForestRegressor + +__all__ = [ + "SOKNLBT", + "SOKNL", + "ORTO", + "KNNRegressor", + "FIMTDD", + "ARFFIMTDD", + "AdaptiveRandomForestRegressor", +] diff --git a/src/capymoa/regressor/_adaptive_random_forrest.py 
b/src/capymoa/regressor/_adaptive_random_forrest.py new file mode 100644 index 00000000..294c6020 --- /dev/null +++ b/src/capymoa/regressor/_adaptive_random_forrest.py @@ -0,0 +1,84 @@ +# Library imports + +from capymoa.base import MOARegressor +from ._arffimtdd import ARFFIMTDD + +from moa.classifiers.meta import ( + AdaptiveRandomForestRegressor as MOA_AdaptiveRandomForestRegressor, +) + + +# TODO: replace the m_features_mode logic such that we can infer from m_features_per_tree_size, e.g. if value is double between 0.0 and 1.0 = percentage +class AdaptiveRandomForestRegressor(MOARegressor): + def __init__( + self, + schema=None, + CLI=None, + random_seed=1, + tree_learner=None, + ensemble_size=100, + max_features=0.6, + lambda_param=6.0, # m_features_mode=None, m_features_per_tree_size=60, + drift_detection_method=None, + warning_detection_method=None, + disable_drift_detection=False, + disable_background_learner=False, + ): + # Important: must create the MOA object before invoking the super class __init__ + self.moa_learner = MOA_AdaptiveRandomForestRegressor() + super().__init__( + schema=schema, + CLI=CLI, + random_seed=random_seed, + moa_learner=self.moa_learner, + ) + + # Initialize instance attributes with default values, CLI was not set. + if self.CLI is None: + self.tree_learner = ( + ARFFIMTDD(schema, grace_period=50, split_confidence=0.01) + if tree_learner is None + else tree_learner + ) + self.ensemble_size = ensemble_size + + self.max_features = max_features + if isinstance(self.max_features, float) and 0.0 <= self.max_features <= 1.0: + self.m_features_mode = "(Percentage (M * (m / 100)))" + self.m_features_per_tree_size = int(self.max_features * 100) + elif isinstance(self.max_features, int): + self.m_features_mode = "(Specified m (integer value))" + self.m_features_per_tree_size = max_features + elif self.max_features in ["sqrt"]: + self.m_features_mode = "(sqrt(M)+1)" + self.m_features_per_tree_size = -1 # or leave it unchanged + elif self.max_features is None: + self.m_features_mode = "(Percentage (M * (m / 100)))" + self.m_features_per_tree_size = 60 + else: + # Handle other cases or raise an exception if needed + raise ValueError("Invalid value for max_features") + + # self.m_features_mode = "(Percentage (M * (m / 100)))" if m_features_mode is None else m_features_mode + # self.m_features_per_tree_size = m_features_per_tree_size + self.lambda_param = lambda_param + self.drift_detection_method = ( + "(ADWINChangeDetector -a 1.0E-3)" + if drift_detection_method is None + else drift_detection_method + ) + self.warning_detection_method = ( + "(ADWINChangeDetector -a 1.0E-2)" + if warning_detection_method is None + else warning_detection_method + ) + self.disable_drift_detection = disable_drift_detection + self.disable_background_learner = disable_background_learner + + self.moa_learner.getOptions().setViaCLIString( + f"-l {self.tree_learner} -s {self.ensemble_size} -o {self.m_features_mode} -m \ + {self.m_features_per_tree_size} -a {self.lambda_param} -x {self.drift_detection_method} -p \ + {self.warning_detection_method} {'-u' if self.disable_drift_detection else ''} {'-q' if self.disable_background_learner else ''}" + ) + self.moa_learner.prepareForUse() + self.moa_learner.resetLearning() diff --git a/src/capymoa/regressor/_arffimtdd.py b/src/capymoa/regressor/_arffimtdd.py new file mode 100644 index 00000000..c228f69e --- /dev/null +++ b/src/capymoa/regressor/_arffimtdd.py @@ -0,0 +1,73 @@ +# Library imports +from typing import Optional, Union + +from capymoa.base 
import MOARegressor + +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str +from capymoa.stream._stream import Schema +from moa.classifiers.trees import ARFFIMTDD as _MOA_ARFFIMTDD + + +class ARFFIMTDD(MOARegressor): + """Modified Fast Incremental Model Tree with Drift Detection, the base + learner for ARF-Reg, as described by Ikonomovska et al.""" + + def __init__( + self, + schema: Schema, + subspace_size_size: int = 2, + split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", + grace_period: int = 200, + split_confidence: float = 1.0e-7, + tie_threshold: float = 0.05, + page_hinckley_alpha: float = 0.005, + page_hinckley_threshold: int = 50, + alternate_tree_fading_factor: float = 0.995, + alternate_tree_t_min: int = 150, + alternate_tree_time: int = 1500, + learning_ratio: float = 0.02, + learning_ratio_decay_factor: float = 0.001, + learning_ratio_const: bool = False, + random_seed: Optional[int] = None, + ) -> None: + """ + Construct ARFFIMTDD. + + :param subspace_size_size: Number of features per subset for each node split. Negative values = #features - k + :param split_criterion: Split criterion to use. + :param grace_period: Number of instances a leaf should observe between split attempts. + :param split_confidence: Allowed error in split decision; values close to 0 will take longer to decide. + :param tie_threshold: Threshold below which a split will be forced to break ties. + :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. + :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. + :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. + :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. + :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. + :param learning_ratio: Learning ratio used for training the perceptrons in the leaves. + :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). + :param learning_ratio_const: Keep learning rate constant instead of decaying.
+ """ + cli = [] + + cli.append(f"-k {subspace_size_size}") + cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") + cli.append(f"-g {grace_period}") + cli.append(f"-c {split_confidence}") + cli.append(f"-t {tie_threshold}") + cli.append(f"-a {page_hinckley_alpha}") + cli.append(f"-h {page_hinckley_threshold}") + cli.append(f"-f {alternate_tree_fading_factor}") + cli.append(f"-y {alternate_tree_t_min}") + cli.append(f"-u {alternate_tree_time}") + cli.append(f"-l {learning_ratio}") + cli.append(f"-d {learning_ratio_decay_factor}") + cli.append("-p") if learning_ratio_const else None + + self.moa_learner = _MOA_ARFFIMTDD() + + super().__init__( + schema=schema, + CLI=" ".join(cli), + random_seed=random_seed, + moa_learner=self.moa_learner, + ) diff --git a/src/capymoa/regressor/_fimtdd.py b/src/capymoa/regressor/_fimtdd.py new file mode 100644 index 00000000..172b48c6 --- /dev/null +++ b/src/capymoa/regressor/_fimtdd.py @@ -0,0 +1,71 @@ +from typing import Optional, Union + +from capymoa.base import MOARegressor + +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str +from capymoa.stream._stream import Schema +from moa.classifiers.trees import FIMTDD as _MOA_FIMTDD + + +class FIMTDD(MOARegressor): + """Implementation of the FIMT-DD tree as described by Ikonomovska et al.""" + + def __init__( + self, + schema: Schema, + split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", + grace_period: int = 200, + split_confidence: float = 1.0e-7, + tie_threshold: float = 0.05, + page_hinckley_alpha: float = 0.005, + page_hinckley_threshold: int = 50, + alternate_tree_fading_factor: float = 0.995, + alternate_tree_t_min: int = 150, + alternate_tree_time: int = 1500, + regression_tree: bool = False, + learning_ratio: float = 0.02, + learning_ratio_decay_factor: float = 0.001, + learning_ratio_const: bool = False, + random_seed: Optional[int] = None, + ) -> None: + """ + Construct FIMTDD. + + :param split_criterion: Split criterion to use. + :param grace_period: Number of instances a leaf should observe between split attempts. + :param split_confidence: Allowed error in split decision, values close to 0 will take long to decide. + :param tie_threshold: Threshold below which a split will be forced to break ties. + :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. + :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. + :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. + :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. + :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. + :param regression_tree: Build a regression tree instead of a model tree. + :param learning_ratio: Learning ratio to used for training the Perceptrons in the leaves. + :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). + :param learning_ratio_const: Keep learning rate constant instead of decaying. 
+ """ + cli = [] + + cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") + cli.append(f"-g {grace_period}") + cli.append(f"-c {split_confidence}") + cli.append(f"-t {tie_threshold}") + cli.append(f"-a {page_hinckley_alpha}") + cli.append(f"-h {page_hinckley_threshold}") + cli.append(f"-f {alternate_tree_fading_factor}") + cli.append(f"-y {alternate_tree_t_min}") + cli.append(f"-u {alternate_tree_time}") + cli.append("-e") if regression_tree else None + cli.append(f"-l {learning_ratio}") + cli.append(f"-d {learning_ratio_decay_factor}") + cli.append("-p") if learning_ratio_const else None + + self.moa_learner = _MOA_FIMTDD() + + super().__init__( + schema=schema, + CLI=" ".join(cli), + random_seed=random_seed, + moa_learner=self.moa_learner, + ) diff --git a/src/capymoa/regressor/_knn.py b/src/capymoa/regressor/_knn.py new file mode 100644 index 00000000..b641cf76 --- /dev/null +++ b/src/capymoa/regressor/_knn.py @@ -0,0 +1,36 @@ +from capymoa.base import MOARegressor +from moa.classifiers.lazy import kNN as _moa_kNN + + +class KNNRegressor(MOARegressor): + """ + The default number of neighbors (k) is set to 3 instead of 10 (as in MOA) + """ + + def __init__( + self, schema=None, CLI=None, random_seed=1, k=3, median=False, window_size=1000 + ): + # Important, should create the MOA object before invoking the super class __init__ + self.moa_learner = _moa_kNN() + super().__init__( + schema=schema, + CLI=CLI, + random_seed=random_seed, + moa_learner=self.moa_learner, + ) + + # Initialize instance attributes with default values, CLI was not set. + if self.CLI is None: + self.k = k + self.median = median + self.window_size = window_size + self.moa_learner.getOptions().setViaCLIString( + f"-k {self.k} {'-m' if self.median else ''} -w \ + {self.window_size}" + ) + self.moa_learner.prepareForUse() + self.moa_learner.resetLearning() + + def __str__(self): + # Overrides the default class name from MOA + return "kNNRegressor" diff --git a/src/capymoa/regressor/_orto.py b/src/capymoa/regressor/_orto.py new file mode 100644 index 00000000..172bf38b --- /dev/null +++ b/src/capymoa/regressor/_orto.py @@ -0,0 +1,83 @@ +from typing import Optional, Union + +from capymoa.stream import Schema +from capymoa.base import MOARegressor +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str + +from moa.classifiers.trees import ORTO as _MOA_ORTO + + +class ORTO(MOARegressor): + """Implementation of the ORTO tree as described by Ikonomovska et al.""" + + def __init__( + self, + schema: Schema, + max_trees: int = 10, + max_option_level: int = 10, + option_decay_factor: float = 0.9, + option_fading_factor: float = 0.9995, + split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", + grace_period: int = 200, + split_confidence: float = 1.0e-7, + tie_threshold: float = 0.05, + page_hinckley_alpha: float = 0.005, + page_hinckley_threshold: int = 50, + alternate_tree_fading_factor: float = 0.995, + alternate_tree_t_min: int = 150, + alternate_tree_time: int = 1500, + regression_tree: bool = False, + learning_ratio: float = 0.02, + learning_ratio_decay_factor: float = 0.001, + learning_ratio_const: bool = False, + random_seed: Optional[int] = None, + ) -> None: + """ + Construct ORTO. + + :param max_trees: The maximum number of trees contained in the option tree. + :param max_option_level: The maximal depth at which option nodes can be created. + :param option_decay_factor: The option decay factor that determines how many options can be selected at a given level. 
+ :param option_fading_factor: The fading factor used for comparing subtrees of an option node. + :param split_criterion: Split criterion to use. + :param grace_period: Number of instances a leaf should observe between split attempts. + :param split_confidence: Allowed error in split decision; values close to 0 will take longer to decide. + :param tie_threshold: Threshold below which a split will be forced to break ties. + :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. + :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. + :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. + :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. + :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. + :param regression_tree: Build a regression tree instead of a model tree. + :param learning_ratio: Learning ratio used for training the perceptrons in the leaves. + :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). + :param learning_ratio_const: Keep learning rate constant instead of decaying. + """ + cli = [] + + cli.append(f"-m {max_trees}") + cli.append(f"-x {max_option_level}") + cli.append(f"-z {option_decay_factor}") + cli.append(f"-q {option_fading_factor}") + cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") + cli.append(f"-g {grace_period}") + cli.append(f"-c {split_confidence}") + cli.append(f"-t {tie_threshold}") + cli.append(f"-a {page_hinckley_alpha}") + cli.append(f"-h {page_hinckley_threshold}") + cli.append(f"-f {alternate_tree_fading_factor}") + cli.append(f"-y {alternate_tree_t_min}") + cli.append(f"-u {alternate_tree_time}") + if regression_tree: + cli.append("-e") + cli.append(f"-l {learning_ratio}") + cli.append(f"-d {learning_ratio_decay_factor}") + if learning_ratio_const: + cli.append("-p") + + self.moa_learner = _MOA_ORTO() + + super().__init__( + schema=schema, + CLI=" ".join(cli), + random_seed=random_seed, + moa_learner=self.moa_learner, + ) diff --git a/src/capymoa/regressor/_soknl.py b/src/capymoa/regressor/_soknl.py new file mode 100644 index 00000000..dcac87ee --- /dev/null +++ b/src/capymoa/regressor/_soknl.py @@ -0,0 +1,156 @@ +# Library imports +from typing import Optional, Union + +from capymoa.base import ( + MOARegressor, +) + +from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str +from capymoa.stream._stream import Schema +from moa.classifiers.meta import SelfOptimisingKNearestLeaves as _MOA_SOKNL +from moa.classifiers.trees import SelfOptimisingBaseTree as _MOA_SelfOptimisingBaseTree + + +class SOKNLBT(MOARegressor): + """The SelfOptimisingBaseTree, the base tree learner for SOKNL, adapted from the FIMT-DD tree described by Ikonomovska et al.""" + + def __init__( + self, + schema: Schema, + subspace_size_size: int = 2, + split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", + grace_period: int = 200, + split_confidence: float = 1.0e-7, + tie_threshold: float = 0.05, + page_hinckley_alpha: float = 0.005, + page_hinckley_threshold: int = 50, + alternate_tree_fading_factor: float = 0.995, + alternate_tree_t_min: int = 150, + alternate_tree_time: int = 1500, + learning_ratio: float = 0.02, + learning_ratio_decay_factor: float = 0.001, + learning_ratio_const: bool = False, + random_seed: Optional[int] = None, + ) -> None: + """ +
Construct SelfOptimisingBaseTree. + + :param subspace_size_size: Number of features per subset for each node split. Negative values = #features - k + :param split_criterion: Split criterion to use. + :param grace_period: Number of instances a leaf should observe between split attempts. + :param split_confidence: Allowed error in split decision; values close to 0 will take longer to decide. + :param tie_threshold: Threshold below which a split will be forced to break ties. + :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. + :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. + :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. + :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. + :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. + :param learning_ratio: Learning ratio used for training the perceptrons in the leaves. + :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). + :param learning_ratio_const: Keep learning rate constant instead of decaying. + """ + cli = [] + + cli.append(f"-k {subspace_size_size}") + cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") + cli.append(f"-g {grace_period}") + cli.append(f"-c {split_confidence}") + cli.append(f"-t {tie_threshold}") + cli.append(f"-a {page_hinckley_alpha}") + cli.append(f"-h {page_hinckley_threshold}") + cli.append(f"-f {alternate_tree_fading_factor}") + cli.append(f"-y {alternate_tree_t_min}") + cli.append(f"-u {alternate_tree_time}") + cli.append(f"-l {learning_ratio}") + cli.append(f"-d {learning_ratio_decay_factor}") + if learning_ratio_const: + cli.append("-p") + + self.moa_learner = _MOA_SelfOptimisingBaseTree() + + super().__init__( + schema=schema, + CLI=" ".join(cli), + random_seed=random_seed, + moa_learner=self.moa_learner, + ) + + +class SOKNL(MOARegressor): + def __init__( + self, + schema=None, + CLI=None, + random_seed=1, + tree_learner=None, + ensemble_size=100, + max_features=0.6, + lambda_param=6.0, # m_features_mode=None, m_features_per_tree_size=60, + drift_detection_method=None, + warning_detection_method=None, + disable_drift_detection=False, + disable_background_learner=False, + self_optimising=True, + k_value=10, + ): + # Important: must create the MOA object before invoking the super class __init__ + self.moa_learner = _MOA_SOKNL() + super().__init__( + schema=schema, + CLI=CLI, + random_seed=random_seed, + moa_learner=self.moa_learner, + ) + + # Initialize instance attributes with default values if CLI was not set.
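+ # The max_features argument selects MOA's feature-subset mode below: a float + # in [0, 1] is treated as a percentage of the features, an int as a fixed + # count, "sqrt" as sqrt(M)+1, and None falls back to 60% of the features.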
+ if self.CLI is None: + self.tree_learner = ( + # "(SelfOptimisingBaseTree -s VarianceReductionSplitCriterion -g 50 -c 0.01)" + SOKNLBT(schema, grace_period=50, split_confidence=0.01) + if tree_learner is None + else tree_learner + ) + self.ensemble_size = ensemble_size + + self.max_features = max_features + if isinstance(self.max_features, float) and 0.0 <= self.max_features <= 1.0: + self.m_features_mode = "(Percentage (M * (m / 100)))" + self.m_features_per_tree_size = int(self.max_features * 100) + elif isinstance(self.max_features, int): + self.m_features_mode = "(Specified m (integer value))" + self.m_features_per_tree_size = max_features + elif self.max_features in ["sqrt"]: + self.m_features_mode = "(sqrt(M)+1)" + self.m_features_per_tree_size = -1 # or leave it unchanged + elif self.max_features is None: + self.m_features_mode = "(Percentage (M * (m / 100)))" + self.m_features_per_tree_size = 60 + else: + # Handle other cases or raise an exception if needed + raise ValueError("Invalid value for max_features") + + # self.m_features_mode = "(Percentage (M * (m / 100)))" if m_features_mode is None else m_features_mode + # self.m_features_per_tree_size = m_features_per_tree_size + self.lambda_param = lambda_param + self.drift_detection_method = ( + "(ADWINChangeDetector -a 1.0E-3)" + if drift_detection_method is None + else drift_detection_method + ) + self.warning_detection_method = ( + "(ADWINChangeDetector -a 1.0E-2)" + if warning_detection_method is None + else warning_detection_method + ) + self.disable_drift_detection = disable_drift_detection + self.disable_background_learner = disable_background_learner + + self.self_optimising = self_optimising + self.k_value = k_value + + self.moa_learner.getOptions().setViaCLIString( + f"-l {self.tree_learner} -s {self.ensemble_size} {'-f' if self.self_optimising else ''} -k {self.k_value} -o {self.m_features_mode} -m \ + {self.m_features_per_tree_size} -a {self.lambda_param} -x {self.drift_detection_method} -p \ + {self.warning_detection_method} {'-u' if self.disable_drift_detection else ''} {'-q' if self.disable_background_learner else ''}" + ) + self.moa_learner.prepareForUse() + self.moa_learner.resetLearning() diff --git a/src/capymoa/learner/splitcriteria.py b/src/capymoa/splitcriteria.py similarity index 94% rename from src/capymoa/learner/splitcriteria.py rename to src/capymoa/splitcriteria.py index b4894179..e50c3311 100644 --- a/src/capymoa/learner/splitcriteria.py +++ b/src/capymoa/splitcriteria.py @@ -4,8 +4,9 @@ import moa.classifiers.core.splitcriteria as moa_split -class SplitCriterion(): +class SplitCriterion: """Split criterions are used to evaluate the quality of a split in a decision tree.""" + _java_object: Optional[moa_split.SplitCriterion] = None def java_object(self) -> moa_split.SplitCriterion: @@ -65,4 +66,6 @@ def _split_criterion_to_cli_str(split_criterion: Union[str, SplitCriterion]) -> elif isinstance(split_criterion, str): return split_criterion.strip().strip("() ") else: - raise TypeError(f"Expected a string or SplitCriterion, got {type(split_criterion)}") + raise TypeError( + f"Expected a string or SplitCriterion, got {type(split_criterion)}" + ) diff --git a/src/capymoa/learner/ssl/classifier/CPSSDS.py b/src/capymoa/ssl/classifier/_CPSSDS.py similarity index 98% rename from src/capymoa/learner/ssl/classifier/CPSSDS.py rename to src/capymoa/ssl/classifier/_CPSSDS.py index 8f55fb30..45303ee8 100644 --- a/src/capymoa/learner/ssl/classifier/CPSSDS.py +++ b/src/capymoa/ssl/classifier/_CPSSDS.py @@ -6,9 +6,9 @@ 
from river.naive_bayes import GaussianNB from river.tree import HoeffdingTreeClassifier -from capymoa.learner.ssl.classifier.batch import BatchClassifierSSL +from capymoa.ssl.classifier._batch import BatchClassifierSSL from capymoa.stream import Schema -from capymoa.stream.instance import Instance +from capymoa.instance import Instance def shuffle_split( diff --git a/src/capymoa/ssl/classifier/__init__.py b/src/capymoa/ssl/classifier/__init__.py new file mode 100644 index 00000000..573fe5e3 --- /dev/null +++ b/src/capymoa/ssl/classifier/__init__.py @@ -0,0 +1,5 @@ +from ._cpssds import CPSSDS +from ._osnn import OSNN +from ._batch import BatchClassifierSSL + +__all__ = ["BatchClassifierSSL", "CPSSDS", "OSNN"] diff --git a/src/capymoa/learner/ssl/classifier/batch.py b/src/capymoa/ssl/classifier/_batch.py similarity index 92% rename from src/capymoa/learner/ssl/classifier/batch.py rename to src/capymoa/ssl/classifier/_batch.py index 37a0c785..09ec351e 100644 --- a/src/capymoa/learner/ssl/classifier/batch.py +++ b/src/capymoa/ssl/classifier/_batch.py @@ -1,12 +1,11 @@ from abc import ABC, abstractmethod -from typing import Any import numpy as np from numpy.typing import NDArray -from capymoa.learner import ClassifierSSL -from capymoa.stream.instance import Instance, LabeledInstance -from capymoa.stream.stream import Schema +from capymoa.base import ClassifierSSL +from capymoa.instance import Instance, LabeledInstance +from capymoa.stream._stream import Schema from capymoa.type_alias import FeatureVector diff --git a/src/capymoa/ssl/classifier/_cpssds.py b/src/capymoa/ssl/classifier/_cpssds.py new file mode 100644 index 00000000..2d3b7e35 --- /dev/null +++ b/src/capymoa/ssl/classifier/_cpssds.py @@ -0,0 +1,296 @@ +import typing as t +from typing import Dict, Literal + +import numpy as np +from river.base import Classifier +from river.naive_bayes import GaussianNB +from river.tree import HoeffdingTreeClassifier + +from ._batch import BatchClassifierSSL +from capymoa.stream import Schema +from capymoa.instance import Instance + + +def shuffle_split( + split_proportion: float, x: np.ndarray, y: np.ndarray +) -> t.Tuple[t.Tuple[np.ndarray, np.ndarray], t.Tuple[np.ndarray, np.ndarray]]: + """Shuffle and split the data into two parts. + + :param split_proportion: The proportion of the dataset to be included in + the first part. + :param x: The instances to split. + :param y: The labels to split. + :raises LengthMismatchError: The length of x and y must be the same. + :return: Two tuples containing the instances and labels of the two parts. + """ + assert len(x) == len(y), "x and y must have the same length" + indices = np.arange(len(x)) + np.random.shuffle(indices) + split_index = int(len(x) * split_proportion) + idx_a = indices[:split_index] + idx_b = indices[split_index:] + return (x[idx_a], y[idx_a]), (x[idx_b], y[idx_b]) + + +def split_by_label_presence( + x: np.ndarray, y: np.ndarray +) -> t.Tuple[t.Tuple[np.ndarray, np.ndarray], np.ndarray]: + """Split the data into labeled and unlabeled instances. + + :param x: A batch of instances. + :param y: A batch of labels where -1 means that the instance is unlabeled. + :raises LengthMismatchError: The length of x and y must be the same. + :return: + - A tuple containing the labeled instances and labels. + - A numpy array containing the unlabeled instances. 
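+ + For example (an illustrative sketch): with y = np.array([0, -1, 1, -1]), + rows 0 and 2 form the labeled pair and rows 1 and 3 the unlabeled array.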
+ """ + assert len(x) == len(y), "x and y must have the same length" + labeled_mask = y != -1 + return (x[labeled_mask], y[labeled_mask]), x[~labeled_mask] + + +def Unlabeling_data(X_train, Y_train, Percentage, chunk_size, class_count): + labeled_count = round(Percentage * chunk_size) + TLabeled = X_train[0 : labeled_count - 1] + Y_TLabeled = Y_train[0 : labeled_count - 1] + X_Unlabeled = X_train[labeled_count : Y_train.shape[0] - 1] + + cal_count = round(0.3 * TLabeled.shape[0]) + X_cal = TLabeled[0 : cal_count - 1] + Y_cal = Y_TLabeled[0 : cal_count - 1] + X_L = TLabeled[cal_count : TLabeled.shape[0] - 1] + Y_L = Y_TLabeled[cal_count : TLabeled.shape[0] - 1] + + return X_Unlabeled, X_L, Y_L, X_cal, Y_cal + + +def Prediction_by_CP(num, classifier, X, Y, X_Unlabeled, class_count, sl): + row = X_Unlabeled.shape[0] + col = class_count + p_values = np.zeros([row, col]) + labels = np.ones((row, col), dtype=bool) + alphas = NCM(num, classifier, X, Y, 1, class_count) + for elem in range(row): + c = [] + for o in range(class_count): + a_test = NCM( + num, classifier, np.array([X_Unlabeled[elem, :]]), o, 2, class_count + ) + idx = np.argwhere(Y == o).flatten() + temp = alphas[idx] + p = len(temp[temp >= a_test]) + if idx.shape[0] == 0: + s = 0 + else: + s = p / idx.shape[0] + c.append(s) + if s < sl: + labels[elem, int(o)] = False + p_values[elem, :] = np.array(c) + return p_values, labels + + +def NCM(num, classifier, X, Y, t, class_count): + if num == 1: + if t == 1: + p = np.zeros([X.shape[0], 1]) + alpha = np.zeros([X.shape[0], 1]) + for g in range(X.shape[0]): + dic_vote = classifier.predict_proba_one(np_to_dict(X[g, :])) + vote = np.fromiter(dic_vote.values(), dtype=float) + vote_keys = np.fromiter(dic_vote.keys(), dtype=int) + Sum = np.sum(vote) + keys = np.argwhere(vote_keys == int(Y[g])).flatten() + if keys.size == 0: + p[g] = (1) / (Sum + class_count) + else: + for key, val in dic_vote.items(): + if key == float(Y[g]): + p[g] = (val + 1) / (Sum + class_count) + alpha[g] = 1 - p[g] + + else: + dic_vote = classifier.predict_proba_one(np_to_dict(X[0, :])) + vote = np.fromiter(dic_vote.values(), dtype=float) + vote_keys = np.fromiter(dic_vote.keys(), dtype=int) + Sum = np.sum(vote) + keys = np.argwhere(vote_keys == int(Y)).flatten() + if keys.size == 0: + p = (1) / (Sum + class_count) + else: + for key, val in dic_vote.items(): + if key == float(Y): + p = (val + 1) / (Sum + class_count) + alpha = 1 - p + + else: + if t == 1: + prediction = predict_many(classifier, X) + P = np.max(prediction, axis=1) + alpha = 1 - P + elif t == 2: + prediction = predict_many(classifier, X) + # TODO: This is a hacky patch because river tries to be smart and + # infer the number of classes from the data. This is silly because + # CPSSDS assumes that the number of classes is known. Future work + # will replace river with MOA. 
+ if prediction.shape[1] <= Y: + P = 0 + else: + P = prediction[0, int(Y)] + alpha = 1 - P + return alpha + + +def Informatives_selection(X_Unlabeled, p_values, labels, class_count): + row = X_Unlabeled.shape[0] + X = np.empty([1, X_Unlabeled.shape[1]]) + Y = np.empty([1]) + for elem in range(row): + l = np.argwhere(labels[elem, :] == True).flatten() + if len(l) == 1: + pp = p_values[elem, l] + X = np.append(X, [X_Unlabeled[elem, :]], axis=0) + Y = np.append(Y, [l[0]], axis=0) + Informatives = X[1 : X.shape[0], :] + Y_Informatives = Y[1 : Y.shape[0]] + return Informatives, Y_Informatives + + +def Appending_informative_to_nextchunk( + X_Currentchunk_Labeled, Y_Currentchunk_Labeled, Informatives, Y_Informatives +): + X = np.append(X_Currentchunk_Labeled, Informatives, axis=0) + Y = np.append(Y_Currentchunk_Labeled, Y_Informatives, axis=0) + return X, Y + + +def np_to_dict(x): + return dict(enumerate(x)) + + +def predict_many(classifier: Classifier, x: np.ndarray) -> np.ndarray: + """Predict the labels of a batch of instances. + + :param classifier: The classifier to use. + :param x: A batch of instances. + :return: A numpy array containing the predicted labels. + """ + if len(x) == 0: + return np.array([]) + results = [] + for x_i in x: + y_hat = classifier.predict_proba_one(np_to_dict(x_i)) + y_hat_skmf = np.array(list(y_hat.values())) + results.append(y_hat_skmf) + return np.stack(results) + + +class CPSSDS(BatchClassifierSSL): + """Conformal prediction for semi-supervised classification on data streams. + + Tanha, J., Samadi, N., Abdi, Y., & Razzaghi-Asl, N. (2021). CPSSDS: + Conformal prediction for semi-supervised classification on data streams. + Information Sciences, 584, 212–234. https://doi.org/10.1016/j.ins.2021.10.068 + """ + + def __init__( + self, + base_model: Literal["NaiveBayes", "HoeffdingTree"], + batch_size: int, + schema: Schema, + significance_level: float = 0.98, + calibration_split: float = 0.3, + random_seed=1, + ) -> None: + """Constructor for CPSSDS. + + :param base_model: An underlying model which is augmented with + self-labeled data from conformal prediction. + :param batch_size: The number of instances to train on at a time. + :param schema: The schema of the data stream. + :param significance_level: Controls the required confidence level for + unlabeled instances to be labeled. Must be between 0 and 1. defaults to 0.98 + :param calibration_split: The proportion of the labeled data to be used + for calibration. defaults to 0.3 + :param random_seed: The random seed to use for reproducibility. 
+ :raises ValueError: `base_model` must be either NaiveBayes or HoeffdingTree + """ + super().__init__(batch_size, schema, random_seed) + self.significance_level: float = significance_level + self.chunk_id = 0 + self.class_count = schema.get_num_classes() + self.calibration_split = calibration_split + + # TODO: These classifiers should be replaced with MOA classifiers + if base_model == "NaiveBayes": + self.classifier = GaussianNB() + self._num = 2 + elif base_model == "HoeffdingTree": + self.classifier = HoeffdingTreeClassifier() + self._num = 1 + else: + raise ValueError("`base_model` must be either NaiveBayes or HoeffdingTree") + + # Self-labeled data, initialized as empty + self.self_labeled_x: np.array = None + self.self_labeled_y: np.array = None + + # Set seed for reproducibility + np.random.seed(random_seed) + + def train_on_batch(self, x_batch, y_indices): + (x_label, y_label), x_unlabeled = split_by_label_presence(x_batch, y_indices) + (x_cal, y_cal), (x_train, y_train) = shuffle_split( + self.calibration_split, x_label, y_label + ) + + # Add self-labeled data to training data + if self.self_labeled_x is not None and self.self_labeled_y is not None: + x_train = np.concatenate((x_train, self.self_labeled_x)) + y_train = np.concatenate((y_train, self.self_labeled_y)) + + for x_one, y_one in zip(x_train, y_train): + self.classifier.learn_one(dict(enumerate(x_one)), y_one) + + assert x_cal.shape[0] > 0, "Calibration data must not be empty" + assert x_unlabeled.shape[0] > 0, "Unlabeled data must not be empty" + """Issues arise when not enough labeled data is available for calibration. + This can be solved by increasing the calibration split or increasing the + batch size. + """ + + # Use conformal prediction to label some unlabeled data + p_values, labels = Prediction_by_CP( + self._num, + self.classifier, + x_cal, + y_cal, + x_unlabeled, + self.class_count, + self.significance_level, + ) + + # Add newly labeled data to self-labeled data + self.self_labeled_x, self.self_labeled_y = Informatives_selection( + x_unlabeled, p_values, labels, self.class_count + ) + + def instance_to_dict(self, instance: Instance) -> Dict[str, float]: + """Convert an instance to a dictionary with the feature names as keys.""" + return dict(enumerate(instance.x)) + + def skmf_to_river(self, x): + return dict(enumerate(x)) + + def predict(self, instance: Instance): + class_index = self.classifier.predict_one(self.instance_to_dict(instance)) + if class_index is None: + return None + return class_index + + def predict_proba(self, instance): + raise NotImplementedError() + + def __str__(self): + return f"CPSSDS(significance_level={self.significance_level})" diff --git a/src/capymoa/learner/ssl/classifier/OSNN.py b/src/capymoa/ssl/classifier/_osnn.py similarity index 65% rename from src/capymoa/learner/ssl/classifier/OSNN.py rename to src/capymoa/ssl/classifier/_osnn.py index 8b673138..9ced0e30 100644 --- a/src/capymoa/learner/ssl/classifier/OSNN.py +++ b/src/capymoa/ssl/classifier/_osnn.py @@ -6,28 +6,30 @@ CapyMOA implementation by Botao, Anton """ + import numpy as np import random import torch.nn as nn import torch from scipy.spatial.distance import cdist -from capymoa.learner import ClassifierSSL +from capymoa.base import ClassifierSSL def kernel_fun(a, b, sigma): A = torch.sum((a - b) ** 2, dim=1) - B = A / (2 * sigma ** 2) + B = A / (2 * sigma**2) C = torch.exp(-B) return C -def Euclidean_Distances(a,b): - dis = torch.sqrt(torch.sum((a-b)**2, dim=1)) + +def Euclidean_Distances(a, b): + dis = 
torch.sqrt(torch.sum((a - b) ** 2, dim=1)) return dis -class OSNeuralNetwork(nn.Module): - def __init__(self, num_center, n_out, window_size, beta=1, gamma = 1): +class OSNeuralNetwork(nn.Module): + def __init__(self, num_center, n_out, window_size, beta=1, gamma=1): super(OSNeuralNetwork, self).__init__() self.n_out = n_out self.num_centers = num_center @@ -66,37 +68,52 @@ def initialize_weights(self): m.bias.data.zero_() def update_sigma(self): - #The width of basis function is set to a proportion β of the mean of the Euclidean distances to the other centers. + # The width of basis function is set to a proportion β of the mean of the Euclidean distances to the other centers. self.sigma = torch.ones(1, self.num_centers) for i in range(self.num_centers): dis = Euclidean_Distances(self.centers[i], self.centers) - dis = torch.sum(dis)/(self.num_centers) - self.sigma[0][i] = dis*self.beta + dis = torch.sum(dis) / (self.num_centers) + self.sigma[0][i] = dis * self.beta def window_update(self, data, label): - #The window is updated according to random sampling, and the first-in-first-out principle is adopted. + # The window is updated according to random sampling, and the first-in-first-out principle is adopted. if self.i == 0: - self.data_window = torch.zeros([self.window_size, data.size(1)], dtype=torch.float32) - self.label_window = torch.zeros([self.window_size, self.n_out], dtype=torch.float32) + self.data_window = torch.zeros( + [self.window_size, data.size(1)], dtype=torch.float32 + ) + self.label_window = torch.zeros( + [self.window_size, self.n_out], dtype=torch.float32 + ) self.label_index = torch.zeros((self.window_size, 1), dtype=torch.float32) - self.linear = nn.Sequential(nn.Linear(self.num_centers + data.size(1), self.n_out, bias=True) - , nn.Sigmoid()) + self.linear = nn.Sequential( + nn.Linear(self.num_centers + data.size(1), self.n_out, bias=True), + nn.Sigmoid(), + ) for i in range(data.size(0)): - - self.data_window = torch.cat([self.data_window[1:, :], data[i:i+1, :]], dim=0) - self.label_window = torch.cat([self.label_window[1:, :], label[i:i+1, :]], dim=0) + self.data_window = torch.cat( + [self.data_window[1:, :], data[i : i + 1, :]], dim=0 + ) + self.label_window = torch.cat( + [self.label_window[1:, :], label[i : i + 1, :]], dim=0 + ) if label[i] != -1: - self.label_index = torch.cat([self.label_index[1:, :], torch.ones(1, 1)], dim=0) + self.label_index = torch.cat( + [self.label_index[1:, :], torch.ones(1, 1)], dim=0 + ) else: - self.label_index = torch.cat([self.label_index[1:, :], torch.zeros(1, 1)], dim=0) + self.label_index = torch.cat( + [self.label_index[1:, :], torch.zeros(1, 1)], dim=0 + ) self.i = self.i + 1 if self.i == self.window_size: - index = torch.LongTensor(random.sample(range(self.data_window.size(0)), self.num_centers)) + index = torch.LongTensor( + random.sample(range(self.data_window.size(0)), self.num_centers) + ) self.centers = torch.index_select(self.data_window, 0, index) self.initialize_weights() @@ -110,17 +127,25 @@ def window_update(self, data, label): return update def center_adjustment(self): - #The samples are assigned to the nearest RBF centers, and then each center is updated according to the assigned samples. - distances = np.linalg.norm(self.data_window[:, np.newaxis] - self.centers, axis=2) + # The samples are assigned to the nearest RBF centers, and then each center is updated according to the assigned samples. 
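+ # Broadcasting sketch: data_window[:, np.newaxis] has shape (window, 1, d) and + # centers has shape (num_centers, d), so the norm over axis 2 yields a + # (window, num_centers) distance matrix.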
+ distances = np.linalg.norm( + self.data_window[:, np.newaxis] - self.centers, axis=2 + ) nearest_centers = np.argmin(distances, axis=1) - assigned_samples = [self.data_window[nearest_centers == i] for i in range(len(self.centers))] - assigned_labels = [self.label_window[nearest_centers == i] for i in range(len(self.centers))] - assigned_label_index = [self.label_index[nearest_centers == i] for i in range(len(self.centers))] + assigned_samples = [ + self.data_window[nearest_centers == i] for i in range(len(self.centers)) + ] + assigned_labels = [ + self.label_window[nearest_centers == i] for i in range(len(self.centers)) + ] + assigned_label_index = [ + self.label_index[nearest_centers == i] for i in range(len(self.centers)) + ] for i in range(self.num_centers): if len(assigned_samples) > 0: - unlabel_index = torch.squeeze(assigned_label_index[i] == 0., 1) - label_index = torch.squeeze(assigned_label_index[i] == 1., 1) + unlabel_index = torch.squeeze(assigned_label_index[i] == 0.0, 1) + label_index = torch.squeeze(assigned_label_index[i] == 1.0, 1) unlabel_sample = assigned_samples[i][unlabel_index] label_sample = assigned_samples[i][label_index] @@ -133,30 +158,42 @@ def center_adjustment(self): majorit_class = unique[np.argmax(counts)] minorit_class = unique[np.argmin(counts)] if majorit_class == minorit_class: - self.centers[i] = (torch.mean(unlabel_sample, axis=0) + torch.mean(label_sample, axis=0))/2 + self.centers[i] = ( + torch.mean(unlabel_sample, axis=0) + + torch.mean(label_sample, axis=0) + ) / 2 else: majorit_sample = label_sample[labels.flatten() == majorit_class] minorit_sample = label_sample[labels.flatten() == minorit_class] - a = (majorit_sample.sum(dim=0) + minorit_sample.sum(dim=0))/len(label_sample) + a = ( + majorit_sample.sum(dim=0) + minorit_sample.sum(dim=0) + ) / len(label_sample) b = torch.mean(unlabel_sample, axis=0) - c = ((len(majorit_sample) - len(minorit_sample))/len(label_sample)) + 1 - self.centers[i] = (a + b)/c + c = ( + (len(majorit_sample) - len(minorit_sample)) + / len(label_sample) + ) + 1 + self.centers[i] = (a + b) / c elif len(label_sample) > 0 and len(unlabel_sample == 0): unique, counts = np.unique(labels, return_counts=True) majorit_class = unique[np.argmax(counts)] minorit_class = unique[np.argmin(counts)] majorit_sample = label_sample[labels.flatten() == majorit_class] minorit_sample = label_sample[labels.flatten() == minorit_class] - a = (majorit_sample.sum(dim=0) + minorit_sample.sum(dim=0)) / len(label_sample) - c = ((len(majorit_sample) - len(minorit_sample)) / len(label_sample)) + a = (majorit_sample.sum(dim=0) + minorit_sample.sum(dim=0)) / len( + label_sample + ) + c = (len(majorit_sample) - len(minorit_sample)) / len(label_sample) self.centers[i] = a / c else: - self.centers[i] = self.data_window[torch.randint(self.data_window.shape[0], size=(1,))][0] + self.centers[i] = self.data_window[ + torch.randint(self.data_window.shape[0], size=(1,)) + ][0] self.update_sigma() def pseudo_label(self): - #Pseudo-labels for unlabeled samples are calculated based on the true labels of labeled samples and the output of the network on unlabeled samples. + # Pseudo-labels for unlabeled samples are calculated based on the true labels of labeled samples and the output of the network on unlabeled samples. 
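+ # A label-propagation style update: the Gaussian similarity S weights each + # neighbour, labels are averaged with those weights, and entries with true + # labels are kept unchanged.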
V = torch.cat([self.data_window, self.centers], dim=0) label = np.vstack((self.label_window, np.zeros((self.num_centers, 1)))) label_index = np.vstack((self.label_index, np.zeros((self.num_centers, 1)))) @@ -167,18 +204,19 @@ def pseudo_label(self): nearest_distances = np.sort(distances, axis=1)[:, 1] nearest_distances = self.gamma * nearest_distances.reshape(-1, 1) - S = np.exp(-1 * np.square(distances)/(nearest_distances+1e-8)) + S = np.exp(-1 * np.square(distances) / (nearest_distances + 1e-8)) y = np.where(label_index, label, pre.detach().numpy()) - U = np.dot(S, y)/np.sum(S, axis=1).reshape(-1, 1) + U = np.dot(S, y) / np.sum(S, axis=1).reshape(-1, 1) U = np.where(label_index, label, U) - self.plabel_window = torch.from_numpy(U[:len(U)-self.num_centers]) + self.plabel_window = torch.from_numpy(U[: len(U) - self.num_centers]) def return_window(self): - #Returns the samples, pseudo-labels and true labels within the windows. + # Returns the samples, pseudo-labels and true labels within the windows. return self.data_window, self.plabel_window, self.label_index + class def_loss(nn.Module): def __init__(self, model, lam=0.3, alpha=0.2): super().__init__() @@ -190,14 +228,14 @@ def L2loss(self): # l2 regularization on the network weights. l2_loss = torch.tensor(0.0, requires_grad=True) for name, parma in self.model.named_parameters(): - if 'bias' not in name: + if "bias" not in name: l2_loss = l2_loss + (0.5 * torch.sum(torch.pow(parma, 2))) return l2_loss def forward(self, y_pred, y_true, label_index): - #Computes supervised loss for labeled samples and unsupervised loss for unlabeled samples. - labeled = torch.squeeze(label_index == 1., 1) - unlabeled = torch.squeeze(label_index == 0., 1) + # Computes supervised loss for labeled samples and unsupervised loss for unlabeled samples. 
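+ # label_index == 1 marks rows with true labels; both terms below are binary + # cross-entropy, with the pseudo-label term weighted by lam and the L2 + # penalty by alpha.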
+ labeled = torch.squeeze(label_index == 1.0, 1) + unlabeled = torch.squeeze(label_index == 0.0, 1) y_pred_labeled = y_pred[labeled] y_true_label = y_true[labeled] @@ -205,17 +243,24 @@ def forward(self, y_pred, y_true, label_index): y_pred_unlabeled = y_pred[unlabeled] y_sudo_unlabeled = y_true[unlabeled] - first_item = -torch.mean(y_true_label * torch.log(y_pred_labeled + 1e-8) + (1 - y_true_label) * torch.log(1 - y_pred_labeled + 1e-8)) - second_item = -torch.mean(y_sudo_unlabeled * torch.log(y_pred_unlabeled + 1e-8) + (1 - y_sudo_unlabeled) * torch.log(1 - y_pred_unlabeled + 1e-8)) + first_item = -torch.mean( + y_true_label * torch.log(y_pred_labeled + 1e-8) + + (1 - y_true_label) * torch.log(1 - y_pred_labeled + 1e-8) + ) + second_item = -torch.mean( + y_sudo_unlabeled * torch.log(y_pred_unlabeled + 1e-8) + + (1 - y_sudo_unlabeled) * torch.log(1 - y_pred_unlabeled + 1e-8) + ) l2_loss = self.L2loss() / len(y_pred) - loss = first_item + self.lam*second_item + self.alpha*l2_loss + loss = first_item + self.lam * second_item + self.alpha * l2_loss return loss class OSNN(ClassifierSSL): def __init__( self, + schema=None, num_center=10, n_out=1, window_size=200, diff --git a/src/capymoa/stream/PytorchStream.py b/src/capymoa/stream/PytorchStream.py index 3d664c4f..cf94a494 100644 --- a/src/capymoa/stream/PytorchStream.py +++ b/src/capymoa/stream/PytorchStream.py @@ -1,11 +1,8 @@ -from jpype import JObject - -import numpy as np import torch from capymoa.stream import Stream, Schema -from capymoa.stream.stream import _init_moa_stream_and_create_moa_header -from capymoa.stream.instance import ( +from capymoa.stream._stream import _init_moa_stream_and_create_moa_header +from capymoa.instance import ( LabeledInstance, RegressionInstance, ) diff --git a/src/capymoa/stream/__init__.py b/src/capymoa/stream/__init__.py index 2e5c4cb0..dc19e74e 100644 --- a/src/capymoa/stream/__init__.py +++ b/src/capymoa/stream/__init__.py @@ -1,11 +1,4 @@ -from .stream import ( - Stream, - Schema, - ARFFStream, - stream_from_file, - CSVStream -) -from .generator import RandomTreeGenerator +from ._stream import Stream, Schema, ARFFStream, stream_from_file, CSVStream from .PytorchStream import PytorchStream __all__ = [ @@ -13,7 +6,6 @@ "Schema", "stream_from_file", "ARFFStream", - "RandomTreeGenerator", "PytorchStream", - "CSVStream" + "CSVStream", ] diff --git a/src/capymoa/stream/stream.py b/src/capymoa/stream/_stream.py similarity index 86% rename from src/capymoa/stream/stream.py rename to src/capymoa/stream/_stream.py index c8900b5d..15027262 100644 --- a/src/capymoa/stream/stream.py +++ b/src/capymoa/stream/_stream.py @@ -15,7 +15,7 @@ # MOA/Java imports -from capymoa.stream.instance import ( +from capymoa.instance import ( Instance, LabeledInstance, RegressionInstance, @@ -35,9 +35,9 @@ class Schema: """ def __init__(self, moa_header: InstancesHeader): - """Construct a schema by wrapping a :class:`InstancesHeader`. + """Construct a schema by wrapping a ``InstancesHeader``. - To create a schema without an :class:`InstancesHeader` use + To create a schema without an ``InstancesHeader`` use :meth:`from_custom` method. :param moa_header: A Java MOA header object. @@ -218,7 +218,7 @@ def __init__( ): """Construct a Stream from a MOA stream object. - Usually, you will want to construct a Stream using the :func:`stream_from_file` + Usually, you will want to construct a Stream using the :func:`capymoa.stream.stream_from_file` function. :param moa_stream: The MOA stream object to read instances from. 
Is None @@ -429,7 +429,10 @@ def stream_from_file( targets = targets.astype(int) x_features = x_features[:, :-1] return NumpyStream( - x_features, targets, dataset_name=dataset_name, enforce_regression=enforce_regression + x_features, + targets, + dataset_name=dataset_name, + enforce_regression=enforce_regression, ) @@ -571,18 +574,20 @@ def _add_instances_to_moa_stream(moa_stream, moa_header, X, y): moa_stream.add(instance) -class CSVStream(Stream): - def __init__(self, - csv_file_path, - dtypes: list = None, # [('column1', np.float64), ('column2', np.int32), ('column3', np.float64), ('column3', str)] reads nomonal attributes as str - values_for_nominal_features={}, # {i: [1,2,3], k: [Aa, BB]}. Key is integer. Values are turned into strings - class_index: int = -1, - values_for_class_label: list = None, - target_attribute_name=None, - enforce_regression=False, - skip_header: bool = False, - delimiter=','): +class CSVStream(Stream): + def __init__( + self, + csv_file_path, + dtypes: list = None, # [('column1', np.float64), ('column2', np.int32), ('column3', np.float64), ('column3', str)] reads nomonal attributes as str + values_for_nominal_features={}, # {i: [1,2,3], k: [Aa, BB]}. Key is integer. Values are turned into strings + class_index: int = -1, + values_for_class_label: list = None, + target_attribute_name=None, + enforce_regression=False, + skip_header: bool = False, + delimiter=",", + ): self.csv_file_path = csv_file_path self.values_for_nominal_features = values_for_nominal_features self.class_index = class_index @@ -592,56 +597,96 @@ def __init__(self, self.skip_header = skip_header self.delimiter = delimiter - self.dtypes = [] # [('column1', np.float64), ('column2', np.int32), ('column3', np.float64), ('column3', str)] reads nomonal attributes as str - if dtypes is None or len(dtypes) == 0: # data definition for each column not provided - if len(self.values_for_nominal_features) == 0: # data definition for nominal features are given + self.dtypes = ( + [] + ) # [('column1', np.float64), ('column2', np.int32), ('column3', np.float64), ('column3', str)] reads nomonal attributes as str + if ( + dtypes is None or len(dtypes) == 0 + ): # data definition for each column not provided + if ( + len(self.values_for_nominal_features) == 0 + ): # data definition for nominal features are given # need to infer number of columns, then generate full data definition using nominal information # LOADS FIRST TWO ROWS INTO THE MEMORY - data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=None, names=True, - skip_header=0, max_rows=2) + data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=None, + names=True, + skip_header=0, + max_rows=2, + ) if not self.enforce_regression and self.values_for_class_label is None: # LOADS THE FULL FILE INTO THE MEMORY - data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=None, names=True, - skip_header=1 if skip_header else 0) + data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=None, + names=True, + skip_header=1 if skip_header else 0, + ) y = data[data.dtype.names[self.class_index]] self.values_for_class_label = [str(value) for value in np.unique(y)] for i, data_info in enumerate(data.dtype.descr): column_name, data_type = data_info - if self.values_for_nominal_features.get(i) is not None: # i is in nominal feature keys - self.dtypes.append((column_name, 'str')) + if ( + self.values_for_nominal_features.get(i) is not None + ): # i is in nominal feature keys + 
self.dtypes.append((column_name, "str")) else: self.dtypes.append((column_name, data_type)) - else: # need to infer data definitions + else: # need to infer data definitions # LOADS THE FULL FILE INTO THE MEMORY - data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=None, names=True, - skip_header=1 if skip_header else 0) + data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=None, + names=True, + skip_header=1 if skip_header else 0, + ) self.dtypes = data.dtype if not self.enforce_regression and self.values_for_class_label is None: y = data[data.dtype.names[self.class_index]] self.values_for_class_label = [str(value) for value in np.unique(y)] - else: # data definition for each column are provided + else: # data definition for each column are provided self.dtypes = dtypes self.total_number_of_lines = 0 if self.skip_header: self.n_lines_to_skip = 1 else: - row1_data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=None, names=True, skip_header=0,max_rows=1) - row2_data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=None, names=True, skip_header=1, max_rows=1) + row1_data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=None, + names=True, + skip_header=0, + max_rows=1, + ) + row2_data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=None, + names=True, + skip_header=1, + max_rows=1, + ) if row1_data.dtype.names != row2_data.dtype.names: self.n_lines_to_skip = 1 else: self.n_lines_to_skip = 0 - self.__moa_stream_with_only_header, self.moa_header = _init_moa_stream_and_create_moa_header( - number_of_instances=1, # we only need this to initialize the MOA header - feature_names = [data_info[0] for data_info in self.dtypes], - values_for_nominal_features = self.values_for_nominal_features, - values_for_class_label = self.values_for_class_label, + self.__moa_stream_with_only_header, self.moa_header = ( + _init_moa_stream_and_create_moa_header( + number_of_instances=1, # we only need this to initialize the MOA header + feature_names=[data_info[0] for data_info in self.dtypes], + values_for_nominal_features=self.values_for_nominal_features, + values_for_class_label=self.values_for_class_label, dataset_name="CSVDataset", - target_attribute_name = self.target_attribute_name, - enforce_regression = self.enforce_regression, + target_attribute_name=self.target_attribute_name, + enforce_regression=self.enforce_regression, ) + ) self.schema = Schema(moa_header=self.moa_header) super().__init__(schema=self.schema, CLI=None, moa_stream=None) @@ -660,15 +705,32 @@ def next_instance(self): if not self.has_more_instances(): return None # skip header - data = np.genfromtxt(self.csv_file_path, delimiter=self.delimiter, dtype=self.dtypes, names=None, skip_header=self.n_lines_to_skip, max_rows=1) + data = np.genfromtxt( + self.csv_file_path, + delimiter=self.delimiter, + dtype=self.dtypes, + names=None, + skip_header=self.n_lines_to_skip, + max_rows=1, + ) # np.genfromtxt() returns a structured https://numpy.org/doc/stable/user/basics.rec.html#structured-arrays self.n_lines_to_skip += 1 # data = np.expand_dims(data, axis=0) # y = data[[data.dtype.names[self.class_index]]].view('i4') - y = rfn.structured_to_unstructured(data[[data.dtype.names[self.class_index]]])[0] + y = rfn.structured_to_unstructured(data[[data.dtype.names[self.class_index]]])[ + 0 + ] # X = data[[item for item in data.dtype.names if item != data.dtype.names[self.class_index]]].view('f4') - X = 
rfn.structured_to_unstructured(data[[item for item in data.dtype.names if item != data.dtype.names[self.class_index]]]) + X = rfn.structured_to_unstructured( + data[ + [ + item + for item in data.dtype.names + if item != data.dtype.names[self.class_index] + ] + ] + ) if self.schema.is_classification(): return LabeledInstance.from_array(self.schema, X, y) @@ -688,4 +750,4 @@ def get_moa_stream(self): def restart(self): self.total_number_of_lines = 0 - self.n_lines_to_skip = 1 if self.skip_header else 0 \ No newline at end of file + self.n_lines_to_skip = 1 if self.skip_header else 0 diff --git a/src/capymoa/stream/drift.py b/src/capymoa/stream/drift.py index f60a77c7..81451b0c 100644 --- a/src/capymoa/stream/drift.py +++ b/src/capymoa/stream/drift.py @@ -2,7 +2,7 @@ import re -from capymoa.stream.stream import Stream +from capymoa.stream._stream import Stream from capymoa._utils import _get_moa_creation_CLI from moa.streams import ConceptDriftStream as MOA_ConceptDriftStream diff --git a/tasks.py b/tasks.py index fc4e71fe..7c3a9d65 100644 --- a/tasks.py +++ b/tasks.py @@ -34,13 +34,14 @@ def all_exist(files: List[str] = None, directories: List[str] = None) -> bool: def docs_build(ctx: Context, ignore_warnings: bool = False): """Build the documentation using Sphinx.""" warn = "-W" if not ignore_warnings else "" + nitpicky = "-n" if not ignore_warnings else "" doc_dir = Path("docs/_build") doc_dir.mkdir(exist_ok=True, parents=True) cpu = cpu_count() // 2 print("Building documentation...") - ctx.run(f"python -m sphinx build {warn} --color -E -b html docs {doc_dir}") + ctx.run(f"python -m sphinx build {warn} {nitpicky} --color -E -b html docs {doc_dir}") print("-" * 80) print("Documentation is built and available at:") diff --git a/test_utility/ssl_helpers.py b/test_utility/ssl_helpers.py deleted file mode 100644 index 560ae67a..00000000 --- a/test_utility/ssl_helpers.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -from capymoa.evaluation.evaluation import prequential_SSL_evaluation -from capymoa.learner import ClassifierSSL -from capymoa.stream import Stream - -def assert_ssl_evaluation( - learner: ClassifierSSL, - stream: Stream, - expectation: float, - label_probability: float = 0.01, - max_instances: int = 1000, -): - results = prequential_SSL_evaluation( - stream=stream, - learner=learner, - label_probability=label_probability, - window_size=10, - max_instances=max_instances, - ) - - assert results["cumulative"].accuracy() == pytest.approx(expectation), \ - f"Expected accuracy of {expectation} but got {results['cumulative'].accuracy()}" + \ - f" for learner {learner} on stream {stream}" diff --git a/tests/test_CPSSDS.py b/tests/test_CPSSDS.py deleted file mode 100644 index 81cd87cc..00000000 --- a/tests/test_CPSSDS.py +++ /dev/null @@ -1,28 +0,0 @@ -from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny -from capymoa.learner.ssl.classifier.CPSSDS import CPSSDS -from test_utility.ssl_helpers import assert_ssl_evaluation -import pytest - - -@pytest.mark.parametrize( - "learner, stream, expectation", - [ - ("NaiveBayes", ElectricityTiny(), 76.6), - ("HoeffdingTree", ElectricityTiny(), 66.2), - ("NaiveBayes", CovtypeTiny(), 55.7), - ("HoeffdingTree", CovtypeTiny(), 53.3), - ], - ids=[ - "ElectricityTiny-NaiveBayes", - "ElectricityTiny-HoeffdingTree", - "CovtypeTiny-NaiveBayes", - "CovtypeTiny-HoeffdingTree", - ], -) -def test_CPSSDS(learner, stream, expectation): - assert_ssl_evaluation( - CPSSDS(learner, 100, schema=stream.schema), - stream, - expectation, - 
label_probability=0.5, - ) diff --git a/tests/test_OSNN.py b/tests/test_OSNN.py deleted file mode 100644 index a75c0556..00000000 --- a/tests/test_OSNN.py +++ /dev/null @@ -1,23 +0,0 @@ -from capymoa.datasets.datasets import ElectricityTiny, CovtypeTiny -from test_utility.ssl_helpers import assert_ssl_evaluation -import pytest -import importlib - -@pytest.mark.parametrize( - "stream, expectation", - [ - (ElectricityTiny(), 46.1), - (CovtypeTiny(), 26.3), - ], - ids=["ElectricityTiny", "CovtypeTiny"], -) -def test_OSNN(stream, expectation): - pytest.importorskip("torch.nn", reason="PyTorch not installed. Skipping test.") - OSNN = importlib.import_module("capymoa.learner.ssl.classifier.OSNN").OSNN - # The optimizer steps are set to 10 to speed up the test - learner = OSNN(optim_steps=10) - assert_ssl_evaluation( - learner, - stream, - expectation, - ) diff --git a/tests/test_batch.py b/tests/test_batch.py index ad72451c..b2fc7206 100644 --- a/tests/test_batch.py +++ b/tests/test_batch.py @@ -1,6 +1,6 @@ -from capymoa.datasets.datasets import ElectricityTiny -from capymoa.learner.ssl.classifier.batch import BatchClassifierSSL -from capymoa.stream.stream import Schema, NumpyStream +from capymoa.datasets._datasets import ElectricityTiny +from capymoa.ssl.classifier._batch import BatchClassifierSSL +from capymoa.stream._stream import Schema, NumpyStream from capymoa.evaluation.evaluation import prequential_SSL_evaluation import numpy as np diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index ac8599a3..b619d227 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -1,22 +1,23 @@ from capymoa.evaluation import ClassificationEvaluator, ClassificationWindowedEvaluator -from capymoa.learner.classifier import ( +from capymoa.classifier import ( EFDT, HoeffdingTree, AdaptiveRandomForest, OnlineBagging, NaiveBayes, ) -from capymoa.learner import Classifier, MOAClassifier +from capymoa.base import Classifier +from capymoa.base import MOAClassifier from capymoa.datasets import ElectricityTiny import pytest from functools import partial from typing import Callable, Optional -from capymoa.learner.learners import _extract_moa_learner_CLI -from capymoa.learner.splitcriteria import InfoGainSplitCriterion +from capymoa.base import _extract_moa_learner_CLI +from capymoa.splitcriteria import InfoGainSplitCriterion -from capymoa.stream.stream import Schema +from capymoa.stream._stream import Schema -from capymoa.learner.classifier.sklearn import PassiveAggressiveClassifier +from capymoa.classifier import PassiveAggressiveClassifier @pytest.mark.parametrize( @@ -34,8 +35,14 @@ ), (partial(NaiveBayes), 84.0, 91.0, None), ], - ids=["OnlineBagging", "AdaptiveRandomForest", "HoeffdingTree", "EFDT", "EFDT_gini", "NaiveBayes"], - + ids=[ + "OnlineBagging", + "AdaptiveRandomForest", + "HoeffdingTree", + "EFDT", + "EFDT_gini", + "NaiveBayes", + ], ) def test_classifiers( learner_constructor: Callable[[Schema], Classifier], @@ -62,7 +69,6 @@ def test_classifiers( ) learner: Classifier = learner_constructor(schema=stream.get_schema()) - # learner = learner_constructor(schema=stream.get_schema()) while stream.has_more_instances(): instance = stream.next_instance() @@ -71,6 +77,7 @@ def test_classifiers( win_evaluator.update(instance.y_index, prediction) learner.train(instance) + # Check if the accuracy matches the expected value for both evaluator types actual_acc = evaluator.accuracy() actual_win_acc = win_evaluator.accuracy() assert actual_acc == pytest.approx( @@ -80,11 +87,7 @@ def 
test_classifiers(
         win_accuracy, abs=0.1
     ), f"Windowed Eval: Expected accuracy of {win_accuracy:0.1f} got {actual_win_acc:0.1f}"
 
+    # Optionally check the CLI string if it was provided
     if isinstance(learner, MOAClassifier) and cli_string is not None:
         cli_str = _extract_moa_learner_CLI(learner).strip("()")
-        assert (
-            cli_str == cli_string
-        ), "CLI does not match expected value"
-
-        # assert evaluator.accuracy() == pytest.approx(accuracy, abs=0.1)
-        # assert win_evaluator.accuracy() == pytest.approx(win_accuracy, abs=0.1)
+        assert cli_str == cli_string, "CLI does not match expected value"
diff --git a/tests/test_regressors.py b/tests/test_regressors.py
index de601c10..14ba21ca 100644
--- a/tests/test_regressors.py
+++ b/tests/test_regressors.py
@@ -1,6 +1,6 @@
 from capymoa.evaluation import RegressionEvaluator, RegressionWindowedEvaluator
 from capymoa.datasets import Fried
-from capymoa.learner.regressor import (
+from capymoa.regressor import (
     KNNRegressor,
     AdaptiveRandomForestRegressor,
     FIMTDD,
@@ -12,7 +12,7 @@
 import pytest
 from functools import partial
 
-from capymoa.learner import Regressor
+from capymoa.base import Regressor
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_ssl_classifiers.py b/tests/test_ssl_classifiers.py
new file mode 100644
index 00000000..79cec583
--- /dev/null
+++ b/tests/test_ssl_classifiers.py
@@ -0,0 +1,63 @@
+from capymoa.datasets._datasets import ElectricityTiny, CovtypeTiny
+from capymoa.ssl.classifier import OSNN, CPSSDS
+import pytest
+from capymoa.evaluation.evaluation import prequential_SSL_evaluation
+from capymoa.base import ClassifierSSL
+from capymoa.stream import Stream
+from functools import partial
+
+
+def assert_ssl_evaluation(
+    learner: ClassifierSSL,
+    stream: Stream,
+    expectation: float,
+    label_probability: float = 0.01,
+    max_instances: int = 1000,
+):
+    results = prequential_SSL_evaluation(
+        stream=stream,
+        learner=learner,
+        label_probability=label_probability,
+        window_size=10,
+        max_instances=max_instances,
+    )
+
+    assert results["cumulative"].accuracy() == pytest.approx(expectation), (
+        f"Expected accuracy of {expectation} but got {results['cumulative'].accuracy()}"
+        + f" for learner {learner} on stream {stream}"
+    )
+
+
+@pytest.mark.parametrize(
+    "learner_constructor, stream_constructor, expectation, label_probability",
+    [
+        (partial(OSNN, optim_steps=10), ElectricityTiny, 46.1, None),
+        (partial(OSNN, optim_steps=10), CovtypeTiny, 26.3, None),
+        (partial(CPSSDS, batch_size=100, base_model="NaiveBayes"), ElectricityTiny, 76.6, 0.5),
+        (partial(CPSSDS, batch_size=100, base_model="HoeffdingTree"), ElectricityTiny, 66.2, 0.5),
+        (partial(CPSSDS, batch_size=100, base_model="NaiveBayes"), CovtypeTiny, 55.7, 0.5),
+        (partial(CPSSDS, batch_size=100, base_model="HoeffdingTree"), CovtypeTiny, 53.3, 0.5),
+    ],
+    ids=[
+        "OSNN_ElectricityTiny",
+        "OSNN_CovtypeTiny",
+        "CPSSDS_ElectricityTiny-NaiveBayes",
+        "CPSSDS_ElectricityTiny-HoeffdingTree",
+        "CPSSDS_CovtypeTiny-NaiveBayes",
+        "CPSSDS_CovtypeTiny-HoeffdingTree",
+    ],
+)
+def test_ssl_classifiers(learner_constructor, stream_constructor, expectation, label_probability):
+    # OSNN's optim_steps is set to 10 in the parametrization above to speed up the test
+    stream = stream_constructor()
+    learner = learner_constructor(schema=stream.get_schema())
+
+    if label_probability is None:
+        label_probability = 0.01
+
+    assert_ssl_evaluation(
+        learner,
+        stream,
+        expectation,
+        label_probability=label_probability,
+    )
diff --git a/tests/test_stream.py b/tests/test_stream.py
index 4801bd90..eb5356d6 100644
--- a/tests/test_stream.py
+++ b/tests/test_stream.py
@@ -1,5 +1,6 @@
 """This module is for testing the speeds of different stream implementations.
 """
+
 import time
 from capymoa.stream import stream_from_file
 from cProfile import Profile
@@ -7,33 +8,41 @@
 import numpy as np
 
 from capymoa.stream import Stream
-from capymoa.stream.instance import Instance
-from capymoa.stream.stream import CSVStream
+from capymoa.instance import Instance
+from capymoa.stream._stream import CSVStream
 import csv
 
+
 def _get_streams() -> List[Stream]:
     return [
         stream_from_file("data/electricity_tiny.csv"),
         stream_from_file("data/electricity_tiny.arff"),
-        CSVStream("data/electricity_tiny.csv")
+        CSVStream("data/electricity_tiny.csv"),
     ]
 
+
def test_stream_consistency():
     streams = _get_streams()
 
     def _has_more_instance():
         return [stream.has_more_instances() for stream in streams]
-    
+
     def _next_instance():
         return [stream.next_instance() for stream in streams]
-    
+
     i = 0
     while any(_has_more_instance()):
-        assert all(_has_more_instance()), "Not all streams have the same number of instances"
+        assert all(
+            _has_more_instance()
+        ), "Not all streams have the same number of instances"
         i += 1
         instances = _next_instance()
         prototype = instances.pop()
         for instance in instances:
-            assert np.allclose(prototype.x, instance.x), f"Streams are not consistent at instance {i}"
-            assert prototype.y_index == instance.y_index, f"Streams are not consistent at instance {i}"
+            assert np.allclose(
+                prototype.x, instance.x
+            ), f"Streams are not consistent at instance {i}"
+            assert (
+                prototype.y_index == instance.y_index
+            ), f"Streams are not consistent at instance {i}"
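
For readers unfamiliar with the structured-array pattern that the reformatted CSVStream.next_instance code above relies on, here is a minimal, self-contained sketch (not part of the patch) of the same technique in isolation: read one CSV row with np.genfromtxt and split it into a feature vector X and a target y using numpy.lib.recfunctions.structured_to_unstructured. The file name, header layout, and class_index below are hypothetical; only these two standard NumPy APIs are assumed.

import numpy as np
from numpy.lib import recfunctions as rfn

# Read a single data row as a structured array; names=True takes the field
# names from the header line. "example.csv" is a hypothetical file with a
# header such as "nswprice,vicprice,class".
row = np.genfromtxt(
    "example.csv",  # hypothetical path
    delimiter=",",
    dtype=None,
    names=True,
    max_rows=1,
)

class_index = -1  # assume the target is the last column
target_name = row.dtype.names[class_index]

# Select only the target field and flatten it to a scalar, mirroring the
# y = rfn.structured_to_unstructured(...)[0] line in next_instance above.
y = rfn.structured_to_unstructured(row[[target_name]])[0]

# Select every other field to build the plain (unstructured) feature vector.
X = rfn.structured_to_unstructured(
    row[[name for name in row.dtype.names if name != target_name]]
)

Multi-field indexing (row[[...]]) keeps the result structured, which is why the explicit structured_to_unstructured conversion is needed before the values can be handed to LabeledInstance.from_array as flat numeric arrays.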