feat: Make Cross Validation optional (#52)
- [x] k-Fold Cross Validation is now optional; the classic train-validation split is the default (see the sketch below)
- [x] Simplify model handler names
- [x] Add CIFAR10 GeneticCNN example (section 4.2 of the paper)
- [x] Log all individuals evaluated (#50)
- [x] Do not spam messages when validating parameters, do it once
- [x] Add test cases for models.base
gmontamat authored Sep 22, 2024
1 parent ebe0c4b commit 18edb15
Showing 18 changed files with 389 additions and 151 deletions.
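
The headline change at a glance — a minimal sketch, assuming the `Population` call signature shown in the README diff below and the `kfold` → `folds` rename visible in `examples/scikit_iris.py`:

```python
from gentun.models.xgboost import XGBoost
from gentun.populations import Population

# genes, train/test data, and kwargs as defined in the README examples below

# New default: a classic train-validation split, with the validation set
# passed alongside the train data (as the GeneticCNN examples now do)
population = Population(genes, XGBoost, 50, x_train, y_train, x_test, y_test, **kwargs)

# k-fold cross validation is now opt-in, here via the renamed `folds`
# parameter (name confirmed for the Sklearn handler in examples/scikit_iris.py)
population = Population(genes, XGBoost, 50, x_train, y_train, folds=5, **kwargs)
```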
2 changes: 2 additions & 0 deletions .gitignore
@@ -164,3 +164,5 @@ cython_debug/
# Dataset files
examples/iris.data
examples/mnist.npz
+examples/cifar-10-python.tar.gz
+examples/cifar-10-batches-py/
26 changes: 13 additions & 13 deletions README.md
@@ -3,7 +3,7 @@
<br />
<div align="center">
<a href="https://github.com/gmontamat/gentun">
<img alt="plugin-icon" src="assets/icon.png">
<img alt="plugin-icon" src="https://github.com/gmontamat/gentun/blob/develop/assets/icon.png?raw=true">
</a>
<h1 style="margin: 0;" align="center">gentun</h1>
<p>
@@ -113,7 +113,7 @@ genes = [
]
```

-We are using the `gentun.models.xgboost.XGBoostCV` handler, which performs k-fold cross validation with available train
+We are using the `gentun.models.xgboost.XGBoost` handler, which performs k-fold cross validation with available train
data and returns an average metric over the folds. Thus, we need to define some static parameters which are shared
across the population over all generations:

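(The README's own parameter block is collapsed in this view; the sketch below is illustrative only — `folds` mirrors the rename in `examples/scikit_iris.py`, and the other keys are standard XGBoost parameters, not necessarily the README's exact values.)

```python
kwargs = {
    "booster": "gbtree",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",  # averaged over the folds and minimized below
    "folds": 5,             # assumed cross-validation knob after this commit
}
```
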
@@ -134,11 +134,11 @@ passed either through genes or keyword arguments.

```python
from gentun.algorithms import Tournament
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.populations import Population

# Run the genetic algorithm with a population of 50 for 100 generations
-population = Population(genes, XGBoostCV, 50, x_train, y_train, **kwargs)
+population = Population(genes, XGBoost, 50, x_train, y_train, **kwargs)
algorithm = Tournament(population)
algorithm.run(100, maximize=False)
```
@@ -159,7 +159,7 @@ population. You can add custom individuals to the population before running the
an intuition of which hyperparameters work well with your model:

```python
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.populations import Population


@@ -171,7 +171,7 @@ hyperparams = {
}

# Generate a random population and then add a custom individual
-population = Population(genes, XGBoostCV, 49, x_train, y_train, **kwargs)
+population = Population(genes, XGBoost, 49, x_train, y_train, **kwargs)
population.add_individual(hyperparams)
```

@@ -183,7 +183,7 @@ method, so that uniformly distributed hyperparameter values are obtained with it

```python
from gentun.genes import RandomChoice, RandomLogUniform
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.populations import Grid


@@ -196,7 +196,7 @@ genes = [
gene_samples = [10, 8, 11] # How many samples we want to get from each gene

# Generate a grid of individuals
-population = Grid(genes, XGBoostCV, gene_samples, x_train, y_train, **kwargs)
+population = Grid(genes, XGBoost, gene_samples, x_train, y_train, **kwargs)
```

Running the genetic algorithm on this population for just one generation is equivalent to doing a grid search over 10 × 8 × 11 = 880 hyperparameter combinations.
@@ -227,12 +227,12 @@ processes. Once this is done, the mutation and reproduction steps are run by the
produced.

```python
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.services import RedisController

controller = RedisController("experiment", host="localhost", port=6379)
# ... define genes
-population = Population(genes, XGBoostCV, 100, controller=controller, **kwargs)
+population = Population(genes, XGBoost, 100, controller=controller, **kwargs)
# ... run algorithm
```

@@ -243,10 +243,10 @@ its `run()` method with train data to begin processing jobs from the queue. You
as they have network access to the redis server.

```python
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.services import RedisWorker

worker = RedisWorker("experiment", XGBoostCV, host="localhost", port=6379)
worker = RedisWorker("experiment", XGBoost, host="localhost", port=6379)

# ... fetch x_train and y_train
worker.run(x_train, y_train)
@@ -264,7 +264,7 @@ This project supports hyperparameter tuning for the following models:
## Contributing

We welcome contributions to enhance this library. You can submit your custom subclasses for:
-- [`gentun.models.Handler`](src/gentun/models/base.py#L9-L25)
+- [`gentun.models.Handler`](src/gentun/models/base.py#L11-L30)
- [`gentun.genes.Gene`](src/gentun/genes.py#L11-L47)

Our roadmap includes:
73 changes: 73 additions & 0 deletions examples/geneticcnn_cifar10.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Implementation of Genetic CNN on CIFAR-10 data. This is a replica of
the algorithm described in section 4.2 of the Genetic CNN paper.
http://arxiv.org/pdf/1703.01513
"""

import os
import pickle
from typing import Tuple

import numpy as np
from sklearn.model_selection import train_test_split

from gentun.algorithms import RussianRoulette
from gentun.genes import Binary
from gentun.models.tensorflow import GeneticCNN
from gentun.populations import Population


def load_cifar10(data_dir: str, test_size: int = 10000) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Load, sample, one-hot encode, and normalize CIFAR-10."""

def unpickle(file_name: str) -> dict:
with open(file_name, "rb") as fo:
data = pickle.load(fo, encoding="bytes")
return data

x_train = []
y_raw = []
for batch in range(1, 6):
batch_file = os.path.join(data_dir, f"data_batch_{batch}")
batch_data = unpickle(batch_file)
x_train.append(batch_data[b"data"])
y_raw.extend(batch_data[b"labels"])
x_train = np.concatenate(x_train).astype(np.float32)
x_train = np.moveaxis(x_train.reshape(-1, 3, 32, 32), 1, -1) / 255
y_raw = np.array(y_raw, dtype=np.int32)
# One-hot encode the output
y_train = np.zeros((y_raw.size, 10))
y_train[np.arange(y_raw.size), y_raw] = 1
return train_test_split(x_train, y_train, test_size=test_size, shuffle=True, stratify=y_raw)


if __name__ == "__main__":
# Genetic CNN static parameters
kwargs = {
"nodes": (3, 4, 5),
"input_shape": (32, 32, 3),
"kernels_per_layer": (8, 16, 32),
"kernel_sizes": ((5, 5), (5, 5), (5, 5)),
"pool_sizes": ((3, 3), (3, 3), (3, 3)),
"dense_units": 128,
"dropout_probability": 0.5,
"classes": 10,
"epochs": (120, 60, 40, 20),
"learning_rate": (1e-2, 1e-3, 1e-4, 1e-5),
"batch_size": 32, # Not mentioned in the paper, but 32 is a good default for most cases
"plot": False, # if True, graphviz needs to be installed on your system
}
# Genetic CNN hyperparameters
genes = [Binary(f"S_{i + 1}", int(K_s * (K_s - 1) / 2)) for i, K_s in enumerate(kwargs["nodes"])]

x_train, x_test, y_train, y_test = load_cifar10("cifar-10-batches-py")
population = Population(genes, GeneticCNN, 20, x_train, y_train, x_test, y_test, **kwargs)
algorithm = RussianRoulette(
population,
crossover_probability=0.2, # p_C
crossover_rate=0.2, # q_C
mutation_probability=0.8, # p_M
mutation_rate=0.05, # q_M
)
algorithm.run(50)
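
A quick sanity check on the `Binary` gene sizes defined above: a stage with `K_s` nodes encodes one bit per pair of nodes, i.e. `K_s * (K_s - 1) / 2` bits, so `nodes=(3, 4, 5)` yields genes of 3, 6, and 10 bits:

```python
# Gene lengths implied by the pairwise-connection encoding above
nodes = (3, 4, 5)
lengths = [k * (k - 1) // 2 for k in nodes]
assert lengths == [3, 6, 10]  # 19 bits in total
```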
29 changes: 12 additions & 17 deletions examples/geneticcnn_mnist.py
@@ -1,27 +1,23 @@
#!/usr/bin/env python
"""
-Implementation of Genetic CNN on MNIST data.
-This is a replica of the algorithm described
-on section 4.1.1 of the Genetic CNN paper.
+Implementation of Genetic CNN on MNIST data. This is a replica of the
+algorithm described in section 4.1 of the Genetic CNN paper.
http://arxiv.org/pdf/1703.01513
"""

-import random
from typing import Tuple

import numpy as np
+from sklearn.model_selection import train_test_split

from gentun.algorithms import RussianRoulette
from gentun.genes import Binary
from gentun.models.tensorflow import GeneticCNN
from gentun.populations import Population


-def load_mnist(file_name: str, sample_size: int = 10000) -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Load, sample, one-hot encode,
-    and normalize MNIST dataset.
-    """
+def load_mnist(file_name: str, test_size: int = 10000) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Load, split, one-hot encode, and normalize MNIST."""
mnist = np.load(file_name)
x = mnist["x_train"].reshape(mnist["x_train"].shape[:-2] + (-1,))
y_raw = mnist["y_train"]
@@ -32,9 +28,8 @@ def load_mnist(file_name: str, sample_size: int = 10000) -> Tuple[np.ndarray, np
# One-hot encode the output
y = np.zeros((size, 10))
y[np.arange(size), y_raw] = 1
-    # TODO: stratified selection or random (check paper)?
-    selection = random.sample(range(size), sample_size)
-    return x[selection], y[selection]
+    # Split the data into training and test sets, stratified by y
+    return train_test_split(x, y, test_size=test_size, shuffle=True, stratify=y_raw)


if __name__ == "__main__":
@@ -44,20 +39,20 @@ def load_mnist(file_name: str, sample_size: int = 10000) -> Tuple[np.ndarray, np
"input_shape": (28, 28, 1),
"kernels_per_layer": (20, 50),
"kernel_sizes": ((5, 5), (5, 5)),
"pool_sizes": ((2, 2), (2, 2)),
"dense_units": 500,
"dropout_probability": 0.5,
"classes": 10,
"kfold": 5,
"epochs": (20, 4, 1),
"learning_rate": (1e-3, 1e-4, 1e-5),
"batch_size": 32,
"plot": True,
"batch_size": 32, # Not mentioned in the paper, but 32 is a good default for most cases
"plot": False, # if True, graphviz needs to be installed on your system
}
# Genetic CNN hyperparameters
genes = [Binary(f"S_{i + 1}", int(K_s * (K_s - 1) / 2)) for i, K_s in enumerate(kwargs["nodes"])]

-    x_train, y_train = load_mnist("mnist.npz")
-    population = Population(genes, GeneticCNN, 20, x_train, y_train, **kwargs)
+    x_train, x_test, y_train, y_test = load_mnist("mnist.npz")
+    population = Population(genes, GeneticCNN, 20, x_train, y_train, x_test, y_test, **kwargs)
algorithm = RussianRoulette(
population,
crossover_probability=0.2, # p_C
6 changes: 6 additions & 0 deletions examples/get_datasets.sh
@@ -10,4 +10,10 @@ if [ ! -e mnist.npz ]; then
wget https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
fi

+if [ ! -e cifar-10-python.tar.gz ]; then
+    echo "Downloading CIFAR-10 dataset..."
+    wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+    tar -xvzf cifar-10-python.tar.gz
+fi

echo "Done!"
6 changes: 3 additions & 3 deletions examples/scikit_iris.py
@@ -13,7 +13,7 @@

from gentun.algorithms import Tournament
from gentun.genes import RandomChoice
-from gentun.models.sklearn import SklearnCV
+from gentun.models.sklearn import Sklearn
from gentun.populations import Population


@@ -47,12 +47,12 @@ def parse_iris(file_name: str) -> Tuple[np.ndarray, np.ndarray]:
"sklearn_model": RandomForestClassifier,
"sklearn_metric": f1_score,
"metric_kwargs": {"average": "macro"},
"kfold": 5,
"folds": 5,
}

# Fetch training data
x_train, y_train = parse_iris("iris.data")
# Run genetic algorithm on a population of 10 for 10 generations
-population = Population(genes, SklearnCV, 10, x_train, y_train, **kwargs)
+population = Population(genes, Sklearn, 10, x_train, y_train, **kwargs)
algorithm = Tournament(population)
algorithm.run(10)
4 changes: 2 additions & 2 deletions examples/xgboost_grid_iris.py
@@ -11,7 +11,7 @@

from gentun.algorithms import Tournament
from gentun.genes import RandomChoice, RandomLogUniform
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.populations import Grid


@@ -56,6 +56,6 @@ def parse_iris(file_name: str) -> Tuple[np.ndarray, np.ndarray]:
# Fetch training data
x_train, y_train = parse_iris("iris.data")
# Run genetic algorithm on a grid population for 1 generation
-population = Grid(genes, XGBoostCV, gene_samples, x_train, y_train, **kwargs)
+population = Grid(genes, XGBoost, gene_samples, x_train, y_train, **kwargs)
algorithm = Tournament(population)
algorithm.run(1, maximize=False)
4 changes: 2 additions & 2 deletions examples/xgboost_iris.py
@@ -11,7 +11,7 @@

from gentun.algorithms import Tournament
from gentun.genes import RandomChoice, RandomLogUniform, RandomUniform
-from gentun.models.xgboost import XGBoostCV
+from gentun.models.xgboost import XGBoost
from gentun.populations import Population


@@ -63,6 +63,6 @@ def parse_iris(file_name: str) -> Tuple[np.ndarray, np.ndarray]:
# Fetch training data
x_train, y_train = parse_iris("iris.data")
# Run genetic algorithm on a population of 50 for 100 generations
-population = Population(genes, XGBoostCV, 50, x_train, y_train, **kwargs)
+population = Population(genes, XGBoost, 50, x_train, y_train, **kwargs)
algorithm = Tournament(population)
algorithm.run(100, maximize=False)
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -307,8 +307,8 @@ valid-classmethod-first-arg="cls"
valid-metaclass-classmethod-first-arg="cls"

[tool.pylint.'DESIGN']
-max-args=5
-max-attributes=7
+max-args=9  # default: 5
+max-attributes=12  # default: 7
max-bool-expr=5
max-branches=12
max-locals=15
@@ -322,6 +322,7 @@ min-public-methods=2
allow-wildcard-with-all="no"
analyse-fallback-blocks="no"
deprecated-modules="optparse,tkinter.tix"
ignored-modules="tensorflow,tensorflow.keras"

[tool.pylint.'EXCEPTIONS']
overgeneral-exceptions= [
