diff --git a/examples/regular/models/adult_ctgan.py b/examples/regular/models/adult_ctgan.py new file mode 100644 index 00000000..6fb8bfa0 --- /dev/null +++ b/examples/regular/models/adult_ctgan.py @@ -0,0 +1,34 @@ +from pmlb import fetch_data + +from ydata_synthetic.synthesizers.regular import RegularSynthesizer +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters + +# Load data and define the data processor parameters +data = fetch_data('adult') +num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'] +cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country', 'target'] + +# Defining the training parameters +batch_size = 500 +epochs = 500+1 +learning_rate = 2e-4 +beta_1 = 0.5 +beta_2 = 0.9 + +ctgan_args = ModelParameters(batch_size=batch_size, + lr=learning_rate, + betas=(beta_1, beta_2)) + +train_args = TrainParameters(epochs=epochs) +synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args) +synth.fit(data=data, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) + +synth.save('adult_ctgan_model.pkl') + +######################################################### +# Loading and sampling from a trained synthesizer # +######################################################### +synth = RegularSynthesizer.load('adult_ctgan_model.pkl') +synth_data = synth.sample(1000) +print(synth_data) \ No newline at end of file diff --git a/examples/regular/models/creditcard_ctgan.py b/examples/regular/models/creditcard_ctgan.py new file mode 100644 index 00000000..a8a51fe0 --- /dev/null +++ b/examples/regular/models/creditcard_ctgan.py @@ -0,0 +1,71 @@ +""" + CTGAN architecture example file +""" +import pandas as pd +from sklearn import cluster + +from ydata_synthetic.utils.cache import cache_file +from ydata_synthetic.synthesizers import ModelParameters, TrainParameters +from ydata_synthetic.synthesizers.regular import RegularSynthesizer + +# Read the original data and have it preprocessed +data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv') +data = pd.read_csv(data_path, index_col=[0]) + +# Data processing and analysis +num_cols = list(data.columns[ data.columns != 'Class' ]) +cat_cols = [] + +print('Dataset columns: {}'.format(num_cols)) +sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', + 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', + 'V9', 'V23', 'Class'] +processed_data = data[ sorted_cols ].copy() +processed_data['Class'] = processed_data['Class'].apply(lambda x: 1 if x == "'1'" else 0) + +# For the purpose of this example we will only synthesize the minority class +train_data = processed_data.loc[processed_data['Class'] == 1].copy() + +# Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN +print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) +algorithm = cluster.KMeans +args, kwds = (), {'n_clusters':2, 'random_state':0} +labels = algorithm(*args, **kwds).fit_predict(train_data[num_cols]) + +fraud_w_classes = train_data.copy() +fraud_w_classes['Class'] = labels + +#---------------------------- +# CTGAN Training +#---------------------------- + +batch_size = 500 +epochs = 500+1 +learning_rate = 2e-4 +beta_1 = 0.5 +beta_2 = 0.9 + +ctgan_args = 
ModelParameters(batch_size=batch_size, + lr=learning_rate, + betas=(beta_1, beta_2)) + +train_args = TrainParameters(epochs=epochs) + +# Create a bining +fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes + +# Init the CTGAN +synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args) + +#Training the CTGAN +synth.fit(data=fraud_w_classes, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols) + +# Saving the synthesizer +synth.save('creditcard_ctgan_model.pkl') + +# Loading the synthesizer +synthesizer = RegularSynthesizer.load('creditcard_ctgan_model.pkl') + +# Sampling from the synthesizer +sample = synthesizer.sample(1000) +print(sample) diff --git a/requirements.txt b/requirements.txt index 1e91a8d7..71f321a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ numpy==1.23.* scikit-learn==1.2.* matplotlib==3.6.* tensorflow==2.11.0 +tensorflow-probability==0.19.0 easydict==1.10 pmlb==1.0.* tqdm<5.0 diff --git a/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py b/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py new file mode 100644 index 00000000..158cedf1 --- /dev/null +++ b/src/ydata_synthetic/preprocessing/regular/ctgan_processor.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from typing import List, Optional +from typeguard import typechecked +from dataclasses import dataclass +import pandas as pd +import numpy as np +from sklearn.exceptions import NotFittedError, ConvergenceWarning +from sklearn.utils._testing import ignore_warnings +from sklearn.mixture import BayesianGaussianMixture +from sklearn.preprocessing import OneHotEncoder + +from ydata_synthetic.preprocessing.base_processor import BaseProcessor + +@dataclass +class ColumnMetadata: + """ + Dataclass that stores the metadata of each column. + """ + start_idx: int + end_idx: int + discrete: bool + output_dim: int + model: any + components: list + name: str + + +@typechecked +class CTGANDataProcessor(BaseProcessor): + """ + CTGAN data preprocessing class. + It works like any other transformer in scikit-learn with the methods fit, transform and inverse_transform. + Args: + n_clusters (int), default=10: + Number of clusters. + epsilon (float), default=0.005: + Epsilon value. + num_cols (list of strings): + List of names of numerical columns. + cat_cols (list of strings): + List of names of categorical columns. + """ + SUPPORTED_MODEL = 'CTGAN' + + def __init__(self, n_clusters=10, epsilon=0.005, + num_cols: Optional[List[str]] = None, + cat_cols: Optional[List[str]] = None): + super().__init__(num_cols, cat_cols) + + self._n_clusters = n_clusters + self._epsilon = epsilon + self._metadata = None + self._dtypes = None + self._output_dimensions = None + + @property + def metadata(self) -> list[ColumnMetadata]: + """ + Returns the metadata for each column. + """ + return self._metadata + + @property + def output_dimensions(self) -> int: + """ + Returns the dataset dimensionality after the preprocessing. + """ + return int(self._output_dimensions) + + @ignore_warnings(category=ConvergenceWarning) + def fit(self, X: pd.DataFrame) -> CTGANDataProcessor: + """ + Fits the data processor to a passed DataFrame. + + Args: + X (DataFrame): + DataFrame used to fit the processor parameters. + Should be aligned with the num/cat columns defined in initialization. + Returns: + self (CTGANDataProcessor): The fitted data processor. 
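+
+        Example:
+            A minimal illustrative sketch; ``df`` and the column names are
+            hypothetical placeholders for your own data.
+
+            >>> processor = CTGANDataProcessor(n_clusters=10, epsilon=0.005,
+            ...                                num_cols=['amount'], cat_cols=['category'])
+            >>> processor = processor.fit(df)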
+ """ + self._dtypes = X.infer_objects().dtypes + self._metadata = [] + cur_idx = 0 + for column in X.columns: + column_data = X[[column]].values + if column in self.cat_cols: + ohe = OneHotEncoder(sparse_output=False) + ohe.fit(column_data) + n_categories = len(ohe.categories_[0]) + self._metadata.append( + ColumnMetadata( + start_idx=cur_idx, + end_idx=cur_idx + n_categories, + discrete=True, + output_dim=n_categories, + model=ohe, + components=None, + name=column + ) + ) + cur_idx += n_categories + else: + bgm = BayesianGaussianMixture( + n_components=self._n_clusters, + weight_concentration_prior_type='dirichlet_process', + weight_concentration_prior=0.001, + n_init=1 + ) + bgm.fit(column_data) + components = bgm.weights_ > self._epsilon + output_dim = components.sum() + 1 + self._metadata.append( + ColumnMetadata( + start_idx=cur_idx, + end_idx=cur_idx + output_dim, + discrete=False, + output_dim=output_dim, + model=bgm, + components=components, + name=column + ) + ) + cur_idx += output_dim + self._output_dimensions = cur_idx + return self + + def transform(self, X: pd.DataFrame) -> np.ndarray: + """ + Transforms the passed DataFrame with the fitted data processor. + + Args: + X (DataFrame): + DataFrame used to fit the processor parameters. + Should be aligned with the columns types defined in initialization. + Returns: + Processed version of the passed DataFrame. + """ + if self._metadata is None: + raise NotFittedError("This data processor has not yet been fitted.") + + transformed_data = [] + for col_md in self._metadata: + column_data = X[[col_md.name]].values + if col_md.discrete: + ohe = col_md.model + transformed_data.append(ohe.transform(column_data)) + else: + bgm = col_md.model + components = col_md.components + + means = bgm.means_.reshape((1, self._n_clusters)) + stds = np.sqrt(bgm.covariances_).reshape((1, self._n_clusters)) + features = (column_data - means) / (4 * stds) + + probabilities = bgm.predict_proba(column_data) + n_opts = components.sum() + features = features[:, components] + probabilities = probabilities[:, components] + + opt_sel = np.zeros(len(column_data), dtype='int') + for i in range(len(column_data)): + norm_probs = probabilities[i] + 1e-6 + norm_probs = norm_probs / norm_probs.sum() + opt_sel[i] = np.random.choice(np.arange(n_opts), p=norm_probs) + + idx = np.arange((len(features))) + features = features[idx, opt_sel].reshape([-1, 1]) + features = np.clip(features, -.99, .99) + + probs_onehot = np.zeros_like(probabilities) + probs_onehot[np.arange(len(probabilities)), opt_sel] = 1 + transformed_data.append( + np.concatenate([features, probs_onehot], axis=1).astype(float)) + + return np.concatenate(transformed_data, axis=1).astype(float) + + def inverse_transform(self, X: np.ndarray) -> pd.DataFrame: + """ + Reverts the data transformations on a passed DataFrame. + + Args: + X (ndarray): + Numpy array to be brought back to the original data format. + Should share the schema of data transformed by this data processor. + Can be used to revert transformations of training data or for synthetic samples. + Returns: + DataFrame with all performed transformations reverted. 
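+
+        Example:
+            A minimal round-trip sketch; ``processor`` is assumed to be an already
+            fitted instance and ``df`` the DataFrame it was fitted on. For a numerical
+            column, the value is rebuilt from the selected mixture component ``k`` as
+            ``alpha * 4 * std_k + mean_k``, where ``alpha`` is the clipped scalar stored
+            in the first position of the column block, so the recovery is approximate.
+
+            >>> transformed = processor.transform(df)
+            >>> recovered = processor.inverse_transform(transformed)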
+ """ + if self._metadata is None: + raise NotFittedError("This data processor has not yet been fitted.") + + transformed_data = [] + col_names = [] + for col_md in self._metadata: + col_data = X[:, col_md.start_idx:col_md.end_idx] + if col_md.discrete: + inv_data = col_md.model.inverse_transform(col_data) + else: + mean = col_data[:, 0] + variance = col_data[:, 1:] + mean = np.clip(mean, -1, 1) + + v_t = np.ones((len(col_data), self._n_clusters)) * -100 + v_t[:, col_md.components] = variance + variance = v_t + means = col_md.model.means_.reshape([-1]) + stds = np.sqrt(col_md.model.covariances_).reshape([-1]) + + p_argmax = np.argmax(variance, axis=1) + std_t = stds[p_argmax] + mean_t = means[p_argmax] + inv_data = mean * 4 * std_t + mean_t + + transformed_data.append(inv_data) + col_names.append(col_md.name) + + transformed_data = np.column_stack(transformed_data) + transformed_data = pd.DataFrame(transformed_data, columns=col_names).astype(self._dtypes) + return transformed_data diff --git a/src/ydata_synthetic/synthesizers/gan.py b/src/ydata_synthetic/synthesizers/gan.py index dfe78a62..d3666f78 100644 --- a/src/ydata_synthetic/synthesizers/gan.py +++ b/src/ydata_synthetic/synthesizers/gan.py @@ -16,7 +16,6 @@ from tensorflow import config as tfconfig from tensorflow import data as tfdata -from tensorflow import dtypes from tensorflow import random from typeguard import typechecked @@ -24,17 +23,22 @@ RegularDataProcessor, RegularModels) from ydata_synthetic.preprocessing.timeseries.timeseries_processor import ( TimeSeriesDataProcessor, TimeSeriesModels) +from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor from ydata_synthetic.synthesizers.saving_keras import make_keras_picklable _model_parameters = ['batch_size', 'lr', 'betas', 'layers_dim', 'noise_dim', - 'n_cols', 'seq_len', 'condition', 'n_critic', 'n_features', 'tau_gs'] + 'n_cols', 'seq_len', 'condition', 'n_critic', 'n_features', + 'tau_gs', 'generator_dims', 'critic_dims', 'l2_scale', + 'latent_dim', 'gp_lambda', 'pac'] _model_parameters_df = [128, 1e-4, (None, None), 128, 264, - None, None, None, 1, None, 0.2] + None, None, None, 1, None, 0.2, [256, 256], + [256, 256], 1e-6, 128, 10.0, 10] -_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval', 'labels'] +_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval', + 'labels', 'n_clusters', 'epsilon', 'log_frequency'] ModelParameters = namedtuple('ModelParameters', _model_parameters, defaults=_model_parameters_df) -TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None)) +TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None, 10, 0.005, True)) # pylint: disable=R0902 @@ -71,8 +75,18 @@ def __init__( self.noise_dim = model_parameters.noise_dim self.data_dim = None self.layers_dim = model_parameters.layers_dim + + # Additional parameters for the CTGAN + self.generator_dims = model_parameters.generator_dims + self.critic_dims = model_parameters.critic_dims + self.l2_scale = model_parameters.l2_scale + self.latent_dim = model_parameters.latent_dim + self.gp_lambda = model_parameters.gp_lambda + self.pac = model_parameters.pac + self.processor = None - if self.__MODEL__ in RegularModels.__members__: + if self.__MODEL__ in RegularModels.__members__ or \ + self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL: self.tau = model_parameters.tau_gs # pylint: disable=E1101 @@ -85,7 +99,7 @@ def _set_lr(self, lr): self.g_lr=lr 
self.d_lr=lr elif isinstance(lr,(list, tuple)): - assert len(lr)==2, "Please provide a tow values array for the learning rates or a float." + assert len(lr)==2, "Please provide a two values array for the learning rates or a float." self.g_lr=lr[0] self.d_lr=lr[1] @@ -107,7 +121,8 @@ def model_name(self): def fit(self, data: Union[DataFrame, array], num_cols: Optional[List[str]] = None, - cat_cols: Optional[List[str]] = None) -> Union[DataFrame, array]: + cat_cols: Optional[List[str]] = None, + train_arguments: Optional[TrainParameters] = None) -> Union[DataFrame, array]: """ ### Description: Trains and fit a synthesizer model to a given input dataset. @@ -116,18 +131,23 @@ def fit(self, `data` (Union[DataFrame, array]): Training data `num_cols` (Optional[List[str]]) : List with the names of the categorical columns `cat_cols` (Optional[List[str]]): List of names of categorical columns + `train_arguments` (Optional[TrainParameters]): Training parameters ### Returns: **self:** *object* Fitted synthesizer """ if self.__MODEL__ in RegularModels.__members__: - self.processor = RegularDataProcessor + self.processor = RegularDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data) elif self.__MODEL__ in TimeSeriesModels.__members__: - self.processor = TimeSeriesDataProcessor + self.processor = TimeSeriesDataProcessor(num_cols=num_cols, cat_cols=cat_cols).fit(data) + elif self.__MODEL__ == CTGANDataProcessor.SUPPORTED_MODEL: + n_clusters = train_arguments.n_clusters + epsilon = train_arguments.epsilon + self.processor = CTGANDataProcessor(n_clusters=n_clusters, epsilon=epsilon, + num_cols=num_cols, cat_cols=cat_cols).fit(data) else: print(f'A DataProcessor is not available for the {self.__MODEL__}.') - self.processor = self.processor(num_cols = num_cols, cat_cols = cat_cols).fit(data) def sample(self, n_samples: int): """ @@ -226,7 +246,7 @@ def _generate_noise(self): def get_batch_noise(self): "Create a batch iterator for the generator gaussian noise input." - return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=dtypes.float32) + return iter(tfdata.Dataset.from_generator(self._generate_noise, output_types=tf.dtypes.float32) .batch(self.batch_size) .repeat()) diff --git a/src/ydata_synthetic/synthesizers/loss.py b/src/ydata_synthetic/synthesizers/loss.py index 3179ffc8..a4f45671 100644 --- a/src/ydata_synthetic/synthesizers/loss.py +++ b/src/ydata_synthetic/synthesizers/loss.py @@ -1,22 +1,23 @@ -from tensorflow import random -from tensorflow import reshape, shape, math, GradientTape, reduce_mean -from tensorflow import norm as tfnorm - +from tensorflow import \ + (random, reshape, shape, GradientTape, reduce_mean, + norm as tfnorm, tile, constant, int32) +from tensorflow.math import reduce_std, reduce_euclidean_norm from enum import Enum class Mode(Enum): WGANGP = 'wgangp' DRAGAN = 'dragan' CRAMER = 'cramer' + CTGAN = 'ctgan' ## Original code loss from ## https://github.com/LynnHo/DCGAN-LSGAN-WGAN-GP-DRAGAN-Tensorflow-2/blob/master/tf2gan/loss.py -def gradient_penalty(f, real, fake, mode): +def gradient_penalty(f, real, fake, mode, pac=None): def _gradient_penalty(f, real, fake=None): def _interpolate(a, b=None): if b is None: # interpolation in DRAGAN beta = random.uniform(shape=shape(a), minval=0., maxval=1.) - b = a + 0.5 * math.reduce_std(a) * beta + b = a + 0.5 * reduce_std(a) * beta shape_ = [shape(a)[0]] + [1] * (a.shape.ndims - 1) alpha = random.uniform(shape=shape_, minval=0., maxval=1.) 
inter = a + alpha * (b - a) @@ -43,11 +44,29 @@ def _gradient_penalty_cramer(f_crit, real, fake): c_regularizer = (c_dx - 1.0) ** 2 return c_regularizer + def _gradient_penalty_ctgan(f, real, fake, pac=10): + alpha = random.uniform([real.shape[0] // pac, 1, 1], 0., 1.) + alpha = tile(alpha, constant([1, pac, real.shape[1]], int32)) + alpha = reshape(alpha, [-1, real.shape[1]]) + interpolate = alpha * real + ((1 - alpha) * fake) + with GradientTape() as tape: + tape.watch(interpolate) + prediction = f(interpolate) + gradient = tape.gradient(prediction, [interpolate])[0] + gradient = reshape(gradient, constant([-1, pac * real.shape[1]], int32)) + slope = reduce_euclidean_norm(gradient, axis=1) + return reduce_mean((slope - 1.) ** 2) + if mode == Mode.DRAGAN: gp = _gradient_penalty(f, real) elif mode == Mode.CRAMER: gp = _gradient_penalty_cramer(f, real, fake) elif mode == Mode.WGANGP: gp = _gradient_penalty(f, real, fake) + elif mode == Mode.CTGAN: + if pac is not None: + gp = _gradient_penalty_ctgan(f, real, fake, pac=pac) + else: + gp = _gradient_penalty_ctgan(f, real, fake) return gp diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py new file mode 100644 index 00000000..e8b1dd71 --- /dev/null +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/__init__.py @@ -0,0 +1 @@ +from .model import CTGAN diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/model.py b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py new file mode 100644 index 00000000..e7feb57c --- /dev/null +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/model.py @@ -0,0 +1,322 @@ +from functools import partial +from joblib import dump +import numpy as np +from pandas import DataFrame +import tensorflow as tf +from keras.layers import \ + (Input, Dense, LeakyReLU, Dropout, BatchNormalization, ReLU, Concatenate) +from keras import Model + +import tensorflow_probability as tfp +from ydata_synthetic.synthesizers.regular.ctgan.utils \ + import ConditionalLoss, RealDataSampler, ConditionalSampler + +from ydata_synthetic.synthesizers.loss import gradient_penalty, Mode as ModeGP +from ydata_synthetic.synthesizers.gan import BaseModel, ModelParameters, TrainParameters +from ydata_synthetic.preprocessing.regular.ctgan_processor import CTGANDataProcessor + +class CTGAN(BaseModel): + """ + Conditional Tabular GAN model. + Based on the paper https://arxiv.org/abs/1907.00503. + + Args: + model_parameters: Parameters used to create the CTGAN model. + """ + __MODEL__ = 'CTGAN' + + def __init__(self, model_parameters: ModelParameters): + super().__init__(model_parameters) + if self.batch_size % 2 != 0 or self.batch_size % self.pac != 0: + raise ValueError("The batch size needs to be an even value divisible by the PAC.") + self._model_parameters = model_parameters + self._real_data_sampler = None + self._conditional_sampler = None + self._generator_model = None + self._critic_model = None + + @staticmethod + def _create_generator_model(input_dim, generator_dims, data_dim, metadata, tau): + """ + Creates the generator model. + + Args: + input_dim: Input dimensionality. + generator_dims: Dimensions of each hidden layer. + data_dim: Output dimensionality. + metadata: Dataset columns metadata. + tau: Gumbel-Softmax non-negative temperature. 
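+
+        Note:
+            A summary of the activation layout built below, mirroring the
+            CTGANDataProcessor output: each numerical column contributes
+            ``[tanh(alpha), gumbel_softmax(mode logits)]`` and each categorical
+            column contributes ``gumbel_softmax(category logits)``. The output layer
+            returns both the raw logits (used by the conditional loss) and the
+            activated values.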
+ """ + input = Input(shape=(input_dim, )) + x = input + dim = input_dim + for layer_dim in generator_dims: + layer_input = x + x = Dense(layer_dim, + kernel_initializer="random_uniform", + bias_initializer="random_uniform")(x) + x = BatchNormalization(epsilon=1e-5, momentum=0.9)(x) + x = ReLU()(x) + x = Concatenate(axis=1)([x, layer_input]) + dim += layer_dim + + def _gumbel_softmax(logits, tau=1.0): + """Applies the Gumbel-Softmax function to the given logits.""" + gumbel_dist = tfp.distributions.Gumbel(loc=0, scale=1) + gumbels = gumbel_dist.sample(tf.shape(logits)) + gumbels = (logits + gumbels) / tau + return tf.nn.softmax(gumbels, -1) + + def _generator_activation(data): + """Custom activation function for the generator model.""" + data_transformed = [] + for col_md in metadata: + if col_md.discrete: + logits = data[:, col_md.start_idx:col_md.end_idx] + data_transformed.append(_gumbel_softmax(logits, tau=tau)) + else: + data_transformed.append(tf.math.tanh(data[:, col_md.start_idx:col_md.start_idx+1])) + logits = data[:, col_md.start_idx+1:col_md.end_idx] + data_transformed.append(_gumbel_softmax(logits, tau=tau)) + return data, tf.concat(data_transformed, axis=1) + + x = Dense(data_dim, kernel_initializer="random_uniform", + bias_initializer="random_uniform", + activation=_generator_activation)(x) + return Model(inputs=input, outputs=x) + + @staticmethod + def _create_critic_model(input_dim, critic_dims, pac): + """ + Creates the critic model. + + Args: + input_dim: Input dimensionality. + critic_dims: Dimensions of each hidden layer. + pac: PAC size. + """ + input = Input(shape=(input_dim,)) + x = tf.reshape(input, [-1, input_dim * pac]) + for dim in critic_dims: + x = Dense(dim, + kernel_initializer="random_uniform", + bias_initializer="random_uniform")(x) + x = LeakyReLU(0.2)(x) + x = Dropout(0.5)(x) + x = Dense(1, kernel_initializer="random_uniform", + bias_initializer="random_uniform")(x) + return Model(inputs=input, outputs=x) + + def fit(self, data: DataFrame, train_arguments: TrainParameters, num_cols: list[str], cat_cols: list[str]): + """ + Fits the CTGAN model. + + Args: + data: A pandas DataFrame with the data to be synthesized. + train_arguments: CTGAN training arguments. 
+ num_cols: List of columns to be handled as numerical + cat_cols: List of columns to be handled as categorical + """ + super().fit(data=data, num_cols=num_cols, cat_cols=cat_cols, train_arguments=train_arguments) + + self._generator_optimizer = tf.keras.optimizers.Adam( + learning_rate=self.g_lr, beta_1=self.beta_1, beta_2=self.beta_2) + self._critic_optimizer = tf.keras.optimizers.Adam( + learning_rate=self.d_lr, beta_1=self.beta_1, beta_2=self.beta_2) + + train_data = self.processor.transform(data) + metadata = self.processor.metadata + data_dim = self.processor.output_dimensions + + self._real_data_sampler = RealDataSampler(train_data, metadata) + self._conditional_sampler = ConditionalSampler(train_data, metadata, train_arguments.log_frequency) + + gen_input_dim = self.latent_dim + self._conditional_sampler.output_dimensions + self._generator_model = self._create_generator_model( + gen_input_dim, self.generator_dims, data_dim, metadata, self.tau) + + crt_input_dim = data_dim + self._conditional_sampler.output_dimensions + self._critic_model = self._create_critic_model(crt_input_dim, self.critic_dims, self.pac) + + self._generator_model.build((self.batch_size, gen_input_dim)) + self._critic_model.build((self.batch_size, crt_input_dim)) + + steps_per_epoch = max(len(train_data) // self.batch_size, 1) + for epoch in range(train_arguments.epochs): + for _ in range(steps_per_epoch): + fake_z = tf.random.normal([self.batch_size, self.latent_dim]) + cond_vector = self._conditional_sampler.sample(self.batch_size) + if cond_vector is None: + real = self._real_data_sampler.sample(self.batch_size) + else: + cond, _, col_idx, opt_idx = cond_vector + cond = tf.convert_to_tensor(cond) + fake_z = tf.concat([fake_z, cond], 1) + perm = np.arange(self.batch_size) + np.random.shuffle(perm) + real = self._real_data_sampler.sample_col(col_idx[perm], opt_idx[perm]) + cond_perm = tf.gather(cond, perm) + + fake, fake_act = self._generator_model(fake_z, training=True) + real = tf.convert_to_tensor(real.astype('float32')) + real_cat = real if cond_vector is None else tf.concat([real, cond_perm], 1) + fake_cat = fake if cond_vector is None else tf.concat([fake_act, cond], 1) + critic_loss = self._train_critic_step(real_cat, fake_cat) + + fake_z = tf.random.normal([self.batch_size, self.latent_dim]) + cond_vector = self._conditional_sampler.sample(self.batch_size) + if cond_vector is None: + generator_loss = self._train_generator_step(fake_z) + else: + cond, mask, _, _ = cond_vector + cond = tf.convert_to_tensor(cond) + mask = tf.convert_to_tensor(mask) + fake_z = tf.concat([fake_z, cond], axis=1) + generator_loss = self._train_generator_step(fake_z, cond, mask, metadata) + + print(f"Epoch: {epoch} | critic_loss: {critic_loss} | generator_loss: {generator_loss}") + + def _train_critic_step(self, real, fake): + """ + Single training iteration of the critic model. + + Args: + real: Real data. + fake: Fake data. 
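+
+        Note:
+            The objective computed below is the WGAN-GP critic loss,
+            ``mean(D(fake)) - mean(D(real)) + gp_lambda * gradient_penalty``,
+            with the gradient penalty evaluated on PAC-grouped samples
+            (``Mode.CTGAN``).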
+ """ + with tf.GradientTape() as tape: + y_real = self._critic_model(real, training=True) + y_fake = self._critic_model(fake, training=True) + gp = gradient_penalty( + partial(self._critic_model, training=True), real, fake, ModeGP.CTGAN, self.pac) + rec_loss = -(tf.reduce_mean(y_real) - tf.reduce_mean(y_fake)) + critic_loss = rec_loss + gp * self.gp_lambda + gradient = tape.gradient(critic_loss, self._critic_model.trainable_variables) + self._apply_critic_gradients(gradient, self._critic_model.trainable_variables) + return critic_loss + + @tf.function + def _apply_critic_gradients(self, gradient, trainable_variables): + """ + Updates gradients of the critic model. + This logic is isolated in order to be optimized as a TF function. + + Args: + gradient: Gradient. + trainable_variables: Variables to be updated. + """ + self._critic_optimizer.apply_gradients(zip(gradient, trainable_variables)) + + def _train_generator_step(self, fake_z, cond_vector=None, mask=None, metadata=None): + """ + Single training iteration of the generator model. + + Args: + real: Real data. + fake: Fake data. + cond_vector: Conditional vector. + mask: Mask vector. + metadata: Dataset columns metadata. + """ + with tf.GradientTape() as tape: + fake, fake_act = self._generator_model(fake_z, training=True) + if cond_vector is not None: + y_fake = self._critic_model( + tf.concat([fake_act, cond_vector], 1), training=True) + cond_loss = ConditionalLoss.compute(fake, cond_vector, mask, metadata) + generator_loss = -tf.reduce_mean(y_fake) + cond_loss + else: + y_fake = self._critic_model(fake_act, training=True) + generator_loss = -tf.reduce_mean(y_fake) + gradient = tape.gradient(generator_loss, self._generator_model.trainable_variables) + gradient = [gradient[i] + self.l2_scale * self._generator_model.trainable_variables[i] for i in range(len(gradient))] + self._apply_generator_gradients(gradient, self._generator_model.trainable_variables) + return generator_loss + + @tf.function + def _apply_generator_gradients(self, gradient, trainable_variables): + """ + Updates gradients of the generator model. + This logic is isolated in order to be optimized as a TF function. + + Args: + gradient: Gradient. + trainable_variables: Variables to be updated. + """ + self._generator_optimizer.apply_gradients(zip(gradient, trainable_variables)) + + def sample(self, n_samples: int): + """ + Samples new data from the CTGAN. + + Args: + n_samples: Number of samples to be generated. + """ + if n_samples <= 0: + raise ValueError("Invalid number of samples.") + + steps = n_samples // self.batch_size + 1 + data = [] + for _ in tf.range(steps): + fake_z = tf.random.normal([self.batch_size, self.latent_dim]) + cond_vec = self._conditional_sampler.sample(self.batch_size, from_active_bits=True) + if cond_vec is not None: + cond = tf.constant(cond_vec) + fake_z = tf.concat([fake_z, cond], 1) + + fake = self._generator_model(fake_z)[1] + data.append(fake.numpy()) + + data = np.concatenate(data, 0) + data = data[:n_samples] + return self.processor.inverse_transform(data) + + def save(self, path): + """ + Save the CTGAN model in a pickle file. + Only the required components to sample new data are saved. + + Args: + path: Path of the pickle file. 
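+
+        Example:
+            An illustrative sketch mirroring examples/regular/models/creditcard_ctgan.py;
+            loading goes through RegularSynthesizer, which dispatches to CTGAN.load.
+
+            >>> synth.save('creditcard_ctgan_model.pkl')
+            >>> loaded = RegularSynthesizer.load('creditcard_ctgan_model.pkl')
+            >>> samples = loaded.sample(1000)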
+ """ + dump({ + "model_parameters": self._model_parameters, + "data_dim": self.processor.output_dimensions, + "gen_input_dim": self.latent_dim + self._conditional_sampler.output_dimensions, + "generator_dims": self.generator_dims, + "tau": self.tau, + "metadata": self.processor.metadata, + "batch_size": self.batch_size, + "latent_dim": self.latent_dim, + "conditional_sampler": self._conditional_sampler.__dict__, + "generator_model_weights": self._generator_model.get_weights(), + "processor": self.processor.__dict__ + }, path) + + @staticmethod + def load(class_dict): + """ + Load the CTGAN model from a pickle file. + Only the required components to sample new data are loaded. + + Args: + class_dict: Class dict loaded from the pickle file. + """ + new_instance = CTGAN(class_dict["model_parameters"]) + setattr(new_instance, "generator_dims", class_dict["generator_dims"]) + setattr(new_instance, "tau", class_dict["tau"]) + setattr(new_instance, "batch_size", class_dict["batch_size"]) + setattr(new_instance, "latent_dim", class_dict["latent_dim"]) + + new_instance._conditional_sampler = ConditionalSampler() + new_instance._conditional_sampler.__dict__ = class_dict["conditional_sampler"] + new_instance.processor = CTGANDataProcessor() + new_instance.processor.__dict__ = class_dict["processor"] + + new_instance._generator_model = new_instance._create_generator_model( + class_dict["gen_input_dim"], class_dict["generator_dims"], + class_dict["data_dim"], class_dict["metadata"], class_dict["tau"]) + + new_instance._generator_model.build((class_dict["batch_size"], class_dict["gen_input_dim"])) + new_instance._generator_model.set_weights(class_dict['generator_model_weights']) + return new_instance \ No newline at end of file diff --git a/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py b/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py new file mode 100644 index 00000000..f204bf3f --- /dev/null +++ b/src/ydata_synthetic/synthesizers/regular/ctgan/utils.py @@ -0,0 +1,156 @@ +import tensorflow as tf +import numpy as np + + +class RealDataSampler: + """ + Class used to sample from real data. + + Args: + data: Input data. + metadata: Dataset columns metadata. + """ + def __init__(self, data, metadata): + super(RealDataSampler, self).__init__() + self._data = data + self._active_bits = [] + self._n_rows = len(data) + + for col_md in metadata: + if col_md.discrete: + col_active_bits = [] + for c in range(col_md.output_dim): + col_active_bits.append(np.nonzero(data[:, col_md.start_idx + c])[0]) + self._active_bits.append(col_active_bits) + + def sample(self, num_samples): + """ + Samples from the entire dataset. + + Args: + num_samples: Number of samples to be returned. + """ + return self._data[np.random.choice(np.arange(self._n_rows), num_samples)] + + def sample_col(self, col_idx, opt_idx): + """ + Samples a specific discrete column. + + Args: + col_idx: Index of the column to be sampled. + opt_idx: Index of the category. + """ + idx = [] + for col, opt in zip(col_idx, opt_idx): + idx.append(np.random.choice(self._active_bits[col][opt])) + return self._data[idx] + + +class ConditionalSampler: + """ + Class used to sample conditional vectors. + + Args: + data: Input data. + metadata: Dataset columns metadata. + log_frequency: Whether to apply log frequency or not. 
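+
+    Note:
+        An illustrative layout: with two discrete columns of 3 and 2 categories,
+        ``output_dimensions`` is 5; conditioning on category 1 of the second column
+        yields ``cond_vector = [0, 0, 0, 0, 1]`` and ``mask = [0, 1]``. ``sample``
+        returns ``(cond_vector, mask, col_idx, opt_idx)``, or ``None`` when the
+        dataset has no discrete columns.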
+ """ + def __init__(self, data=None, metadata=None, log_frequency=None): + if data is None: + return + self._active_bits = [] + max_interval = 0 + counter = 0 + + for col_md in metadata: + if col_md.discrete: + max_interval = max(max_interval, col_md.end_idx - col_md.start_idx) + self._active_bits.append(np.argmax(data[:, col_md.start_idx:col_md.end_idx], axis=-1)) + counter += 1 + + self._interval = [] + self._n_col = 0 + self._n_opt = 0 + self._probabilities = np.zeros((counter, max_interval)) + + for col_md in metadata: + if col_md.discrete: + col_active_bits_sum = np.sum(data[:, col_md.start_idx:col_md.end_idx], axis=0) + if log_frequency: + col_active_bits_sum = np.log(col_active_bits_sum + 1) + col_active_bits_sum = col_active_bits_sum / np.sum(col_active_bits_sum) + self._probabilities[self._n_col, :col_md.output_dim] = col_active_bits_sum + self._interval.append((self._n_opt, col_md.output_dim)) + self._n_opt += col_md.output_dim + self._n_col += 1 + + self._interval = np.asarray(self._interval) + + @property + def output_dimensions(self): + """ + Returns the dimensionality of the conditional vectors. + """ + return self._n_opt + + def sample(self, batch_size, from_active_bits=False): + """ + Samples conditional vectors. + + Args: + batch_size: Batch size. + from_active_bits: Whether to directly sample from active bits or not. + """ + if self._n_col == 0: + return None + + col_idx = np.random.choice(np.arange(self._n_col), batch_size) + cond_vector = np.zeros((batch_size, self._n_opt), dtype='float32') + + if from_active_bits: + for i in range(batch_size): + pick = int(np.random.choice(self._active_bits[col_idx[i]])) + cond_vector[i, pick + self._interval[col_idx[i], 0]] = 1 + return cond_vector + + mask = np.zeros((batch_size, self._n_col), dtype='float32') + mask[np.arange(batch_size), col_idx] = 1 + prob = self._probabilities[col_idx] + rand = np.expand_dims(np.random.rand(prob.shape[0]), axis=1) + opt_idx = (prob.cumsum(axis=1) > rand).argmax(axis=1) + opt = self._interval[col_idx, 0] + opt_idx + cond_vector[np.arange(batch_size), opt] = 1 + return cond_vector, mask, col_idx, opt_idx + +class ConditionalLoss: + """ + Conditional loss utils. + """ + @staticmethod + def compute(data, cond_vector, mask, metadata): + """ + Computes the conditional loss. + + Args: + data: Input data. + cond_vector: Conditional vector. + mask: Mask vector. + metadata: Dataset columns metadata. 
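+
+        Note:
+            For each row, the code below takes the sparse softmax cross-entropy
+            between the category requested by ``cond_vector`` and the generator's
+            raw logits for that discrete column, keeps only the column selected by
+            ``mask``, and averages over the batch.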
+ """ + shape = tf.shape(mask) + cond_loss = tf.zeros(shape) + start_cat = 0 + counter = 0 + for col_md in metadata: + if col_md.discrete: + end_cat = start_cat + col_md.output_dim + data_log_softmax = data[:, col_md.start_idx:col_md.end_idx] + cond_vector_am = tf.math.argmax(cond_vector[:, start_cat:end_cat], 1) + loss = tf.reshape(tf.nn.sparse_softmax_cross_entropy_with_logits( + cond_vector_am, data_log_softmax), [-1, 1]) + cond_loss = tf.concat( + [cond_loss[:, :counter], loss, cond_loss[:, counter+1:]], 1) + start_cat = end_cat + counter += 1 + + return tf.reduce_sum(cond_loss * mask) / tf.cast(shape[0], dtype=tf.float32) diff --git a/src/ydata_synthetic/synthesizers/regular/model.py b/src/ydata_synthetic/synthesizers/regular/model.py index 27846dad..7332ab5a 100644 --- a/src/ydata_synthetic/synthesizers/regular/model.py +++ b/src/ydata_synthetic/synthesizers/regular/model.py @@ -7,15 +7,14 @@ from tensorflow import config as tfconfig -from ..regular.vanillagan.model import VanilllaGAN -from ..regular.cgan.model import CGAN -from ..regular.wgan.model import WGAN -from ..regular.wgangp.model import WGAN_GP -from ..regular.cwgangp.model import CWGANGP -from ..regular.cramergan.model import CRAMERGAN -from ..regular.dragan.model import DRAGAN - -from ...utils.gumbel_softmax import GumbelSoftmaxActivation +from ydata_synthetic.synthesizers.regular.vanillagan.model import VanilllaGAN +from ydata_synthetic.synthesizers.regular.cgan.model import CGAN +from ydata_synthetic.synthesizers.regular.wgan.model import WGAN +from ydata_synthetic.synthesizers.regular.wgangp.model import WGAN_GP +from ydata_synthetic.synthesizers.regular.cwgangp.model import CWGANGP +from ydata_synthetic.synthesizers.regular.cramergan.model import CRAMERGAN +from ydata_synthetic.synthesizers.regular.dragan.model import DRAGAN +from ydata_synthetic.synthesizers.regular.ctgan.model import CTGAN @unique @@ -27,6 +26,7 @@ class Model(Enum): CWASSERTEINGP = 'cwgangp' CRAMER = 'cramer' DEEPREGRET = 'dragan' + CONDITIONALTABULAR = 'ctgan' __MAPPING__ = { VANILLA : VanilllaGAN, @@ -35,7 +35,8 @@ class Model(Enum): WASSERTEINGP: WGAN_GP, CWASSERTEINGP: CWGANGP, CRAMER: CRAMERGAN, - DEEPREGRET: DRAGAN + DEEPREGRET: DRAGAN, + CONDITIONALTABULAR: CTGAN } @property @@ -64,4 +65,6 @@ def load(path): # Invalid device or cannot modify virtual devices once initialized. pass synth = load(path) + if isinstance(synth, dict): + return CTGAN.load(synth) return synth \ No newline at end of file
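
Below is a brief illustrative sketch (not part of the patch) that exercises the new CTGAN-specific fields added to ModelParameters and TrainParameters in gan.py. The values shown are the defaults introduced by this diff; the dataset and column lists are hypothetical.

from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

ctgan_args = ModelParameters(
    batch_size=500,              # must be even and divisible by pac
    lr=2e-4,
    betas=(0.5, 0.9),
    generator_dims=[256, 256],   # hidden layer sizes of the generator
    critic_dims=[256, 256],      # hidden layer sizes of the critic
    latent_dim=128,              # size of the noise vector fed to the generator
    l2_scale=1e-6,               # weight-decay term added to the generator gradients
    gp_lambda=10.0,              # gradient penalty weight
    pac=10,                      # number of samples packed per critic input
    tau_gs=0.2)                  # Gumbel-Softmax temperature

train_args = TrainParameters(
    epochs=501,
    n_clusters=10,               # BayesianGaussianMixture components per numerical column
    epsilon=0.005,               # component weight threshold in the data processor
    log_frequency=True)          # log-scale category frequencies in the conditional sampler

synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
# synth.fit(data=df, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)
# synth.sample(1000)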