Synthetic-Eval is a package for the comprehensive evaluation of synthetic tabular datasets.
Install using pip:

```
pip install synthetic-eval
```
- Statistical Fidelity
  - KL-Divergence (`KL`)
  - Goodness-of-Fit (Kolmogorov-Smirnov test & Chi-Squared test) (`GoF`)
  - Maximum Mean Discrepancy (`MMD`)
  - Cramer-Wold Distance (`CW`)
  - (naive) $\alpha$-precision & $\beta$-recall (`alpha_precision`, `beta_recall`)
- Machine Learning Utility (classification task)
  - Accuracy (`base_cls`, `syn_cls`)
  - Model Selection Performance (`model_selection`)
  - Feature Selection Performance (`feature_selection`)
- Privacy Preservation
  - $k$-Anonymization (`Kanon_base`, `Kanon_syn`)
  - $k$-Map (`KMap`)
  - Distance to Closest Record (`DCR_RS`, `DCR_RR`, `DCR_SS`)
  - Attribute Disclosure (`AD`)
Usage:

```python
from synthetic_eval import evaluation
evaluation.evaluate  # function for evaluating synthetic data quality
```
- See example.ipynb for a detailed example and its results with the `loan` dataset.
  - Link to download the `loan` dataset: https://www.kaggle.com/datasets/teertha/personal-loan-modeling
- Please ensure that the target column for the machine learning utility is the last column of the dataset.
"""import libraries"""
import pandas as pd
import torch

from synthetic_eval import evaluation

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load the dataset ---
data = pd.read_csv('./loan.csv')
# len(data)  # 5,000

# --- Specify column types ---
continuous_features = [
    'Age',
    'Experience',
    'Income',
    'CCAvg',
    'Mortgage',
]
categorical_features = [
    'Family',
    'Securities Account',
    'CD Account',
    'Online',
    'CreditCard',
    'Personal Loan',
]
target = 'Personal Loan'  # machine learning utility target column

# The target column must be the last column: keep the continuous columns
# first, then every categorical column except the target, then the target.
# Columns of the CSV that are not listed above are dropped here.
data = data[
    continuous_features
    + [x for x in categorical_features if x != target]
    + [target]
]

# --- Training, test, and synthetic datasets ---
# Replace every categorical column with its integer category codes.
data[categorical_features] = data[categorical_features].apply(
    lambda col: col.astype('category').cat.codes
)
train = data.iloc[:2000]
test = data.iloc[2000:4000]
# In this example the remaining rows of the real data stand in for a
# synthetic dataset; substitute the output of a generative model here.
syndata = data.iloc[4000:]

# --- Evaluate the synthetic data ---
results = evaluation.evaluate(
    syndata, train, test,
    target, continuous_features, categorical_features, device
)

# --- Print results ---
# `results` exposes `_asdict()`; each value is printed with three decimals.
for metric, value in results._asdict().items():
    print(f"{metric}: {value:.3f}")