# example.yaml (forked from TrustAI/DeepConcolic)
{
#
# Training Data
# =============
#
# Size of training sample to use for feature extraction and
# discretization; this defines the `training data' below:
bn_abstr_train_size: 20000,
#
# Proportion of the training data (above) to use for feature
# extraction (defaults to 1, i.e. use all of it).
# feat_extr_train_size: .5,
#
# Feature Extraction (for each layer)
# ===================================
#
# Note that `decomp: "PCA"' is optional as it is the default.
#
# PCA: Use 3 components with randomized SVD algorithm (Note:
# usually quite efficient, yet may impair the generation of test
# cases via the LP-based algorithm):
# feats: { decomp: 'pca', n_components: 3, svd_solver: 'randomized' },
#
# PCA: Use 5 components with the exact algorithm (potentially slower,
# but yields exact inverse transformation---needed for LP-based
# test case generation):
feats: { decomp: 'pca', n_components: 5, svd_solver: 'full' },
#
# PCA: Use 5 components and guess a good algorithm based on the
# size of the training data:
# feats: 5,
#
# PCA: Use as many components as needed to capture at least 50% of
# variance. High values for shallow layers (close to the input) are
# discouraged, as they usually lead to a large number of components
# (which in turn leads to a BN that does not fit in memory):
# feats: 0.5,
#
# ICA: Use 3 components, with up to 5000 iterations and a tolerance
# of 0.01:
# feats: { decomp: 'ica', n_components: 3, max_iter: 5000, tol: 0.01 },
#
# Custom (per-layer) Specifications
# ---------------------------------
#
# To define the feature extraction technique on a per-layer basis,
# one can give a lambda function that takes the layer index as
# argument and returns any of the specification dictionaries above:
# feats: '(lambda li: 3 if li <= 2 else 4)',
#
# As above, but returning a minimum captured variance instead:
# feats: '(lambda li: 0.9 if li > 5 else 0.7 if li > 4 else 0.5 if li > 2 else 0.1)',
#
# Hybrid: Use ICA(3) for the first three layers, PCA(4) elsewhere:
# feats: '(lambda li: { "decomp": "ica", "n_components": 3 } if li <= 2 else { "n_components": 4 })',
#
# Discretization Strategy (for each extracted feature)
# ====================================================
#
# Binarize each feature around 0.0 (this is the default if `discr`
# is not given):
# discr: 'bin',
#
# Partition each feature into 2 intervals (with the quantile
# strategy, which computes interval boundaries so that the projected
# training data is evenly spread across the intervals---the default
# for non-binarization):
# discr: 2,
#
# Perform a 3-clustering of the projected training data, and use 3
# adjacent intervals that each span exactly one cluster:
# discr: { n_bins: 3, strategy: "kmeans" },
#
# Use 4 intervals of identical width that span all training data:
# discr: { n_bins: 4, strategy: "uniform" },
#
# Kernel Density Estimate (KDE):
# ------------------------------
#
# Compute a kernel density estimation based on training data and
# find split locations based on dips (local minima of the
# estimated density function) and plateaux (large-enough intervals
# where the density approaches zero): each dip is associated with
# an interval boundary, whereas each plateau defines a whole
# interval. Roughly, dips are selected using a prominence
# criterion that is defined as a ratio w.r.t the maximum of the
# density function (`kde_dip_prominence_prop', defaulting to
# 1/10). Alternatively, plateaux are intervals where the density
# function is lower than a threshold that is also defined as a
# ratio w.r.t the maximum of the density function
# (`kde_baseline_density_prop`, defaulting to 1/20):
discr: { strategy: 'kde' },
# The above is equivalent to:
# discr: { strategy: 'kde', kde_dip_prominence_prop: 0.1, kde_baseline_density_prop: 0.05 },
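#
# As a rough, hedged reading of the above: lowering
# `kde_dip_prominence_prop` should make dip detection more sensitive
# (splitting on shallower local minima), while raising
# `kde_baseline_density_prop` should let wider regions qualify as
# plateaux; the values below are purely illustrative:
# discr: { strategy: 'kde', kde_dip_prominence_prop: 0.05, kde_baseline_density_prop: 0.1 },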
#
# The additional key `kde_plot_spaces` enables the output of plots
# of the density estimates and resulting intervals, along with
# scattered points that each represent one element of the training
# data. When defined, it may take the value `dens` or `logl`, to
# plot the density estimate or the log-likelihood, respectively.
# Additional arguments control whether to mark dips in the plots,
# and how many training points to include:
# discr: { strategy: 'kde', kde_plot_spaces: 'dens', kde_plot_dip_markers: True, kde_plot_training_samples: 500 },
# discr: { strategy: 'kde', kde_plot_spaces: 'dens', kde_plot_dip_markers: False, kde_plot_training_samples: 0 },
#
# Extended Variants
# -----------------
#
# Extended discretization strategies (with `extended: True') add
# two open-ended intervals (one extending to the left, one to the
# right) that contain no projected training data.
#
# Use 3 extended bins (so, 5 in total) to discretize based on
# 3-clustered training data:
# discr: { n_bins: 3, extended: True, strategy: "kmeans" },
#
# Use 3 extended bins (so, 5 in total) to discretize with 3
# intervals of identical width:
# discr: { n_bins: 3, extended: True, strategy: "uniform" },
#
# Kernel Density Estimates with additional open intervals (which
# may contain a limited amount of projected training data, partly
# depending on `kde_baseline_density_prop`):
# discr: { strategy: 'kde', extended: True },
#
# Custom (per-layer) Specifications
# ---------------------------------
#
# To define the discretization strategy on a per-layer basis, one
# can also use a lambda function that takes the layer index as
# argument and returns any of the specification dictionaries above
# (or None for the default binarization):
# discr: '(lambda li: { "n_bins": 4, "strategy": "uniform" } if li in (1,) else None)',
#
# Alternatively, using the string shorthand and the `dict` constructor:
# discr: '(lambda li: "bin" if li == 3 else dict(n_bins=3, strategy="uniform"))',
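#
# As a further sketch combining options documented above (keys as
# documented; the layer threshold is illustrative only): KDE-based
# discretization with open intervals on deeper layers, and the
# default binarization elsewhere:
# discr: '(lambda li: { "strategy": "kde", "extended": True } if li > 3 else None)',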
#
# Reporting
# =========
#
# Current reporting requires `matplotlib` to be installed.
#
# If True, plot graphs that show how some of the training data is
# spread w.r.t. each extracted feature, for each layer:
report_on_feature_extractions: False,
#
# Size of test sample (also extracted from training data) to use
# for scoring and reporting:
bn_abstr_test_size: 200,
#
# Dump the BN fitted with the (portion of the) training dataset
# that was used to extract and discretize features, into a file
# called 'bn4trained.yml':
dump_bn_with_trained_dataset_distribution: False,
#
# Dump the BN fitted with the initial and generated datasets into a
# file called 'bn4tests.yml' upon termination of the generation
# process:
dump_bn_with_final_dataset_distribution: False,
#
#
# Process Customization
# =====================
#
# The `*_n_jobs` values below default to `1` (unless some `joblib`
# context is set up). Use `-1` to use all processors (see
# https://scikit-learn.org/stable/glossary.html#term-n-jobs)
#
# Number of jobs used to bake and fit the BN abstraction:
bn_abstr_n_jobs: 3,
#
# Number of jobs used to discretize feature components/spaces (see
# https://scikit-learn.org/stable/glossary.html#term-n-jobs for
# details on its semantics):
discr_n_jobs: -1,
#
# Verbosity level (only affects console output):
verbose: 2,
}
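#
# For reference, a minimal hedged variant of this spec, assuming
# that omitted keys fall back to the defaults described in the
# comments above (values are illustrative only):
# {
#   bn_abstr_train_size: 10000,
#   feats: { decomp: 'pca', n_components: 3, svd_solver: 'full' },
#   discr: { strategy: 'kde' },
#   bn_abstr_test_size: 100,
#   verbose: 1,
# }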