From b36800e238f008b6465d9db9d1c1f08b6226c85f Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 5 Oct 2023 11:34:02 +1300 Subject: [PATCH 1/5] bump [compat] MLJBase = "1" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index fec4b24..f446ca9 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -MLJBase = "0.21" +MLJBase = "1" OrderedCollections = "1.6" julia = "1.6" MLJModelInterface = "1.9" From 9bef4d66c2677ef9503ce085055b193c4046fb10 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 5 Oct 2023 11:34:29 +1300 Subject: [PATCH 2/5] bump 0.1.1 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f446ca9..67f0a70 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBalancing" uuid = "45f359ea-796d-4f51-95a5-deb1a414c586" authors = ["Essam Wisam ", "Anthony Blaom and contributors"] -version = "0.1.0" +version = "0.1.1" [deps] MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" From eca9b963102db00e97be3180b4fa4b820711c85d Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 5 Oct 2023 11:38:36 +1300 Subject: [PATCH 3/5] ensure julia 1.6 is tested --- .github/workflows/CI.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 4d7ef92..02c3364 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: version: - - '1.8' + - '1.6' - '1' os: [ubuntu-latest, windows-latest, macOS-latest] From 4cde40bd1355f1870ffcd40fbf04e67de74a4052 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 5 Oct 2023 12:08:28 +1300 Subject: [PATCH 4/5] revert julia support - now julia >= 1.7 in project and CI --- .github/workflows/CI.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 02c3364..4145a5e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: version: - - '1.6' + - '1.7' - '1' os: [ubuntu-latest, windows-latest, macOS-latest] diff --git a/Project.toml b/Project.toml index 67f0a70..67dfc34 100644 --- a/Project.toml +++ b/Project.toml @@ -14,10 +14,10 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] MLJBase = "1" OrderedCollections = "1.6" -julia = "1.6" MLJModelInterface = "1.9" MLUtils = "0.4" StatsBase = "0.34" +julia = "1.7" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" From c5b17d68d5b84f85cc39b26bcfd87d5eba938335 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 6 Oct 2023 15:59:19 +1300 Subject: [PATCH 5/5] fix exception throwing to fix julia 1.7 fails --- src/balanced_bagging.jl | 40 +++++++++++++++++++++------------------- src/balanced_model.jl | 6 +++--- test/balanced_bagging.jl | 5 +++-- test/balanced_model.jl | 14 ++++++++------ 4 files changed, 35 insertions(+), 30 deletions(-) diff --git a/src/balanced_bagging.jl b/src/balanced_bagging.jl index 4137077..f43cb41 100644 --- a/src/balanced_bagging.jl +++ b/src/balanced_bagging.jl @@ -1,4 +1,3 @@ - """ Return a dictionary `result` mapping each unique value in a given abstract vector `y` to the vector of indices where that value occurs. @@ -16,18 +15,21 @@ function group_inds(y::AbstractVector{T}) where {T} return freeze(result) end -const ERR_MULTICLASS_UNSUPP(num_classes) = - "Only binary classification supported by BalancedBaggingClassifier. Got $num_classes classes" +const ERR_MULTICLASS_UNSUPP(num_classes) = ArgumentError( +"Only binary classification supported by BalancedBaggingClassifier. "* + "Got $num_classes classes" +) """ -Given an abstract vector `y` where any element takes one of two values, return the indices of the - most frequent of them, the indices of the least frequent of them, and the counts of each. +Given an abstract vector `y` where any element takes one of two values, return the +indices of the most frequent of them, the indices of the least frequent of them, and the +counts of each. """ function get_majority_minority_inds_counts(y) # a tuple mapping each class to its indices labels_inds = collect(group_inds(y)) num_classes = length(labels_inds) - num_classes == 2 || throw(ArgumentError(ERR_MULTICLASS_UNSUPP(num_classes))) + num_classes == 2 || throw(ERR_MULTICLASS_UNSUPP(num_classes)) # get the length of each class first_class_count = length(labels_inds[1][2]) second_class_count = length(labels_inds[2][2]) @@ -42,9 +44,9 @@ function get_majority_minority_inds_counts(y) end """ -Given data `X`, `y` where `X` is a table and `y` is an abstract vector (which may be wrapped in nodes), +Given data `X`, `y` where `X` is a table and `y` is an abstract vector (which may be wrapped in nodes), the indices and counts of the majority and minority classes and abstract rng, - return `X_sub`, `y_sub`, in the form of nodes, which are the result of randomly undersampling + return `X_sub`, `y_sub`, in the form of nodes, which are the result of randomly undersampling the majority class data in `X`, `y` so that both classes occur equally frequently. """ function get_some_balanced_subset( @@ -89,8 +91,8 @@ function BalancedBaggingClassifier(; rng = Random.default_rng(), ) model === nothing && error(ERR_MISSING_CLF) - T < 0 && error(ERR_BAD_T) - rng = rng_handler(rng) + T < 0 && error(ERR_BAD_T) + rng = rng_handler(rng) return BalancedBaggingClassifier(model, T, rng) end @@ -178,8 +180,8 @@ Construct an instance with default hyper-parameters using the syntax `bagging_mo Given a probablistic classifier.`BalancedBaggingClassifier` performs bagging by undersampling only majority data in each bag so that its includes as much samples as in the minority data. This is proposed with an Adaboost classifier where the output scores are averaged in the paper -Xu-Ying Liu, Jianxin Wu, & Zhi-Hua Zhou. (2009). Exploratory Undersampling for Class-Imbalance Learning. -IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39 (2), 539–5501 +Xu-Ying Liu, Jianxin Wu, & Zhi-Hua Zhou. (2009). Exploratory Undersampling for Class-Imbalance Learning. +IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39 (2), 539–5501 # Training data @@ -206,7 +208,7 @@ Train the machine with `fit!(mach, rows=...)`. - `T::Integer=0`: The number of bags to be used in the ensemble. If not given, will be set as the ratio between the frequency of the majority and minority classes. Can be later found in `report(mach)`. -- `rng::Union{AbstractRNG, Integer}=default_rng()`: Either an `AbstractRNG` object or an `Integer` +- `rng::Union{AbstractRNG, Integer}=default_rng()`: Either an `AbstractRNG` object or an `Integer` seed to be used with `Xoshiro` # Operations @@ -234,13 +236,13 @@ logistic_model = LogisticClassifier() model = BalancedBaggingClassifier(model=logistic_model, T=5) # Load the data and train the BalancedBaggingClassifier -X, y = Imbalance.generate_imbalanced_data(100, 5; num_vals_per_category = [3, 2], - class_probs = [0.9, 0.1], - type = "ColTable", +X, y = Imbalance.generate_imbalanced_data(100, 5; num_vals_per_category = [3, 2], + class_probs = [0.9, 0.1], + type = "ColTable", rng=42) julia> Imbalance.checkbalance(y) -1: ▇▇▇▇▇▇▇▇▇▇ 16 (19.0%) -0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 84 (100.0%) +1: ▇▇▇▇▇▇▇▇▇▇ 16 (19.0%) +0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 84 (100.0%) mach = machine(model, X, y) |> fit! @@ -250,4 +252,4 @@ yhat = predict(mach, X) # probabilistic predictions predict_mode(mach, X) # point predictions ``` """ -BalancedBaggingClassifier \ No newline at end of file +BalancedBaggingClassifier diff --git a/src/balanced_model.jl b/src/balanced_model.jl index 88cd78e..6b7d9a0 100644 --- a/src/balanced_model.jl +++ b/src/balanced_model.jl @@ -116,7 +116,7 @@ for model_type in SUPPORTED_MODEL_TYPES eval(ex) end -const ERR_NO_PROP = "trying to access property $name which does not exist" +const ERR_NO_PROP = ArgumentError("trying to access property $name which does not exist") # overload set property to set the property from the vector in the struct for model_type in SUPPORTED_MODEL_TYPES struct_name = MODELTYPE_TO_COMPOSITETYPE[model_type] @@ -128,7 +128,7 @@ for model_type in SUPPORTED_MODEL_TYPES !isnothing(idx) && return getfield(b, :balancers)[idx] = val # the other only option is model name === :model && return setfield(b, :model, val) - error(ERR_NO_PROP) + throw(ERR_NO_PROP) end end eval(ex) @@ -198,4 +198,4 @@ for composite_type in COMPOSITE_TYPES MMI.$trait(::Type{<:$composite_type{balancernames, M}}) where {balancernames, M} = MMI.$trait(M) end |> eval end -end \ No newline at end of file +end diff --git a/test/balanced_bagging.jl b/test/balanced_bagging.jl index 678192d..4d7ccc1 100644 --- a/test/balanced_bagging.jl +++ b/test/balanced_bagging.jl @@ -5,8 +5,9 @@ @test MLJBalancing.get_majority_minority_inds_counts(y) == ([1, 2, 3, 4, 8], [5, 6, 7], 5, 3) y = [0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2] - @test_throws MLJBalancing.ERR_MULTICLASS_UNSUPP(3) MLJBalancing.get_majority_minority_inds_counts( - y, + @test_throws( + MLJBalancing.ERR_MULTICLASS_UNSUPP(3), + MLJBalancing.get_majority_minority_inds_counts(y), ) end diff --git a/test/balanced_model.jl b/test/balanced_model.jl index 19576b6..361a06e 100644 --- a/test/balanced_model.jl +++ b/test/balanced_model.jl @@ -39,9 +39,10 @@ @test_throws MLJBalancing.ERR_MODEL_UNSPECIFIED begin BalancedModel(b1 = balancer1, b2 = balancer2, b3 = balancer3) end - @test_throws "ArgumentError: Only these model supertypes support wrapping: `Probabilistic`, `Deterministic`, and `Interval`.\nModel provided has type `Int64`." begin - BalancedModel(model = 1, b1 = balancer1, b2 = balancer2, b3 = balancer3) - end + @test_throws( + MLJBalancing.ERR_UNSUPPORTED_MODEL(1), + BalancedModel(model = 1, b1 = balancer1, b2 = balancer2, b3 = balancer3), + ) @test_logs (:warn, MLJBalancing.WRN_BALANCER_UNSPECIFIED) begin BalancedModel(model = model_prob) end @@ -80,7 +81,8 @@ Base.getproperty(balanced_model, :b1) == balancer1 Base.setproperty!(balanced_model, :b1, balancer2) Base.getproperty(balanced_model, :b1) == balancer2 - @test_throws MLJBalancing.ERR_NO_PROP begin - Base.setproperty!(balanced_model, :name11, balancer2) - end + @test_throws( + MLJBalancing.ERR_NO_PROP, + Base.setproperty!(balanced_model, :name11, balancer2), + ) end