diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 4d7ef92..4145a5e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: version: - - '1.8' + - '1.7' - '1' os: [ubuntu-latest, windows-latest, macOS-latest] diff --git a/Project.toml b/Project.toml index fec4b24..67dfc34 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBalancing" uuid = "45f359ea-796d-4f51-95a5-deb1a414c586" authors = ["Essam Wisam ", "Anthony Blaom and contributors"] -version = "0.1.0" +version = "0.1.1" [deps] MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" @@ -12,12 +12,12 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] -MLJBase = "0.21" +MLJBase = "1" OrderedCollections = "1.6" -julia = "1.6" MLJModelInterface = "1.9" MLUtils = "0.4" StatsBase = "0.34" +julia = "1.7" [extras] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/src/balanced_bagging.jl b/src/balanced_bagging.jl index 4137077..f43cb41 100644 --- a/src/balanced_bagging.jl +++ b/src/balanced_bagging.jl @@ -1,4 +1,3 @@ - """ Return a dictionary `result` mapping each unique value in a given abstract vector `y` to the vector of indices where that value occurs. @@ -16,18 +15,21 @@ function group_inds(y::AbstractVector{T}) where {T} return freeze(result) end -const ERR_MULTICLASS_UNSUPP(num_classes) = - "Only binary classification supported by BalancedBaggingClassifier. Got $num_classes classes" +const ERR_MULTICLASS_UNSUPP(num_classes) = ArgumentError( +"Only binary classification supported by BalancedBaggingClassifier. "* + "Got $num_classes classes" +) """ -Given an abstract vector `y` where any element takes one of two values, return the indices of the - most frequent of them, the indices of the least frequent of them, and the counts of each. 
+Given an abstract vector `y` where any element takes one of two values, return the +indices of the most frequent of them, the indices of the least frequent of them, and the +counts of each. """ function get_majority_minority_inds_counts(y) # a tuple mapping each class to its indices labels_inds = collect(group_inds(y)) num_classes = length(labels_inds) - num_classes == 2 || throw(ArgumentError(ERR_MULTICLASS_UNSUPP(num_classes))) + num_classes == 2 || throw(ERR_MULTICLASS_UNSUPP(num_classes)) # get the length of each class first_class_count = length(labels_inds[1][2]) second_class_count = length(labels_inds[2][2]) @@ -42,9 +44,9 @@ function get_majority_minority_inds_counts(y) end """ -Given data `X`, `y` where `X` is a table and `y` is an abstract vector (which may be wrapped in nodes), +Given data `X`, `y` where `X` is a table and `y` is an abstract vector (which may be wrapped in nodes), the indices and counts of the majority and minority classes and abstract rng, - return `X_sub`, `y_sub`, in the form of nodes, which are the result of randomly undersampling + return `X_sub`, `y_sub`, in the form of nodes, which are the result of randomly undersampling the majority class data in `X`, `y` so that both classes occur equally frequently. """ function get_some_balanced_subset( @@ -89,8 +91,8 @@ function BalancedBaggingClassifier(; rng = Random.default_rng(), ) model === nothing && error(ERR_MISSING_CLF) - T < 0 && error(ERR_BAD_T) - rng = rng_handler(rng) + T < 0 && error(ERR_BAD_T) + rng = rng_handler(rng) return BalancedBaggingClassifier(model, T, rng) end @@ -178,8 +180,8 @@ Construct an instance with default hyper-parameters using the syntax `bagging_mo Given a probablistic classifier.`BalancedBaggingClassifier` performs bagging by undersampling only majority data in each bag so that its includes as much samples as in the minority data. 
This is proposed with an Adaboost classifier where the output scores are averaged in the paper -Xu-Ying Liu, Jianxin Wu, & Zhi-Hua Zhou. (2009). Exploratory Undersampling for Class-Imbalance Learning. -IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39 (2), 539–5501 +Xu-Ying Liu, Jianxin Wu, & Zhi-Hua Zhou. (2009). Exploratory Undersampling for Class-Imbalance Learning. +IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), 39 (2), 539–550 # Training data @@ -206,7 +208,7 @@ Train the machine with `fit!(mach, rows=...)`. - `T::Integer=0`: The number of bags to be used in the ensemble. If not given, will be set as the ratio between the frequency of the majority and minority classes. Can be later found in `report(mach)`. -- `rng::Union{AbstractRNG, Integer}=default_rng()`: Either an `AbstractRNG` object or an `Integer` +- `rng::Union{AbstractRNG, Integer}=default_rng()`: Either an `AbstractRNG` object or an `Integer` seed to be used with `Xoshiro` # Operations @@ -234,13 +236,13 @@ logistic_model = LogisticClassifier() model = BalancedBaggingClassifier(model=logistic_model, T=5) # Load the data and train the BalancedBaggingClassifier -X, y = Imbalance.generate_imbalanced_data(100, 5; num_vals_per_category = [3, 2], - class_probs = [0.9, 0.1], - type = "ColTable", +X, y = Imbalance.generate_imbalanced_data(100, 5; num_vals_per_category = [3, 2], + class_probs = [0.9, 0.1], + type = "ColTable", rng=42) julia> Imbalance.checkbalance(y) -1: ▇▇▇▇▇▇▇▇▇▇ 16 (19.0%) -0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 84 (100.0%) +1: ▇▇▇▇▇▇▇▇▇▇ 16 (19.0%) +0: ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 84 (100.0%) mach = machine(model, X, y) |> fit!
@@ -250,4 +252,4 @@ yhat = predict(mach, X) # probabilistic predictions predict_mode(mach, X) # point predictions ``` """ -BalancedBaggingClassifier \ No newline at end of file +BalancedBaggingClassifier diff --git a/src/balanced_model.jl b/src/balanced_model.jl index 88cd78e..6b7d9a0 100644 --- a/src/balanced_model.jl +++ b/src/balanced_model.jl @@ -116,7 +116,8 @@ for model_type in SUPPORTED_MODEL_TYPES eval(ex) end -const ERR_NO_PROP = "trying to access property $name which does not exist" +# Built per call: a top-level string would interpolate a global `name` once, at load time. +const ERR_NO_PROP(name) = ArgumentError("trying to access property $name which does not exist") # overload set property to set the property from the vector in the struct for model_type in SUPPORTED_MODEL_TYPES struct_name = MODELTYPE_TO_COMPOSITETYPE[model_type] @@ -128,7 +129,7 @@ for model_type in SUPPORTED_MODEL_TYPES !isnothing(idx) && return getfield(b, :balancers)[idx] = val # the other only option is model name === :model && return setfield(b, :model, val) - error(ERR_NO_PROP) + throw(ERR_NO_PROP(name)) end end eval(ex) @@ -198,4 +199,4 @@ for composite_type in COMPOSITE_TYPES MMI.$trait(::Type{<:$composite_type{balancernames, M}}) where {balancernames, M} = MMI.$trait(M) end |> eval end -end \ No newline at end of file +end diff --git a/test/balanced_bagging.jl b/test/balanced_bagging.jl index 678192d..4d7ccc1 100644 --- a/test/balanced_bagging.jl +++ b/test/balanced_bagging.jl @@ -5,8 +5,9 @@ @test MLJBalancing.get_majority_minority_inds_counts(y) == ([1, 2, 3, 4, 8], [5, 6, 7], 5, 3) y = [0, 0, 0, 0, 1, 1, 1, 0, 2, 2, 2] - @test_throws MLJBalancing.ERR_MULTICLASS_UNSUPP(3) MLJBalancing.get_majority_minority_inds_counts( - y, + @test_throws( + MLJBalancing.ERR_MULTICLASS_UNSUPP(3), + MLJBalancing.get_majority_minority_inds_counts(y), ) end diff --git a/test/balanced_model.jl b/test/balanced_model.jl index 19576b6..361a06e 100644 --- a/test/balanced_model.jl +++ b/test/balanced_model.jl @@ -39,9 +39,10 @@ @test_throws MLJBalancing.ERR_MODEL_UNSPECIFIED begin BalancedModel(b1 = 
balancer1, b2 = balancer2, b3 = balancer3) end - @test_throws "ArgumentError: Only these model supertypes support wrapping: `Probabilistic`, `Deterministic`, and `Interval`.\nModel provided has type `Int64`." begin - BalancedModel(model = 1, b1 = balancer1, b2 = balancer2, b3 = balancer3) - end + @test_throws( + MLJBalancing.ERR_UNSUPPORTED_MODEL(1), + BalancedModel(model = 1, b1 = balancer1, b2 = balancer2, b3 = balancer3), + ) @test_logs (:warn, MLJBalancing.WRN_BALANCER_UNSPECIFIED) begin BalancedModel(model = model_prob) end @@ -80,7 +81,8 @@ Base.getproperty(balanced_model, :b1) == balancer1 Base.setproperty!(balanced_model, :b1, balancer2) Base.getproperty(balanced_model, :b1) == balancer2 - @test_throws MLJBalancing.ERR_NO_PROP begin - Base.setproperty!(balanced_model, :name11, balancer2) - end + @test_throws( + MLJBalancing.ERR_NO_PROP(:name11), + Base.setproperty!(balanced_model, :name11, balancer2), + ) end