Add support for categorical feature auto-encoding #8

Closed
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
/tmp/
*.lock
/vendor/lib_lightgbm.*
/.venv/
1 change: 1 addition & 0 deletions Gemfile
@@ -9,3 +9,4 @@ gem "matrix"
gem "numo-narray", platform: [:ruby, :x64_mingw]
gem "rover-df", platform: [:ruby, :x64_mingw]
gem "csv"
gem "debug"
11 changes: 10 additions & 1 deletion lib/lightgbm/booster.rb
@@ -1,15 +1,19 @@
require_relative "categorical_feature_encoder"

module LightGBM
class Booster
attr_accessor :best_iteration, :train_data_name

def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
if model_str
model_from_string(model_str)
@categorical_feature_encoder = CategoricalFeatureEncoder.new(model_str.each_line)
elsif model_file
out_num_iterations = ::FFI::MemoryPointer.new(:int)
create_handle do |handle|
check_result FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
end
@categorical_feature_encoder = CategoricalFeatureEncoder.new(File.foreach(model_file))
else
params ||= {}
set_verbosity(params)
@@ -164,7 +168,12 @@ def predict(input, start_iteration: nil, num_iteration: nil, **params)
num_iteration ||= best_iteration
num_class = self.num_class

flat_input = input.flatten
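# When the model declares categorical features, label-encode each row before flattening so the values reach LightGBM as floats.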
flat_input = if @categorical_feature_encoder
input.flat_map { |row| @categorical_feature_encoder.apply(row) }
else
input.flatten
end

handle_missing(flat_input)
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
data.write_array_of_double(flat_input)
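With this change, Booster#predict accepts rows whose categorical columns hold raw values (strings, booleans, numbers), and the encoder converts them to the float codes the model was trained with before the data is handed to the C API. A minimal usage sketch, mirroring the tests below; the model path and feature values are illustrative:

require "lightgbm"

# Model trained with a pandas "category" column as its last feature.
booster = LightGBM::Booster.new(model_file: "test/support/model_categorical.txt")

# String categories are label-encoded to floats transparently inside #predict.
rows = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
p booster.predict(rows)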
73 changes: 73 additions & 0 deletions lib/lightgbm/categorical_feature_encoder.rb
@@ -0,0 +1,73 @@
require "json"

module LightGBM
# Converts LightGBM categorical features to Float, using label encoding.
# The categorical feature indices and category mappings are extracted from the LightGBM model file.
class CategoricalFeatureEncoder
# Initializes a new CategoricalFeatureEncoder instance.
#
# @param model_enumerable [Enumerable] Enumerable with each line of LightGBM model file.
def initialize(model_enumerable)
@categorical_feature = []
@pandas_categorical = []

load_categorical_features(model_enumerable)
end

# Returns a new array with categorical features converted to Float, using label encoding.
def apply(feature_values)
return feature_values if @categorical_feature.empty?

transformed_features = feature_values.dup

@categorical_feature.each_with_index do |feature_index, pandas_categorical_index|
pandas_categorical_entry = @pandas_categorical[pandas_categorical_index]
value = feature_values[feature_index]
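# Unknown or nil category values fall back to Float::NAN, which LightGBM treats as missing.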
transformed_features[feature_index] = pandas_categorical_entry.fetch(value, Float::NAN).to_f
end

transformed_features
end

private

def load_categorical_features(model_enumerable)
categorical_found = false
pandas_found = false

model_enumerable.each_entry do |line|
# Format: "[categorical_feature: 0,1,2,3,4,5]"
if line.start_with?("[categorical_feature:")
parts = line.split("categorical_feature:")
last_part = parts.last
next if last_part.nil?

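# Strip whitespace and the trailing "]" to leave the comma-separated indices.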
values = last_part.strip[0...-1]
next if values.nil?

@categorical_feature = values.split(",").map(&:to_i)
categorical_found = true
end

# Format: "pandas_categorical:[[-1.0, 0.0, 1.0], ["", "a"], [false, true]]"
if line.start_with?("pandas_categorical:")
parts = line.split("pandas_categorical:")
values = parts[1]
next if values.nil?

@pandas_categorical = JSON.parse(values).map do |array|
array.each_with_index.to_h
end
pandas_found = true
end

# Break the loop if both lines are found
break if categorical_found && pandas_found
end

if @categorical_feature.size != @pandas_categorical.size
raise "categorical_feature and pandas_categorical mismatch"
end
end
end
end
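The encoder can also be exercised on its own against the two model-file lines it parses. A minimal sketch; the model fragment below is hand-written for illustration, since a real saved model contains many more sections:

require "lightgbm"

model = <<~MODEL
  [categorical_feature: 1]
  pandas_categorical:[["low", "high"]]
MODEL

encoder = LightGBM::CategoricalFeatureEncoder.new(model.each_line)
p encoder.apply([3.5, "high"])   # => [3.5, 1.0]
p encoder.apply([3.5, "other"])  # => [3.5, NaN]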
38 changes: 38 additions & 0 deletions test/booster_test.rb
@@ -8,6 +8,25 @@ def test_model_file
assert_elements_in_delta [0.9823112229173586, 0.9583143724610858], y_pred.first(2)
end

def test_model_file_with_categorical_features
booster = LightGBM::Booster.new(model_file: "test/support/model_categorical.txt")

x_test = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = [
{"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"},
{"x0" => 7.5, "x1" => 0.5, "x2" => 7.9, "x3" => "0"}
]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = {"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"}
y_pred = booster.predict(x_test)
assert_in_delta 1.014580415457883, y_pred
end

def test_model_str
x_test = [[3.7, 1.2, 7.2, 9.0], [7.5, 0.5, 7.9, 0.0]]
booster = LightGBM::Booster.new(model_str: File.read("test/support/model.txt"))
@@ -23,6 +42,25 @@ def test_model_from_string
assert_elements_in_delta [0.9823112229173586, 0.9583143724610858], y_pred.first(2)
end

def test_model_str_with_categorical_features
booster = LightGBM::Booster.new(model_str: File.read("test/support/model_categorical.txt"))

x_test = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = [
{"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"},
{"x0" => 7.5, "x1" => 0.5, "x2" => 7.9, "x3" => "0"}
]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = {"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"}
y_pred = booster.predict(x_test)
assert_in_delta 1.014580415457883, y_pred
end

def test_feature_importance
assert_equal [280, 285, 335, 148], booster.feature_importance
end
41 changes: 41 additions & 0 deletions test/categorical_feature_encoder_test.rb
@@ -0,0 +1,41 @@
require_relative "test_helper"

class CategoricalFeatureEncoderTest < Minitest::Test
def setup
model = <<~MODEL
[categorical_feature: 1,2,3]
pandas_categorical:[[-1.0, 0.0, 1.0], ["red", "green", "blue"], [false, true]]
MODEL

@encoder = LightGBM::CategoricalFeatureEncoder.new(model.each_line)
end

def test_apply_with_categorical_features
input = [42.0, 0.0, "green", true]
expected = [42.0, 1.0, 1.0, 1.0]

assert_equal(expected, @encoder.apply(input))
end

def test_apply_with_non_categorical_features
input = [42.0, "non_categorical", 39.0, false]
expected = [42.0, Float::NAN, Float::NAN, 0]

assert_equal(expected, @encoder.apply(input))
end

def test_apply_with_missing_values
input = [42.0, nil, "red", nil]
expected = [42.0, Float::NAN, 0.0, Float::NAN]
result = @encoder.apply(input)

assert_equal(expected, result)
end

def test_apply_with_boolean_values
input = [42.0, -1.0, "green", false]
expected = [42.0, 0.0, 1.0, 0.0]

assert_equal(expected, @encoder.apply(input))
end
end
65 changes: 50 additions & 15 deletions test/support/booster.py
@@ -1,22 +1,57 @@
# Run this script to regenerate the test/support/model.txt and test/support/model_categorical.txt files

import lightgbm as lgb
import pandas as pd

df = pd.read_csv('test/support/data.csv')
params = {'verbosity': -1}

def booster():
df = pd.read_csv('test/support/data.csv')

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]

train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train(params, train_data)
bst.save_model('test/support/model.txt')

bst = lgb.Booster(model_file='test/support/model.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())

def booster_categorical():
df = pd.read_csv('test/support/data.csv', dtype={'x3': 'category'})

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature='auto')
bst = lgb.train(params, train_data)
bst.save_model('test/support/model_categorical.txt')

bst = lgb.Booster(model_file='test/support/model_categorical.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]
print('booster -> model.txt')
booster()

train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train({}, train_data)
bst.save_model('test/support/model.txt')
print('')

bst = lgb.Booster(model_file='test/support/model.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())
print('categorical')
booster_categorical()
5 changes: 3 additions & 2 deletions test/support/classifier.py
@@ -25,15 +25,16 @@
print()
print('test_multiclass')

model = lgb.LGBMClassifier()
model = lgb.LGBMClassifier(verbosity=-1)
model.fit(X_train, ym_train)
print(model.predict(X_test)[0:100].tolist())
print(model.predict_proba(X_test)[0].tolist())
print(model.feature_importances_.tolist())

print()
print('test_early_stopping')
model.fit(X_train, ym_train, eval_set=[(X_test, ym_test)], early_stopping_rounds=5, verbose=True)
model = lgb.LGBMClassifier(early_stopping_round=5, verbosity=1)
model.fit(X_train, ym_train, eval_set=[(X_test, ym_test)])

print()
print('test_missing_numeric')
35 changes: 19 additions & 16 deletions test/support/cv.py
@@ -16,42 +16,45 @@
regression_params = {'objective': 'regression', 'verbosity': -1}
regression_train = lgb.Dataset(X_train, label=y_train)
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(eval_hist['l2-mean'][0])
print(eval_hist['l2-mean'][-1])
print(eval_hist['l2-stdv'][0])
print(eval_hist['l2-stdv'][-1])
print(eval_hist['valid l2-mean'][0])
print(eval_hist['valid l2-mean'][-1])
print(eval_hist['valid l2-stdv'][0])
print(eval_hist['valid l2-stdv'][-1])

print()
print('test_binary')

binary_params = {'objective': 'binary', 'verbosity': -1}
binary_train = lgb.Dataset(X_train, label=y_train.replace(2, 1))
eval_hist = lgb.cv(binary_params, binary_train, shuffle=False, stratified=False)
print(eval_hist['binary_logloss-mean'][0])
print(eval_hist['binary_logloss-mean'][-1])
print(eval_hist['binary_logloss-stdv'][0])
print(eval_hist['binary_logloss-stdv'][-1])
print(eval_hist['valid binary_logloss-mean'][0])
print(eval_hist['valid binary_logloss-mean'][-1])
print(eval_hist['valid binary_logloss-stdv'][0])
print(eval_hist['valid binary_logloss-stdv'][-1])

print()
print('test_multiclass')

multiclass_params = {'objective': 'multiclass', 'num_class': 3, 'verbosity': -1}
multiclass_train = lgb.Dataset(X_train, label=y_train)
eval_hist = lgb.cv(multiclass_params, multiclass_train, shuffle=False, stratified=False)
print(eval_hist['multi_logloss-mean'][0])
print(eval_hist['multi_logloss-mean'][-1])
print(eval_hist['multi_logloss-stdv'][0])
print(eval_hist['multi_logloss-stdv'][-1])
print(eval_hist['valid multi_logloss-mean'][0])
print(eval_hist['valid multi_logloss-mean'][-1])
print(eval_hist['valid multi_logloss-stdv'][0])
print(eval_hist['valid multi_logloss-stdv'][-1])

print('')
print('test_early_stopping_early')

eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False, verbose_eval=True, early_stopping_rounds=5)
print(len(eval_hist['l2-mean']))
regression_params = {'objective': 'regression', 'verbosity': 1, 'early_stopping_round': 5}
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(len(eval_hist['valid l2-mean']))

print('')
print('test_early_stopping_not_early')

eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False, verbose_eval=True, early_stopping_rounds=500)
print(len(eval_hist['l2-mean']))
regression_params = {'objective': 'regression', 'verbosity': 1, 'early_stopping_round': 500}
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(len(eval_hist['valid l2-mean']))

