Add support for categorical feature auto-encoding #8

Closed
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
/tmp/
*.lock
/vendor/lib_lightgbm.*
/.venv/
1 change: 1 addition & 0 deletions Gemfile
@@ -9,3 +9,4 @@ gem "matrix"
gem "numo-narray", platform: [:ruby, :x64_mingw]
gem "rover-df", platform: [:ruby, :x64_mingw]
gem "csv"
gem "debug"
11 changes: 10 additions & 1 deletion lib/lightgbm/booster.rb
@@ -1,15 +1,19 @@
require_relative "categorical_feature_encoder"

module LightGBM
class Booster
attr_accessor :best_iteration, :train_data_name

def initialize(params: nil, train_set: nil, model_file: nil, model_str: nil)
if model_str
model_from_string(model_str)
@categorical_feature_encoder = CategoricalFeatureEncoder.new(model_str.each_line)
elsif model_file
out_num_iterations = ::FFI::MemoryPointer.new(:int)
create_handle do |handle|
check_result FFI.LGBM_BoosterCreateFromModelfile(model_file, out_num_iterations, handle)
end
@categorical_feature_encoder = CategoricalFeatureEncoder.new(File.foreach(model_file))
else
params ||= {}
set_verbosity(params)
@@ -164,7 +168,12 @@ def predict(input, start_iteration: nil, num_iteration: nil, **params)
num_iteration ||= best_iteration
num_class = self.num_class

flat_input = input.flatten
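# When the model declares categorical features, label-encode each row before flattening so the values reach LightGBM as floats.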
flat_input = if @categorical_feature_encoder
input.flat_map { |row| @categorical_feature_encoder.apply(row) }
else
input.flatten
end

handle_missing(flat_input)
data = ::FFI::MemoryPointer.new(:double, input.count * input.first.count)
data.write_array_of_double(flat_input)
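With this change, Booster#predict accepts rows whose categorical columns hold raw values (strings, booleans, numbers), and the encoder converts them to the float codes the model was trained with before the data is handed to the C API. A minimal usage sketch, mirroring the tests below; the model path and feature values are illustrative:

require "lightgbm"

# Model trained with a pandas "category" column as its last feature.
booster = LightGBM::Booster.new(model_file: "test/support/model_categorical.txt")

# String categories are label-encoded to floats transparently inside #predict.
rows = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
p booster.predict(rows)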
73 changes: 73 additions & 0 deletions lib/lightgbm/categorical_feature_encoder.rb
@@ -0,0 +1,73 @@
require "json"

module LightGBM
# Converts LightGBM categorical features to Float, using label encoding.
# The categorical feature indices and category mappings are extracted from the LightGBM model file.
class CategoricalFeatureEncoder
# Initializes a new CategoricalFeatureEncoder instance.
#
# @param model_enumerable [Enumerable] Enumerable with each line of LightGBM model file.
def initialize(model_enumerable)
@categorical_feature = []
@pandas_categorical = []

load_categorical_features(model_enumerable)
end

# Returns a new array with categorical features converted to Float, using label encoding.
def apply(feature_values)
return feature_values if @categorical_feature.empty?

transformed_features = feature_values.dup

@categorical_feature.each_with_index do |feature_index, pandas_categorical_index|
pandas_categorical_entry = @pandas_categorical[pandas_categorical_index]
value = feature_values[feature_index]
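# Unknown or nil category values fall back to Float::NAN, which LightGBM treats as missing.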
transformed_features[feature_index] = pandas_categorical_entry.fetch(value, Float::NAN).to_f
end

transformed_features
end

private

def load_categorical_features(model_enumerable)
categorical_found = false
pandas_found = false

model_enumerable.each_entry do |line|
# Format: "[categorical_feature: 0,1,2,3,4,5]"
if line.start_with?("[categorical_feature:")
parts = line.split("categorical_feature:")
last_part = parts.last
next if last_part.nil?

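# Strip whitespace and the trailing "]" to leave the comma-separated indices.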
values = last_part.strip[0...-1]
next if values.nil?

@categorical_feature = values.split(",").map(&:to_i)
categorical_found = true
end

# Format: "pandas_categorical:[[-1.0, 0.0, 1.0], ["", "a"], [false, true]]"
if line.start_with?("pandas_categorical:")
parts = line.split("pandas_categorical:")
values = parts[1]
next if values.nil?

@pandas_categorical = JSON.parse(values).map do |array|
array.each_with_index.to_h
end
pandas_found = true
end

# Break the loop if both lines are found
break if categorical_found && pandas_found
end

if @categorical_feature.size != @pandas_categorical.size
raise "categorical_feature and pandas_categorical mismatch"
end
end
end
end
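The encoder can also be exercised on its own against the two model-file lines it parses. A minimal sketch; the model fragment below is hand-written for illustration, since a real saved model contains many more sections:

require "lightgbm"

model = <<~MODEL
  [categorical_feature: 1]
  pandas_categorical:[["low", "high"]]
MODEL

encoder = LightGBM::CategoricalFeatureEncoder.new(model.each_line)
p encoder.apply([3.5, "high"])   # => [3.5, 1.0]
p encoder.apply([3.5, "other"])  # => [3.5, NaN]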
38 changes: 38 additions & 0 deletions test/booster_test.rb
@@ -8,6 +8,25 @@ def test_model_file
assert_elements_in_delta [0.9823112229173586, 0.9583143724610858], y_pred.first(2)
end

def test_model_file_with_categorical_features
booster = LightGBM::Booster.new(model_file: "test/support/model_categorical.txt")

x_test = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = [
{"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"},
{"x0" => 7.5, "x1" => 0.5, "x2" => 7.9, "x3" => "0"}
]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = {"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"}
y_pred = booster.predict(x_test)
assert_in_delta 1.014580415457883, y_pred
end

def test_model_str
x_test = [[3.7, 1.2, 7.2, 9.0], [7.5, 0.5, 7.9, 0.0]]
booster = LightGBM::Booster.new(model_str: File.read("test/support/model.txt"))
@@ -23,6 +42,25 @@ def test_model_from_string
assert_elements_in_delta [0.9823112229173586, 0.9583143724610858], y_pred.first(2)
end

def test_model_str_with_categorical_features
booster = LightGBM::Booster.new(model_str: File.read("test/support/model_categorical.txt"))

x_test = [[3.7, 1.2, 7.2, "9"], [7.5, 0.5, 7.9, "0"]]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = [
{"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"},
{"x0" => 7.5, "x1" => 0.5, "x2" => 7.9, "x3" => "0"}
]
y_pred = booster.predict(x_test)
assert_elements_in_delta [1.014580415457883, 0.9327349972866771], y_pred.first(2)

x_test = {"x0" => 3.7, "x1" => 1.2, "x2" => 7.2, "x3" => "9"}
y_pred = booster.predict(x_test)
assert_in_delta 1.014580415457883, y_pred
end

def test_feature_importance
assert_equal [280, 285, 335, 148], booster.feature_importance
end
41 changes: 41 additions & 0 deletions test/categorical_feature_encoder_test.rb
@@ -0,0 +1,41 @@
require_relative "test_helper"

class CategoricalFeatureEncoderTest < Minitest::Test
def setup
model = <<~MODEL
[categorical_feature: 1,2,3]
pandas_categorical:[[-1.0, 0.0, 1.0], ["red", "green", "blue"], [false, true]]
MODEL

@encoder = LightGBM::CategoricalFeatureEncoder.new(model.each_line)
end

def test_apply_with_categorical_features
input = [42.0, 0.0, "green", true]
expected = [42.0, 1.0, 1.0, 1.0]

assert_equal(expected, @encoder.apply(input))
end

def test_apply_with_non_categorical_features
input = [42.0, "non_categorical", 39.0, false]
expected = [42.0, Float::NAN, Float::NAN, 0]

assert_equal(expected, @encoder.apply(input))
end

def test_apply_with_missing_values
input = [42.0, nil, "red", nil]
expected = [42.0, Float::NAN, 0.0, Float::NAN]
result = @encoder.apply(input)

assert_equal(expected, result)
end

def test_apply_with_boolean_values
input = [42.0, -1.0, "green", false]
expected = [42.0, 0.0, 1.0, 0.0]

assert_equal(expected, @encoder.apply(input))
end
end
65 changes: 50 additions & 15 deletions test/support/booster.py
@@ -1,22 +1,57 @@
# Run this script to regenerate the test/support/model.txt and test/support/model_categorical.txt files

import lightgbm as lgb
import pandas as pd

df = pd.read_csv('test/support/data.csv')
params = {'verbosity': -1}

def booster():
df = pd.read_csv('test/support/data.csv')

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]

train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train(params, train_data)
bst.save_model('test/support/model.txt')

bst = lgb.Booster(model_file='test/support/model.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())

def booster_categorical():
df = pd.read_csv('test/support/data.csv', dtype={'x3': 'category'})

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature='auto')
bst = lgb.train(params, train_data)
bst.save_model('test/support/model_categorical.txt')

bst = lgb.Booster(model_file='test/support/model_categorical.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())

X = df.drop(columns=['y'])
y = df['y']

X_train = X[:300]
y_train = y[:300]
X_test = X[300:]
y_test = y[300:]
print('booster -> model.txt')
booster()

train_data = lgb.Dataset(X_train, label=y_train)
bst = lgb.train({}, train_data)
bst.save_model('test/support/model.txt')
print('')

bst = lgb.Booster(model_file='test/support/model.txt')
print('x', X_train[:2].to_numpy().tolist())
print('predict', bst.predict(X_train)[:2].tolist())
print('feature_importance', bst.feature_importance().tolist())
print('feature_name', bst.feature_name())
print('categorical')
booster_categorical()
5 changes: 3 additions & 2 deletions test/support/classifier.py
@@ -25,15 +25,16 @@
print()
print('test_multiclass')

model = lgb.LGBMClassifier()
model = lgb.LGBMClassifier(verbosity=-1)
model.fit(X_train, ym_train)
print(model.predict(X_test)[0:100].tolist())
print(model.predict_proba(X_test)[0].tolist())
print(model.feature_importances_.tolist())

print()
print('test_early_stopping')
model.fit(X_train, ym_train, eval_set=[(X_test, ym_test)], early_stopping_rounds=5, verbose=True)
model = lgb.LGBMClassifier(early_stopping_round=5, verbosity=1)
model.fit(X_train, ym_train, eval_set=[(X_test, ym_test)])

print()
print('test_missing_numeric')
35 changes: 19 additions & 16 deletions test/support/cv.py
@@ -16,42 +16,45 @@
regression_params = {'objective': 'regression', 'verbosity': -1}
regression_train = lgb.Dataset(X_train, label=y_train)
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(eval_hist['l2-mean'][0])
print(eval_hist['l2-mean'][-1])
print(eval_hist['l2-stdv'][0])
print(eval_hist['l2-stdv'][-1])
print(eval_hist['valid l2-mean'][0])
print(eval_hist['valid l2-mean'][-1])
print(eval_hist['valid l2-stdv'][0])
print(eval_hist['valid l2-stdv'][-1])

print()
print('test_binary')

binary_params = {'objective': 'binary', 'verbosity': -1}
binary_train = lgb.Dataset(X_train, label=y_train.replace(2, 1))
eval_hist = lgb.cv(binary_params, binary_train, shuffle=False, stratified=False)
print(eval_hist['binary_logloss-mean'][0])
print(eval_hist['binary_logloss-mean'][-1])
print(eval_hist['binary_logloss-stdv'][0])
print(eval_hist['binary_logloss-stdv'][-1])
print(eval_hist['valid binary_logloss-mean'][0])
print(eval_hist['valid binary_logloss-mean'][-1])
print(eval_hist['valid binary_logloss-stdv'][0])
print(eval_hist['valid binary_logloss-stdv'][-1])

print()
print('test_multiclass')

multiclass_params = {'objective': 'multiclass', 'num_class': 3, 'verbosity': -1}
multiclass_train = lgb.Dataset(X_train, label=y_train)
eval_hist = lgb.cv(multiclass_params, multiclass_train, shuffle=False, stratified=False)
print(eval_hist['multi_logloss-mean'][0])
print(eval_hist['multi_logloss-mean'][-1])
print(eval_hist['multi_logloss-stdv'][0])
print(eval_hist['multi_logloss-stdv'][-1])
print(eval_hist['valid multi_logloss-mean'][0])
print(eval_hist['valid multi_logloss-mean'][-1])
print(eval_hist['valid multi_logloss-stdv'][0])
print(eval_hist['valid multi_logloss-stdv'][-1])

print('')
print('test_early_stopping_early')

eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False, verbose_eval=True, early_stopping_rounds=5)
print(len(eval_hist['l2-mean']))
regression_params = {'objective': 'regression', 'verbosity': 1, 'early_stopping_round': 5}
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(len(eval_hist['valid l2-mean']))

print('')
print('test_early_stopping_not_early')

eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False, verbose_eval=True, early_stopping_rounds=500)
print(len(eval_hist['l2-mean']))
regression_params = {'objective': 'regression', 'verbosity': 1, 'early_stopping_round': 500}
eval_hist = lgb.cv(regression_params, regression_train, shuffle=False, stratified=False)
print(len(eval_hist['valid l2-mean']))

