Add support for Unicode and make conversion of categorical values robust.

This adds support for Unicode strings, normalizes categorical set values in the same way as categorical values, and validates their types.

As a consequence, boolean values in categorical sets are converted to lowercase, matching the treatment of categorical features.
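To illustrate the motivation (this snippet is editorial, not part of the commit): the previous conversion path relied on NumPy's implicit str-to-bytes cast via `np.array(..., dtype=np.bytes_)`, which uses the ASCII codec, so Unicode categorical values failed before this change.

```python
import numpy as np

# NumPy casts str to bytes with the ASCII codec, so non-ASCII values fail:
try:
    np.array(["café", "thé"], dtype=np.bytes_)
except UnicodeEncodeError as e:
    print(e)  # 'ascii' codec can't encode character ...

# This commit instead encodes each value explicitly as UTF-8:
print(np.array([v.encode("utf-8") for v in ["café", "thé"]], dtype=np.bytes_))
# [b'caf\xc3\xa9' b'th\xc3\xa9']
```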

PiperOrigin-RevId: 675906253
damianoamatruda authored and copybara-github committed Sep 18, 2024
1 parent ee38cda commit eefb2e0
Showing 5 changed files with 398 additions and 28 deletions.
4 changes: 4 additions & 0 deletions yggdrasil_decision_forests/port/python/CHANGELOG.md
@@ -11,6 +11,8 @@
   learners that do not support these capabilities.
 - By default, `model.analyze` runs for a maximum of 20 seconds (i.e.
   `maximum_duration=20` by default).
+- Convert boolean values in categorical sets to lowercase, matching the
+  treatment of categorical features.
 
 ### Feature
 
@@ -31,6 +33,8 @@
   cores.
 - Add multi-threaded results in `model.benchmark`.
 - Add argument to control the maximum duration of `model.analyze`.
+- Add support for Unicode strings, normalize categorical set values in the
+  same way as categorical values, and validate their types.
 
 ### Fix
 
124 changes: 97 additions & 27 deletions yggdrasil_decision_forests/port/python/ydf/dataset/dataset.py
@@ -18,6 +18,7 @@
 from typing import Any, Dict, List, Optional, Sequence, Union
 
 import numpy as np
+import numpy.typing as npt
 
 from yggdrasil_decision_forests.dataset import data_spec_pb2
 from ydf.cc import ydf
@@ -50,6 +51,45 @@ def data_spec(self) -> data_spec_pb2.DataSpecification:
     """
     return self._dataset.data_spec()
 
+  def _normalize_categorical_string_values(
+      self,
+      column: dataspec.Column,
+      values: npt.ArrayLike,
+      original_column_data: Any,
+  ) -> npt.NDArray[np.bytes_]:
+    """Normalizes a sequence of categorical string values into an array of bytes."""
+
+    def normalize_categorical_string_value(value: Any) -> bytes:
+      """Normalizes a categorical string value into a bytes literal."""
+      if isinstance(value, str):
+        return value.encode("utf-8")
+      if isinstance(value, (bytes, np.bytes_)):
+        return value
+      if isinstance(value, (bool, np.bool_)):
+        return b"true" if value else b"false"
+      if isinstance(value, (int, np.integer)):
+        return str(value).encode("utf-8")
+      if isinstance(value, (float, np.floating)):
+        raise ValueError(
+            f"Cannot import column {column.name!r} with"
+            f" semantic={column.semantic} as it contains floating point"
+            " values.\nNote: If the column is a label, make sure the correct"
+            " task is selected. For example, you cannot train a classification"
+            " model (task=ydf.Task.CLASSIFICATION) with floating point labels."
+        )
+      raise ValueError(
+          f"Cannot import column {column.name!r} with"
+          f" semantic={column.semantic} and"
+          f" type={_type(original_column_data)}.\nNote: If the column is a"
+          " label, the semantic was selected based on the task. For example,"
+          " task=ydf.Task.CLASSIFICATION requires a CATEGORICAL compatible"
+          " label column, and task=ydf.Task.REGRESSION requires a NUMERICAL"
+          " compatible label column."
+      )
+
+    normalized_values = [normalize_categorical_string_value(v) for v in values]
+    return np.array(normalized_values, dtype=np.bytes_)
+
   def _add_column(
       self,
       column: dataspec.Column,
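The new helper maps each supported scalar type to a bytes literal. A condensed standalone mirror of the inner function, for illustration only (`normalize` is a hypothetical name, not part of the commit):

```python
import numpy as np

def normalize(value) -> bytes:
    # Order matters: bool is a subclass of int, so it must be checked first.
    if isinstance(value, str):
        return value.encode("utf-8")  # Unicode -> UTF-8 bytes
    if isinstance(value, (bytes, np.bytes_)):
        return value  # already bytes, kept as-is
    if isinstance(value, (bool, np.bool_)):
        return b"true" if value else b"false"  # lowercase, as for CATEGORICAL
    if isinstance(value, (int, np.integer)):
        return str(value).encode("utf-8")  # integers become decimal strings
    raise ValueError("floating point and other types are rejected")

assert normalize("café") == "café".encode("utf-8")
assert normalize(True) == b"true"
assert normalize(np.int64(7)) == b"7"
```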
@@ -114,7 +154,9 @@ def _add_column(
     elif column.semantic == dataspec.Semantic.CATEGORICAL:
       force_dictionary = None
       if not isinstance(column_data, np.ndarray):
-        column_data = np.array(column_data, dtype=np.bytes_)
+        column_data = self._normalize_categorical_string_values(
+            column, column_data, original_column_data
+        )
       ydf_dtype = dataspec.np_dtype_to_ydf_dtype(column_data.dtype)
 
       if column_data.dtype.type in [np.bool_]:
@@ -128,18 +170,12 @@
           and (dictionary_size := dense_integer_dictionary_size(column_data))
       ):
         column_data = column_data.astype(np.bytes_)
-        force_dictionary = [dataspec.YDF_OOD_BYTES] + [
-            str(i).encode("utf-8") for i in range(dictionary_size)
-        ]
-      elif (
-          column_data.dtype.type
-          in [
-              np.object_,
-              np.bytes_,
-              np.str_,
-          ]
-          or column_data.dtype.type in dataspec.NP_SUPPORTED_INT_DTYPE
-      ):
+        force_dictionary = [dataspec.YDF_OOD_BYTES, *range(dictionary_size)]
+      elif column_data.dtype.type in [np.object_, np.str_]:
+        column_data = self._normalize_categorical_string_values(
+            column, column_data, original_column_data
+        )
+      elif column_data.dtype.type in dataspec.NP_SUPPORTED_INT_DTYPE:
         column_data = column_data.astype(np.bytes_)
       elif np.issubdtype(column_data.dtype, np.floating):
         message = (
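In the integer branch above, `dense_integer_dictionary_size` gates the forced dictionary. Its contract is not shown in this diff; a plausible sketch of the assumed semantics (values forming a dense range 0..k-1 yield k, anything else a falsy result):

```python
import numpy as np

def dense_integer_dictionary_size(values: np.ndarray) -> int:
    """Assumed semantics: k if values are exactly {0, ..., k-1}, else 0."""
    unique = np.unique(values)  # sorted unique values
    if unique.size and unique[0] == 0 and unique[-1] == unique.size - 1:
        return int(unique.size)
    return 0

assert dense_integer_dictionary_size(np.array([1, 0, 2, 1])) == 3
assert dense_integer_dictionary_size(np.array([0, 5])) == 0  # gap -> not dense
```

Under that assumption, `[dataspec.YDF_OOD_BYTES, *range(dictionary_size)]` later collapses to byte strings (`b"0"`, `b"1"`, ...) when converted with `dtype=np.bytes_`.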
@@ -180,11 +216,39 @@
       ):
         raise ValueError("Categorical Set columns must be a list of lists.")
       column_data = np.empty(len(original_column_data), dtype=np.object_)
+      column_data_are_bytes = True
+      force_dictionary = None
       for i, row in enumerate(original_column_data):
-        if isinstance(row, np.ndarray):
-          column_data[i] = row.astype(np.bytes_)
-        elif isinstance(row, list):
-          column_data[i] = np.array(row, dtype=np.bytes_)
+        if isinstance(row, list):
+          column_data[i] = self._normalize_categorical_string_values(
+              column, row, original_column_data
+          )
+        elif isinstance(row, np.ndarray):
+          if row.dtype.type in [np.bool_]:
+            bool_row = row
+            column_data[i] = np.full_like(bool_row, b"false", "|S5")
+            column_data[i][bool_row] = b"true"
+            force_dictionary = [dataspec.YDF_OOD_BYTES, b"false", b"true"]
+          elif row.dtype.type in [np.object_, np.str_]:
+            column_data[i] = self._normalize_categorical_string_values(
+                column, row, original_column_data
+            )
+          elif row.dtype.type in dataspec.NP_SUPPORTED_INT_DTYPE:
+            column_data[i] = row.astype(np.bytes_)
+          elif np.issubdtype(row.dtype, np.floating):
+            raise ValueError(
+                f"Cannot import column {column.name!r} with"
+                f" semantic={column.semantic} as it contains floating point"
+                " values.\nNote: If the column is a label, make sure the"
+                " correct task is selected. For example, you cannot train a"
+                " classification model (task=ydf.Task.CLASSIFICATION) with"
+                " floating point labels."
+            )
+          elif row.dtype.type == np.bytes_:
+            column_data[i] = row
+          else:
+            column_data_are_bytes = False
+            break
         elif not row:
           column_data[i] = np.array([b""], dtype=np.bytes_)
         else:
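The boolean-row branch above avoids a per-element Python loop by filling a `|S5` array (wide enough to hold `b"false"`) and then overwriting the `True` positions. In isolation:

```python
import numpy as np

bool_row = np.array([True, False, True])
as_bytes = np.full_like(bool_row, b"false", "|S5")  # dtype "|S5" fits b"false"
as_bytes[bool_row] = b"true"  # boolean mask selects the True positions
print(as_bytes)  # [b'true' b'false' b'true']
```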
@@ -195,16 +259,22 @@
           )
       ydf_dtype = dataspec.np_dtype_to_ydf_dtype(column_data.dtype)
 
-      if inference_args is not None:
-        guide = dataspec.categorical_column_guide(column, inference_args)
-        self._dataset.PopulateColumnCategoricalSetNPBytes(
-            column.name, column_data, **guide, ydf_dtype=ydf_dtype
-        )
-      else:
-        self._dataset.PopulateColumnCategoricalSetNPBytes(
-            column.name, column_data, ydf_dtype=ydf_dtype, column_idx=column_idx
-        )
-      return
+      if column_data_are_bytes:
+        if inference_args is not None:
+          guide = dataspec.categorical_column_guide(column, inference_args)
+          if force_dictionary:
+            guide["dictionary"] = np.array(force_dictionary, dtype=np.bytes_)
+          self._dataset.PopulateColumnCategoricalSetNPBytes(
+              column.name, column_data, **guide, ydf_dtype=ydf_dtype
+          )
+        else:
+          self._dataset.PopulateColumnCategoricalSetNPBytes(
+              column.name,
+              column_data,
+              ydf_dtype=ydf_dtype,
+              column_idx=column_idx,
+          )
+        return
 
     elif column.semantic == dataspec.Semantic.HASH:
       if not isinstance(column_data, np.ndarray):
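Taken together, categorical and categorical-set columns now accept Unicode strings and booleans end to end. A hypothetical usage sketch (data and column names invented; assumes the public `ydf` Python API):

```python
import pandas as pd
import ydf  # the public YDF Python package

# Invented example data: a Unicode label and a categorical-set feature mixing
# strings and booleans, normalized to UTF-8 bytes and b"true"/b"false".
df = pd.DataFrame({
    "tags": [["café", True], ["thé"], ["café", False], [], ["thé", True], ["café"]],
    "label": ["oui", "non", "oui", "non", "oui", "non"],
})

model = ydf.RandomForestLearner(
    label="label",
    features=[("tags", ydf.Semantic.CATEGORICAL_SET)],
).train(df)
```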