Skip to content

Commit

Permalink
feat: Improve Flatbuffers Data Format Performance (#3989)
Browse files Browse the repository at this point in the history
Update the schema to remove an extra layer of indirection for every feature by removing the Feature table from the schema. This reduces a flatbuffer "pointer"-dereference (plus some overhead) per feature.

Also:
* Increases unit-testing coverage for Flatbuffers parser
* Implements desired alignment (including explicit off-align) support in io_buf for reading packed binary data
* Adds more detailed error reporting in Flatbuffers parser

Misc:
* Add launch profile for CTests for Visual Studio Code
* Python lint

---------

Co-authored-by: Griffin Bassman <griffinbassman@gmail.com>
Co-authored-by: Jacob Alber <jacob.alber@microsoft.com>
  • Loading branch information
3 people authored Feb 9, 2024
1 parent 4f33d61 commit db7f024
Show file tree
Hide file tree
Showing 41 changed files with 2,477 additions and 184 deletions.
16 changes: 16 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "(ctest) Launch",
"type": "cppdbg",
"cwd": "${workspaceFolder}",
"request": "launch",
"program": "${cmake.testProgram}",
"args": [ "${cmake.testArgs}" ]
}
]
}
14 changes: 8 additions & 6 deletions python/docs/source/tutorials/DFtoVW_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -802,15 +802,17 @@
"\n",
"# Adding columns for easier visualization\n",
"weights_df[\"feature_name\"] = weights_df.apply(\n",
" lambda row: row.vw_feature_name.split(\"=\")[0]\n",
" if row.is_cat\n",
" else row.vw_feature_name,\n",
" lambda row: (\n",
" row.vw_feature_name.split(\"=\")[0] if row.is_cat else row.vw_feature_name\n",
" ),\n",
" axis=1,\n",
")\n",
"weights_df[\"feature_value\"] = weights_df.apply(\n",
" lambda row: row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
" if row.is_cat\n",
" else row.vw_feature_name,\n",
" lambda row: (\n",
" row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
" if row.is_cat\n",
" else row.vw_feature_name\n",
" ),\n",
" axis=1,\n",
")\n",
"weights_df.sort_values([\"feature_name\", \"feature_value\"], inplace=True)"
Expand Down
3 changes: 1 addition & 2 deletions python/tests/confidence_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,5 @@ def lblogwealth(self, *, t, sumXt, v, eta, s, alpha):

return max(
0,
(sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll)
/ t,
(sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll) / t,
)
32 changes: 17 additions & 15 deletions python/tests/crminustwo.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,21 +440,23 @@ def intervaldiff(
candidates.append(
(
gstar,
None
if isclose(kappa, 0)
else {
"kappastar": kappa,
"betastar": beta,
"gammastar": gamma,
"taustar": tau,
"ufake": ufake,
"wfake": wfake,
"rfake": rex,
"qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
* (b + g * u + t * w + s * (u - w) * r)
/ ((num + 1) * k),
"mle": mle,
},
(
None
if isclose(kappa, 0)
else {
"kappastar": kappa,
"betastar": beta,
"gammastar": gamma,
"taustar": tau,
"ufake": ufake,
"wfake": wfake,
"rfake": rex,
"qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
* (b + g * u + t * w + s * (u - w) * r)
/ ((num + 1) * k),
"mle": mle,
}
),
)
)

Expand Down
6 changes: 3 additions & 3 deletions python/vowpalwabbit/pyvw.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,9 @@ def parse(
for ex in str_ex
]
):
str_ex: List[
Example
] = str_ex # pytype: disable=annotation-type-mismatch
str_ex: List[Example] = (
str_ex # pytype: disable=annotation-type-mismatch
)
return str_ex

if not isinstance(str_ex, (list, str)):
Expand Down
16 changes: 10 additions & 6 deletions test/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,21 @@ def _are_same(expected: Any, actual: Any, key: str) -> Tuple[bool, str]:
elif isinstance(expected, (int, bool, str)):
return (
expected == actual,
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
if expected != actual
else "",
(
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
if expected != actual
else ""
),
)
elif isinstance(expected, (float)):
delta = abs(expected - actual)
return (
delta < epsilon,
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
if delta >= epsilon
else "",
(
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
if delta >= epsilon
else ""
),
)
elif isinstance(expected, dict):
expected_keys = set(expected.keys())
Expand Down
1 change: 1 addition & 0 deletions test/save_resume_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Test that the models generated with and without --predict_only_model produce the same predictions when loaded in test_mode.
"""

import sys
import os
import optparse
Expand Down
Binary file modified test/train-sets/0001.fb
Binary file not shown.
Binary file modified test/train-sets/ccb.fb
Binary file not shown.
Binary file modified test/train-sets/cs.fb
Binary file not shown.
Binary file modified test/train-sets/multiclass.fb
Binary file not shown.
Binary file modified test/train-sets/multilabel.fb
Binary file not shown.
Binary file modified test/train-sets/rcv1_cb_eval.fb
Binary file not shown.
Binary file modified test/train-sets/rcv1_raw_cb_small.fb
Binary file not shown.
Binary file modified test/train-sets/wiki256_no_label.fb
Binary file not shown.
91 changes: 69 additions & 22 deletions utl/flatbuffer/vw_to_flat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,10 @@ void to_flat::create_no_label(VW::example* v, ExampleBuilder& ex_builder)
ex_builder.label = VW::parsers::flatbuffer::Createno_label(_builder, (uint8_t)'\000').Union();
}

flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(VW::features::audit_iterator begin,
VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit)
// Create namespace when audit is true
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace_audit(
VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash)
{
std::vector<flatbuffers::Offset<VW::parsers::flatbuffer::Feature>> fts;
std::stringstream ss;
ss << index;

Expand All @@ -316,26 +316,61 @@ flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespac
if (find_ns_offset == _share_examples.end())
{
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
std::vector<flatbuffers::Offset<flatbuffers::String>> feature_names;
std::vector<float> feature_values;
std::vector<uint64_t> feature_hashes;

// new namespace
if (audit)

std::string ns_name;
for (auto it = begin; it != end; ++it)
{
std::string ns_name;
for (auto it = begin; it != end; ++it)
{
ns_name = it.audit()->ns;
fts.push_back(
VW::parsers::flatbuffer::CreateFeatureDirect(_builder, it.audit()->name.c_str(), it.value(), it.index()));
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, ns_name.c_str(), index, &fts, hash);
if ((it.audit()->ns).c_str() != nullptr) ns_name = it.audit()->ns;

(feature_names).push_back(_builder.CreateString(it.audit()->name.c_str()));
(feature_values).push_back(it.value());
(feature_hashes).push_back(it.index());
}
else
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
_builder, ns_name.c_str(), index, hash, &feature_names, &feature_values, &feature_hashes);

_share_examples[refid] = namespace_offset;
}

return _share_examples[refid];
}

// Create namespace when audit is false
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(
features::const_iterator begin, features::const_iterator end, VW::namespace_index index, uint64_t hash)

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-asan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-asan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-ubsan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-ubsan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]
{
std::stringstream ss;
ss << index;

for (auto it = begin; it != end; ++it) { ss << it.index() << it.value(); }
ss << ":" << hash;

std::string s = ss.str();
uint64_t refid = VW::uniform_hash(s.c_str(), s.size(), 0);
const auto find_ns_offset = _share_examples.find(refid);

if (find_ns_offset == _share_examples.end())
{
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
std::vector<float> feature_values;
std::vector<uint64_t> feature_hashes;

for (auto it = begin; it != end; ++it)
{
for (auto it = begin; it != end; ++it)
if (it.value() != 0) // store the feature data only if the value is non zero
{
fts.push_back(VW::parsers::flatbuffer::CreateFeatureDirect(_builder, nullptr, it.value(), it.index()));
(feature_values).push_back(it.value());
(feature_hashes).push_back(it.index());
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, nullptr, index, &fts, hash);
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
_builder, nullptr, index, hash, nullptr, &feature_values, &feature_hashes);

_share_examples[refid] = namespace_offset;
}

Expand Down Expand Up @@ -438,13 +473,25 @@ void to_flat::convert_txt_to_flat(VW::workspace& all)
VW::details::flatten_namespace_extents(ae->feature_space[ns].namespace_extents, ae->feature_space[ns].size());
auto unflattened_with_ranges_that_dont_have_extents = unflatten_namespace_extents_dont_skip(flattened_extents);

for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
if (all.output_config.audit || all.output_config.hash_inv)
{
for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
auto created_ns = create_namespace_audit(ae->feature_space[ns].audit_begin() + extent.begin_index,
ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash);
namespaces.push_back(created_ns);
}
}
else
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
auto created_ns = create_namespace(ae->feature_space[ns].audit_begin() + extent.begin_index,
ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash,
all.output_config.audit || all.output_config.hash_inv);
namespaces.push_back(created_ns);
for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
auto created_ns = create_namespace(ae->feature_space[ns].cbegin() + extent.begin_index,
ae->feature_space[ns].cbegin() + extent.end_index, ns, extent.hash);
namespaces.push_back(created_ns);
}
}
}
std::string tag(ae->tag.begin(), ae->tag.size());
Expand Down
6 changes: 4 additions & 2 deletions utl/flatbuffer/vw_to_flat.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class to_flat
void write_to_file(bool collection, bool is_multiline, MultiExampleBuilder& multi_ex_builder,
ExampleBuilder& ex_builder, std::ofstream& outfile);

flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(VW::features::audit_iterator begin,
VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit);
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(
VW::features::const_iterator begin, VW::features::const_iterator end, VW::namespace_index index, uint64_t hash);
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace_audit(
VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash);
};
1 change: 1 addition & 0 deletions vowpalwabbit/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ set(vw_core_test_sources
tests/flat_example_test.cc
tests/guard_test.cc
tests/interactions_test.cc
tests/io_alignment_test.cc
tests/loss_functions_test.cc
tests/math_test.cc
tests/merge_header_opts_test.cc
Expand Down
11 changes: 11 additions & 0 deletions vowpalwabbit/core/include/vw/core/api_status.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,14 @@ int report_error(status_builder& sb, const First& first, const Rest&... rest)
return sb << VW::experimental::error_code::code##_s

#endif // RETURN_ERROR_LS

#ifndef RETURN_IF_FAIL
/**
* @brief Error reporting macro to test and return on error
*/
# define RETURN_IF_FAIL(x) \
do { \
int retval__LINE__ = (x); \
if (retval__LINE__ != 0) { return retval__LINE__; } \
} while (0)
#endif
5 changes: 1 addition & 4 deletions vowpalwabbit/core/include/vw/core/array_parameters_dense.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,7 @@ class dense_parameters
dense_parameters(dense_parameters&&) noexcept;

bool not_null();
VW::weight* first()
{
return _begin.get();
} // TODO: Temporary fix for allreduce.
VW::weight* first() { return _begin.get(); } // TODO: Temporary fix for allreduce.

VW::weight* data() { return _begin.get(); }

Expand Down
12 changes: 12 additions & 0 deletions vowpalwabbit/core/include/vw/core/error_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,21 @@ ERROR_CODE_DEFINITION(
3, options_disagree, "Different values specified for two options that are constrained to be the same.")
ERROR_CODE_DEFINITION(4, not_implemented, "Not implemented.")
ERROR_CODE_DEFINITION(5, native_exception, "Native exception: ")
ERROR_CODE_DEFINITION(6, fb_parser_namespace_missing, "Missing Namespace. ")
ERROR_CODE_DEFINITION(7, fb_parser_feature_values_missing, "Missing Feature Values. ")
ERROR_CODE_DEFINITION(8, fb_parser_feature_hashes_names_missing, "Missing Feature Names and Feature Hashes. ")
ERROR_CODE_DEFINITION(9, nothing_to_parse, "No new object to be read from file. ")
ERROR_CODE_DEFINITION(10, fb_parser_unknown_example_type, "Unkown Example type. ")
ERROR_CODE_DEFINITION(11, fb_parser_name_hash_missing, "Missing name and hash field in namespace. ")
ERROR_CODE_DEFINITION(
12, fb_parser_size_mismatch_ft_hashes_ft_values, "Size of feature hashes and feature values do not match. ")
ERROR_CODE_DEFINITION(
13, fb_parser_size_mismatch_ft_names_ft_values, "Size of feature names and feature values do not match. ")
ERROR_CODE_DEFINITION(14, unknown_label_type, "Label type in Flatbuffer not understood. ")

// TODO: This is temporary until we switch to the new error handling mechanism.
ERROR_CODE_DEFINITION(10000, vw_exception, "vw_exception: ")
ERROR_CODE_DEFINITION(20000, internal_error, "BUGBUG: ")

#ifdef ERROR_CODE_DEFINITION_NOOP
#undef ERROR_CODE_DEFINITION
Expand Down
Loading

0 comments on commit db7f024

Please sign in to comment.