feat: Improve Flatbuffers Data Format Performance (#3989)

Update the schema to remove an extra layer of indirection for every feature by removing the Feature table from the schema. This reduces a flatbuffer "pointer"-dereference (plus some overhead) per feature. Also: * Increases unit-testing coverage for Flatbuffers parser * Implements desired alignment (including explicit off-align) support in io_buf for reading packed binary data * Adds more detailed error reporting in Flatbuffers parser Misc: * Add launch profile for CTests for Visual Studio Code * Python lint --------- Co-authored-by: Griffin Bassman <griffinbassman@gmail.com> Co-authored-by: Jacob Alber <jacob.alber@microsoft.com>
VowpalWabbit · Feb 9, 2024 · db7f024 · db7f024
1 parent 4f33d61
commit db7f024
Show file tree

Hide file tree

Showing 41 changed files with 2,477 additions and 184 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "(ctest) Launch",
+            "type": "cppdbg",
+            "cwd": "${workspaceFolder}",
+            "request": "launch",
+            "program": "${cmake.testProgram}",
+            "args": [ "${cmake.testArgs}" ]
+        }
+    ]
+}
diff --git a/python/docs/source/tutorials/DFtoVW_tutorial.ipynb b/python/docs/source/tutorials/DFtoVW_tutorial.ipynb
@@ -802,15 +802,17 @@
     "\n",
     "# Adding columns for easier visualization\n",
     "weights_df[\"feature_name\"] = weights_df.apply(\n",
-    "    lambda row: row.vw_feature_name.split(\"=\")[0]\n",
-    "    if row.is_cat\n",
-    "    else row.vw_feature_name,\n",
+    "    lambda row: (\n",
+    "        row.vw_feature_name.split(\"=\")[0] if row.is_cat else row.vw_feature_name\n",
+    "    ),\n",
     "    axis=1,\n",
     ")\n",
     "weights_df[\"feature_value\"] = weights_df.apply(\n",
-    "    lambda row: row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
-    "    if row.is_cat\n",
-    "    else row.vw_feature_name,\n",
+    "    lambda row: (\n",
+    "        row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
+    "        if row.is_cat\n",
+    "        else row.vw_feature_name\n",
+    "    ),\n",
     "    axis=1,\n",
     ")\n",
     "weights_df.sort_values([\"feature_name\", \"feature_value\"], inplace=True)"

diff --git a/python/tests/confidence_sequence.py b/python/tests/confidence_sequence.py
@@ -189,6 +189,5 @@ def lblogwealth(self, *, t, sumXt, v, eta, s, alpha):
 
         return max(
             0,
-            (sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll)
-            / t,
+            (sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll) / t,
         )
diff --git a/python/tests/crminustwo.py b/python/tests/crminustwo.py
@@ -440,21 +440,23 @@ def intervaldiff(
                     candidates.append(
                         (
                             gstar,
-                            None
-                            if isclose(kappa, 0)
-                            else {
-                                "kappastar": kappa,
-                                "betastar": beta,
-                                "gammastar": gamma,
-                                "taustar": tau,
-                                "ufake": ufake,
-                                "wfake": wfake,
-                                "rfake": rex,
-                                "qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
-                                * (b + g * u + t * w + s * (u - w) * r)
-                                / ((num + 1) * k),
-                                "mle": mle,
-                            },
+                            (
+                                None
+                                if isclose(kappa, 0)
+                                else {
+                                    "kappastar": kappa,
+                                    "betastar": beta,
+                                    "gammastar": gamma,
+                                    "taustar": tau,
+                                    "ufake": ufake,
+                                    "wfake": wfake,
+                                    "rfake": rex,
+                                    "qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
+                                    * (b + g * u + t * w + s * (u - w) * r)
+                                    / ((num + 1) * k),
+                                    "mle": mle,
+                                }
+                            ),
                         )
                     )
 

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
@@ -532,9 +532,9 @@ def parse(
                     for ex in str_ex
                 ]
             ):
-                str_ex: List[
-                    Example
-                ] = str_ex  # pytype: disable=annotation-type-mismatch
+                str_ex: List[Example] = (
+                    str_ex  # pytype: disable=annotation-type-mismatch
+                )
                 return str_ex
 
         if not isinstance(str_ex, (list, str)):

diff --git a/test/run_tests.py b/test/run_tests.py
@@ -68,17 +68,21 @@ def _are_same(expected: Any, actual: Any, key: str) -> Tuple[bool, str]:
         elif isinstance(expected, (int, bool, str)):
             return (
                 expected == actual,
-                f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
-                if expected != actual
-                else "",
+                (
+                    f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
+                    if expected != actual
+                    else ""
+                ),
             )
         elif isinstance(expected, (float)):
             delta = abs(expected - actual)
             return (
                 delta < epsilon,
-                f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
-                if delta >= epsilon
-                else "",
+                (
+                    f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
+                    if delta >= epsilon
+                    else ""
+                ),
             )
         elif isinstance(expected, dict):
             expected_keys = set(expected.keys())

diff --git a/test/save_resume_test.py b/test/save_resume_test.py
@@ -1,6 +1,7 @@
 """
 Test that the models generated with and without --predict_only_model produce the same predictions when loaded in test_mode.
 """
+
 import sys
 import os
 import optparse

diff --git a/test/train-sets/0001.fb b/test/train-sets/0001.fb
diff --git a/test/train-sets/ccb.fb b/test/train-sets/ccb.fb
diff --git a/test/train-sets/cs.fb b/test/train-sets/cs.fb
diff --git a/test/train-sets/multiclass.fb b/test/train-sets/multiclass.fb
diff --git a/test/train-sets/multilabel.fb b/test/train-sets/multilabel.fb
diff --git a/test/train-sets/rcv1_cb_eval.fb b/test/train-sets/rcv1_cb_eval.fb
diff --git a/test/train-sets/rcv1_raw_cb_small.fb b/test/train-sets/rcv1_raw_cb_small.fb
diff --git a/test/train-sets/wiki256_no_label.fb b/test/train-sets/wiki256_no_label.fb
diff --git a/utl/flatbuffer/vw_to_flat.cc b/utl/flatbuffer/vw_to_flat.cc
@@ -299,10 +299,10 @@ void to_flat::create_no_label(VW::example* v, ExampleBuilder& ex_builder)
   ex_builder.label = VW::parsers::flatbuffer::Createno_label(_builder, (uint8_t)'\000').Union();
 }
 
-flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(VW::features::audit_iterator begin,
-    VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit)
+// Create namespace when audit is true
+flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace_audit(
+    VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash)
 {
-  std::vector<flatbuffers::Offset<VW::parsers::flatbuffer::Feature>> fts;
   std::stringstream ss;
   ss << index;
 
@@ -316,26 +316,61 @@ flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespac
   if (find_ns_offset == _share_examples.end())
   {
     flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
+    std::vector<flatbuffers::Offset<flatbuffers::String>> feature_names;
+    std::vector<float> feature_values;
+    std::vector<uint64_t> feature_hashes;
+
     // new namespace
-    if (audit)
+
+    std::string ns_name;
+    for (auto it = begin; it != end; ++it)
     {
-      std::string ns_name;
-      for (auto it = begin; it != end; ++it)
-      {
-        ns_name = it.audit()->ns;
-        fts.push_back(
-            VW::parsers::flatbuffer::CreateFeatureDirect(_builder, it.audit()->name.c_str(), it.value(), it.index()));
-      }
-      namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, ns_name.c_str(), index, &fts, hash);
+      if ((it.audit()->ns).c_str() != nullptr) ns_name = it.audit()->ns;
+
+      (feature_names).push_back(_builder.CreateString(it.audit()->name.c_str()));
+      (feature_values).push_back(it.value());
+      (feature_hashes).push_back(it.index());
     }
-    else
+    namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
+        _builder, ns_name.c_str(), index, hash, &feature_names, &feature_values, &feature_hashes);
+
+    _share_examples[refid] = namespace_offset;
+  }
+
+  return _share_examples[refid];
+}
+
+// Create namespace when audit is false
+flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(
+    features::const_iterator begin, features::const_iterator end, VW::namespace_index index, uint64_t hash)
+{
+  std::stringstream ss;
+  ss << index;
+
+  for (auto it = begin; it != end; ++it) { ss << it.index() << it.value(); }
+  ss << ":" << hash;
+
+  std::string s = ss.str();
+  uint64_t refid = VW::uniform_hash(s.c_str(), s.size(), 0);
+  const auto find_ns_offset = _share_examples.find(refid);
+
+  if (find_ns_offset == _share_examples.end())
+  {
+    flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
+    std::vector<float> feature_values;
+    std::vector<uint64_t> feature_hashes;
+
+    for (auto it = begin; it != end; ++it)
     {
-      for (auto it = begin; it != end; ++it)
+      if (it.value() != 0)  // store the feature data only if the value is non zero
       {
-        fts.push_back(VW::parsers::flatbuffer::CreateFeatureDirect(_builder, nullptr, it.value(), it.index()));
+        (feature_values).push_back(it.value());
+        (feature_hashes).push_back(it.index());
       }
-      namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, nullptr, index, &fts, hash);
     }
+    namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
+        _builder, nullptr, index, hash, nullptr, &feature_values, &feature_hashes);
+
     _share_examples[refid] = namespace_offset;
   }
 
@@ -438,13 +473,25 @@ void to_flat::convert_txt_to_flat(VW::workspace& all)
           VW::details::flatten_namespace_extents(ae->feature_space[ns].namespace_extents, ae->feature_space[ns].size());
       auto unflattened_with_ranges_that_dont_have_extents = unflatten_namespace_extents_dont_skip(flattened_extents);
 
-      for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
+      if (all.output_config.audit || all.output_config.hash_inv)
+      {
+        for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
+        {
+          // The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
+          auto created_ns = create_namespace_audit(ae->feature_space[ns].audit_begin() + extent.begin_index,
+              ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash);
+          namespaces.push_back(created_ns);
+        }
+      }
+      else
       {
-        // The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
-        auto created_ns = create_namespace(ae->feature_space[ns].audit_begin() + extent.begin_index,
-            ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash,
-            all.output_config.audit || all.output_config.hash_inv);
-        namespaces.push_back(created_ns);
+        for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
+        {
+          // The extent hash for a non-hash-extent will be 0, which is the same as the field no existing to flatbuffers.
+          auto created_ns = create_namespace(ae->feature_space[ns].cbegin() + extent.begin_index,
+              ae->feature_space[ns].cbegin() + extent.end_index, ns, extent.hash);
+          namespaces.push_back(created_ns);
+        }
       }
     }
     std::string tag(ae->tag.begin(), ae->tag.size());

diff --git a/utl/flatbuffer/vw_to_flat.h b/utl/flatbuffer/vw_to_flat.h
@@ -85,6 +85,8 @@ class to_flat
   void write_to_file(bool collection, bool is_multiline, MultiExampleBuilder& multi_ex_builder,
       ExampleBuilder& ex_builder, std::ofstream& outfile);
 
-  flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(VW::features::audit_iterator begin,
-      VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit);
+  flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(
+      VW::features::const_iterator begin, VW::features::const_iterator end, VW::namespace_index index, uint64_t hash);
+  flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace_audit(
+      VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash);
 };
diff --git a/vowpalwabbit/core/CMakeLists.txt b/vowpalwabbit/core/CMakeLists.txt
@@ -481,6 +481,7 @@ set(vw_core_test_sources
       tests/flat_example_test.cc
       tests/guard_test.cc
       tests/interactions_test.cc
+      tests/io_alignment_test.cc
       tests/loss_functions_test.cc
       tests/math_test.cc
       tests/merge_header_opts_test.cc

diff --git a/vowpalwabbit/core/include/vw/core/api_status.h b/vowpalwabbit/core/include/vw/core/api_status.h
@@ -258,3 +258,14 @@ int report_error(status_builder& sb, const First& first, const Rest&... rest)
     return sb << VW::experimental::error_code::code##_s
 
 #endif  // RETURN_ERROR_LS
+
+#ifndef RETURN_IF_FAIL
+/**
+ * @brief Error reporting macro to test and return on error
+ */
+#  define RETURN_IF_FAIL(x)                               \
+    do {                                                  \
+      int retval__LINE__ = (x);                           \
+      if (retval__LINE__ != 0) { return retval__LINE__; } \
+    } while (0)
+#endif
diff --git a/vowpalwabbit/core/include/vw/core/array_parameters_dense.h b/vowpalwabbit/core/include/vw/core/array_parameters_dense.h
@@ -102,10 +102,7 @@ class dense_parameters
   dense_parameters(dense_parameters&&) noexcept;
 
   bool not_null();
-  VW::weight* first()
-  {
-    return _begin.get();
-  }  // TODO: Temporary fix for allreduce.
+  VW::weight* first() { return _begin.get(); }  // TODO: Temporary fix for allreduce.
 
   VW::weight* data() { return _begin.get(); }
 

diff --git a/vowpalwabbit/core/include/vw/core/error_data.h b/vowpalwabbit/core/include/vw/core/error_data.h
@@ -14,9 +14,21 @@ ERROR_CODE_DEFINITION(
     3, options_disagree, "Different values specified for two options that are constrained to be the same.")
 ERROR_CODE_DEFINITION(4, not_implemented, "Not implemented.")
 ERROR_CODE_DEFINITION(5, native_exception, "Native exception: ")
+ERROR_CODE_DEFINITION(6, fb_parser_namespace_missing, "Missing Namespace. ")
+ERROR_CODE_DEFINITION(7, fb_parser_feature_values_missing, "Missing Feature Values. ")
+ERROR_CODE_DEFINITION(8, fb_parser_feature_hashes_names_missing, "Missing Feature Names and Feature Hashes. ")
+ERROR_CODE_DEFINITION(9, nothing_to_parse, "No new object to be read from file. ")
+ERROR_CODE_DEFINITION(10, fb_parser_unknown_example_type, "Unkown Example type. ")
+ERROR_CODE_DEFINITION(11, fb_parser_name_hash_missing, "Missing name and hash field in namespace. ")
+ERROR_CODE_DEFINITION(
+    12, fb_parser_size_mismatch_ft_hashes_ft_values, "Size of feature hashes and feature values do not match. ")
+ERROR_CODE_DEFINITION(
+    13, fb_parser_size_mismatch_ft_names_ft_values, "Size of feature names and feature values do not match. ")
+ERROR_CODE_DEFINITION(14, unknown_label_type, "Label type in Flatbuffer not understood. ")
 
 // TODO: This is temporary until we switch to the new error handling mechanism.
 ERROR_CODE_DEFINITION(10000, vw_exception, "vw_exception: ")
+ERROR_CODE_DEFINITION(20000, internal_error, "BUGBUG: ")
 
 #ifdef ERROR_CODE_DEFINITION_NOOP
 #undef ERROR_CODE_DEFINITION