Merge branch 'master' into model_ver_save_load

VowpalWabbit · Mar 14, 2024 · ba53b71 · ba53b71
2 parents 557c8f4 + 9837a0e
commit ba53b71
Show file tree

Hide file tree

Showing 71 changed files with 3,590 additions and 432 deletions.
diff --git a/.github/workflows/asan.yml b/.github/workflows/asan.yml
@@ -21,7 +21,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, ubuntu-latest, macos-latest]
+        #os: [windows-latest, ubuntu-latest, macos-latest]
+        os: [ubuntu-latest, macos-latest] # Temporarily remove windows asan
         preset: [vcpkg-asan-debug, vcpkg-ubsan-debug]
         exclude:
           # UBSan not supported by MSVC on Windows

diff --git a/.github/workflows/build_windows_cmake.yml b/.github/workflows/build_windows_cmake.yml
@@ -25,7 +25,7 @@ jobs:
       CMAKE_BUILD_DIR: ${{ github.workspace }}/vw/build
       SOURCE_DIR: ${{ github.workspace }}/vw
       VCPKG_ROOT: ${{ github.workspace }}/vw/ext_libs/vcpkg
-      VCPKG_REF: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1
+      VCPKG_REF: 53bef8994c541b6561884a8395ea35715ece75db
 
     steps:
       - uses: actions/checkout@v3

diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
@@ -284,7 +284,7 @@ jobs:
     runs-on: windows-2019
     env:
       VCPKG_ROOT: ${{ github.workspace }}\\vcpkg
-      VCPKG_REF: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1
+      VCPKG_REF: 53bef8994c541b6561884a8395ea35715ece75db
       VCPKG_DEFAULT_BINARY_CACHE: ${{ github.workspace }}\vcpkg_binary_cache
     strategy:
       matrix:

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "(ctest) Launch",
+            "type": "cppdbg",
+            "cwd": "${workspaceFolder}",
+            "request": "launch",
+            "program": "${cmake.testProgram}",
+            "args": [ "${cmake.testArgs}" ]
+        }
+    ]
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -60,10 +60,10 @@ if(VW_FEAT_LDA AND NOT BUILD_PYTHON)
   list(APPEND VCPKG_MANIFEST_FEATURES "lda")
 endif()
 
-option(BUILD_TESTING "Build tests" ON)
-if(BUILD_TESTING)
-  list(APPEND VCPKG_MANIFEST_FEATURES "tests")
-endif()
+#option(BUILD_TESTING "Build tests" ON)
+#if(BUILD_TESTING)
+#  list(APPEND VCPKG_MANIFEST_FEATURES "tests")
+#endif()
 
 option(BUILD_BENCHMARKS "Build benchmarks" OFF)
 if(BUILD_BENCHMARKS)
@@ -100,6 +100,31 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_VISIBILITY_INLINES_HIDDEN TRUE)
 set(CMAKE_CXX_VISIBILITY_PRESET "hidden")
 
+option(VW_USE_ASAN "Compile with AddressSanitizer" OFF)
+option(VW_USE_UBSAN "Compile with UndefinedBehaviorSanitizer" OFF)
+
+if(VW_USE_ASAN)
+  add_compile_definitions(VW_USE_ASAN)
+  if(MSVC)
+    add_compile_options(/fsanitize=address)
+    add_link_options(/InferASanLibs /incremental:no /debug)
+  else()
+    add_compile_options(-fsanitize=address -fno-omit-frame-pointer -g3)
+    add_link_options(-fsanitize=address -fno-omit-frame-pointer -g3)
+  endif()
+endif()
+
+if(VW_USE_UBSAN)
+  add_compile_definitions(VW_USE_UBSAN)
+  if(MSVC)
+    message(FATAL_ERROR "UBSan not supported on MSVC")
+  else()
+    add_compile_options(-fsanitize=undefined -fno-sanitize-recover -fno-omit-frame-pointer -g3)
+    add_link_options(-fsanitize=undefined -fno-sanitize-recover -fno-omit-frame-pointer -g3)
+  endif()
+endif()
+
+
 include(VowpalWabbitUtils)
 
 if(MSVC)
@@ -152,33 +177,8 @@ option(VW_SSE2NEON_SYS_DEP "Override using the submodule for SSE2Neon dependency
 option(VW_BUILD_VW_C_WRAPPER "Enable building the c_wrapper project" ON)
 option(vw_BUILD_NET_CORE "Build .NET Core targets" OFF)
 option(vw_BUILD_NET_FRAMEWORK "Build .NET Framework targets" OFF)
-option(VW_USE_ASAN "Compile with AddressSanitizer" OFF)
-option(VW_USE_UBSAN "Compile with UndefinedBehaviorSanitizer" OFF)
 option(VW_BUILD_WASM "Add WASM target" OFF)
 
-if(VW_USE_ASAN)
-  add_compile_definitions(VW_USE_ASAN)
-  if(MSVC)
-    add_compile_options(/fsanitize=address /GS- /wd5072)
-    add_link_options(/InferASanLibs /incremental:no /debug)
-    # Workaround for MSVC ASan issue here: https://developercommunity.visualstudio.com/t/VS2022---Address-sanitizer-on-x86-Debug-/10116361
-    add_compile_definitions(_DISABLE_STRING_ANNOTATION)
-  else()
-    add_compile_options(-fsanitize=address -fno-omit-frame-pointer -g3)
-    add_link_options(-fsanitize=address -fno-omit-frame-pointer -g3)
-  endif()
-endif()
-
-if(VW_USE_UBSAN)
-  add_compile_definitions(VW_USE_UBSAN)
-  if(MSVC)
-    message(FATAL_ERROR "UBSan not supported on MSVC")
-  else()
-    add_compile_options(-fsanitize=undefined -fno-sanitize-recover -fno-omit-frame-pointer -g3)
-    add_link_options(-fsanitize=undefined -fno-sanitize-recover -fno-omit-frame-pointer -g3)
-  endif()
-endif()
-
 if(VW_INSTALL AND NOT VW_ZLIB_SYS_DEP)
   message(WARNING "Installing with a vendored version of zlib is not recommended. Use VW_ZLIB_SYS_DEP to use a system dependency or specify VW_INSTALL=OFF to silence this warning.")
 endif()

diff --git a/CMakePresets.json b/CMakePresets.json
@@ -41,7 +41,7 @@
         },
         "VW_GTEST_SYS_DEP": {
           "type": "BOOL",
-          "value": "ON"
+          "value": "OFF"
         },
         "VW_EIGEN_SYS_DEP": {
           "type": "BOOL",

diff --git a/cmake/VowpalWabbitUtils.cmake b/cmake/VowpalWabbitUtils.cmake
@@ -22,7 +22,7 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
       include(FetchContent)
       FetchContent_Declare(
         googletest
-        URL https://github.com/google/googletest/archive/refs/tags/release-1.11.0.zip
+        URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip
       )
       # For Windows: Prevent overriding the parent project's compiler/linker settings
       set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

diff --git a/ext_libs/ext_libs.cmake b/ext_libs/ext_libs.cmake
@@ -38,7 +38,7 @@ if(RAPIDJSON_SYS_DEP)
   # Since EXACT is not specified, any version compatible with 1.1.0 is accepted (>= 1.1.0)
   find_package(RapidJSON 1.1.0 CONFIG REQUIRED)
   add_library(RapidJSON INTERFACE)
-  target_include_directories(RapidJSON INTERFACE ${RapidJSON_INCLUDE_DIRS})
+  target_include_directories(RapidJSON INTERFACE ${RapidJSON_INCLUDE_DIRS} ${RAPIDJSON_INCLUDE_DIRS})
 else()
   add_library(RapidJSON INTERFACE)
   target_include_directories(RapidJSON SYSTEM INTERFACE "${CMAKE_CURRENT_LIST_DIR}/rapidjson/include")
@@ -127,4 +127,4 @@ if(VW_FEAT_CB_GRAPH_FEEDBACK)
   target_include_directories(mlpack_ensmallen SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/armadillo-code/include)
 
   target_include_directories(mlpack_ensmallen SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/ensmallen/include)
-endif()
+endif()
diff --git a/ext_libs/vcpkg b/ext_libs/vcpkg
diff --git a/python/docs/source/tutorials/DFtoVW_tutorial.ipynb b/python/docs/source/tutorials/DFtoVW_tutorial.ipynb
@@ -802,15 +802,17 @@
     "\n",
     "# Adding columns for easier visualization\n",
     "weights_df[\"feature_name\"] = weights_df.apply(\n",
-    "    lambda row: row.vw_feature_name.split(\"=\")[0]\n",
-    "    if row.is_cat\n",
-    "    else row.vw_feature_name,\n",
+    "    lambda row: (\n",
+    "        row.vw_feature_name.split(\"=\")[0] if row.is_cat else row.vw_feature_name\n",
+    "    ),\n",
     "    axis=1,\n",
     ")\n",
     "weights_df[\"feature_value\"] = weights_df.apply(\n",
-    "    lambda row: row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
-    "    if row.is_cat\n",
-    "    else row.vw_feature_name,\n",
+    "    lambda row: (\n",
+    "        row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
+    "        if row.is_cat\n",
+    "        else row.vw_feature_name\n",
+    "    ),\n",
     "    axis=1,\n",
     ")\n",
     "weights_df.sort_values([\"feature_name\", \"feature_value\"], inplace=True)"

diff --git a/python/docs/source/tutorials/cmd_first_steps.md b/python/docs/source/tutorials/cmd_first_steps.md
@@ -116,6 +116,6 @@ The model predicted a value of **0**. This result means our house will not need
 ## More to explore
 
 - See [Python tutorial](python_first_steps.ipynb) for a quick introduction to the basics of training and testing your model.
-- To learn more about how to approach a contextual bandits problem using  tVowpal Wabbit — including how to  work with different contextual bandits approaches, how to format data, and understand the results — see the [Contextual Bandit Reinforcement Learning Tutorial](python_Contextual_bandits_and_Vowpal_Wabbit.ipynb).
+- To learn more about how to approach a contextual bandits problem using Vowpal Wabbit — including how to work with different contextual bandits approaches, how to format data, and understand the results — see the [Contextual Bandit Reinforcement Learning Tutorial](python_Contextual_bandits_and_Vowpal_Wabbit.ipynb).
 - For more on the contextual bandits approach to reinforcement learning, including a content personalization scenario, see the [Contextual Bandit Simulation Tutorial](python_Simulating_a_news_personalization_scenario_using_Contextual_Bandits.ipynb).
 - See the [Linear Regression Tutorial](cmd_linear_regression.md) for a different look at the roof replacement problem and learn more about Vowpal Wabbit's format and understanding the results.
diff --git a/python/tests/confidence_sequence.py b/python/tests/confidence_sequence.py
@@ -189,6 +189,5 @@ def lblogwealth(self, *, t, sumXt, v, eta, s, alpha):
 
         return max(
             0,
-            (sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll)
-            / t,
+            (sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll) / t,
         )
diff --git a/python/tests/crminustwo.py b/python/tests/crminustwo.py
@@ -440,21 +440,23 @@ def intervaldiff(
                     candidates.append(
                         (
                             gstar,
-                            None
-                            if isclose(kappa, 0)
-                            else {
-                                "kappastar": kappa,
-                                "betastar": beta,
-                                "gammastar": gamma,
-                                "taustar": tau,
-                                "ufake": ufake,
-                                "wfake": wfake,
-                                "rfake": rex,
-                                "qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
-                                * (b + g * u + t * w + s * (u - w) * r)
-                                / ((num + 1) * k),
-                                "mle": mle,
-                            },
+                            (
+                                None
+                                if isclose(kappa, 0)
+                                else {
+                                    "kappastar": kappa,
+                                    "betastar": beta,
+                                    "gammastar": gamma,
+                                    "taustar": tau,
+                                    "ufake": ufake,
+                                    "wfake": wfake,
+                                    "rfake": rex,
+                                    "qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
+                                    * (b + g * u + t * w + s * (u - w) * r)
+                                    / ((num + 1) * k),
+                                    "mle": mle,
+                                }
+                            ),
                         )
                     )
 

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
@@ -532,9 +532,9 @@ def parse(
                     for ex in str_ex
                 ]
             ):
-                str_ex: List[
-                    Example
-                ] = str_ex  # pytype: disable=annotation-type-mismatch
+                str_ex: List[Example] = (
+                    str_ex  # pytype: disable=annotation-type-mismatch
+                )
                 return str_ex
 
         if not isinstance(str_ex, (list, str)):

diff --git a/test/core.vwtest.json b/test/core.vwtest.json
@@ -6073,5 +6073,34 @@
     "depends_on": [
       467
     ]
+  },
+  {
+    "id": 469,
+    "desc": "https://github.com/VowpalWabbit/vowpal_wabbit/issues/4669",
+    "vw_command": "--ccb_explore_adf --dsjson -d train-sets/issue4669.dsjson -f issue4669.model",
+    "diff_files": {
+      "stderr": "train-sets/ref/issue4669_train.stderr",
+      "stdout": "train-sets/ref/issue4669_train.stdout"
+    },
+    "input_files": [
+      "train-sets/issue4669.dsjson"
+    ]
+  },
+  {
+    "id": 470,
+    "desc": "https://github.com/VowpalWabbit/vowpal_wabbit/issues/4669",
+    "vw_command": "--ccb_explore_adf --dsjson --all_slots_loss --epsilon 0 -t -i issue4669.model -t -d train-sets/issue4669.dsjson -p issue4669_test_pred.txt",
+    "diff_files": {
+      "stderr": "train-sets/ref/issue4669_test.stderr",
+      "stdout": "train-sets/ref/issue4669_test.stdout",
+      "issue4669_test_pred.txt": "train-sets/ref/issue4669_test_pred.txt"
+    },
+    "input_files": [
+      "train-sets/issue4669.dsjson",
+      "issue4669.model"
+    ],
+    "depends_on": [
+      469
+    ]
   }
 ]
diff --git a/test/run_tests.py b/test/run_tests.py
@@ -68,17 +68,21 @@ def _are_same(expected: Any, actual: Any, key: str) -> Tuple[bool, str]:
         elif isinstance(expected, (int, bool, str)):
             return (
                 expected == actual,
-                f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
-                if expected != actual
-                else "",
+                (
+                    f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
+                    if expected != actual
+                    else ""
+                ),
             )
         elif isinstance(expected, (float)):
             delta = abs(expected - actual)
             return (
                 delta < epsilon,
-                f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
-                if delta >= epsilon
-                else "",
+                (
+                    f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
+                    if delta >= epsilon
+                    else ""
+                ),
             )
         elif isinstance(expected, dict):
             expected_keys = set(expected.keys())

diff --git a/test/save_resume_test.py b/test/save_resume_test.py
@@ -1,6 +1,7 @@
 """
 Test that the models generated with and without --predict_only_model produce the same predictions when loaded in test_mode.
 """
+
 import sys
 import os
 import optparse

diff --git a/test/train-sets/0001.fb b/test/train-sets/0001.fb
diff --git a/test/train-sets/ccb.fb b/test/train-sets/ccb.fb
diff --git a/test/train-sets/cs.fb b/test/train-sets/cs.fb
diff --git a/test/train-sets/issue4669.dsjson b/test/train-sets/issue4669.dsjson
@@ -0,0 +1 @@
+{"c":{"_multi":[{"f":"1"},{"f":"2"}],"_slots":[{"_inc":[0,1]},{"_inc":[1]}]},"_outcomes":[{"_label_cost":1.0,"_a":[0,1],"_p":[0.5,0.5]},{"_label_cost":0.0,"_a":[1],"_p":[1]}]}
diff --git a/test/train-sets/multiclass.fb b/test/train-sets/multiclass.fb
diff --git a/test/train-sets/multilabel.fb b/test/train-sets/multilabel.fb
diff --git a/test/train-sets/rcv1_cb_eval.fb b/test/train-sets/rcv1_cb_eval.fb
diff --git a/test/train-sets/rcv1_raw_cb_small.fb b/test/train-sets/rcv1_raw_cb_small.fb
diff --git a/test/train-sets/ref/active-simulation.t24.stderr b/test/train-sets/ref/active-simulation.t24.stderr
@@ -11,20 +11,13 @@ Output pred = SCALAR
 average  since         example        example        current        current  current
 loss     last          counter         weight          label        predict features
 1.000000 1.000000            1            1.0        -1.0000         0.0000      128
-0.791125 0.755288            2            6.8        -1.0000        -0.1309       44
-1.274829 1.444750            8           26.3         1.0000        -0.2020       34
-1.083985 0.895011           73           52.8         1.0000         0.0214       21
-0.887295 0.693362          130          106.3        -1.0000        -0.3071      146
-0.788245 0.690009          233          213.6        -1.0000         0.0421       47
-0.664628 0.541195          398          427.4        -1.0000        -0.1863       68
-0.634406 0.604328          835          856.9        -1.0000        -0.4327       40
 
 finished run
 number of examples = 1000
-weighted example sum = 1014.004519
-weighted label sum = -68.618036
-average loss = 0.630964
-best constant = -0.067670
-best constant's loss = 0.995421
+weighted example sum = 1.000000
+weighted label sum = -1.000000
+average loss = 1.000000
+best constant = -1.000000
+best constant's loss = 0.000000
 total feature number = 78739
-total queries = 474
+total queries = 1
diff --git a/test/train-sets/ref/help.stdout b/test/train-sets/ref/help.stdout
@@ -221,8 +221,12 @@ Weight Options:
 [Reduction] Active Learning Options:
     --active                                Enable active learning (type: bool, keep, necessary)
     --simulation                            Active learning simulation mode (type: bool)
-    --mellowness arg                        Active learning mellowness parameter c_0. Default 8 (type: float,
-                                            default: 8, keep)
+    --direct                                Active learning via the tag and predictions interface. Tag should
+                                            start with "query?" to get query decision. Returned prediction
+                                            is either -1 for no or the importance weight for yes. (type:
+                                            bool)
+    --mellowness arg                        Active learning mellowness parameter c_0. Default 1. (type: float,
+                                            default: 1, keep)
 [Reduction] Active Learning with Cover Options:
     --active_cover                          Enable active learning with cover (type: bool, keep, necessary)
     --mellowness arg                        Active learning mellowness parameter c_0 (type: float, default:

diff --git a/test/train-sets/ref/issue4669_test.stderr b/test/train-sets/ref/issue4669_test.stderr
@@ -0,0 +1,23 @@
+only testing
+predictions = issue4669_test_pred.txt
+using no cache
+Reading datafile = train-sets/issue4669.dsjson
+num sources = 1
+Num weight bits = 18
+learning rate = 0.5
+initial_t = 1
+power_t = 0.5
+cb_type = mtr
+Enabled learners: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf
+Input label = CCB
+Output pred = DECISION_PROBS
+average  since         example        example        current        current  current
+loss     last          counter         weight          label        predict features
+0.000000 0.000000            1            1.0        0:1,1:0         1,None        9
+
+finished run
+number of examples = 1
+weighted example sum = 1.000000
+weighted label sum = 0.000000
+average loss = 0.000000
+total feature number = 9
diff --git a/test/train-sets/ref/issue4669_test.stdout b/test/train-sets/ref/issue4669_test.stdout