diff --git a/python/tests/test_sklearn_vw.py b/python/tests/test_sklearn_vw.py index 7e3fa41e022..2527a52abbc 100644 --- a/python/tests/test_sklearn_vw.py +++ b/python/tests/test_sklearn_vw.py @@ -135,7 +135,7 @@ def test_lrq(self): '2 |user B |movie 2', '3 |user C |movie 3', '4 |user D |movie 4', - '5 |user D |movie 1'] + '5 |user E |movie 1'] model = VW(convert_to_vw=False, lrq='um4', lrqdropout=True, loss_function='quantile') assert model.params['lrq'] == 'um4' assert model.params['lrqdropout'] diff --git a/test/train-sets/ref/0151.stderr b/test/train-sets/ref/0151.stderr index acb1cb5b27d..0c53694d93f 100644 --- a/test/train-sets/ref/0151.stderr +++ b/test/train-sets/ref/0151.stderr @@ -7,15 +7,15 @@ Reading datafile = train-sets/0080.dat num sources = 1 average since example example current current current loss last counter weight label predict features -0.596384 0.596384 1 1.0 1.0000 0.2277 5 -1.618849 2.641314 2 2.0 2.0000 0.3748 5 -1.000556 0.382262 4 4.0 2.0000 2.0000 5 +0.484530 0.484530 1 1.0 1.0000 0.3039 5 +1.414942 2.345354 2 2.0 2.0000 0.4685 5 +0.902065 0.389187 4 4.0 2.0000 2.0000 5 finished run number of examples = 4 weighted example sum = 4.000000 weighted label sum = 6.000000 -average loss = 1.000556 +average loss = 0.902065 best constant = 1.500000 best constant's loss = 0.250000 total feature number = 18 diff --git a/test/unit_test/CMakeLists.txt b/test/unit_test/CMakeLists.txt index 375cc21a7ad..b011d03b8e6 100644 --- a/test/unit_test/CMakeLists.txt +++ b/test/unit_test/CMakeLists.txt @@ -7,6 +7,11 @@ add_executable(vw-unit-test.out target_include_directories(vw-unit-test.out PRIVATE $) target_link_libraries(vw-unit-test.out PRIVATE vw allreduce Boost::unit_test_framework Boost::system) +if(NOT DEFINED DO_NOT_BUILD_VW_C_WRAPPER) + target_sources(vw-unit-test.out PUBLIC vwdll_test.cc) + target_link_libraries(vw-unit-test.out PRIVATE vw_c_wrapper) +endif() + # Communicate that Boost Unit Test is being statically linked if(STATIC_LINK_VW) 
target_compile_definitions(vw-unit-test.out PRIVATE STATIC_LINK_VW) diff --git a/test/unit_test/unit_test.vcxproj b/test/unit_test/unit_test.vcxproj index 8684848802d..9f85e4cb7ca 100644 --- a/test/unit_test/unit_test.vcxproj +++ b/test/unit_test/unit_test.vcxproj @@ -1,173 +1,177 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - - - - - - - - - - - - - - {1e205806-7f80-47dd-a38d-fc08083f3593} - - - - - - - {E02E3869-D9AD-4513-B352-93F90B7D6FE3} - Win32Proj - unit_test - vw_unit_test - 8.1 - - - - Application - true - v140 - Unicode - - - Application - false - v140 - true - Unicode - - - Application - true - v140 - Unicode - - - Application - false - v140 - true - Unicode - - - - - - - - - - - - - - - - - - - true - - - false - - - $(SolutionDir)out\target\$(Configuration)\$(PlatformShortName)\ - $(SolutionDir)out\int\$(Configuration)\$(PlatformShortName)\$(ProjectName)\ - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - - - Console - true - true - true - - - - - - - - $(SolutionDir);$(SolutionDir)..\explore;%(AdditionalIncludeDirectories) - ZLIB_WINAPI;_WINSOCK_DEPRECATED_NO_WARNINGS;%(PreprocessorDefinitions) - /D "_CRT_SECURE_NO_WARNINGS" %(AdditionalOptions) - - - %(AdditionalLibraryDirectories) - kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;ws2_32.lib;%(AdditionalDependencies) - - - - - - - - - - - This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
- - - - - - - - - %(PreprocessorDefinitions) - true - - - - xcopy /Y $(SolutionDir)packages\boost_unit_test_framework-vc140.1.63.0.0\lib\native\address-model-64\lib\*.dll $(OutDir) - xcopy /Y $(SolutionDir)packages\boost_unit_test_framework-vc140.1.63.0.0\lib\native\address-model-32\lib\*.dll $(OutDir) - - + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + + + + {ea52de0d-a5be-4fb9-8c84-3a57bdfebed9} + + + {1e205806-7f80-47dd-a38d-fc08083f3593} + + + + + + + {E02E3869-D9AD-4513-B352-93F90B7D6FE3} + Win32Proj + unit_test + vw_unit_test + 8.1 + + + + Application + true + v140 + Unicode + + + Application + false + v140 + true + Unicode + + + Application + true + v140 + Unicode + + + Application + false + v140 + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + false + + + $(SolutionDir)out\target\$(Configuration)\$(PlatformShortName)\ + $(SolutionDir)out\int\$(Configuration)\$(PlatformShortName)\$(ProjectName)\ + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + + $(SolutionDir);$(SolutionDir)..\explore;%(AdditionalIncludeDirectories) + ZLIB_WINAPI;_WINSOCK_DEPRECATED_NO_WARNINGS;%(PreprocessorDefinitions) + /D "_CRT_SECURE_NO_WARNINGS" %(AdditionalOptions) + + + %(AdditionalLibraryDirectories) + kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;ws2_32.lib;%(AdditionalDependencies) + + + + + + + + + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. 
+ + + + + + + + + %(PreprocessorDefinitions) + true + + + + xcopy /Y $(SolutionDir)packages\boost_unit_test_framework-vc140.1.63.0.0\lib\native\address-model-64\lib\*.dll $(OutDir) + xcopy /Y $(SolutionDir)packages\boost_unit_test_framework-vc140.1.63.0.0\lib\native\address-model-32\lib\*.dll $(OutDir) + + \ No newline at end of file diff --git a/test/unit_test/unit_test.vcxproj.filters b/test/unit_test/unit_test.vcxproj.filters index 5b0f428cf2d..d4db85dafa4 100644 --- a/test/unit_test/unit_test.vcxproj.filters +++ b/test/unit_test/unit_test.vcxproj.filters @@ -36,6 +36,9 @@ Source Files + + Source Files + diff --git a/test/unit_test/vwdll_test.cc b/test/unit_test/vwdll_test.cc new file mode 100644 index 00000000000..d649815ccd3 --- /dev/null +++ b/test/unit_test/vwdll_test.cc @@ -0,0 +1,77 @@ + +#ifndef STATIC_LINK_VW +#define BOOST_TEST_DYN_LINK +#endif + +#include + +#include "vwdll.h" +#include "vw.h" + +template +void check_weights_equal(T& first, T& second) +{ + auto secondIt = second.begin(); + for (auto firstIt : first) + { + BOOST_CHECK_EQUAL(firstIt, *secondIt); + ++secondIt; + } + BOOST_CHECK_EQUAL(secondIt == second.end(), true); +} + +BOOST_AUTO_TEST_CASE(vw_dll_parsed_and_constructed_example_parity) +{ + //parse example + VW_HANDLE handle1 = VW_InitializeA("-q st --noconstant --quiet"); + VW_EXAMPLE example_parsed; + example_parsed = VW_ReadExampleA(handle1, "1 |s p^the_man w^the w^man |t p^un_homme w^un w^homme"); + + //construct example + VW_HANDLE handle2 = VW_InitializeA("-q st --noconstant --quiet"); + VW_EXAMPLE example_constructed; + auto fs = VW_InitializeFeatureSpaces(2); + + auto first = VW_GetFeatureSpace(fs, 0); + VW_InitFeatures(first, 3); + auto shash = VW_SetFeatureSpace(handle2, first, "s"); + VW_SetFeature(first, 0, VW_HashFeatureA(handle2, "p^the_man", shash), 1.0f); + VW_SetFeature(first, 1, VW_HashFeatureA(handle2, "w^the", shash), 1.0f); + VW_SetFeature(first, 2, VW_HashFeatureA(handle2, "w^man", shash), 1.0f); + + auto second = 
VW_GetFeatureSpace(fs, 1); + VW_InitFeatures(second, 3); + auto thash = VW_SetFeatureSpace(handle2, second, "t"); + VW_SetFeature(second, 0, VW_HashFeatureA(handle2, "p^un_homme", thash), 1.0f); + VW_SetFeature(second, 1, VW_HashFeatureA(handle2, "w^un", thash), 1.0f); + VW_SetFeature(second, 2, VW_HashFeatureA(handle2, "w^homme", thash), 1.0f); + + example_constructed = VW_ImportExample(handle2, "1", fs, 2); + + + // learn both + auto score_parsed = VW_Learn(handle1, example_parsed); + auto score_constructed = VW_Learn(handle2, example_parsed); + + + //check parity + BOOST_CHECK_EQUAL(score_parsed, score_constructed); + auto vw1 = static_cast(handle1); + auto vw2 = static_cast(handle2); + + BOOST_CHECK_EQUAL(vw1->weights.sparse, vw2->weights.sparse); + + if (vw1->weights.sparse) { + check_weights_equal(vw1->weights.sparse_weights, vw2->weights.sparse_weights); + } + else { + check_weights_equal(vw1->weights.dense_weights, vw2->weights.dense_weights); + } + + VW_ReleaseFeatureSpace(fs, 2); + + VW_Finish(handle1); + VW_Finish(handle2); +} + + diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc index d4c2b64d1a3..2d72e092054 100644 --- a/vowpalwabbit/parse_example.cc +++ b/vowpalwabbit/parse_example.cc @@ -69,6 +69,7 @@ class TC_parser bool* spelling_features; v_array spelling; uint32_t hash_seed; + uint64_t parse_mask; vector* namespace_dictionaries; @@ -138,7 +139,7 @@ class TC_parser v = cur_channel_v * featureValue(); uint64_t word_hash; if (feature_name.end != feature_name.begin) - word_hash = (p->hasher(feature_name, channel_hash)); + word_hash = (p->hasher(feature_name, channel_hash) & parse_mask); else word_hash = channel_hash + anon++; if (v == 0) @@ -407,6 +408,7 @@ class TC_parser this->namespace_dictionaries = all.namespace_dictionaries; this->base = nullptr; this->hash_seed = all.hash_seed; + this->parse_mask = all.parse_mask; listNameSpace(); if (base != nullptr) free(base); @@ -457,21 +459,22 @@ void substring_to_example(vw* all, 
// Splits `phrase` on every occurrence of `delimiter` and returns the pieces
// in order. Adjacent delimiters and a trailing delimiter yield empty tokens,
// so the result always contains at least one element.
//
// phrase:    NUL-terminated input text; a null pointer is treated like "".
// delimiter: separator to split on; when empty, the whole input is returned
//            as a single token.
std::vector<std::string> split(char* phrase, std::string delimiter)
{
  std::vector<std::string> list;
  // Constructing std::string from a null pointer is undefined behaviour.
  const std::string s = (phrase == nullptr) ? std::string() : std::string(phrase);

  // An empty delimiter is found at position 0 without consuming any input,
  // so searching for it would never terminate.
  if (delimiter.empty())
  {
    list.push_back(s);
    return list;
  }

  // Walk the string with a moving start offset instead of erasing the
  // consumed prefix each time, which re-copied the remainder on every token.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos)
  {
    list.push_back(s.substr(start, pos - start));
    start = pos + delimiter.length();
  }
  list.push_back(s.substr(start));
  return list;
}
- if(examples.size() < i + 1) + if (examples.size() < i + 1) { examples.push_back(&VW::get_unused_example(all)); } @@ -495,5 +498,4 @@ void read_lines(vw* all, char* line, size_t /*len*/, v_array& examples } } - } // namespace VW diff --git a/vowpalwabbit/vw.h b/vowpalwabbit/vw.h index 3a618d7375c..9e9cd8eab84 100644 --- a/vowpalwabbit/vw.h +++ b/vowpalwabbit/vw.h @@ -177,4 +177,14 @@ inline void set_weight(vw& all, uint32_t index, uint32_t offset, float value) inline uint32_t num_weights(vw& all) { return (uint32_t)all.length(); } inline uint32_t get_stride(vw& all) { return all.weights.stride(); } + +inline void init_features(primitive_feature_space& fs, size_t features_count) { + fs.fs = new feature[features_count]; + fs.len = features_count; +} + +inline void set_feature(primitive_feature_space& fs, size_t index, uint64_t feature_hash, float value) { + fs.fs[index].weight_index = feature_hash; + fs.fs[index].x = value; +} } // namespace VW diff --git a/vowpalwabbit/vwdll.cpp b/vowpalwabbit/vwdll.cpp index 2bb00562f86..8023f73d868 100644 --- a/vowpalwabbit/vwdll.cpp +++ b/vowpalwabbit/vwdll.cpp @@ -77,19 +77,30 @@ VW_DLL_MEMBER void VW_CALLING_CONV VW_Finish(VW_HANDLE handle) VW::finish(*pointer); } -VW_DLL_MEMBER VW_EXAMPLE VW_CALLING_CONV VW_ImportExample(VW_HANDLE handle, const char * label, VW_FEATURE_SPACE* features, size_t len) +VW_DLL_MEMBER VW_EXAMPLE VW_CALLING_CONV VW_ImportExample(VW_HANDLE handle, const char * label, VW_FEATURE_SPACE features, size_t len) { vw * pointer = static_cast(handle); VW::primitive_feature_space * f = reinterpret_cast( features ); return static_cast(VW::import_example(*pointer, label, f, len)); } +VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_InitializeFeatureSpaces(size_t len) +{ + return static_cast(new VW::primitive_feature_space[len]); +} + +VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_GetFeatureSpace(VW_FEATURE_SPACE first, size_t index) +{ + VW::primitive_feature_space* f = reinterpret_cast(first); + return 
static_cast(&f[index]); +} + VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_ExportExample(VW_HANDLE handle, VW_EXAMPLE e, size_t * plen) { vw* pointer = static_cast(handle); example* ex = static_cast(e); return static_cast(VW::export_example(*pointer, ex, *plen)); } -VW_DLL_MEMBER void VW_CALLING_CONV VW_ReleaseFeatureSpace(VW_FEATURE_SPACE* features, size_t len) +VW_DLL_MEMBER void VW_CALLING_CONV VW_ReleaseFeatureSpace(VW_FEATURE_SPACE features, size_t len) { VW::primitive_feature_space * f = reinterpret_cast( features ); VW::releaseFeatureSpace(f, len); } @@ -164,6 +175,30 @@ VW_DLL_MEMBER float VW_CALLING_CONV VW_GetConfidence(VW_EXAMPLE e) { return VW::get_confidence(static_cast(e)); } +VW_DLL_MEMBER size_t VW_CALLING_CONV VW_SetFeatureSpace(VW_HANDLE handle, VW_FEATURE_SPACE feature_space, const char* name) +{ VW::primitive_feature_space* f = reinterpret_cast(feature_space); + f->name = *name; + return VW_HashSpaceA(handle, name); +} + +VW_DLL_MEMBER void VW_CALLING_CONV VW_InitFeatures(VW_FEATURE_SPACE feature_space, size_t features_count) +{ + VW::primitive_feature_space* fs = reinterpret_cast(feature_space); + VW::init_features(*fs, features_count); +} + +VW_DLL_MEMBER VW_FEATURE VW_CALLING_CONV VW_GetFeature(VW_FEATURE_SPACE feature_space, size_t index) +{ + VW::primitive_feature_space* fs = reinterpret_cast(feature_space); + return &(fs->fs[index]); +} + +VW_DLL_MEMBER void VW_CALLING_CONV VW_SetFeature(VW_FEATURE_SPACE feature_space, size_t index, size_t feature_hash, float value) +{ + VW::primitive_feature_space* fs = reinterpret_cast(feature_space); + VW::set_feature(*fs, index, feature_hash, value); +} + VW_DLL_MEMBER VW_FEATURE VW_CALLING_CONV VW_GetFeatures(VW_HANDLE handle, VW_EXAMPLE e, size_t* plen) { vw* pointer = static_cast(handle); return VW::get_features(*pointer, static_cast(e), *plen); diff --git a/vowpalwabbit/vwdll.h b/vowpalwabbit/vwdll.h index ce38cdbd65e..c28f812b51f 100644 --- a/vowpalwabbit/vwdll.h +++ b/vowpalwabbit/vwdll.h @@ 
-73,10 +73,12 @@ extern "C" VW_DLL_MEMBER void VW_CALLING_CONV VW_Finish(VW_HANDLE handle); VW_DLL_MEMBER VW_EXAMPLE VW_CALLING_CONV VW_ImportExample( - VW_HANDLE handle, const char* label, VW_FEATURE_SPACE* features, size_t len); + VW_HANDLE handle, const char* label, VW_FEATURE_SPACE features, size_t len); + VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_InitializeFeatureSpaces(size_t len); + VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_GetFeatureSpace(VW_FEATURE_SPACE first, size_t index); VW_DLL_MEMBER VW_FEATURE_SPACE VW_CALLING_CONV VW_ExportExample(VW_HANDLE handle, VW_EXAMPLE e, size_t* plen); - VW_DLL_MEMBER void VW_CALLING_CONV VW_ReleaseFeatureSpace(VW_FEATURE_SPACE* features, size_t len); + VW_DLL_MEMBER void VW_CALLING_CONV VW_ReleaseFeatureSpace(VW_FEATURE_SPACE features, size_t len); #ifdef USE_CODECVT VW_DLL_MEMBER VW_EXAMPLE VW_CALLING_CONV VW_ReadExample(VW_HANDLE handle, const char16_t* line); #endif @@ -100,6 +102,10 @@ extern "C" VW_DLL_MEMBER const char* VW_CALLING_CONV VW_GetTag(VW_EXAMPLE e); VW_DLL_MEMBER size_t VW_CALLING_CONV VW_GetFeatureNumber(VW_EXAMPLE e); VW_DLL_MEMBER float VW_CALLING_CONV VW_GetConfidence(VW_EXAMPLE e); + VW_DLL_MEMBER size_t VW_CALLING_CONV VW_SetFeatureSpace(VW_HANDLE handle, VW_FEATURE_SPACE feature_space, const char* name); + VW_DLL_MEMBER void VW_CALLING_CONV VW_InitFeatures(VW_FEATURE_SPACE feature_space, size_t features_count); + VW_DLL_MEMBER VW_FEATURE VW_CALLING_CONV VW_GetFeature(VW_FEATURE_SPACE feature_space, size_t index); + VW_DLL_MEMBER void VW_CALLING_CONV VW_SetFeature(VW_FEATURE feature, size_t index, size_t feature_hash, float value); VW_DLL_MEMBER VW_FEATURE VW_CALLING_CONV VW_GetFeatures(VW_HANDLE handle, VW_EXAMPLE e, size_t* plen); VW_DLL_MEMBER void VW_CALLING_CONV VW_ReturnFeatures(VW_FEATURE f); #ifdef USE_CODECVT @@ -120,7 +126,9 @@ extern "C" VW_DLL_MEMBER float VW_CALLING_CONV VW_Learn(VW_HANDLE handle, VW_EXAMPLE e); VW_DLL_MEMBER float VW_CALLING_CONV VW_Predict(VW_HANDLE 
handle, VW_EXAMPLE e); VW_DLL_MEMBER float VW_CALLING_CONV VW_PredictCostSensitive(VW_HANDLE handle, VW_EXAMPLE e); + //deprecated. Please use either VW_ReadExample for parsing, or VW_ImportExample for example construction VW_DLL_MEMBER void VW_CALLING_CONV VW_AddLabel(VW_EXAMPLE e, float label, float weight, float base); + // deprecated. Please use either VW_ReadExample for parsing, or VW_ImportExample for example construction VW_DLL_MEMBER void VW_CALLING_CONV VW_AddStringLabel(VW_HANDLE handle, VW_EXAMPLE e, const char* label); VW_DLL_MEMBER float VW_CALLING_CONV VW_Get_Weight(VW_HANDLE handle, size_t index, size_t offset);