From 63eb7ad5e931aed69e05c92037c694cc440f19d1 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Thu, 11 Aug 2022 10:17:33 -0400 Subject: [PATCH 1/4] test: add csv parser to throughput measurement tool --- test/tools/parser_throughput/CMakeLists.txt | 5 ++ test/tools/parser_throughput/main.cc | 52 +++++++++++++++---- .../include/vw/csv_parser/parse_example_csv.h | 3 +- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/test/tools/parser_throughput/CMakeLists.txt b/test/tools/parser_throughput/CMakeLists.txt index 8ab6f41f5e6..122e70a29a5 100644 --- a/test/tools/parser_throughput/CMakeLists.txt +++ b/test/tools/parser_throughput/CMakeLists.txt @@ -2,3 +2,8 @@ add_executable(parser_throughput main.cc) # We are using a "private" header in VW and therefore must bring in the RapidJSON dependency manually. target_link_libraries(parser_throughput PRIVATE RapidJSON vw_core) + +if(VW_BUILD_CSV) + target_link_libraries(parser_throughput PRIVATE vw_csv_parser) + target_compile_definitions(parser_throughput PUBLIC VW_BUILD_CSV) +endif() \ No newline at end of file diff --git a/test/tools/parser_throughput/main.cc b/test/tools/parser_throughput/main.cc index 77b90ae12a5..85f7b350a4b 100644 --- a/test/tools/parser_throughput/main.cc +++ b/test/tools/parser_throughput/main.cc @@ -2,8 +2,14 @@ #include "vw/config/option_builder.h" #include "vw/config/option_group_definition.h" #include "vw/config/options_cli.h" +#include "vw/core/io_buf.h" #include "vw/core/parse_example_json.h" #include "vw/core/vw.h" +#include "vw/io/io_adapter.h" + +#ifdef VW_BUILD_CSV +# include "vw/csv_parser/parse_example_csv.h" +#endif #include #include @@ -15,7 +21,8 @@ enum class parser_type { text, - dsjson + dsjson, + csv }; parser_type to_parser_type(const std::string& str) @@ -25,6 +32,10 @@ parser_type to_parser_type(const std::string& str) { return parser_type::dsjson; } + else if (str == "csv") + { + return parser_type::csv; + } else { throw std::runtime_error("Unknown input type: " + 
str); @@ -45,7 +56,7 @@ int main(int argc, char** argv) .add(VW::config::make_option("args", extra_args).short_name("a").help("VW args to setup parser correctly")) .add(VW::config::make_option("type", type_str) .short_name("t") - .help("Type of input format. [text, djson] (required)")); + .help("Type of input format. [text, djson, csv] (required)")); opts.add_and_parse(desc); // Return value is ignored as option reachability is not relevant here. @@ -76,7 +87,7 @@ int main(int argc, char** argv) std::string args = "--no_stdin --quiet "; if (opts.was_supplied("args")) { - const auto& illegal_options = {"--djson", "--json", "--data", "-d"}; + const auto& illegal_options = {"--djson", "--json", "--data", "-d", "--csv"}; for (const auto& illegal_option : illegal_options) { if (extra_args.find(illegal_option) != std::string::npos) @@ -108,6 +119,14 @@ int main(int argc, char** argv) const auto type = to_parser_type(type_str); if (type == parser_type::dsjson) { args += " --dsjson"; } + else if (type == parser_type::csv) + { +#ifndef VW_BUILD_CSV + THROW("CSV parser not enabled. Please reconfigure cmake and rebuild with VW_BUILD_CSV=ON"); +#endif + + args += " --csv"; + } auto vw = VW::initialize(args, nullptr, false, nullptr, nullptr); const auto is_multiline = vw->l->is_multiline(); @@ -149,7 +168,7 @@ int main(int argc, char** argv) } } } - else + else if (type == parser_type::dsjson) { DecisionServiceInteraction interaction; for (const auto& line : lines) @@ -158,12 +177,27 @@ int main(int argc, char** argv) examples.push_back(&VW::get_unused_example(vw)); VW::read_line_decision_service_json(*vw, examples, const_cast(line.data()), line.length(), false, (VW::example_factory_t)&VW::get_unused_example, (void*)vw, &interaction); - VW::multi_ex result; - result.reserve(examples.size()); - for (size_t i = 0; i < examples.size(); ++i) { result.push_back(examples[i]); } - // TODO - finish_example should support a v_array as input. 
- VW::finish_example(*vw, result); + VW::finish_example(*vw, examples); + } + } + else + { +#ifdef VW_BUILD_CSV + + io_buf file_contents; + file_contents.add_file(VW::io::open_file_reader(file_name)); + + VW::multi_ex examples; + examples.push_back(&VW::get_unused_example(vw)); + while (VW::parsers::parse_csv_examples(vw, file_contents, examples) != 0) + { + VW::finish_example(*vw, *examples[0]); + examples.clear(); + examples.push_back(&VW::get_unused_example(vw)); } +#else + THROW("CSV parser not enabled. Please reconfigure cmake and rebuild with VW_BUILD_CSV=ON"); +#endif } const auto end = std::chrono::high_resolution_clock::now(); diff --git a/vowpalwabbit/csv_parser/include/vw/csv_parser/parse_example_csv.h b/vowpalwabbit/csv_parser/include/vw/csv_parser/parse_example_csv.h index e66e58ffde2..044752395fd 100644 --- a/vowpalwabbit/csv_parser/include/vw/csv_parser/parse_example_csv.h +++ b/vowpalwabbit/csv_parser/include/vw/csv_parser/parse_example_csv.h @@ -10,6 +10,7 @@ #include "vw/core/v_array.h" #include +#include #include namespace VW @@ -41,7 +42,7 @@ class csv_parser : public VW::details::input_parser std::unordered_map> feature_list; std::unordered_map ns_value; - explicit csv_parser(csv_parser_options options) : VW::details::input_parser("csv"), options(options) {} + explicit csv_parser(csv_parser_options options) : VW::details::input_parser("csv"), options(std::move(options)) {} virtual ~csv_parser() = default; static void set_parse_args(VW::config::option_group_definition& in_options, csv_parser_options& parsed_options); From 0cf5c55f018498d2c3569848e115d07011e9eacd Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Thu, 11 Aug 2022 10:24:04 -0400 Subject: [PATCH 2/4] move file loading outside of time measurement to be fair --- test/tools/parser_throughput/main.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/test/tools/parser_throughput/main.cc b/test/tools/parser_throughput/main.cc index 
85f7b350a4b..86500688449 100644 --- a/test/tools/parser_throughput/main.cc +++ b/test/tools/parser_throughput/main.cc @@ -100,7 +100,7 @@ int main(int argc, char** argv) } size_t bytes = 0; - std::vector lines; + std::vector file_contents_as_lines; std::ifstream file(file_name); if (file.is_open()) { @@ -108,7 +108,7 @@ int main(int argc, char** argv) while (std::getline(file, line)) { bytes += line.size() * sizeof(std::string::value_type); - lines.push_back(std::move(line)); + file_contents_as_lines.push_back(std::move(line)); } file.close(); } @@ -117,6 +117,13 @@ int main(int argc, char** argv) std::cerr << "error: could not open file: '" << file_name << "'\n"; } + // Other option is the parser can use this io_buf. It's using more memory for no good reason, unless we run out it + // shouldnt matter in this test tool. + io_buf file_contents_as_io_buf; + std::ifstream testFile(file_name, std::ios::binary); + std::vector fileContents((std::istreambuf_iterator(testFile)), std::istreambuf_iterator()); + file_contents_as_io_buf.add_file(VW::io::create_buffer_view(fileContents.data(), fileContents.size())); + const auto type = to_parser_type(type_str); if (type == parser_type::dsjson) { args += " --dsjson"; } else if (type == parser_type::csv) @@ -137,7 +144,7 @@ int main(int argc, char** argv) if (is_multiline) { VW::multi_ex exs; - for (const auto& line : lines) + for (const auto& line : file_contents_as_lines) { if (line.empty() && !exs.empty()) { @@ -159,7 +166,7 @@ int main(int argc, char** argv) } else { - for (const auto& line : lines) + for (const auto& line : file_contents_as_lines) { VW::example& ae = VW::get_unused_example(vw); VW::string_view example(line.c_str(), line.size()); @@ -171,7 +178,7 @@ int main(int argc, char** argv) else if (type == parser_type::dsjson) { DecisionServiceInteraction interaction; - for (const auto& line : lines) + for (const auto& line : file_contents_as_lines) { VW::multi_ex examples; 
examples.push_back(&VW::get_unused_example(vw)); @@ -183,13 +190,9 @@ int main(int argc, char** argv) else { #ifdef VW_BUILD_CSV - - io_buf file_contents; - file_contents.add_file(VW::io::open_file_reader(file_name)); - VW::multi_ex examples; examples.push_back(&VW::get_unused_example(vw)); - while (VW::parsers::parse_csv_examples(vw, file_contents, examples) != 0) + while (VW::parsers::parse_csv_examples(vw, file_contents_as_io_buf, examples) != 0) { VW::finish_example(*vw, *examples[0]); examples.clear(); From 8489ca2d5b600e3976cc044aaaa7713b5105efe5 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Thu, 11 Aug 2022 14:13:10 -0400 Subject: [PATCH 3/4] Update main.cc --- test/tools/parser_throughput/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/tools/parser_throughput/main.cc b/test/tools/parser_throughput/main.cc index 86500688449..cc0e6791517 100644 --- a/test/tools/parser_throughput/main.cc +++ b/test/tools/parser_throughput/main.cc @@ -56,7 +56,7 @@ int main(int argc, char** argv) .add(VW::config::make_option("args", extra_args).short_name("a").help("VW args to setup parser correctly")) .add(VW::config::make_option("type", type_str) .short_name("t") - .help("Type of input format. [text, djson, csv] (required)")); + .help("Type of input format. [text, dsjson, csv] (required)")); opts.add_and_parse(desc); // Return value is ignored as option reachability is not relevant here. 
@@ -87,7 +87,7 @@ int main(int argc, char** argv) std::string args = "--no_stdin --quiet "; if (opts.was_supplied("args")) { - const auto& illegal_options = {"--djson", "--json", "--data", "-d", "--csv"}; + const auto& illegal_options = {"--dsjson", "--json", "--data", "-d", "--csv"}; for (const auto& illegal_option : illegal_options) { if (extra_args.find(illegal_option) != std::string::npos) From 25b293c74f27b421c44d7c4e4c7675d2e7945207 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Tue, 16 Aug 2022 16:41:53 -0400 Subject: [PATCH 4/4] Update test/tools/parser_throughput/main.cc Co-authored-by: Hollow Man --- test/tools/parser_throughput/main.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/tools/parser_throughput/main.cc b/test/tools/parser_throughput/main.cc index cc0e6791517..2e0870ea632 100644 --- a/test/tools/parser_throughput/main.cc +++ b/test/tools/parser_throughput/main.cc @@ -198,6 +198,7 @@ int main(int argc, char** argv) examples.clear(); examples.push_back(&VW::get_unused_example(vw)); } + VW::finish_example(*vw, *examples[0]); #else THROW("CSV parser not enabled. Please reconfigure cmake and rebuild with VW_BUILD_CSV=ON"); #endif