Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: add csv parser to throughput measurement tool #4110

Merged
merged 7 commits into from
Aug 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions test/tools/parser_throughput/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,8 @@ add_executable(parser_throughput main.cc)

# We are using a "private" header in VW and therefore must bring in the RapidJSON dependency manually.
target_link_libraries(parser_throughput PRIVATE RapidJSON vw_core)

# When the CSV parser is enabled, link it into the tool and define VW_BUILD_CSV
# so that main.cc compiles its CSV include and CSV parsing branch.
if(VW_BUILD_CSV)
target_link_libraries(parser_throughput PRIVATE vw_csv_parser)
target_compile_definitions(parser_throughput PUBLIC VW_BUILD_CSV)
endif()
66 changes: 52 additions & 14 deletions test/tools/parser_throughput/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,14 @@
#include "vw/config/option_builder.h"
#include "vw/config/option_group_definition.h"
#include "vw/config/options_cli.h"
#include "vw/core/io_buf.h"
#include "vw/core/parse_example_json.h"
#include "vw/core/vw.h"
#include "vw/io/io_adapter.h"

#ifdef VW_BUILD_CSV
# include "vw/csv_parser/parse_example_csv.h"
#endif

#include <chrono>
#include <exception>
Expand All @@ -15,7 +21,8 @@
enum class parser_type
{
text,
dsjson
dsjson,
csv
};

parser_type to_parser_type(const std::string& str)
Expand All @@ -25,6 +32,10 @@ parser_type to_parser_type(const std::string& str)
{
return parser_type::dsjson;
}
else if (str == "csv")
{
return parser_type::csv;
}
else
{
throw std::runtime_error("Unknown input type: " + str);
Expand All @@ -45,7 +56,7 @@ int main(int argc, char** argv)
.add(VW::config::make_option("args", extra_args).short_name("a").help("VW args to setup parser correctly"))
.add(VW::config::make_option("type", type_str)
.short_name("t")
.help("Type of input format. [text, djson] (required)"));
.help("Type of input format. [text, dsjson, csv] (required)"));

opts.add_and_parse(desc);
// Return value is ignored as option reachability is not relevant here.
Expand Down Expand Up @@ -76,7 +87,7 @@ int main(int argc, char** argv)
std::string args = "--no_stdin --quiet ";
if (opts.was_supplied("args"))
{
const auto& illegal_options = {"--djson", "--json", "--data", "-d"};
const auto& illegal_options = {"--dsjson", "--json", "--data", "-d", "--csv"};
for (const auto& illegal_option : illegal_options)
{
if (extra_args.find(illegal_option) != std::string::npos)
Expand All @@ -89,15 +100,15 @@ int main(int argc, char** argv)
}

size_t bytes = 0;
std::vector<std::string> lines;
std::vector<std::string> file_contents_as_lines;
std::ifstream file(file_name);
if (file.is_open())
{
std::string line;
while (std::getline(file, line))
{
bytes += line.size() * sizeof(std::string::value_type);
lines.push_back(std::move(line));
file_contents_as_lines.push_back(std::move(line));
}
file.close();
}
Expand All @@ -106,8 +117,23 @@ int main(int argc, char** argv)
std::cerr << "error: could not open file: '" << file_name << "'\n";
}

// Another option is for the parser to use this io_buf. This uses more memory for no good reason, but unless we run
// out of memory it shouldn't matter in this test tool.
io_buf file_contents_as_io_buf;
std::ifstream testFile(file_name, std::ios::binary);
std::vector<char> fileContents((std::istreambuf_iterator<char>(testFile)), std::istreambuf_iterator<char>());
file_contents_as_io_buf.add_file(VW::io::create_buffer_view(fileContents.data(), fileContents.size()));

const auto type = to_parser_type(type_str);
if (type == parser_type::dsjson) { args += " --dsjson"; }
else if (type == parser_type::csv)
{
#ifndef VW_BUILD_CSV
THROW("CSV parser not enabled. Please reconfigure cmake and rebuild with VW_BUILD_CSV=ON");
#endif

args += " --csv";
}

auto vw = VW::initialize(args, nullptr, false, nullptr, nullptr);
const auto is_multiline = vw->l->is_multiline();
Expand All @@ -118,7 +144,7 @@ int main(int argc, char** argv)
if (is_multiline)
{
VW::multi_ex exs;
for (const auto& line : lines)
for (const auto& line : file_contents_as_lines)
{
if (line.empty() && !exs.empty())
{
Expand All @@ -140,7 +166,7 @@ int main(int argc, char** argv)
}
else
{
for (const auto& line : lines)
for (const auto& line : file_contents_as_lines)
{
VW::example& ae = VW::get_unused_example(vw);
VW::string_view example(line.c_str(), line.size());
Expand All @@ -149,21 +175,33 @@ int main(int argc, char** argv)
}
}
}
else
else if (type == parser_type::dsjson)
{
DecisionServiceInteraction interaction;
for (const auto& line : lines)
for (const auto& line : file_contents_as_lines)
{
VW::multi_ex examples;
examples.push_back(&VW::get_unused_example(vw));
VW::read_line_decision_service_json<false>(*vw, examples, const_cast<char*>(line.data()), line.length(), false,
(VW::example_factory_t)&VW::get_unused_example, (void*)vw, &interaction);
VW::multi_ex result;
result.reserve(examples.size());
for (size_t i = 0; i < examples.size(); ++i) { result.push_back(examples[i]); }
// TODO - finish_example should support a v_array as input.
VW::finish_example(*vw, result);
VW::finish_example(*vw, examples);
}
}
else
{
#ifdef VW_BUILD_CSV
VW::multi_ex examples;
examples.push_back(&VW::get_unused_example(vw));
while (VW::parsers::parse_csv_examples(vw, file_contents_as_io_buf, examples) != 0)
{
VW::finish_example(*vw, *examples[0]);
examples.clear();
examples.push_back(&VW::get_unused_example(vw));
}
VW::finish_example(*vw, *examples[0]);
#else
jackgerrits marked this conversation as resolved.
Show resolved Hide resolved
THROW("CSV parser not enabled. Please reconfigure cmake and rebuild with VW_BUILD_CSV=ON");
#endif
}
const auto end = std::chrono::high_resolution_clock::now();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "vw/core/v_array.h"

#include <unordered_map>
#include <utility>
#include <vector>

namespace VW
Expand Down Expand Up @@ -41,7 +42,7 @@ class csv_parser : public VW::details::input_parser
std::unordered_map<std::string, VW::v_array<size_t>> feature_list;
std::unordered_map<std::string, float> ns_value;

explicit csv_parser(csv_parser_options options) : VW::details::input_parser("csv"), options(options) {}
explicit csv_parser(csv_parser_options options) : VW::details::input_parser("csv"), options(std::move(options)) {}
virtual ~csv_parser() = default;

static void set_parse_args(VW::config::option_group_definition& in_options, csv_parser_options& parsed_options);
Expand Down