From 4f8639abd06c679d4382eb715a1793afd94df3d2 Mon Sep 17 00:00:00 2001
From: Chen Yufei
Date: Thu, 18 Mar 2021 19:02:43 +0800
Subject: [PATCH] Add benchmark for CSVParser with Atof and AtofPrecise.

---
 tests/benchmark/parser/CMakeLists.txt        | 18 +++++
 tests/benchmark/parser/README.md             | 38 ++++++++++
 tests/benchmark/parser/gen_csv.py            | 19 +++++
 tests/benchmark/parser/parser.cpp            | 72 +++++++++++++++++++
 .../benchmark/parser/run_parser_benchmark.sh | 47 ++++++++++++
 5 files changed, 194 insertions(+)
 create mode 100644 tests/benchmark/parser/CMakeLists.txt
 create mode 100644 tests/benchmark/parser/README.md
 create mode 100644 tests/benchmark/parser/gen_csv.py
 create mode 100644 tests/benchmark/parser/parser.cpp
 create mode 100755 tests/benchmark/parser/run_parser_benchmark.sh

diff --git a/tests/benchmark/parser/CMakeLists.txt b/tests/benchmark/parser/CMakeLists.txt
new file mode 100644
index 000000000000..47d1219321ff
--- /dev/null
+++ b/tests/benchmark/parser/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(benchmark)
+
+OPTION(USE_PRECISE_TEXT_PARSER "Use precise (and faster) double parser for text input file" OFF)
+
+if(USE_PRECISE_TEXT_PARSER)
+  ADD_DEFINITIONS(-DUSE_PRECISE_TEXT_PARSER)
+endif(USE_PRECISE_TEXT_PARSER)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O3")
+
+include_directories(${CMAKE_SOURCE_DIR}/../../../include)
+include_directories(${CMAKE_SOURCE_DIR}/../../../src)
+#link_directories(${CMAKE_SOURCE_DIR}/lib)
+
+add_executable(parser parser.cpp)
diff --git a/tests/benchmark/parser/README.md b/tests/benchmark/parser/README.md
new file mode 100644
index 000000000000..c5c3f1edfdf4
--- /dev/null
+++ b/tests/benchmark/parser/README.md
@@ -0,0 +1,38 @@
+This is a simple benchmark comparing performance of `Common::Atof`
+and `Common::AtofPrecise` when used in `CSVParser`.
+
+Just run `./run_parser_benchmark.sh` in this directory.
+
+The test script generates 20000 rows, 2000 columns csv, 840MB file size.
+
+For this test, `Common::Atof` is much faster than `Common::AtofPrecise`.
+
+Benchmark run output on Intel Xeon 2640 v3:
+
+```
+========== Benchmark run Atof parser ==========
+real 0m2.027s
+user 0m1.822s
+sys 0m0.204s
+
+real 0m2.186s
+user 0m1.998s
+sys 0m0.188s
+
+real 0m2.202s
+user 0m2.010s
+sys 0m0.192s
+
+========== Benchmark run AtofPrecise parser ==========
+real 0m6.556s
+user 0m6.324s
+sys 0m0.232s
+
+real 0m6.648s
+user 0m6.496s
+sys 0m0.152s
+
+real 0m6.912s
+user 0m6.748s
+sys 0m0.164s
+```
\ No newline at end of file
diff --git a/tests/benchmark/parser/gen_csv.py b/tests/benchmark/parser/gen_csv.py
new file mode 100644
index 000000000000..5c2c9b5d2e23
--- /dev/null
+++ b/tests/benchmark/parser/gen_csv.py
@@ -0,0 +1,19 @@
+import sys
+
+import numpy as np
+
+
+# Write an nrow x ncol CSV of random values in [0, 5) to fname, 19 decimals each.
+# int() lets nrow/ncol be given as strings, as the command-line dispatch presumably does.
+def gen_csv(fname, nrow, ncol):
+    nrow = int(nrow)
+    ncol = int(ncol)
+
+    arr = np.random.random(nrow * ncol) * 5
+    arr = arr.reshape((nrow, ncol))
+    np.savetxt(fname, arr, fmt='%.19f', delimiter=',')
+
+
+if __name__ == '__main__':
+    import argh
+    argh.dispatch_command(gen_csv)
\ No newline at end of file
diff --git a/tests/benchmark/parser/parser.cpp b/tests/benchmark/parser/parser.cpp
new file mode 100644
index 000000000000..6eb244ef707c
--- /dev/null
+++ b/tests/benchmark/parser/parser.cpp
@@ -0,0 +1,72 @@
+// This is a very simple benchmark for comparing performance of Atof and AtofPrecise.
+
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+// NOTE(review): the project header name was lost when this patch was extracted;
+// CSVParser is assumed to be declared in src/io/parser.hpp -- confirm.
+#include <io/parser.hpp>
+
+namespace LightGBM {
+
+// Reads `fpath` line by line and hands every line to a CSVParser constructed
+// with `ncol` columns. The parsed features and label are discarded: the
+// benchmark only measures the time spent parsing. Exits with status 1 when
+// the file cannot be opened.
+void ParseCSV(const std::string& fpath, int ncol) {
+  // NOTE(review): -1 presumably tells CSVParser there is no label column -- confirm.
+  CSVParser parser(-1, ncol);
+
+  std::ifstream infile(fpath);
+  if (!infile) {
+    std::cerr << "fail to open " << fpath << '\n';
+    std::exit(1);
+  }
+
+  std::string line;
+  double label = 0.0;
+  // NOTE(review): the element type was lost in extraction; (column, value) pairs
+  // are assumed to match CSVParser::ParseOneLine -- confirm.
+  std::vector<std::pair<int, double>> oneline_features;
+  while (std::getline(infile, line)) {
+    parser.ParseOneLine(line.c_str(), &oneline_features, &label);
+//    printf("%f\n", oneline_features[0].second);
+    oneline_features.clear();
+  }
+}
+
+}  // namespace LightGBM
+
+// Entry point. Usage: parser <csv_path> <ncol>
+// Validates <ncol> as a positive base-10 integer that fits in int, then runs
+// LightGBM::ParseCSV on <csv_path>. Exits with status 1 on bad arguments.
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    printf("usage: parser <csv_path> <ncol>\n");
+    exit(1);
+  }
+
+  const char* fpath = argv[1];
+
+  // strtol reports overflow only through errno, so errno must be cleared before
+  // the call; `end` catches empty input and trailing garbage.
+  char* end = nullptr;
+  errno = 0;
+  const long ncol = strtol(argv[2], &end, 10);
+  if (errno != 0 || end == argv[2] || *end != '\0' ||
+      ncol <= 0 || ncol > std::numeric_limits<int>::max()) {
+    fprintf(stderr, "fail to parse ncol\n");
+    exit(1);
+  }
+
+  LightGBM::ParseCSV(fpath, static_cast<int>(ncol));
+
+  return 0;
+}
\ No newline at end of file
diff --git a/tests/benchmark/parser/run_parser_benchmark.sh b/tests/benchmark/parser/run_parser_benchmark.sh
new file mode 100755
index 000000000000..2d4d380fac71
--- /dev/null
+++ b/tests/benchmark/parser/run_parser_benchmark.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+set -e
+
+nrow=20000
+ncol=2000
+
+# Run cmake only when directory $1 does not exist yet (passing $2 as an extra
+# cmake argument), then run make inside that directory.
+build() {
+  d=$1
+  flag=$2
+  echo "building $d $flag"
+  test -d $d || (mkdir -p $d && cd $d && cmake $flag ..)
+  pushd $d
+  make
+  popd
+}
+
+# Generate test.csv with gen_csv.py unless the file already exists.
+gen_data() {
+  if [[ ! -f test.csv ]]; then
+    echo "generating csv ..."
+    python gen_csv.py test.csv $nrow $ncol
+  fi
+}
+
+# Time three runs of each parser build (build/ and build-precise/) on test.csv.
+parser_benchmark() {
+  echo "========== Benchmark run Atof parser =========="
+  for i in {1..3}; do
+#    /usr/bin/time ./build/parser test.csv $ncol
+    time ./build/parser test.csv $ncol
+  done
+
+  echo
+  echo "========== Benchmark run AtofPrecise parser =========="
+  for i in {1..3}; do
+#    /usr/bin/time ./build-precise/parser test.csv $ncol
+    time ./build-precise/parser test.csv $ncol
+  done
+}
+
+build build ""
+build build-precise "-DUSE_PRECISE_TEXT_PARSER=on"
+gen_data
+parser_benchmark