From 4f8639abd06c679d4382eb715a1793afd94df3d2 Mon Sep 17 00:00:00 2001
From: Chen Yufei <cyfdecyf@gmail.com>
Date: Thu, 18 Mar 2021 19:02:43 +0800
Subject: [PATCH] Add benchmark for CSVParser with Atof and AtofPrecise.

---
 tests/benchmark/parser/CMakeLists.txt         | 18 +++++++
 tests/benchmark/parser/README.md              | 38 ++++++++++++++
 tests/benchmark/parser/gen_csv.py             | 17 +++++++
 tests/benchmark/parser/parser.cpp             | 49 +++++++++++++++++++
 .../benchmark/parser/run_parser_benchmark.sh  | 43 ++++++++++++++++
 5 files changed, 165 insertions(+)
 create mode 100644 tests/benchmark/parser/CMakeLists.txt
 create mode 100644 tests/benchmark/parser/README.md
 create mode 100644 tests/benchmark/parser/gen_csv.py
 create mode 100644 tests/benchmark/parser/parser.cpp
 create mode 100755 tests/benchmark/parser/run_parser_benchmark.sh

diff --git a/tests/benchmark/parser/CMakeLists.txt b/tests/benchmark/parser/CMakeLists.txt
new file mode 100644
index 000000000000..47d1219321ff
--- /dev/null
+++ b/tests/benchmark/parser/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Standalone build of the CSVParser benchmark binary (Atof vs. AtofPrecise).
+cmake_minimum_required(VERSION 3.5...3.27)
+project(benchmark LANGUAGES CXX)
+
+# ON adds the USE_PRECISE_TEXT_PARSER define; run_parser_benchmark.sh builds both variants.
+option(USE_PRECISE_TEXT_PARSER "Use precise (and faster) double parser for text input file" OFF)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_executable(parser parser.cpp)
+target_compile_options(parser PRIVATE -O3)
+# Anchor header paths to this directory so the file also works under add_subdirectory().
+target_include_directories(parser PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../../include")
+target_include_directories(parser PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../../src")
+if(USE_PRECISE_TEXT_PARSER)
+    target_compile_definitions(parser PRIVATE USE_PRECISE_TEXT_PARSER)
+endif()
diff --git a/tests/benchmark/parser/README.md b/tests/benchmark/parser/README.md
new file mode 100644
index 000000000000..c5c3f1edfdf4
--- /dev/null
+++ b/tests/benchmark/parser/README.md
@@ -0,0 +1,38 @@
+This is a simple benchmark comparing performance of `Common::Atof`
+and `Common::AtofPrecise` when used in `CSVParser`.
+
+Just run `./run_parser_benchmark.sh` in this directory.
+
+The test script generates a CSV file with 20000 rows and 2000 columns (about 840MB).
+
+For this test, `Common::Atof` is much faster than `Common::AtofPrecise`.
+
+Benchmark run output on Intel Xeon 2640 v3:
+
+```
+========== Benchmark run Atof parser ==========
+real    0m2.027s
+user    0m1.822s
+sys     0m0.204s
+
+real    0m2.186s
+user    0m1.998s
+sys     0m0.188s
+
+real    0m2.202s
+user    0m2.010s
+sys     0m0.192s
+
+========== Benchmark run AtofPrecise parser ==========
+real    0m6.556s
+user    0m6.324s
+sys     0m0.232s
+
+real    0m6.648s
+user    0m6.496s
+sys     0m0.152s
+
+real    0m6.912s
+user    0m6.748s
+sys     0m0.164s
+```
\ No newline at end of file
diff --git a/tests/benchmark/parser/gen_csv.py b/tests/benchmark/parser/gen_csv.py
new file mode 100644
index 000000000000..5c2c9b5d2e23
--- /dev/null
+++ b/tests/benchmark/parser/gen_csv.py
@@ -0,0 +1,17 @@
+import sys
+
+import numpy as np
+
+
+def gen_csv(fname, nrow, ncol):
+    nrow = int(nrow)
+    ncol = int(ncol)
+
+    arr = np.random.random(nrow * ncol) * 5
+    arr = arr.reshape((nrow, ncol))
+    np.savetxt(fname, arr, fmt='%.19f', delimiter=',')
+
+
+if __name__ == '__main__':
+    import argh
+    argh.dispatch_command(gen_csv)
\ No newline at end of file
diff --git a/tests/benchmark/parser/parser.cpp b/tests/benchmark/parser/parser.cpp
new file mode 100644
index 000000000000..6eb244ef707c
--- /dev/null
+++ b/tests/benchmark/parser/parser.cpp
@@ -0,0 +1,49 @@
+// This is a very simple benchmark for comparing performance of Atof and AtofPrecise.
+
+#include <cstdlib>
+#include <string>
+#include <fstream>
+#include <iostream>
+
+#include <io/parser.hpp>
+
+namespace LightGBM {
+
+// Feeds every line of `fpath` to a CSVParser built with `ncol` and discards the
+// parsed values; prints to stderr and exits with status 1 if the open fails.
+void ParseCSV(const std::string& fpath, int ncol) {
+  CSVParser parser(-1, ncol);
+  std::ifstream infile(fpath);
+  if (!infile) {
+    std::cerr << "fail to open " << fpath << std::endl;
+    std::exit(1);
+  }
+
+  std::string line;
+  double label = 0.0;
+  std::vector<std::pair<int, double>> oneline_features;
+  while (std::getline(infile, line)) {
+    parser.ParseOneLine(line.c_str(), &oneline_features, &label);
+    oneline_features.clear();  // start the next row from an empty buffer
+  }
+}
+
+}  // namespace LightGBM
+
+// Entry point: parser <fname> <ncol>. Exits with status 1 on bad arguments.
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    printf("usage: parser <fname> <ncol>\n");
+    exit(1);
+  }
+  // Validate with the end pointer rather than errno: errno was never reset before
+  // the call, and strtol is not required to set it for non-numeric input like "abc".
+  char* end = nullptr;
+  const long ncol = strtol(argv[2], &end, 10);
+  if (end == argv[2] || *end != '\0' || ncol != static_cast<int>(ncol)) {
+    fprintf(stderr, "fail to parse ncol\n");
+    exit(1);
+  }
+  LightGBM::ParseCSV(argv[1], static_cast<int>(ncol));
+  return 0;
+}
\ No newline at end of file
diff --git a/tests/benchmark/parser/run_parser_benchmark.sh b/tests/benchmark/parser/run_parser_benchmark.sh
new file mode 100755
index 000000000000..2d4d380fac71
--- /dev/null
+++ b/tests/benchmark/parser/run_parser_benchmark.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -e
+
+nrow=20000
+ncol=2000
+
+build() {
+  d=$1
+  flag=$2
+  echo "building $d $flag"
+  test -d $d || (mkdir -p $d && cd $d && cmake $flag ..)
+  pushd $d
+  make
+  popd
+}
+
+gen_data() {  # temp file + mv: an interrupted run never leaves a partial test.csv
+  if [[ ! -f test.csv ]]; then
+    echo "generating csv ..."
+    python gen_csv.py test.csv.tmp "$nrow" "$ncol" && mv test.csv.tmp test.csv
+  fi
+}
+
+parser_benchmark() {
+  echo "========== Benchmark run Atof parser =========="
+  for i in {1..3}; do
+#    /usr/bin/time ./build/parser test.csv $ncol
+    time ./build/parser test.csv $ncol
+  done
+
+  echo
+  echo "========== Benchmark run AtofPrecise parser =========="
+  for i in {1..3}; do
+#    /usr/bin/time ./build-precise/parser test.csv $ncol
+    time ./build-precise/parser test.csv $ncol
+  done
+}
+
+build build ""  # default configuration, timed as the "Atof parser" run
+build build-precise "-DUSE_PRECISE_TEXT_PARSER=on"  # timed as the "AtofPrecise parser" run
+gen_data  # creates test.csv only when it does not exist yet
+parser_benchmark  # times both binaries on test.csv