Include CSV header in data files (#220)

smarr · Jul 12, 2023 · 89176e9 · 89176e9
2 parents 025c616 + c25d789
commit 89176e9
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 8 deletions.
diff --git a/rebench/model/benchmark.py b/rebench/model/benchmark.py
@@ -107,3 +107,6 @@ def as_dict(self):
     def from_str_list(cls, data_store, str_list):
         return data_store.get_config(str_list[0], str_list[1], str_list[2],
                                      None if str_list[3] == '' else str_list[3])
+    @classmethod
+    def get_column_headers(cls):
+        """
+        Return the names of the four columns that describe a benchmark.
+
+        A new list is returned on every call, so callers may modify it.
+        """
+        columns = ("benchmark", "executor", "suite", "extraArgs")
+        return list(columns)
diff --git a/rebench/model/measurement.py b/rebench/model/measurement.py
@@ -58,6 +58,10 @@ def from_str_list(cls, data_store, str_list, line_number=None, filename=None):
 
         return Measurement(invocation, iteration, value, unit, run_id,
                            criterion, line_number, filename)
+    @classmethod
+    def get_column_headers(cls):
+        """
+        Return the column names of a serialized measurement.
+
+        The five measurement columns come first, followed by the columns
+        reported by RunId.get_column_headers().
+        """
+        own_columns = ["invocation", "iteration", "value", "unit", "criterion"]
+        return own_columns + RunId.get_column_headers()
 
     def as_dict(self):
         return {

diff --git a/rebench/model/run_id.py b/rebench/model/run_id.py
@@ -338,6 +338,11 @@ def from_str_list(cls, data_store, str_list):
         return data_store.create_run_id(
             benchmark, str_list[-4], str_list[-3], str_list[-2], str_list[-1])
 
+    @classmethod
+    def get_column_headers(cls):
+        """
+        Return the column names of a serialized run id.
+
+        The columns reported by Benchmark.get_column_headers() come first,
+        followed by the four run-specific columns.
+        """
+        run_columns = ["cores", "inputSize", "varValue", "machine"]
+        return Benchmark.get_column_headers() + run_columns
+
     def __str__(self):
         return "RunId(%s, %s, %s, %s, %s, %s, %d)" % (
             self.benchmark.name,

diff --git a/rebench/persistence.py b/rebench/persistence.py
@@ -304,22 +304,25 @@ def _parse_data_line(
             data_point = DataPoint(run_id)
         return data_point, previous_run_id
 
+    # Separator between the serialized parts of a measurement; it also joins the
+    # column names of the header line appended when the data file is opened.
+    _SEP = "\t"
+
     def _open_file_and_append_execution_comment(self):
         """
         Append a shebang (#!/path/to/executable) to the data file.
         This allows it theoretically to be executable.
         But more importantly also records execution metadata to reproduce the data.
         """
-        shebang_line = "#!%s\n" % (subprocess.list2cmdline(sys.argv))
-        shebang_line += _START_TIME_LINE + self._start_time + "\n"
-        shebang_line += "# Environment: " + json.dumps(determine_environment()) + "\n"
-        shebang_line += "# Source: " + json.dumps(
+        shebang_with_metadata = "#!%s\n" % (subprocess.list2cmdline(sys.argv))
+        shebang_with_metadata += _START_TIME_LINE + self._start_time + "\n"
+        shebang_with_metadata += "# Environment: " + json.dumps(determine_environment()) + "\n"
+        shebang_with_metadata += "# Source: " + json.dumps(
             determine_source_details(self._configurator)) + "\n"
+        shebang_with_metadata += self._SEP.join(Measurement.get_column_headers()) + "\n"
 
         try:
             # pylint: disable-next=unspecified-encoding,consider-using-with
             data_file = open(self._data_filename, 'a+')
-            data_file.write(shebang_line)
+            data_file.write(shebang_with_metadata)
             data_file.flush()
             return data_file
         except Exception as err:  # pylint: disable=broad-except
@@ -328,7 +331,6 @@ def _open_file_and_append_execution_comment(self):
                     os.getcwd(), err),
                 err)
 
-    _SEP = "\t"  # separator between serialized parts of a measurement
 
     def _persists_data_point_in_open_file(self, data_point):
         for measurement in data_point.get_measurements():

diff --git a/rebench/tests/persistency_test.py b/rebench/tests/persistency_test.py
@@ -17,6 +17,10 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
+import subprocess
+import json
+import sys
+from datetime import datetime
 from unittest import skipIf
 from .mock_http_server import MockHTTPServer
 from .rebench_test_case import ReBenchTestCase
@@ -36,7 +40,6 @@
 
 
 class PersistencyTest(ReBenchTestCase):
-
     def test_de_serialization(self):
         data_store = DataStore(self.ui)
         executor = ExecutorConf("MyVM", '', '',
@@ -67,7 +70,6 @@ def test_iteration_invocation_semantics(self):
         cnf = Configurator(load_config(self._path + '/persistency.conf'),
                            ds, self.ui, data_file=self._tmp_file)
         ds.load_data(None, False)
-
         self._assert_runs(cnf, 1, 0, 0)
 
         ex = Executor(cnf.get_runs(), False, self.ui)
@@ -168,3 +170,46 @@ def _exec_rebench_db(self, cmd_config, server):
 
         run = list(cnf.get_runs())[0]
         run.close_files()
+
+    def test_check_file_lines(self):
+        """
+        Check the lines at the start of the data file after an execution.
+
+        Executes the runs configured in persistency.conf and then expects, in
+        order: the '#!' command line, the 'Start:' time stamp, the
+        'Environment:' and 'Source:' JSON lines, and the tab-separated column
+        header. The line after the header (presumably the first measurement)
+        must have as many tab-separated fields as the header.
+        """
+        data_store = DataStore(self.ui)
+        config = load_config(self._path + '/persistency.conf')
+        configurator = Configurator(config, data_store, self.ui, data_file=self._tmp_file)
+        data_store.load_data(None, False)
+        Executor(configurator.get_runs(), False, self.ui).execute()
+
+        with open(self._tmp_file, 'r') as data_file:  # pylint: disable=unspecified-encoding
+            file_lines = data_file.readlines()
+
+        # line 0: the command that produced the file
+        recorded_command = self.get_line_after_char('#!', file_lines[0])
+        self.assertEqual(recorded_command, subprocess.list2cmdline(sys.argv))
+
+        # line 1: start time of the execution
+        start_time = self.get_line_after_char('Start:', file_lines[1])
+        self.assertTrue(self.is_valid_time(start_time))
+
+        # lines 2 and 3: JSON metadata
+        environment_json = self.get_line_after_char('Environment:', file_lines[2])
+        self.assertTrue(self.is_valid_json(environment_json))
+        source_json = self.get_line_after_char('Source:', file_lines[3])
+        self.assertTrue(self.is_valid_json(source_json))
+
+        # line 4: column header; only the last field carries the line break
+        header_fields = file_lines[4].split("\t")
+        header_fields[-1] = header_fields[-1].rstrip('\n')
+        expected_fields = Measurement.get_column_headers()
+        self.assertEqual(header_fields, expected_fields)
+
+        # line 5: must have one field per header column
+        following_fields = file_lines[5].split("\t")
+        self.assertEqual(len(following_fields), len(header_fields))
+
+    def get_line_after_char(self, char, line):
+        """
+        Return the text that follows the first occurrence of `char` in `line`.
+
+        Despite its name, `char` may be a marker of any length, e.g. '#!'.
+        The result has surrounding whitespace stripped. Returns None when the
+        marker does not occur in `line`.
+        """
+        # partition() splits at the first occurrence only, so a marker that
+        # appears again later in the line no longer cuts the result short.
+        _, marker, remainder = line.partition(char)
+        if not marker:
+            return None
+        return remainder.strip()
+
+    def is_valid_time(self, time_str):
+        """
+        Return True if `time_str` parses with the format
+        '%Y-%m-%dT%H:%M:%S.%f%z', and False otherwise.
+
+        A non-string value such as None, which get_line_after_char returns
+        when its marker is missing, is reported as invalid instead of raising.
+        """
+        try:
+            datetime.strptime(time_str, '%Y-%m-%dT%H:%M:%S.%f%z')
+        except (TypeError, ValueError):
+            # ValueError: text does not match the format
+            # TypeError: strptime was given something that is not a string
+            return False
+        return True
+
+    def is_valid_json(self, json_str):
+        """
+        Return True if `json_str` can be parsed by json.loads, False otherwise.
+
+        A value such as None, which get_line_after_char returns when its
+        marker is missing, is reported as invalid instead of raising.
+        """
+        try:
+            json.loads(json_str)
+        except (TypeError, json.JSONDecodeError):
+            # JSONDecodeError: text is not valid JSON
+            # TypeError: json.loads was given something that is not str/bytes
+            return False
+        return True