From de5230316aa60f77704a08cb8d95a175cd50fe67 Mon Sep 17 00:00:00 2001
From: Griffin Bassman <griffinbassman@gmail.com>
Date: Fri, 8 Sep 2023 13:59:35 -0400
Subject: [PATCH] fix: Skip empty-line (newline) examples in the single-example handler (#4636)

* fix: Skip empty-line (newline) examples in the single-example handler

* exclude the new test (464) from the C# unit tests via the skip list
---
 cs/unittest/RunTests.tt                       |  3 ++-
 test/core.vwtest.json                         | 13 +++++++++++
 test/pred-sets/ref/single_empty_lines.predict |  1 +
 test/test-sets/ref/single_empty_lines.stderr  | 23 +++++++++++++++++++
 test/test-sets/ref/single_empty_lines.stdout  |  0
 test/train-sets/ref/empty-set.stderr          |  7 +++---
 test/train-sets/ref/topk-train.stderr         | 23 +++++++++----------
 test/train-sets/single_empty_lines.txt        |  9 ++++++++
 vowpalwabbit/core/src/learner.cc              |  1 +
 9 files changed, 63 insertions(+), 17 deletions(-)
 create mode 100644 test/pred-sets/ref/single_empty_lines.predict
 create mode 100644 test/test-sets/ref/single_empty_lines.stderr
 create mode 100644 test/test-sets/ref/single_empty_lines.stdout
 create mode 100644 test/train-sets/single_empty_lines.txt

diff --git a/cs/unittest/RunTests.tt b/cs/unittest/RunTests.tt
index 8c0d39e48db..29c97fbcc73 100644
--- a/cs/unittest/RunTests.tt
+++ b/cs/unittest/RunTests.tt
@@ -40,7 +40,8 @@ var skipList = new[] { 13, 32, 39, 258, 40, 259, 41, 260, 59, 60, 61, 66, 68, 90
     256, 299, 300, 306, 310, 311, 327, 328, 329, 330, 331, 367, 368, 396, 397, 398, // DSJSON not supported
     383, 389, 390, 391, 392, 393, // no data file
     400, 404, // positional args
-    405, 406, 407, 411, 415, 417, 456, 457, 458, 459, 460, 461, 462 // DSJSON not supported
+    405, 406, 407, 411, 415, 417, 456, 457, 458, 459, 460, 461, 462, // DSJSON not supported
+    464 // Empty lines not supported
     };
 
 var outputModels = new Dictionary<string, TestCase>();
diff --git a/test/core.vwtest.json b/test/core.vwtest.json
index d3c7bc95699..020d1987f8c 100644
--- a/test/core.vwtest.json
+++ b/test/core.vwtest.json
@@ -6004,5 +6004,18 @@
     "input_files": [
       "train-sets/automl_spin_off.txt"
     ]
+  },
+  {
+    "id": 464,
+    "desc": "Ignore empty lines on single-examples",
+    "vw_command": "-d train-sets/single_empty_lines.txt -p single_empty_lines.predict",
+    "diff_files": {
+      "stderr": "test-sets/ref/single_empty_lines.stderr",
+      "single_empty_lines.predict": "pred-sets/ref/single_empty_lines.predict",
+      "stdout": "test-sets/ref/single_empty_lines.stdout"
+    },
+    "input_files": [
+      "train-sets/single_empty_lines.txt"
+    ]
   }
 ]
\ No newline at end of file
diff --git a/test/pred-sets/ref/single_empty_lines.predict b/test/pred-sets/ref/single_empty_lines.predict
new file mode 100644
index 00000000000..573541ac970
--- /dev/null
+++ b/test/pred-sets/ref/single_empty_lines.predict
@@ -0,0 +1 @@
+0
diff --git a/test/test-sets/ref/single_empty_lines.stderr b/test/test-sets/ref/single_empty_lines.stderr
new file mode 100644
index 00000000000..02b560bff4f
--- /dev/null
+++ b/test/test-sets/ref/single_empty_lines.stderr
@@ -0,0 +1,23 @@
+predictions = single_empty_lines.predict
+using no cache
+Reading datafile = train-sets/single_empty_lines.txt
+num sources = 1
+Num weight bits = 18
+learning rate = 0.5
+initial_t = 0
+power_t = 0.5
+Enabled learners: gd, scorer-identity, count_label
+Input label = SIMPLE
+Output pred = SCALAR
+average  since         example        example        current        current  current
+loss     last          counter         weight          label        predict features
+1.000000 1.000000            1            1.0         1.0000         0.0000        2
+
+finished run
+number of examples = 1
+weighted example sum = 1.000000
+weighted label sum = 1.000000
+average loss = 1.000000
+best constant = 1.000000
+best constant's loss = 0.000000
+total feature number = 2
diff --git a/test/test-sets/ref/single_empty_lines.stdout b/test/test-sets/ref/single_empty_lines.stdout
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/test/train-sets/ref/empty-set.stderr b/test/train-sets/ref/empty-set.stderr
index 906f5eb3a6d..56ca7a279ca 100644
--- a/test/train-sets/ref/empty-set.stderr
+++ b/test/train-sets/ref/empty-set.stderr
@@ -10,11 +10,10 @@ Input label = SIMPLE
 Output pred = SCALAR
 average  since         example        example        current        current  current
 loss     last          counter         weight          label        predict features
-n.a.     n.a.                1            1.0        unknown         0.0000        1
 
 finished run
-number of examples = 1
-weighted example sum = 1.000000
+number of examples = 0
+weighted example sum = 0.000000
 weighted label sum = 0.000000
 average loss = n.a.
-total feature number = 1
+total feature number = 0
diff --git a/test/train-sets/ref/topk-train.stderr b/test/train-sets/ref/topk-train.stderr
index 886e97a8b2c..703adb43508 100644
--- a/test/train-sets/ref/topk-train.stderr
+++ b/test/train-sets/ref/topk-train.stderr
@@ -15,21 +15,20 @@ average  since         example        example        current        current  cur
 loss     last          counter         weight          label        predict features
 9.000000 9.000000            1            1.0         3.0000         0.0000        4
 4.590362 0.180723            2            2.0         0.0000         0.4251        4
-3.928039 2.603395            4            4.0        unknown         0.2876        1
-3.523584 3.119128            8            8.0        unknown         0.4184        1
-2.610412 1.697241           16           16.0        unknown         0.6151        1
-1.917275 1.224138           32           32.0        unknown         0.7335        1
-1.246961 0.576646           64           64.0        unknown         0.8100        1
-0.784439 0.321916          128          128.0        unknown         0.8650        1
-0.439552 0.094665          256          256.0        unknown         0.9058        1
-0.226776 0.014000          512          512.0        unknown         0.9328        1
-0.113599 0.000422         1024         1024.0        unknown         0.9396        1
+3.008577 1.426792            4            4.0         0.0000         0.5002        4
+2.893238 2.777898            8            8.0         1.0000         0.7497        4
+2.321989 1.750740           16           16.0         2.0000         1.5635        4
+1.640977 0.959966           32           32.0         3.0000         1.4030        4
+1.041363 0.441749           64           64.0         3.0000         2.2510        4
+0.623755 0.206147          128          128.0         0.0000         0.4018        4
+0.336533 0.049310          256          256.0         0.0000         0.1610        4
+0.170349 0.004165          512          512.0         1.0000         1.0024        4
 
 finished run
-number of examples per pass = 12
+number of examples per pass = 9
 passes used = 100
-weighted example sum = 1200.000000
+weighted example sum = 900.000000
 weighted label sum = 1500.000000
 average loss = 0.096938
 best constant = 1.666667
-total feature number = 3900
+total feature number = 3600
diff --git a/test/train-sets/single_empty_lines.txt b/test/train-sets/single_empty_lines.txt
new file mode 100644
index 00000000000..0d58596fabc
--- /dev/null
+++ b/test/train-sets/single_empty_lines.txt
@@ -0,0 +1,9 @@
+1 | x:1
+
+
+
+
+
+
+
+
diff --git a/vowpalwabbit/core/src/learner.cc b/vowpalwabbit/core/src/learner.cc
index cdc4738faf9..c124abb01ab 100644
--- a/vowpalwabbit/core/src/learner.cc
+++ b/vowpalwabbit/core/src/learner.cc
@@ -145,6 +145,7 @@ class single_example_handler
     }
     else if (ec->end_pass) { _context.template process<example, end_pass>(*ec); }
     else if (is_save_cmd(ec)) { _context.template process<example, save>(*ec); }
+    else if (ec->is_newline) { VW::finish_example(_context.get_master(), *ec); }
     else { _context.template process<example, learn_ex>(*ec); }
   }