From de5230316aa60f77704a08cb8d95a175cd50fe67 Mon Sep 17 00:00:00 2001 From: Griffin Bassman Date: Fri, 8 Sep 2023 13:59:35 -0400 Subject: [PATCH] fix: Skip of newline for single-examples (#4636) * fix: Skip of newline for single-examples * remove from cs test --- cs/unittest/RunTests.tt | 3 ++- test/core.vwtest.json | 13 +++++++++++ test/pred-sets/ref/single_empty_lines.predict | 1 + test/test-sets/ref/single_empty_lines.stderr | 23 +++++++++++++++++++ test/test-sets/ref/single_empty_lines.stdout | 0 test/train-sets/ref/empty-set.stderr | 7 +++--- test/train-sets/ref/topk-train.stderr | 23 +++++++++---------- test/train-sets/single_empty_lines.txt | 9 ++++++++ vowpalwabbit/core/src/learner.cc | 1 + 9 files changed, 63 insertions(+), 17 deletions(-) create mode 100644 test/pred-sets/ref/single_empty_lines.predict create mode 100644 test/test-sets/ref/single_empty_lines.stderr create mode 100644 test/test-sets/ref/single_empty_lines.stdout create mode 100644 test/train-sets/single_empty_lines.txt diff --git a/cs/unittest/RunTests.tt b/cs/unittest/RunTests.tt index 8c0d39e48db..29c97fbcc73 100644 --- a/cs/unittest/RunTests.tt +++ b/cs/unittest/RunTests.tt @@ -40,7 +40,8 @@ var skipList = new[] { 13, 32, 39, 258, 40, 259, 41, 260, 59, 60, 61, 66, 68, 90 256, 299, 300, 306, 310, 311, 327, 328, 329, 330, 331, 367, 368, 396, 397, 398, // DSJSON not supported 383, 389, 390, 391, 392, 393, // no data file 400, 404, // positional args - 405, 406, 407, 411, 415, 417, 456, 457, 458, 459, 460, 461, 462 // DSJSON not supported + 405, 406, 407, 411, 415, 417, 456, 457, 458, 459, 460, 461, 462, // DSJSON not supported + 464 // Empty lines not supported }; var outputModels = new Dictionary(); diff --git a/test/core.vwtest.json b/test/core.vwtest.json index d3c7bc95699..020d1987f8c 100644 --- a/test/core.vwtest.json +++ b/test/core.vwtest.json @@ -6004,5 +6004,18 @@ "input_files": [ "train-sets/automl_spin_off.txt" ] + }, + { + "id": 464, + "desc": "Ignore empty lines on single-examples", + "vw_command": "-d train-sets/single_empty_lines.txt -p single_empty_lines.predict", + "diff_files": { + "stderr": "test-sets/ref/single_empty_lines.stderr", + "single_empty_lines.predict": "pred-sets/ref/single_empty_lines.predict", + "stdout": "test-sets/ref/single_empty_lines.stdout" + }, + "input_files": [ + "train-sets/single_empty_lines.txt" + ] } ] \ No newline at end of file diff --git a/test/pred-sets/ref/single_empty_lines.predict b/test/pred-sets/ref/single_empty_lines.predict new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/test/pred-sets/ref/single_empty_lines.predict @@ -0,0 +1 @@ +0 diff --git a/test/test-sets/ref/single_empty_lines.stderr b/test/test-sets/ref/single_empty_lines.stderr new file mode 100644 index 00000000000..02b560bff4f --- /dev/null +++ b/test/test-sets/ref/single_empty_lines.stderr @@ -0,0 +1,23 @@ +predictions = single_empty_lines.predict +using no cache +Reading datafile = train-sets/single_empty_lines.txt +num sources = 1 +Num weight bits = 18 +learning rate = 0.5 +initial_t = 0 +power_t = 0.5 +Enabled learners: gd, scorer-identity, count_label +Input label = SIMPLE +Output pred = SCALAR +average since example example current current current +loss last counter weight label predict features +1.000000 1.000000 1 1.0 1.0000 0.0000 2 + +finished run +number of examples = 1 +weighted example sum = 1.000000 +weighted label sum = 1.000000 +average loss = 1.000000 +best constant = 1.000000 +best constant's loss = 0.000000 +total feature number = 2 diff --git a/test/test-sets/ref/single_empty_lines.stdout b/test/test-sets/ref/single_empty_lines.stdout new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/train-sets/ref/empty-set.stderr b/test/train-sets/ref/empty-set.stderr index 906f5eb3a6d..56ca7a279ca 100644 --- a/test/train-sets/ref/empty-set.stderr +++ b/test/train-sets/ref/empty-set.stderr @@ -10,11 +10,10 @@ Input label = SIMPLE Output pred = SCALAR average since example example current current current loss last counter weight label predict features -n.a. n.a. 1 1.0 unknown 0.0000 1 finished run -number of examples = 1 -weighted example sum = 1.000000 +number of examples = 0 +weighted example sum = 0.000000 weighted label sum = 0.000000 average loss = n.a. -total feature number = 1 +total feature number = 0 diff --git a/test/train-sets/ref/topk-train.stderr b/test/train-sets/ref/topk-train.stderr index 886e97a8b2c..703adb43508 100644 --- a/test/train-sets/ref/topk-train.stderr +++ b/test/train-sets/ref/topk-train.stderr @@ -15,21 +15,20 @@ average since example example current current cur loss last counter weight label predict features 9.000000 9.000000 1 1.0 3.0000 0.0000 4 4.590362 0.180723 2 2.0 0.0000 0.4251 4 -3.928039 2.603395 4 4.0 unknown 0.2876 1 -3.523584 3.119128 8 8.0 unknown 0.4184 1 -2.610412 1.697241 16 16.0 unknown 0.6151 1 -1.917275 1.224138 32 32.0 unknown 0.7335 1 -1.246961 0.576646 64 64.0 unknown 0.8100 1 -0.784439 0.321916 128 128.0 unknown 0.8650 1 -0.439552 0.094665 256 256.0 unknown 0.9058 1 -0.226776 0.014000 512 512.0 unknown 0.9328 1 -0.113599 0.000422 1024 1024.0 unknown 0.9396 1 +3.008577 1.426792 4 4.0 0.0000 0.5002 4 +2.893238 2.777898 8 8.0 1.0000 0.7497 4 +2.321989 1.750740 16 16.0 2.0000 1.5635 4 +1.640977 0.959966 32 32.0 3.0000 1.4030 4 +1.041363 0.441749 64 64.0 3.0000 2.2510 4 +0.623755 0.206147 128 128.0 0.0000 0.4018 4 +0.336533 0.049310 256 256.0 0.0000 0.1610 4 +0.170349 0.004165 512 512.0 1.0000 1.0024 4 finished run -number of examples per pass = 12 +number of examples per pass = 9 passes used = 100 -weighted example sum = 1200.000000 +weighted example sum = 900.000000 weighted label sum = 1500.000000 average loss = 0.096938 best constant = 1.666667 -total feature number = 3900 +total feature number = 3600 diff --git a/test/train-sets/single_empty_lines.txt b/test/train-sets/single_empty_lines.txt new file mode 100644 index 00000000000..0d58596fabc --- /dev/null +++ b/test/train-sets/single_empty_lines.txt @@ -0,0 +1,9 @@ +1 | x:1 + + + + + + + + diff --git a/vowpalwabbit/core/src/learner.cc b/vowpalwabbit/core/src/learner.cc index cdc4738faf9..c124abb01ab 100644 --- a/vowpalwabbit/core/src/learner.cc +++ b/vowpalwabbit/core/src/learner.cc @@ -145,6 +145,7 @@ class single_example_handler } else if (ec->end_pass) { _context.template process(*ec); } else if (is_save_cmd(ec)) { _context.template process(*ec); } + else if (ec->is_newline) { VW::finish_example(_context.get_master(), *ec); } else { _context.template process(*ec); } }