From 615914fa4c78235bafac6766e5c30b1719dc862e Mon Sep 17 00:00:00 2001
From: Jon Degenhardt <jondegenhardt@users.noreply.github.com>
Date: Mon, 7 Sep 2020 14:22:02 -0700
Subject: [PATCH] csv2tsv: Discard UTF-8 Byte Order Mark (BOM) (#302)

* [WIP] Initial version of BOM discarding. Plus more unit tests for the new buffering algorithm.

* csv2tsv BOM detection/removal: Unit tests and documentation.

* Minor code comment cleanup.
---
 csv2tsv/src/tsv_utils/csv2tsv.d      | 299 ++++++++++++++++++++++++++-
 csv2tsv/tests/gold/basic_tests_1.txt |  20 ++
 csv2tsv/tests/input_bom.csv          |   3 +
 csv2tsv/tests/tests.sh               |   3 +
 4 files changed, 318 insertions(+), 7 deletions(-)
 create mode 100644 csv2tsv/tests/input_bom.csv

diff --git a/csv2tsv/src/tsv_utils/csv2tsv.d b/csv2tsv/src/tsv_utils/csv2tsv.d
index 04b6281c..48c9e7f2 100644
--- a/csv2tsv/src/tsv_utils/csv2tsv.d
+++ b/csv2tsv/src/tsv_utils/csv2tsv.d
@@ -54,8 +54,10 @@ Behaviors of this program that often vary between CSV implementations:
   * Double quotes are permitted in a non-quoted field. However, a field starting
     with a quote must follow quoting rules.
   * Each record can have a different numbers of fields.
-  * The three common forms of newlines are supported: CR, CRLF, LF.
+  * The three common forms of newlines are supported: CR, CRLF, LF. Output is
+    written using Unix newlines (LF).
   * A newline will be added if the file does not end with one.
+  * A UTF-8 Byte Order Mark (BOM) at the start of a file will be removed.
   * No whitespace trimming is done.
 
 This program does not validate CSV correctness, but will terminate with an error
@@ -244,7 +246,7 @@ csv2tsv conversion is a good result.
 
 Algorithm notes:
 
-The algorithm works by reading an input block, then examing each byte in-order to
+The algorithm works by reading an input block, then examining each byte in-order to
 identify needed modifications. The region of consecutive characters without a change
 is tracked. Single character changes are done in-place, in the read buffer. This
 allows assembling longer blocks before write is needed. The region being tracked is
@@ -441,7 +443,7 @@ if (isBufferableInputSource!InputSource)
                     _buffer[0 .. len] = _chunks.front[];
                     _chunks.popFront;
 
-                    // Only the last chunk should be shorter than the buffer.
+                    /* Only the last chunk should be shorter than the buffer. */
                     assert(_buffer.length == len || _chunks.empty);
 
                     if (_buffer.length != len) _buffer.length = len;
@@ -557,6 +559,8 @@ unittest  // inputSourceByChunk
  *                            (e.g. TABs) found in the CSV data fields.
  *   tsvNewlineReplacement =  String to use when replacing newlines found in the CSV
  *                            data fields.
+ *   discardBOM            =  If true (the default), a UTF-8 Byte Order Mark found at the
+ *                            start of the input stream will be dropped.
  *
  * Throws: Exception on finding inconsistent CSV. Exception text includes the filename and
  *         line number where the error was identified.
@@ -572,15 +576,20 @@ void csv2tsv(InputSource, OutputRange)(
     const char tsvDelim = '\t',
     const string tsvDelimReplacement = " ",
     const string tsvNewlineReplacement = " ",
+    bool discardBOM = true,
 )
 if (isBufferableInputSource!InputSource &&
     isOutputRange!(OutputRange, char))
 {
+    import std.conv: hexString;
+
     assert (readBuffer.length >= 1);
 
     enum char LF = '\n';
     enum char CR = '\r';
 
+    enum ubyte[3] UTF8_BOM = cast(ubyte[3])hexString!"efbbbf";
+
     /* Process state information - These variables are defined either in the outer
      * context or within one of the foreach loops.
      *
@@ -611,10 +620,23 @@ if (isBufferableInputSource!InputSource &&
     size_t recordNum = 1;
     size_t fieldNum = 0;
 
-    foreach (inputChunk; inputSource.inputSourceByChunk(readBuffer))
+    foreach (chunkIndex, inputChunkComplete; inputSource.inputSourceByChunk(readBuffer).enumerate)
     {
         size_t writeRegionStart = 0;
 
+        /* Discard byte order marks at the start of input.
+         * Note: Slicing the chunk in this fashion generates very good code, better
+         * other approaches like manipulating indices.
+         */
+        auto inputChunk =
+            (discardBOM &&
+             chunkIndex == 0 &&
+             inputChunkComplete.length >= UTF8_BOM.length &&
+             inputChunkComplete[0 .. UTF8_BOM.length] == UTF8_BOM
+            )
+            ? inputChunkComplete[UTF8_BOM.length .. $]
+            : inputChunkComplete[];
+
         /* flushCurrentRegion flushes the current write region and moves the start of
          * the next write region one byte past the end of the current region. If
          * appendChars are provided they are ouput as well.
@@ -622,6 +644,11 @@ if (isBufferableInputSource!InputSource &&
          * This routine is called when the current character (byte) terminates the
          * current write region and should not itself be output. That is why the next
          * write region always starts one byte past the current region end.
+         *
+         * This routine is also called when the 'skiplines' region has been processed.
+         * This is done to flush the region without actually writing it. This is done
+         * by explicit checks in the finite state machine when newline characters
+         * that terminate a record are processed. It would be nice to refactor this.
          */
         void flushCurrentRegion(size_t regionEnd, const char[] appendChars = "")
         {
@@ -727,7 +754,6 @@ if (isBufferableInputSource!InputSource &&
                     if (tsvNewlineReplacement.length == 1)
                     {
                         inputChunk[nextIndex] = tsvNewlineReplacement[0];
-                        if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                     }
                     else
                     {
@@ -739,7 +765,6 @@ if (isBufferableInputSource!InputSource &&
                     if (tsvNewlineReplacement.length == 1)
                     {
                         inputChunk[nextIndex] = tsvNewlineReplacement[0];
-                        if (recordNum == skipLines) flushCurrentRegion(nextIndex);
                     }
                     else
                     {
@@ -825,7 +850,8 @@ if (isBufferableInputSource!InputSource &&
                    (filename == "-") ? "Standard Input" : filename,
                    recordNum));
 
-    if (fieldNum > 0) put(outputStream, '\n');    // Last line w/o terminating newline.
+    /* Output a newline if the CSV input did not have a terminating newline. */
+    if (fieldNum > 0 && recordNum > skipLines) put(outputStream, '\n');
 }
 
 unittest
@@ -843,6 +869,9 @@ unittest
      *
      * This test set does not test main, file handling, or error messages. These are
      * handled by tests run against the executable.
+     *
+     * Note: unittest is non @safe due to the casts from string to ubyte[]. This can
+     * probably be rewritten to use std.string.representation instead, which is @safe.
      */
 
     /* Default CSV. */
@@ -1182,3 +1211,259 @@ unittest
         }
     }
 }
+
+// csv2tsv skiplines tests
+unittest
+{
+    import std.string : representation;
+
+    auto csv1 = "";
+    auto csv2 = "a";
+
+    auto csv3 = "\n";
+    auto csv4 = "\n\n";
+    auto csv5 = "\n\n\n";
+
+    auto csv6 = "a\n";
+    auto csv7 = "a\nb\n";
+    auto csv8 = "a\nb\nc\n";
+
+    auto csv9 = "\"\n\"\n";
+    auto csv10 = "\"\n\"\n\"\n\"\n";
+    auto csv11 = "\"\n\"\n\"\n\"\n\"\n\"\n";
+
+    auto csv12 = "\r";
+    auto csv13 = "\r\r";
+    auto csv14 = "\r\r\r";
+
+    auto csv15 = "a\r";
+    auto csv16 = "a\rb\r";
+    auto csv17 = "a\rb\rc\r";
+
+    auto csv18 = "\"\r\"\r";
+    auto csv19 = "\"\r\"\r\"\r\"\r";
+    auto csv20 = "\"\r\"\r\"\r\"\r\"\r\"\r";
+
+    auto csv21 = "\r\n";
+    auto csv22 = "\r\n\r\n";
+    auto csv23 = "\r\n\r\n\r\n";
+
+    auto csv24 = "a\r\n";
+    auto csv25 = "a\r\nb\r\n";
+    auto csv26 = "a\r\nb\r\nc\r\n";
+
+    auto csv27 = "\"\r\n\"\r\n";
+    auto csv28 = "\"\r\n\"\r\n\"\r\n\"\r\n";
+    auto csv29 = "\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n\"\r\n";
+
+    /* The Skip 1 expected results. */
+    auto tsv1Skip1 = "";
+    auto tsv2Skip1 = "";
+
+    auto tsv3Skip1 = "";
+    auto tsv4Skip1 = "\n";
+    auto tsv5Skip1 = "\n\n";
+
+    auto tsv6Skip1 = "";
+    auto tsv7Skip1 = "b\n";
+    auto tsv8Skip1 = "b\nc\n";
+
+    auto tsv9Skip1 = "";
+    auto tsv10Skip1 = " \n";
+    auto tsv11Skip1 = " \n \n";
+
+    auto tsv12Skip1 = "";
+    auto tsv13Skip1 = "\n";
+    auto tsv14Skip1 = "\n\n";
+
+    auto tsv15Skip1 = "";
+    auto tsv16Skip1 = "b\n";
+    auto tsv17Skip1 = "b\nc\n";
+
+    auto tsv18Skip1 = "";
+    auto tsv19Skip1 = " \n";
+    auto tsv20Skip1 = " \n \n";
+
+    auto tsv21Skip1 = "";
+    auto tsv22Skip1 = "\n";
+    auto tsv23Skip1 = "\n\n";
+
+    auto tsv24Skip1 = "";
+    auto tsv25Skip1 = "b\n";
+    auto tsv26Skip1 = "b\nc\n";
+
+    auto tsv27Skip1 = "";
+    auto tsv28Skip1 = " \n";
+    auto tsv29Skip1 = " \n \n";
+
+    /* The Skip 2 expected results. */
+    auto tsv1Skip2 = "";
+    auto tsv2Skip2 = "";
+
+    auto tsv3Skip2 = "";
+    auto tsv4Skip2 = "";
+    auto tsv5Skip2 = "\n";
+
+    auto tsv6Skip2 = "";
+    auto tsv7Skip2 = "";
+    auto tsv8Skip2 = "c\n";
+
+    auto tsv9Skip2 = "";
+    auto tsv10Skip2 = "";
+    auto tsv11Skip2 = " \n";
+
+    auto tsv12Skip2 = "";
+    auto tsv13Skip2 = "";
+    auto tsv14Skip2 = "\n";
+
+    auto tsv15Skip2 = "";
+    auto tsv16Skip2 = "";
+    auto tsv17Skip2 = "c\n";
+
+    auto tsv18Skip2 = "";
+    auto tsv19Skip2 = "";
+    auto tsv20Skip2 = " \n";
+
+    auto tsv21Skip2 = "";
+    auto tsv22Skip2 = "";
+    auto tsv23Skip2 = "\n";
+
+    auto tsv24Skip2 = "";
+    auto tsv25Skip2 = "";
+    auto tsv26Skip2 = "c\n";
+
+    auto tsv27Skip2 = "";
+    auto tsv28Skip2 = "";
+    auto tsv29Skip2 = " \n";
+
+    auto csvSet =
+        [csv1, csv2, csv3, csv4, csv5, csv6, csv7, csv8, csv9, csv10,
+         csv11, csv12, csv13, csv14, csv15, csv16, csv17, csv18, csv19, csv20,
+         csv21, csv22, csv23, csv24, csv25, csv26, csv27, csv28, csv29];
+
+    auto tsvSkip1Set =
+        [tsv1Skip1, tsv2Skip1, tsv3Skip1, tsv4Skip1, tsv5Skip1, tsv6Skip1, tsv7Skip1, tsv8Skip1, tsv9Skip1, tsv10Skip1,
+         tsv11Skip1, tsv12Skip1, tsv13Skip1, tsv14Skip1, tsv15Skip1, tsv16Skip1, tsv17Skip1, tsv18Skip1, tsv19Skip1, tsv20Skip1,
+         tsv21Skip1, tsv22Skip1, tsv23Skip1, tsv24Skip1, tsv25Skip1, tsv26Skip1, tsv27Skip1, tsv28Skip1, tsv29Skip1];
+
+    auto tsvSkip2Set =
+        [tsv1Skip2, tsv2Skip2, tsv3Skip2, tsv4Skip2, tsv5Skip2, tsv6Skip2, tsv7Skip2, tsv8Skip2, tsv9Skip2, tsv10Skip2,
+         tsv11Skip2, tsv12Skip2, tsv13Skip2, tsv14Skip2, tsv15Skip2, tsv16Skip2, tsv17Skip2, tsv18Skip2, tsv19Skip2, tsv20Skip2,
+         tsv21Skip2, tsv22Skip2, tsv23Skip2, tsv24Skip2, tsv25Skip2, tsv26Skip2, tsv27Skip2, tsv28Skip2, tsv29Skip2];
+
+    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];
+
+    foreach (bufferSize; bufferSizeTests)
+    {
+        ubyte[] readBuffer = new ubyte[](bufferSize);
+
+        foreach (i, csv, tsvSkip1, tsvSkip2; lockstep(csvSet, tsvSkip1Set, tsvSkip2Set))
+        {
+            ubyte[] csvInput = csv.dup.representation;
+            auto csvToTSVSkip1 = appender!(char[])();
+            auto csvToTSVSkip2 = appender!(char[])();
+
+            csv2tsv(csvInput, csvToTSVSkip1, readBuffer, "csvToTSVSkip1", 1);
+
+            assert(tsvSkip1 == csvToTSVSkip1.data,
+                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                          i + 1, bufferSize, csv, tsvSkip1, csvToTSVSkip1.data));
+
+            csv2tsv(csvInput, csvToTSVSkip2, readBuffer, "csvToTSVSkip2", 2);
+
+            assert(tsvSkip2 == csvToTSVSkip2.data,
+                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                          i + 1, bufferSize, csv, tsvSkip2, csvToTSVSkip2.data));
+        }
+    }
+}
+
+// csv2tsv BOM tests. Note: std.range.lockstep prevents use of @safe
+unittest
+{
+    import std.conv : hexString;
+    import std.string : representation;
+
+    enum utf8BOM = hexString!"efbbbf";
+
+    auto csv1 = "";
+    auto csv2 = "a";
+    auto csv3 = "ab";
+    auto csv4 = "a,b";
+    auto csv5 = "a,b\ncdef,ghi\njklmn,opqrs\ntuv,wxyz";
+
+    auto csv1BOM = utf8BOM ~ csv1;
+    auto csv2BOM = utf8BOM ~ csv2;
+    auto csv3BOM = utf8BOM ~ csv3;
+    auto csv4BOM = utf8BOM ~ csv4;
+    auto csv5BOM = utf8BOM ~ csv5;
+
+    auto tsv1 = "";
+    auto tsv2 = "a\n";
+    auto tsv3 = "ab\n";
+    auto tsv4 = "a\tb\n";
+    auto tsv5 = "a\tb\ncdef\tghi\njklmn\topqrs\ntuv\twxyz\n";
+
+    /* Note: csv1 is the empty string, so tsv1 does not have a trailing newline.
+     * However, with the BOM prepended the tsv gets a trailing newline.
+     */
+    auto tsv1BOM = utf8BOM ~ tsv1 ~ "\n";
+    auto tsv2BOM = utf8BOM ~ tsv2;
+    auto tsv3BOM = utf8BOM ~ tsv3;
+    auto tsv4BOM = utf8BOM ~ tsv4;
+    auto tsv5BOM = utf8BOM ~ tsv5;
+
+    auto csvSet = [csv1, csv2, csv3, csv4, csv5];
+    auto csvBOMSet = [csv1BOM, csv2BOM, csv3BOM, csv4BOM, csv5BOM];
+
+    auto tsvSet = [tsv1, tsv2, tsv3, tsv4, tsv5];
+    auto tsvBOMSet = [tsv1BOM, tsv2BOM, tsv3BOM, tsv4BOM, tsv5BOM];
+
+    auto bufferSizeTests = [1, 2, 3, 4, 8, 128];
+
+    foreach (bufferSize; bufferSizeTests)
+    {
+        ubyte[] readBuffer = new ubyte[](bufferSize);
+
+        foreach (i, csv, csvBOM, tsv, tsvBOM; lockstep(csvSet, csvBOMSet, tsvSet, tsvBOMSet))
+        {
+            ubyte[] csvInput = csv.dup.representation;
+            ubyte[] csvBOMInput = csvBOM.dup.representation;
+
+            auto csvToTSV = appender!(char[])();
+            auto csvToTSV_NoBOMRemoval = appender!(char[])();
+            auto csvBOMToTSV = appender!(char[])();
+            auto csvBOMToTSV_NoBOMRemoval = appender!(char[])();
+
+            csv2tsv(csvInput, csvToTSV, readBuffer, "csvToTSV", 0, '"', ',', '\t', " ", " ", true);
+            assert(tsv == csvToTSV.data,
+                   format("Unittest failure. tsv != csvToTSV.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                          i + 1, bufferSize, csv, tsv, csvToTSV.data));
+
+            csv2tsv(csvInput, csvToTSV_NoBOMRemoval, readBuffer, "csvToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
+            assert(tsv == csvToTSV_NoBOMRemoval.data,
+                   format("Unittest failure. tsv != csvToTSV_NoBOMRemoval.data. Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                          i + 1, bufferSize, csv, tsv, csvToTSV_NoBOMRemoval.data));
+
+            csv2tsv(csvBOMInput, csvBOMToTSV, readBuffer, "csvBOMToTSV", 0, '"', ',', '\t', " ", " ", true);
+            if (readBuffer.length < utf8BOM.length)
+            {
+                /* Removing BOMs, but didn't provide enough buffer, so no removal. */
+                assert(tsvBOM == csvBOMToTSV.data,
+                       format("Unittest failure. tsvBOM != csvBOMToTSV.data. (Small buffer) Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                              i + 1, bufferSize, csv, tsv, csvBOMToTSV.data));
+            }
+            else
+            {
+                assert(tsv == csvBOMToTSV.data,
+                       format("Unittest failure. tsv != csvBOMToTSV.data. Test: Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                              i + 1, bufferSize, csv, tsv, csvBOMToTSV.data));
+            }
+
+            csv2tsv(csvBOMInput, csvBOMToTSV_NoBOMRemoval, readBuffer, "csvBOMToTSV_NoBOMRemoval", 0, '"', ',', '\t', " ", " ", false);
+            assert(tsvBOM == csvBOMToTSV_NoBOMRemoval.data,
+                   format("Unittest failure. tsvBOM != csvBOMToTSV_NoBOMRemoval.data. Test: Test: %d; buffer size: %d\ncsv: |%s|\ntsv: |%s|\nres: |%s|\n",
+                          i + 1, bufferSize, csv, tsv, csvBOMToTSV_NoBOMRemoval.data));
+        }
+    }
+}
diff --git a/csv2tsv/tests/gold/basic_tests_1.txt b/csv2tsv/tests/gold/basic_tests_1.txt
index db5b2274..252ea5e4 100644
--- a/csv2tsv/tests/gold/basic_tests_1.txt
+++ b/csv2tsv/tests/gold/basic_tests_1.txt
@@ -136,6 +136,26 @@ deutsche	Farbe grün gelb blau weiß schwarz	grün	blau
 suomalainen	väri vihreä keltainen sininen valkoinen musta	vihreä	sininen
 中文	颜色绿色黄色蓝色白色黑色	绿色	蓝色
 
+====[csv2tsv input_bom.csv]====
+abc	def	ghi
+ABC	DEF	GHI
+12.3	45.6	78.9
+
+====[csv2tsv input_bom.csv input_bom.csv]====
+abc	def	ghi
+ABC	DEF	GHI
+12.3	45.6	78.9
+abc	def	ghi
+ABC	DEF	GHI
+12.3	45.6	78.9
+
+====[csv2tsv -H input_bom.csv input_bom.csv]====
+abc	def	ghi
+ABC	DEF	GHI
+12.3	45.6	78.9
+ABC	DEF	GHI
+12.3	45.6	78.9
+
 ====[cat header3.csv | csv2tsv --header -- header1.csv header2.csv - header4.csv header5.csv]====
 field1	field2	field3
 123	456	789
diff --git a/csv2tsv/tests/input_bom.csv b/csv2tsv/tests/input_bom.csv
new file mode 100644
index 00000000..e83fa206
--- /dev/null
+++ b/csv2tsv/tests/input_bom.csv
@@ -0,0 +1,3 @@
+﻿abc,def,ghi
+"ABC","DEF","GHI"
+12.3,45.6,78.9
diff --git a/csv2tsv/tests/tests.sh b/csv2tsv/tests/tests.sh
index 0d31d35f..77f1a0e0 100755
--- a/csv2tsv/tests/tests.sh
+++ b/csv2tsv/tests/tests.sh
@@ -35,6 +35,9 @@ runtest ${prog} "header1.csv header2.csv header3.csv header4.csv header5.csv" ${
 runtest ${prog} "--header header1.csv header2.csv header3.csv header4.csv header5.csv" ${basic_tests_1}
 runtest ${prog} "-H header1.csv header2.csv header3.csv header4.csv header5.csv" ${basic_tests_1}
 runtest ${prog} "input_unicode.csv" ${basic_tests_1}
+runtest ${prog} "input_bom.csv" ${basic_tests_1}
+runtest ${prog} "input_bom.csv input_bom.csv" ${basic_tests_1}
+runtest ${prog} "-H input_bom.csv input_bom.csv" ${basic_tests_1}
 
 echo "" >> ${basic_tests_1}; echo "====[cat header3.csv | csv2tsv --header -- header1.csv header2.csv - header4.csv header5.csv]====" >> ${basic_tests_1}
 cat header3.csv | ${prog} --header -- header1.csv header2.csv - header4.csv header5.csv >> ${basic_tests_1} 2>&1