From 3a536a119cf315ad5b73b96f2a68383a98414797 Mon Sep 17 00:00:00 2001 From: David Bennion Date: Wed, 21 Feb 2024 17:41:09 -0700 Subject: [PATCH 1/2] Add support for multi-line fields supported by RFC4180 when escapeColumnDelimitersCSV is set to true --- .../jdbc/SQLServerBulkCSVFileRecord.java | 43 ++++++++++++++++++- .../jdbc/bulkCopy/BulkCopyCSVTest.java | 3 +- .../BulkCopyCSVTestInputDelimiterEscape.csv | 4 ++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java b/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java index 195b197e0..aaf40046b 100644 --- a/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java +++ b/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java @@ -212,6 +212,47 @@ private void initFileReader(InputStreamReader sr, String encoding, String demlim } } + /* RFC4180 specifies that rules for quoted fields. It allows quoted string data to contain newlines data + provided the contents otherwise conforms to the rules for escaping quotes. For example, the following is valid: + "a","b","c" + "aaa","b <-- newline is retained in data field + bb","c" + "aa","bb","cc" + + We cannot simply use fileReader.readLine() to read these records but instead must continue reading until we reach + a newline that is not contained within quotes. + */ + private String readLineEscapeDelimiters() throws SQLServerException { + int quoteCount = 0; + StringBuilder sb = new StringBuilder(); + try { + int c; + while ((c = fileReader.read()) != -1) { + if((c == '\n' || c == '\r') && quoteCount % 2 == 0) { // newlines only end the record if we are not in quotes + fileReader.mark(1); + c = fileReader.read(); // we might have read \r of a \r\n, if so we need to read the \n as well + if(c != '\n') { + fileReader.reset(); // only delimited by \n, unread last char so it goes into the next record + } + break; + } + sb.append((char)c); + if( c == '"') { + quoteCount++; + } + } + if (c == -1 && quoteCount % 2 != 0) { // stream ended, but we are within quotes -- data problem + throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"),null,0,null); + } + if(c == -1) { // keep semantics of readLine() by returning a null when there is no more data + return null; + } + } catch (IOException e) { + throw new SQLServerException(e.getMessage(),null,0,e); + } + return sb.toString(); + } + private void initLoggerResources() { super.loggerPackageName = "com.microsoft.sqlserver.jdbc.SQLServerBulkCSVFileRecord"; } @@ -526,7 +567,7 @@ else if ((null != columnNames) && (columnNames.length >= positionInSource)) @Override public boolean next() throws SQLServerException { try { - currentLine = fileReader.readLine(); + currentLine = escapeDelimiters ? readLineEscapeDelimiters() : fileReader.readLine(); } catch (IOException e) { throw new SQLServerException(e.getMessage(), null, 0, e); } diff --git a/src/test/java/com/microsoft/sqlserver/jdbc/bulkCopy/BulkCopyCSVTest.java b/src/test/java/com/microsoft/sqlserver/jdbc/bulkCopy/BulkCopyCSVTest.java index 48e28b712..97b0e3f92 100644 --- a/src/test/java/com/microsoft/sqlserver/jdbc/bulkCopy/BulkCopyCSVTest.java +++ b/src/test/java/com/microsoft/sqlserver/jdbc/bulkCopy/BulkCopyCSVTest.java @@ -153,7 +153,7 @@ public void testEscapeColumnDelimitersCSV() throws Exception { /* * The list below is the copy of inputFileDelimiterEsc ape with quotes removed. */ - String[][] expectedEscaped = new String[11][4]; + String[][] expectedEscaped = new String[12][4]; expectedEscaped[0] = new String[] {"test", " test\"", "no@split", " testNoQuote", ""}; expectedEscaped[1] = new String[] {null, null, null, null, ""}; expectedEscaped[2] = new String[] {"\"", "test\"test", "test@\" test", null, ""}; @@ -166,6 +166,7 @@ public void testEscapeColumnDelimitersCSV() throws Exception { expectedEscaped[8] = new String[] {"1997", "Ford", "E350", "Super@ \"luxurious\" truck", ""}; expectedEscaped[9] = new String[] {"1997", "Ford", "E350", "E63", ""}; expectedEscaped[10] = new String[] {"1997", "Ford", "E350", " Super luxurious truck ", ""}; + expectedEscaped[11] = new String[] {"1997", "F\r\no\r\nr\r\nd", "E350", "\"Super\" \"luxurious\" \"truck\"", ""}; try (Connection con = getConnection(); Statement stmt = con.createStatement(); SQLServerBulkCopy bulkCopy = new SQLServerBulkCopy(con); diff --git a/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv b/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv index 10afd5d84..98ac9f114 100644 --- a/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv +++ b/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv @@ -9,3 +9,7 @@ 9@1997@Ford@E350@"Super@ ""luxurious"" truck"@ 10@1997@ "Ford" @E350@ "E63"@ 11@1997@Ford@E350@" Super luxurious truck "@ +12@1997@"F +o +r +d"@"E350"@"""Super"" ""luxurious"" ""truck"""@ \ No newline at end of file From 32656a6cdeb0b69660d0e357803e253004d8ad2e Mon Sep 17 00:00:00 2001 From: David Bennion Date: Tue, 26 Mar 2024 21:19:31 -0600 Subject: [PATCH 2/2] Format code --- .../jdbc/SQLServerBulkCSVFileRecord.java | 44 +++++++++---------- .../BulkCopyCSVTestInputDelimiterEscape.csv | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java b/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java index aaf40046b..25f99214b 100644 --- a/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java +++ b/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerBulkCSVFileRecord.java @@ -212,43 +212,43 @@ private void initFileReader(InputStreamReader sr, String encoding, String demlim } } - /* RFC4180 specifies that rules for quoted fields. It allows quoted string data to contain newlines data - provided the contents otherwise conforms to the rules for escaping quotes. For example, the following is valid: - "a","b","c" - "aaa","b <-- newline is retained in data field - bb","c" - "aa","bb","cc" - - We cannot simply use fileReader.readLine() to read these records but instead must continue reading until we reach - a newline that is not contained within quotes. - */ + /* + * RFC4180 specifies that rules for quoted fields. It allows quoted string data to contain newlines data + * provided the contents otherwise conforms to the rules for escaping quotes. For example, the following is valid: + * "a","b","c" + * "aaa","b <-- newline is retained in data field + * bb","c" + * "aa","bb","cc" + * We cannot simply use fileReader.readLine() to read these records but instead must continue reading until we reach + * a newline that is not contained within quotes. + */ private String readLineEscapeDelimiters() throws SQLServerException { int quoteCount = 0; StringBuilder sb = new StringBuilder(); try { int c; while ((c = fileReader.read()) != -1) { - if((c == '\n' || c == '\r') && quoteCount % 2 == 0) { // newlines only end the record if we are not in quotes - fileReader.mark(1); - c = fileReader.read(); // we might have read \r of a \r\n, if so we need to read the \n as well - if(c != '\n') { - fileReader.reset(); // only delimited by \n, unread last char so it goes into the next record - } - break; + if ((c == '\n' || c == '\r') && quoteCount % 2 == 0) { // newlines only end the record if we are not in quotes + fileReader.mark(1); + c = fileReader.read(); // we might have read \r of a \r\n, if so we need to read the \n as well + if (c != '\n') { + fileReader.reset(); // only delimited by \n, unread last char so it goes into the next record + } + break; } - sb.append((char)c); - if( c == '"') { + sb.append((char) c); + if (c == '"') { quoteCount++; } } if (c == -1 && quoteCount % 2 != 0) { // stream ended, but we are within quotes -- data problem - throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"),null,0,null); + throw new SQLServerException(SQLServerException.getErrString("R_InvalidCSVQuotes"), null, 0, null); } - if(c == -1) { // keep semantics of readLine() by returning a null when there is no more data + if (c == -1) { // keep semantics of readLine() by returning a null when there is no more data return null; } } catch (IOException e) { - throw new SQLServerException(e.getMessage(),null,0,e); + throw new SQLServerException(e.getMessage(), null, 0, e); } return sb.toString(); } diff --git a/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv b/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv index 98ac9f114..9425b0a1d 100644 --- a/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv +++ b/src/test/resources/BulkCopyCSVTestInputDelimiterEscape.csv @@ -12,4 +12,4 @@ 12@1997@"F o r -d"@"E350"@"""Super"" ""luxurious"" ""truck"""@ \ No newline at end of file +d"@"E350"@"""Super"" ""luxurious"" ""truck"""@