diff --git a/Sources/Document.swift b/Sources/Document.swift index 9ece087..f17449d 100644 --- a/Sources/Document.swift +++ b/Sources/Document.swift @@ -50,9 +50,9 @@ public class Document: InputHandlerDelegate { - Parameter data: Data which comprises of the entire document as a UTF-8 string. - Parameter dialect: Dialect from which to parse against. */ - public convenience init(data: Data, dialect: Dialect = Dialect()) throws { + public convenience init(data: Data, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws { let parser = ImportParser(dialect: dialect) - var allRows = try parser.import(data: data) + var allRows = try parser.import(data: data, encoding: encoding) if let row = try parser.flushRow() { allRows.append(row) } @@ -64,9 +64,9 @@ public class Document: InputHandlerDelegate { - Note: Although this streams input data from the `FileHandle` the resulting document is still the full physical representation of the data. */ - public convenience init(fileHandle: FileHandle, dialect: Dialect = Dialect()) throws { + public convenience init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws { self.init(dialect: dialect) - let inputHandler = InputHandler(fileHandle: fileHandle, dialect: dialect) + let inputHandler = InputHandler(fileHandle: fileHandle, encoding: encoding, dialect: dialect) inputHandler.delegate = self try inputHandler.readToEndOfFile() } diff --git a/Sources/ImportParser.swift b/Sources/ImportParser.swift old mode 100644 new mode 100755 index cef1f53..b11f1df --- a/Sources/ImportParser.swift +++ b/Sources/ImportParser.swift @@ -25,7 +25,7 @@ public class ImportParser { } /** - - badEncoding: Indicates input could not be decoded. + - badEncoding: Indicates input could not be decoded from the specified encoding. - uncaughtCharacter: An unexpected character at a 1-indexed row number. - uneven: Encountered a row whose number of values is mismatched relative to other rows. All rows are expected to contain the same number of values. */ @@ -50,8 +50,8 @@ public class ImportParser { - Returns: Parsed rows. An incomplete row is not returned prematurely until the data is provided or a flush command is issued. - Note: It is best practice to call the flush method after having parsed the last of the input data. */ - public func `import`(data: Data) throws -> [Row] { - guard let string = String(data: data, encoding: String.Encoding.utf8) else { + public func `import`(data: Data, encoding: String.Encoding = .utf8) throws -> [Row] { + guard let string = String(data: data, encoding: encoding) else { throw ImportError.badEncoding } diff --git a/Sources/InputHandler.swift b/Sources/InputHandler.swift index ce67a71..4c3a35b 100644 --- a/Sources/InputHandler.swift +++ b/Sources/InputHandler.swift @@ -57,6 +57,7 @@ public class InputHandler { private let maxRetries: Int private var retries: Int = 0 private let fileHandle: FileHandle + private let encoding: String.Encoding private var parser: ImportParser /** @@ -64,23 +65,24 @@ public class InputHandler { - Parameter dialect: Dialect from which to parse against. - Parameter maxRetries: Maximum number of allowed consecutive retries */ - public init(fileHandle: FileHandle, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) { + public init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) { self.fileHandle = fileHandle + self.encoding = encoding self.dialect = dialect self.maxRetries = maxRetries self.parser = ImportParser(dialect: dialect) } - public convenience init(from url: URL, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws { + public convenience init(from url: URL, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws { let fileHandle = try FileHandle(forReadingFrom: url) - self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries) + self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries) } - public convenience init?(atPath path: String, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) { + public convenience init?(atPath path: String, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) { guard let fileHandle = FileHandle(forReadingAtPath: path) else { return nil } - self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries) + self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries) } deinit { @@ -103,7 +105,7 @@ public class InputHandler { var rows = [Row]() do { - rows = try self.parser.import(data: data) + rows = try self.parser.import(data: data, encoding: encoding) } catch ImportParser.ImportError.badEncoding { self.retries += 1 // We may have received incomplete data that broke UTF-8 decoding due to variable byte widths diff --git a/Tests/DialectalCSVTests/ImportTests.swift b/Tests/DialectalCSVTests/ImportTests.swift index 75049fa..95a3a5f 100644 --- a/Tests/DialectalCSVTests/ImportTests.swift +++ b/Tests/DialectalCSVTests/ImportTests.swift @@ -4,6 +4,7 @@ import XCTest class ImportTests: XCTestCase { static var allTests = [ + ("testBadEncoding", testBadEncoding), ("testEscapeCharacter", testEscapeCharacter), ("testEscapeDoubleQuote", testEscapeDoubleQuote), ("testHeadersOnly", testHeadersOnly), @@ -21,10 +22,24 @@ class ImportTests: XCTestCase { ("testTrailingComma", testTrailingComma), ("testUnescapedQuotes", testUnescapedQuotes), ("testUnquotedHeaders", testUnquotedHeaders), - ("testBadEncoding", testBadEncoding), - ("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit) + ("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit), + ("testWestern1252Encoding", testWestern1252Encoding) ] + func testBadEncoding() throws { + let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv") + let fileHandle = try FileHandle(forReadingFrom: fileURL) + do { + _ = try Document(fileHandle: fileHandle) + } catch ImportParser.ImportError.badEncoding { + return + } catch { + XCTFail() + return + } + XCTFail() + } + func testEscapeCharacter() { let data = Utility.fixture(named: "escapeCharacter.csv") var dialect = Dialect() @@ -290,20 +305,6 @@ class ImportTests: XCTestCase { XCTAssertEqual(document.header![1], HeaderFields.author.rawValue + " name") } - func testBadEncoding() throws { - let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv") - let fileHandle = try FileHandle(forReadingFrom: fileURL) - do { - _ = try Document(fileHandle: fileHandle) - } catch ImportParser.ImportError.badEncoding { - return - } catch { - XCTFail() - return - } - XCTFail() - } - func testVariableWidthEncodedStreamSplit() throws { let inputURL = Utility.fixtureURL(named: "variableWidthEncodedStreamSplit.csv") var dialect = Dialect() @@ -311,31 +312,31 @@ class ImportTests: XCTestCase { let inputFileHandle = try FileHandle(forReadingFrom: inputURL) var inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect) - var handler = SpyInputHandlerDelegate() - inputHandler.delegate = handler + var outputSpy = SpyInputHandlerDelegate() + inputHandler.delegate = outputSpy for numberOfBytes in 1...4 { try inputHandler.readToEndOfFile(length: numberOfBytes) - XCTAssertEqual(handler.records.count, 2) + XCTAssertEqual(outputSpy.records.count, 2) - let first = try XCTUnwrap(handler.records[safe: 0]) + let first = try XCTUnwrap(outputSpy.records[safe: 0]) XCTAssertEqual(first.count, 4) - XCTAssertEqual(first[0], "éab") - XCTAssertEqual(first[1], "abé") - XCTAssertEqual(first[2], "aéb") - XCTAssertEqual(first[3], "abcé") + XCTAssertEqual(first[safe: 0], "éab") + XCTAssertEqual(first[safe: 1], "abé") + XCTAssertEqual(first[safe: 2], "aéb") + XCTAssertEqual(first[safe: 3], "abcé") - let second = try XCTUnwrap(handler.records[safe: 1]) + let second = try XCTUnwrap(outputSpy.records[safe: 1]) XCTAssertEqual(second.count, 4) - XCTAssertEqual(second[0], "123") - XCTAssertEqual(second[1], "456") - XCTAssertEqual(second[2], "789") - XCTAssertEqual(second[3], "321") + XCTAssertEqual(second[safe: 0], "123") + XCTAssertEqual(second[safe: 1], "456") + XCTAssertEqual(second[safe: 2], "789") + XCTAssertEqual(second[safe: 3], "321") } inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect, maxRetries: 0) - handler = SpyInputHandlerDelegate() - inputHandler.delegate = handler + outputSpy = SpyInputHandlerDelegate() + inputHandler.delegate = outputSpy for numberOfBytes in 1...4 { do { @@ -349,4 +350,18 @@ class ImportTests: XCTestCase { } } + func testWestern1252Encoding() throws { + let inputURL = Utility.fixtureURL(named: "western1252Encoded.csv") + let inputFileHandle = try FileHandle(forReadingFrom: inputURL) + let inputHandler = InputHandler(fileHandle: inputFileHandle, encoding: .windowsCP1252) + let outputSpy = SpyInputHandlerDelegate() + inputHandler.delegate = outputSpy + + try inputHandler.readToEndOfFile() + XCTAssertEqual(outputSpy.records.count, 1) + let first = try XCTUnwrap(outputSpy.records.first) + XCTAssertEqual(first[safe: 0], "Always bear in mind that your own resolütion to succeed is more important than any other.") + XCTAssertEqual(first[safe: 1], "Abraham Lincoln") + } + }