Skip to content

Commit

Permalink
GH-7: Optionally specify import data encoding.
Browse files Browse the repository at this point in the history
  • Loading branch information
slythfox committed Dec 4, 2023
1 parent 4e04316 commit 89c6c11
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 44 deletions.
8 changes: 4 additions & 4 deletions Sources/Document.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ public class Document: InputHandlerDelegate {
- Parameter data: Data which comprises the entire document, encoded as text.
- Parameter encoding: Encoding used to decode `data` into text. Defaults to UTF-8.
- Parameter dialect: Dialect from which to parse against.
*/
public convenience init(data: Data, dialect: Dialect = Dialect()) throws {
public convenience init(data: Data, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws {
let parser = ImportParser(dialect: dialect)
var allRows = try parser.import(data: data)
var allRows = try parser.import(data: data, encoding: encoding)
if let row = try parser.flushRow() {
allRows.append(row)
}
Expand All @@ -64,9 +64,9 @@ public class Document: InputHandlerDelegate {
- Note: Although this streams input data from the `FileHandle`, the resulting document is still the full physical representation of the data.
*/
public convenience init(fileHandle: FileHandle, dialect: Dialect = Dialect()) throws {
public convenience init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect()) throws {
self.init(dialect: dialect)
let inputHandler = InputHandler(fileHandle: fileHandle, dialect: dialect)
let inputHandler = InputHandler(fileHandle: fileHandle, encoding: encoding, dialect: dialect)
inputHandler.delegate = self
try inputHandler.readToEndOfFile()
}
Expand Down
6 changes: 3 additions & 3 deletions Sources/ImportParser.swift
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class ImportParser {
}

/**
- badEncoding: Indicates input could not be decoded.
- badEncoding: Indicates input could not be decoded from the specified encoding.
- uncaughtCharacter: An unexpected character at a 1-indexed row number.
- uneven: Encountered a row whose number of values is mismatched relative to other rows. All rows are expected to contain the same number of values.
*/
Expand All @@ -50,8 +50,8 @@ public class ImportParser {
- Returns: Parsed rows. An incomplete row is not returned prematurely until the data is provided or a flush command is issued.
- Note: It is best practice to call the flush method after having parsed the last of the input data.
*/
public func `import`(data: Data) throws -> [Row] {
guard let string = String(data: data, encoding: String.Encoding.utf8) else {
public func `import`(data: Data, encoding: String.Encoding = .utf8) throws -> [Row] {
guard let string = String(data: data, encoding: encoding) else {
throw ImportError.badEncoding
}

Expand Down
14 changes: 8 additions & 6 deletions Sources/InputHandler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -57,30 +57,32 @@ public class InputHandler {
private let maxRetries: Int
private var retries: Int = 0
private let fileHandle: FileHandle
private let encoding: String.Encoding
private var parser: ImportParser

/**
- Parameter fileHandle: FileHandle for reading. InputHandler should be solely responsible for controlling seeking behavior during its lifetime. The FileHandle's seek position should be at the beginning.
- Parameter encoding: Encoding used to decode the data read from the file handle. Defaults to UTF-8.
- Parameter dialect: Dialect from which to parse against.
- Parameter maxRetries: Maximum number of allowed consecutive retries.
*/
public init(fileHandle: FileHandle, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
public init(fileHandle: FileHandle, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
self.fileHandle = fileHandle
self.encoding = encoding
self.dialect = dialect
self.maxRetries = maxRetries
self.parser = ImportParser(dialect: dialect)
}

public convenience init(from url: URL, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws {
public convenience init(from url: URL, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) throws {
let fileHandle = try FileHandle(forReadingFrom: url)
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries)
}

public convenience init?(atPath path: String, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
public convenience init?(atPath path: String, encoding: String.Encoding = .utf8, dialect: Dialect = Dialect(), maxRetries: Int = InputHandler.defaultMaxRetries) {
guard let fileHandle = FileHandle(forReadingAtPath: path) else {
return nil
}
self.init(fileHandle: fileHandle, dialect: dialect, maxRetries: maxRetries)
self.init(fileHandle: fileHandle, encoding: encoding, dialect: dialect, maxRetries: maxRetries)
}

deinit {
Expand All @@ -103,7 +105,7 @@ public class InputHandler {

var rows = [Row]()
do {
rows = try self.parser.import(data: data)
rows = try self.parser.import(data: data, encoding: encoding)
} catch ImportParser.ImportError.badEncoding {
self.retries += 1
// We may have received incomplete data that broke decoding because the encoding (e.g. UTF-8) uses variable byte widths
Expand Down
77 changes: 46 additions & 31 deletions Tests/DialectalCSVTests/ImportTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import XCTest
class ImportTests: XCTestCase {

static var allTests = [
("testBadEncoding", testBadEncoding),
("testEscapeCharacter", testEscapeCharacter),
("testEscapeDoubleQuote", testEscapeDoubleQuote),
("testHeadersOnly", testHeadersOnly),
Expand All @@ -21,10 +22,24 @@ class ImportTests: XCTestCase {
("testTrailingComma", testTrailingComma),
("testUnescapedQuotes", testUnescapedQuotes),
("testUnquotedHeaders", testUnquotedHeaders),
("testBadEncoding", testBadEncoding),
("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit)
("testVariableWidthEncodedStreamSplit", testVariableWidthEncodedStreamSplit),
("testWestern1252Encoding", testWestern1252Encoding)
]

/**
 Verifies that reading the `western1252Encoded.csv` fixture through `Document(fileHandle:)`
 without specifying an encoding throws `ImportParser.ImportError.badEncoding`.
 */
func testBadEncoding() throws {
    let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv")
    let fileHandle = try FileHandle(forReadingFrom: fileURL)

    // XCTAssertThrowsError fails the test when nothing is thrown; the handler
    // below additionally fails it when the thrown error is not `badEncoding`.
    XCTAssertThrowsError(
        try Document(fileHandle: fileHandle),
        "Expected the import to throw ImportParser.ImportError.badEncoding"
    ) { error in
        guard case ImportParser.ImportError.badEncoding = error else {
            XCTFail("Expected ImportParser.ImportError.badEncoding but got \(error)")
            return
        }
    }
}

func testEscapeCharacter() {
let data = Utility.fixture(named: "escapeCharacter.csv")
var dialect = Dialect()
Expand Down Expand Up @@ -290,52 +305,38 @@ class ImportTests: XCTestCase {
XCTAssertEqual(document.header![1], HeaderFields.author.rawValue + " name")
}

/**
 Verifies that reading the `western1252Encoded.csv` fixture through `Document(fileHandle:)`
 without specifying an encoding throws `ImportParser.ImportError.badEncoding`.
 */
func testBadEncoding() throws {
    let fileURL = Utility.fixtureURL(named: "western1252Encoded.csv")
    let fileHandle = try FileHandle(forReadingFrom: fileURL)

    // XCTAssertThrowsError fails the test when nothing is thrown; the handler
    // below additionally fails it when the thrown error is not `badEncoding`.
    XCTAssertThrowsError(
        try Document(fileHandle: fileHandle),
        "Expected the import to throw ImportParser.ImportError.badEncoding"
    ) { error in
        guard case ImportParser.ImportError.badEncoding = error else {
            XCTFail("Expected ImportParser.ImportError.badEncoding but got \(error)")
            return
        }
    }
}

func testVariableWidthEncodedStreamSplit() throws {
let inputURL = Utility.fixtureURL(named: "variableWidthEncodedStreamSplit.csv")
var dialect = Dialect()
dialect.header = false

let inputFileHandle = try FileHandle(forReadingFrom: inputURL)
var inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect)
var handler = SpyInputHandlerDelegate()
inputHandler.delegate = handler
var outputSpy = SpyInputHandlerDelegate()
inputHandler.delegate = outputSpy

for numberOfBytes in 1...4 {
try inputHandler.readToEndOfFile(length: numberOfBytes)
XCTAssertEqual(handler.records.count, 2)
XCTAssertEqual(outputSpy.records.count, 2)

let first = try XCTUnwrap(handler.records[safe: 0])
let first = try XCTUnwrap(outputSpy.records[safe: 0])
XCTAssertEqual(first.count, 4)
XCTAssertEqual(first[0], "éab")
XCTAssertEqual(first[1], "abé")
XCTAssertEqual(first[2], "aéb")
XCTAssertEqual(first[3], "abcé")
XCTAssertEqual(first[safe: 0], "éab")
XCTAssertEqual(first[safe: 1], "abé")
XCTAssertEqual(first[safe: 2], "aéb")
XCTAssertEqual(first[safe: 3], "abcé")

let second = try XCTUnwrap(handler.records[safe: 1])
let second = try XCTUnwrap(outputSpy.records[safe: 1])
XCTAssertEqual(second.count, 4)
XCTAssertEqual(second[0], "123")
XCTAssertEqual(second[1], "456")
XCTAssertEqual(second[2], "789")
XCTAssertEqual(second[3], "321")
XCTAssertEqual(second[safe: 0], "123")
XCTAssertEqual(second[safe: 1], "456")
XCTAssertEqual(second[safe: 2], "789")
XCTAssertEqual(second[safe: 3], "321")
}

inputHandler = InputHandler(fileHandle: inputFileHandle, dialect: dialect, maxRetries: 0)
handler = SpyInputHandlerDelegate()
inputHandler.delegate = handler
outputSpy = SpyInputHandlerDelegate()
inputHandler.delegate = outputSpy

for numberOfBytes in 1...4 {
do {
Expand All @@ -349,4 +350,18 @@ class ImportTests: XCTestCase {
}
}

/**
 Verifies that the `western1252Encoded.csv` fixture is parsed into a single record
 with the expected values when `.windowsCP1252` is passed as the encoding.
 */
func testWestern1252Encoding() throws {
    let spy = SpyInputHandlerDelegate()
    let handle = try FileHandle(forReadingFrom: Utility.fixtureURL(named: "western1252Encoded.csv"))

    let handler = InputHandler(fileHandle: handle, encoding: .windowsCP1252)
    handler.delegate = spy
    try handler.readToEndOfFile()

    // Exactly one record is expected from the fixture.
    XCTAssertEqual(spy.records.count, 1)

    let record = try XCTUnwrap(spy.records.first)
    XCTAssertEqual(record[safe: 0], "Always bear in mind that your own resolütion to succeed is more important than any other.")
    XCTAssertEqual(record[safe: 1], "Abraham Lincoln")
}

}

0 comments on commit 89c6c11

Please sign in to comment.