Skip to content

Commit

Permalink
Fix duplicate column collisions for CSV and Arrow types
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Stein <steinlink@gmail.com>
  • Loading branch information
texodus committed Dec 8, 2024
1 parent da17600 commit d331eb3
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 5 deletions.
36 changes: 31 additions & 5 deletions cpp/perspective/src/cpp/arrow_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,33 @@

namespace perspective::apachearrow {

/**
 * Returns a table whose column names are all distinct.
 *
 * Column names are visited in order. A name that is already taken has "*"
 * appended until it no longer collides with any earlier (possibly renamed)
 * column, so `A, A, A*` becomes `A, A*, A**`.
 *
 * @param input the table whose column names are checked.
 * @return `input` unchanged when no name collided, otherwise the table
 *     produced by `RenameColumns` with the de-duplicated names.
 */
std::shared_ptr<::arrow::Table>
deduplicate_table(std::shared_ptr<::arrow::Table> input) {
    auto columns = input->ColumnNames();
    std::set<std::string> columns_seen;

    // Must start out false: it is only ever set on a collision and it gates
    // the rename below, so leaving it uninitialised is an indeterminate read
    // whenever the input has no duplicate names.
    bool is_changed = false;
    for (auto& column : columns) {
        // `insert` reports whether the name was new, so one call both tests
        // for a collision and records the final name.
        while (!columns_seen.insert(column).second) {
            column += "*";
            is_changed = true;
        }
    }

    if (is_changed) {
        // NOTE(review): the value returned by `RenameColumns` is dereferenced
        // without a status check, as before; `columns` has one entry per
        // existing column, which presumably is all it validates - confirm.
        input = *input->RenameColumns(columns);
    }

    return input;
}

void
load_stream(
const std::uint8_t* ptr,
Expand All @@ -38,6 +65,7 @@ load_stream(
arrow::io::BufferReader buffer_reader(
reinterpret_cast<const std::uint8_t*>(ptr), length
);

auto status = arrow::ipc::RecordBatchStreamReader::Open(&buffer_reader);
if (!status.ok()) {
std::stringstream ss;
Expand All @@ -54,7 +82,7 @@ load_stream(
PSP_COMPLAIN_AND_ABORT(ss.str());
};

table = *status5;
table = deduplicate_table(*status5);
}
}

Expand Down Expand Up @@ -101,7 +129,7 @@ load_file(
<< status3.status().ToString() << "\n";
PSP_COMPLAIN_AND_ABORT(ss.str());
};
table = *status3;
table = deduplicate_table(std::move(*status3));
};
}

Expand Down Expand Up @@ -191,11 +219,9 @@ ArrowLoader::init_csv(
std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>&
psp_schema
) {
m_table = csvToTable(csv, is_update, psp_schema);

m_table = deduplicate_table(csvToTable(csv, is_update, psp_schema));
std::shared_ptr<arrow::Schema> schema = m_table->schema();
std::vector<std::shared_ptr<arrow::Field>> fields = schema->fields();

for (const auto& field : fields) {
m_names.push_back(field->name());
m_types.push_back(convert_type(field->type()->name()));
Expand Down
20 changes: 20 additions & 0 deletions rust/perspective-js/test/js/constructors.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,26 @@ function validate_typed_array(typed_array, column_data) {
table.delete();
});

// Two CSV columns share the header "A" but hold a string and a number; the
// second one is expected to come back under the name "A*".
test("Handles duplicate column names with different types", async function () {
    const input = "A,A\ntest,1";
    const expected = [{ A: "test", "A*": 1 }];

    const table = await perspective.table(input);
    const view = await table.view();
    const rows = await view.to_json();
    expect(rows).toEqual(expected);

    view.delete();
    table.delete();
});

// The header already contains "A*", the name the second "A" is renamed to,
// so the literal "A*" column is expected to be pushed on to "A**".
test("Handles duplicate column names with rename collisions", async function () {
    const input = "A,A,A*\ntest,1,2";
    const expected = [{ A: "test", "A*": 1, "A**": 2 }];

    const table = await perspective.table(input);
    const view = await table.view();
    const rows = await view.to_json();
    expect(rows).toEqual(expected);

    view.delete();
    table.delete();
});

test("Handles strings with quotation characters and commas", async function () {
let table = await perspective.table({ x: "string", y: "integer" });
table.update([
Expand Down

0 comments on commit d331eb3

Please sign in to comment.