Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream import #1298

Merged
merged 5 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/common/stl.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ export namespace std {
using std::ostream;
using std::ofstream;
using std::ifstream;
using std::fstream;
using std::ios;

using std::align;
Expand Down
4 changes: 3 additions & 1 deletion src/executor/operator/physical_export.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,11 @@ SizeT PhysicalExport::ExportToJSONL(QueryContext *query_context, ExportOperatorS

SizeT row_count{0};
Map<SegmentID, SegmentSnapshot>& segment_block_index_ref = block_index_->segment_block_index_;

LOG_DEBUG(fmt::format("Going to export segment count: {}", segment_block_index_ref.size()));
for(auto& [segment_id, segment_snapshot]: segment_block_index_ref) {
LOG_DEBUG(fmt::format("Export segment_id: {}", segment_id));
SizeT block_count = segment_snapshot.block_map_.size();
LOG_DEBUG(fmt::format("Export segment_id: {}, with block count: {}", segment_id, block_count));
for(SizeT block_idx = 0; block_idx < block_count; ++ block_idx) {
LOG_DEBUG(fmt::format("Export block_idx: {}", block_idx));
BlockEntry *block_entry = segment_snapshot.block_map_[block_idx];
Expand Down
85 changes: 32 additions & 53 deletions src/executor/operator/physical_import.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ import value;
import catalog;
import catalog_delta_entry;
import build_fast_rough_filter_task;
import stream_io;

namespace infinity {

Expand Down Expand Up @@ -428,34 +429,15 @@ void PhysicalImport::ImportCSV(QueryContext *query_context, ImportOperatorState
}

void PhysicalImport::ImportJSONL(QueryContext *query_context, ImportOperatorState *import_op_state) {
LocalFileSystem fs;
auto [file_handler, status] = fs.OpenFile(file_path_, FileFlags::READ_FLAG, FileLockType::kReadLock);
if(!status.ok()) {
UnrecoverableError(status.message());
}
DeferFn file_defer([&]() { fs.Close(*file_handler); });

SizeT file_size = fs.GetFileSize(*file_handler);
String jsonl_str(file_size + 1, 0);
SizeT read_n = file_handler->Read(jsonl_str.data(), file_size);
if (read_n != file_size) {
String error_message = fmt::format("Read file size {} doesn't match with file size {}.", read_n, file_size);
LOG_CRITICAL(error_message);
UnrecoverableError(error_message);
}

if (read_n == 0) {
auto result_msg = MakeUnique<String>(fmt::format("Empty JSONL file, IMPORT 0 Rows"));
import_op_state->result_msg_ = std::move(result_msg);
return;
}
StreamIO stream_io;
stream_io.Init(file_path_, FileFlags::READ_FLAG);
DeferFn file_defer([&]() { stream_io.Close(); });

Txn *txn = query_context->GetTxn();
u64 segment_id = Catalog::GetNextSegmentID(table_entry_);
SharedPtr<SegmentEntry> segment_entry = SegmentEntry::NewSegmentEntry(table_entry_, segment_id, txn);
UniquePtr<BlockEntry> block_entry = BlockEntry::NewBlockEntry(segment_entry.get(), 0, 0, table_entry_->ColumnCount(), txn);

SizeT start_pos = 0;
Vector<ColumnVector> column_vectors;
for (SizeT i = 0; i < table_entry_->ColumnCount(); ++i) {
auto *block_column_entry = block_entry->GetColumnBlockEntry(i);
Expand All @@ -464,8 +446,34 @@ void PhysicalImport::ImportJSONL(QueryContext *query_context, ImportOperatorStat

SizeT row_count{0};
while (true) {
if (start_pos >= file_size) {
String json_str;
if (stream_io.ReadLine(json_str)) {
nlohmann::json line_json = nlohmann::json::parse(json_str);

JSONLRowHandler(line_json, column_vectors);
block_entry->IncreaseRowCount(1);
++row_count;

if (block_entry->GetAvailableCapacity() <= 0) {
LOG_DEBUG(fmt::format("Block {} saved, total rows: {}", block_entry->block_id(), row_count));
segment_entry->AppendBlockEntry(std::move(block_entry));
if (segment_entry->Room() <= 0) {
LOG_DEBUG(fmt::format("Segment {} saved, total rows: {}", segment_entry->segment_id(), row_count));
SaveSegmentData(table_entry_, txn, segment_entry);
u64 segment_id = Catalog::GetNextSegmentID(table_entry_);
segment_entry = SegmentEntry::NewSegmentEntry(table_entry_, segment_id, txn);
}

block_entry = BlockEntry::NewBlockEntry(segment_entry.get(), segment_entry->GetNextBlockID(), 0, table_entry_->ColumnCount(), txn);
column_vectors.clear();
for (SizeT i = 0; i < table_entry_->ColumnCount(); ++i) {
auto *block_column_entry = block_entry->GetColumnBlockEntry(i);
column_vectors.emplace_back(block_column_entry->GetColumnVector(txn->buffer_mgr()));
}
}
} else {
if (block_entry->row_count() == 0) {
column_vectors.clear();
std::move(*block_entry).Cleanup();
} else {
segment_entry->AppendBlockEntry(std::move(block_entry));
Expand All @@ -474,39 +482,10 @@ void PhysicalImport::ImportJSONL(QueryContext *query_context, ImportOperatorStat
std::move(*segment_entry).Cleanup();
} else {
SaveSegmentData(table_entry_, txn, segment_entry);
LOG_DEBUG(fmt::format("Last segment {} saved, total rows: {}", segment_entry->segment_id(), row_count));
}
break;
}
SizeT end_pos = jsonl_str.find('\n', start_pos);
if (end_pos == String::npos) {
end_pos = file_size;
}
std::string_view json_sv(jsonl_str.data() + start_pos, end_pos - start_pos);
start_pos = end_pos + 1;

nlohmann::json line_json = nlohmann::json::parse(json_sv);

JSONLRowHandler(line_json, column_vectors);
block_entry->IncreaseRowCount(1);
++ row_count;

if (block_entry->GetAvailableCapacity() <= 0) {
LOG_DEBUG(fmt::format("Block {} saved", block_entry->block_id()));
segment_entry->AppendBlockEntry(std::move(block_entry));
if (segment_entry->Room() <= 0) {
LOG_DEBUG(fmt::format("Segment {} saved", segment_entry->segment_id()));
SaveSegmentData(table_entry_, txn, segment_entry);
u64 segment_id = Catalog::GetNextSegmentID(table_entry_);
segment_entry = SegmentEntry::NewSegmentEntry(table_entry_, segment_id, txn);
}

block_entry = BlockEntry::NewBlockEntry(segment_entry.get(), segment_entry->GetNextBlockID(), 0, table_entry_->ColumnCount(), txn);
column_vectors.clear();
for (SizeT i = 0; i < table_entry_->ColumnCount(); ++i) {
auto *block_column_entry = block_entry->GetColumnBlockEntry(i);
column_vectors.emplace_back(block_column_entry->GetColumnVector(txn->buffer_mgr()));
}
}
}

auto result_msg = MakeUnique<String>(fmt::format("IMPORT {} Rows", row_count));
Expand Down
66 changes: 66 additions & 0 deletions src/storage/io/stream_io.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#include <fstream>

module stream_io;

import stl;
import logger;
import status;
import file_system_type;
import infinity_exception;
import third_party;

namespace infinity {

// Out-of-line definition of the destructor; it is compiler-generated (defaulted)
// and performs no explicit work of its own beyond destroying the members.
StreamIO::~StreamIO() = default;

void StreamIO::Init(const String& file_name, u8 flags) {
bool reader_ = flags & FileFlags::READ_FLAG;
bool writer_ = flags & FileFlags::WRITE_FLAG;
if (reader_ && writer_) {
file_.open(file_name, std::ios::in | std::ios::out);
} else if (reader_) {
file_.open(file_name, std::ios::in);
} else if (writer_) {
file_.open(file_name, std::ios::out);
} else {
Status status = Status::InvalidCommand("Not reachable");
LOG_ERROR(status.message());
RecoverableError(status);
}

if (!file_.is_open()) {
Status status = Status::IOError(fmt::format("{} can't open", file_name));
LOG_ERROR(file_name);
RecoverableError(status);
}
}

// Reads the next line from the underlying stream into `line`.
//
// Returns true when the extraction leaves the stream in a non-failed state,
// and false otherwise (for example once no further input is available).
bool StreamIO::ReadLine(String& line) {
    // getline returns the stream itself; its boolean conversion is the
    // success indicator, so no explicit if/else is required.
    return static_cast<bool>(getline(file_, line));
}

// Closes the underlying file stream by forwarding to its close().
void StreamIO::Close() {
file_.close();
}

} // namespace infinity
42 changes: 42 additions & 0 deletions src/storage/io/stream_io.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#include <fstream>

export module stream_io;

import stl;
import status;

namespace infinity {

// Thin wrapper around a std::fstream offering open / read-line / close.
// Typical use as seen in the JSONL import path: Init(path, flags), then
// ReadLine() in a loop until it returns false, then Close().
export class StreamIO {

public:
StreamIO() = default;
// Declared here, defined (defaulted) in the implementation unit.
~StreamIO();

// Opens `file_name`; `flags` is a bitmask of FileFlags values selecting the mode.
void Init(const String& file_name, u8 flags);
// Reads one line into `line`; returns false when the read does not succeed.
bool ReadLine(String& line);
// Closes the underlying stream.
void Close();

private:
// The wrapped file stream all operations act on.
std::fstream file_;
// Flags meant to mirror the read / write mode requested at open time;
// both are false until set.
bool reader_{false};
bool writer_{false};
};

} // namespace infinity
Loading