-
Notifications
You must be signed in to change notification settings - Fork 411
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add ReorganizeBlockInputStream to reorganize the boundary of blocks (#330)
- Loading branch information
1 parent
3484309
commit bf87a3a
Showing
3 changed files
with
129 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
119 changes: 119 additions & 0 deletions
119
dbms/src/Storages/DeltaMerge/ReorganizeBlockInputStream.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#pragma once | ||
|
||
#include <DataStreams/IBlockInputStream.h> | ||
#include <Storages/DeltaMerge/DeltaMergeHelpers.h> | ||
|
||
namespace DB | ||
{ | ||
namespace DM | ||
{ | ||
|
||
/// Reorganize the boundary of blocks.
/// Rows that share one pk value are merged into a single output block, so a
/// pk value never straddles two consecutive blocks returned by `read()`.
/// Note that child must be a sorted input stream with increasing pk column.
class ReorganizeBlockInputStream final : public IBlockInputStream
{
public:
    /// @param child           sorted upstream; must not be null (asserted below).
    /// @param pk_column_name_ name of the pk column used to find block boundaries.
    /// @throws Exception(LOGICAL_ERROR) under DM_RUN_CHECK if the child's header
    ///         has no such pk column.
    ReorganizeBlockInputStream(BlockInputStreamPtr child, String pk_column_name_)
        : sorted_input_stream(std::move(child)), pk_column_name(std::move(pk_column_name_))
    {
        assert(sorted_input_stream != nullptr);
        cur_block = {};
        // Pre-fetch one block so that `read()` can always look one block ahead.
        next_block = ::DB::DM::readNextBlock(sorted_input_stream);
        if constexpr (DM_RUN_CHECK)
        {
            // Sanity check for existence of pk column
            Block header = sorted_input_stream->getHeader();
            if (!header.has(pk_column_name))
            {
                throw Exception("Try to write block to Chunk without pk column", ErrorCodes::LOGICAL_ERROR);
            }
        }
    }

    String getName() const override { return "ReorganizeBlockBoundary"; }
    Block getHeader() const override { return sorted_input_stream->getHeader(); }

    /// Returns the pre-fetched block, extended with the leading rows of the
    /// following block(s) whose pk equals its last pk. Returns an empty Block
    /// when the upstream is exhausted.
    Block read() override
    {
        // Promote the look-ahead block to the current one; empty means EOF.
        cur_block = next_block;
        if (!cur_block)
            return cur_block;

        while (true)
        {
            next_block = ::DB::DM::readNextBlock(sorted_input_stream);

            // Number of leading rows of `next_block` sharing cur_block's last pk.
            const size_t cut_offset = findCutOffsetInNextBlock(cur_block, next_block, pk_column_name);
            if (unlikely(cut_offset == 0))
                // There is no pk overlap between `cur_block` and `next_block`, or `next_block` is empty, just return `cur_block`.
                return cur_block;
            else
            {
                const size_t next_block_nrows = next_block.rows();
                for (size_t col_idx = 0; col_idx != cur_block.columns(); ++col_idx)
                {
                    auto & cur_col_with_name = cur_block.getByPosition(col_idx);
                    auto & next_col_with_name = next_block.getByPosition(col_idx);
                    // NOTE(review): mutates the column in place via const_cast —
                    // assumes `cur_block`'s columns are exclusively owned here;
                    // confirm no other holder shares these ColumnPtrs.
                    auto * cur_col_raw = const_cast<IColumn *>(cur_col_with_name.column.get());
                    // Append the overlapping prefix of `next_block` to `cur_block`.
                    cur_col_raw->insertRangeFrom(*next_col_with_name.column, 0, cut_offset);

                    if (cut_offset != next_block_nrows)
                    {
                        // TODO: we can track the valid range instead of copying data.
                        size_t nrows_to_copy = next_block_nrows - cut_offset;
                        // Pop front `cut_offset` elems from `next_col_with_name`
                        assert(next_block_nrows == next_col_with_name.column->size());
                        MutableColumnPtr cutted_next_column = next_col_with_name.column->cloneEmpty();
                        cutted_next_column->insertRangeFrom(*next_col_with_name.column, cut_offset, nrows_to_copy);
                        next_col_with_name.column = cutted_next_column->getPtr();
                    }
                }
                if (cut_offset != next_block_nrows)
                {
                    // We merge some rows to `cur_block`, return it.
                    return cur_block;
                }
                // else we merge all rows from `next_block` to `cur_block`, continue to check if we should merge more blocks.
            }
        }
    }

private:
    /// Returns the count of leading rows in `next_block` whose pk equals the
    /// last pk of `cur_block`; 0 when `next_block` is empty or has no overlap.
    /// Pk values are compared through IColumn::getInt, i.e. as Int64.
    /// @throws Exception(LOGICAL_ERROR) under DM_RUN_CHECK when the sort order
    ///         is violated (next block's pk smaller than current block's).
    static size_t findCutOffsetInNextBlock(const Block & cur_block, const Block & next_block, const String & pk_column_name)
    {
        assert(cur_block);
        if (!next_block)
            return 0;

        auto cur_col = cur_block.getByName(pk_column_name).column;
        const Int64 last_curr_pk = cur_col->getInt(cur_col->size() - 1);
        auto next_col = next_block.getByName(pk_column_name).column;
        size_t cut_offset = 0;
        for (/* */; cut_offset < next_col->size(); ++cut_offset)
        {
            const Int64 next_pk = next_col->getInt(cut_offset);
            if (next_pk != last_curr_pk)
            {
                if constexpr (DM_RUN_CHECK)
                {
                    if (unlikely(next_pk < last_curr_pk))
                        throw Exception("InputStream is not sorted, pk in next block is smaller than current block: " + toString(next_pk)
                                + " < " + toString(last_curr_pk),
                            ErrorCodes::LOGICAL_ERROR);
                }
                break;
            }
        }
        return cut_offset;
    }

private:
    BlockInputStreamPtr sorted_input_stream; // child stream; rows must arrive pk-sorted
    const String pk_column_name;

    // One-block look-ahead pair maintained across read() calls.
    Block cur_block;
    Block next_block;
};
|
||
} // namespace DM | ||
} // namespace DB |