Skip to content

Commit

Permalink
Fix null compacting for ARRAY and MAP selective column reader (facebo…
Browse files Browse the repository at this point in the history
…okincubator#10805)

Summary:
Pull Request resolved: facebookincubator#10805

For array/map column readers, we were using `resultNulls_` as nulls.
This is working correctly in most of the cases, except when during `read` call,
we decide to reuse reader nulls, but later there is another filter shrink the
row set, and cause that we need to compact the reader nulls into result nulls.
Although we set the bits in `resultNulls_` correctly, the `returnReaderNulls_` is
not reset in this case and we still use the reader nulls in result vector.

Fix this by reusing the same implementation in struct column reader on all
complex type column readers.  Also clean up the unused `compactComplexValues`
method.

Reviewed By: HuamengJiang

Differential Revision: D61620252

fbshipit-source-id: b5d35c7f94a622a8f9365e2fff7e89e441d113b0
  • Loading branch information
Yuhta authored and facebook-github-bot committed Aug 22, 2024
1 parent 9387a9d commit 3325e96
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 76 deletions.
25 changes: 25 additions & 0 deletions velox/dwio/common/SelectiveColumnReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,31 @@ const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) {
return moveFrom;
}

void SelectiveColumnReader::setComplexNulls(RowSet rows, VectorPtr& result)
const {
if (!nullsInReadRange_) {
result->clearNulls(0, rows.size());
return;
}
const bool dense = 1 + rows.back() == rows.size();
auto& nulls = result->nulls();
if (dense &&
!(nulls && nulls->isMutable() &&
nulls->capacity() >= bits::nbytes(rows.size()))) {
result->setNulls(nullsInReadRange_);
return;
}
auto* readerNulls = nullsInReadRange_->as<uint64_t>();
auto* resultNulls = result->mutableNulls(rows.size())->asMutable<uint64_t>();
if (dense) {
bits::copyBits(readerNulls, 0, resultNulls, 0, rows.size());
return;
}
for (vector_size_t i = 0; i < rows.size(); ++i) {
bits::setBit(resultNulls, i, bits::isBitSet(readerNulls, rows[i]));
}
}

void SelectiveColumnReader::getIntValues(
RowSet rows,
const TypePtr& requestedType,
Expand Down
13 changes: 5 additions & 8 deletions velox/dwio/common/SelectiveColumnReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -548,17 +548,14 @@ class SelectiveColumnReader {
template <typename T, typename TVector>
void compactScalarValues(RowSet rows, bool isFinal);

// Compacts values extracted for a complex type column with
// filter. The values for 'rows' are shifted to be consecutive at
// indices [0..rows.size() - 1]'. 'move' is a function that takes
// two indices source and target and moves the value at source to
// target. target is <= source for all calls.
template <typename Move>
void compactComplexValues(RowSet rows, Move move, bool isFinal);

template <typename T, typename TVector>
void upcastScalarValues(RowSet rows);

// For complex type column, we need to compact only nulls if the rows are
// shrinked. Child fields are handled recursively in their own column
// readers.
void setComplexNulls(RowSet rows, VectorPtr& result) const;

// Return the source null bits if compactScalarValues and upcastScalarValues
// should move null flags. Return nullptr if nulls does not need to be moved.
// Checks consistency of nulls-related state.
Expand Down
52 changes: 0 additions & 52 deletions velox/dwio/common/SelectiveColumnReaderInternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,58 +306,6 @@ inline int32_t sizeOfIntKind(TypeKind kind) {
}
}

template <typename Move>
void SelectiveColumnReader::compactComplexValues(
RowSet rows,
Move move,
bool isFinal) {
VELOX_CHECK_LE(rows.size(), outputRows_.size());
VELOX_CHECK(!rows.empty());
if (rows.size() == outputRows_.size()) {
return;
}
RowSet sourceRows;
// The row numbers corresponding to elements in 'values_' are in
// 'valueRows_' if values have been accessed before. Otherwise
// they are in 'outputRows_' if these are non-empty (there is a
// filter) and in 'inputRows_' otherwise.
if (!valueRows_.empty()) {
sourceRows = valueRows_;
} else if (!outputRows_.empty()) {
sourceRows = outputRows_;
} else {
sourceRows = inputRows_;
}
if (valueRows_.empty()) {
valueRows_.resize(rows.size());
}
vector_size_t rowIndex = 0;
auto nextRow = rows[rowIndex];
auto* moveNullsFrom = shouldMoveNulls(rows);
for (size_t i = 0; i < numValues_; i++) {
if (sourceRows[i] < nextRow) {
continue;
}

VELOX_DCHECK(sourceRows[i] == nextRow);
// The value at i is moved to be the value at 'rowIndex'.
move(i, rowIndex);
if (moveNullsFrom && rowIndex != i) {
bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i));
}
if (!isFinal) {
valueRows_[rowIndex] = nextRow;
}
rowIndex++;
if (rowIndex >= rows.size()) {
break;
}
nextRow = rows[rowIndex];
}
numValues_ = rows.size();
valueRows_.resize(numValues_);
}

template <typename T>
void SelectiveColumnReader::filterNulls(
RowSet rows,
Expand Down
7 changes: 2 additions & 5 deletions velox/dwio/common/SelectiveRepeatedColumnReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,6 @@ void SelectiveRepeatedColumnReader::makeOffsetsAndSizes(
rawOffsets[i] = nestedRowIndex;
if (nulls && bits::isBitNull(nulls, row)) {
rawSizes[i] = 0;
if (!returnReaderNulls_) {
bits::setNull(rawResultNulls_, i);
}
anyNulls_ = true;
} else {
currentOffset += allLengths_[row];
Expand Down Expand Up @@ -240,7 +237,7 @@ void SelectiveListColumnReader::getValues(RowSet rows, VectorPtr* result) {
prepareResult(*result, requestedType_, rows.size(), &memoryPool_);
auto* resultArray = result->get()->asUnchecked<ArrayVector>();
makeOffsetsAndSizes(rows, *resultArray);
result->get()->setNulls(resultNulls());
setComplexNulls(rows, *result);
if (child_ && !nestedRows_.empty()) {
auto& elements = resultArray->elements();
prepareStructResult(requestedType_->childAt(0), &elements);
Expand Down Expand Up @@ -321,7 +318,7 @@ void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) {
prepareResult(*result, requestedType_, rows.size(), &memoryPool_);
auto* resultMap = result->get()->asUnchecked<MapVector>();
makeOffsetsAndSizes(rows, *resultMap);
result->get()->setNulls(resultNulls());
setComplexNulls(rows, *result);
VELOX_CHECK(
keyReader_ && elementReader_,
"keyReader_ and elementReaer_ must exist in "
Expand Down
10 changes: 1 addition & 9 deletions velox/dwio/common/SelectiveStructColumnReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,15 +386,7 @@ void SelectiveStructColumnReaderBase::getValues(
if (!rows.size()) {
return;
}
if (nullsInReadRange_) {
auto readerNulls = nullsInReadRange_->as<uint64_t>();
auto* nulls = resultRow->mutableNulls(rows.size())->asMutable<uint64_t>();
for (size_t i = 0; i < rows.size(); ++i) {
bits::setBit(nulls, i, bits::isBitSet(readerNulls, rows[i]));
}
} else {
resultRow->clearNulls(0, rows.size());
}
setComplexNulls(rows, *result);
bool lazyPrepared = false;
for (auto& childSpec : scanSpec_->children()) {
VELOX_TRACE_HISTORY_PUSH("getValues %s", childSpec->fieldName().c_str());
Expand Down
3 changes: 1 addition & 2 deletions velox/dwio/common/SelectiveStructColumnReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,6 @@ SelectiveFlatMapColumnReaderHelper<T, KeyNode, FormatData>::calculateOffsets(
for (vector_size_t i = 0; i < rows.size(); ++i) {
if (!reader_.returnReaderNulls_ && nulls &&
bits::isBitNull(nulls, rows[i])) {
bits::setNull(reader_.rawResultNulls_, i);
reader_.anyNulls_ = true;
}
offsets[i] = numNestedRows;
Expand Down Expand Up @@ -546,7 +545,7 @@ void SelectiveFlatMapColumnReaderHelper<T, KeyNode, FormatData>::getValues(
std::copy_backward(
rawOffsets, rawOffsets + rows.size() - 1, rawOffsets + rows.size());
rawOffsets[0] = 0;
result->get()->setNulls(reader_.resultNulls());
reader_.setComplexNulls(rows, *result);
}

} // namespace facebook::velox::dwio::common
25 changes: 25 additions & 0 deletions velox/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3153,6 +3153,31 @@ TEST_F(TableScanTest, mapIsNullFilter) {
"SELECT * FROM tmp WHERE c0 is null");
}

TEST_F(TableScanTest, compactComplexNulls) {
constexpr int kSize = 10;
auto iota = makeFlatVector<int64_t>(kSize, folly::identity);
std::vector<vector_size_t> offsets(kSize);
for (int i = 0; i < kSize; ++i) {
offsets[i] = (i + 1) / 2 * 2;
}
auto c0 = makeRowVector(
{
makeArrayVector(offsets, iota, {1, 3, 5, 7, 9}),
iota,
},
[](auto i) { return i == 2; });
auto data = makeRowVector({c0});
auto schema = asRowType(data->type());
auto file = TempFilePath::create();
writeToFile(file->getPath(), {data});
auto plan = PlanBuilder().tableScan(schema, {"(c0).c1 > 0"}).planNode();
auto split = makeHiveConnectorSplit(file->getPath());
const vector_size_t indices[] = {1, 3, 4, 5, 6, 7, 8, 9};
auto expected = makeRowVector({wrapInDictionary(
makeIndices(8, [&](auto i) { return indices[i]; }), c0)});
AssertQueryBuilder(plan).split(split).assertResults(expected);
}

TEST_F(TableScanTest, remainingFilter) {
auto rowType = ROW(
{"c0", "c1", "c2", "c3"}, {INTEGER(), INTEGER(), DOUBLE(), BOOLEAN()});
Expand Down

0 comments on commit 3325e96

Please sign in to comment.