Skip to content

Commit

Permalink
remove dictionary_ member
Browse files Browse the repository at this point in the history
  • Loading branch information
js8544 committed Dec 18, 2023
1 parent f09ab5c commit a841030
Showing 1 changed file with 31 additions and 16 deletions.
47 changes: 31 additions & 16 deletions cpp/src/arrow/compute/kernels/vector_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,20 @@
#include "arrow/array/concatenate.h"
#include "arrow/array/dict_internal.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
#include "arrow/compute/api_vector.h"
#include "arrow/compute/cast.h"
#include "arrow/compute/kernels/common_internal.h"
#include "arrow/result.h"
#include "arrow/util/hashing.h"
#include "arrow/util/int_util.h"
#include "arrow/util/unreachable.h"

namespace arrow {

using internal::DictionaryTraits;
using internal::HashTraits;
using internal::TransposeInts;

namespace compute {
namespace internal {
Expand Down Expand Up @@ -448,9 +451,9 @@ class DictionaryHashKernel : public HashKernel {

Status Append(const ArraySpan& arr) override {
auto arr_dict = arr.dictionary().ToArray();
if (!dictionary_) {
dictionary_ = arr_dict;
} else if (!dictionary_->Equals(*arr_dict)) {
if (!first_dictionary_) {
first_dictionary_ = arr_dict;
} else if (!first_dictionary_->Equals(*arr_dict)) {
// NOTE: This approach computes a new dictionary unification per chunk.
// This is in effect O(n*k) where n is the total chunked array length and
// k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
Expand All @@ -460,22 +463,21 @@ class DictionaryHashKernel : public HashKernel {
// the "value_counts" kernel).
if (dictionary_unifier_ == nullptr) {
ARROW_ASSIGN_OR_RAISE(dictionary_unifier_,
DictionaryUnifier::Make(dictionary_->type()));
RETURN_NOT_OK(dictionary_unifier_->Unify(*dictionary_));
DictionaryUnifier::Make(first_dictionary_->type()));
RETURN_NOT_OK(dictionary_unifier_->Unify(*first_dictionary_));
}
auto out_dict_type = dictionary_->type();
auto out_dict_type = first_dictionary_->type();
std::shared_ptr<Buffer> transpose_map;
std::shared_ptr<Array> out_dict;

RETURN_NOT_OK(dictionary_unifier_->Unify(*arr_dict, &transpose_map));
RETURN_NOT_OK(dictionary_unifier_->GetResult(&out_dict_type, &out_dict));

dictionary_ = out_dict;
auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
auto in_dict_array = arr.ToArray();
auto in_array = arr.ToArray();
const auto& in_dict_array =
arrow::internal::checked_cast<const DictionaryArray&>(*in_array);
ARROW_ASSIGN_OR_RAISE(
auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
.Transpose(arr.type->GetSharedPtr(), out_dict, transpose));
auto tmp, in_dict_array.Transpose(arr.type->GetSharedPtr(),
in_dict_array.dictionary(), transpose));
return indices_kernel_->Append(*tmp->data());
}

Expand All @@ -498,11 +500,23 @@ class DictionaryHashKernel : public HashKernel {
return dictionary_value_type_;
}

std::shared_ptr<Array> dictionary() const { return dictionary_; }
/// \brief Return the dictionary that the appended indices refer to.
///
/// \return a null pointer if no dictionary has been recorded yet, the first
/// recorded dictionary if no unifier was ever created, or otherwise the
/// dictionary produced by the unifier. Any error from
/// DictionaryUnifier::GetResult is propagated.
Result<std::shared_ptr<Array>> dictionary() const {
  // Append() stores the first dictionary it sees; a null pointer here means
  // nothing has been stored yet.
  if (first_dictionary_ == nullptr) {
    return nullptr;
  }

  // Append() only creates the unifier once it meets a dictionary that differs
  // from the first one. Without a unifier, every dictionary appended so far
  // was equal to first_dictionary_, so it can be returned unchanged.
  if (dictionary_unifier_ == nullptr) {
    return first_dictionary_;
  }

  // Several distinct dictionaries were appended: ask the unifier for the
  // merged dictionary. GetResult() writes both out-parameters; only the
  // dictionary array is returned to the caller.
  // NOTE(review): the DictionaryUnifier API docs state the unifier cannot be
  // used after GetResult() -- confirm that calling it from here is safe with
  // respect to later Append() calls.
  auto unified_type = first_dictionary_->type();
  std::shared_ptr<Array> unified_dictionary;
  RETURN_NOT_OK(dictionary_unifier_->GetResult(&unified_type, &unified_dictionary));
  return unified_dictionary;
}

private:
std::unique_ptr<HashKernel> indices_kernel_;
std::shared_ptr<Array> dictionary_;
std::shared_ptr<Array> first_dictionary_;
std::shared_ptr<DataType> dictionary_value_type_;
std::unique_ptr<DictionaryUnifier> dictionary_unifier_;
};
Expand Down Expand Up @@ -634,8 +648,9 @@ Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
// hence have no dictionary.
Result<std::shared_ptr<ArrayData>> EnsureHashDictionary(KernelContext* ctx,
DictionaryHashKernel* hash) {
if (hash->dictionary()) {
return hash->dictionary()->data();
ARROW_ASSIGN_OR_RAISE(auto dict, hash->dictionary());
if (dict) {
return dict->data();
}
ARROW_ASSIGN_OR_RAISE(auto null, MakeArrayOfNull(hash->dictionary_value_type(),
/*length=*/0, ctx->memory_pool()));
Expand Down

0 comments on commit a841030

Please sign in to comment.