Skip to content

Commit

Permalink
start on stats
Browse files Browse the repository at this point in the history
  • Loading branch information
paleolimbot committed Aug 17, 2024
1 parent ccbd616 commit 3752f83
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 3 deletions.
6 changes: 3 additions & 3 deletions cpp/src/parquet/geometry_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ class WKBBuffer {
size_ -= sizeof(uint32_t);

if constexpr (swap) {
value = arrow::bit_util::ByteSwap(value);
value = ::arrow::bit_util::ByteSwap(value);
}

return value;
Expand All @@ -232,7 +232,7 @@ class WKBBuffer {

if constexpr (swap) {
for (uint32_t i = 0; i < n; i++) {
out[i] = arrow::bit_util::ByteSwap(out[i]);
out[i] = ::arrow::bit_util::ByteSwap(out[i]);
}
}
}
Expand Down Expand Up @@ -316,7 +316,7 @@ struct BoundingBox {
double max[4];
};

bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
return lhs.dimensions == rhs.dimensions &&
std::memcmp(lhs.min, rhs.min, sizeof(lhs.min)) == 0 &&
std::memcmp(lhs.max, rhs.max, sizeof(lhs.max)) == 0;
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "arrow/visit_data_inline.h"
#include "parquet/encoding.h"
#include "parquet/exception.h"
#include "parquet/geometry_util.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

Expand Down Expand Up @@ -618,6 +619,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
// Reports the has_distinct_count_ flag.
bool HasDistinctCount() const override { return has_distinct_count_; };
// Reports the has_min_max_ flag.
bool HasMinMax() const override { return has_min_max_; }
// Reports the has_null_count_ flag.
bool HasNullCount() const override { return has_null_count_; };
// Geometry statistics count as present exactly when the geometry_statistics_
// pointer is non-null.
bool HasGeometryStatistics() const override { return geometry_statistics_ != nullptr; }

void IncrementNullCount(int64_t n) override {
statistics_.null_count += n;
Expand All @@ -630,6 +632,8 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
switch (type) {
case LogicalType::Type::FLOAT16:
return true;
case LogicalType::Type::GEOMETRY:
return true;
default:
return false;
}
Expand All @@ -654,6 +658,15 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
if (!MinMaxEqual(other)) return false;
}

if (HasGeometryStatistics() != other.HasGeometryStatistics()) {
return false;
}

if (HasGeometryStatistics() &&
!geometry_statistics_->Equals(*other.GeometryStatistics())) {
return false;
}

return null_count() == other.null_count() &&
distinct_count() == other.distinct_count() &&
num_values() == other.num_values();
Expand Down Expand Up @@ -773,6 +786,7 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
std::shared_ptr<TypedComparator<DType>> comparator_;
std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
LogicalType::Type::type logical_type_ = LogicalType::Type::NONE;
std::shared_ptr<GeometryStatistics> geometry_statistics_;

void PlainEncode(const T& src, std::string* dst) const;
void PlainDecode(const std::string& src, T* dst) const;
Expand Down Expand Up @@ -865,6 +879,12 @@ void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_values,

if (num_values == 0) return;
SetMinMaxPair(comparator_->GetMinMax(values, num_values));

if constexpr (std::is_same<T, ByteArray>::value) {
if (logical_type_ == LogicalType::Type::GEOMETRY) {
geometry_statistics_->Update(values, num_values, null_count);
}
}
}

template <typename DType>
Expand Down
19 changes: 19 additions & 0 deletions cpp/src/parquet/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <string>
#include <utility>

#include "parquet/geometry_util.h"
#include "parquet/platform.h"
#include "parquet/types.h"

Expand Down Expand Up @@ -114,6 +115,20 @@ std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* d
return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
}

/// \brief Placeholder for statistics gathered over geometry values
///
/// No operation is implemented yet: the body of every member function
/// consists solely of a call to ParquetException::NYI().
class GeometryStatistics {
 public:
  /// \brief Compare these statistics with another instance (not yet implemented)
  /// \param[in] other the statistics to compare against
  ///
  /// NOTE(review): there is no return statement, so this relies on
  /// ParquetException::NYI() never returning -- confirm that it is declared
  /// noreturn.
  bool Equals(const GeometryStatistics& other) const {
    ParquetException::NYI();
  }

  /// \brief Incorporate a batch of values into these statistics
  /// (not yet implemented)
  /// \param[in] values pointer to the batch of values
  /// \param[in] num_values count passed along by the caller
  /// \param[in] null_count count passed along by the caller
  void Update(const ByteArray* values, int64_t num_values, int64_t null_count) {
    ParquetException::NYI();
  }

  /// \brief Combine another instance into this one (not yet implemented)
  /// \param[in] other the statistics to merge in
  void Merge(const GeometryStatistics& other) {
    ParquetException::NYI();
  }

 private:
  // No member function of this class reads or writes the bounder yet.
  geometry::WKBGeometryBounder bounder_;
};

// ----------------------------------------------------------------------

/// \brief Structure represented encoded statistics to be written to
Expand Down Expand Up @@ -250,6 +265,10 @@ class PARQUET_EXPORT Statistics {
/// with TypedStatistics<T>::min and max
virtual bool HasMinMax() const = 0;

/// \brief Return true if geometry statistics are available
///
/// This default implementation always returns false.
virtual bool HasGeometryStatistics() const { return false; };

virtual const GeometryStatistics* GeometryStatistics() const { return nullptr; }

/// \brief Reset state of object to initial (no data observed) state
virtual void Reset() = 0;

Expand Down

0 comments on commit 3752f83

Please sign in to comment.