Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parquet - write column chunk statistics #754

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
<?php declare(strict_types=1);

namespace Flow\ETL\Adapter\Parquet\Tests\Unit\ParquetFile\RowGroupBuilder;

use Flow\Parquet\ParquetFile\RowGroupBuilder\ColumnChunkStatistics;
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
use PHPUnit\Framework\TestCase;

/**
 * Verifies min/max tracking and the value, distinct and null counters of
 * ColumnChunkStatistics for each flat column type exercised below.
 *
 * Every scenario feeds six non-null samples followed by a single null, one
 * add() call per sample, and then checks what the collector reports.
 */
final class ColumnChunkStatisticsTest extends TestCase
{
    /**
     * Booleans can only yield two distinct values; false is the minimum, true the maximum.
     */
    public function test_statistics_for_boolean() : void
    {
        $collected = $this->collect(
            FlatColumn::boolean('boolean'),
            [true, false, true, true, false, false, null]
        );

        self::assertFalse($collected->min());
        self::assertTrue($collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(2, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Dates are added as DateTimeImmutable objects and min/max are checked through their formatted day.
     */
    public function test_statistics_for_date() : void
    {
        $days = \array_map(
            static fn (string $day) : \DateTimeImmutable => new \DateTimeImmutable($day),
            ['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-05']
        );

        $collected = $this->collect(FlatColumn::date('date'), [...$days, null]);

        self::assertSame('2020-01-01', $collected->min()->format('Y-m-d'));
        self::assertSame('2020-01-05', $collected->max()->format('Y-m-d'));
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Decimals are supplied as numeric strings and must come back unchanged as min/max.
     */
    public function test_statistics_for_decimal() : void
    {
        $collected = $this->collect(
            FlatColumn::decimal('decimal'),
            ['1.1', '2.2', '3.3', '4.4', '5.5', '5.5', null]
        );

        self::assertSame('1.1', $collected->min());
        self::assertSame('5.5', $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Doubles are supplied as PHP floats.
     */
    public function test_statistics_for_double() : void
    {
        $collected = $this->collect(
            FlatColumn::double('double'),
            [1.1, 2.2, 3.3, 4.4, 5.5, 5.5, null]
        );

        self::assertSame(1.1, $collected->min());
        self::assertSame(5.5, $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Enum values are supplied as plain strings.
     */
    public function test_statistics_for_enum() : void
    {
        $collected = $this->collect(
            FlatColumn::enum('enum'),
            ['a', 'b', 'c', 'd', 'e', 'e', null]
        );

        self::assertSame('a', $collected->min());
        self::assertSame('e', $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Floats use the same samples as the double scenario, on a float column.
     */
    public function test_statistics_for_float() : void
    {
        $collected = $this->collect(
            FlatColumn::float('float'),
            [1.1, 2.2, 3.3, 4.4, 5.5, 5.5, null]
        );

        self::assertSame(1.1, $collected->min());
        self::assertSame(5.5, $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * 32-bit integer column fed with PHP ints.
     */
    public function test_statistics_for_int32() : void
    {
        $collected = $this->collect(
            FlatColumn::int32('int32'),
            [1, 2, 3, 4, 5, 5, null]
        );

        self::assertSame(1, $collected->min());
        self::assertSame(5, $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * 64-bit integer column fed with PHP ints.
     */
    public function test_statistics_for_int64() : void
    {
        $collected = $this->collect(
            FlatColumn::int64('int64'),
            [1, 2, 3, 4, 5, 5, null]
        );

        self::assertSame(1, $collected->min());
        self::assertSame(5, $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * JSON documents are supplied as already-encoded strings.
     */
    public function test_statistics_for_json() : void
    {
        $collected = $this->collect(
            FlatColumn::json('json'),
            ['{"a":1}', '{"b":2}', '{"c":3}', '{"d":4}', '{"e":5}', '{"e":5}', null]
        );

        self::assertSame('{"a":1}', $collected->min());
        self::assertSame('{"e":5}', $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Plain string column.
     */
    public function test_statistics_for_string() : void
    {
        $collected = $this->collect(
            FlatColumn::string('string'),
            ['a', 'b', 'c', 'd', 'e', 'e', null]
        );

        self::assertSame('a', $collected->min());
        self::assertSame('e', $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Times are added as DateInterval objects and min/max are checked through their formatted seconds.
     */
    public function test_statistics_for_time() : void
    {
        $intervals = \array_map(
            static fn (string $duration) : \DateInterval => new \DateInterval($duration),
            ['PT1S', 'PT2S', 'PT3S', 'PT4S', 'PT5S', 'PT5S']
        );

        $collected = $this->collect(FlatColumn::time('time'), [...$intervals, null]);

        self::assertSame('PT01S', $collected->min()->format('PT%SS'));
        self::assertSame('PT05S', $collected->max()->format('PT%SS'));
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * UUIDs are supplied in their canonical textual form.
     */
    public function test_statistics_for_uuid() : void
    {
        $collected = $this->collect(
            FlatColumn::uuid('uuid'),
            [
                '00000000-0000-0000-0000-000000000000',
                '00000000-0000-0000-0000-000000000001',
                '00000000-0000-0000-0000-000000000002',
                '00000000-0000-0000-0000-000000000003',
                '00000000-0000-0000-0000-000000000004',
                '00000000-0000-0000-0000-000000000004',
                null,
            ]
        );

        self::assertSame('00000000-0000-0000-0000-000000000000', $collected->min());
        self::assertSame('00000000-0000-0000-0000-000000000004', $collected->max());
        self::assertSame(7, $collected->valuesCount());
        self::assertSame(5, $collected->distinctCount());
        self::assertSame(1, $collected->nullCount());
    }

    /**
     * Builds a statistics collector for the column and adds the samples one by one, in order.
     *
     * Each sample goes through its own add() call, so the list itself is never
     * handed to add() as an array value.
     *
     * @param array<int, mixed> $samples
     */
    private function collect(FlatColumn $column, array $samples) : ColumnChunkStatistics
    {
        $collector = new ColumnChunkStatistics($column);

        foreach ($samples as $sample) {
            $collector->add($sample);
        }

        return $collector;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public function flush(int $fileOffset) : ColumnChunkContainer
$pageContainers = (new PagesBuilder($this->dataConverter, $this->compression, $this->calculator, $this->options))
->build($this->column, $this->rows, $this->statistics);

$statistics = (new StatisticsBuilder($this->dataConverter))->build($this->column, $this->statistics);

$this->statistics->reset();

return new ColumnChunkContainer(
Expand All @@ -51,7 +53,7 @@ public function flush(int $fileOffset) : ColumnChunkContainer
dictionaryPageOffset: ($pageContainers->dictionaryPageContainer()) ? $fileOffset : null,
dataPageOffset: ($pageContainers->dictionaryPageContainer()) ? $fileOffset + $pageContainers->dictionaryPageContainer()->totalCompressedSize() : $fileOffset,
indexPageOffset: null,
statistics: null
statistics: $statistics
)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Flow\Parquet\Data\ObjectToString;
use Flow\Parquet\Exception\RuntimeException;
use Flow\Parquet\ParquetFile\RowGroupBuilder\Statistics\Comparator;
use Flow\Parquet\ParquetFile\Schema\ColumnPrimitiveType;
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
use Flow\Parquet\ParquetFile\Schema\PhysicalType;
Expand All @@ -12,6 +13,12 @@ final class ColumnChunkStatistics
{
private bool $columnIsString;

private Comparator $comparator;

private mixed $max;

private mixed $min;

private int $nullCount;

private int $totalStringLength;
Expand All @@ -26,6 +33,9 @@ public function __construct(private readonly FlatColumn $column)
$this->valuesCount = 0;
$this->totalStringLength = 0;
$this->columnIsString = ColumnPrimitiveType::isString($this->column);
$this->min = null;
$this->max = null;
$this->comparator = new Comparator();
}

public function add(string|int|float|null|array|bool|object $value) : void
Expand All @@ -44,9 +54,26 @@ public function add(string|int|float|null|array|bool|object $value) : void

if (\is_array($value)) {
foreach ($value as $val) {

if ($this->comparator->isLessThan($val, $this->min)) {
$this->min = $val;
}

if ($this->comparator->isGreaterThan($val, $this->max)) {
$this->max = $val;
}

$this->values[] = \is_object($val) ? ObjectToString::toString($val) : $val;
}
} else {
if ($this->comparator->isLessThan($value, $this->min)) {
$this->min = $value;
}

if ($this->comparator->isGreaterThan($value, $this->max)) {
$this->max = $value;
}

$this->values[] = \is_object($value) ? ObjectToString::toString($value) : $value;
}

Expand Down Expand Up @@ -83,6 +110,16 @@ public function distinctCount() : int
return \count(\array_unique($this->values));
}

/**
 * Returns the current maximum tracked by add().
 *
 * Starts as null and is set back to null by reset(). add() replaces it with a
 * value (or with an element of an array value) whenever
 * Comparator::isGreaterThan() reports that value as greater than the stored one.
 * NOTE(review): how Comparator treats a null stored maximum is not visible here - confirm.
 */
public function max() : mixed
{
return $this->max;
}

/**
 * Returns the current minimum tracked by add().
 *
 * Starts as null and is set back to null by reset(). add() replaces it with a
 * value (or with an element of an array value) whenever
 * Comparator::isLessThan() reports that value as less than the stored one.
 * NOTE(review): how Comparator treats a null stored minimum is not visible here - confirm.
 */
public function min() : mixed
{
return $this->min;
}

public function notNullCount() : int
{
return $this->valuesCount - $this->nullCount;
Expand All @@ -98,6 +135,8 @@ public function reset() : void
$this->nullCount = 0;
$this->valuesCount = 0;
$this->totalStringLength = 0;
$this->min = null;
$this->max = null;
$this->values = [];
}

Expand Down Expand Up @@ -133,6 +172,11 @@ public function uncompressedSize() : int
throw new RuntimeException('Unknown column type');
}

/**
 * Returns the values accumulated by add() since construction or the last reset().
 *
 * Objects are stored as their ObjectToString::toString() representation, and
 * an array passed to add() contributes each of its elements separately.
 * NOTE(review): whether null inputs end up in this list is not visible here - confirm.
 */
public function values() : array
{
return $this->values;
}

public function valuesCount() : int
{
return $this->valuesCount;
Expand Down
Loading