Skip to content

Commit

Permalink
DH-18300: Improve DataIndex performance.
Browse files Browse the repository at this point in the history
DataIndex, particularly when used for where() filters had missing
parallelization opportunities; and would read more data than strictly
necessary to satisfy the filter.

Statistics have been added to various operations, the existing Value class
was not thread safe.  The internal state has been updated to use volatiles
and AtomicLongFieldUpdaters.

The following Configuration properties have been added:
- AbstractColumnSource.usePartialDataIndex
- AbstractColumnSource.useParallelIndexBuild
- QueryTable.useDataIndexForAggregation
- MergedDataIndex.useParallelLazyFetch
  • Loading branch information
cpwright committed Jan 20, 2025
1 parent e7f731b commit a15b998
Show file tree
Hide file tree
Showing 15 changed files with 566 additions and 148 deletions.
47 changes: 37 additions & 10 deletions Base/src/main/java/io/deephaven/base/stats/Value.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,22 @@
//
package io.deephaven.base.stats;

public abstract class Value {
import java.text.DecimalFormat;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;

protected long n = 0;
protected long last = 0;
protected long sum = 0;
protected long sum2 = 0;
protected long max = Long.MIN_VALUE;
protected long min = Long.MAX_VALUE;
public abstract class Value {
private static final AtomicLongFieldUpdater<Value> N_UPDATER = AtomicLongFieldUpdater.newUpdater(Value.class, "n");
private static final AtomicLongFieldUpdater<Value> SUM_UPDATER =
AtomicLongFieldUpdater.newUpdater(Value.class, "sum");
private static final AtomicLongFieldUpdater<Value> SUM2_UPDATER =
AtomicLongFieldUpdater.newUpdater(Value.class, "sum2");

volatile protected long n = 0;
volatile protected long last = 0;
volatile protected long sum = 0;
volatile protected long sum2 = 0;
volatile protected long max = Long.MIN_VALUE;
volatile protected long min = Long.MAX_VALUE;

private boolean alwaysUpdated = false;

Expand Down Expand Up @@ -52,10 +60,10 @@ protected Value(History history) {
}

public void sample(long x) {
n++;
N_UPDATER.incrementAndGet(this);
SUM_UPDATER.addAndGet(this, x);
SUM2_UPDATER.addAndGet(this, x * x);
last = x;
sum += x;
sum2 += x * x;
if (x > max) {
max = x;
}
Expand Down Expand Up @@ -99,4 +107,23 @@ public void update(Item item, ItemUpdateListener listener, long logInterval, lon
}
}
}

@Override
public String toString() {
final DecimalFormat format = new DecimalFormat("#,###");
final DecimalFormat avgFormat = new DecimalFormat("#,###.###");

final double variance = n > 1 ? (sum2 - ((double) sum * sum / (double) n)) / (n - 1) : Double.NaN;

return "Value{" +
"n=" + format.format(n) +
(n > 0 ? ", sum=" + format.format(sum) +
", max=" + format.format(max) +
", min=" + format.format(min) +
", avg=" + avgFormat.format((n > 0 ? (double) sum / n : Double.NaN)) +
", std=" + avgFormat.format(Math.sqrt(variance))
: "")
+
'}';
}
}
5 changes: 5 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ For more information, see:
* [gh pr create](https://cli.github.com/manual/gh_pr_create)
* [CLI In Use](https://cli.github.com/manual/examples.html)

## Labels

- Each pull request msut have a `ReleaseNotesNeeded` label if a user can perceive the changes. Build or testing-only changes should have a `NoReleaseNotesNeeded` label.
- Each pull request must have a `DocumentationNeeded` label if the user guide should be updated. Pull requests that do not require a user guide update should have a `NoDocumentationNeeded` label.

## Styleguide
The [styleguide](style/README.md) is applied globally to the entire project, except for generated code that gets checked in.
To apply the styleguide, run `./gradlew spotlessApply`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,43 @@ default ColumnSource<?>[] keyColumns(@NotNull final ColumnSource<?>[] indexedCol
@FinalDefault
@NotNull
default ColumnSource<RowSet> rowSetColumn() {
return table().getColumnSource(rowSetColumnName(), RowSet.class);
return rowSetColumn(DataIndexOptions.DEFAULT);
}

/**
* Get the {@link RowSet} {@link ColumnSource} of the index {@link #table() table}.
*
* @return The {@link RowSet} {@link ColumnSource}
*/
@FinalDefault
@NotNull
default ColumnSource<RowSet> rowSetColumn(final DataIndexOptions options) {
return table(options).getColumnSource(rowSetColumnName(), RowSet.class);
}

/**
* Get the {@link Table} backing this data index.
*
* <p>
* The returned table is fully in-memory, equivalent to {@link #table(DataIndexOptions)} with default options.
* </p>
*
* @return The {@link Table}
*/
@NotNull
Table table();
default Table table() {
return table(DataIndexOptions.DEFAULT);
}

/**
* Get the {@link Table} backing this data index.
*
* @param options parameters to control the returned table
*
* @return The {@link Table}
*/
@NotNull
Table table(DataIndexOptions options);

/**
* Whether the index {@link #table()} {@link Table#isRefreshing() is refreshing}. Some transformations will force
Expand All @@ -115,4 +142,5 @@ default ColumnSource<RowSet> rowSetColumn() {
* otherwise
*/
boolean isRefreshing();

}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ interface RowKeyLookup {
* @return A function that provides map-like lookup of index {@link #table()} row keys from an index lookup key
*/
@NotNull
RowKeyLookup rowKeyLookup();
default RowKeyLookup rowKeyLookup() {
return rowKeyLookup(DataIndexOptions.DEFAULT);
}

@NotNull
RowKeyLookup rowKeyLookup(DataIndexOptions options);

/**
* Return a {@link RowKeyLookup lookup function} function of index row keys for this index. If
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
package io.deephaven.engine.table;

import io.deephaven.annotations.BuildableStyle;
import io.deephaven.api.filter.Filter;
import org.immutables.value.Value;

/**
* Options for controlling the function of a {@link DataIndex}.
*
* <p>
* Presently, this is used for the {@link Table#where(Filter)} operation to more efficiently handle data index matches,
* without necessarily reading all RowSet information from disk across partitions.
* </p>
*/
@Value.Immutable
@BuildableStyle
public interface DataIndexOptions {
DataIndexOptions DEFAULT = DataIndexOptions.builder().build();

/**
* Does this operation use only a subset of the DataIndex?
*
* <p>
* The DataIndex implementation may use this hint to defer work for some row sets.
* </p>
*
* @return if this operation is only going to use a subset of this data index
*/
@Value.Default
default boolean operationUsesPartialTable() {
return false;
}

/**
* Create a new builder for a {@link DataIndexOptions}.
*
* @return
*/
static Builder builder() {
return ImmutableDataIndexOptions.builder();
}

/**
* The builder interface to construct a {@link DataIndexOptions}.
*/
interface Builder {
/**
* Set whether this operation only uses a subset of the data index.
*
* @param usesPartialTable true if this operation only uses a partial table
* @return this builder
*/
Builder operationUsesPartialTable(boolean usesPartialTable);

/**
* Build the {@link DataIndexOptions}.
*
* @return an immutable DataIndexOptions structure.
*/
DataIndexOptions build();
}
}
Loading

0 comments on commit a15b998

Please sign in to comment.