Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: statistical functions should return null when provided a vector of only null values #5606

Merged
merged 19 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 94 additions & 40 deletions engine/function/src/templates/Numeric.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -1268,38 +1268,63 @@ public class Numeric {

if (n == 0) {
return NULL_DOUBLE;
} else {
// NULL values sorted to beginning of the array.
${pt.primitive}[] copy = sort(values.toArray());
}

${pt.primitive}[] sorted = values.copyToArray();
Arrays.sort(sorted);

// Determine if there are any NULL in the array.
int nullCount = 0;
for (int i = 0; i < n; i++) {
if (isNull(copy[i])) {
int nullStart = -1;
int nullCount = 0;

<#if pt.valueType.isFloat >
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
for (int i = 0; i < n; i++) {
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
final ${pt.primitive} val = sorted[i];
if (isNaN(val)) {
return Double.NaN; // Any NaN will pollute the result.
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}
if (nullStart == -1) {
if (isNull(val)) {
nullStart = i;
nullCount++;
} else {
break;
}
} else if (isNull(val)) {
nullCount++;
}

if (nullCount == 0) {
// No NULL, so we can just compute the median and return.
if (n % 2 == 0)
return 0.5 * (copy[n / 2 - 1] + copy[n / 2]);
else return copy[n / 2];
} else if (nullCount < n) {
// Some NULL, reduce the count and compute the median of the non-null values.
n -= nullCount;
if (n % 2 == 0) {
int index = n / 2;
return 0.5 * (copy[n / 2 - 1 + nullCount] + copy[n / 2 + nullCount]);
}
<#else>
for (int i = 0; i < n; i++) {
final ${pt.primitive} val = sorted[i];
if (nullStart == -1) {
if (isNull(val)) {
nullStart = i;
nullCount++;
}
else return copy[n / 2 + nullCount];
} else if (isNull(val)) {
nullCount++;
} else {
// All values are NULL.
return NULL_DOUBLE;
break; // no more NULL possible
}
}
</#if>

if (nullCount == n) {
return NULL_DOUBLE;
} else if (nullCount > 0) {
if (nullStart > 0) {
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
// Move the pre-NULL data so we have a contiguous block of non-NULL values.
System.arraycopy(sorted, 0, sorted, nullCount, nullStart);
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}
n -= nullCount;
if (n % 2 == 0) {
int index = n / 2;
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
return 0.5 * (sorted[n / 2 - 1 + nullCount] + sorted[n / 2 + nullCount]);
}
else return sorted[n / 2 + nullCount];
} else {
if (n % 2 == 0)
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
return 0.5 * (sorted[n / 2 - 1] + sorted[n / 2]);
else return sorted[n / 2];
}
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}

/**
Expand Down Expand Up @@ -1334,31 +1359,60 @@ public class Numeric {
}

int n = values.intSize("percentile");
// NULL values sorted to beginning of the array.
${pt.primitive}[] copy = sort(values.toArray());

// Determine if there are any NULL in the array.
if (n == 0) {
return ${pt.null};
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}

${pt.primitive}[] sorted = values.copyToArray();
Arrays.sort(sorted);

int nullStart = -1;
int nullCount = 0;

<#if pt.valueType.isFloat >
for (int i = 0; i < n; i++) {
if (isNull(copy[i])) {
final ${pt.primitive} val = sorted[i];
if (isNaN(val)) {
return ${pt.boxed}.NaN; // Any NaN will pollute the result.
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}
if (nullStart == -1) {
if (isNull(val)) {
nullStart = i;
nullCount++;
}
} else if (isNull(val)) {
nullCount++;
}
}
<#else>
for (int i = 0; i < n; i++) {
final ${pt.primitive} val = sorted[i];
if (nullStart == -1) {
if (isNull(val)) {
nullStart = i;
nullCount++;
}
} else if (isNull(val)) {
nullCount++;
} else {
break;
break; // no more NULL possible
}
}
</#if>

if (nullCount == 0) {
// No NULL, so we can just compute the index and return.
int idx = (int) Math.round(percentile * (n - 1));
return copy[idx];
} else if (nullCount < n) {
// Some NULL, reduce the count and compute the percentile of the non-null values.
n -= nullCount;
int idx = (int) Math.round(percentile * (n - 1));
return copy[idx + nullCount];
} else {
// All values are NULL.
if (nullCount == n) {
return ${pt.null};
} else if (nullCount > 0) {
if (nullStart > 0) {
// Move the pre-NULL data so we have a contiguous block of non-NULL values.
System.arraycopy(sorted, 0, sorted, nullCount, nullStart);
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}
int idx = (int) Math.round(percentile * (n - nullCount - 1));
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
return sorted[idx + nullCount];
} else {
int idx = (int) Math.round(percentile * (n - 1));
return sorted[idx];
}
}

Expand Down
16 changes: 16 additions & 0 deletions engine/function/src/templates/TestNumeric.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -1150,6 +1150,12 @@ public class TestNumeric extends BaseArrayTestCase {
assertEquals(3.0, median(new ${pt.boxed}[]{(${pt.primitive})4,(${pt.primitive})2,(${pt.primitive})3,${pt.null},${pt.null}}));
assertEquals(3.5, median(new ${pt.boxed}[]{(${pt.primitive})4,(${pt.primitive})2,(${pt.primitive})3,(${pt.primitive})5,${pt.null},${pt.null}}));

<#if pt.valueType.isFloat >
assertEquals(Double.NaN, median(new ${pt.primitive}[]{4,2,3, ${pt.boxed}.NaN}));
assertEquals(3.0, median(new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
assertEquals(3.5, median(new ${pt.primitive}[]{4,2,3,5, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
</#if>

// check that functions can be resolved with varargs
assertEquals(3.0, median((${pt.primitive})4, (${pt.primitive})2, (${pt.primitive})3));
}
Expand Down Expand Up @@ -1188,6 +1194,16 @@ public class TestNumeric extends BaseArrayTestCase {
assertEquals((${pt.primitive})2, percentile(0.00, new ${pt.primitive}[]{4,2,3,${pt.null}}));
assertEquals((${pt.primitive})3, percentile(0.50, new ${pt.primitive}[]{4,2,3,${pt.null},${pt.null}}));
assertEquals((${pt.primitive})4, percentile(1.0, new ${pt.primitive}[]{4,2,3,${pt.null},${pt.null},${pt.null}}));

<#if pt.valueType.isFloat >
assertEquals(${pt.boxed}.NaN, percentile(1.0, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.NaN}));

assertEquals(${pt.boxed}.NEGATIVE_INFINITY, percentile(0.0, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
assertEquals((${pt.primitive})2, percentile(0.25, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
assertEquals((${pt.primitive})3, percentile(0.5, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
assertEquals((${pt.primitive})4, percentile(0.75, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
assertEquals(${pt.boxed}.POSITIVE_INFINITY, percentile(1.0, new ${pt.primitive}[]{4,2,3, ${pt.boxed}.POSITIVE_INFINITY, ${pt.null}, ${pt.null}, ${pt.boxed}.NEGATIVE_INFINITY}));
</#if>
}

public void test${pt.boxed}Wsum() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,13 @@ private boolean addChunk(CharChunk<? extends Values> values, long destination, i
final double variance = (newSum2 - (newSum * newSum / nonNullCount)) / (nonNullCount - 1);
resultColumn.set(destination, std ? Math.sqrt(variance) : variance);
}
} else if (nonNullCounter.getCountUnsafe(destination) == 1) {
resultColumn.set(destination, Double.NaN);
} else if (nonNullCounter.getCountUnsafe(destination) == 0) {
resultColumn.set(destination, NULL_DOUBLE);
} else {
final long nonNullCount = nonNullCounter.getCountUnsafe(destination);
if (nonNullCount == 0) {
resultColumn.set(destination, NULL_DOUBLE);
} else if (nonNullCount == 1) {
resultColumn.set(destination, Double.NaN);
}
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
//
package io.deephaven.engine.table.impl.by;

import io.deephaven.base.verify.Assert;
import io.deephaven.chunk.attributes.ChunkLengths;
import io.deephaven.chunk.attributes.ChunkPositions;
import io.deephaven.chunk.attributes.Values;
Expand Down Expand Up @@ -99,7 +100,8 @@ private boolean addChunk(FloatChunk<? extends Values> values, long destination,
sumSource.set(destination, newSum);
sum2Source.set(destination, newSum2);

if (forceNanResult || nonNullCount <= 1) {
Assert.neqZero(nonNullCount, "nonNullCount");
if (forceNanResult || nonNullCount == 1) {
resultColumn.set(destination, Double.NaN);
} else {
// If the sum or sumSquared has reached +/-Infinity, we are stuck with NaN forever.
Expand All @@ -111,15 +113,20 @@ private boolean addChunk(FloatChunk<? extends Values> values, long destination,
resultColumn.set(destination, std ? Math.sqrt(variance) : variance);
}
return true;
} else if (forceNanResult || (nonNullCounter.getCountUnsafe(destination) == 1)) {
} else if (forceNanResult) {
resultColumn.set(destination, Double.NaN);
return true;
} else if (nonNullCounter.getCountUnsafe(destination) == 0) {
resultColumn.set(destination, NULL_DOUBLE);
return true;
} else {
return false;
final long totalNormalCount = nonNullCounter.getCountUnsafe(destination);
if (totalNormalCount == 0) {
resultColumn.set(destination, NULL_DOUBLE);
return true;
} else if (totalNormalCount == 1) {
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
resultColumn.set(destination, Double.NaN);
return true;
}
}
return false;
}

private static double computeVariance(long nonNullCount, double newSum, double newSum2) {
Expand Down Expand Up @@ -175,10 +182,6 @@ private boolean removeChunk(FloatChunk<? extends Values> values, long destinatio
newSum = sumSource.getUnsafe(destination);
newSum2 = sum2Source.getUnsafe(destination);
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
}
if (totalNormalCount <= 1) {
resultColumn.set(destination, Double.NaN);
return true;
}

// If the sum has reached +/-Infinity, we are stuck with NaN forever.
if (Double.isInfinite(newSum) || Double.isInfinite(newSum2)) {
Expand Down
Loading