Skip to content

Commit

Permalink
[ML] Display integers without .0 in file structure field stats (#33947)
Browse files Browse the repository at this point in the history
Previously numeric values in the field_stats created by the
find_file_structure endpoint were always output with a
decimal point.  This looked unfriendly and unnatural for
fields that clearly store integer values.  This change
converts integer values to type Integer before output in
the file structure field stats.
  • Loading branch information
droberts195 authored and kcm committed Oct 30, 2018
1 parent ecfeaca commit 2122632
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 35 deletions.
24 changes: 12 additions & 12 deletions docs/reference/ml/apis/find-file-structure.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -365,49 +365,49 @@ If the request does not encounter errors, you receive the following result:
"page_count" : {
"count" : 24,
"cardinality" : 24,
"min_value" : 180.0,
"max_value" : 768.0,
"min_value" : 180,
"max_value" : 768,
"mean_value" : 387.0833333333333,
"median_value" : 329.5,
"top_hits" : [
{
"value" : 180.0,
"value" : 180,
"count" : 1
},
{
"value" : 208.0,
"value" : 208,
"count" : 1
},
{
"value" : 224.0,
"value" : 224,
"count" : 1
},
{
"value" : 227.0,
"value" : 227,
"count" : 1
},
{
"value" : 268.0,
"value" : 268,
"count" : 1
},
{
"value" : 271.0,
"value" : 271,
"count" : 1
},
{
"value" : 275.0,
"value" : 275,
"count" : 1
},
{
"value" : 288.0,
"value" : 288,
"count" : 1
},
{
"value" : 304.0,
"value" : 304,
"count" : 1
},
{
"value" : 311.0,
"value" : 311,
"count" : 1
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,16 +123,16 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field(COUNT.getPreferredName(), count);
builder.field(CARDINALITY.getPreferredName(), cardinality);
if (minValue != null) {
builder.field(MIN_VALUE.getPreferredName(), minValue);
builder.field(MIN_VALUE.getPreferredName(), toIntegerIfInteger(minValue));
}
if (maxValue != null) {
builder.field(MAX_VALUE.getPreferredName(), maxValue);
builder.field(MAX_VALUE.getPreferredName(), toIntegerIfInteger(maxValue));
}
if (meanValue != null) {
builder.field(MEAN_VALUE.getPreferredName(), meanValue);
builder.field(MEAN_VALUE.getPreferredName(), toIntegerIfInteger(meanValue));
}
if (medianValue != null) {
builder.field(MEDIAN_VALUE.getPreferredName(), medianValue);
builder.field(MEDIAN_VALUE.getPreferredName(), toIntegerIfInteger(medianValue));
}
if (topHits.isEmpty() == false) {
builder.field(TOP_HITS.getPreferredName(), topHits);
Expand All @@ -142,6 +142,15 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}

public static Number toIntegerIfInteger(double d) {

if (d >= Integer.MIN_VALUE && d <= Integer.MAX_VALUE && Double.compare(d, StrictMath.rint(d)) == 0) {
return (int) d;
}

return d;
}

@Override
public int hashCode() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ static FieldStats createTestFieldStats() {
Double medianValue = null;
boolean isMetric = randomBoolean();
if (isMetric) {
minValue = randomDouble();
maxValue = randomDouble();
if (randomBoolean()) {
minValue = randomDouble();
maxValue = randomDouble();
} else {
minValue = (double) randomInt();
maxValue = (double) randomInt();
}
meanValue = randomDouble();
medianValue = randomDouble();
}
Expand All @@ -42,7 +47,7 @@ static FieldStats createTestFieldStats() {
for (int i = 0; i < Math.min(10, cardinality); ++i) {
Map<String, Object> topHit = new LinkedHashMap<>();
if (isMetric) {
topHit.put("value", randomDouble());
topHit.put("value", randomBoolean() ? randomDouble() : (double) randomInt());
} else {
topHit.put("value", randomAlphaOfLength(20));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -152,18 +153,20 @@ Double calculateMedian() {

List<Map<String, Object>> findNumericTopHits(int numTopHits) {
assert countsByNumericValue != null;
return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry<Double, Integer>::getKey));
return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry<Double, Integer>::getKey),
FieldStats::toIntegerIfInteger);
}

List<Map<String, Object>> findStringTopHits(int numTopHits) {
return findTopHits(numTopHits, countsByStringValue, Comparator.comparing(Map.Entry<String, Integer>::getKey));
return findTopHits(numTopHits, countsByStringValue, Comparator.comparing(Map.Entry<String, Integer>::getKey), s -> s);
}

/**
* Order by descending count, with a secondary sort to ensure reproducibility of results.
*/
private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T, Integer> countsByValue,
Comparator<Map.Entry<T, Integer>> secondarySort) {
Comparator<Map.Entry<T, Integer>> secondarySort,
Function<T, Object> outputMapper) {

List<Map.Entry<T, Integer>> sortedByCount = countsByValue.entrySet().stream()
.sorted(Comparator.comparing(Map.Entry<T, Integer>::getValue, Comparator.reverseOrder()).thenComparing(secondarySort))
Expand All @@ -174,7 +177,7 @@ private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T,
for (Map.Entry<T, Integer> entry : sortedByCount) {

Map<String, Object> topHit = new LinkedHashMap<>(3);
topHit.put("value", entry.getKey());
topHit.put("value", outputMapper.apply(entry.getKey()));
topHit.put("count", entry.getValue());
topHits.add(topHit);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,16 @@ public void testTopHitsNumeric() {

FieldStatsCalculator calculator = new FieldStatsCalculator();

calculator.accept(Arrays.asList("4", "4", "7", "4", "6", "5", "6", "5", "16", "4", "5"));
calculator.accept(Arrays.asList("4", "4", "7", "4", "6", "5.2", "6", "5.2", "16", "4", "5.2"));

List<Map<String, Object>> topHits = calculator.findNumericTopHits(3);

assertEquals(3, topHits.size());
assertEquals(4.0, topHits.get(0).get("value"));
assertEquals(4, topHits.get(0).get("value"));
assertEquals(4, topHits.get(0).get("count"));
assertEquals(5.0, topHits.get(1).get("value"));
assertEquals(5.2, topHits.get(1).get("value"));
assertEquals(3, topHits.get(1).get("count"));
assertEquals(6.0, topHits.get(2).get("value"));
assertEquals(6, topHits.get(2).get("value"));
assertEquals(2, topHits.get(2).get("count"));
}

Expand Down Expand Up @@ -124,25 +124,25 @@ public void testCalculateGivenNumericField() {

FieldStatsCalculator calculator = new FieldStatsCalculator();

calculator.accept(Arrays.asList("4", "4", "7", "4", "6", "5", "6", "5", "16", "4", "5"));
calculator.accept(Arrays.asList("4.5", "4.5", "7", "4.5", "6", "5", "6", "5", "25", "4.5", "5"));

FieldStats stats = calculator.calculate(3);

assertEquals(11L, stats.getCount());
assertEquals(5, stats.getCardinality());
assertEquals(4.0, stats.getMinValue(), 1e-10);
assertEquals(16.0, stats.getMaxValue(), 1e-10);
assertEquals(6.0, stats.getMeanValue(), 1e-10);
assertEquals(4.5, stats.getMinValue(), 1e-10);
assertEquals(25.0, stats.getMaxValue(), 1e-10);
assertEquals(7.0, stats.getMeanValue(), 1e-10);
assertEquals(5.0, stats.getMedianValue(), 1e-10);

List<Map<String, Object>> topHits = stats.getTopHits();

assertEquals(3, topHits.size());
assertEquals(4.0, topHits.get(0).get("value"));
assertEquals(4.5, topHits.get(0).get("value"));
assertEquals(4, topHits.get(0).get("count"));
assertEquals(5.0, topHits.get(1).get("value"));
assertEquals(5, topHits.get(1).get("value"));
assertEquals(3, topHits.get(1).get("count"));
assertEquals(6.0, topHits.get(2).get("value"));
assertEquals(6, topHits.get(2).get("value"));
assertEquals(2, topHits.get(2).get("count"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ public void testGuessMappingsAndCalculateFieldStats() {
assertEquals(3, fieldStats.size());
assertEquals(new FieldStats(2, 2, makeTopHits("not a time", 1, "whatever", 1)), fieldStats.get("foo"));
assertEquals(new FieldStats(2, 2, makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time"));
assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17.0, 1, 42.0, 1)), fieldStats.get("bar"));
assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17, 1, 42, 1)), fieldStats.get("bar"));
assertNull(fieldStats.get("nothing"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
- match: { field_stats.airline.cardinality: 2 }
- match: { field_stats.responsetime.count: 3 }
- match: { field_stats.responsetime.cardinality: 3 }
- match: { field_stats.responsetime.min_value: 132.2046 }
- match: { field_stats.responsetime.max_value: 990.4628 }
# Not asserting on field_stats.responsetime.mean as it's a recurring decimal
# so its representation in the response could cause spurious failures
- match: { field_stats.responsetime.median_value: 134.2046 }
- match: { field_stats.sourcetype.count: 3 }
- match: { field_stats.sourcetype.cardinality: 1 }
- match: { field_stats.time.count: 3 }
Expand Down Expand Up @@ -89,6 +94,11 @@
- match: { field_stats.airline.cardinality: 2 }
- match: { field_stats.responsetime.count: 3 }
- match: { field_stats.responsetime.cardinality: 3 }
- match: { field_stats.responsetime.min_value: 132.2046 }
- match: { field_stats.responsetime.max_value: 990.4628 }
# Not asserting on field_stats.responsetime.mean as it's a recurring decimal
# so its representation in the response could cause spurious failures
- match: { field_stats.responsetime.median_value: 134.2046 }
- match: { field_stats.sourcetype.count: 3 }
- match: { field_stats.sourcetype.cardinality: 1 }
- match: { field_stats.time.count: 3 }
Expand Down

0 comments on commit 2122632

Please sign in to comment.