Skip to content

Commit

Permalink
[ML] fix segfault caused by too few outliers and harden container usage (
Browse files Browse the repository at this point in the history
#96)

Do not re-weight outliers if there is just 1, preventing a crash downstream,
and harden accumulators to prevent empty containers.

fixes #94
  • Loading branch information
Hendrik Muhs committed May 17, 2018
1 parent 27e07a5 commit 18ebdd6
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 25 deletions.
13 changes: 12 additions & 1 deletion include/maths/CBasicStatistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,9 @@ class MATHS_EXPORT CBasicStatistics {
class COrderStatisticsStack
: public COrderStatisticsImpl<T, boost::array<T, N>, LESS>,
private boost::addable<COrderStatisticsStack<T, N, LESS>> {

static_assert(N > 0, "N must be > 0");

private:
using TArray = boost::array<T, N>;
using TImpl = COrderStatisticsImpl<T, TArray, LESS>;
Expand Down Expand Up @@ -1327,10 +1330,18 @@ class MATHS_EXPORT CBasicStatistics {

public:
explicit COrderStatisticsHeap(std::size_t n, const LESS& less = LESS{})
: TImpl{std::vector<T>(n, T{}), less} {}
: TImpl{std::vector<T>(std::max(n, std::size_t(1)), T{}), less} {
if (n == 0) {
LOG_ERROR(<< "Invalid size of 0 for order statistics accumulator");
}
}

//! Clear the accumulator and set the number of statistics to gather to \p n.
//!
//! \note A request for zero statistics is logged as an error and a size
//! of one is used instead, so the underlying container is never empty.
void resize(std::size_t n) {
    const bool invalid{n == 0};
    if (invalid) {
        LOG_ERROR(<< "Invalid resize to 0 for order statistics accumulator");
    }
    // Fall back to a single slot when the caller asked for none.
    const std::size_t size{invalid ? std::size_t{1} : n};
    this->clear();
    this->statistics().resize(size);
}
Expand Down
52 changes: 28 additions & 24 deletions lib/maths/CTimeSeriesDecompositionDetail.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1596,30 +1596,34 @@ void CTimeSeriesDecompositionDetail::CComponents::reweightOutliers(
})};
double numberOutliers{SEASONAL_OUTLIER_FRACTION * numberValues};

TMinAccumulator outliers{static_cast<std::size_t>(2.0 * numberOutliers)};
TMeanAccumulator meanDifference;
core_t::TTime time = startTime + dt / 2;
for (std::size_t i = 0; i < values.size(); ++i, time += dt) {
if (CBasicStatistics::count(values[i]) > 0.0) {
double difference{std::fabs(CBasicStatistics::mean(values[i]) - predictor(time))};
outliers.add({-difference, i});
meanDifference.add(difference);
}
}
outliers.sort();
TMeanAccumulator meanDifferenceOfOutliers;
for (std::size_t i = 0u; i < static_cast<std::size_t>(numberOutliers); ++i) {
meanDifferenceOfOutliers.add(-outliers[i].first);
}
meanDifference -= meanDifferenceOfOutliers;
for (std::size_t i = 0; i < outliers.count(); ++i) {
if (-outliers[i].first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
double weight{SEASONAL_OUTLIER_WEIGHT +
(1.0 - SEASONAL_OUTLIER_WEIGHT) *
CTools::logisticFunction(static_cast<double>(i) / numberOutliers,
0.1, 1.0)};
CBasicStatistics::count(values[outliers[i].second]) *= weight;
if (numberOutliers > 1.0) {

TMinAccumulator outliers{static_cast<std::size_t>(2.0 * numberOutliers)};
TMeanAccumulator meanDifference;
core_t::TTime time = startTime + dt / 2;
for (std::size_t i = 0; i < values.size(); ++i, time += dt) {
if (CBasicStatistics::count(values[i]) > 0.0) {
double difference{
std::fabs(CBasicStatistics::mean(values[i]) - predictor(time))};
outliers.add({-difference, i});
meanDifference.add(difference);
}
}
outliers.sort();
TMeanAccumulator meanDifferenceOfOutliers;
for (std::size_t i = 0u; i < static_cast<std::size_t>(numberOutliers); ++i) {
meanDifferenceOfOutliers.add(-outliers[i].first);
}
meanDifference -= meanDifferenceOfOutliers;
for (std::size_t i = 0; i < outliers.count(); ++i) {
if (-outliers[i].first > SEASONAL_OUTLIER_DIFFERENCE_THRESHOLD *
CBasicStatistics::mean(meanDifference)) {
double weight{SEASONAL_OUTLIER_WEIGHT +
(1.0 - SEASONAL_OUTLIER_WEIGHT) *
CTools::logisticFunction(static_cast<double>(i) / numberOutliers,
0.1, 1.0)};
CBasicStatistics::count(values[outliers[i].second]) *= weight;
}
}
}
}
Expand Down

0 comments on commit 18ebdd6

Please sign in to comment.