Skip to content

Commit

Permalink
[6.4][ML] Store calendar periodicity historical error statistics in c…
Browse files Browse the repository at this point in the history
…ompressed format (#137)

Backport of #127.
  • Loading branch information
tveasey committed Jul 2, 2018
1 parent 313a16b commit 2ec8910
Show file tree
Hide file tree
Showing 18 changed files with 1,132 additions and 773 deletions.
3 changes: 3 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ new processes being created and macOS uses the sandbox functionality ({pull}98[#
Fix a bug causing us to underestimate the memory used by shared pointers and reduce the memory consumed
by unnecessary reference counting ({pull}108[#108])

Reduce model memory by storing state for testing for predictive calendar features in a compressed format
({pull}127[#127])

=== Bug Fixes

Age seasonal components in proportion to the fraction of values with which they're updated ({pull}88[#88])
Expand Down
133 changes: 133 additions & 0 deletions include/maths/CCalendarCyclicTest.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/

#ifndef INCLUDED_ml_maths_CCalendarCyclicTest_h
#define INCLUDED_ml_maths_CCalendarCyclicTest_h

#include <core/CMemoryUsage.h>
#include <core/CoreTypes.h>

#include <maths/CCalendarFeature.h>
#include <maths/CQuantileSketch.h>
#include <maths/ImportExport.h>
#include <maths/MathsTypes.h>

#include <boost/optional.hpp>

#include <cstdint>
#include <vector>

namespace ml {
namespace core {
class CStatePersistInserter;
class CStateRestoreTraverser;
}
namespace maths {

//! \brief The basic idea of this test is to see if there is stronger
//! than expected temporal correlation between large prediction errors
//! and calendar features.
//!
//! DESCRIPTION:\n
//! This maintains prediction error statistics for a collection of
//! calendar features. These are things like "day of month",
//! ("day of week", "week of month") pairs and so on. The test checks
//! to see if the number of large prediction errors is statistically
//! high, i.e. are there many more errors exceeding a specified
//! percentile than one would expect given that this count is expected
//! to be binomially distributed. Amongst features with statistically
//! significant frequencies of large errors it returns the feature with
//! the highest mean prediction error.
//!
//! IMPLEMENTATION:\n
//! The historical per-bucket error statistics are held in a compressed
//! byte buffer (see m_CompressedBucketErrorStats) and are converted to
//! and from a vector of SErrorStats by inflate() and deflate().
class MATHS_EXPORT CCalendarCyclicTest {
public:
//! An optional calendar feature; this is the return type of test().
using TOptionalFeature = boost::optional<CCalendarFeature>;

public:
//! \param[in] decayRate Controls the rate at which the error counts
//! are aged. Defaults to 0.0.
explicit CCalendarCyclicTest(double decayRate = 0.0);

//! Initialize by reading state from \p traverser.
//!
//! \return A boolean status.
// NOTE(review): presumably true means the restore succeeded - confirm
// against the implementation.
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);

//! Persist state by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

//! Age the bucket values to account for an elapsed interval of
//! length \p time.
void propagateForwardsByTime(double time);

//! Add \p error at \p time.
//!
//! \param[in] time The time of the prediction error.
//! \param[in] error The prediction error.
//! \param[in] weight The weight of the error. Defaults to 1.0.
void add(core_t::TTime time, double error, double weight = 1.0);

//! Check if there are calendar components.
//!
//! \return An optional calendar feature. Per the class description
//! this is the feature with the highest mean prediction error amongst
//! those with a statistically significant frequency of large errors.
// NOTE(review): presumably the result is empty when no feature is
// significant - confirm against the implementation.
TOptionalFeature test() const;

//! Get a checksum for this object.
//!
//! \param[in] seed The checksum seed. Defaults to 0.
std::uint64_t checksum(std::uint64_t seed = 0) const;

//! Debug the memory used by this object.
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;

//! Get the memory used by this object.
std::size_t memoryUsage() const;

private:
//! A vector of times.
using TTimeVec = std::vector<core_t::TTime>;
//! A single byte of the compressed representation.
using TByte = unsigned char;
//! The byte buffer type of m_CompressedBucketErrorStats.
using TByteVec = std::vector<TByte>;

//! \brief Records the daily error statistics.
struct MATHS_EXPORT SErrorStats {
//! Get a checksum for this object.
std::uint64_t checksum() const;
//! Convert to a delimited string.
// NOTE(review): std::string is used here but this header does not
// include <string> directly; presumably it arrives transitively via
// one of the project headers - consider including it explicitly.
std::string toDelimited() const;
//! Initialize from a delimited string.
//!
//! \return A boolean status.
// NOTE(review): presumably true means \p str was parsed
// successfully - confirm against the implementation.
bool fromDelimited(const std::string& str);

// NOTE(review): the meanings below are inferred from the member names
// only - confirm against the implementation.
//! Presumably the count of errors added to the bucket.
std::uint32_t s_Count = 0;
//! Presumably the count of large errors added to the bucket.
std::uint32_t s_LargeErrorCount = 0;
//! Presumably the sum of the large errors added to the bucket.
CFloatStorage s_LargeErrorSum = 0.0;
};
//! A vector of error statistics; the uncompressed form which is
//! consumed by deflate() and produced by inflate().
using TErrorStatsVec = std::vector<SErrorStats>;

private:
//! Winsorise \p error.
double winsorise(double error) const;

//! Get the significance of \p x large errors given \p n samples.
double significance(double n, double x) const;

//! Convert \p stats to a compressed representation.
// NOTE(review): non-const and returns nothing, so presumably this
// overwrites m_CompressedBucketErrorStats - confirm.
void deflate(const TErrorStatsVec& stats);

//! Extract the error statistics from the compressed representation.
TErrorStatsVec inflate() const;

private:
//! The rate at which the error counts are aged.
double m_DecayRate;

//! Used to estimate large error thresholds.
CQuantileSketch m_ErrorQuantiles;

//! The start time of the bucket to which the last error
//! was added.
core_t::TTime m_CurrentBucketTime;

//! The start time of the earliest bucket for which we have
//! error statistics.
// NOTE(review): the member is named as an index but is documented as
// a start time and typed core_t::TTime - confirm which is intended
// and align the name or the comment.
core_t::TTime m_CurrentBucketIndex;

//! The bucket statistics currently being updated.
SErrorStats m_CurrentBucketErrorStats;

//! The compressed error statistics.
//!
//! \note We always persist the errors in uncompressed format.
TByteVec m_CompressedBucketErrorStats;
};
}
}

#endif // INCLUDED_ml_maths_CCalendarCyclicTest_h
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,31 @@
* you may not use this file except in compliance with the Elastic License.
*/

#ifndef INCLUDED_ml_maths_CTrendTests_h
#define INCLUDED_ml_maths_CTrendTests_h
#ifndef INCLUDED_ml_maths_CRandomizedPeriodicityTest_h
#define INCLUDED_ml_maths_CRandomizedPeriodicityTest_h

#include <core/AtomicTypes.h>
#include <core/CMutex.h>
#include <core/CVectorRange.h>
#include <core/CoreTypes.h>

#include <maths/CBasicStatistics.h>
#include <maths/CCalendarFeature.h>
#include <maths/CLinearAlgebra.h>
#include <maths/CPRNG.h>
#include <maths/CQuantileSketch.h>
#include <maths/CRegression.h>
#include <maths/ImportExport.h>
#include <maths/MathsTypes.h>

#include <cstddef>
#include <vector>

#include <boost/circular_buffer.hpp>
#include <boost/container/flat_map.hpp>
#include <boost/random/mersenne_twister.hpp>

#include <stdint.h>
#include <cstddef>
#include <cstdint>
#include <vector>

class CTrendTestsTest;
class CRandomizedPeriodicityTestTest;

namespace ml {
namespace core {
class CStatePersistInserter;
class CStateRestoreTraverser;
}
namespace maths {
class CSeasonalTime;

Expand Down Expand Up @@ -93,7 +89,7 @@ class MATHS_EXPORT CRandomizedPeriodicityTest {
static void reset();

//! Get a checksum for this object.
uint64_t checksum(uint64_t seed = 0) const;
std::uint64_t checksum(std::uint64_t seed = 0) const;

private:
using TDoubleVec = std::vector<double>;
Expand Down Expand Up @@ -155,97 +151,9 @@ class MATHS_EXPORT CRandomizedPeriodicityTest {
//! The last time the week projections were updated.
core_t::TTime m_WeekRefreshedProjections;

friend class ::CTrendTestsTest;
};

//! \brief The basic idea of this test is to see if there is stronger
//! than expected temporal correlation between large prediction errors
//! and calendar features.
//!
//! DESCRIPTION:\n
//! This maintains prediction error statistics for a collection of
//! calendar features. These are things like "day of month",
//! ("day of week", "week month") pairs and so on. The test checks to
//! see if the number of large prediction errors is statistically high,
//! i.e. are there many more errors exceeding a specified percentile
//! than one would expect given that this is expected to be binomial.
//! Amongst features with statistically significant frequencies of large
//! errors it returns the feature with the highest mean prediction error.
class MATHS_EXPORT CCalendarCyclicTest {
public:
using TOptionalFeature = boost::optional<CCalendarFeature>;

public:
explicit CCalendarCyclicTest(double decayRate = 0.0);

//! Initialize by reading state from \p traverser.
bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser);

//! Persist state by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter& inserter) const;

//! Age the bucket values to account for \p time elapsed time.
void propagateForwardsByTime(double time);

//! Add \p error at \p time.
void add(core_t::TTime time, double error, double weight = 1.0);

//! Check if there are calendar components.
TOptionalFeature test() const;

//! Get a checksum for this object.
uint64_t checksum(uint64_t seed = 0) const;

//! Debug the memory used by this object.
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;

//! Get the memory used by this object.
std::size_t memoryUsage() const;

private:
using TTimeVec = std::vector<core_t::TTime>;
using TUInt32CBuf = boost::circular_buffer<uint32_t>;
using TTimeFloatPr = std::pair<core_t::TTime, CFloatStorage>;
using TTimeFloatFMap = boost::container::flat_map<core_t::TTime, CFloatStorage>;

private:
//! Winsorise \p error.
double winsorise(double error) const;

//! Get the significance of \p x large errors given \p n samples.
double significance(double n, double x) const;

private:
//! The error bucketing interval.
static const core_t::TTime BUCKET;
//! The window length in buckets.
static const core_t::TTime WINDOW;
//! The percentile of a large error.
static const double LARGE_ERROR_PERCENTILE;
//! The minimum number of repeats for a testable feature.
static const unsigned int MINIMUM_REPEATS;
//! The bits used to count added values.
static const uint32_t COUNT_BITS;
//! The offsets that are used for different timezone offsets.
static const TTimeVec TIMEZONE_OFFSETS;

private:
//! The rate at which the error counts are aged.
double m_DecayRate;

//! The time of the last error added.
core_t::TTime m_Bucket;

//! Used to estimate large error thresholds.
CQuantileSketch m_ErrorQuantiles;

//! The counts of errors and large errors in a sliding window.
TUInt32CBuf m_ErrorCounts;

//! The bucket large error sums.
TTimeFloatFMap m_ErrorSums;
friend class ::CRandomizedPeriodicityTestTest;
};
}
}

#endif // INCLUDED_ml_maths_CTrendTests_h
#endif // INCLUDED_ml_maths_CRandomizedPeriodicityTest_h
2 changes: 1 addition & 1 deletion include/maths/CTimeSeriesDecompositionDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
#include <core/CoreTypes.h>

#include <maths/CCalendarComponent.h>
#include <maths/CCalendarCyclicTest.h>
#include <maths/CExpandingWindow.h>
#include <maths/CPeriodicityHypothesisTests.h>
#include <maths/CSeasonalComponent.h>
#include <maths/CSeasonalTime.h>
#include <maths/CTimeSeriesDecompositionInterface.h>
#include <maths/CTrendComponent.h>
#include <maths/CTrendTests.h>
#include <maths/ImportExport.h>

#include <boost/ref.hpp>
Expand Down
12 changes: 6 additions & 6 deletions lib/api/unittest/CAnomalyJobLimitTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%");
reportProgress += 0.1;
}
for (std::size_t i = 0; i < 1000; ++i) {
for (std::size_t i = 0; i < 900; ++i) {
rng.generateUniformSamples(0, generators.size(), 1, generator);
TOptionalDouble value{generators[generator[0]](time)};
if (value) {
Expand All @@ -392,10 +392,10 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
LOG_DEBUG(<< "Memory status = " << used.s_MemoryStatus);
LOG_DEBUG(<< "Memory usage bytes = " << used.s_Usage);
LOG_DEBUG(<< "Memory limit bytes = " << memoryLimit * 1024 * 1024);
CPPUNIT_ASSERT(used.s_ByFields > 600 && used.s_ByFields < 800);
CPPUNIT_ASSERT(used.s_ByFields > 650 && used.s_ByFields < 850);
CPPUNIT_ASSERT_EQUAL(std::size_t(2), used.s_PartitionFields);
CPPUNIT_ASSERT_DOUBLES_EQUAL(memoryLimit * 1024 * 1024 / 2, used.s_Usage,
memoryLimit * 1024 * 1024 / 40); // Within 5%.
memoryLimit * 1024 * 1024 / 33); // Within 6%.
}

LOG_DEBUG(<< "**** Test partition ****");
Expand All @@ -421,7 +421,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%");
reportProgress += 0.1;
}
for (std::size_t i = 0; i < 600; ++i) {
for (std::size_t i = 0; i < 500; ++i) {
rng.generateUniformSamples(0, generators.size(), 1, generator);
TOptionalDouble value{generators[generator[0]](time)};
if (value) {
Expand All @@ -438,7 +438,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
LOG_DEBUG(<< "# partition = " << used.s_PartitionFields);
LOG_DEBUG(<< "Memory status = " << used.s_MemoryStatus);
LOG_DEBUG(<< "Memory usage = " << used.s_Usage);
CPPUNIT_ASSERT(used.s_PartitionFields > 350 && used.s_PartitionFields < 450);
CPPUNIT_ASSERT(used.s_PartitionFields > 370 && used.s_PartitionFields < 470);
CPPUNIT_ASSERT(static_cast<double>(used.s_ByFields) >
0.97 * static_cast<double>(used.s_PartitionFields));
CPPUNIT_ASSERT_DOUBLES_EQUAL(memoryLimit * 1024 * 1024 / 2, used.s_Usage,
Expand Down Expand Up @@ -468,7 +468,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() {
LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%");
reportProgress += 0.1;
}
for (std::size_t i = 0; i < 12000; ++i) {
for (std::size_t i = 0; i < 9000; ++i) {
TOptionalDouble value{sparse(time)};
if (value) {
dataRows["time"] = core::CStringUtils::typeToString(time);
Expand Down
Loading

0 comments on commit 2ec8910

Please sign in to comment.