From 2ec8910627f22a16179f43aa4db781a33a0d7031 Mon Sep 17 00:00:00 2001 From: Tom Veasey Date: Mon, 2 Jul 2018 15:08:05 +0100 Subject: [PATCH] [6.4][ML] Store calendar periodicity historical error statistics in compressed format (#137) Backport of #127. --- docs/CHANGELOG.asciidoc | 3 + include/maths/CCalendarCyclicTest.h | 133 ++++++ ...ndTests.h => CRandomizedPeriodicityTest.h} | 118 +----- .../maths/CTimeSeriesDecompositionDetail.h | 2 +- lib/api/unittest/CAnomalyJobLimitTest.cc | 12 +- lib/maths/CCalendarCyclicTest.cc | 329 ++++++++++++++ ...Tests.cc => CRandomizedPeriodicityTest.cc} | 251 +---------- lib/maths/Makefile | 5 +- lib/maths/unittest/CCalendarCyclicTestTest.cc | 384 +++++++++++++++++ lib/maths/unittest/CCalendarCyclicTestTest.h | 21 + .../CRandomizedPeriodicityTestTest.cc | 204 +++++++++ ...est.h => CRandomizedPeriodicityTestTest.h} | 11 +- .../unittest/CTimeSeriesDecompositionTest.cc | 4 +- lib/maths/unittest/CTimeSeriesModelTest.cc | 2 +- lib/maths/unittest/CTrendTestsTest.cc | 400 ------------------ lib/maths/unittest/Main.cc | 8 +- lib/maths/unittest/Makefile | 5 +- lib/model/CAnomalyDetector.cc | 13 +- 18 files changed, 1132 insertions(+), 773 deletions(-) create mode 100644 include/maths/CCalendarCyclicTest.h rename include/maths/{CTrendTests.h => CRandomizedPeriodicityTest.h} (59%) create mode 100644 lib/maths/CCalendarCyclicTest.cc rename lib/maths/{CTrendTests.cc => CRandomizedPeriodicityTest.cc} (61%) create mode 100644 lib/maths/unittest/CCalendarCyclicTestTest.cc create mode 100644 lib/maths/unittest/CCalendarCyclicTestTest.h create mode 100644 lib/maths/unittest/CRandomizedPeriodicityTestTest.cc rename lib/maths/unittest/{CTrendTestsTest.h => CRandomizedPeriodicityTestTest.h} (59%) delete mode 100644 lib/maths/unittest/CTrendTestsTest.cc diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 9ef680ccc0..1c5f00679f 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -36,6 +36,9 @@ new processes being created and macOS uses the sandbox functionality ({pull}98[# Fix a bug causing us to under estimate the memory used by shared pointers and reduce the memory consumed by unnecessary reference counting ({pull}108[#108]) +Reduce model memory by storing state for testing for predictive calendar features in a compressed format +({pull}127[#127]) + === Bug Fixes Age seasonal components in proportion to the fraction of values with which they're updated ({pull}88[#88]) diff --git a/include/maths/CCalendarCyclicTest.h b/include/maths/CCalendarCyclicTest.h new file mode 100644 index 0000000000..c16186e73a --- /dev/null +++ b/include/maths/CCalendarCyclicTest.h @@ -0,0 +1,133 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_maths_CCalendarCyclicTest_h +#define INCLUDED_ml_maths_CCalendarCyclicTest_h + +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +namespace ml { +namespace core { +class CStatePersistInserter; +class CStateRestoreTraverser; +} +namespace maths { + +//! \brief The basic idea of this test is to see if there is stronger +//! than expected temporal correlation between large prediction errors +//! and calendar features. +//! +//! DESCRIPTION:\n +//! This maintains prediction error statistics for a collection of +//! calendar features. These are things like "day of month", +//! ("day of week", "week month") pairs and so on. The test checks to +//! see if the number of large prediction errors is statistically high, +//! i.e. are there many more errors exceeding a specified percentile +//! than one would expect given that this is expected to be binomial. +//! Amongst features with statistically significant frequencies of large +//! errors it returns the feature with the highest mean prediction error. +class MATHS_EXPORT CCalendarCyclicTest { +public: + using TOptionalFeature = boost::optional; + +public: + explicit CCalendarCyclicTest(double decayRate = 0.0); + + //! Initialize by reading state from \p traverser. + bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); + + //! Persist state by passing information to \p inserter. + void acceptPersistInserter(core::CStatePersistInserter& inserter) const; + + //! Age the bucket values to account for \p time elapsed time. + void propagateForwardsByTime(double time); + + //! Add \p error at \p time. + void add(core_t::TTime time, double error, double weight = 1.0); + + //! Check if there are calendar components. + TOptionalFeature test() const; + + //! Get a checksum for this object. + std::uint64_t checksum(std::uint64_t seed = 0) const; + + //! Debug the memory used by this object. + void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; + + //! Get the memory used by this object. + std::size_t memoryUsage() const; + +private: + using TTimeVec = std::vector; + using TByte = unsigned char; + using TByteVec = std::vector; + + //! \brief Records the daily error statistics. + struct MATHS_EXPORT SErrorStats { + //! Get a checksum for this object. + std::uint64_t checksum() const; + //! Convert to a delimited string. + std::string toDelimited() const; + //! Initialize from a delimited string. + bool fromDelimited(const std::string& str); + + std::uint32_t s_Count = 0; + std::uint32_t s_LargeErrorCount = 0; + CFloatStorage s_LargeErrorSum = 0.0; + }; + using TErrorStatsVec = std::vector; + +private: + //! Winsorise \p error. + double winsorise(double error) const; + + //! Get the significance of \p x large errors given \p n samples. + double significance(double n, double x) const; + + //! Convert to a compressed representation. + void deflate(const TErrorStatsVec& stats); + + //! Extract from the compressed representation. + TErrorStatsVec inflate() const; + +private: + //! The rate at which the error counts are aged. + double m_DecayRate; + + //! Used to estimate large error thresholds. + CQuantileSketch m_ErrorQuantiles; + + //! The start time of the bucket to which the last error + //! was added. + core_t::TTime m_CurrentBucketTime; + + //! The start time of the earliest bucket for which we have + //! error statistics. + core_t::TTime m_CurrentBucketIndex; + + //! The bucket statistics currently being updated. + SErrorStats m_CurrentBucketErrorStats; + + //! The compressed error statistics. + //! + //! \note We always persist the errors in uncompressed format. + TByteVec m_CompressedBucketErrorStats; +}; +} +} + +#endif // INCLUDED_ml_maths_CCalendarCyclicTest_h diff --git a/include/maths/CTrendTests.h b/include/maths/CRandomizedPeriodicityTest.h similarity index 59% rename from include/maths/CTrendTests.h rename to include/maths/CRandomizedPeriodicityTest.h index 9ba7199858..2b409d0cd6 100644 --- a/include/maths/CTrendTests.h +++ b/include/maths/CRandomizedPeriodicityTest.h @@ -4,35 +4,31 @@ * you may not use this file except in compliance with the Elastic License. */ -#ifndef INCLUDED_ml_maths_CTrendTests_h -#define INCLUDED_ml_maths_CTrendTests_h +#ifndef INCLUDED_ml_maths_CRandomizedPeriodicityTest_h +#define INCLUDED_ml_maths_CRandomizedPeriodicityTest_h #include #include -#include #include #include -#include #include #include -#include -#include #include -#include -#include -#include - -#include -#include #include -#include +#include +#include +#include -class CTrendTestsTest; +class CRandomizedPeriodicityTestTest; namespace ml { +namespace core { +class CStatePersistInserter; +class CStateRestoreTraverser; +} namespace maths { class CSeasonalTime; @@ -93,7 +89,7 @@ class MATHS_EXPORT CRandomizedPeriodicityTest { static void reset(); //! Get a checksum for this object. - uint64_t checksum(uint64_t seed = 0) const; + std::uint64_t checksum(std::uint64_t seed = 0) const; private: using TDoubleVec = std::vector; @@ -155,97 +151,9 @@ class MATHS_EXPORT CRandomizedPeriodicityTest { //! The last time the day projections were updated. core_t::TTime m_WeekRefreshedProjections; - friend class ::CTrendTestsTest; -}; - -//! \brief The basic idea of this test is to see if there is stronger -//! than expected temporal correlation between large prediction errors -//! and calendar features. -//! -//! DESCRIPTION:\n -//! This maintains prediction error statistics for a collection of -//! calendar features. These are things like "day of month", -//! ("day of week", "week month") pairs and so on. The test checks to -//! see if the number of large prediction errors is statistically high, -//! i.e. are there many more errors exceeding a specified percentile -//! than one would expect given that this is expected to be binomial. -//! Amongst features with statistically significant frequencies of large -//! errors it returns the feature with the highest mean prediction error. -class MATHS_EXPORT CCalendarCyclicTest { -public: - using TOptionalFeature = boost::optional; - -public: - explicit CCalendarCyclicTest(double decayRate = 0.0); - - //! Initialize by reading state from \p traverser. - bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); - - //! Persist state by passing information to \p inserter. - void acceptPersistInserter(core::CStatePersistInserter& inserter) const; - - //! Age the bucket values to account for \p time elapsed time. - void propagateForwardsByTime(double time); - - //! Add \p error at \p time. - void add(core_t::TTime time, double error, double weight = 1.0); - - //! Check if there are calendar components. - TOptionalFeature test() const; - - //! Get a checksum for this object. - uint64_t checksum(uint64_t seed = 0) const; - - //! Debug the memory used by this object. - void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const; - - //! Get the memory used by this object. - std::size_t memoryUsage() const; - -private: - using TTimeVec = std::vector; - using TUInt32CBuf = boost::circular_buffer; - using TTimeFloatPr = std::pair; - using TTimeFloatFMap = boost::container::flat_map; - -private: - //! Winsorise \p error. - double winsorise(double error) const; - - //! Get the significance of \p x large errors given \p n samples. - double significance(double n, double x) const; - -private: - //! The error bucketing interval. - static const core_t::TTime BUCKET; - //! The window length in buckets. - static const core_t::TTime WINDOW; - //! The percentile of a large error. - static const double LARGE_ERROR_PERCENTILE; - //! The minimum number of repeats for a testable feature. - static const unsigned int MINIMUM_REPEATS; - //! The bits used to count added values. - static const uint32_t COUNT_BITS; - //! The offsets that are used for different timezone offsets. - static const TTimeVec TIMEZONE_OFFSETS; - -private: - //! The rate at which the error counts are aged. - double m_DecayRate; - - //! The time of the last error added. - core_t::TTime m_Bucket; - - //! Used to estimate large error thresholds. - CQuantileSketch m_ErrorQuantiles; - - //! The counts of errors and large errors in a sliding window. - TUInt32CBuf m_ErrorCounts; - - //! The bucket large error sums. - TTimeFloatFMap m_ErrorSums; + friend class ::CRandomizedPeriodicityTestTest; }; } } -#endif // INCLUDED_ml_maths_CTrendTests_h +#endif // INCLUDED_ml_maths_CRandomizedPeriodicityTest_h diff --git a/include/maths/CTimeSeriesDecompositionDetail.h b/include/maths/CTimeSeriesDecompositionDetail.h index 81a7bdb50b..b40072cf0d 100644 --- a/include/maths/CTimeSeriesDecompositionDetail.h +++ b/include/maths/CTimeSeriesDecompositionDetail.h @@ -12,13 +12,13 @@ #include #include +#include #include #include #include #include #include #include -#include #include #include diff --git a/lib/api/unittest/CAnomalyJobLimitTest.cc b/lib/api/unittest/CAnomalyJobLimitTest.cc index a4735f531a..b296762889 100644 --- a/lib/api/unittest/CAnomalyJobLimitTest.cc +++ b/lib/api/unittest/CAnomalyJobLimitTest.cc @@ -374,7 +374,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() { LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%"); reportProgress += 0.1; } - for (std::size_t i = 0; i < 1000; ++i) { + for (std::size_t i = 0; i < 900; ++i) { rng.generateUniformSamples(0, generators.size(), 1, generator); TOptionalDouble value{generators[generator[0]](time)}; if (value) { @@ -392,10 +392,10 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() { LOG_DEBUG(<< "Memory status = " << used.s_MemoryStatus); LOG_DEBUG(<< "Memory usage bytes = " << used.s_Usage); LOG_DEBUG(<< "Memory limit bytes = " << memoryLimit * 1024 * 1024); - CPPUNIT_ASSERT(used.s_ByFields > 600 && used.s_ByFields < 800); + CPPUNIT_ASSERT(used.s_ByFields > 650 && used.s_ByFields < 850); CPPUNIT_ASSERT_EQUAL(std::size_t(2), used.s_PartitionFields); CPPUNIT_ASSERT_DOUBLES_EQUAL(memoryLimit * 1024 * 1024 / 2, used.s_Usage, - memoryLimit * 1024 * 1024 / 40); // Within 5%. + memoryLimit * 1024 * 1024 / 33); // Within 6%. } LOG_DEBUG(<< "**** Test partition ****"); @@ -421,7 +421,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() { LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%"); reportProgress += 0.1; } - for (std::size_t i = 0; i < 600; ++i) { + for (std::size_t i = 0; i < 500; ++i) { rng.generateUniformSamples(0, generators.size(), 1, generator); TOptionalDouble value{generators[generator[0]](time)}; if (value) { @@ -438,7 +438,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() { LOG_DEBUG(<< "# partition = " << used.s_PartitionFields); LOG_DEBUG(<< "Memory status = " << used.s_MemoryStatus); LOG_DEBUG(<< "Memory usage = " << used.s_Usage); - CPPUNIT_ASSERT(used.s_PartitionFields > 350 && used.s_PartitionFields < 450); + CPPUNIT_ASSERT(used.s_PartitionFields > 370 && used.s_PartitionFields < 470); CPPUNIT_ASSERT(static_cast(used.s_ByFields) > 0.97 * static_cast(used.s_PartitionFields)); CPPUNIT_ASSERT_DOUBLES_EQUAL(memoryLimit * 1024 * 1024 / 2, used.s_Usage, @@ -468,7 +468,7 @@ void CAnomalyJobLimitTest::testModelledEntityCountForFixedMemoryLimit() { LOG_DEBUG(<< "Processed " << std::floor(100.0 * progress) << "%"); reportProgress += 0.1; } - for (std::size_t i = 0; i < 12000; ++i) { + for (std::size_t i = 0; i < 9000; ++i) { TOptionalDouble value{sparse(time)}; if (value) { dataRows["time"] = core::CStringUtils::typeToString(time); diff --git a/lib/maths/CCalendarCyclicTest.cc b/lib/maths/CCalendarCyclicTest.cc new file mode 100644 index 0000000000..6fbbc558c9 --- /dev/null +++ b/lib/maths/CCalendarCyclicTest.cc @@ -0,0 +1,329 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace ml { +namespace maths { +namespace { +//! \brief Sets the time zone to a specified value in a constructor +//! call so it can be called once by static initialisation. +struct SSetTimeZone { + SSetTimeZone(const std::string& zone) { + core::CTimezone::instance().timezoneName(zone); + } +}; + +//! \brief Hashes a calendar feature. +struct SHashFeature { + std::size_t operator()(const CCalendarFeature& feature) const { + return feature.checksum(0); + } +}; + +//! \brief The statistics we need in order to be able to test for +//! calendar features. +struct SStats { + core_t::TTime s_Offset = 0; + unsigned int s_Repeats = 0; + double s_Sum = 0.0; + double s_Count = 0.0; + double s_Significance = 0.0; +}; +using TFeatureStatsUMap = boost::unordered_map; + +const std::string VERSION_6_4_TAG("6.4"); +// Version 6.4 +const std::string ERROR_QUANTILES_6_4_TAG("a"); +const std::string CURRENT_BUCKET_TIME_6_4_TAG("b"); +const std::string CURRENT_BUCKET_INDEX_6_4_TAG("c"); +const std::string CURRENT_BUCKET_ERROR_STATS_6_4_TAG("d"); +const std::string ERRORS_6_4_TAG("e"); +// Version < 6.4 +const std::string ERROR_QUANTILES_OLD_TAG("a"); +// Everything else gets default initialised. + +const std::string DELIMITER{","}; +const core_t::TTime SIZE{155}; +const core_t::TTime BUCKET{core::constants::DAY}; +const core_t::TTime WINDOW{SIZE * BUCKET}; +const core_t::TTime TIME_ZONE_OFFSETS[]{0}; + +//! The percentile of a large error. +const double LARGE_ERROR_PERCENTILE{98.5}; +//! The minimum number of repeats to test a feature. +const unsigned int MINIMUM_REPEATS{4}; +//! The maximum significance to accept a feature. +const double MAXIMUM_SIGNIFICANCE{0.01}; +} + +CCalendarCyclicTest::CCalendarCyclicTest(double decayRate) + : m_DecayRate{decayRate}, m_ErrorQuantiles{CQuantileSketch::E_Linear, 20}, + m_CurrentBucketTime{0}, m_CurrentBucketIndex{0} { + static const SSetTimeZone timezone("GMT"); + TErrorStatsVec stats(SIZE); + this->deflate(stats); +} + +bool CCalendarCyclicTest::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { + TErrorStatsVec errors; + if (traverser.name() == VERSION_6_4_TAG) { + while (traverser.next()) { + const std::string& name = traverser.name(); + RESTORE(ERROR_QUANTILES_6_4_TAG, + traverser.traverseSubLevel(boost::bind(&CQuantileSketch::acceptRestoreTraverser, + &m_ErrorQuantiles, _1))) + RESTORE_BUILT_IN(CURRENT_BUCKET_TIME_6_4_TAG, m_CurrentBucketTime) + RESTORE_BUILT_IN(CURRENT_BUCKET_INDEX_6_4_TAG, m_CurrentBucketIndex) + RESTORE(CURRENT_BUCKET_ERROR_STATS_6_4_TAG, + m_CurrentBucketErrorStats.fromDelimited(traverser.value())) + RESTORE(ERRORS_6_4_TAG, + core::CPersistUtils::restore(ERRORS_6_4_TAG, errors, traverser)) + } + } else { + do { + const std::string& name = traverser.name(); + RESTORE(ERROR_QUANTILES_OLD_TAG, + traverser.traverseSubLevel(boost::bind(&CQuantileSketch::acceptRestoreTraverser, + &m_ErrorQuantiles, _1))) + } while (traverser.next()); + errors.resize(SIZE); + } + this->deflate(errors); + return true; +} + +void CCalendarCyclicTest::acceptPersistInserter(core::CStatePersistInserter& inserter) const { + inserter.insertValue(VERSION_6_4_TAG, ""); + inserter.insertLevel(ERROR_QUANTILES_6_4_TAG, + boost::bind(&CQuantileSketch::acceptPersistInserter, + &m_ErrorQuantiles, _1)); + inserter.insertValue(CURRENT_BUCKET_TIME_6_4_TAG, m_CurrentBucketTime); + inserter.insertValue(CURRENT_BUCKET_INDEX_6_4_TAG, m_CurrentBucketIndex); + inserter.insertValue(CURRENT_BUCKET_ERROR_STATS_6_4_TAG, + m_CurrentBucketErrorStats.toDelimited()); + TErrorStatsVec errors{this->inflate()}; + core::CPersistUtils::persist(ERRORS_6_4_TAG, errors, inserter); +} + +void CCalendarCyclicTest::propagateForwardsByTime(double time) { + if (CMathsFuncs::isFinite(time) == false || time < 0.0) { + LOG_ERROR(<< "Bad propagation time " << time); + return; + } + m_ErrorQuantiles.age(std::exp(-m_DecayRate * time)); +} + +void CCalendarCyclicTest::add(core_t::TTime time, double error, double weight) { + error = std::fabs(error); + + m_ErrorQuantiles.add(error, weight); + + if (m_ErrorQuantiles.count() > 100.0) { + time = CIntegerTools::floor(time, BUCKET); + if (time > m_CurrentBucketTime) { + TErrorStatsVec errors{this->inflate()}; + do { + errors[m_CurrentBucketIndex] = m_CurrentBucketErrorStats; + m_CurrentBucketErrorStats = SErrorStats{}; + m_CurrentBucketTime += BUCKET; + m_CurrentBucketIndex = (m_CurrentBucketIndex + 1) % SIZE; + } while (m_CurrentBucketTime < time); + this->deflate(errors); + } + + ++m_CurrentBucketErrorStats.s_Count; + + double large; + m_ErrorQuantiles.quantile(LARGE_ERROR_PERCENTILE, large); + + if (error >= large) { + ++m_CurrentBucketErrorStats.s_LargeErrorCount; + m_CurrentBucketErrorStats.s_LargeErrorSum += this->winsorise(error); + } + } +} + +CCalendarCyclicTest::TOptionalFeature CCalendarCyclicTest::test() const { + using TDoubleTimeCalendarFeatureTr = core::CTriple; + using TMaxAccumulator = CBasicStatistics::SMax::TAccumulator; + + TErrorStatsVec errors{this->inflate()}; + TFeatureStatsUMap stats{errors.size()}; + + // Note that the current index points to the next bucket to overwrite, + // i.e. the earliest bucket error statistics we have. The start of + // this bucket is WINDOW before the start time of the current partial + // bucket. + for (auto offset : TIME_ZONE_OFFSETS) { + for (core_t::TTime i = m_CurrentBucketIndex, time = m_CurrentBucketTime - WINDOW; + time < m_CurrentBucketTime; i = (i + 1) % SIZE, time += BUCKET) { + if (errors[i].s_Count > 0) { + double n{static_cast(errors[i].s_Count)}; + double x{static_cast(errors[i].s_LargeErrorCount)}; + double s{this->significance(n, x)}; + core_t::TTime midpoint{time + BUCKET / 2 + offset}; + for (auto feature : CCalendarFeature::features(midpoint)) { + feature.offset(offset); + SStats& stat = stats[feature]; + ++stat.s_Repeats; + stat.s_Offset = offset; + stat.s_Sum += errors[i].s_LargeErrorSum; + stat.s_Count += x; + stat.s_Significance = std::max(stat.s_Significance, s); + } + } + } + } + + double errorThreshold; + m_ErrorQuantiles.quantile(50.0, errorThreshold); + errorThreshold *= 2.0; + + TMaxAccumulator result; + + for (const auto& stat : stats) { + CCalendarFeature feature = stat.first; + double r{static_cast(stat.second.s_Repeats)}; + double x{stat.second.s_Count}; + double e{stat.second.s_Sum}; + double s{stat.second.s_Significance}; + if (stat.second.s_Repeats >= MINIMUM_REPEATS && + e > errorThreshold * x && std::pow(s, r) < MAXIMUM_SIGNIFICANCE) { + result.add({e, stat.second.s_Offset, feature}); + } + } + + return result.count() > 0 ? result[0].third : TOptionalFeature(); +} + +std::uint64_t CCalendarCyclicTest::checksum(std::uint64_t seed) const { + seed = CChecksum::calculate(seed, m_DecayRate); + seed = CChecksum::calculate(seed, m_ErrorQuantiles); + seed = CChecksum::calculate(seed, m_CurrentBucketTime); + seed = CChecksum::calculate(seed, m_CurrentBucketIndex); + seed = CChecksum::calculate(seed, m_CurrentBucketErrorStats); + TErrorStatsVec errors{this->inflate()}; + return CChecksum::calculate(seed, errors); +} + +void CCalendarCyclicTest::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { + mem->setName("CCalendarCyclicTest"); + core::CMemoryDebug::dynamicSize("m_ErrorQuantiles", m_ErrorQuantiles, mem); + core::CMemoryDebug::dynamicSize("m_CompressedBucketErrorStats", + m_CompressedBucketErrorStats, mem); +} + +std::size_t CCalendarCyclicTest::memoryUsage() const { + std::size_t mem{core::CMemory::dynamicSize(m_ErrorQuantiles)}; + mem += core::CMemory::dynamicSize(m_CompressedBucketErrorStats); + return mem; +} + +double CCalendarCyclicTest::winsorise(double error) const { + double high; + m_ErrorQuantiles.quantile(99.5, high); + return std::min(error, high); +} + +double CCalendarCyclicTest::significance(double n, double x) const { + if (n > 0.0) { + try { + // We have roughly 31 independent error samples, one for each + // day of the month, so the chance of seeing as extreme an event + // among all of them is: + // 1 - P("don't see as extreme event") = 1 - (1 - P("event"))^31 + boost::math::binomial binom{n, 1.0 - LARGE_ERROR_PERCENTILE / 100.0}; + double p{std::min(2.0 * CTools::safeCdfComplement(binom, x - 1.0), 1.0)}; + return CTools::oneMinusPowOneMinusX(p, 31.0); + } catch (const std::exception& e) { + LOG_ERROR(<< "Failed to calculate significance: " << e.what() + << " n = " << n << " x = " << x); + } + } + return 1.0; +} + +void CCalendarCyclicTest::deflate(const TErrorStatsVec& stats) { + bool lengthOnly{false}; + core::CDeflator deflator{lengthOnly}; + deflator.addVector(stats); + deflator.finishAndTakeData(m_CompressedBucketErrorStats); + m_CompressedBucketErrorStats.shrink_to_fit(); +} + +CCalendarCyclicTest::TErrorStatsVec CCalendarCyclicTest::inflate() const { + bool lengthOnly{false}; + core::CInflator inflator{lengthOnly}; + inflator.addVector(m_CompressedBucketErrorStats); + TByteVec decompressed; + inflator.finishAndTakeData(decompressed); + TErrorStatsVec result(decompressed.size() / sizeof(SErrorStats)); + std::copy(decompressed.begin(), decompressed.end(), + reinterpret_cast(result.data())); + return result; +} + +std::uint64_t CCalendarCyclicTest::SErrorStats::checksum() const { + std::uint64_t seed{static_cast(s_Count)}; + seed = CChecksum::calculate(seed, s_LargeErrorCount); + return CChecksum::calculate(seed, s_LargeErrorSum); +} + +std::string CCalendarCyclicTest::SErrorStats::toDelimited() const { + return core::CStringUtils::typeToString(s_Count) + DELIMITER + + core::CStringUtils::typeToString(s_LargeErrorCount) + DELIMITER + + s_LargeErrorSum.toString(); +} + +bool CCalendarCyclicTest::SErrorStats::fromDelimited(const std::string& str_) { + std::string str{str_}; + std::size_t delimiter{str.find(DELIMITER)}; + if (core::CStringUtils::stringToType(str.substr(0, delimiter), s_Count) == false) { + LOG_ERROR("Failed to parse '" << str_ << "'"); + return false; + } + str = str.substr(delimiter + 1); + delimiter = str.find(DELIMITER); + if (core::CStringUtils::stringToType(str.substr(0, delimiter), s_LargeErrorCount) == false) { + LOG_ERROR("Failed to parse '" << str_ << "'"); + return false; + } + str = str.substr(delimiter + 1); + if (s_LargeErrorSum.fromString(str) == false) { + LOG_ERROR("Failed to parse '" << str_ << "'"); + return false; + } + return true; +} +} +} diff --git a/lib/maths/CTrendTests.cc b/lib/maths/CRandomizedPeriodicityTest.cc similarity index 61% rename from lib/maths/CTrendTests.cc rename to lib/maths/CRandomizedPeriodicityTest.cc index b436821577..714d765af1 100644 --- a/lib/maths/CTrendTests.cc +++ b/lib/maths/CRandomizedPeriodicityTest.cc @@ -4,16 +4,13 @@ * you may not use this file except in compliance with the Elastic License. */ -#include +#include #include #include -#include #include #include #include -#include -#include #include #include @@ -24,22 +21,11 @@ #include #include #include -#include -#include #include -#include -#include #include -#include #include -#include -#include -#include -#include -#include #include -#include #include #include #include @@ -47,6 +33,9 @@ #include #include #include +#include +#include +#include namespace ml { namespace maths { @@ -56,14 +45,6 @@ using TDoubleVec = std::vector; using TMeanAccumulator = CBasicStatistics::SSampleMean::TAccumulator; using TTimeVec = std::vector; -//! \brief Sets the timezone to a specified value in a constructor -//! call so it can be called once by static initialisation. -struct SSetTimeZone { - SSetTimeZone(const std::string& zone) { - core::CTimezone::instance().timezoneName(zone); - } -}; - //! Generate \p n samples uniformly in the interval [\p a, \p b]. template void generateUniformSamples(boost::random::mt19937_64& rng, double a, double b, std::size_t n, ITR samples) { @@ -99,19 +80,6 @@ const std::string DAY_REFRESHED_PROJECTIONS_TAG("c"); const std::string WEEK_PROJECTIONS_TAG("d"); const std::string WEEK_STATISTICS_TAG("e"); const std::string WEEK_REFRESHED_PROJECTIONS_TAG("f"); - -// CCalendarCyclicTest -const std::string ERROR_QUANTILES_TAG("a"); -const std::string BUCKET_TAG("c"); -const std::string ERROR_COUNTS_TAG("d"); -const std::string ERROR_SUMS_TAG("e"); - -//! The maximum significance of a test statistic. -const double MAXIMUM_SIGNIFICANCE = 0.001; -//! Forward day in seconds into scope. -const core_t::TTime DAY = core::constants::DAY; -//! Forward day in seconds into scope. -const core_t::TTime WEEK = core::constants::WEEK; } //////// CRandomizedPeriodicitytest //////// @@ -240,9 +208,9 @@ void CRandomizedPeriodicityTest::add(core_t::TTime time, double value) { TVector2N daySample; TVector2N weekSample; std::size_t td = static_cast((time % DAY_RESAMPLE_INTERVAL) / SAMPLE_INTERVAL); - std::size_t d = static_cast((time % DAY) / SAMPLE_INTERVAL); + std::size_t d = static_cast((time % core::constants::DAY) / SAMPLE_INTERVAL); std::size_t tw = static_cast((time % WEEK_RESAMPLE_INTERVAL) / SAMPLE_INTERVAL); - std::size_t w = static_cast((time % WEEK) / SAMPLE_INTERVAL); + std::size_t w = static_cast((time % core::constants::WEEK) / SAMPLE_INTERVAL); for (std::size_t i = 0u; i < N; ++i) { daySample(2 * i + 0) = ms_DayRandomProjections[i][td] * value; @@ -309,16 +277,15 @@ void CRandomizedPeriodicityTest::reset() { ms_WeekResampled = -WEEK_RESAMPLE_INTERVAL; } -uint64_t CRandomizedPeriodicityTest::checksum(uint64_t seed) const { +std::uint64_t CRandomizedPeriodicityTest::checksum(std::uint64_t seed) const { // This checksum is problematic until we switch to using our // own rng for each test. - //seed = CChecksum::calculate(seed, m_DayProjections); - //seed = CChecksum::calculate(seed, m_DayStatistics); - //seed = CChecksum::calculate(seed, m_DayRefreshedProjections); - //seed = CChecksum::calculate(seed, m_WeekProjections); - //seed = CChecksum::calculate(seed, m_WeekStatistics); - //return CChecksum::calculate(seed, m_WeekRefreshedProjections); - return seed; + seed = CChecksum::calculate(seed, m_DayProjections); + seed = CChecksum::calculate(seed, m_DayStatistics); + seed = CChecksum::calculate(seed, m_DayRefreshedProjections); + seed = CChecksum::calculate(seed, m_WeekProjections); + seed = CChecksum::calculate(seed, m_WeekStatistics); + return CChecksum::calculate(seed, m_WeekRefreshedProjections); } void CRandomizedPeriodicityTest::updateStatistics(TVector2NMeanAccumulator& projections, @@ -350,8 +317,8 @@ void CRandomizedPeriodicityTest::resample(core_t::TTime time) { LOG_TRACE(<< "Updating daily random projections at " << time); if (time >= ms_DayResampled.load(atomic_t::memory_order_relaxed) + DAY_RESAMPLE_INTERVAL) { - resample(DAY, DAY_RESAMPLE_INTERVAL, ms_DayPeriodicProjections, - ms_DayRandomProjections); + resample(core::constants::DAY, DAY_RESAMPLE_INTERVAL, + ms_DayPeriodicProjections, ms_DayRandomProjections); ms_DayResampled.store(CIntegerTools::floor(time, DAY_RESAMPLE_INTERVAL), atomic_t::memory_order_release); } @@ -362,8 +329,8 @@ void CRandomizedPeriodicityTest::resample(core_t::TTime time) { LOG_TRACE(<< "Updating weekly random projections at " << time); if (time >= ms_WeekResampled.load(atomic_t::memory_order_relaxed) + WEEK_RESAMPLE_INTERVAL) { - resample(WEEK, WEEK_RESAMPLE_INTERVAL, ms_WeekPeriodicProjections, - ms_WeekRandomProjections); + resample(core::constants::WEEK, WEEK_RESAMPLE_INTERVAL, + ms_WeekPeriodicProjections, ms_WeekRandomProjections); ms_WeekResampled.store(CIntegerTools::floor(time, WEEK_RESAMPLE_INTERVAL), atomic_t::memory_order_release); } @@ -402,189 +369,5 @@ TDoubleVec CRandomizedPeriodicityTest::ms_WeekRandomProjections[N] = {}; TDoubleVec CRandomizedPeriodicityTest::ms_WeekPeriodicProjections[N] = {}; atomic_t::atomic CRandomizedPeriodicityTest::ms_WeekResampled(-WEEK_RESAMPLE_INTERVAL); core::CMutex CRandomizedPeriodicityTest::ms_Lock; - -//////// CCalendarCyclicTest //////// - -CCalendarCyclicTest::CCalendarCyclicTest(double decayRate) - : m_DecayRate(decayRate), m_Bucket(0), - m_ErrorQuantiles(CQuantileSketch::E_Linear, 20), m_ErrorCounts(WINDOW / BUCKET) { - static const SSetTimeZone timezone("GMT"); - m_ErrorSums.reserve(WINDOW / BUCKET / 10); -} - -bool CCalendarCyclicTest::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { - do { - const std::string& name = traverser.name(); - RESTORE_BUILT_IN(BUCKET_TAG, m_Bucket) - RESTORE(ERROR_QUANTILES_TAG, - traverser.traverseSubLevel(boost::bind(&CQuantileSketch::acceptRestoreTraverser, - &m_ErrorQuantiles, _1))) - RESTORE(ERROR_COUNTS_TAG, - core::CPersistUtils::restore(ERROR_COUNTS_TAG, m_ErrorCounts, traverser)) - RESTORE(ERROR_SUMS_TAG, core::CPersistUtils::fromString(traverser.value(), m_ErrorSums)) - } while (traverser.next()); - return true; -} - -void CCalendarCyclicTest::acceptPersistInserter(core::CStatePersistInserter& inserter) const { - inserter.insertValue(BUCKET_TAG, m_Bucket); - inserter.insertLevel(ERROR_QUANTILES_TAG, boost::bind(&CQuantileSketch::acceptPersistInserter, - &m_ErrorQuantiles, _1)); - core::CPersistUtils::persist(ERROR_COUNTS_TAG, m_ErrorCounts, inserter); - inserter.insertValue(ERROR_SUMS_TAG, core::CPersistUtils::toString(m_ErrorSums)); -} - -void CCalendarCyclicTest::propagateForwardsByTime(double time) { - if (!CMathsFuncs::isFinite(time) || time < 0.0) { - LOG_ERROR(<< "Bad propagation time " << time); - return; - } - m_ErrorQuantiles.age(std::exp(-m_DecayRate * time)); -} - -void CCalendarCyclicTest::add(core_t::TTime time, double error, double weight) { - error = std::fabs(error); - - m_ErrorQuantiles.add(error, weight); - - if (m_ErrorQuantiles.count() > 100.0) { - core_t::TTime bucket = CIntegerTools::floor(time, BUCKET); - if (m_ErrorCounts.empty()) { - m_ErrorCounts.push_back(0); - } else { - for (core_t::TTime i = m_Bucket; i < bucket; i += BUCKET) { - m_ErrorCounts.push_back(0); - } - } - - uint32_t& count = m_ErrorCounts.back(); - count += (count % COUNT_BITS < COUNT_BITS - 1) ? 1 : 0; - - double high; - m_ErrorQuantiles.quantile(LARGE_ERROR_PERCENTILE, high); - - m_ErrorSums.erase(m_ErrorSums.begin(), - std::find_if(m_ErrorSums.begin(), m_ErrorSums.end(), - [bucket](const TTimeFloatPr& error_) { - return error_.first + WINDOW > bucket; - })); - if (error >= high) { - count += (count < 0x100000000 - COUNT_BITS) ? COUNT_BITS : 0; - m_ErrorSums[bucket] += this->winsorise(error); - } - - m_Bucket = bucket; - } -} - -CCalendarCyclicTest::TOptionalFeature CCalendarCyclicTest::test() const { - // The statistics we need in order to be able to test for calendar - // features. - struct SStats { - SStats() - : s_Offset(0), s_Repeats(0), s_Sum(0.0), s_Count(0.0), - s_Significance(0.0) {} - core_t::TTime s_Offset; - unsigned int s_Repeats; - double s_Sum; - double s_Count; - double s_Significance; - }; - using TFeatureStatsFMap = boost::container::flat_map; - using TDoubleTimeCalendarFeatureTr = core::CTriple; - using TMaxAccumulator = CBasicStatistics::SMax::TAccumulator; - - TMaxAccumulator result; - - // Most features get the same count. The odd ones out are features - // which happen sporadically because of the variation of the days - // in a month and the day of week on which the first of the month - // falls. The test therefore isn't that sensitive to the exact value - // of this threshold. - - TFeatureStatsFMap stats; - stats.reserve(m_ErrorSums.size()); - - for (auto offset : TIMEZONE_OFFSETS) { - for (const auto& error : m_ErrorSums) { - std::size_t i = m_ErrorCounts.size() - 1 - - static_cast((m_Bucket - error.first) / BUCKET); - double n = static_cast(m_ErrorCounts[i] % COUNT_BITS); - double x = static_cast(m_ErrorCounts[i] / COUNT_BITS); - double s = this->significance(n, x); - for (auto feature : - CCalendarFeature::features(error.first + BUCKET / 2 + offset)) { - SStats& stat = stats[feature]; - ++stat.s_Repeats; - stat.s_Offset = offset; - stat.s_Sum += error.second; - stat.s_Count += x; - stat.s_Significance = std::max(stat.s_Significance, s); - } - } - } - - double errorThreshold; - m_ErrorQuantiles.quantile(50.0, errorThreshold); - errorThreshold *= 2.0; - - for (const auto& stat : stats) { - CCalendarFeature feature = stat.first; - double r = static_cast(stat.second.s_Repeats); - double x = stat.second.s_Count; - double e = stat.second.s_Sum; - double s = stat.second.s_Significance; - if (stat.second.s_Repeats >= MINIMUM_REPEATS && - e > errorThreshold * x && std::pow(s, r) < MAXIMUM_SIGNIFICANCE) { - result.add({e, stat.second.s_Offset, feature}); - } - } - - return result.count() > 0 ? result[0].third : TOptionalFeature(); -} - -uint64_t CCalendarCyclicTest::checksum(uint64_t seed) const { - seed = CChecksum::calculate(seed, m_ErrorQuantiles); - seed = CChecksum::calculate(seed, m_ErrorCounts); - return CChecksum::calculate(seed, m_ErrorSums); -} - -void CCalendarCyclicTest::debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const { - mem->setName("CCalendarCyclicTest"); - core::CMemoryDebug::dynamicSize("m_ErrorQuantiles", m_ErrorQuantiles, mem); - core::CMemoryDebug::dynamicSize("m_ErrorCounts", m_ErrorCounts, mem); - core::CMemoryDebug::dynamicSize("m_ErrorSums", m_ErrorSums, mem); -} - -std::size_t CCalendarCyclicTest::memoryUsage() const { - return core::CMemory::dynamicSize(m_ErrorQuantiles) + - core::CMemory::dynamicSize(m_ErrorCounts) + - core::CMemory::dynamicSize(m_ErrorSums); -} - -double CCalendarCyclicTest::winsorise(double error) const { - double high; - m_ErrorQuantiles.quantile(99.5, high); - return std::min(error, high); -} - -double CCalendarCyclicTest::significance(double n, double x) const { - try { - boost::math::binomial binom(n, 1.0 - LARGE_ERROR_PERCENTILE / 100.0); - return std::min(2.0 * CTools::safeCdfComplement(binom, x - 1.0), 1.0); - } catch (const std::exception& e) { - LOG_ERROR(<< "Failed to calculate significance: " << e.what() - << " n = " << n << " x = " << x); - } - return 1.0; -} - -const core_t::TTime CCalendarCyclicTest::BUCKET{core::constants::DAY}; -const core_t::TTime CCalendarCyclicTest::WINDOW{124 * BUCKET}; -const double CCalendarCyclicTest::LARGE_ERROR_PERCENTILE(99.0); -const unsigned int CCalendarCyclicTest::MINIMUM_REPEATS{4}; -const uint32_t CCalendarCyclicTest::COUNT_BITS{0x100000}; -// TODO support offsets are +/- 12hrs for time zones. -const TTimeVec CCalendarCyclicTest::TIMEZONE_OFFSETS{0}; } } diff --git a/lib/maths/Makefile b/lib/maths/Makefile index 22a5fd7a79..c5072e8e83 100644 --- a/lib/maths/Makefile +++ b/lib/maths/Makefile @@ -20,9 +20,10 @@ CAgglomerativeClusterer.cc \ CAssignment.cc \ CBasicStatistics.cc \ CBjkstUniqueValues.cc \ -CCalendarFeature.cc \ +CCalendarCyclicTest.cc \ CCalendarComponentAdaptiveBucketing.cc \ CCalendarComponent.cc \ +CCalendarFeature.cc \ CCategoricalTools.cc \ CClusterer.cc \ CClustererStateSerialiser.cc \ @@ -77,6 +78,7 @@ CProbabilityCalibrator.cc \ CQDigest.cc \ CQuantileSketch.cc \ CRadialBasisFunction.cc \ +CRandomizedPeriodicityTest.cc \ CRegression.cc \ CRestoreParams.cc \ CSampling.cc \ @@ -94,7 +96,6 @@ CTimeSeriesDecompositionStub.cc \ CTimeSeriesModel.cc \ CTools.cc \ CTrendComponent.cc \ -CTrendTests.cc \ CXMeansOnline1d.cc \ CXMeansOnlineFactory.cc \ MathsTypes.cc \ diff --git a/lib/maths/unittest/CCalendarCyclicTestTest.cc b/lib/maths/unittest/CCalendarCyclicTestTest.cc new file mode 100644 index 0000000000..29f7939a7c --- /dev/null +++ b/lib/maths/unittest/CCalendarCyclicTestTest.cc @@ -0,0 +1,384 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include "CCalendarCyclicTestTest.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "TestUtils.h" + +#include +#include +#include + +#include + +using namespace ml; + +namespace { +using TDoubleVec = std::vector; +using TOptionalFeature = maths::CCalendarCyclicTest::TOptionalFeature; + +const core_t::TTime HALF_HOUR{core::constants::HOUR / 2}; +const core_t::TTime DAY{core::constants::DAY}; +const core_t::TTime MONTH{4 * core::constants::WEEK}; +const core_t::TTime YEAR{core::constants::YEAR}; +} + +void CCalendarCyclicTestTest::testTruePositives() { + // Test the true positive rate for a variety of different features. + + test::CRandomNumbers rng; + + double truePositive{0.0}; + double falsePositive{0.0}; + double falseNegative{0.0}; + + LOG_DEBUG(<< "Day of month"); + + for (std::size_t t = 0; t < 10; ++t) { + // Repeated error on the second day of the month. + core_t::TTime months[]{ + 86400, // 2nd Jan + 2764800, // 2nd Feb + 5184000, // 2nd Mar + 7862400, // 2nd Apr + 10454400, // 2nd May + 13132800, // 2nd June + 15724800, // 2nd July + 18403200, // 2nd Aug + 21081600, // 2nd Sep + 23673600 // 2nd Oct + }; + core_t::TTime end = months[boost::size(months) - 1] + 86400; + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { + ptrdiff_t i = maths::CTools::truncate( + std::lower_bound(boost::begin(months), boost::end(months), time) - + boost::begin(months), + ptrdiff_t(1), ptrdiff_t(boost::size(months))); + + rng.generateNormalSamples(0.0, 9.0, 1, error); + if (time >= months[i - 1] + 30000 && time < months[i - 1] + 50000) { + TDoubleVec multiplier; + rng.generateUniformSamples(4.0, 6.0, 1, multiplier); + error[0] *= multiplier[0]; + } + cyclic.add(time, error[0]); + + if (time > 121 * DAY && time % DAY == 0) { + TOptionalFeature feature{cyclic.test()}; + if (feature == boost::none) { + falseNegative += 1.0; + } else { + (core::CContainerPrinter::print(feature) == "2nd day of month" + ? truePositive + : falsePositive) += 1.0; + } + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 700); + } + } + LOG_DEBUG(<< "true positive = " << truePositive); + LOG_DEBUG(<< "false negative = " << falseNegative); + LOG_DEBUG(<< "false positive = " << falsePositive); + + LOG_DEBUG(<< "Days before end of month"); + for (std::size_t t = 0; t < 10; ++t) { + // Repeated error on the last day of the month. + core_t::TTime months[]{ + 2592000, // 31st Jan + 5011200, // 28th Feb + 7689600, // 31st Mar + 10281600, // 30th Apr + 12960000, // 31st May + 15552000, // 30th June + 18230400 // 31st July + }; + core_t::TTime end = months[boost::size(months) - 1] + 86400; + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { + ptrdiff_t i = maths::CTools::truncate( + std::lower_bound(boost::begin(months), boost::end(months), time) - + boost::begin(months), + ptrdiff_t(1), ptrdiff_t(boost::size(months))); + + rng.generateNormalSamples(0.0, 9.0, 1, error); + if (time >= months[i - 1] + 10000 && time < months[i - 1] + 20000) { + error[0] -= 15.0; + } + cyclic.add(time, error[0]); + + if (time > 121 * DAY && time % DAY == 0) { + TOptionalFeature feature = cyclic.test(); + if (feature == boost::none) { + falseNegative += 1.0; + } else { + (core::CContainerPrinter::print(feature) == "0 days before end of month" + ? truePositive + : falsePositive) += 1.0; + } + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 700); + } + } + LOG_DEBUG(<< "true positive = " << truePositive); + LOG_DEBUG(<< "false negative = " << falseNegative); + LOG_DEBUG(<< "false positive = " << falsePositive); + + LOG_DEBUG(<< "Day of week week of month"); + for (std::size_t t = 0; t < 10; ++t) { + // Repeated error on first Monday of each month. + core_t::TTime months[]{ + 345600, // Mon 5th Jan + 2764800, // Mon 2nd Feb + 5184000, // Mon 2nd Mar + 8208000, // Mon 6th Apr + 10627200 // Mon 4th May + }; + core_t::TTime end = months[boost::size(months) - 1] + 86400; + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { + ptrdiff_t i = maths::CTools::truncate( + std::lower_bound(boost::begin(months), boost::end(months), time) - + boost::begin(months), + ptrdiff_t(1), ptrdiff_t(boost::size(months))); + + rng.generateNormalSamples(0.0, 9.0, 1, error); + if (time >= months[i - 1] + 45000 && time < months[i - 1] + 60000) { + error[0] += 12.0; + } + cyclic.add(time, error[0]); + + if (time > 121 * DAY && time % DAY == 0) { + TOptionalFeature feature = cyclic.test(); + if (feature == boost::none) { + falseNegative += 1.0; + } else { + (core::CContainerPrinter::print(feature) == "1st Monday of month" + ? truePositive + : falsePositive) += 1.0; + } + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 700); + } + } + LOG_DEBUG(<< "true positive = " << truePositive); + LOG_DEBUG(<< "false negative = " << falseNegative); + LOG_DEBUG(<< "false positive = " << falsePositive); + + LOG_DEBUG(<< "Day of week weeks before end of month"); + for (std::size_t t = 0; t < 10; ++t) { + // Repeated error on last Friday of each month. + core_t::TTime months[]{ + 2505600, // Fri 30th Jan + 4924800, // Fri 27th Feb + 7344000, // Fri 27th Mar + 9763200, // Fri 24th Apr + 12787200 // Fri 29th May + }; + core_t::TTime end = months[boost::size(months) - 1] + 86400; + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { + ptrdiff_t i = maths::CTools::truncate( + std::lower_bound(boost::begin(months), boost::end(months), time) - + boost::begin(months), + ptrdiff_t(1), ptrdiff_t(boost::size(months))); + + rng.generateNormalSamples(0.0, 9.0, 1, error); + if (time >= months[i - 1] + 45000 && time < months[i - 1] + 60000) { + error[0] -= 12.0; + } + cyclic.add(time, error[0]); + + if (time > 121 * DAY && time % DAY == 0) { + TOptionalFeature feature = cyclic.test(); + if (feature == boost::none) { + falseNegative += 1.0; + } else { + (core::CContainerPrinter::print(feature) == "0 Fridays before end of month" + ? truePositive + : falsePositive) += 1.0; + } + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 700); + } + } + LOG_DEBUG(<< "true positive = " << truePositive); + LOG_DEBUG(<< "false negative = " << falseNegative); + LOG_DEBUG(<< "false positive = " << falsePositive); + + double accuracy{(truePositive / (truePositive + falseNegative + falsePositive))}; + LOG_DEBUG(<< "accuracy = " << accuracy); + CPPUNIT_ASSERT(accuracy > 0.9); +} + +void CCalendarCyclicTestTest::testFalsePositives() { + // Test a false positive rates under a variety of noise characteristics. + + test::CRandomNumbers rng; + + double trueNegatives{0.0}; + double falsePositives{0.0}; + + LOG_DEBUG(<< "Normal"); + for (std::size_t t = 0; t < 10; ++t) { + LOG_DEBUG(<< "test = " << t + 1); + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= YEAR; time += HALF_HOUR) { + rng.generateNormalSamples(0.0, 9.0, 1, error); + cyclic.add(time, error[0]); + + if (time % MONTH == 0) { + TOptionalFeature feature{cyclic.test()}; + (feature == boost::none ? trueNegatives : falsePositives) += 1.0; + if (feature != boost::none) { + LOG_DEBUG(<< "Detected = " << feature->print()); + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 820); + } + } + } + LOG_DEBUG(<< "true negatives = " << trueNegatives); + LOG_DEBUG(<< "false positives = " << falsePositives); + + LOG_DEBUG(<< "Log-normal"); + for (std::size_t t = 0; t < 10; ++t) { + LOG_DEBUG(<< "test = " << t + 1); + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= YEAR; time += HALF_HOUR) { + rng.generateLogNormalSamples(1.0, 2.0, 1, error); + cyclic.add(time, error[0]); + + if (time % MONTH == 0) { + TOptionalFeature feature{cyclic.test()}; + (feature == boost::none ? trueNegatives : falsePositives) += 1.0; + if (feature != boost::none) { + LOG_DEBUG(<< "Detected = " << feature->print()); + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 830); + } + } + } + LOG_DEBUG(<< "true negatives = " << trueNegatives); + LOG_DEBUG(<< "false positives = " << falsePositives); + + LOG_DEBUG(<< "Mixture"); + for (std::size_t t = 0; t < 10; ++t) { + LOG_DEBUG(<< "test = " << t + 1); + + maths::CCalendarCyclicTest cyclic(HALF_HOUR); + + TDoubleVec error; + TDoubleVec uniform01; + for (core_t::TTime time = 0; time <= YEAR; time += HALF_HOUR) { + rng.generateUniformSamples(0.0, 1.0, 1, uniform01); + rng.generateNormalSamples(uniform01[0] < 0.3 ? 0.0 : 15.0, 9.0, 1, error); + cyclic.add(time, error[0]); + + if (time % MONTH == 0) { + TOptionalFeature feature{cyclic.test()}; + (feature == boost::none ? trueNegatives : falsePositives) += 1.0; + if (feature != boost::none) { + LOG_DEBUG(<< "Detected = " << feature->print()); + } + CPPUNIT_ASSERT(core::CMemory::dynamicSize(&cyclic) < 830); + } + } + } + LOG_DEBUG(<< "true negatives = " << trueNegatives); + LOG_DEBUG(<< "false positives = " << falsePositives); + + double accuracy{trueNegatives / (falsePositives + trueNegatives)}; + LOG_DEBUG(<< "accuracy = " << accuracy); + CPPUNIT_ASSERT(accuracy > 0.99); +} + +void CCalendarCyclicTestTest::testPersist() { + // Check that persistence is idempotent. + + test::CRandomNumbers rng; + + maths::CCalendarCyclicTest orig(HALF_HOUR); + + TDoubleVec error; + for (core_t::TTime time = 0; time <= 12787200; time += HALF_HOUR) { + rng.generateNormalSamples(0.0, 10.0, 1, error); + orig.add(time, error[0]); + } + + std::string origXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + orig.acceptPersistInserter(inserter); + inserter.toXml(origXml); + } + + LOG_TRACE(<< "XML representation:\n" << origXml); + LOG_TRACE(<< "XML size:" << origXml.size()); + + maths::CCalendarCyclicTest restored(HALF_HOUR); + { + core::CRapidXmlParser parser; + CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); + core::CRapidXmlStateRestoreTraverser traverser(parser); + CPPUNIT_ASSERT(traverser.traverseSubLevel(boost::bind( + &maths::CCalendarCyclicTest::acceptRestoreTraverser, &restored, _1))); + } + CPPUNIT_ASSERT_EQUAL(orig.checksum(), restored.checksum()); + + std::string newXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + restored.acceptPersistInserter(inserter); + inserter.toXml(newXml); + } + CPPUNIT_ASSERT_EQUAL(origXml, newXml); +} + +CppUnit::Test* CCalendarCyclicTestTest::suite() { + CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CCalendarCyclicTestTest"); + + suiteOfTests->addTest(new CppUnit::TestCaller( + "CCalendarCyclicTestTest::testTruePositives", &CCalendarCyclicTestTest::testTruePositives)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CCalendarCyclicTestTest::testFalsePositives", + &CCalendarCyclicTestTest::testFalsePositives)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CCalendarCyclicTestTest::testPersist", &CCalendarCyclicTestTest::testPersist)); + + return suiteOfTests; +} diff --git a/lib/maths/unittest/CCalendarCyclicTestTest.h b/lib/maths/unittest/CCalendarCyclicTestTest.h new file mode 100644 index 0000000000..cbac81d692 --- /dev/null +++ b/lib/maths/unittest/CCalendarCyclicTestTest.h @@ -0,0 +1,21 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_CCalendarCyclicTestTest_h +#define INCLUDED_CCalendarCyclicTestTest_h + +#include + +class CCalendarCyclicTestTest : public CppUnit::TestFixture { +public: + void testTruePositives(); + void testFalsePositives(); + void testPersist(); + + static CppUnit::Test* suite(); +}; + +#endif // INCLUDED_CCalendarCyclicTestTest_h diff --git a/lib/maths/unittest/CRandomizedPeriodicityTestTest.cc b/lib/maths/unittest/CRandomizedPeriodicityTestTest.cc new file mode 100644 index 0000000000..d98241ca6f --- /dev/null +++ b/lib/maths/unittest/CRandomizedPeriodicityTestTest.cc @@ -0,0 +1,204 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include "CRandomizedPeriodicityTestTest.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "TestUtils.h" + +#include +#include +#include + +#include +#include + +using namespace ml; + +namespace { +using TDoubleVec = std::vector; +using TTimeVec = std::vector; +using TTimeDoublePr = std::pair; +using TTimeDoublePrVec = std::vector; + +const core_t::TTime HALF_HOUR{core::constants::HOUR / 2}; +const core_t::TTime DAY{core::constants::DAY}; +const core_t::TTime WEEK{core::constants::WEEK}; +} + +void CRandomizedPeriodicityTestTest::testAccuracy() { + using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; + using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; + using TMaxAccumulator = + maths::CBasicStatistics::COrderStatisticsStack>; + using TFunction = double (*)(core_t::TTime); + + test::CRandomNumbers rng; + + TMeanAccumulator typeI; + TMeanAccumulator typeII; + for (std::size_t t = 0u; t < 5; ++t) { + LOG_DEBUG(<< "*** test = " << t << " ***"); + + core_t::TTime time = 0; + core_t::TTime day = 0; + + TDoubleVec samples; + rng.generateLogNormalSamples(1.0, 4.0, 84000, samples); + + maths::CRandomizedPeriodicityTest::reset(); + + maths::CRandomizedPeriodicityTest rtests[8]; + double falsePositives[3] = {0.0, 0.0, 0.0}; + double trueNegatives[3] = {0.0, 0.0, 0.0}; + double truePositives[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + double falseNegatives[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; + TMeanVarAccumulator timeToDetectionMoments[5]; + TMaxAccumulator timeToDetectionMax[5]; + core_t::TTime lastTruePositive[5] = {time, time, time, time, time}; + TFunction functions[] = {&constant, &ramp, &markov, + &smoothDaily, &smoothWeekly, &spikeyDaily, + &spikeyWeekly, &weekends}; + + for (std::size_t i = 0u; i < samples.size(); ++i) { + for (std::size_t j = 0u; j < boost::size(functions); ++j) { + rtests[j].add(time, 600.0 * (functions[j])(time) + samples[i]); + } + if (time >= day + DAY) { + for (std::size_t j = 0u; j < boost::size(rtests); ++j) { + if (j < 3) { + (rtests[j].test() ? falsePositives[j] : trueNegatives[j]) += 1.0; + } else { + (rtests[j].test() ? truePositives[j - 3] + : falseNegatives[j - 3]) += 1.0; + if (rtests[j].test()) { + timeToDetectionMoments[j - 3].add( + time - lastTruePositive[j - 3]); + timeToDetectionMax[j - 3].add( + static_cast(time - lastTruePositive[j - 3])); + lastTruePositive[j - 3] = time; + } + } + } + day += DAY; + } + time += HALF_HOUR; + } + + LOG_DEBUG(<< "falsePositives = " << core::CContainerPrinter::print(falsePositives)); + LOG_DEBUG(<< "trueNegatives = " << core::CContainerPrinter::print(trueNegatives)); + for (std::size_t i = 0u; i < boost::size(falsePositives); ++i) { + CPPUNIT_ASSERT(falsePositives[i] / trueNegatives[i] < 0.1); + typeI.add(falsePositives[i] / trueNegatives[i]); + } + LOG_DEBUG(<< "truePositives = " << core::CContainerPrinter::print(truePositives)); + LOG_DEBUG(<< "falseNegatives = " << core::CContainerPrinter::print(falseNegatives)); + for (std::size_t i = 0u; i < boost::size(falsePositives); ++i) { + CPPUNIT_ASSERT(falseNegatives[i] / truePositives[i] < 0.2); + typeII.add(falseNegatives[i] / truePositives[i]); + } + + for (std::size_t i = 0u; i < boost::size(timeToDetectionMoments); ++i) { + LOG_DEBUG(<< "time to detect moments = " << timeToDetectionMoments[i]); + LOG_DEBUG(<< "maximum time to detect = " << timeToDetectionMax[i][0]); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(timeToDetectionMoments[i]) < + 1.5 * DAY); + CPPUNIT_ASSERT(std::sqrt(maths::CBasicStatistics::variance( + timeToDetectionMoments[i])) < 5 * DAY); + CPPUNIT_ASSERT(timeToDetectionMax[i][0] <= 27 * WEEK); + } + } + LOG_DEBUG(<< "type I = " << maths::CBasicStatistics::mean(typeI)); + LOG_DEBUG(<< "type II = " << maths::CBasicStatistics::mean(typeII)); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(typeI) < 0.015); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(typeII) < 0.05); +} + +void CRandomizedPeriodicityTestTest::testPersist() { + // Check that persistence is idempotent. + + maths::CRandomizedPeriodicityTest test; + for (core_t::TTime t = 1400000000; t < 1400050000; t += 5000) { + test.add(t, 0.2); + } + + std::string origXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + test.acceptPersistInserter(inserter); + inserter.toXml(origXml); + } + + std::string origStaticsXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + test.staticsAcceptPersistInserter(inserter); + inserter.toXml(origStaticsXml); + } + + // Check that the static state is also preserved + uint64_t origNextRandom = test.ms_Rng(); + + LOG_DEBUG(<< "XML representation:\n" << origXml); + + // Restore the XML into a new test + maths::CRandomizedPeriodicityTest test2; + { + core::CRapidXmlParser parser; + CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); + core::CRapidXmlStateRestoreTraverser traverser(parser); + CPPUNIT_ASSERT(traverser.traverseSubLevel(boost::bind( + &maths::CRandomizedPeriodicityTest::acceptRestoreTraverser, &test2, _1))); + } + std::string newXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + test2.acceptPersistInserter(inserter); + inserter.toXml(newXml); + } + CPPUNIT_ASSERT_EQUAL(origXml, newXml); + CPPUNIT_ASSERT_EQUAL(test.checksum(), test2.checksum()); + + { + core::CRapidXmlParser parser; + CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origStaticsXml)); + core::CRapidXmlStateRestoreTraverser traverser(parser); + CPPUNIT_ASSERT(traverser.traverseSubLevel( + &maths::CRandomizedPeriodicityTest::staticsAcceptRestoreTraverser)); + } + std::string newStaticsXml; + { + core::CRapidXmlStatePersistInserter inserter("root"); + test2.staticsAcceptPersistInserter(inserter); + inserter.toXml(newStaticsXml); + } + CPPUNIT_ASSERT_EQUAL(origStaticsXml, newStaticsXml); + + uint64_t newNextRandom = test2.ms_Rng(); + CPPUNIT_ASSERT_EQUAL(origNextRandom, newNextRandom); +} + +CppUnit::Test* CRandomizedPeriodicityTestTest::suite() { + CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CRandomizedPeriodicityTestTest"); + + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTrendTestsTest::testAccuracy", &CRandomizedPeriodicityTestTest::testAccuracy)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CTrendTestsTest::testPersist", &CRandomizedPeriodicityTestTest::testPersist)); + + return suiteOfTests; +} diff --git a/lib/maths/unittest/CTrendTestsTest.h b/lib/maths/unittest/CRandomizedPeriodicityTestTest.h similarity index 59% rename from lib/maths/unittest/CTrendTestsTest.h rename to lib/maths/unittest/CRandomizedPeriodicityTestTest.h index 813d2da3d2..e8be408f47 100644 --- a/lib/maths/unittest/CTrendTestsTest.h +++ b/lib/maths/unittest/CRandomizedPeriodicityTestTest.h @@ -4,18 +4,17 @@ * you may not use this file except in compliance with the Elastic License. */ -#ifndef INCLUDED_CTrendTestsTest_h -#define INCLUDED_CTrendTestsTest_h +#ifndef INCLUDED_CRandomizedPeriodicityTestTest_h +#define INCLUDED_CRandomizedPeriodicityTestTest_h #include -class CTrendTestsTest : public CppUnit::TestFixture { +class CRandomizedPeriodicityTestTest : public CppUnit::TestFixture { public: - void testRandomizedPeriodicity(); - void testCalendarCyclic(); + void testAccuracy(); void testPersist(); static CppUnit::Test* suite(); }; -#endif // INCLUDED_CTrendTestsTest_h +#endif // INCLUDED_CRandomizedPeriodicityTestTest_h diff --git a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc index 59cda83ef8..cf23d892c9 100644 --- a/lib/maths/unittest/CTimeSeriesDecompositionTest.cc +++ b/lib/maths/unittest/CTimeSeriesDecompositionTest.cc @@ -190,7 +190,7 @@ void CTimeSeriesDecompositionTest::testSuperpositionOfSines() { LOG_DEBUG(<< "total 'max residual' / 'max value' = " << totalMaxResidual / totalMaxValue); LOG_DEBUG(<< "total 70% error = " << totalPercentileError / totalSumValue); - CPPUNIT_ASSERT(totalSumResidual < 0.014 * totalSumValue); + CPPUNIT_ASSERT(totalSumResidual < 0.015 * totalSumValue); CPPUNIT_ASSERT(totalMaxResidual < 0.017 * totalMaxValue); CPPUNIT_ASSERT(totalPercentileError < 0.01 * totalSumValue); } @@ -1864,7 +1864,7 @@ void CTimeSeriesDecompositionTest::testCalendar() { LOG_DEBUG(<< "large error count = " << largeErrorCount); CPPUNIT_ASSERT(++count > 4 || largeErrorCount > 15); - CPPUNIT_ASSERT(count < 5 || largeErrorCount <= 5); + CPPUNIT_ASSERT(count < 5 || largeErrorCount <= 1); } } } diff --git a/lib/maths/unittest/CTimeSeriesModelTest.cc b/lib/maths/unittest/CTimeSeriesModelTest.cc index fce3067ebb..a6538bd4de 100644 --- a/lib/maths/unittest/CTimeSeriesModelTest.cc +++ b/lib/maths/unittest/CTimeSeriesModelTest.cc @@ -1405,7 +1405,7 @@ void CTimeSeriesModelTest::testWeights() { error.add(std::fabs(scale - dataScale) / dataScale); } LOG_DEBUG(<< "error = " << maths::CBasicStatistics::mean(error)); - CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.21); + CPPUNIT_ASSERT(maths::CBasicStatistics::mean(error) < 0.18); LOG_DEBUG(<< "Winsorisation"); TDouble2Vec prediction(model.predict(time)); diff --git a/lib/maths/unittest/CTrendTestsTest.cc b/lib/maths/unittest/CTrendTestsTest.cc deleted file mode 100644 index ffc7e1b775..0000000000 --- a/lib/maths/unittest/CTrendTestsTest.cc +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ - -#include "CTrendTestsTest.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include "TestUtils.h" - -#include -#include -#include - -#include - -using namespace ml; - -namespace { -using TDoubleVec = std::vector; -using TTimeVec = std::vector; -using TTimeDoublePr = std::pair; -using TTimeDoublePrVec = std::vector; - -const core_t::TTime HALF_HOUR = core::constants::HOUR / 2; -const core_t::TTime DAY = core::constants::DAY; -const core_t::TTime WEEK = core::constants::WEEK; -} - -void CTrendTestsTest::testRandomizedPeriodicity() { - using TMeanAccumulator = maths::CBasicStatistics::SSampleMean::TAccumulator; - using TMeanVarAccumulator = maths::CBasicStatistics::SSampleMeanVar::TAccumulator; - using TMaxAccumulator = - maths::CBasicStatistics::COrderStatisticsStack>; - using TFunction = double (*)(core_t::TTime); - - test::CRandomNumbers rng; - - TMeanAccumulator typeI; - TMeanAccumulator typeII; - for (std::size_t t = 0u; t < 5; ++t) { - LOG_DEBUG(<< "*** test = " << t << " ***"); - - core_t::TTime time = 0; - core_t::TTime day = 0; - - TDoubleVec samples; - rng.generateLogNormalSamples(1.0, 4.0, 84000, samples); - - maths::CRandomizedPeriodicityTest::reset(); - - maths::CRandomizedPeriodicityTest rtests[8]; - double falsePositives[3] = {0.0, 0.0, 0.0}; - double trueNegatives[3] = {0.0, 0.0, 0.0}; - double truePositives[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; - double falseNegatives[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; - TMeanVarAccumulator timeToDetectionMoments[5]; - TMaxAccumulator timeToDetectionMax[5]; - core_t::TTime lastTruePositive[5] = {time, time, time, time, time}; - TFunction functions[] = {&constant, &ramp, &markov, - &smoothDaily, &smoothWeekly, &spikeyDaily, - &spikeyWeekly, &weekends}; - - for (std::size_t i = 0u; i < samples.size(); ++i) { - for (std::size_t j = 0u; j < boost::size(functions); ++j) { - rtests[j].add(time, 600.0 * (functions[j])(time) + samples[i]); - } - if (time >= day + DAY) { - for (std::size_t j = 0u; j < boost::size(rtests); ++j) { - if (j < 3) { - (rtests[j].test() ? falsePositives[j] : trueNegatives[j]) += 1.0; - } else { - (rtests[j].test() ? truePositives[j - 3] - : falseNegatives[j - 3]) += 1.0; - if (rtests[j].test()) { - timeToDetectionMoments[j - 3].add( - time - lastTruePositive[j - 3]); - timeToDetectionMax[j - 3].add( - static_cast(time - lastTruePositive[j - 3])); - lastTruePositive[j - 3] = time; - } - } - } - day += DAY; - } - time += HALF_HOUR; - } - - LOG_DEBUG(<< "falsePositives = " << core::CContainerPrinter::print(falsePositives)); - LOG_DEBUG(<< "trueNegatives = " << core::CContainerPrinter::print(trueNegatives)); - for (std::size_t i = 0u; i < boost::size(falsePositives); ++i) { - CPPUNIT_ASSERT(falsePositives[i] / trueNegatives[i] < 0.1); - typeI.add(falsePositives[i] / trueNegatives[i]); - } - LOG_DEBUG(<< "truePositives = " << core::CContainerPrinter::print(truePositives)); - LOG_DEBUG(<< "falseNegatives = " << core::CContainerPrinter::print(falseNegatives)); - for (std::size_t i = 0u; i < boost::size(falsePositives); ++i) { - CPPUNIT_ASSERT(falseNegatives[i] / truePositives[i] < 0.2); - typeII.add(falseNegatives[i] / truePositives[i]); - } - - for (std::size_t i = 0u; i < boost::size(timeToDetectionMoments); ++i) { - LOG_DEBUG(<< "time to detect moments = " << timeToDetectionMoments[i]); - LOG_DEBUG(<< "maximum time to detect = " << timeToDetectionMax[i][0]); - CPPUNIT_ASSERT(maths::CBasicStatistics::mean(timeToDetectionMoments[i]) < - 1.5 * DAY); - CPPUNIT_ASSERT(std::sqrt(maths::CBasicStatistics::variance( - timeToDetectionMoments[i])) < 5 * DAY); - CPPUNIT_ASSERT(timeToDetectionMax[i][0] <= 27 * WEEK); - } - } - LOG_DEBUG(<< "type I = " << maths::CBasicStatistics::mean(typeI)); - LOG_DEBUG(<< "type II = " << maths::CBasicStatistics::mean(typeII)); - CPPUNIT_ASSERT(maths::CBasicStatistics::mean(typeI) < 0.015); - CPPUNIT_ASSERT(maths::CBasicStatistics::mean(typeII) < 0.05); -} - -void CTrendTestsTest::testCalendarCyclic() { - using TOptionalFeature = maths::CCalendarCyclicTest::TOptionalFeature; - - core::CTimezone::instance().timezoneName("GMT"); - - test::CRandomNumbers rng; - - LOG_DEBUG(<< "Day of month"); - { - // Repeated error on the second day of the month. - - core_t::TTime months[] = { - 86400, // 2nd Jan - 2764800, // 2nd Feb - 5184000, // 2nd Mar - 7862400, // 2nd Apr - 10454400 // 2nd May - }; - core_t::TTime end = months[boost::size(months) - 1] + 86400; - - maths::CCalendarCyclicTest cyclic(HALF_HOUR); - - TDoubleVec error; - for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { - ptrdiff_t i = maths::CTools::truncate( - std::lower_bound(boost::begin(months), boost::end(months), time) - - boost::begin(months), - ptrdiff_t(1), ptrdiff_t(boost::size(months))); - - rng.generateNormalSamples(0.0, 10.0, 1, error); - if (time >= months[i - 1] + 30000 && time < months[i - 1] + 50000) { - error[0] *= 5.0; - } - cyclic.add(time, error[0]); - - if (time > 121 * DAY && time % DAY == 0) { - TOptionalFeature feature = cyclic.test(); - CPPUNIT_ASSERT_EQUAL(std::string("2nd day of month"), - core::CContainerPrinter::print(feature)); - } - } - } - - LOG_DEBUG(<< "Days before end of month"); - { - // Repeated error on the last day of the month. - - core_t::TTime months[] = { - 2592000, // 31st Jan - 5011200, // 28th Feb - 7689600, // 31st Mar - 10281600, // 30th Apr - 12960000 // 31st May - }; - core_t::TTime end = months[boost::size(months) - 1] + 86400; - - maths::CCalendarCyclicTest cyclic(HALF_HOUR); - - TDoubleVec error; - for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { - ptrdiff_t i = maths::CTools::truncate( - std::lower_bound(boost::begin(months), boost::end(months), time) - - boost::begin(months), - ptrdiff_t(1), ptrdiff_t(boost::size(months))); - - rng.generateNormalSamples(0.0, 10.0, 1, error); - if (time >= months[i - 1] + 10000 && time < months[i - 1] + 20000) { - error[0] += 12.0; - } - cyclic.add(time, error[0]); - - if (time > 121 * DAY && time % DAY == 0) { - TOptionalFeature feature = cyclic.test(); - CPPUNIT_ASSERT_EQUAL(std::string("0 days before end of month"), - core::CContainerPrinter::print(feature)); - } - } - } - - LOG_DEBUG(<< "Day of week week of month"); - { - // Repeated error on first Monday of each month. - - core_t::TTime months[] = { - 345600, // Mon 5th Jan - 2764800, // Mon 2nd Feb - 5184000, // Mon 2nd Mar - 8208000, // Mon 6th Apr - 10627200 // Mon 4th May - }; - core_t::TTime end = months[boost::size(months) - 1] + 86400; - - maths::CCalendarCyclicTest cyclic(HALF_HOUR); - - TDoubleVec error; - for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { - ptrdiff_t i = maths::CTools::truncate( - std::lower_bound(boost::begin(months), boost::end(months), time) - - boost::begin(months), - ptrdiff_t(1), ptrdiff_t(boost::size(months))); - - rng.generateNormalSamples(0.0, 10.0, 1, error); - if (time >= months[i - 1] + 45000 && time < months[i - 1] + 60000) { - error[0] += 12.0; - } - cyclic.add(time, error[0]); - - if (time > 121 * DAY && time % DAY == 0) { - TOptionalFeature feature = cyclic.test(); - CPPUNIT_ASSERT_EQUAL(std::string("1st Monday of month"), - core::CContainerPrinter::print(feature)); - } - } - } - - LOG_DEBUG(<< "Day of week weeks before end of month"); - { - // Repeated error on last Friday of each month. - core_t::TTime months[] = { - 2505600, // Fri 30th Jan - 4924800, // Fri 27th Feb - 7344000, // Fri 27th Mar - 9763200, // Fri 24th Apr - 12787200 // Fri 29th May - }; - core_t::TTime end = months[boost::size(months) - 1] + 86400; - - maths::CCalendarCyclicTest cyclic(HALF_HOUR); - - TDoubleVec error; - for (core_t::TTime time = 0; time <= end; time += HALF_HOUR) { - ptrdiff_t i = maths::CTools::truncate( - std::lower_bound(boost::begin(months), boost::end(months), time) - - boost::begin(months), - ptrdiff_t(1), ptrdiff_t(boost::size(months))); - - rng.generateNormalSamples(0.0, 10.0, 1, error); - if (time >= months[i - 1] + 45000 && time < months[i - 1] + 60000) { - error[0] += 12.0; - } - cyclic.add(time, error[0]); - - if (time > 121 * DAY && time % DAY == 0) { - TOptionalFeature feature = cyclic.test(); - CPPUNIT_ASSERT_EQUAL(std::string("0 Fridays before end of month"), - core::CContainerPrinter::print(feature)); - } - } - } -} - -void CTrendTestsTest::testPersist() { - // Check that persistence is idempotent. - - LOG_DEBUG(<< "Test CRandomizedPeriodicityTest"); - { - maths::CRandomizedPeriodicityTest test; - for (core_t::TTime t = 1400000000; t < 1400050000; t += 5000) { - test.add(t, 0.2); - } - - std::string origXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - test.acceptPersistInserter(inserter); - inserter.toXml(origXml); - } - - std::string origStaticsXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - test.staticsAcceptPersistInserter(inserter); - inserter.toXml(origStaticsXml); - } - - // Check that the static state is also preserved - uint64_t origNextRandom = test.ms_Rng(); - - LOG_DEBUG(<< "XML representation:\n" << origXml); - - // Restore the XML into a new test - maths::CRandomizedPeriodicityTest test2; - { - core::CRapidXmlParser parser; - CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); - core::CRapidXmlStateRestoreTraverser traverser(parser); - CPPUNIT_ASSERT(traverser.traverseSubLevel(boost::bind( - &maths::CRandomizedPeriodicityTest::acceptRestoreTraverser, &test2, _1))); - } - std::string newXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - test2.acceptPersistInserter(inserter); - inserter.toXml(newXml); - } - CPPUNIT_ASSERT_EQUAL(origXml, newXml); - - { - core::CRapidXmlParser parser; - CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origStaticsXml)); - core::CRapidXmlStateRestoreTraverser traverser(parser); - CPPUNIT_ASSERT(traverser.traverseSubLevel( - &maths::CRandomizedPeriodicityTest::staticsAcceptRestoreTraverser)); - } - std::string newStaticsXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - test2.staticsAcceptPersistInserter(inserter); - inserter.toXml(newStaticsXml); - } - CPPUNIT_ASSERT_EQUAL(origStaticsXml, newStaticsXml); - - uint64_t newNextRandom = test2.ms_Rng(); - CPPUNIT_ASSERT_EQUAL(origNextRandom, newNextRandom); - } - - LOG_DEBUG(<< "Test CCalendarCyclicTest"); - { - test::CRandomNumbers rng; - - maths::CCalendarCyclicTest orig(HALF_HOUR); - - TDoubleVec error; - for (core_t::TTime time = 0; time <= 12787200; time += HALF_HOUR) { - rng.generateNormalSamples(0.0, 10.0, 1, error); - orig.add(time, error[0]); - } - - std::string origXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - orig.acceptPersistInserter(inserter); - inserter.toXml(origXml); - } - - LOG_DEBUG(<< "XML representation:\n" << origXml); - - maths::CCalendarCyclicTest restored(HALF_HOUR); - { - core::CRapidXmlParser parser; - CPPUNIT_ASSERT(parser.parseStringIgnoreCdata(origXml)); - core::CRapidXmlStateRestoreTraverser traverser(parser); - CPPUNIT_ASSERT(traverser.traverseSubLevel(boost::bind( - &maths::CCalendarCyclicTest::acceptRestoreTraverser, &restored, _1))); - } - CPPUNIT_ASSERT_EQUAL(orig.checksum(), restored.checksum()); - - std::string newXml; - { - core::CRapidXmlStatePersistInserter inserter("root"); - restored.acceptPersistInserter(inserter); - inserter.toXml(newXml); - } - CPPUNIT_ASSERT_EQUAL(origXml, newXml); - } -} - -CppUnit::Test* CTrendTestsTest::suite() { - CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CTrendTestsTest"); - - suiteOfTests->addTest(new CppUnit::TestCaller( - "CTrendTestsTest::testRandomizedPeriodicity", &CTrendTestsTest::testRandomizedPeriodicity)); - suiteOfTests->addTest(new CppUnit::TestCaller( - "CTrendTestsTest::testCalendarCyclic", &CTrendTestsTest::testCalendarCyclic)); - suiteOfTests->addTest(new CppUnit::TestCaller( - "CTrendTestsTest::testPersist", &CTrendTestsTest::testPersist)); - - return suiteOfTests; -} diff --git a/lib/maths/unittest/Main.cc b/lib/maths/unittest/Main.cc index 0ae687eea6..6415556869 100644 --- a/lib/maths/unittest/Main.cc +++ b/lib/maths/unittest/Main.cc @@ -13,6 +13,7 @@ #include "CBootstrapClustererTest.h" #include "CBoundingBoxTest.h" #include "CCalendarComponentAdaptiveBucketingTest.h" +#include "CCalendarCyclicTestTest.h" #include "CCalendarFeatureTest.h" #include "CCategoricalToolsTest.h" #include "CChecksumTest.h" @@ -63,6 +64,7 @@ #include "CQuantileSketchTest.h" #include "CRadialBasisFunctionTest.h" #include "CRandomProjectionClustererTest.h" +#include "CRandomizedPeriodicityTestTest.h" #include "CRegressionTest.h" #include "CSamplingTest.h" #include "CSeasonalComponentAdaptiveBucketingTest.h" @@ -77,7 +79,6 @@ #include "CTimeSeriesModelTest.h" #include "CToolsTest.h" #include "CTrendComponentTest.h" -#include "CTrendTestsTest.h" #include "CXMeansOnline1dTest.h" #include "CXMeansOnlineTest.h" #include "CXMeansTest.h" @@ -92,8 +93,9 @@ int main(int argc, const char** argv) { runner.addTest(CBootstrapClustererTest::suite()); runner.addTest(CBoundingBoxTest::suite()); runner.addTest(CCategoricalToolsTest::suite()); - runner.addTest(CCalendarFeatureTest::suite()); runner.addTest(CCalendarComponentAdaptiveBucketingTest::suite()); + runner.addTest(CCalendarCyclicTestTest::suite()); + runner.addTest(CCalendarFeatureTest::suite()); runner.addTest(CChecksumTest::suite()); runner.addTest(CClustererTest::suite()); runner.addTest(CCountMinSketchTest::suite()); @@ -142,6 +144,7 @@ int main(int argc, const char** argv) { runner.addTest(CQuantileSketchTest::suite()); runner.addTest(CRadialBasisFunctionTest::suite()); runner.addTest(CRandomProjectionClustererTest::suite()); + runner.addTest(CRandomizedPeriodicityTestTest::suite()); runner.addTest(CRegressionTest::suite()); runner.addTest(CSamplingTest::suite()); runner.addTest(CSeasonalComponentTest::suite()); @@ -156,7 +159,6 @@ int main(int argc, const char** argv) { runner.addTest(CTimeSeriesModelTest::suite()); runner.addTest(CToolsTest::suite()); runner.addTest(CTrendComponentTest::suite()); - runner.addTest(CTrendTestsTest::suite()); runner.addTest(CXMeansTest::suite()); runner.addTest(CXMeansOnlineTest::suite()); runner.addTest(CXMeansOnline1dTest::suite()); diff --git a/lib/maths/unittest/Makefile b/lib/maths/unittest/Makefile index 96f7a44c09..0083fd4fa0 100644 --- a/lib/maths/unittest/Makefile +++ b/lib/maths/unittest/Makefile @@ -23,8 +23,9 @@ SRCS=\ CBjkstUniqueValuesTest.cc \ CBootstrapClustererTest.cc \ CBoundingBoxTest.cc \ - CCalendarFeatureTest.cc \ CCalendarComponentAdaptiveBucketingTest.cc \ + CCalendarCyclicTestTest.cc \ + CCalendarFeatureTest.cc \ CCategoricalToolsTest.cc \ CChecksumTest.cc \ CClustererTest.cc \ @@ -74,6 +75,7 @@ SRCS=\ CQuantileSketchTest.cc \ CRegressionTest.cc \ CRadialBasisFunctionTest.cc \ + CRandomizedPeriodicityTestTest.cc \ CRandomProjectionClustererTest.cc \ CSamplingTest.cc \ CSeasonalComponentTest.cc \ @@ -88,7 +90,6 @@ SRCS=\ CTimeSeriesModelTest.cc \ CToolsTest.cc \ CTrendComponentTest.cc \ - CTrendTestsTest.cc \ CXMeansOnlineTest.cc \ CXMeansOnline1dTest.cc \ CXMeansTest.cc \ diff --git a/lib/model/CAnomalyDetector.cc b/lib/model/CAnomalyDetector.cc index 37f14afb58..cfb84c57a8 100644 --- a/lib/model/CAnomalyDetector.cc +++ b/lib/model/CAnomalyDetector.cc @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -49,7 +48,7 @@ const std::string KEY_TAG("d"); const std::string SIMPLE_COUNT_STATICS("f"); // classes containing static members needing persistence -const std::string RANDOMIZED_PERIODIC_TAG("a"); +//const std::string RANDOMIZED_PERIODIC_TAG("a"); // No longer used const std::string STATISTICS_TAG("b"); const std::string SAMPLING_TAG("c"); @@ -253,13 +252,7 @@ bool CAnomalyDetector::legacyModelsAcceptRestoreTraverser(core::CStateRestoreTra bool CAnomalyDetector::staticsAcceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { do { const std::string& name = traverser.name(); - if (name == RANDOMIZED_PERIODIC_TAG) { - if (traverser.traverseSubLevel(&maths::CRandomizedPeriodicityTest::staticsAcceptRestoreTraverser) == - false) { - LOG_ERROR(<< "Failed to restore randomized periodic test state"); - return false; - } - } else if (name == STATISTICS_TAG) { + if (name == STATISTICS_TAG) { if (traverser.traverseSubLevel( &core::CStatistics::staticsAcceptRestoreTraverser) == false) { LOG_ERROR(<< "Failed to restore statistics"); @@ -333,8 +326,6 @@ void CAnomalyDetector::acceptPersistInserter(core::CStatePersistInserter& insert } void CAnomalyDetector::staticsAcceptPersistInserter(core::CStatePersistInserter& inserter) const { - inserter.insertLevel(RANDOMIZED_PERIODIC_TAG, - &maths::CRandomizedPeriodicityTest::staticsAcceptPersistInserter); inserter.insertLevel(STATISTICS_TAG, &core::CStatistics::staticsAcceptPersistInserter); inserter.insertLevel(SAMPLING_TAG, &maths::CSampling::staticsAcceptPersistInserter); }