Skip to content

Commit

Permalink
Metrics cache integration with C++ backend
Browse files Browse the repository at this point in the history
  • Loading branch information
Naman Nandan committed Nov 29, 2022
1 parent f21518d commit 0ade963
Show file tree
Hide file tree
Showing 20 changed files with 233 additions and 22 deletions.
17 changes: 16 additions & 1 deletion cpp/src/backends/process/model_worker_socket.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "src/backends/process/model_worker.hh"
#include "src/utils/logging.hh"
#include "src/utils/metrics/registry.hh"

DEFINE_string(sock_type, "tcp", "socket type");
DEFINE_string(sock_name, "", "socket name for uds");
Expand All @@ -16,6 +17,7 @@ DEFINE_string(device_type, "cpu", "cpu, or gpu");
// TODO: discuss multiple backends support
DEFINE_string(model_dir, "", "model path");
DEFINE_string(logger_config_path, "", "Logging config file path");
DEFINE_string(metrics_config_path, "", "Metrics config file path");

int main(int argc, char* argv[]) {
try {
Expand All @@ -34,6 +36,19 @@ int main(int argc, char* argv[]) {
}
torchserve::Logger::InitLogger(FLAGS_logger_config_path);

if (FLAGS_metrics_config_path.empty()) {
FLAGS_metrics_config_path =
std::string() +
std::filesystem::canonical(gflags::ProgramInvocationName())
.parent_path()
.c_str() +
PATH_SEPARATOR + ".." + PATH_SEPARATOR + ".." + PATH_SEPARATOR +
".." + PATH_SEPARATOR + "ts" + PATH_SEPARATOR + "configs" +
PATH_SEPARATOR + "metrics.yaml";
}
torchserve::MetricsRegistry::Initialize(
FLAGS_metrics_config_path, torchserve::MetricsContext::BACKEND);

torchserve::SocketServer server = torchserve::SocketServer::GetInstance();
server.Initialize(FLAGS_sock_type, FLAGS_sock_name, FLAGS_host, FLAGS_port,
FLAGS_runtime_type, FLAGS_device_type, FLAGS_model_dir);
Expand All @@ -44,4 +59,4 @@ int main(int argc, char* argv[]) {
} catch (...) {
std::cout << "cpp backend failed to start\n";
}
}
}
15 changes: 15 additions & 0 deletions cpp/src/backends/torch_scripted/handler/base_handler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,26 @@ void BaseHandler::Handle(
std::pair<std::string&, std::map<uint8_t, std::string>&> idx_to_req_id(
req_ids, map_idx_to_req_id);
try {
auto start_time = std::chrono::high_resolution_clock::now();
auto inputs =
Preprocess(device, idx_to_req_id, request_batch, response_batch);
auto outputs =
Inference(model, inputs, device, idx_to_req_id, response_batch);
Postprocess(outputs, idx_to_req_id, response_batch);
auto stop_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = stop_time - start_time;
try {
auto& handler_time_metric =
torchserve::MetricsRegistry::GetMetricsCacheInstance()->GetMetric(
torchserve::MetricType::GAUGE, "HandlerTime");
handler_time_metric.AddOrUpdate(
std::vector<std::string>{manifest_->GetModel().model_name, "Model"},
idx_to_req_id.first, duration.count());
} catch (std::runtime_error& e) {
TS_LOG(ERROR, e.what());
} catch (std::invalid_argument& e) {
TS_LOGF(ERROR, "Failed to record HandlerTime metric. {}", e.what());
}
} catch (...) {
TS_LOG(ERROR, "Failed to handle this batch");
}
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/backends/torch_scripted/handler/base_handler.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
#include <torch/script.h>
#include <torch/torch.h>

#include <chrono>
#include <functional>
#include <map>
#include <memory>
#include <ratio>
#include <utility>

#include "src/utils/logging.hh"
#include "src/utils/message.hh"
#include "src/utils/metrics/registry.hh"
#include "src/utils/model_archive.hh"

namespace torchserve {
Expand Down Expand Up @@ -80,4 +83,4 @@ class BaseHandler {
};
} // namespace torchscripted
} // namespace torchserve
#endif // TS_CPP_BACKENDS_TORCH_HANDLER_BASE_HANDLER_HH_
#endif // TS_CPP_BACKENDS_TORCH_HANDLER_BASE_HANDLER_HH_
1 change: 1 addition & 0 deletions cpp/src/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/units.cc)
list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/log_metric.cc)
list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/yaml_config.cc)
list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/log_metrics_cache.cc)
list(APPEND TS_UTILS_SOURCE_FILES ${TS_UTILS_SRC_DIR}/metrics/registry.cc)
add_library(ts_utils SHARED ${TS_UTILS_SOURCE_FILES})
target_include_directories(ts_utils PUBLIC ${TS_UTILS_SRC_DIR})
target_include_directories(ts_utils PRIVATE ${Boost_INCLUDE_DIRS})
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/utils/metrics/cache.hh
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
#include <string>
#include <vector>

#include "src/utils/metrics/config.hh"
#include "src/utils/metrics/metric.hh"

namespace torchserve {
class MetricsCache {
public:
virtual ~MetricsCache() {}
virtual void Initialize(
const MetricsConfigurationHandler& config_handler) = 0;
virtual IMetric& GetMetric(const MetricType& type,
const std::string& name) = 0;

Expand Down
11 changes: 11 additions & 0 deletions cpp/src/utils/metrics/config.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,23 @@ enum MetricsMode { LOG, PROMETHEUS };

enum MetricsContext { BACKEND, FRONTEND };

// Due to https://github.com/llvm/llvm-project/issues/54668,
// ignore bugprone-exception-escape
// NOLINTBEGIN(bugprone-exception-escape)
struct MetricConfiguration {
MetricType type;
std::string name;
std::string unit;
std::vector<std::string> dimension_names;

MetricConfiguration() : type(MetricType::COUNTER) {}

MetricConfiguration(const MetricType& type, const std::string& name,
const std::string& unit,
const std::vector<std::string>& dimension_names)
: type(type), name(name), unit(unit), dimension_names(dimension_names) {}
// NOLINTEND(bugprone-exception-escape)

bool operator==(const MetricConfiguration& config) const {
return (config.type == type) && (config.name == name) &&
(config.unit == unit) && (config.dimension_names == dimension_names);
Expand Down
11 changes: 5 additions & 6 deletions cpp/src/utils/metrics/log_metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ void TSLogMetric::AddOrUpdate(const std::vector<std::string>& dimension_values,
ValidateDimensionValues(dimension_values);
ValidateMetricValue(value);
} catch (const std::invalid_argument& exception) {
TS_LOGF(ERROR,
"[METRICS]Failed to update metric with name: {} and dimensions: {} "
"with value: {}. {}",
name, BuildDimensionsString(dimension_values), value,
exception.what());
return;
std::string error_message =
"Failed to update metric with name: " + name +
" and dimensions: " + BuildDimensionsString(dimension_values) +
" with value: " + std::to_string(value) + ". " + exception.what();
throw std::invalid_argument(error_message);
}

Emit(dimension_values, request_id, value);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/utils/metrics/log_metric.hh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TSLogMetric : public IMetric {
void AddOrUpdate(const std::vector<std::string>& dimension_values,
const double& value) override;
void AddOrUpdate(const std::vector<std::string>& dimension_values,
const std::string& request_id, const double& value);
const std::string& request_id, const double& value) override;

private:
std::string BuildDimensionsString(
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/utils/metrics/log_metrics_cache.hh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ namespace torchserve {
class LogMetricsCache : public MetricsCache {
public:
~LogMetricsCache() override {}
void Initialize(const MetricsConfigurationHandler& config_handler);
void Initialize(const MetricsConfigurationHandler& config_handler) override;
TSLogMetric& GetMetric(const MetricType& type,
const std::string& name) override;

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/utils/metrics/metric.hh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ class IMetric {
virtual ~IMetric() {}
virtual void AddOrUpdate(const std::vector<std::string>& dimension_values,
const double& value) = 0;
virtual void AddOrUpdate(const std::vector<std::string>& dimension_values,
const std::string& request_id,
const double& value) = 0;

protected:
const MetricType type;
Expand Down
29 changes: 29 additions & 0 deletions cpp/src/utils/metrics/registry.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "src/utils/metrics/registry.hh"

namespace torchserve {
std::shared_ptr<MetricsCache> MetricsRegistry::metrics_cache = nullptr;

void MetricsRegistry::Initialize(const std::string& metrics_config_file_path,
const MetricsContext& metrics_context) {
try {
std::shared_ptr<MetricsConfigurationHandler> metrics_config_handler =
std::make_shared<YAMLMetricsConfigurationHandler>();
metrics_config_handler->LoadConfiguration(metrics_config_file_path,
metrics_context);
metrics_cache = std::make_shared<LogMetricsCache>();
metrics_cache->Initialize(*metrics_config_handler);
} catch (...) {
metrics_cache = nullptr;
throw;
}
}

std::shared_ptr<MetricsCache>& MetricsRegistry::GetMetricsCacheInstance() {
if (metrics_cache == nullptr) {
throw std::runtime_error(
"Metrics cache not initialized in metrics registry");
}

return metrics_cache;
}
} // namespace torchserve
21 changes: 21 additions & 0 deletions cpp/src/utils/metrics/registry.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#ifndef TS_CPP_UTILS_METRICS_REGISTRY_HH_
#define TS_CPP_UTILS_METRICS_REGISTRY_HH_

#include <stdexcept>

#include "src/utils/metrics/log_metrics_cache.hh"
#include "src/utils/metrics/yaml_config.hh"

namespace torchserve {
class MetricsRegistry {
public:
static void Initialize(const std::string& metrics_config_file_path,
const MetricsContext& metrics_context);
static std::shared_ptr<MetricsCache>& GetMetricsCacheInstance();

private:
static std::shared_ptr<MetricsCache> metrics_cache;
};
} // namespace torchserve

#endif // TS_CPP_UTILS_METRICS_REGISTRY_HH_
3 changes: 3 additions & 0 deletions cpp/test/backends/otf_protocol_and_handler_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "protocol/mock_socket.hh"
#include "src/backends/process/model_worker.hh"
#include "src/backends/torch_scripted/torch_scripted_backend.hh"
#include "src/utils/metrics/registry.hh"

namespace torchserve {
TEST(BackendIntegTest, TestOTFProtocolAndHandler) {
Expand Down Expand Up @@ -68,6 +69,8 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) {

// initialize backend
auto backend = std::make_shared<torchserve::torchscripted::Backend>();
MetricsRegistry::Initialize("test/resources/metrics/default_config.yaml",
MetricsContext::BACKEND);
backend->Initialize("test/resources/torchscript_model/mnist/base_handler");

// load the model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <memory>

#include "src/utils/message.hh"
#include "src/utils/metrics/registry.hh"

namespace torchserve {
class TorchScriptedBackendTest : public ::testing::Test {
Expand All @@ -21,6 +22,8 @@ class TorchScriptedBackendTest : public ::testing::Test {
const std::string& inference_input_file_path,
const std::string& inference_request_id_prefix,
int inference_expect_code) {
MetricsRegistry::Initialize("test/resources/metrics/default_config.yaml",
MetricsContext::BACKEND);
backend_->Initialize(model_dir);
auto result = backend_->LoadModel(std::move(load_model_request));
ASSERT_EQ(result->code, 200);
Expand Down Expand Up @@ -104,4 +107,4 @@ TEST_F(TorchScriptedBackendTest, TestLoadPredictMnistHandlerFailure) {
500);
}

} // namespace torchserve
} // namespace torchserve
9 changes: 9 additions & 0 deletions cpp/test/resources/metrics/default_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
dimensions:
- &model_name "ModelName"
- &level "Level"

model_metrics:
gauge:
- name: HandlerTime
unit: ms
dimensions: [*model_name, *level]
22 changes: 15 additions & 7 deletions cpp/test/utils/metrics/log_metric_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,11 @@ TEST_F(TSLogMetricTest, TestCounterMetric) {
ASSERT_TRUE(GetMetricValuesFromLogs().empty());

test_metric.AddOrUpdate(metric_dimension_values, 1.0);
test_metric.AddOrUpdate(metric_dimension_values, -2.0);
test_metric.AddOrUpdate(metric_dimension_values, metric_request_id, -2.5);
ASSERT_THROW(test_metric.AddOrUpdate(metric_dimension_values, -2.0),
std::invalid_argument);
ASSERT_THROW(
test_metric.AddOrUpdate(metric_dimension_values, metric_request_id, -2.5),
std::invalid_argument);
test_metric.AddOrUpdate(metric_dimension_values, metric_request_id, 3.5);
const std::vector<double> expected_metric_values{1.0, 3.5};
ASSERT_EQ(GetMetricValuesFromLogs(), expected_metric_values);
Expand Down Expand Up @@ -148,11 +151,16 @@ TEST_F(TSLogMetricTest, TestTSLogMetricEmitWithoutRequestId) {
TEST_F(TSLogMetricTest, TestTSLogMetricEmitWithIncorrectDimensionData) {
TSLogMetric test_metric(MetricType::COUNTER, metric_name, "ms",
metric_dimension_names);
test_metric.AddOrUpdate(std::vector<std::string>{"model"}, 1.5);
test_metric.AddOrUpdate(std::vector<std::string>{"model", ""}, 1.5);
test_metric.AddOrUpdate(
std::vector<std::string>{"model", "test_model", "extra_dim"}, 1.5);
ASSERT_EQ(GetMetricLogs().size(), 3);
ASSERT_THROW(test_metric.AddOrUpdate(std::vector<std::string>{"model"}, 1.5),
std::invalid_argument);
ASSERT_THROW(
test_metric.AddOrUpdate(std::vector<std::string>{"model", ""}, 1.5),
std::invalid_argument);
ASSERT_THROW(
test_metric.AddOrUpdate(
std::vector<std::string>{"model", "test_model", "extra_dim"}, 1.5),
std::invalid_argument);
ASSERT_EQ(GetMetricLogs().size(), 0);
ASSERT_TRUE(GetMetricValuesFromLogs().empty());
}
} // namespace torchserve
36 changes: 36 additions & 0 deletions cpp/test/utils/metrics/registry_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "src/utils/metrics/registry.hh"

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <stdexcept>

namespace torchserve {
TEST(RegistryTest, TestValidConfigFile) {
MetricsRegistry::Initialize("test/resources/metrics/valid_config.yaml",
MetricsContext::BACKEND);
ASSERT_THAT(MetricsRegistry::GetMetricsCacheInstance(),
::testing::A<std::shared_ptr<MetricsCache>>());
}

TEST(RegistryTest, TestInvalidConfigFile) {
ASSERT_THROW(
MetricsRegistry::Initialize(
"test/resources/metrics/invalid_config_duplicate_dimension.yaml",
MetricsContext::BACKEND),
std::invalid_argument);
ASSERT_THROW(MetricsRegistry::GetMetricsCacheInstance(), std::runtime_error);
}

TEST(RegistryTest, TestReInitialize) {
MetricsRegistry::Initialize("test/resources/metrics/valid_config.yaml",
MetricsContext::BACKEND);
ASSERT_THAT(MetricsRegistry::GetMetricsCacheInstance(),
::testing::A<std::shared_ptr<MetricsCache>>());

MetricsRegistry::Initialize("test/resources/metrics/default_config.yaml",
MetricsContext::BACKEND);
ASSERT_THAT(MetricsRegistry::GetMetricsCacheInstance(),
::testing::A<std::shared_ptr<MetricsCache>>());
}
} // namespace torchserve
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public final class ConfigManager {
private static final String TS_NETTY_CLIENT_THREADS = "netty_client_threads";
private static final String TS_JOB_QUEUE_SIZE = "job_queue_size";
private static final String TS_NUMBER_OF_GPU = "number_of_gpu";
private static final String TS_METRICS_CONFIG = "metrics_config";

// IPEX config option that can be set at config.properties
private static final String TS_IPEX_ENABLE = "ipex_enable";
Expand Down Expand Up @@ -370,6 +371,14 @@ public int getNumberOfGpu() {
return getIntProperty(TS_NUMBER_OF_GPU, 0);
}

public String getMetricsConfigPath() {
String path = getCanonicalPath(prop.getProperty(TS_METRICS_CONFIG));
if (path == null) {
path = getModelServerHome() + "/ts/configs/metrics.yaml";
}
return path;
}

public String getTsDefaultServiceHandler() {
return getProperty(TS_DEFAULT_SERVICE_HANDLER, null);
}
Expand Down Expand Up @@ -593,6 +602,8 @@ public String dumpConfigurations() {
+ getCanonicalPath(".")
+ "\nTemp directory: "
+ System.getProperty("java.io.tmpdir")
+ "\nMetrics config path: "
+ getMetricsConfigPath()
+ "\nNumber of GPUs: "
+ getNumberOfGpu()
+ "\nNumber of CPUs: "
Expand Down
Loading

0 comments on commit 0ade963

Please sign in to comment.