diff --git a/system/autoware_node_death_monitor/CMakeLists.txt b/system/autoware_node_death_monitor/CMakeLists.txt new file mode 100644 index 0000000000000..8bf400e0c0649 --- /dev/null +++ b/system/autoware_node_death_monitor/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.14) +project(autoware_node_death_monitor) + +find_package(autoware_cmake REQUIRED) +autoware_package() + +ament_auto_add_library(${PROJECT_NAME} SHARED + src/autoware_node_death_monitor.cpp +) + +rclcpp_components_register_node(${PROJECT_NAME} + PLUGIN "autoware::node_death_monitor::NodeDeathMonitor" + EXECUTABLE ${PROJECT_NAME}_node) + +ament_auto_package(INSTALL_TO_SHARE + config + launch +) diff --git a/system/autoware_node_death_monitor/README.md b/system/autoware_node_death_monitor/README.md new file mode 100644 index 0000000000000..7df8393a64d03 --- /dev/null +++ b/system/autoware_node_death_monitor/README.md @@ -0,0 +1,73 @@ +# autoware_node_death_monitor + +This package provides a monitoring node that detects ROS 2 node crashes by analyzing `launch.log` files, rather than subscribing to `/rosout` logs. + +--- + +## Overview + +- **Node name**: `autoware_node_death_monitor` +- **Monitored file**: `launch.log` +- **Detected event**: Looks for lines containing the substring `"process has died"` and extracts the node name and exit code. + +When a crash or unexpected shutdown occurs, `ros2 launch` typically outputs a line in `launch.log` such as: + +```bash +[ERROR] [node_name-1]: process has died [pid 12345, exit code 139, cmd '...'] +``` + +The `autoware_node_death_monitor` node continuously reads the latest `launch.log` file, detects these messages, and logs a warning or marks the node as "dead." + +--- + +## How it Works + +1. **Find `launch.log`**: + - First, checks the `ROS_LOG_DIR` environment variable. + - If not set, falls back to `~/.ros/log`. + - Identifies the latest log directory based on modification time. +2. **Monitor `launch.log`**: + - Reads the file from the last known position to detect new log entries. + - Looks for lines containing `"process has died"`. + - Extracts the node name and exit code. +3. **Filtering**: + - **Ignored node names**: Nodes matching patterns in `ignore_node_names` are skipped. + - **Ignored exit codes**: Logs with ignored exit codes are not flagged as errors. +4. **Regular Updates**: + - A timer periodically reads new entries from `launch.log`. + - Dead nodes are reported in the logs. (will be changed to publish diagnostics) + +--- + +## Parameters + +| Parameter Name | Type | Default | Description | +| ------------------- | ---------- | ----------------- | ---------------------------------------------------------- | +| `ignore_node_names` | `string[]` | `[]` (empty list) | Node name patterns to ignore. E.g., `['rviz2']`. | +| `ignore_exit_codes` | `int[]` | `[]` (empty list) | Exit codes to ignore (e.g., `0` or `130` for normal exit). | +| `check_interval` | `double` | `1.0` | Timer interval (seconds) for scanning the log file. | +| `enable_debug` | `bool` | `false` | Enables debug logging for detailed output. | + +Example **`autoware_node_death_monitor.param.yaml`**: + +```yaml +autoware_node_death_monitor: + ros__parameters: + ignore_node_names: + - rviz2 + - teleop_twist_joy + ignore_exit_codes: + - 0 + - 130 + check_interval: 1.0 + enable_debug: false +``` + +--- + +## Limitations + +- **後で書く**: TBD. +- **Robust Monitoring**: Works alongside systemd, supervisord, or other process supervisors for enhanced fault detection. + +--- diff --git a/system/autoware_node_death_monitor/config/autoware_node_death_monitor.param.yaml b/system/autoware_node_death_monitor/config/autoware_node_death_monitor.param.yaml new file mode 100644 index 0000000000000..f65fb0f554ecc --- /dev/null +++ b/system/autoware_node_death_monitor/config/autoware_node_death_monitor.param.yaml @@ -0,0 +1,18 @@ +/**: + ros__parameters: + # Node names to exclude from monitoring (Note: be careful with the "[node_name-#]" format) + # Example: Do not issue a warning if rviz2 crashes. + ignore_node_names: + - rviz2 + + # Exit codes to exclude from monitoring (e.g., Ctrl+C) + # Example: 0, 130 are considered normal exits and not treated as errors. + ignore_exit_codes: + - 0 + - 130 + + # Check interval (seconds) + check_interval: 1.0 + + # Enable/disable debug output + enable_debug: false diff --git a/system/autoware_node_death_monitor/include/autoware_node_death_monitor/autoware_node_death_monitor.hpp b/system/autoware_node_death_monitor/include/autoware_node_death_monitor/autoware_node_death_monitor.hpp new file mode 100644 index 0000000000000..ec4392294b5d8 --- /dev/null +++ b/system/autoware_node_death_monitor/include/autoware_node_death_monitor/autoware_node_death_monitor.hpp @@ -0,0 +1,72 @@ +// Copyright 2025 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE_NODE_DEATH_MONITOR__AUTOWARE_NODE_DEATH_MONITOR_HPP_ +#define AUTOWARE_NODE_DEATH_MONITOR__AUTOWARE_NODE_DEATH_MONITOR_HPP_ + +#include "rclcpp/rclcpp.hpp" + +#include +#include +#include +#include + +namespace autoware::node_death_monitor +{ + +class NodeDeathMonitor : public rclcpp::Node +{ +public: + /** + * @brief Constructor for NodeDeathMonitor + * @param options Node options for configuration + */ + explicit NodeDeathMonitor(const rclcpp::NodeOptions & options); + +private: + /** + * @brief Read and process new content appended to launch.log + */ + void read_launch_log_diff(); + + /** + * @brief Parse a single line from the log for process death information + * @param line The log line to parse + */ + void parse_log_line(const std::string & line); + + /** + * @brief Timer callback to report and manage dead node list + */ + void on_timer(); + + // Map to track dead nodes: [node_name-#] -> true + std::unordered_map dead_nodes_; + + rclcpp::TimerBase::SharedPtr timer_; + + // Launch log file path and read position + std::filesystem::path launch_log_path_; + size_t last_file_pos_{static_cast(-1)}; + + // Parameters + std::vector ignore_node_names_; // Node names to exclude from monitoring + std::vector ignore_exit_codes_; // Exit codes to ignore (e.g., normal termination) + double check_interval_{1.0}; // Check interval in seconds + bool enable_debug_{false}; // Enable debug output +}; + +} // namespace autoware::node_death_monitor + +#endif // AUTOWARE_NODE_DEATH_MONITOR__AUTOWARE_NODE_DEATH_MONITOR_HPP_ diff --git a/system/autoware_node_death_monitor/launch/autoware_node_death_monitor.launch.xml b/system/autoware_node_death_monitor/launch/autoware_node_death_monitor.launch.xml new file mode 100644 index 0000000000000..7d217f614556c --- /dev/null +++ b/system/autoware_node_death_monitor/launch/autoware_node_death_monitor.launch.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/system/autoware_node_death_monitor/package.xml b/system/autoware_node_death_monitor/package.xml new file mode 100644 index 0000000000000..39b524ff78d01 --- /dev/null +++ b/system/autoware_node_death_monitor/package.xml @@ -0,0 +1,23 @@ + + + autoware_node_death_monitor + 0.0.1 + The node_death_monitor package + + Kyoichi Sugahara + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + + rcl_interfaces + rclcpp + rclcpp_components + + ament_cmake_gtest + ament_lint_auto + + + ament_cmake + + diff --git a/system/autoware_node_death_monitor/src/autoware_node_death_monitor.cpp b/system/autoware_node_death_monitor/src/autoware_node_death_monitor.cpp new file mode 100644 index 0000000000000..93597ab09d575 --- /dev/null +++ b/system/autoware_node_death_monitor/src/autoware_node_death_monitor.cpp @@ -0,0 +1,310 @@ +// Copyright 2025 Tier IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware_node_death_monitor/autoware_node_death_monitor.hpp" + +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace autoware::node_death_monitor +{ + +/** + * @brief Find the latest launch.log file + * @return Path to latest launch.log file, empty path if not found + */ +static fs::path find_latest_launch_log() +{ + // Check ROS_LOG_DIR environment variable first + const char * ros_log_dir_env = std::getenv("ROS_LOG_DIR"); + fs::path base_path; + if (ros_log_dir_env) { + base_path = fs::path(ros_log_dir_env); + } else { + // Fallback to ~/.ros/log + const char * home_env = std::getenv("HOME"); + if (!home_env) { + base_path = fs::current_path(); + } else { + base_path = fs::path(home_env) / ".ros" / "log"; + } + } + + if (!fs::exists(base_path) || !fs::is_directory(base_path)) { + return fs::path(); + } + + // Find the latest directory by last write time + fs::path latest_dir; + auto latest_time = fs::file_time_type::min(); + + for (auto & entry : fs::directory_iterator(base_path)) { + if (entry.is_directory()) { + auto ftime = fs::last_write_time(entry.path()); + if (ftime > latest_time) { + latest_time = ftime; + latest_dir = entry.path(); + } + } + } + + if (latest_dir.empty()) { + return fs::path(); + } + + // Check for launch.log in the latest directory + fs::path log_file = latest_dir / "launch.log"; + if (fs::exists(log_file) && fs::is_regular_file(log_file)) { + return log_file; + } + return fs::path(); +} + +NodeDeathMonitor::NodeDeathMonitor(const rclcpp::NodeOptions & options) +: Node("autoware_node_death_monitor", options) +{ + ignore_node_names_ = + declare_parameter>("ignore_node_names", std::vector{}); + ignore_exit_codes_ = declare_parameter>("ignore_exit_codes"); + check_interval_ = declare_parameter("check_interval"); + enable_debug_ = declare_parameter("enable_debug"); + + // Initialize launch.log monitoring + launch_log_path_ = find_latest_launch_log(); + if (launch_log_path_.empty()) { + RCLCPP_WARN(get_logger(), "Could not find latest launch.log. Monitoring disabled."); + } else { + RCLCPP_WARN(get_logger(), "Monitoring launch.log at: %s", launch_log_path_.c_str()); + } + + // Set initial file position for differential reading + last_file_pos_ = 0; + if (!launch_log_path_.empty() && fs::exists(launch_log_path_)) { + auto raw_size = fs::file_size(launch_log_path_); + last_file_pos_ = raw_size; + + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), + "File size details - Raw size (uintmax_t): %ju, Stored position (size_t): %zu", raw_size, + last_file_pos_); + } + } + + auto interval_ns = std::chrono::duration_cast( + std::chrono::duration(check_interval_)); + timer_ = create_wall_timer(interval_ns, std::bind(&NodeDeathMonitor::on_timer, this)); +} + +void NodeDeathMonitor::read_launch_log_diff() +{ + if (launch_log_path_.empty()) { + return; + } + + std::ifstream ifs(launch_log_path_, std::ios::binary); + if (!ifs.good()) { + RCLCPP_WARN(get_logger(), "Failed to open launch.log: %s", launch_log_path_.c_str()); + return; + } + + // Get file size + ifs.seekg(0, std::ios::end); + const std::streampos file_end = ifs.tellg(); + + // Reset position to start if previous position exceeds file size (log rotation) + if (last_file_pos_ > static_cast(file_end)) { + RCLCPP_WARN( + get_logger(), + "File size is reset. Possibly new session? Reading from top. last_file_pos_: %zu, file_end: " + "%zu", + last_file_pos_, static_cast(file_end)); + last_file_pos_ = 0; + } + + // Seek to last position + ifs.seekg(last_file_pos_, std::ios::beg); + + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), "[DEBUG] Reading launch.log from pos=%zu to end=%zu", + static_cast(last_file_pos_), static_cast(file_end)); + } + + std::streampos last_valid_pos = static_cast(last_file_pos_); + + size_t iteration = 0; + while (true) { + // Check current position + std::streampos current_pos_start = ifs.tellg(); + if (current_pos_start == std::streampos(-1)) { + if (ifs.eof()) { + RCLCPP_DEBUG(get_logger(), "EOF reached at iteration=%zu", iteration); + } else { + RCLCPP_WARN( + get_logger(), "tellg() failed at iteration=%zu. Possibly file closed?", iteration); + } + break; + } + + // Read one line + std::string line; + if (!std::getline(ifs, line)) { + if (ifs.eof()) { + RCLCPP_DEBUG(get_logger(), "Reached EOF at iteration=%zu", iteration); + } else { + RCLCPP_WARN(get_logger(), "Error reading line at iteration=%zu", iteration); + } + break; + } + + parse_log_line(line); + + // Check position after reading + std::streampos current_pos_end = ifs.tellg(); + if (current_pos_end == std::streampos(-1)) { + if (ifs.eof()) { + RCLCPP_DEBUG(get_logger(), "EOF after iteration=%zu", iteration); + } else { + RCLCPP_WARN(get_logger(), "tellg() failed after reading line at iteration=%zu", iteration); + } + break; + } + + last_valid_pos = current_pos_end; + ++iteration; + } + + // Update last valid position + if (last_valid_pos != std::streampos(-1)) { + last_file_pos_ = static_cast(last_valid_pos); + RCLCPP_DEBUG(get_logger(), "Set last_file_pos_=%zu after reading", last_file_pos_); + } else { + RCLCPP_WARN(get_logger(), "No valid position found at the end"); + } +} + +void NodeDeathMonitor::parse_log_line(const std::string & line) +{ + const std::string target_str = "process has died"; + if (line.find(target_str) == std::string::npos) { + if (enable_debug_) { + RCLCPP_INFO( + get_logger(), "[DEBUG] The log line does not contain '%s': skip\nline='%s'", + target_str.c_str(), line.c_str()); + } + return; + } + + // Parse exit code + int exit_code = -1; + { + static const std::regex exit_code_pattern("exit code\\s+(-?[0-9]+)"); + std::smatch match_exit; + if (std::regex_search(line, match_exit, exit_code_pattern)) { + try { + exit_code = std::stoi(match_exit[1]); + } catch (...) { + exit_code = -1; + } + + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), "[DEBUG] Parsed exit_code=%d from log line (line='%s')", exit_code, + line.c_str()); + } + } else { + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), "[DEBUG] Could not parse exit_code from log line (line='%s')", + line.c_str()); + } + } + } + + // Filter by exit code + if ( + std::find(ignore_exit_codes_.begin(), ignore_exit_codes_.end(), exit_code) != + ignore_exit_codes_.end()) { + RCLCPP_WARN( + get_logger(), + "[DEBUG] Ignoring process died log (exit_code=%d is in ignore_exit_codes_). line='%s'", + exit_code, line.c_str()); + return; + } + + // Extract node name from log line + static const std::regex node_name_pattern("\\[([^\\]]+)\\]\\:\\s*process has died"); + std::smatch match_node; + if (std::regex_search(line, match_node, node_name_pattern)) { + const std::string node_id = match_node[1]; + if (enable_debug_) { + RCLCPP_WARN(get_logger(), "[DEBUG] Extracted node_id='%s'", node_id.c_str()); + } + + // Filter by node name + for (const auto & ignore : ignore_node_names_) { + if (node_id.find(ignore) != std::string::npos) { + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), "[DEBUG] Ignoring node death: node_id='%s' matched ignore='%s'", + node_id.c_str(), ignore.c_str()); + } + return; + } + } + + // Register dead node + dead_nodes_[node_id] = true; + + RCLCPP_WARN( + get_logger(), "Detected node death from launch.log: node_id='%s' (exit_code=%d)\n line='%s'", + node_id.c_str(), exit_code, line.c_str()); + } else { + if (enable_debug_) { + RCLCPP_WARN( + get_logger(), + "[DEBUG] Could not extract [node_name-#] (with ': process has died') from log line='%s'", + line.c_str()); + } + } +} + +void NodeDeathMonitor::on_timer() +{ + read_launch_log_diff(); + + // Report dead nodes + if (!dead_nodes_.empty()) { + std::string report = "Dead nodes detected: "; + for (const auto & kv : dead_nodes_) { + if (kv.second) { + report += kv.first + " "; + } + } + RCLCPP_WARN(get_logger(), "%s", report.c_str()); + } else if (enable_debug_) { + RCLCPP_WARN(get_logger(), "[DEBUG] on_timer: No dead nodes so far."); + } +} + +} // namespace autoware::node_death_monitor + +#include +RCLCPP_COMPONENTS_REGISTER_NODE(autoware::node_death_monitor::NodeDeathMonitor)