@action
def enrich_pod_with_node_events(event: PodEvent, params: EventEnricherParams):
    """
    Given a Kubernetes pod, fetch related events in the near past for its node.

    The events table is attached to the event as a "Node Events" enrichment.
    Nothing is added when the event carries no pod, when the pod has no node
    assigned, or when no events table is produced for the node.
    """
    # imported locally so this action does not depend on the module's import block
    import logging

    pod = event.get_pod()
    if not pod:
        logging.error(f"cannot run enrich_pod_with_node_events on event with no pod: {event}")
        return

    node_name = pod.spec.nodeName
    if not node_name:
        # a pod that was never scheduled has no node, so there are no node events to look up
        logging.warning(f"Pod {pod.metadata.name} has no node assigned, skipping node events enrichment")
        return

    events_table_block = get_resource_events_table(
        "*Node events:*",
        kind="Node",
        name=node_name,
        included_types=params.included_types,
        max_events=params.max_events,
    )
    if events_table_block:
        event.add_enrichment(
            [events_table_block],
            {SlackAnnotations.ATTACHMENT: True},
            enrichment_type=EnrichmentType.k8s_events,
            title="Node Events",
        )
@action
def node_status_enricher(event: NodeEvent):
    """
    Enrich the finding with a table of the node's status conditions.

    Can help troubleshooting Node issues.

    Logs an error and adds nothing when the event has no node.
    """
    node = event.get_node()
    if not node:
        # f-string is required here, otherwise "{event}" is logged literally
        logging.error(f"node_status_enricher was called on event without node : {event}")
        return

    logging.info("node_status_enricher is depricated, use status_enricher instead")

    # add_enrichment takes a list of blocks, so the single table is wrapped in a list
    event.add_enrichment([get_node_status_table_block(node)])
def get_oomkilled_graph(
    oomkilled_container: PodContainer,
    pod: Pod,
    params: OOMGraphEnricherParams,
    metrics_legends_labels: Optional[List[str]] = None,
) -> GraphBlock:
    """
    Build the graph for the given OOMKilled container of a pod.

    When `params.delay_graph_s` is positive, sleeps for that many seconds before
    creating the graph. The graph is created with the limit line shown.
    """
    delay_seconds = params.delay_graph_s
    if delay_seconds > 0:
        time.sleep(delay_seconds)

    graph_block = create_container_graph(
        params,
        pod,
        oomkilled_container,
        show_limit=True,
        metrics_legends_labels=metrics_legends_labels,
    )
    return graph_block
PodEvent, params: OomKillParams): subject=PodFindingSubject(pod), ) - node: Node = Node.readNode(pod.spec.nodeName).obj + node = pod.get_node() labels = [ ("Pod", pod.metadata.name), ("Namespace", pod.metadata.namespace), @@ -112,13 +116,16 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams): ( "Node allocated memory", f"{allocated_precent:.2f}% out of {allocatable_memory}MB allocatable", - )] + ), + ] - blocks = [TableBlock( - [[k, v] for (k, v) in node_labels], - ["field", "value"], - table_name="*Node Info*", - )] + blocks = [ + TableBlock( + [[k, v] for (k, v) in node_labels], + ["field", "value"], + table_name="*Node Info*", + ) + ] if params.node_memory_graph: node_graph = create_node_graph_enrichment(params, node, metrics_legends_labels=["pod"]) blocks.append(node_graph) @@ -151,18 +158,18 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams): if oom_killed_status.terminated.finishedAt: container_labels.append(("Container finished at", oom_killed_status.terminated.finishedAt)) - blocks = [TableBlock( - [[k, v] for (k, v) in container_labels], - ["field", "value"], - table_name="*Container Info*", - )] + blocks = [ + TableBlock( + [[k, v] for (k, v) in container_labels], + ["field", "value"], + table_name="*Container Info*", + ) + ] if params.container_memory_graph and oomkilled_container.container: - container_graph = get_oomkilled_graph(oomkilled_container, pod, params, - metrics_legends_labels=["pod"]) + container_graph = get_oomkilled_graph(oomkilled_container, pod, params, metrics_legends_labels=["pod"]) blocks.append(container_graph) - finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info, - title="Container Info") + finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info, title="Container Info") event.add_finding(finding) if params.attach_logs and container_name is not None: diff --git a/playbooks/robusta_playbooks/pod_enrichments.py 
@action
def pod_evicted_enricher(event: PodEvent):
    """
    Create a finding for an evicted pod, enriched with information about its node.

    The finding carries a "Node Info" enrichment (node name, node status conditions
    and allocatable resources). A table of the pods running on the node is added to
    the event as a separate enrichment when it could be retrieved.

    Logs an error and does nothing when the event has no pod or the pod's node
    cannot be read.
    """
    pod = event.get_pod()
    if not pod:
        logging.error(f"cannot run pod_evicted_enricher on event with no pod: {event}")
        return

    node = pod.get_node()
    if not node:
        logging.error(f"cannot run pod_evicted_enricher on event with no node: {event}")
        return

    finding = Finding(
        title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} was Evicted",
        aggregation_key="PodEvictedTriggered",
        severity=FindingSeverity.HIGH,
        subject=PodFindingSubject(pod),
    )

    node_labels = [("Node Name", pod.spec.nodeName)]
    node_info_block = TableBlock(
        [[k, v] for k, v in node_labels],
        headers=["Field", "Value"],
        table_name="*Node general info:*",
    )
    node_status_block = get_node_status_table_block(node)

    allocatable_resources_block = get_node_allocatable_resources_table_block(
        node, table_name="*Node Allocatable Resources:*"
    )

    finding.add_enrichment(
        [node_info_block, node_status_block, allocatable_resources_block],
        enrichment_type=EnrichmentType.node_info,
        title="Node Info",
    )

    event.add_finding(finding)

    # the helper returns None when listing the node's pods fails; add_enrichment
    # takes a list of blocks, so only add the table when there is one
    running_pods_table = get_node_running_pods_table_block_or_none(node)
    if running_pods_table is not None:
        event.add_enrichment([running_pods_table])
def get_node_status_table_block(node: Node, table_name: Optional[str] = "*Node status details:*") -> TableBlock:
    """
    Build a table of the node's status conditions, one row per condition.

    Can help troubleshooting node issues.

    Each row holds the condition type and its status. The table has no rows
    when the node reports no status or no conditions.
    """
    node_status = node.status
    # a node object without status/conditions yields an empty table instead of raising
    conditions = (node_status.conditions if node_status else None) or []

    return TableBlock(
        [[condition.type, condition.status] for condition in conditions],
        headers=["Type", "Status"],
        table_name=table_name,
    )
def is_pod_in_ready_condition(self) -> str:
    """
    Return the status of this pod's "Ready" condition.

    Returns the `status` value of the first condition whose type is "Ready",
    or "Unknown" when the pod reports no such condition (including when it has
    no status or no conditions at all).
    """
    # a pod may not report any conditions yet; treat a missing list as empty
    conditions = getattr(self.status, "conditions", None) or []
    return next((condition.status for condition in conditions if condition.type == "Ready"), "Unknown")
def test_get_node_allocatable_resources_table_block(create_test_node):
    """Check the allocatable-resources table built from a node with cpu and memory values."""
    allocatable = {"cpu": "4", "memory": "8Gi"}
    node = create_test_node(allocatable=allocatable)

    result = get_node_allocatable_resources_table_block(node)

    expected_table_name = "Node Allocatable Resources - The amount of compute resources that are available for pods"
    assert isinstance(result, TableBlock)
    assert result.headers == ["resource", "value"]
    assert result.table_name == expected_table_name
    assert result.rows == [["cpu", "4"], ["memory", "8Gi"]]
def test_get_node_running_pods_table_block_or_none(create_test_node, create_test_pod):
    """Check the running-pods table: only a pod with a "Ready" condition reports its status, others show "Unknown"."""
    node = Node(metadata=ObjectMeta(name="test-node"))
    pod_definitions = [
        ("pod1", [{"status": "True", "type": "Ready"}]),
        ("pod2", [{"status": "False", "type": "PodScheduled"}]),
        ("pod3", [{"status": "Unknown", "type": "ContainersReady"}]),
    ]
    pod_list = PodList([create_test_pod(name, "default", conditions) for name, conditions in pod_definitions])

    list_pods_target = "robusta.core.playbooks.node_enrichment_utils.PodList.listPodForAllNamespaces"
    with patch(list_pods_target) as patched_list_pods:
        # the helper reads `.obj` from the list call's result
        patched_list_pods.return_value.obj = pod_list

        result = get_node_running_pods_table_block_or_none(node)

    expected_rows = [
        ["default", "pod1", "True"],
        ["default", "pod2", "Unknown"],
        ["default", "pod3", "Unknown"],
    ]
    assert isinstance(result, TableBlock)
    assert result.headers == ["namespace", "name", "ready"]
    assert result.table_name == "Pods running on the node"
    assert result.rows == expected_rows