Skip to content

Commit

Permalink
adds a status and disconnected timestamp property to the agent interface
Browse files Browse the repository at this point in the history
Fixes #543
In order to enable other applications on the managed node to react
to connection loss of the agent with the controller, add two new
properties: Status and DisconnectTimestamp
If the status will change, the agent will emit a property changed
signal on which other applications can listen. To provide information
for later when the disconnect happened, the timestamp property
can be queried.
This change also includes extending the D-Bus API description and
adding an integration test to it.

Signed-off-by: Michael Engel <mengel@redhat.com>
  • Loading branch information
engelmi committed Sep 12, 2023
1 parent ad845d3 commit 05cbd9a
Show file tree
Hide file tree
Showing 9 changed files with 274 additions and 19 deletions.
22 changes: 22 additions & 0 deletions data/org.eclipse.bluechi.Agent.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,27 @@
<arg name="node" type="s" direction="in" />
<arg name="unit" type="s" direction="in" />
</method>


<!--
Status:
The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
-->
<property name="Status" type="s" access="read">
<annotation name="org.freedesktop.DBus.Property.EmitsChangedSignal" value="true" />
</property>

<!--
DisconnectTimestamp:
A timestamp indicating when the agent lost connection to the BlueChi controller.
If the connection is active (agent is online), this value is 0.
-->
<property name="DisconnectTimestamp" type="t" access="read">
<annotation name="org.freedesktop.DBus.Property.EmitsChangedSignal" value="false" />
</property>

</interface>
</node>
77 changes: 73 additions & 4 deletions src/agent/agent.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ bool agent_is_connected(Agent *agent) {
return agent != NULL && agent->connection_state == AGENT_CONNECTION_STATE_CONNECTED;
}

char *agent_is_online(Agent *agent) {
if (agent_is_connected(agent)) {
return "online";
}
return "offline";
}

static int agent_stop_local_proxy_service(Agent *agent, ProxyService *proxy);
static bool agent_connect(Agent *agent);
static bool agent_reconnect(Agent *agent);
Expand All @@ -105,10 +112,20 @@ static int agent_disconnected(UNUSED sd_bus_message *message, void *userdata, UN

bc_log_error("Disconnected from manager");

if (!agent_reconnect(agent)) {
agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
int r = sd_bus_emit_properties_changed(
agent->api_bus, BC_AGENT_OBJECT_PATH, AGENT_INTERFACE, "Status", NULL);
if (r < 0) {
bc_log_errorf("Failed to emit status property changed: %s", strerror(-r));
}
r = get_time_seconds(&agent->disconnect_timestamp);
if (r < 0) {
bc_log_errorf("Failed to get current time on agent disconnect: %s", strerror(-r));
}

/* try to reconnect right away */
agent_reconnect(agent);

return 0;
}

Expand All @@ -117,8 +134,9 @@ static int agent_reset_heartbeat_timer(Agent *agent, sd_event_source **event_sou
static int agent_heartbeat_timer_callback(sd_event_source *event_source, UNUSED uint64_t usec, void *userdata) {
Agent *agent = (Agent *) userdata;

int r = 0;
if (agent->connection_state == AGENT_CONNECTION_STATE_CONNECTED) {
int r = sd_bus_emit_signal(
r = sd_bus_emit_signal(
agent->peer_dbus,
INTERNAL_AGENT_OBJECT_PATH,
INTERNAL_AGENT_INTERFACE,
Expand All @@ -135,7 +153,7 @@ static int agent_heartbeat_timer_callback(sd_event_source *event_source, UNUSED
}
}

int r = agent_reset_heartbeat_timer(agent, &event_source);
r = agent_reset_heartbeat_timer(agent, &event_source);
if (r < 0) {
bc_log_errorf("Failed to reset agent heartbeat timer: %s", strerror(-r));
return r;
Expand Down Expand Up @@ -390,6 +408,7 @@ Agent *agent_new(void) {
agent->connection_retry_count = 0;
agent->wildcard_subscription_active = false;
agent->metrics_enabled = false;
agent->disconnect_timestamp = 0;

return steal_pointer(&agent);
}
Expand Down Expand Up @@ -1596,10 +1615,53 @@ static int agent_method_remove_proxy(sd_bus_message *m, UNUSED void *userdata, U
return sd_bus_reply_method_return(m, "");
}


/*************************************************************************
**** org.eclipse.bluechi.Agent.Status ****************
*************************************************************************/

static int agent_property_get_status(
UNUSED sd_bus *bus,
UNUSED const char *path,
UNUSED const char *interface,
UNUSED const char *property,
sd_bus_message *reply,
void *userdata,
UNUSED sd_bus_error *ret_error) {
Agent *agent = userdata;

return sd_bus_message_append(reply, "s", agent_is_online(agent));
}

/*************************************************************************
**** org.eclipse.bluechi.Agent.LastSuccessfulHeartbeat ****************
*************************************************************************/

static int agent_property_get_disconnect_timestamp(
UNUSED sd_bus *bus,
UNUSED const char *path,
UNUSED const char *interface,
UNUSED const char *property,
sd_bus_message *reply,
void *userdata,
UNUSED sd_bus_error *ret_error) {
Agent *agent = userdata;

return sd_bus_message_append(reply, "t", agent->disconnect_timestamp);
}


static const sd_bus_vtable agent_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_METHOD("CreateProxy", "sss", "", agent_method_create_proxy, 0),
SD_BUS_METHOD("RemoveProxy", "sss", "", agent_method_remove_proxy, 0),

SD_BUS_PROPERTY("Status", "s", agent_property_get_status, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
SD_BUS_PROPERTY("DisconnectTimestamp",
"t",
agent_property_get_disconnect_timestamp,
0,
SD_BUS_VTABLE_PROPERTY_EXPLICIT),
SD_BUS_VTABLE_END
};

Expand Down Expand Up @@ -2257,6 +2319,13 @@ static bool agent_connect(Agent *agent) {

agent->connection_state = AGENT_CONNECTION_STATE_CONNECTED;
agent->connection_retry_count = 0;
agent->disconnect_timestamp = 0;

r = sd_bus_emit_properties_changed(
agent->api_bus, BC_AGENT_OBJECT_PATH, AGENT_INTERFACE, "Status", NULL);
if (r < 0) {
bc_log_errorf("Failed to emit status property changed: %s", strerror(-r));
}

r = sd_bus_match_signal_async(
agent->peer_dbus,
Expand Down
2 changes: 2 additions & 0 deletions src/agent/agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct Agent {

AgentConnectionState connection_state;
uint64_t connection_retry_count;
time_t disconnect_timestamp;

char *orch_addr;
char *api_bus_service_name;
Expand Down Expand Up @@ -107,6 +108,7 @@ bool agent_start(Agent *agent);
bool agent_stop(Agent *agent);

bool agent_is_connected(Agent *agent);
char *agent_is_online(Agent *agent);

void agent_remove_proxy(Agent *agent, ProxyService *proxy, bool emit);

Expand Down
39 changes: 39 additions & 0 deletions src/bindings/python/bluechi/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,3 +862,42 @@ def remove_proxy(self, local_service_name: str, node: str, unit: str) -> None:
node,
unit,
)

@property
def status(self) -> str:
"""
Status:
The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
"""
return self.get_proxy().Status

def on_status_changed(self, callback: Callable[[Variant], None]):
"""
Status:
The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
"""

def on_properties_changed(
interface: str,
changed_props: Dict[str, Variant],
invalidated_props: Dict[str, Variant],
) -> None:
value = changed_props.get("Status")
if value is not None:
callback(value)

self.get_properties_proxy().PropertiesChanged.connect(on_properties_changed)

@property
def disconnect_timestamp(self) -> UInt64:
"""
DisconnectTimestamp:
A timestamp indicating when the agent lost connection to the BlueChi controller.
If the connection is active (agent is online), this value is 0.
"""
return self.get_proxy().DisconnectTimestamp
30 changes: 15 additions & 15 deletions tests/bluechi_test/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,21 @@ def wait_for_unit_state_to_be(self, unit_name: str, expected_state: str, timeout
Latest state: {latest_state}")
return False

def run_python(self, python_script_path: str) -> \
Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:

target_file_dir = os.path.join("/", "tmp")
target_file_name = get_random_name(10)
content = read_file(python_script_path)
self.create_file(target_file_dir, target_file_name, content)

target_file_path = os.path.join(target_file_dir, target_file_name)
result, output = self.exec_run(f'python3 {target_file_path}')
try:
os.remove(target_file_path)
finally:
return result, output


class BluechiNodeContainer(BluechiContainer):

Expand Down Expand Up @@ -183,18 +198,3 @@ def thaw_unit(self, node_name: str, unit_name: str) -> None:
result, output = self.exec_run(f"bluechictl thaw {node_name} {unit_name}")
if result != 0:
raise Exception(f"Failed to thaw service {unit_name} on node {node_name}: {output}")

def run_python(self, python_script_path: str) -> \
Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:

target_file_dir = os.path.join("/", "tmp")
target_file_name = get_random_name(10)
content = read_file(python_script_path)
self.create_file(target_file_dir, target_file_name, content)

target_file_path = os.path.join(target_file_dir, target_file_name)
result, output = self.exec_run(f'python3 {target_file_path}')
try:
os.remove(target_file_path)
finally:
return result, output
1 change: 1 addition & 0 deletions tests/tests/tier0/monitor-agent-loses-connection/main.fmf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
summary: Test if the bluechi agent emits a status changed signal when it disconnects from the controller, e.g. when the controller goes down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import unittest

from bluechi.api import Agent

# Test to verify that the agent on the node is connected to the controller
# and the properties for status and disconnected timestamp are set


class TestAgentIsConnected(unittest.TestCase):

def test_agent_is_connected(self):
agent = Agent()
assert agent.status == "online"
assert agent.disconnect_timestamp == 0


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import unittest

from dasbus.error import DBusError
from dasbus.loop import EventLoop
from dasbus.typing import Variant

from bluechi.api import Node, Agent


class TestMonitorCtrlDown(unittest.TestCase):

def setUp(self) -> None:
super().setUp()

self.agent_state = None
self.agent_disconnected_timestamp = 0
self.agent_on_node = Agent()

def test_monitor_ctrl_down(self):
loop = EventLoop()

def on_state_change(state: Variant):
self.agent_state = state.get_string()
self.agent_disconnected_timestamp = self.agent_on_node.disconnect_timestamp
loop.quit()

self.agent_on_node.on_status_changed(on_state_change)

# stop the bluechi controller service to trigger a disconnected message in the agent
# hacky solution to cause a disconnect:
# stop the controller service
# problem:
# dasbus will raise a DBusError for connection timeout on v1.4, v1.7 doesn't
# the command will be executed anyway, resulting in the desired shutdown of the node
# therefore, the DBusError is silenced for now, but all other errors are still raised
try:
node = Node("node-foo")
node.stop_unit('bluechi.service', 'replace')
except DBusError:
pass

loop.run()

assert self.agent_state == "offline"
assert self.agent_disconnected_timestamp != 0


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import os
from typing import Dict

from bluechi_test.test import BluechiTest
from bluechi_test.container import BluechiControllerContainer, BluechiNodeContainer
from bluechi_test.config import BluechiControllerConfig, BluechiNodeConfig

node_foo_name = "node-foo"


def start_agent_in_ctrl_container(ctrl: BluechiControllerContainer):
assert ctrl.wait_for_unit_state_to_be("bluechi", "active")

node_foo_config = BluechiNodeConfig(
file_name="agent.conf",
node_name=node_foo_name,
manager_host="localhost",
manager_port=ctrl.config.port,
)
ctrl.create_file(node_foo_config.get_confd_dir(), node_foo_config.file_name, node_foo_config.serialize())
result, _ = ctrl.exec_run("systemctl start bluechi-agent")
assert result == 0
assert ctrl.wait_for_unit_state_to_be("bluechi-agent", "active")


def exec(ctrl: BluechiControllerContainer, nodes: Dict[str, BluechiNodeContainer]):
start_agent_in_ctrl_container(ctrl)

# bluechi-agent is running, check if it is connected in Agent
result, output = ctrl.run_python(os.path.join("python", "is_agent_connected.py"))
if result != 0:
raise Exception(output)

# stop bluechi service and verify that the agent emits a status changed signal
result, output = ctrl.run_python(os.path.join("python", "monitor_ctrl_down.py"))
if result != 0:
raise Exception(output)


def test_monitor_agent_loses_connection(
bluechi_test: BluechiTest,
bluechi_ctrl_default_config: BluechiControllerConfig):

bluechi_ctrl_default_config.allowed_node_names = [node_foo_name]

bluechi_test.set_bluechi_controller_config(bluechi_ctrl_default_config)
# don't add node_foo_config to bluechi_test to prevent it being started
# as separate container
bluechi_test.run(exec)

0 comments on commit 05cbd9a

Please sign in to comment.