Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds a status and disconnected timestamp property to the agent interface #558

Merged
merged 1 commit into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions data/org.eclipse.bluechi.Agent.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,27 @@
<arg name="node" type="s" direction="in" />
<arg name="unit" type="s" direction="in" />
</method>


<!--
Status:

The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
-->
<property name="Status" type="s" access="read">
<annotation name="org.freedesktop.DBus.Property.EmitsChangedSignal" value="true" />
</property>

<!--
DisconnectTimestamp:

A timestamp indicating when the agent lost connection to the BlueChi controller.
If the connection is active (agent is online), this value is 0.
-->
<property name="DisconnectTimestamp" type="t" access="read">
<annotation name="org.freedesktop.DBus.Property.EmitsChangedSignal" value="false" />
</property>

</interface>
</node>
77 changes: 73 additions & 4 deletions src/agent/agent.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ bool agent_is_connected(Agent *agent) {
return agent != NULL && agent->connection_state == AGENT_CONNECTION_STATE_CONNECTED;
}

char *agent_is_online(Agent *agent) {
if (agent_is_connected(agent)) {
return "online";
}
return "offline";
}

static int agent_stop_local_proxy_service(Agent *agent, ProxyService *proxy);
static bool agent_connect(Agent *agent);
static bool agent_reconnect(Agent *agent);
Expand All @@ -105,10 +112,20 @@ static int agent_disconnected(UNUSED sd_bus_message *message, void *userdata, UN

bc_log_error("Disconnected from manager");

if (!agent_reconnect(agent)) {
agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
agent->connection_state = AGENT_CONNECTION_STATE_RETRY;
int r = sd_bus_emit_properties_changed(
agent->api_bus, BC_AGENT_OBJECT_PATH, AGENT_INTERFACE, "Status", NULL);
if (r < 0) {
bc_log_errorf("Failed to emit status property changed: %s", strerror(-r));
}
r = get_time_seconds(&agent->disconnect_timestamp);
if (r < 0) {
bc_log_errorf("Failed to get current time on agent disconnect: %s", strerror(-r));
}

/* try to reconnect right away */
agent_reconnect(agent);

return 0;
}

Expand All @@ -117,8 +134,9 @@ static int agent_reset_heartbeat_timer(Agent *agent, sd_event_source **event_sou
static int agent_heartbeat_timer_callback(sd_event_source *event_source, UNUSED uint64_t usec, void *userdata) {
Agent *agent = (Agent *) userdata;

int r = 0;
if (agent->connection_state == AGENT_CONNECTION_STATE_CONNECTED) {
int r = sd_bus_emit_signal(
r = sd_bus_emit_signal(
agent->peer_dbus,
INTERNAL_AGENT_OBJECT_PATH,
INTERNAL_AGENT_INTERFACE,
Expand All @@ -135,7 +153,7 @@ static int agent_heartbeat_timer_callback(sd_event_source *event_source, UNUSED
}
}

int r = agent_reset_heartbeat_timer(agent, &event_source);
r = agent_reset_heartbeat_timer(agent, &event_source);
if (r < 0) {
bc_log_errorf("Failed to reset agent heartbeat timer: %s", strerror(-r));
return r;
Expand Down Expand Up @@ -390,6 +408,7 @@ Agent *agent_new(void) {
agent->connection_retry_count = 0;
agent->wildcard_subscription_active = false;
agent->metrics_enabled = false;
agent->disconnect_timestamp = 0;

return steal_pointer(&agent);
}
Expand Down Expand Up @@ -1596,10 +1615,53 @@ static int agent_method_remove_proxy(sd_bus_message *m, UNUSED void *userdata, U
return sd_bus_reply_method_return(m, "");
}


/*************************************************************************
**** org.eclipse.bluechi.Agent.Status ****************
*************************************************************************/

static int agent_property_get_status(
UNUSED sd_bus *bus,
UNUSED const char *path,
UNUSED const char *interface,
UNUSED const char *property,
sd_bus_message *reply,
void *userdata,
UNUSED sd_bus_error *ret_error) {
Agent *agent = userdata;

return sd_bus_message_append(reply, "s", agent_is_online(agent));
}

/*************************************************************************
**** org.eclipse.bluechi.Agent.LastSuccessfulHeartbeat ****************
*************************************************************************/

static int agent_property_get_disconnect_timestamp(
UNUSED sd_bus *bus,
UNUSED const char *path,
UNUSED const char *interface,
UNUSED const char *property,
sd_bus_message *reply,
void *userdata,
UNUSED sd_bus_error *ret_error) {
Agent *agent = userdata;

return sd_bus_message_append(reply, "t", agent->disconnect_timestamp);
}


static const sd_bus_vtable agent_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_METHOD("CreateProxy", "sss", "", agent_method_create_proxy, 0),
SD_BUS_METHOD("RemoveProxy", "sss", "", agent_method_remove_proxy, 0),

SD_BUS_PROPERTY("Status", "s", agent_property_get_status, 0, SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
SD_BUS_PROPERTY("DisconnectTimestamp",
"t",
agent_property_get_disconnect_timestamp,
0,
SD_BUS_VTABLE_PROPERTY_EXPLICIT),
SD_BUS_VTABLE_END
};

Expand Down Expand Up @@ -2257,6 +2319,13 @@ static bool agent_connect(Agent *agent) {

agent->connection_state = AGENT_CONNECTION_STATE_CONNECTED;
agent->connection_retry_count = 0;
agent->disconnect_timestamp = 0;

r = sd_bus_emit_properties_changed(
agent->api_bus, BC_AGENT_OBJECT_PATH, AGENT_INTERFACE, "Status", NULL);
if (r < 0) {
bc_log_errorf("Failed to emit status property changed: %s", strerror(-r));
}

r = sd_bus_match_signal_async(
agent->peer_dbus,
Expand Down
2 changes: 2 additions & 0 deletions src/agent/agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct Agent {

AgentConnectionState connection_state;
uint64_t connection_retry_count;
time_t disconnect_timestamp;

char *orch_addr;
char *api_bus_service_name;
Expand Down Expand Up @@ -107,6 +108,7 @@ bool agent_start(Agent *agent);
bool agent_stop(Agent *agent);

bool agent_is_connected(Agent *agent);
char *agent_is_online(Agent *agent);

void agent_remove_proxy(Agent *agent, ProxyService *proxy, bool emit);

Expand Down
39 changes: 39 additions & 0 deletions src/bindings/python/bluechi/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,3 +875,42 @@ def remove_proxy(self, local_service_name: str, node: str, unit: str) -> None:
node,
unit,
)

@property
def status(self) -> str:
"""
Status:

The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
"""
return self.get_proxy().Status

def on_status_changed(self, callback: Callable[[Variant], None]):
"""
Status:

The connection status of the agent with the BlueChi controller.
On any change, a signal is emitted on the org.freedesktop.DBus.Properties interface.
"""

def on_properties_changed(
interface: str,
changed_props: Dict[str, Variant],
invalidated_props: Dict[str, Variant],
) -> None:
value = changed_props.get("Status")
if value is not None:
callback(value)

self.get_properties_proxy().PropertiesChanged.connect(on_properties_changed)

@property
def disconnect_timestamp(self) -> UInt64:
"""
DisconnectTimestamp:

A timestamp indicating when the agent lost connection to the BlueChi controller.
If the connection is active (agent is online), this value is 0.
"""
return self.get_proxy().DisconnectTimestamp
30 changes: 15 additions & 15 deletions tests/bluechi_test/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,21 @@ def wait_for_unit_state_to_be(self, unit_name: str, expected_state: str, timeout
Latest state: {latest_state}")
return False

def run_python(self, python_script_path: str) -> \
Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:

target_file_dir = os.path.join("/", "tmp")
target_file_name = get_random_name(10)
content = read_file(python_script_path)
self.create_file(target_file_dir, target_file_name, content)

target_file_path = os.path.join(target_file_dir, target_file_name)
result, output = self.exec_run(f'python3 {target_file_path}')
try:
os.remove(target_file_path)
finally:
return result, output


class BluechiNodeContainer(BluechiContainer):

Expand Down Expand Up @@ -183,18 +198,3 @@ def thaw_unit(self, node_name: str, unit_name: str) -> None:
result, output = self.exec_run(f"bluechictl thaw {node_name} {unit_name}")
if result != 0:
raise Exception(f"Failed to thaw service {unit_name} on node {node_name}: {output}")

def run_python(self, python_script_path: str) -> \
Tuple[Optional[int], Union[Iterator[bytes], Any, Tuple[bytes, bytes]]]:

target_file_dir = os.path.join("/", "tmp")
target_file_name = get_random_name(10)
content = read_file(python_script_path)
self.create_file(target_file_dir, target_file_name, content)

target_file_path = os.path.join(target_file_dir, target_file_name)
result, output = self.exec_run(f'python3 {target_file_path}')
try:
os.remove(target_file_path)
finally:
return result, output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
summary: Test if the bluechi agent emits a status changed signal when it disconnects from the controller, e.g. when the controller goes down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import unittest

from bluechi.api import Agent

# Test to verify that the agent on the node is connected to the controller
# and the properties for status and disconnected timestamp are set


class TestAgentIsConnected(unittest.TestCase):

def test_agent_is_connected(self):
agent = Agent()
assert agent.status == "online"
assert agent.disconnect_timestamp == 0


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import unittest

from dasbus.error import DBusError
from dasbus.loop import EventLoop
from dasbus.typing import Variant

from bluechi.api import Node, Agent


class TestMonitorCtrlDown(unittest.TestCase):

def setUp(self) -> None:
super().setUp()

self.agent_state = None
self.agent_disconnected_timestamp = 0
self.agent_on_node = Agent()

def test_monitor_ctrl_down(self):
loop = EventLoop()

def on_state_change(state: Variant):
self.agent_state = state.get_string()
self.agent_disconnected_timestamp = self.agent_on_node.disconnect_timestamp
loop.quit()

self.agent_on_node.on_status_changed(on_state_change)

# stop the bluechi controller service to trigger a disconnected message in the agent
# hacky solution to cause a disconnect:
# stop the controller service
# problem:
# dasbus will raise a DBusError for connection timeout on v1.4, v1.7 doesn't
# the command will be executed anyway, resulting in the desired shutdown of the node
# therefore, the DBusError is silenced for now, but all other errors are still raised
try:
node = Node("node-foo")
node.stop_unit('bluechi.service', 'replace')
except DBusError:
pass

loop.run()

assert self.agent_state == "offline"
assert self.agent_disconnected_timestamp != 0


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: LGPL-2.1-or-later

import os
from typing import Dict

from bluechi_test.test import BluechiTest
from bluechi_test.container import BluechiControllerContainer, BluechiNodeContainer
from bluechi_test.config import BluechiControllerConfig, BluechiNodeConfig

node_foo_name = "node-foo"


def start_agent_in_ctrl_container(ctrl: BluechiControllerContainer):
assert ctrl.wait_for_unit_state_to_be("bluechi", "active")

node_foo_config = BluechiNodeConfig(
file_name="agent.conf",
node_name=node_foo_name,
manager_host="localhost",
manager_port=ctrl.config.port,
)
ctrl.create_file(node_foo_config.get_confd_dir(), node_foo_config.file_name, node_foo_config.serialize())
result, _ = ctrl.exec_run("systemctl start bluechi-agent")
assert result == 0
assert ctrl.wait_for_unit_state_to_be("bluechi-agent", "active")


def exec(ctrl: BluechiControllerContainer, nodes: Dict[str, BluechiNodeContainer]):
start_agent_in_ctrl_container(ctrl)

# bluechi-agent is running, check if it is connected in Agent
result, output = ctrl.run_python(os.path.join("python", "is_agent_connected.py"))
if result != 0:
raise Exception(output)

# stop bluechi service and verify that the agent emits a status changed signal
result, output = ctrl.run_python(os.path.join("python", "monitor_ctrl_down.py"))
if result != 0:
raise Exception(output)


def test_monitor_agent_loses_connection(
bluechi_test: BluechiTest,
bluechi_ctrl_default_config: BluechiControllerConfig):

bluechi_ctrl_default_config.allowed_node_names = [node_foo_name]

bluechi_test.set_bluechi_controller_config(bluechi_ctrl_default_config)
# don't add node_foo_config to bluechi_test to prevent it being started
# as separate container
bluechi_test.run(exec)