TFA-FIX:CEPH-83595932-To verify crashes while executing drain and mgr failover commands #4230
base: master
@@ -4615,3 +4615,60 @@ def get_rados_df(self, pool_name: str = None):
        out = self.run_ceph_command(cmd=_cmd, client_exec=True)

        return out["pools"][0] if pool_name else out

    def set_service_managed_type(self, service_type, unmanaged) -> bool:
        """
        Method to set the service to either managed or unmanaged.
        The service types are: mon, mgr, osd, rgw, mds
        Args:
            service_type: service whose management state should be changed
            unmanaged: True or False, for the service management

        returns:
            Pass -> True, Fail -> False
        """
        cmd_export = f"ceph orch ls {service_type} --export"
        out = self.run_ceph_command(cmd=cmd_export, client_exec=True)[0]
        if unmanaged:
            log.debug(
                f"Setting the {service_type} service as unmanaged by cephadm. current status : {out}"
            )
            out["unmanaged"] = "true"
        else:
            log.debug(
                f"Setting the {service_type} service as managed by cephadm. current status : {out}"
            )
            out["unmanaged"] = "false"

        file_name = (
            f"/tmp/{service_type}_spec_{self.set_service_managed_type.__name__}.yaml"
        )
        # Creating service config file
        self.client.exec_command(sudo=True, cmd=f"touch {file_name}")
        json_out = json.dumps(out)
        # Adding the spec rules into the file
        cmd = f"echo '{json_out}' > {file_name}"
        self.client.exec_command(cmd=cmd, sudo=True)

        log.debug(f"Contents of {service_type} spec file : {out}")
        apply_cmd = f"ceph orch apply -i {file_name}"
        log.info(f"Applying the spec file via cmd : {apply_cmd}")
        self.client.exec_command(cmd=apply_cmd, sudo=True)

        time.sleep(10)
        # Checking for the unmanaged setting on the service
        cmd = "ceph orch ls"

Review comment: Please replace the command with "ceph orch ls {service_type}" to avoid the for loop traversal.
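A minimal sketch of that suggestion, reusing run_ceph_command the way the surrounding method already does (it returns the parsed JSON list); the service-scoped query removes the need to walk every service, and the early-return shape shown here is illustrative, not part of this PR:

        # Query only the requested service instead of traversing every service
        out = self.run_ceph_command(cmd=f"ceph orch ls {service_type}")
        if not out:
            log.error(f"No service entry found for {service_type}")
            return False
        entry = out[0]
        log.debug(f"Service status : {entry}")
        if entry.get("unmanaged", False) != unmanaged:
            log.error(f"{service_type} service not in unmanaged={unmanaged} state. Fail")
            return False
        log.info(f"{service_type} service in unmanaged={unmanaged} state. Pass")
        return True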
        out = self.run_ceph_command(cmd=cmd)
        for entry in out:
            if entry["service_name"] == service_type:
                log.debug(f"Service status : {entry}")
                status = entry.get("unmanaged", False)
                if status != unmanaged:
                    log.error(
                        f"{service_type} Service not in unmanaged={unmanaged} state. Fail"
                    )
                    return False
                else:
                    log.info(
                        f"{service_type} Service in unmanaged={unmanaged} state. Pass"
                    )
                    return True
@@ -225,22 +225,79 @@ def remove_custom_host(self, host_node_name: str):
        Returns:
            None | raises exception in case of failure
        """
        status_cmd = ""
        try:

            def wait_osd_operation_status(status_cmd):
                status_flag = False
                end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
                log.debug(
                    "The logic used to verify whether the OSD is removed or not is: "
                    "case1: If ceph is still in the process of removing the OSD, the command generates "
                    "proper json output and the json.loads method loads it without any failure. "
                    "case2: If the OSDs are removed from the node, the command won't generate any output. "
                    "In this case the json.loads method throws the JSONDecodeError exception. This is the "
                    "confirmation that the OSD removal is completed."
                )
                while end_time > datetime.datetime.now():
                    out, err = self.cephadm.shell([status_cmd])

Review comment: Any reason for executing commands with
Review comment: Please add reason for
Reply: The information is added.
                    try:
                        drain_ops = json.loads(out)
                        for entry in drain_ops:
                            log.debug(
                                f"OSD remove operation is in progress {osd_id}\nOperations: {entry}"
                            )
                    except json.JSONDecodeError:
                        log.info(f"The OSD removal is completed on OSD : {osd_id}")

Review comment: This could either mean that OSD removal is complete, or that no OSD removal was started in the first place.
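A hedged sketch of how that ambiguity could be narrowed: track whether the OSD was ever seen in the removal queue, so empty status output only counts as success if a removal was actually observed. It assumes the same enclosing scope as the PR's nested helper (self.cephadm plus the module-level json, datetime, time and log imports) and that the "ceph orch osd rm status -f json" entries expose an osd_id field; the helper name, the extra osd_id parameter and the seen_in_queue flag are illustrative, not part of this PR:

            def wait_osd_removal_status(status_cmd, osd_id):
                # Track whether the OSD was ever seen in the removal queue so that
                # empty status output can be told apart from "nothing was queued"
                seen_in_queue = False
                end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
                while end_time > datetime.datetime.now():
                    out, err = self.cephadm.shell([status_cmd])
                    try:
                        drain_ops = json.loads(out)
                    except json.JSONDecodeError:
                        # Empty output: success only if the OSD was queued earlier.
                        # Note: a removal that finishes before the first poll would
                        # still be reported as False by this sketch.
                        return seen_in_queue
                    if any(str(entry.get("osd_id")) == str(osd_id) for entry in drain_ops):
                        seen_in_queue = True
                    time.sleep(10)
                return False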
                        status_flag = True
                        break
                    except Exception as error:
                        log.error(f"Hit issue during drain operations: {error}")
                        raise Exception(error)
                    log.debug("Sleeping for 10 seconds and checking again....")
                    time.sleep(10)
                return status_flag

            # Removing an OSD host and checking status
            rm_host = utils.get_node_by_id(self.cluster, host_node_name)
            log.info(
                f"Identified host : {rm_host.hostname} to be removed from the cluster"
            )

            # get list of osd_id on the host to be removed
            # Get list of osd_id on the host to be removed
            rm_osd_list = self.rados_obj.collect_osd_daemon_ids(osd_node=rm_host)
            log.info(
                f"The osd id list to be removed from the {rm_host} is {rm_osd_list}"
            )
            # Get the OSD "out" list and remove those OSDs before draining the node
            osd_out_list = self.rados_obj.get_osd_list(status="out")
            log.info(
                f"The out osd id list to be removed from the {rm_host} is {osd_out_list}"
            )
            if osd_out_list:
                # Iterate over a copy since entries are removed from the list inside the loop
                for osd_id in list(rm_osd_list):
                    if osd_id in osd_out_list:
                        osd_utils.osd_remove(
                            self.cluster, osd_id=osd_id, zap=True, force=True
                        )
                        time.sleep(10)
                        status_cmd = "ceph orch osd rm status -f json"
                        if wait_osd_operation_status(status_cmd):
                            log.info("The OSD was successfully removed")
                        else:
                            log.error(
                                "OSD removal not completed on the cluster even after 600 seconds"
                            )
                            raise Exception("OSD not removed error")
                        rm_osd_list.remove(osd_id)
            dev_path_list = []
            if rm_osd_list:
                for osd_id in rm_osd_list:
                    dev_path_list.append(
                        rados_utils.get_device_path(host=rm_host, osd_id=osd_id)
                    )
                    osd_utils.set_osd_out(self.cluster, osd_id=osd_id)
                    time.sleep(30)
                    osd_utils.osd_remove(self.cluster, osd_id=osd_id)
                    time.sleep(30)
@@ -253,36 +310,15 @@ def remove_custom_host(self, host_node_name: str):
            # Sleeping for 2 seconds for removal to have started
            time.sleep(2)
            log.debug(f"Started drain operation on node : {rm_host.hostname}")

            status_cmd = "ceph orch osd rm status -f json"
            end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
            flag = False
            while end_time > datetime.datetime.now():
                out, err = self.cephadm.shell([status_cmd])
                try:
                    drain_ops = json.loads(out)
                    for entry in drain_ops:
                        log.debug(
                            f"Drain operations are going on host {rm_host.hostname} \nOperations: {entry}"
                        )
                except json.JSONDecodeError:
                    log.info(f"Drain operations completed on host : {rm_host.hostname}")
                    flag = True
                    break
                except Exception as error:
                    log.error(f"Hit issue during drain operations: {error}")
                    raise Exception(error)
                log.debug("Sleeping for 10 seconds and checking again....")
                time.sleep(10)

            if not flag:
            if wait_osd_operation_status(status_cmd):
                log.info(
                    f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
                )
            else:
                log.error(
                    "Drain operation not completed on the cluster even after 600 seconds"
                )
                raise Exception("Execution Error")
            log.info(
                f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
            )
                raise Exception("Drain operation-OSD not removed error")

            if dev_path_list:
                for dev_path in dev_path_list:
@@ -1,6 +1,6 @@
"""
The file contains the method to check the customer issue-
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
CEPH-83595932 - To verify crashes while executing drain and mgr failover commands
"""

import datetime

@@ -14,14 +14,16 @@
from ceph.rados.core_workflows import RadosOrchestrator
from ceph.rados.mgr_workflows import MgrWorkflows
from ceph.rados.serviceability_workflows import ServiceabilityMethods
from tests.rados.monitor_configurations import MonConfigMethods
from tests.rados.stretch_cluster import wait_for_clean_pg_sets
from utility.log import Log

log = Log(__name__)


def run(ceph_cluster, **kw):
    """
    # CEPH-83593996
    # CEPH-83595932
    Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
    1. Configure a cluster that has more than four OSD nodes
    2. Select an OSD node and drain the node
@@ -40,21 +42,22 @@ def run(ceph_cluster, **kw):
    mgr_obj = MgrWorkflows(node=cephadm)
    installer = ceph_cluster.get_nodes(role="installer")[0]
    service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
    mon_obj = MonConfigMethods(rados_obj=rados_obj)
    ceph_nodes = kw.get("ceph_nodes")
    config = kw["config"]

    replicated_config = config.get("replicated_pool")
    pool_name = replicated_config["pool_name"]
    active_osd_list = rados_obj.get_osd_list(status="up")
    active_osd_list = get_active_osd_list(rados_obj)

Review comment: reuse existing method
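One possible reading of that suggestion, composing the existing get_osd_list helper at the call site instead of adding a new module-level helper (purely illustrative, not part of this PR):

    # Active OSDs are those that are both "up" and "in"; intersect the two lists
    active_osd_list = list(
        set(rados_obj.get_osd_list(status="up")) & set(rados_obj.get_osd_list(status="in"))
    )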
    log.info(f"The active OSDs list before starting the test-{active_osd_list}")
    if not rados_obj.create_pool(pool_name=pool_name):
        log.error("Failed to create the Pool")
        return 1

    rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=90)
    rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=180)
    mgr_daemon = Thread(
        target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
    )
    wait_for_clean_pg_sets(rados_obj)
    # Printing the hosts in cluster
    cmd_host_ls = "ceph orch host ls"
    out = rados_obj.run_ceph_command(cmd=cmd_host_ls)

@@ -64,7 +67,7 @@ def run(ceph_cluster, **kw):
    for node in ceph_nodes:
        if node.role == "mgr":
            mgr_host_object_list.append(node)
            log.debug(f"The mgr host node is{node.hostname}")
            log.debug(f"The mgr host node is {node.hostname}")

    mgr_daemon_list = mgr_obj.get_mgr_daemon_list()
    log.debug(f"The MGR daemons list is -{mgr_daemon_list}")

@@ -104,6 +107,7 @@ def run(ceph_cluster, **kw):
        "2. Select the node with OSD weight/reweight are 0 if none of the hosts have the _no_schedule label"
        "3. If both 1&2 failed then select a random OSD node"
    )
    mon_obj.set_config(section="mgr", name="debug_mgr", value="20/20")
    cmd_host_ls = "ceph orch host ls"
    out = rados_obj.run_ceph_command(cmd=cmd_host_ls)
    log.info(f"The node details in the cluster -{out} ")
@@ -133,8 +137,10 @@ def run(ceph_cluster, **kw):
    try:
        osd_count_before_test = get_node_osd_list(rados_obj, ceph_nodes, drain_host)
        log.info(
            f"The OSDs in the drain node before starting the test - {osd_count_before_test} "
        )
        rados_obj.set_service_managed_type("osd", unmanaged=True)
        time.sleep(10)
        mgr_daemon.start()
        service_obj.remove_custom_host(host_node_name=drain_host)
        time.sleep(300)

@@ -152,6 +158,9 @@ def run(ceph_cluster, **kw):
                "The traceback messages are noticed in logs. The error snippets are noticed in the MGR logs"
            )
            return 1
        rados_obj.set_service_managed_type("osd", unmanaged=False)
        time.sleep(10)

        log.info(
            "Adding the node by providing the deploy_osd as False, because the script is not setting the "
            "--unmanaged=true. Once the node is added back to the cluster the OSDs get configured automatically"
@@ -194,7 +203,8 @@ def run(ceph_cluster, **kw):
            return 1

    if bug_exists:
        active_osd_list = rados_obj.get_osd_list(status="up")
        active_osd_list = get_active_osd_list(rados_obj)
        log.info(
            f"The active OSDs list after reproducing the issue is-{active_osd_list}"
        )

@@ -237,6 +247,9 @@ def run(ceph_cluster, **kw):
        log.info(
            "\n \n ************** Execution of finally block begins here *************** \n \n"
        )
        mon_obj.remove_config(section="mgr", name="debug_mgr")
        rados_obj.set_service_managed_type("osd", unmanaged=False)
        time.sleep(10)
        if replicated_config.get("delete_pool"):
            rados_obj.delete_pool(pool=pool_name)
            time.sleep(5)
@@ -297,7 +310,7 @@ def background_mgr_task(mgr_object):
        mgr_object: mgr object
    Returns: None
    """
    time.sleep(20)
    time.sleep(5)
    for _ in range(10):
        active_mgr_before_fail = mgr_object.get_active_mgr()
        mgr_object.set_mgr_fail()

@@ -350,3 +363,16 @@ def get_zero_osd_weight_list(rados_obj):
        if not value:
            del zero_osd_weight_nodes[key]
    return zero_osd_weight_nodes

def get_active_osd_list(rados_object) -> list:
    """
    Method to fetch the list of OSDs which are UP and IN
    Returns:
        List of active OSDs
    """
    in_osd_list = rados_object.get_osd_list(status="in")
    up_osd_list = rados_object.get_osd_list(status="up")
    # An active OSD must be both up and in, so take the intersection of the two lists
    active_list = list(set(in_osd_list) & set(up_osd_list))
    log.info(f"The active osd list is : {active_list}")
    return active_list
Review comment: Here, instead of explicitly setting it to False, we should remove the "unmanaged" key from the dictionary if it exists.
Reply: This should be fine. Would not make a difference to the workflow.
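For reference, a small sketch of the reviewer's alternative inside set_service_managed_type, assuming out is the spec dict obtained from the "ceph orch ls {service_type} --export" call; the pop-based branch is illustrative, not part of this PR:

        if unmanaged:
            out["unmanaged"] = "true"
        else:
            # Drop the key so the default (managed) behaviour applies,
            # rather than carrying an explicit "false" in the spec
            out.pop("unmanaged", None)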