
TFA-FIX: CEPH-83595932 - To verify crashes while executing drain and mgr failover commands #4230

Open: wants to merge 1 commit into base: master
57 changes: 57 additions & 0 deletions ceph/rados/core_workflows.py
@@ -4615,3 +4615,60 @@ def get_rados_df(self, pool_name: str = None):
out = self.run_ceph_command(cmd=_cmd, client_exec=True)

return out["pools"][0] if pool_name else out

def set_service_managed_type(self, service_type, unmanaged) -> bool:
"""
Method to set a service to either managed or unmanaged.
The supported service types are: mon, mgr, osd, rgw, mds
Args:
service_type: the service whose management state should be changed
unmanaged: True or False, to disable or enable cephadm management of the service

Returns:
Pass -> True, Fail -> False
"""
cmd_export = f"ceph orch ls {service_type} --export"
out = self.run_ceph_command(cmd=cmd_export, client_exec=True)[0]
if unmanaged:
log.debug(
f"Setting the {service_type} service as unmanaged by cephadm. current status : {out}"
)
out["unmanaged"] = "true"
else:
log.debug(
f"Setting the {service_type} service as unmanaged by cephadm. current status : {out}"
)
out["unmanaged"] = "false"
Contributor comment: Here, instead of explicitly setting it to False, we should remove the "unmanaged" key from the dictionary if it exists.

Contributor comment: This should be fine. It would not make a difference to the workflow.
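For reference, a minimal sketch of the first reviewer's suggestion (hypothetical; not what the PR implements): drop the key instead of writing a "false" value, so the exported spec falls back to cephadm's default managed behavior.

if unmanaged:
    out["unmanaged"] = "true"
else:
    # Remove the key if present; an absent "unmanaged" field leaves the
    # service managed by cephadm.
    out.pop("unmanaged", None)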


file_name = (
f"/tmp/{service_type}_spec_{self.set_service_managed_type.__name__}.yaml"
)
# Creating service config file
self.client.exec_command(sudo=True, cmd=f"touch {file_name}")
json_out = json.dumps(out)
# Adding the spec rules into the file
cmd = f"echo '{json_out}' > {file_name}"
self.client.exec_command(cmd=cmd, sudo=True)

log.debug(f"Contents of {service_type} spec file : {out}")
apply_cmd = f"ceph orch apply -i {file_name}"
log.info(f"Applying the spec file via cmd : {apply_cmd}")
self.client.exec_command(cmd=apply_cmd, sudo=True)

time.sleep(10)
# Checking for the unmanaged setting on service
cmd = "ceph orch ls"
Contributor comment: Please replace the command with "ceph orch ls {service_type}" to avoid the for loop traversal. For example:

[ceph: root@ceph-hakumar-ryth74-node1-installer /]# ceph orch ls mon -f json-pretty

[
  {
    "placement": {
      "label": "mon"
    },
    "service_name": "mon",
    "service_type": "mon",
    "status": {
      "created": "2024-11-15T20:59:25.948847Z",
      "last_refresh": "2024-11-22T10:12:24.482009Z",
      "running": 3,
      "size": 3
    }
  }
]
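A minimal sketch of that suggestion (assuming run_ceph_command returns the parsed JSON list shown above):

# Query only the service under test instead of listing every service
# and looping over the result.
cmd = f"ceph orch ls {service_type}"
entries = self.run_ceph_command(cmd=cmd)
entry = entries[0] if entries else {}
status = entry.get("unmanaged", False)
return status == unmanaged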

out = self.run_ceph_command(cmd=cmd)
for entry in out:
if entry["service_name"] == service_type:
log.debug(f"Service status : {entry}")
status = entry.get("unmanaged", False)
if status != unmanaged:
log.error(
f"{service_type} Service not in unmamaned={unmanaged} state. Fail"
)
return False
else:
log.info(
f"{service_type} Service in unmamaned={unmanaged} state. Pass"
)
return True
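Typical usage of the new helper, mirroring the calls in tests/rados/test_node_drain_customer_bug.py later in this PR:

# Mark the OSD service unmanaged so cephadm does not redeploy OSDs
# while the host is being drained.
rados_obj.set_service_managed_type("osd", unmanaged=True)
time.sleep(10)
# ... drain / remove the host ...
# Hand management back to cephadm once the workflow completes.
rados_obj.set_service_managed_type("osd", unmanaged=False)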
92 changes: 64 additions & 28 deletions ceph/rados/serviceability_workflows.py
@@ -225,22 +225,79 @@ def remove_custom_host(self, host_node_name: str):
Returns:
None | raises exception in case of failure
"""
status_cmd = ""
try:

def wait_osd_operation_status(status_cmd):
status_flag = False
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
log.debug(
"The logic used to verify whether the OSD is removed: "
"case 1: If Ceph is still in the process of removing the OSD, the command generates "
"proper json output and the json.loads method parses it without any failure. "
"case 2: If the OSDs are removed from the node, the command does not generate any output. "
"In this case the json.loads method throws a JSONDecodeError exception, which is the "
"confirmation that the OSD removals are completed."
)
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
Contributor comment: Any reason for executing commands with self.cephadm.shell() and then performing operations on the data?

Contributor comment: Please add the reason for using self.cephadm.shell() and then performing operations on the data.

Author comment: The information is added.

try:
drain_ops = json.loads(out)
for entry in drain_ops:
log.debug(
f"OSD remove operation is in progress {osd_id}\nOperations: {entry}"
)
except json.JSONDecodeError:
log.info(f"The OSD removal is completed on OSD : {osd_id}")
Contributor comment: This could either mean that the OSD removal is complete, or that no OSD removal was started in the first place.

status_flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)
return status_flag
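A standalone illustration of the completion check described above (a sketch, assuming "ceph orch osd rm status -f json" prints a JSON list while removals are queued and prints nothing once the queue is empty):

import json

def removal_in_progress(raw_status: str) -> bool:
    """Return True while the orchestrator still reports queued OSD removals."""
    try:
        # Non-empty JSON output: removal entries are still being processed.
        return bool(json.loads(raw_status))
    except json.JSONDecodeError:
        # Empty output cannot be parsed, so nothing is left in the removal
        # queue (or, as a reviewer notes above, nothing was ever queued).
        return False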

# Removing an OSD host and checking status
rm_host = utils.get_node_by_id(self.cluster, host_node_name)
log.info(
f"Identified host : {rm_host.hostname} to be removed from the cluster"
)

# get list of osd_id on the host to be removed
# Get list of osd_id on the host to be removed
rm_osd_list = self.rados_obj.collect_osd_daemon_ids(osd_node=rm_host)
log.info(
f"The osd id list to be removed from the {rm_host} is {rm_osd_list}"
)
# Get the list of OSDs that are out and remove them before draining the node
osd_out_list = self.rados_obj.get_osd_list(status="out")
log.info(
f"The out osd id list to be removed from the {rm_host} is {osd_out_list}"
)
if osd_out_list:
for osd_id in rm_osd_list:
if osd_id in osd_out_list:
osd_utils.osd_remove(
self.cluster, osd_id=osd_id, zap=True, force=True
)
time.sleep(10)
status_cmd = "ceph orch osd rm status -f json"
if wait_osd_operation_status(status_cmd):
log.info("The OSD successfully removed")
else:
log.error(
"OSD removal not completed on the cluster even after 600 seconds"
)
raise Exception("OSD not removed error")
rm_osd_list.remove(osd_id)
dev_path_list = []
if rm_osd_list:
for osd_id in rm_osd_list:
dev_path_list.append(
rados_utils.get_device_path(host=rm_host, osd_id=osd_id)
)
osd_utils.set_osd_out(self.cluster, osd_id=osd_id)
time.sleep(30)
osd_utils.osd_remove(self.cluster, osd_id=osd_id)
time.sleep(30)

@@ -253,36 +310,15 @@ def remove_custom_host(self, host_node_name: str):
# Sleeping for 2 seconds for removal to have started
time.sleep(2)
log.debug(f"Started drain operation on node : {rm_host.hostname}")

status_cmd = "ceph orch osd rm status -f json"
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
flag = False
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
try:
drain_ops = json.loads(out)
for entry in drain_ops:
log.debug(
f"Drain operations are going on host {rm_host.hostname} \nOperations: {entry}"
)
except json.JSONDecodeError:
log.info(f"Drain operations completed on host : {rm_host.hostname}")
flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)

if not flag:
if wait_osd_operation_status(status_cmd):
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
else:
log.error(
"Drain operation not completed on the cluster even after 600 seconds"
)
raise Exception("Execution Error")
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
raise Exception("Drain operation-OSD not removed error")

if dev_path_list:
for dev_path in dev_path_list:
10 changes: 8 additions & 2 deletions ceph/rados/utils.py
@@ -195,17 +195,23 @@ def set_osd_in(
return ret_val


def osd_remove(ceph_cluster, osd_id, zap=False):
def osd_remove(ceph_cluster, osd_id, zap=False, force=False):
"""
osd remove
Args:
ceph_cluster: ceph cluster
osd_id: osd id
zap: flag to control zapping of device
force: flag to remove the OSD forcefully
"""
config = {"command": "rm", "service": "osd", "pos_args": [osd_id]}
cmd_args = {}
if zap:
config["base_cmd_args"] = {"zap": True}
cmd_args["zap"] = True
if force:
cmd_args["force"] = True
if bool(cmd_args):
config["base_cmd_args"] = cmd_args
log.info(f"Executing OSD {config.pop('command')} service")
osd = OSD(cluster=ceph_cluster, **config)
osd.rm(config)
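For example, the call used in remove_custom_host above passes both flags through base_cmd_args (ceph_cluster and osd_id come from the caller):

# Remove an OSD that is already marked out, zapping its device and
# forcing the removal through the orchestrator.
osd_remove(ceph_cluster, osd_id=osd_id, zap=True, force=True)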
1 change: 0 additions & 1 deletion suites/reef/rados/test_rados_all_generic_features.yaml
@@ -1641,7 +1641,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: verify scrub chunk max
@@ -143,4 +143,5 @@ tests:
create: true
pool_name: mgr_test_pool
delete_pool: mgr_test_pool
comments: Active bug 2328605
desc: Ceph mgr crashed after a mgr failover with the message mgr operator
1 change: 0 additions & 1 deletion suites/reef/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
@@ -142,4 +142,5 @@ tests:
create: true
pool_name: mgr_test_pool
delete_pool: mgr_test_pool
comments: Active bug 2328605
desc: Ceph mgr crashed after a mgr failover with the message mgr operator
1 change: 0 additions & 1 deletion suites/squid/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
44 changes: 35 additions & 9 deletions tests/rados/test_node_drain_customer_bug.py
@@ -1,6 +1,6 @@
"""
The file contains the method to check the customer issue:
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
CEPH-83595932-To verify crashes while executing drain and mgr failover commands
"""

import datetime
@@ -14,14 +14,16 @@
from ceph.rados.core_workflows import RadosOrchestrator
from ceph.rados.mgr_workflows import MgrWorkflows
from ceph.rados.serviceability_workflows import ServiceabilityMethods
from tests.rados.monitor_configurations import MonConfigMethods
from tests.rados.stretch_cluster import wait_for_clean_pg_sets
from utility.log import Log

log = Log(__name__)


def run(ceph_cluster, **kw):
"""
# CEPH-83593996
# CEPH-83595932
Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
1. Configure a cluster that has more than four OSD nodes
2. Select an OSD node and drain the node
@@ -40,21 +42,22 @@ def run(ceph_cluster, **kw):
mgr_obj = MgrWorkflows(node=cephadm)
installer = ceph_cluster.get_nodes(role="installer")[0]
service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
mon_obj = MonConfigMethods(rados_obj=rados_obj)
ceph_nodes = kw.get("ceph_nodes")
config = kw["config"]

replicated_config = config.get("replicated_pool")
pool_name = replicated_config["pool_name"]
active_osd_list = rados_obj.get_osd_list(status="up")
active_osd_list = get_active_osd_list(rados_obj)
Contributor comment: reuse existing method

log.info(f"The active OSDs list before starting the test-{active_osd_list}")
if not rados_obj.create_pool(pool_name=pool_name):
log.error("Failed to create the Pool")
return 1

rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=90)
rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=180)
mgr_daemon = Thread(
target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
)
wait_for_clean_pg_sets(rados_obj)
# Printing the hosts in cluster
cmd_host_ls = "ceph orch host ls"
out = rados_obj.run_ceph_command(cmd=cmd_host_ls)
@@ -64,7 +67,7 @@ def run(ceph_cluster, **kw):
for node in ceph_nodes:
if node.role == "mgr":
mgr_host_object_list.append(node)
log.debug(f"The mgr host node is{node.hostname}")
log.debug(f"The mgr host node is {node.hostname}")

mgr_daemon_list = mgr_obj.get_mgr_daemon_list()
log.debug(f"The MGR daemons list are -{mgr_daemon_list}")
@@ -104,6 +107,7 @@ def run(ceph_cluster, **kw):
"2. Select the node with OSD weight/reweight are 0 if none of the hosts have the _no_schedule label"
"3. If both 1&2 failed then select a random OSD node"
)
mon_obj.set_config(section="mgr", name="debug_mgr", value="20/20")
cmd_host_ls = "ceph orch host ls"
out = rados_obj.run_ceph_command(cmd=cmd_host_ls)
log.info(f"The node details in the cluster -{out} ")
@@ -133,8 +137,10 @@ def run(ceph_cluster, **kw):
try:
osd_count_before_test = get_node_osd_list(rados_obj, ceph_nodes, drain_host)
log.info(
f"The OSDs in the drain node before starting the test - {osd_count_before_test} "
f"The OSDs in the drain node before starting the te- {osd_count_before_test} "
)
rados_obj.set_service_managed_type("osd", unmanaged=True)
time.sleep(10)
mgr_daemon.start()
service_obj.remove_custom_host(host_node_name=drain_host)
time.sleep(300)
@@ -152,6 +158,9 @@ def run(ceph_cluster, **kw):
"The traceback messages are noticed in logs.The error snippets are noticed in the MGR logs"
)
return 1
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)

log.info(
"Adding the node by providing the deploy_osd as False, because the script is not setting the "
"--unmanaged=true.Once the node is added back to the cluster the OSDs get configured automatically"
@@ -194,7 +203,8 @@ def run(ceph_cluster, **kw):
return 1

if bug_exists:
active_osd_list = rados_obj.get_osd_list(status="up")

active_osd_list = get_active_osd_list(rados_obj)
log.info(
f"The active OSDs list after reproducing the issue is-{active_osd_list}"
)
@@ -237,6 +247,9 @@ def run(ceph_cluster, **kw):
log.info(
"\n \n ************** Execution of finally block begins here *************** \n \n"
)
mon_obj.remove_config(section="mgr", name="debug_mgr")
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)
if replicated_config.get("delete_pool"):
rados_obj.delete_pool(pool=pool_name)
time.sleep(5)
@@ -297,7 +310,7 @@ def background_mgr_task(mgr_object):
mgr_object: mgr object
Returns: None
"""
time.sleep(20)
time.sleep(5)
for _ in range(10):
active_mgr_before_fail = mgr_object.get_active_mgr()
mgr_object.set_mgr_fail()
@@ -350,3 +363,16 @@ def get_zero_osd_weight_list(rados_obj):
if not value:
del zero_osd_weight_nodes[key]
return zero_osd_weight_nodes


def get_active_osd_list(rados_object) -> list:
"""
Method to fetch the list of OSDs which are UP and IN
Returns:
List of active OSDs
"""
in_osd_list = rados_object.get_osd_list(status="in")
up_osd_list = rados_object.get_osd_list(status="up")
total_list = list(set(in_osd_list + up_osd_list))
log.info(f"The active osd list is: {total_list}")
return total_list
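Note that the helper returns the union of the "in" and "up" lists; if "UP and IN" is meant as a strict intersection, a hypothetical variant (not what the PR implements) would be:

# Hypothetical variant: only OSDs that are simultaneously up and in.
active_osds = sorted(
    set(rados_object.get_osd_list(status="up"))
    & set(rados_object.get_osd_list(status="in"))
)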