TFA-FIX:CEPH-83595932-To verify crashes while executing drain and mgr failover commands #4230

Open: wants to merge 1 commit into base: master
57 changes: 57 additions & 0 deletions ceph/rados/core_workflows.py
@@ -4615,3 +4615,60 @@ def get_rados_df(self, pool_name: str = None):
out = self.run_ceph_command(cmd=_cmd, client_exec=True)

return out["pools"][0] if pool_name else out

def set_service_managed_type(self, service_type, unmanaged) -> bool:
"""
Method to set the service to either managed or unmanaged.
The supported service types are mon, mgr, osd, rgw and mds.
Args:
service_type: service whose management mode should be changed
unmanaged: True or False, for the service management

Returns:
Pass -> True, Fail -> False
"""
cmd_export = f"ceph orch ls {service_type} --export"
out = self.run_ceph_command(cmd=cmd_export, client_exec=True)[0]
if unmanaged:
log.debug(
f"Setting the {service_type} service as unmanaged by cephadm. current status : {out}"
)
out["unmanaged"] = "true"
else:
log.debug(
f"Setting the {service_type} service as managed by cephadm. current status : {out}"
)
out["unmanaged"] = "false"
Contributor: Here instead of explicitly setting it to False, we should remove the "unmanaged" key from the dictionary if it exists.
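A minimal sketch of that suggestion (shown for illustration, not part of this PR): pop the key from the exported spec instead of writing "false", so the applied spec falls back to cephadm's default managed behaviour.

if unmanaged:
    out["unmanaged"] = "true"
else:
    # Drop the key if present instead of carrying unmanaged: "false" in the spec
    out.pop("unmanaged", None)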


file_name = (
f"/tmp/{service_type}_spec_{self.set_service_managed_type.__name__}.yaml"
)
# Creating service config file
self.client.exec_command(sudo=True, cmd=f"touch {file_name}")
json_out = json.dumps(out)
# Adding the spec rules into the file
cmd = f"echo '{json_out}' > {file_name}"
self.client.exec_command(cmd=cmd, sudo=True)

log.debug(f"Contents of {service_type} spec file : {out}")
apply_cmd = f"ceph orch apply -i {file_name}"
log.info(f"Applying the spec file via cmd : {apply_cmd}")
self.client.exec_command(cmd=apply_cmd, sudo=True)

time.sleep(10)
# Checking for the unmanaged setting on service
cmd = "ceph orch ls"
Contributor: Please replace the command with "ceph orch ls {service_type}" to avoid the for loop traversal:

[ceph: root@ceph-hakumar-ryth74-node1-installer /]# ceph orch ls mon -f json-pretty         

[
  {
    "placement": {
      "label": "mon"
    },
    "service_name": "mon",
    "service_type": "mon",
    "status": {
      "created": "2024-11-15T20:59:25.948847Z",
      "last_refresh": "2024-11-22T10:12:24.482009Z",
      "running": 3,
      "size": 3
    }
  }
]
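A hedged sketch of the narrower check the reviewer suggests (an assumption, not code from this PR), relying on run_ceph_command() returning the parsed JSON list as it does elsewhere in this method and on the filtered listing containing a single entry as in the sample above:

cmd = f"ceph orch ls {service_type}"
entry = self.run_ceph_command(cmd=cmd)[0]
# With the service filter applied there is no need to traverse all services
status = entry.get("unmanaged", False)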

out = self.run_ceph_command(cmd=cmd)
for entry in out:
if entry["service_name"] == service_type:
log.debug(f"Service status : {entry}")
status = entry.get("unmanaged", False)
if status != unmanaged:
log.error(
f"{service_type} Service not in unmamaned={unmanaged} state. Fail"
)
return False
else:
log.info(
f"{service_type} Service in unmamaned={unmanaged} state. Pass"
)
return True
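For reference, a hedged usage sketch of the new helper, mirroring how the updated test in tests/rados/test_node_drain_customer_bug.py calls it:

# Pause cephadm management of the OSD service before draining a host,
# and restore it once the workflow (or the finally block) completes
rados_obj.set_service_managed_type("osd", unmanaged=True)
# ... drain / mgr failover steps ...
rados_obj.set_service_managed_type("osd", unmanaged=False)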
92 changes: 64 additions & 28 deletions ceph/rados/serviceability_workflows.py
@@ -225,22 +225,79 @@ def remove_custom_host(self, host_node_name: str):
Returns:
None | raises exception in case of failure
"""
status_cmd = ""
try:

def wait_osd_operation_status(status_cmd):
status_flag = False
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
log.debug(
"The logic used to verify whether the OSD has been removed is as follows - "
"case 1: if Ceph is still in the process of removing the OSD, the command generates "
"proper JSON output and json.loads() parses it without any failure. "
"case 2: if the OSDs have already been removed from the node, the command does not "
"generate any output. In this case json.loads() throws a JSONDecodeError exception, "
"which is the confirmation that the OSD removal is complete."
)
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
Contributor: Any reason for executing commands with self.cephadm.shell() and then performing operations on data?

Contributor: Please add reason for self.cephadm.shell() and then performing operations on data.

Contributor (Author): The information is added.

try:
drain_ops = json.loads(out)
for entry in drain_ops:
log.debug(
f"OSD remove operation is in progress {osd_id}\nOperations: {entry}"
)
except json.JSONDecodeError:
log.info(f"The OSD removal is completed on OSD : {osd_id}")
Contributor: This could either mean that OSD removal is complete, or no OSD removal was started in the first place.

status_flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)
return status_flag
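One hedged way to address the reviewer's note above (an assumption, not part of this change) is to cross-check that the OSD has really left the cluster before treating empty status output as success, reusing the get_osd_list() helper already called in this method:

try:
    drain_ops = json.loads(out)
except json.JSONDecodeError:
    # Empty "ceph orch osd rm status" output can mean the removal finished,
    # or that no removal was ever queued; confirm the OSD is actually gone
    if osd_id not in self.rados_obj.get_osd_list(status="up"):
        log.info(f"The OSD removal is completed on OSD : {osd_id}")
        status_flag = True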

# Removing an OSD host and checking status
rm_host = utils.get_node_by_id(self.cluster, host_node_name)
log.info(
f"Identified host : {rm_host.hostname} to be removed from the cluster"
)

# get list of osd_id on the host to be removed
# Get list of osd_id on the host to be removed
rm_osd_list = self.rados_obj.collect_osd_daemon_ids(osd_node=rm_host)
log.info(
f"The osd id list to be removed from the {rm_host} is {rm_osd_list}"
)
# Get the OSD out list and remove before drain the node
osd_out_list = self.rados_obj.get_osd_list(status="out")
log.info(
f"The out osd id list to be removed from the {rm_host} is {osd_out_list}"
)
if osd_out_list:
for osd_id in rm_osd_list:
if osd_id in osd_out_list:
osd_utils.osd_remove(
self.cluster, osd_id=osd_id, zap=True, force=True
)
time.sleep(10)
status_cmd = "ceph orch osd rm status -f json"
if wait_osd_operation_status(status_cmd):
log.info("The OSD successfully removed")
else:
log.error(
"OSD removal not completed on the cluster even after 600 seconds"
)
raise Exception("OSD not removed error")
rm_osd_list.remove(osd_id)
dev_path_list = []
if rm_osd_list:
for osd_id in rm_osd_list:
dev_path_list.append(
rados_utils.get_device_path(host=rm_host, osd_id=osd_id)
)
osd_utils.set_osd_out(self.cluster, osd_id=osd_id)
time.sleep(30)
osd_utils.osd_remove(self.cluster, osd_id=osd_id)
time.sleep(30)

@@ -253,36 +253,15 @@ def remove_custom_host(self, host_node_name: str):
# Sleeping for 2 seconds for removal to have started
time.sleep(2)
log.debug(f"Started drain operation on node : {rm_host.hostname}")

status_cmd = "ceph orch osd rm status -f json"
end_time = datetime.datetime.now() + datetime.timedelta(seconds=600)
flag = False
while end_time > datetime.datetime.now():
out, err = self.cephadm.shell([status_cmd])
try:
drain_ops = json.loads(out)
for entry in drain_ops:
log.debug(
f"Drain operations are going on host {rm_host.hostname} \nOperations: {entry}"
)
except json.JSONDecodeError:
log.info(f"Drain operations completed on host : {rm_host.hostname}")
flag = True
break
except Exception as error:
log.error(f"Hit issue during drain operations: {error}")
raise Exception(error)
log.debug("Sleeping for 10 seconds and checking again....")
time.sleep(10)

if not flag:
if wait_osd_operation_status(status_cmd):
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
else:
log.error(
"Drain operation not completed on the cluster even after 600 seconds"
)
raise Exception("Execution Error")
log.info(
f"Completed drain operation on the host. {rm_host.hostname}\n Removing host from the cluster"
)
raise Exception("Drain operation-OSD not removed error")

if dev_path_list:
for dev_path in dev_path_list:
13 changes: 9 additions & 4 deletions ceph/rados/utils.py
@@ -45,7 +45,6 @@ def set_osd_devices_unmanaged(ceph_cluster, osd_id, unmanaged):
break

if not service_name:
log.error(f"No orch service found for osd: {osd_id}")
Contributor: @SrinivasaBharath please explain why we are making this change?

return
log.info(f"Setting OSD service {service_name} to unmanaged={unmanaged}")

@@ -57,7 +56,7 @@ def set_osd_devices_unmanaged(ceph_cluster, osd_id, unmanaged):
# return if no services found
if "No services reported" in out or "No services reported" in err:
log.debug(out)
log.error(err)
log.debug(err)
return
svc = loads(out)[0]

@@ -195,17 +194,23 @@ def set_osd_in(
return ret_val


def osd_remove(ceph_cluster, osd_id, zap=False):
def osd_remove(ceph_cluster, osd_id, zap=False, force=False):
"""
osd remove
Args:
ceph_cluster: ceph cluster
osd_id: osd id
zap: flag to control zapping of device
force: flag to remove the OSD forcefully
"""
config = {"command": "rm", "service": "osd", "pos_args": [osd_id]}
cmd_args = {}
if zap:
config["base_cmd_args"] = {"zap": True}
cmd_args["zap"] = True
if force:
cmd_args["force"] = True
if bool(cmd_args):
config["base_cmd_args"] = cmd_args
log.info(f"Executing OSD {config.pop('command')} service")
osd = OSD(cluster=ceph_cluster, **config)
osd.rm(config)
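For illustration, a hedged note on how the new flags flow through this helper (the final CLI form is an assumption based on cephadm's "ceph orch osd rm" options):

# osd_remove(ceph_cluster, osd_id=4, zap=True, force=True) builds:
#   config = {"command": "rm", "service": "osd", "pos_args": [4],
#             "base_cmd_args": {"zap": True, "force": True}}
# which is expected to translate to roughly: ceph orch osd rm 4 --zap --force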
1 change: 0 additions & 1 deletion suites/reef/rados/test_rados_all_generic_features.yaml
@@ -1641,7 +1641,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: verify scrub chunk max
1 change: 0 additions & 1 deletion suites/reef/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
1 change: 0 additions & 1 deletion suites/squid/rados/tier-2_rados_test_omap.yaml
@@ -163,7 +163,6 @@ tests:
pool_type: replicated
pg_num: 16
delete_pool: true
comments: Active BZ-2269089

- test:
name: Omap creations on objects
26 changes: 17 additions & 9 deletions tests/rados/test_node_drain_customer_bug.py
@@ -1,6 +1,6 @@
"""
The file contains the method to check the customer issue -
CEPH-83593996 - Check that the Ceph cluster logs are being generated appropriately according to the log level
CEPH-83595932-To verify crashes while executing drain and mgr failover commands
"""

import datetime
@@ -21,7 +21,7 @@

def run(ceph_cluster, **kw):
"""
# CEPH-83593996
# CEPH-83595932
Bug id - https://bugzilla.redhat.com/show_bug.cgi?id=2305677
1. Configure a cluster that has more than four OSD nodes
2. Select an OSD node and drain the node
@@ -42,19 +42,20 @@ def run(ceph_cluster, **kw):
service_obj = ServiceabilityMethods(cluster=ceph_cluster, **config)
ceph_nodes = kw.get("ceph_nodes")
config = kw["config"]

# cmd_unset_unmanaged = ""
replicated_config = config.get("replicated_pool")
pool_name = replicated_config["pool_name"]
active_osd_list = rados_obj.get_osd_list(status="up")
active_osd_list = rados_obj.get_active_osd_list()
Contributor: Please revert the change in this line to "active_osd_list = rados_obj.get_osd_list(status="up")".
The get_active_osd_list() method no longer exists.

log.info(f"The active OSDs list before starting the test-{active_osd_list}")
if not rados_obj.create_pool(pool_name=pool_name):
log.error("Failed to create the Pool")
return 1

rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=90)
rados_obj.bench_write(pool_name=pool_name, byte_size="5M", rados_write_duration=180)
mgr_daemon = Thread(
target=background_mgr_task, kwargs={"mgr_object": mgr_obj}, daemon=True
)

# Printing the hosts in cluster
cmd_host_ls = "ceph orch host ls"
out = rados_obj.run_ceph_command(cmd=cmd_host_ls)
@@ -64,7 +65,7 @@ def run(ceph_cluster, **kw):
for node in ceph_nodes:
if node.role == "mgr":
mgr_host_object_list.append(node)
log.debug(f"The mgr host node is{node.hostname}")
log.debug(f"The mgr host node is {node.hostname}")

mgr_daemon_list = mgr_obj.get_mgr_daemon_list()
log.debug(f"The MGR daemons list are -{mgr_daemon_list}")
@@ -133,8 +134,10 @@ def run(ceph_cluster, **kw):
try:
osd_count_before_test = get_node_osd_list(rados_obj, ceph_nodes, drain_host)
log.info(
f"The OSDs in the drain node before starting the test - {osd_count_before_test} "
f"st The OSDs in the drain node before starting the te- {osd_count_before_test} "
Contributor: Please fix typo.

)
rados_obj.set_service_managed_type("osd", unmanaged=True)
time.sleep(10)
mgr_daemon.start()
service_obj.remove_custom_host(host_node_name=drain_host)
time.sleep(300)
@@ -152,6 +155,9 @@ def run(ceph_cluster, **kw):
"The traceback messages are noticed in logs.The error snippets are noticed in the MGR logs"
)
return 1
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)

log.info(
"Adding the node by providing the deploy_osd as False, because the script is not setting the "
"--unmanaged=true.Once the node is added back to the cluster the OSDs get configured automatically"
@@ -194,7 +200,7 @@ def run(ceph_cluster, **kw):
return 1

if bug_exists:
active_osd_list = rados_obj.get_osd_list(status="up")
active_osd_list = rados_obj.get_active_osd_list()
Contributor: Please revert to using get_osd_list(status="up").

log.info(
f"The active OSDs list after reproducing the issue is-{active_osd_list}"
)
@@ -237,6 +243,8 @@ def run(ceph_cluster, **kw):
log.info(
"\n \n ************** Execution of finally block begins here *************** \n \n"
)
rados_obj.set_service_managed_type("osd", unmanaged=False)
time.sleep(10)
if replicated_config.get("delete_pool"):
rados_obj.delete_pool(pool=pool_name)
time.sleep(5)
@@ -297,7 +305,7 @@ def background_mgr_task(mgr_object):
mgr_object: mgr object
Returns: None
"""
time.sleep(20)
time.sleep(2)
for _ in range(10):
active_mgr_before_fail = mgr_object.get_active_mgr()
mgr_object.set_mgr_fail()
4 changes: 2 additions & 2 deletions tests/rados/test_rados_preempt_scrub.py
@@ -85,7 +85,7 @@ def run(ceph_cluster, **kw):

log_lines = [
"head preempted",
"WaitReplicas::react(const GotReplicas&) PREEMPTED",
"WaitReplicas::react(const GotReplicas&) PREEMPTED!",
Contributor (@harshkumarRH, Nov 22, 2024): @SrinivasaBharath I request more clarity here.
The comparison happening in def verify_preempt_log is not in
So previously, if a subset of the sentence was not being found in the log lines, how is adding an additional character '!' going to make any difference?

]
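A small illustration of the reviewer's question (assuming verify_preempt_log performs a plain substring check against the journal lines):

line = "osd ... WaitReplicas::react(const GotReplicas&) PREEMPTED! ..."
print("WaitReplicas::react(const GotReplicas&) PREEMPTED" in line)   # True - the shorter pattern already matches
print("WaitReplicas::react(const GotReplicas&) PREEMPTED!" in line)  # True - the '!' only makes the pattern stricter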

init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'")
@@ -162,7 +162,7 @@ def run(ceph_cluster, **kw):
log.info(traceback.format_exc())
return 1
finally:
log.info("Execution of finally block")
log.info("===================Execution of finally block===================")
if config.get("delete_pool"):
method_should_succeed(rados_object.delete_pool, entry["pool_name"])
log.info("deleted the pool successfully")