From 63338bf187b29197d302d73f6f61e4a590f04ab9 Mon Sep 17 00:00:00 2001 From: Praveen Chaudhary Date: Tue, 4 May 2021 15:55:21 -0700 Subject: [PATCH] [main.py]: Implement a config process level lock. Changes: 1.) Implement a class, which uses hsetnx for lock. 2.) lock is expired within timeout period or will be released by config python click process. 3.) After -y prompt, lock is reacquired, because timer could have expired, before user enters yes. --- config/main.py | 109 +++++++++++++++++++++++++++++++++++++++++++++ config/muxcable.py | 10 ++++- 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/config/main.py b/config/main.py index e9bab3172d..f56ca68e20 100644 --- a/config/main.py +++ b/config/main.py @@ -197,6 +197,8 @@ def breakout_warnUser_extraTables(cm, final_delPorts, confirm=True): click.secho("Below Config can not be verified, It may cause harm "\ "to the system\n {}".format(json.dumps(tables, indent=2))) click.confirm('Do you wish to Continue?', abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() except Exception as e: raise Exception("Failed in breakout_warnUser_extraTables. Error: {}".format(str(e))) return @@ -647,6 +649,8 @@ def _get_sonic_generated_services(num_asic): def _abort_if_false(ctx, param, value): if not value: ctx.abort() + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() def _get_disabled_services_list(config_db): @@ -837,6 +841,103 @@ def cache_arp_entries(): open(restore_flag_file, 'w').close() return success +# class for locking entire config process +class ConfigDbLock(): + def __init__(self): + self.lockName = "LOCK|configDbLock" + self.timeout = 10 + self.pid = os.getpid() + self.client = None + return + + def acquireLock(self): + try: + # connect to db + db_kwargs = dict() + configdb = ConfigDBConnector() + configdb.connect() + + self.client = configdb.get_redis_client('CONFIG_DB') + # Set lock and expire time. Process may get killed b/w set lock and + # expire call. + if self.client.hsetnx(self.lockName, "PID", self.pid): + self.client.expire(self.lockName, self.timeout) + log.log_debug(":::Lock Acquired:::") + # if lock exists but expire timer not running, run expire time and + # abort. + elif self.client.ttl(self.lockName) == -1: + self.client.expire(self.lockName, self.timeout) + click.echo(":::Can not acquire lock, Reset Timer & Abort:::"); + sys.exit(1) + else: + click.echo(":::Can not acquire lock, Abort:::"); + sys.exit(1) + except Exception as e: + click.echo(":::Exception: acquireLock {}:::".format(e)) + sys.exit(1) + return + + def reacquireLock(self): + try: + # Try to set lock first + if self.client.hsetnx(self.lockName, "PID", self.pid): + self.client.expire(self.lockName, self.timeout) + log.log_debug(":::Lock Reacquired:::") + # if lock exists, check who owns it + else: + p = self.client.pipeline(True) + # watch, we do not want to work on modified lock + p.watch(self.lockName) + # if current process holding then extend the timer + if p.hget(self.lockName, "PID") == str(self.pid): + self.client.expire(self.lockName, self.timeout) + log.log_debug(":::Lock Timer Extended:::"); + p.unwatch() + return + else: + # some other process is holding the lock. + click.echo(":::Can not acquire lock LOCK PID: {} and self.pid:{}:::".\ + format(p.hget(self.lockName, "PID"), self.pid)) + p.unwatch() + sys.exit(1) + + except Exception as e: + click.echo(":::Exception: reacquireLock {}:::".format(e)) + sys.exit(1) + return + + def _releaseLock(self): + try: + """ + If LOCK was never acquired, self.client should be None. This + happens with 'config ?' command + """ + if self.client == None: + return + + p = self.client.pipeline(True) + # watch, we do not want to work on modified lock + p.watch(self.lockName) + # if current process holding the lock then release it. + if p.hget(self.lockName, "PID") == str(self.pid): + p.multi() + p.delete(self.lockName) + p.execute() + return + else: + # some other process is holding the lock. Do nothing. + pass + p.unwatch() + except Exception as e: + raise e + return + + def __del__(self): + self._releaseLock() + return +# end of class configdblock + +cdblock = ConfigDbLock() # This is our main entrypoint - the main 'config' command @click.group(cls=clicommon.AbbreviationGroup, context_settings=CONTEXT_SETTINGS) @click.pass_context @@ -863,6 +964,10 @@ def config(ctx): if os.geteuid() != 0: exit("Root privileges are required for this operation") + # Take lock only when config command is executed, to avoid taking lock for + # TABs and for -h. Note ? is treated as input python clicks + cdblock.acquireLock() + ctx.obj = Db() @@ -943,6 +1048,8 @@ def load(filename, yes): if not yes: click.confirm(message, abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() num_asic = multi_asic.get_num_asics() cfg_files = [] @@ -1132,6 +1239,8 @@ def reload(db, filename, yes, load_sysinfo, no_service_restart, disable_arp_cach if not yes: click.confirm(message, abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() log.log_info("'reload' executing...") diff --git a/config/muxcable.py b/config/muxcable.py index 965bceb6de..af5a55298a 100644 --- a/config/muxcable.py +++ b/config/muxcable.py @@ -296,6 +296,8 @@ def state(state, port): if port is not None and port != "all": click.confirm(('Muxcable at port {} will be changed to {} state. Continue?'.format(port, state)), abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() logical_port_list = platform_sfputil_helper.get_logical_list() if port not in logical_port_list: click.echo("ERR: This is not a valid port, valid ports ({})".format(", ".join(logical_port_list))) @@ -344,7 +346,7 @@ def state(state, port): logical_port_list_per_port = logical_port_list_for_physical_port.get(physical_port, None) - """ This check is required for checking whether or not this logical port is the one which is + """ This check is required for checking whether or not this logical port is the one which is actually mapped to physical port and by convention it is always the first port. TODO: this should be removed with more logic to check which logical port maps to actual physical port being used""" @@ -384,6 +386,8 @@ def state(state, port): elif port == "all" and port is not None: click.confirm(('Muxcables at all ports will be changed to {} state. Continue?'.format(state)), abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() logical_port_list = platform_sfputil_helper.get_logical_list() rc = True @@ -491,6 +495,8 @@ def setswitchmode(state, port): if port is not None and port != "all": click.confirm(('Muxcable at port {} will be changed to {} switching mode. Continue?'.format(port, state)), abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() logical_port_list = platform_sfputil_helper.get_logical_list() if port not in logical_port_list: click.echo("ERR: This is not a valid port, valid ports ({})".format(", ".join(logical_port_list))) @@ -563,6 +569,8 @@ def setswitchmode(state, port): elif port == "all" and port is not None: click.confirm(('Muxcable at port {} will be changed to {} switching mode. Continue?'.format(port, state)), abort=True) + # Reacquire lock, if confirmation is needed for this command + cdblock.reacquireLock() logical_port_list = platform_sfputil_helper.get_logical_list() rc = True