From 8cc1cb8173de57b224208b8b8f475dafd12470e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 16 Dec 2024 20:37:52 +0800 Subject: [PATCH 01/10] fix: conf path --- rpm/oceanbase-diagnostic-tool.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec index c9828b68..82d6103f 100644 --- a/rpm/oceanbase-diagnostic-tool.spec +++ b/rpm/oceanbase-diagnostic-tool.spec @@ -46,7 +46,7 @@ cd $SRC_DIR \cp -rf $SRC_DIR/rpm/init.sh $BUILD_DIR/SOURCES/init.sh \cp -rf $SRC_DIR/rpm/init_obdiag_cmd.sh $BUILD_DIR/SOURCES/init_obdiag_cmd.sh \cp -rf $SRC_DIR/rpm/obdiag_backup.sh $BUILD_DIR/SOURCES/obdiag_backup.sh -\cp -rf $SRC_DIR/conf $BUILD_DIR/SOURCES/conf +\cp -rf $SRC_DIR/conf/* $BUILD_DIR/SOURCES/conf/ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/lib/ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/dependencies/bin find $SRC_DIR -name "obdiag" @@ -55,7 +55,7 @@ find $SRC_DIR -name "obdiag" \cp -rf $BUILD_DIR/SOURCES/resources ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/resources \cp -rf $BUILD_DIR/SOURCES/dependencies/bin ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/dependencies \cp -rf $BUILD_DIR/SOURCES/example ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ -\cp -rf $BUILD_DIR/SOURCES/conf ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ +\cp -rf $BUILD_DIR/SOURCES/conf ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/conf \cp -rf $BUILD_DIR/SOURCES/init.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/init_obdiag_cmd.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/obdiag_backup.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ From 51e759b7aa50e2f35dbc84d686470ccf44c8cd7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Mon, 16 Dec 2024 21:07:16 +0800 Subject: [PATCH 02/10] fix: conf path --- 
rpm/oceanbase-diagnostic-tool.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rpm/oceanbase-diagnostic-tool.spec b/rpm/oceanbase-diagnostic-tool.spec index 82d6103f..a55f87c1 100644 --- a/rpm/oceanbase-diagnostic-tool.spec +++ b/rpm/oceanbase-diagnostic-tool.spec @@ -42,11 +42,11 @@ cd $SRC_DIR \cp -rf $SRC_DIR/example $BUILD_DIR/SOURCES/example \cp -rf $SRC_DIR/resources $BUILD_DIR/SOURCES/ \cp -rf $SRC_DIR/dependencies/bin $BUILD_DIR/SOURCES/dependencies -\cp -rf $SRC_DIR/plugins $BUILD_DIR/plugins +\cp -rf $SRC_DIR/plugins $BUILD_DIR/SOURCES/ \cp -rf $SRC_DIR/rpm/init.sh $BUILD_DIR/SOURCES/init.sh \cp -rf $SRC_DIR/rpm/init_obdiag_cmd.sh $BUILD_DIR/SOURCES/init_obdiag_cmd.sh \cp -rf $SRC_DIR/rpm/obdiag_backup.sh $BUILD_DIR/SOURCES/obdiag_backup.sh -\cp -rf $SRC_DIR/conf/* $BUILD_DIR/SOURCES/conf/ +\cp -rf $SRC_DIR/conf $BUILD_DIR/SOURCES/ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/lib/ mkdir -p ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/dependencies/bin find $SRC_DIR -name "obdiag" @@ -59,7 +59,7 @@ find $SRC_DIR -name "obdiag" \cp -rf $BUILD_DIR/SOURCES/init.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/init_obdiag_cmd.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ \cp -rf $BUILD_DIR/SOURCES/obdiag_backup.sh ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ -\cp -rf $BUILD_DIR/plugins ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ +\cp -rf $BUILD_DIR/SOURCES/plugins ${RPM_BUILD_ROOT}/usr/local/oceanbase-diagnostic-tool/ %files From ba72c0cb6a5a1af1db6216620fa9abd2ad66a7cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 18 Dec 2024 15:33:16 +0800 Subject: [PATCH 03/10] remote_client support strict_host_key_checking --- conf/inner_config.yml | 1 + rpm/init_obdiag_cmd.sh | 3 +++ src/common/config.py | 1 + src/common/ssh_client/remote_client.py | 13 ++++++++++--- 4 files changed, 15 insertions(+), 3 deletions(-) diff 
--git a/conf/inner_config.yml b/conf/inner_config.yml index 7ee54841..cdca7e67 100644 --- a/conf/inner_config.yml +++ b/conf/inner_config.yml @@ -5,6 +5,7 @@ obdiag: file_number_limit: 50 file_size_limit: 5G dis_rsa_algorithms: 0 + strict_host_key_checking: 0 logger: log_dir: ~/.obdiag/log log_filename: obdiag.log diff --git a/rpm/init_obdiag_cmd.sh b/rpm/init_obdiag_cmd.sh index c9d3ca5d..99a3ce67 100644 --- a/rpm/init_obdiag_cmd.sh +++ b/rpm/init_obdiag_cmd.sh @@ -10,6 +10,9 @@ _obdiag_completion() { ;; 2) case "${COMP_WORDS[1]}" in + check) + type_list="run list" + ;; gather) if [ "$COMP_CWORD" -eq 2 ]; then type_list="log clog slog plan_monitor stack perf sysstat obproxy_log all scene ash tabledump parameter variable" diff --git a/src/common/config.py b/src/common/config.py index 77690afe..45e12cb2 100644 --- a/src/common/config.py +++ b/src/common/config.py @@ -70,6 +70,7 @@ 'file_number_limit': 20, 'file_size_limit': '2G', 'dis_rsa_algorithms': 0, + 'strict_host_key_checking': 0, }, 'logger': { 'log_dir': '~/.obdiag/log', diff --git a/src/common/ssh_client/remote_client.py b/src/common/ssh_client/remote_client.py index e046c628..a96d69bd 100644 --- a/src/common/ssh_client/remote_client.py +++ b/src/common/ssh_client/remote_client.py @@ -58,23 +58,30 @@ def __init__(self, context, node): remote_client_disable_rsa_algorithms = bool(self.context.inner_config.get("obdiag").get("basic").get("dis_rsa_algorithms")) if remote_client_disable_rsa_algorithms: self._disabled_rsa_algorithms = DISABLED_ALGORITHMS + remote_client_missing_host_key_policy = bool(self.context.inner_config.get("obdiag").get("basic").get("strict_host_key_checking")) self.ssh_type = "remote" if len(self.key_file) > 0: try: self._ssh_fd = paramiko.SSHClient() + if remote_client_missing_host_key_policy: + self._ssh_fd.set_missing_host_key_policy(paramiko.MissingHostKeyPolicy()) + else: + self._ssh_fd.load_system_host_keys() self._ssh_fd.set_missing_host_key_policy(paramiko.client.AutoAddPolicy()) - 
self._ssh_fd.load_system_host_keys() self._ssh_fd.connect(hostname=self.host_ip, username=self.username, key_filename=self.key_file, port=self.ssh_port, disabled_algorithms=self._disabled_rsa_algorithms) except AuthenticationException: self.password = input("Authentication failed, Input {0}@{1} password:\n".format(self.username, self.host_ip)) self.need_password = True self._ssh_fd.connect(hostname=self.host_ip, username=self.username, password=self.password, port=self.ssh_port, disabled_algorithms=self._disabled_rsa_algorithms) except Exception as e: - raise OBDIAGSSHConnException("ssh {0}@{1}: failed, exception:{2}".format(self.host_ip, self.ssh_port, e)) + raise OBDIAGSSHConnException("ssh {0} port {1} failed, exception:{2}".format(self.host_ip, self.ssh_port, e)) else: self._ssh_fd = paramiko.SSHClient() + if remote_client_missing_host_key_policy: + self._ssh_fd.set_missing_host_key_policy(paramiko.MissingHostKeyPolicy()) + else: + self._ssh_fd.load_system_host_keys() self._ssh_fd.set_missing_host_key_policy(paramiko.client.AutoAddPolicy()) - self._ssh_fd.load_system_host_keys() self.need_password = True self._ssh_fd.connect(hostname=self.host_ip, username=self.username, password=self.password, port=self.ssh_port, disabled_algorithms=self._disabled_rsa_algorithms) From d3760222bbb96472dc6e467e4d3003df4b4a42ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Wed, 18 Dec 2024 19:32:30 +0800 Subject: [PATCH 04/10] Clean rca old *scene.py files --- rpm/init.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rpm/init.sh b/rpm/init.sh index 53599660..16f993a9 100755 --- a/rpm/init.sh +++ b/rpm/init.sh @@ -18,6 +18,10 @@ mkdir -p ${OBDIAG_HOME} && cd ${OBDIAG_HOME} mkdir -p ${OBDIAG_HOME}/check mkdir -p ${OBDIAG_HOME}/log mkdir -p ${OBDIAG_HOME}/display + +# Clean rca old *scene.py files +find "$SOURCE_DIR/rca" -maxdepth 1 -name "*_scene.py" -type f -exec rm -f {} \; + cp -rf ${WORK_DIR}/plugins/* ${OBDIAG_HOME}/ From 
5b9c64d7423e42d9630d7c6899a06a818c6907b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 19 Dec 2024 15:49:49 +0800 Subject: [PATCH 05/10] fix some bugs --- rpm/init.sh | 2 +- src/common/core.py | 4 ---- src/common/diag_cmd.py | 6 ------ src/handler/gather/gather_component_log.py | 6 ++---- 4 files changed, 3 insertions(+), 15 deletions(-) diff --git a/rpm/init.sh b/rpm/init.sh index 16f993a9..451a0548 100755 --- a/rpm/init.sh +++ b/rpm/init.sh @@ -20,7 +20,7 @@ mkdir -p ${OBDIAG_HOME}/log mkdir -p ${OBDIAG_HOME}/display # Clean rca old *scene.py files -find "$SOURCE_DIR/rca" -maxdepth 1 -name "*_scene.py" -type f -exec rm -f {} \; +find ${OBDIAG_HOME}/rca -maxdepth 1 -name "*_scene.py" -type f -exec rm -f {} \; cp -rf ${WORK_DIR}/plugins/* ${OBDIAG_HOME}/ diff --git a/src/common/core.py b/src/common/core.py index 805b6af9..8b1726f8 100644 --- a/src/common/core.py +++ b/src/common/core.py @@ -262,7 +262,6 @@ def gather_function(self, function_type, opt): since=Util.get_option(options, 'since'), scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), - encrypt=Util.get_option(options, 'encrypt'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 'temp_dir'), redact=Util.get_option(options, 'redact'), @@ -304,7 +303,6 @@ def gather_function(self, function_type, opt): since=Util.get_option(options, 'since'), scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), - encrypt=Util.get_option(options, 'encrypt'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 'temp_dir'), redact=Util.get_option(options, 'redact'), @@ -319,7 +317,6 @@ def gather_function(self, function_type, opt): since=Util.get_option(options, 'since'), scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), - encrypt=Util.get_option(options, 'encrypt'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 
'temp_dir'), redact=Util.get_option(options, 'redact'), @@ -364,7 +361,6 @@ def gather_obproxy_log(self, opt): since=Util.get_option(options, 'since'), scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), - encrypt=Util.get_option(options, 'encrypt'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 'temp_dir'), redact=Util.get_option(options, 'redact'), diff --git a/src/common/diag_cmd.py b/src/common/diag_cmd.py index 64d96d22..0df94684 100644 --- a/src/common/diag_cmd.py +++ b/src/common/diag_cmd.py @@ -433,9 +433,7 @@ def __init__(self): self.parser.add_option('--from', type='string', help="specify the start of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--to', type='string', help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . example: 1h.", default='30m') - self.parser.add_option('--scope', type='string', help="log type constrains, choices=[observer, election, rootservice, all]", default='all') self.parser.add_option('--grep', action="append", type='string', help="specify keywords constrain") - self.parser.add_option('--encrypt', type='string', help="Whether the returned results need to be encrypted, choices=[true, false]", default="false") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('--temp_dir', type='string', help='the dir for temporarily storing files on nodes', default='/tmp') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) @@ -459,7 +457,6 @@ def __init__(self): self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . 
example: 1h.", default='30m') self.parser.add_option('--scope', type='string', help="log type constrains, choices=[observer, election, rootservice, all]", default='all') self.parser.add_option('--grep', action="append", type='string', help="specify keywords constrain") - self.parser.add_option('--encrypt', type='string', help="Whether the returned results need to be encrypted, choices=[true, false]", default="false") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('--temp_dir', type='string', help='the dir for temporarily storing files on nodes', default='/tmp') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) @@ -570,7 +567,6 @@ def __init__(self): self.parser.add_option('--from', type='string', help="specify the start of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--to', type='string', help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . example: 1h.", default='30m') - self.parser.add_option('--encrypt', type='string', help="Whether the returned results need to be encrypted, choices=[true, false]", default="false") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) self.parser.add_option('--config', action="append", type="string", help='config options Format: --config key=value') @@ -591,7 +587,6 @@ def __init__(self): self.parser.add_option('--from', type='string', help="specify the start of the time range. 
format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--to', type='string', help="specify the end of the time range. format: 'yyyy-mm-dd hh:mm:ss'") self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . example: 1h.", default='30m') - self.parser.add_option('--encrypt', type='string', help="Whether the returned results need to be encrypted, choices=[true, false]", default="false") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) self.parser.add_option('--config', action="append", type="string", help='config options Format: --config key=value') @@ -655,7 +650,6 @@ def __init__(self): self.parser.add_option('--since', type='string', help="Specify time range that from 'n' [d]ays, 'n' [h]ours or 'n' [m]inutes. before to now. format: . 
example: 1h.", default='30m') self.parser.add_option('--scope', type='string', help="log type constrains, choices=[obproxy, obproxy_limit, obproxy_stat, obproxy_digest, obproxy_slow, obproxy_diagnosis, obproxy_error, all]", default='all') self.parser.add_option('--grep', action="append", type='string', help="specify keywords constrain") - self.parser.add_option('--encrypt', type='string', help="Whether the returned results need to be encrypted, choices=[true, false]", default="false") self.parser.add_option('--store_dir', type='string', help='the dir to store gather result, current dir by default.', default='./') self.parser.add_option('-c', type='string', help='obdiag custom config', default=os.path.expanduser('~/.obdiag/config.yml')) self.parser.add_option('--config', action="append", type="string", help='config options Format: --config key=value') diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index 7e08f102..edd1e5ce 100644 --- a/src/handler/gather/gather_component_log.py +++ b/src/handler/gather/gather_component_log.py @@ -61,7 +61,6 @@ def __init__(self, *args, **kwargs): self.since_option = None self.scope = None self.grep = None - self.encrypt = None self.store_dir = None self.temp_dir = None self.redact = None @@ -83,7 +82,6 @@ def init(self, context, *args, **kwargs): self.since_option = kwargs.get('since', None) self.scope = kwargs.get('scope', None) self.grep = kwargs.get('grep', None) - self.encrypt = kwargs.get('encrypt', None) self.store_dir = kwargs.get('store_dir', None) self.temp_dir = kwargs.get('temp_dir', None) self.redact = kwargs.get('redact', None) @@ -99,7 +97,6 @@ def init(self, context, *args, **kwargs): "tmp_dir": const.GATHER_LOG_TEMPORARY_DIR_DEFAULT, "scope": self.scope, "grep": self.grep, - "encrypt": self.encrypt, "store_dir": self.store_dir, "from_time": self.from_time_str, "to_time": self.to_time_str, @@ -126,7 +123,8 @@ def __check_option(self): # check store_dir if not 
os.path.exists(self.store_dir): - raise Exception("store_dir: {0} is not exist".format(self.store_dir)) + self.stdio.warn('args --store_dir [{0}] incorrect: No such directory, Now create it'.format(os.path.abspath(self.store_dir))) + os.makedirs(os.path.abspath(self.store_dir)) if self.is_scene is False: target_dir = os.path.join("obdiag_gather_pack_{0}".format(TimeUtils.timestamp_to_filename_time(TimeUtils.get_current_us_timestamp()))) self.store_dir = os.path.join(self.store_dir or "./", target_dir) From fcc41048ec5f467fd08129ed98a544c0b278326d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 19 Dec 2024 17:26:00 +0800 Subject: [PATCH 06/10] fix: gather all not support scope option --- src/common/core.py | 2 -- src/handler/gather/gather_component_log.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/core.py b/src/common/core.py index 8b1726f8..52c43c98 100644 --- a/src/common/core.py +++ b/src/common/core.py @@ -301,7 +301,6 @@ def gather_function(self, function_type, opt): from_option=Util.get_option(options, 'from'), to_option=Util.get_option(options, 'to'), since=Util.get_option(options, 'since'), - scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 'temp_dir'), @@ -315,7 +314,6 @@ def gather_function(self, function_type, opt): from_option=Util.get_option(options, 'from'), to_option=Util.get_option(options, 'to'), since=Util.get_option(options, 'since'), - scope=Util.get_option(options, 'scope'), grep=Util.get_option(options, 'grep'), store_dir=Util.get_option(options, 'store_dir'), temp_dir=Util.get_option(options, 'temp_dir'), diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index edd1e5ce..a3447410 100644 --- a/src/handler/gather/gather_component_log.py +++ b/src/handler/gather/gather_component_log.py @@ -81,6 +81,8 @@ def init(self, 
context, *args, **kwargs): self.to_option = self.to_option.strip() self.since_option = kwargs.get('since', None) self.scope = kwargs.get('scope', None) + if isinstance(self.scope, bool): + self.scope = "all" self.grep = kwargs.get('grep', None) self.store_dir = kwargs.get('store_dir', None) self.temp_dir = kwargs.get('temp_dir', None) From 5cf3846b1f028c115e4e6e63f50e087a0adfac92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Thu, 19 Dec 2024 21:04:34 +0800 Subject: [PATCH 07/10] add check task --- docs/analyze_flt_trace.md | 44 ---- docs/analyze_ob_log.md | 84 ------- docs/analyze_sql.md | 31 --- docs/analyze_sql_review.md | 20 -- docs/check.md | 209 ------------------ docs/gather_admin.md | 72 ------ docs/gather_all.md | 98 -------- docs/gather_ash.md | 50 ----- docs/gather_awr.md | 43 ---- docs/gather_ob_log.md | 60 ----- docs/gather_ob_stack.md | 23 -- docs/gather_obproxy_log.md | 60 ----- docs/gather_perf.md | 47 ---- docs/gather_scene.md | 177 --------------- docs/gather_sql_plan_monitor.md | 16 -- docs/gather_sysstat.md | 31 --- docs/rca.md | 30 --- .../tasks/observer/cluster/datafile_next.yaml | 14 ++ .../check/tasks/observer/cluster/major.yaml | 2 +- .../observer/err_code/find_err_4108.yaml | 27 +++ 20 files changed, 42 insertions(+), 1096 deletions(-) delete mode 100644 docs/analyze_flt_trace.md delete mode 100644 docs/analyze_ob_log.md delete mode 100644 docs/analyze_sql.md delete mode 100644 docs/analyze_sql_review.md delete mode 100644 docs/check.md delete mode 100644 docs/gather_admin.md delete mode 100644 docs/gather_all.md delete mode 100644 docs/gather_ash.md delete mode 100644 docs/gather_awr.md delete mode 100644 docs/gather_ob_log.md delete mode 100644 docs/gather_ob_stack.md delete mode 100644 docs/gather_obproxy_log.md delete mode 100644 docs/gather_perf.md delete mode 100644 docs/gather_scene.md delete mode 100644 docs/gather_sql_plan_monitor.md delete mode 100644 docs/gather_sysstat.md delete mode 100644 docs/rca.md 
create mode 100644 plugins/check/tasks/observer/cluster/datafile_next.yaml create mode 100644 plugins/check/tasks/observer/err_code/find_err_4108.yaml diff --git a/docs/analyze_flt_trace.md b/docs/analyze_flt_trace.md deleted file mode 100644 index eb6e1a91..00000000 --- a/docs/analyze_flt_trace.md +++ /dev/null @@ -1,44 +0,0 @@ -## analyze flt_trace - -### Step 1: 查找疑似慢的sql -在sql audit中,如果有明确的SQL语句可以通过通过query_sql查到疑似慢sql的 flt_trace_id, 例如: -```shell script -OceanBase(root@test)>select query_sql, flt_trace_id from oceanbase.gv$ob_sql_audit where query_sql like 'select @@version_comment limit 1'; -+----------------------------------+--------------------------------------+ -| query_sql | flt_trace_id | -+----------------------------------+--------------------------------------+ -| select @@version_comment limit 1 | 00060aa3-d607-f5f2-328b-388e17f687cb | -+----------------------------------+--------------------------------------+ -1 row in set (0.001 sec) -``` -其中flt_trace_id为00060aa3-d607-f5f2-328b-388e17f687cb. - -### Step 2: 设置配置文件 - -例子 -```shell script -obdiag config -h192.168.1.1 -uroot@sys -p***** -P2881 -``` - -### Step 3: 执行全链路诊断命令 -```shell script -$ obdiag analyze flt_trace -h -Usage: obdiag analyze flt_trace [options] - -Options: - --flt_trace_id=FLT_TRACE_ID - flt trace id, . format: xxxxxxxx-xxxx-xxxx-xxxx- - xxxxxxxxxxxx - --files=FILES specify files - --top=TOP top leaf span - --recursion=RECURSION - Maximum number of recursion - --output=OUTPUT Print the result to the maximum output line on the - screen - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. 
-``` diff --git a/docs/analyze_ob_log.md b/docs/analyze_ob_log.md deleted file mode 100644 index 93fb33c4..00000000 --- a/docs/analyze_ob_log.md +++ /dev/null @@ -1,84 +0,0 @@ -## analyze log -'obdiag analyze log' can specify a time range to analyze the OceanBase logs on the target host and the log files passing OceanBase for analysis. -``` -$ obdiag analyze log -h -Usage: obdiag analyze log [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --scope=SCOPE log type constrains, choices=[observer, election, - rootservice, all] - --grep=GREP specify keywords constrain - --log_level=LOG_LEVEL - oceanbase logs greater than or equal to this level - will be analyze, choices=[DEBUG, TRACE, INFO, WDIAG, - WARN, EDIAG, ERROR] - --files=FILES specify files - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` - -Example: -```shell script -$ obdiag analyze log --scope observer --from "2023-10-08 10:25:00" --to "2023-10-08 11:30:00" - -... 
-FileListInfo: -+----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Node | LogList | -+================+=======================================================================================================================================================================================================================+ -| 192.168.2.11 | ['observer.log.20231008104204260', 'observer.log.20231008111305072', 'observer.log.20231008114410668', 'observer.log.wf.20231008104204260', 'observer.log.wf.20231008111305072', 'observer.log.wf.20231008114410668'] | -+----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -... - - -Analyze OceanBase Online Log Summary: -+----------------+-----------+------------------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| Node | Status | FileName | ErrorCode | Message | Count | -+================+===========+==============================================================================+=============+===============================================================================================================================+=========+ -| 192.168.2.11 | Completed | analyze_pack_20231008171201/192_168_2_11/observer.log.20231008104204260 | -5006 | You have an error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 2 | 
-+----------------+-----------+------------------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 192.168.2.11 | Completed | analyze_pack_20231008171201/192_168_2_11/observer.log.20231008111305072 | -5006 | You have an error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 8 | -+----------------+-----------+------------------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 192.168.2.11 | Completed | analyze_pack_20231008171201/192_168_2_11/observer.log.20231008114410668 | -5006 | You have an error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 10 | -+----------------+-----------+------------------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 192.168.2.11 | Completed | analyze_pack_20231008171201/192_168_2_11/observer.log.20231008114410668 | -4009 | IO error | 20 | -+----------------+-----------+------------------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -For more details, please run cmd ' cat analyze_pack_20231008171201/result_details.txt ' -``` - -```shell script -$ ls -lh test/ --rw-r--r-- 1 admin staff 256M Oct 8 17:24 observer.log.20231008104204260 --rw-r--r-- 1 admin staff 256M Oct 8 17:24 observer.log.20231008111305072 --rw-r--r-- 1 admin staff 256M Oct 8 17:24 
observer.log.20231008114410668 --rw-r--r-- 1 admin staff 18K Oct 8 17:24 observer.log.wf.20231008104204260 --rw-r--r-- 1 admin staff 19K Oct 8 17:24 observer.log.wf.20231008111305072 --rw-r--r-- 1 admin staff 18K Oct 8 17:24 observer.log.wf.20231008114410668 - -$ obdiag analyze log --files test/ - -Analyze OceanBase Offline Log Summary: -+-----------+-----------+-----------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| Node | Status | FileName | ErrorCode | Message | Count | -+===========+===========+=======================================================================+=============+===============================================================================================================================+=========+ -| 127.0.0.1 | Completed | analyze_pack_20231008172144/127_0_0_1_/observer.log.20231008104204260 | -5006 | You have an error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 2 | -+-----------+-----------+-----------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 127.0.0.1 | Completed | analyze_pack_20231008172144/127_0_0_1_/observer.log.20231008111305072 | -5006 | You have an error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 8 | -+-----------+-----------+-----------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 127.0.0.1 | Completed | analyze_pack_20231008172144/127_0_0_1_/observer.log.20231008114410668 | -5006 | You have an 
error in your SQL syntax; check the manual that corresponds to your OceanBase version for the right syntax to use | 10 | -+-----------+-----------+-----------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -| 127.0.0.1 | Completed | analyze_pack_20231008172144/127_0_0_1_/observer.log.20231008114410668 | -4009 | IO error | 20 | -+-----------+-----------+-----------------------------------------------------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+---------+ -For more details, please run cmd ' cat analyze_pack_20231008172144/result_details.txt ' -``` diff --git a/docs/analyze_sql.md b/docs/analyze_sql.md deleted file mode 100644 index e4d5a101..00000000 --- a/docs/analyze_sql.md +++ /dev/null @@ -1,31 +0,0 @@ -## analyze sql - -```bash -$ obdiag analyze sql [options] - -Options: - --host=HOST tenant connection host - --port=PORT tenant connection port - --password=PASSWORD tenant connection user password - --user=USER tenant connection user name - --from=FROM specify the start of the time range. format: 'yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. format: 'yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --level=LEVEL The alarm level, optional parameters [critical, warn, - notice, ok] - --output=OUTPUT The format of the output results, choices=[json, html] - --limit=LIMIT The limit on the number of data rows returned by - sql_audit for the tenant. - --store_dir=STORE_DIR - the dir to store result, current dir by default. - --elapsed_time=ELAPSED_TIME - The minimum threshold for filtering execution time, - measured in microseconds. 
- -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` diff --git a/docs/analyze_sql_review.md b/docs/analyze_sql_review.md deleted file mode 100644 index 4b560960..00000000 --- a/docs/analyze_sql_review.md +++ /dev/null @@ -1,20 +0,0 @@ -## analyze sql_review - -```bash -$ obdiag analyze sql_review [options] - -Options: - --host=HOST tenant connection host - --port=PORT tenant connection port - --password=PASSWORD tenant connection user password - --user=USER tenant connection user name - --files=FILES specify files - --level=LEVEL The alarm level, optional parameters [critical, warn, - notice, ok] - --output=OUTPUT The format of the output results, choices=[json, html] - --store_dir=STORE_DIR - the dir to store result, current dir by default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` diff --git a/docs/check.md b/docs/check.md deleted file mode 100644 index ffc00853..00000000 --- a/docs/check.md +++ /dev/null @@ -1,209 +0,0 @@ -## check命令 - -通过内部对一些已知问题的归纳分析,对用户现有集群进行数据采集分析。巡检模块的目的是依赖已有的案例提供多个检测项对用户集群进行分析,发现已存在或可能会导致集群出现异常问题的原因分析并提供运维建议。 - -## 注意 -在独立使用check能力,且未配置ocp的情况下,请务必完整配置config.yml内的OBCLUSTER、NODES信息 -巡检极为依赖配置信息 - -## 快速使用 - -```shell script -obdiag check -``` -### 关联动态可配参数: -```shell script ---cases={CasesName} - -CasesName是对需要执行的巡检项目的集合名,巡检集合保存在"~/.obdiag/{check_target}_check_package.yaml" ,每次仅能选择一个集合 -若未设定则默认执行所有的巡检项 - -{check_target}目前已支持的类型: -observer ->check_package.yaml -obproxy ->obproxy_check_package.yaml - - -Example: -obdiag check --cases= ad -obdiag check --obproxy_cases= proxy -obdiag check --cases=ad --obproxy_cases=proxy - ---report_type={ReportType} - -report_type是生成报告的格式,目前支持"table", "json", "xml"三种格式 -若未设定则默认为table - ---store_dir={StoreDir} - -store_dir是生成报告的存放路径,若未设定则默认为当前工作目录下的check_report文件夹内 - - -``` - -### 关联持久化参数: -持久化参数主要是部分日常不会修改的参数,依赖于conf/inner_config.yml - -若使用rpm方式进行安装,config.yml位于 -```shell script 
-/user/local/oceanbase-diagnostic-tool/conf/inner_config.yml -``` - -check功能所关联的配置项在"CHECK"下,基本上的参数均无需变更或更改频率较低 -```yaml script -check: - ignore_version: false - work_path: "~/.obdiag/check" - report: - report_path: "./check_report/" - export_type: table - package_file: "~/.obdiag/check/check_package.yaml" - tasks_base_path: "~/.obdiag/check/tasks/" -``` -ignore_version: 表示是否需要在执行巡检项时跳过版本匹配 -work_path: 巡检场景的存储目录 -report: 下主要是对报告的参数进行配置 -- report_path: 表示输出报告的路径 -- export_type: 表示输出报告的类型,目前支持table 、json 、xml后续需要支持的可以提交issue -package_file: 表示巡检项集合的保存路径 -tasks_base_path: 表示巡检项所保存的头路径,下面存储了不同check_target的巡检项目文件 - - - -## task编写教程 -task是一个独立的巡检场景,可以理解为一个专业的,用yaml编写的,用obdiag识别的脚本文件。 - -task会包含一些用于巡检的前置声明,用于实现对ob进行更为专业的巡检 -### 开始编写前 -编写前需要确定yaml需要放在哪 - -可以先进入conf.yml文件中设置CHECK.tasks_base_path所标识的目录里,看下分析下编写的巡检场景是否属于已有的大类,若没有就创建一个文件夹用于声明这个大类 - -例: - - -```ssh script -#先进入${CHECK.tasks_base_path} ,然后创建一个文件夹test,并创建我们的示例文件test.yaml(以observer为测试目标) -cd ~/.obdiag/check/tasks/observer -mkdir test -cd test -touch test.yaml -``` - -以上便完成了编写前的步骤 - - -### 开始编写 -开始编写就是开始编辑我们的test.yaml - -```yaml script -# 首先需要声明下这个场景的作用,为了让大家看得懂 - -info: "for test" -``` -简单的内容已经结束,开始复杂的编写,注意细节 - -#### task编写 - -task的作用是声明巡检执行的步骤,其基础结构是一个list - - - - -为什么task是一个list? 
-- 是为了兼容不同版本可能导致的步骤的出入、或者压根这个巡检项目没法有 - -task的一个元素的结构如下 - -| 参数名 | 是否必填 | | | | -|---------| --- | --- |----------------------------------------------------------| --- | -| version | 否 | 表示适用的版本,使用方式见下示例 | 用str的形式表示范围,需要完整的数字的版本号,3.x版本为三位,4.x版本为四位如:[3.1.1,3.2.0] | | -| steps | 是 | 所执行步骤 | 为list结构 | | - -如下就是一个示例 - -```yaml script -info: testinfo -task: - - version: "[3.1.0,3.2.4]" - steps: - {steps_object} - - version: [4.2.0.0,4.3.0.0] - steps: - {steps_object} - ``` -steps又是一个list,用来表示具体的多个执行流程 - -steps的一个元素的结构即单个流程,如下 - -| 参数名 | 是否必填 | | -|-----------|------|-------------------------------------------------------------------------------| -| type | 是 | 表示适用的执行类型,目前支持get_system_parameter/ssh/sql,后续会持续增加支持的类型 | -| {ssh/sql} | 是 | 根据所选的类型提供的参数,这块比较依赖代码里的对执行类型的逻辑说明,本章节后续会对支持的进行类型进行详细的使用说明 | -| result | 否 | 结构为一个单独的对象,用于对这个步骤结束后需要进行的操作进行解析,如校验结果逻辑,逻辑不通过时需要报错的文本信息进行说明等等。具体本章节后续会进行详细说明 | - -各种类型示例如下,"step:" 仅为一个标记,无实际作用 - -##### get_system_parameter -```yaml -step: - type: get_system_parameter - parameter_name: parameter - result: - set_value: servervm.max_map_count - - -``` -##### ssh -远程执行指令并获取对应的返回值 -```yaml -step: - type: ssh - ssh: wc -l /proc/${task_OBServer_pid}/maps | awk '{print $1}' - result: - set_value: observerMaps - -``` -##### sql -执行sql并获取对应的值 -```yaml -step: - type: sql - sql: select tenant_name from oceanbase.__all_tenant from where tenant_id=${taskTenantId}; - result: - set_value: tenant_name - -``` - - - -### result(verify功能) -这个字段也是verify功能的主要依赖字段,用于对task获取结果的验证 - -| 参数名 | 是否必填 | | | -|-------------|------|---------------------------------|------------------------------------------------------------------| -| set_value | 否 | 将执行后的值赋值,作为一个适用于整个task的变量 | 例如set_value: max_map_count | -| verify_type | 否 | 默认为base,一般需要和verify联动 | 用于设置验证的方式,base即为通过verify的表达式进行验证,true或false,同时提供了以下常见的判断类型,减少编写量 | -| verify | 否 | 服务于verify_type | 用于验证执行结果是否符合预期,若不符合,会输出errMsg部分的信息。 | -| report_type | 否 | 用于设置本步骤若出现verify为false需要执行的告警级别 | 默认告警级别为critical | -| err_msg | 否 | 
用于非正常执行时答应的日志,支持配置全局变量 | 在verify为false的时候所输出的msg建议配置了verify,就一定要配上err_msg | - -目前verify_type支持的类型,除了base外的类型仅适用于int类型。 - -between:判断set_value的值是否在verify提供的范围内; - -max:是否小于verify提供的值 - -min:是否大于min提供的值 - -equal:是否等于verify(兼容字符串或int,但是${set_value}和verify必须是同类型) - -base: -verify表达式会用于替换如下shell式子中的new_expr内进行执行验证,在编写verify时可以手动在本地进行逻辑验证 -``` -if ${new_expr}; then - echo "true" -else - echo "false" -fi -``` - diff --git a/docs/gather_admin.md b/docs/gather_admin.md deleted file mode 100644 index 1c639bd6..00000000 --- a/docs/gather_admin.md +++ /dev/null @@ -1,72 +0,0 @@ -## gather admin命令 - -通过ob_admin工具能解析clog和slog文件,并对所选时间范围内的clog和slog进行一键收集。 -``` -$ obdiag gather clog -h -Usage: obdiag gather clog [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --encrypt=ENCRYPT Whether the returned results need to be encrypted, - choices=[true, false] - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. 
- -``` - -执行结果: -```shell script -$ obdiag gather clog --from "2023-01-16 18:25:00" --to "2023-01-17 01:30:00" - -Gather clog Summary: -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+======================================================================+ -| 192.168.2.11 | Completed | 15.762K | 6 s | gather_pack_20230118002457/clog_192.168.2.11 _20230118002458.zip | -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -``` - -``` -$ obdiag gather slog -h -Usage: obdiag gather slog [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --encrypt=ENCRYPT Whether the returned results need to be encrypted, - choices=[true, false] - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. 
- -``` - -执行结果: -```shell script -$ obdiag gather slog --from "2023-01-16 18:25:00" --to "2023-01-17 01:30:00" - -Gather slog Summary: -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+======================================================================+ -| 192.168.2.11 | Completed | 15.762K | 6 s | gather_pack_20230118002457/slog_192.168.2.11 _20230118002458.zip | -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -``` \ No newline at end of file diff --git a/docs/gather_all.md b/docs/gather_all.md deleted file mode 100644 index 68f56644..00000000 --- a/docs/gather_all.md +++ /dev/null @@ -1,98 +0,0 @@ -## gather all命令 - -该命令用户收集集群的日志、observer所在主机的信息以及observer的堆栈信息 - -例子: -```shell script - $ obdiag gather all -h -Usage: obdiag gather all [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --scope=SCOPE log type constrains, choices=[observer, election, - rootservice, all] - --grep=GREP specify keywords constrain - --encrypt=ENCRYPT Whether the returned results need to be encrypted, - choices=[true, false] - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` - -例子: -```shell script - - $ obdiag gather all --from "2022-07-28 16:25:00" --to "2022-07-28 18:30:00" --encrypt true - -... -ZipFileInfo: -+---------------+-----------+ -| Node | LogSize | -+===============+===========+ -| 192.168.2.11 | 29.874M | -+---------------+-----------+ -... 
- -ZipFileInfo: -+----------------+-----------+ -| Node | LogSize | -+================+===========+ -| 192.168.2.12 | 143.229M | -+----------------+-----------+ -... - - -... -# 指定时间段内日志收集汇总 -Summary: -+----------------+-----------+----------+------------------+--------+------------------------------------------------------------------------------------+ -| Node | Status | Size | Password | Time | PackPath | -+================+===========+==========+==================+========+====================================================================================+ -| 192.168.2.11 | Completed | 29.874M | fB7FrrzTGl4EK5Hl | 20 s | gather_pack_20220729170718/result_192.168.2.11_20220729222724_20220730003224.zip | -+----------------+-----------+----------+------------------+--------+------------------------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 143.229M | SGRbXvMyA7lrnFW1 | 74 s | gather_pack_20220729170718/result_192.168.2.12_20220729222724_20220730003224.zip | -+----------------+-----------+----------+------------------+--------+------------------------------------------------------------------------------------+ - -... 
- -# observer所在主机当前时间的主机信息收集汇总 -Summary: -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+======================================================================+ -| 192.168.2.11 | Completed | 45.276K | 5 s | gather_pack_20220729170856/sysstat_192.168.2.11_20220729170856.zip | -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 42.152K | 6 s | gather_pack_20220729170856/sysstat_192.168.2.12_20220729170856.zip | -+----------------+-----------+---------+--------+----------------------------------------------------------------------+ - - -# observer当前的堆栈信息 -Summary: -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+=======================================================================+ -| 192.168.2.11 | Completed | 22.693K | 13 s | gather_pack_20220729170902/obstack2_192.168.2.11_20220729170902.zip | -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 19.902K | 13 s | gather_pack_20220729170902/obstack2_192.168.2.12_20220729170902.zip | -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ - -Gather Perf Summary: -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+==========+========+===================================================================+ -| 192.168.2.11 | Completed | 368.178K | 90 s | 
gather_pack_20230117140836/perf_192.168.2.11_20230117140836.zip | -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 368.178K | 90 s | gather_pack_20230117140836/perf_192.168.2.12_20230117140836.zip | -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ - - -``` diff --git a/docs/gather_ash.md b/docs/gather_ash.md deleted file mode 100644 index 3ca65416..00000000 --- a/docs/gather_ash.md +++ /dev/null @@ -1,50 +0,0 @@ -## gather ash命令 - -该命令用户收集性能报告报告 -``` -$obdiag gather ash -h -Usage: obdiag gather ash [options] - -Options: - --trace_id=TRACE_ID The TRACE.ID of the SQL to be sampled, if left blank - or filled with NULL, indicates that TRACE.ID is not - restricted. - --sql_id=SQL_ID The SQL.ID, if left blank or filled with NULL, - indicates that SQL.ID is not restricted. - --wait_class=WAIT_CLASS - Event types to be sampled. - --report_type=REPORT_TYPE - Report type, currently only supports text type. - --from=FROM specify the start of the time range. format: 'yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. format: 'yyyy-mm-dd - hh:mm:ss' - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. - - -``` - -例子: -```shell script -obdiag gather ash - -``` - -执行结果: -```buildoutcfg -gather_ash_report start ... 
-gather from_time: 2024-05-08 11:18:59, to_time: 2024-05-08 11:48:59 -from_time: 2024-05-08 11:18:59, to_time: 2024-05-08 11:48:59, sql_id: None, trace_id: None, report_type: TEXT, wait_class: None, store_dir: ./ -save ash report file name: ./gather_pack_20240508114859/ash_report_20240508114859.txt - -Gather ash_report results stored in this directory: ./gather_pack_20240508114859 - -Trace ID: e6af30b2-0ced-11ef-89ff-02420b9e4df1 -If you want to view detailed obdiag logs, please run: obdiag display-trace e6af30b2-0ced-11ef-89ff-02420b9e4df1 - -``` \ No newline at end of file diff --git a/docs/gather_awr.md b/docs/gather_awr.md deleted file mode 100644 index b688b0a9..00000000 --- a/docs/gather_awr.md +++ /dev/null @@ -1,43 +0,0 @@ -## gather awr命令 - -该命令用户收集性能报告报告 -``` -$ obdiag gather awr -h -Usage: obdiag gather awr [options] - -Options: - --cluster_name=CLUSTER_NAME - cluster_name from ocp - --cluster_id=CLUSTER_ID - cluster_id from ocp - --from=FROM specify the start of the time range. format: 'yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. format: 'yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. 
- -``` - -例子: -```shell script -obdiag gather awr --cluster_name obtest --cluster_id 1 - -``` - -执行结果: -```buildoutcfg -Gather AWR Summary: -+-----------+-----------+--------+--------+----------------------------------------------------------------------------------------+ -| Cluster | Status | Size | Time | PackPath | -+===========+===========+========+========+========================================================================================+ -| demo1 | Completed | 4.602M | 29 s | gather_pack_20220627005659/OBAWR_obcluster_demo1_20220625160100_20220625180100.html | -+-----------+-----------+--------+--------+----------------------------------------------------------------------------------------+ -``` \ No newline at end of file diff --git a/docs/gather_ob_log.md b/docs/gather_ob_log.md deleted file mode 100644 index cd02f445..00000000 --- a/docs/gather_ob_log.md +++ /dev/null @@ -1,60 +0,0 @@ -## gather log命令 -通过 gather log命令,可以指定时间范围的来去搜集目标主机上的OceanBase日志(后续会陆续开放除OceanBase运行日志外其他信息的搜集)。 -``` -$ obdiag gather log -h -Usage: obdiag gather log [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --scope=SCOPE log type constrains, choices=[observer, election, - rootservice, all] - --grep=GREP specify keywords constrain - --encrypt=ENCRYPT Whether the returned results need to be encrypted, - choices=[true, false] - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` - -例子: -```shell script -$ obdiag gather log --scope observer --from "2022-06-25 10:25:00" --to "2022-06-25 18:30:00" --grep STORAGE --encrypt true - -... 
-ZipFileInfo: -+----------------+-----------+ -| Node | LogSize | -+================+===========+ -| 192.168.2.11 | 36.184M | -+----------------+-----------+ -... - -ZipFileInfo: -+----------------+-----------+ -| Node | LogSize | -+================+===========+ -| 192.168.2.12 | 44.176M | -+----------------+-----------+ -... - -Gather Ob Log Summary: -+----------------+-----------+----------+------------------+--------+---------------------------------------------------------------------+ -| Node | Status | Size | Password | Time | PackPath | -+================+===========+==========+==================+========+=====================================================================+ -| 192.168.2.11 | Completed | 36.762M | **************** | 19 s | gather_pack_20220701183246/ob_log_192.168.2.11_20220701183247.zip | -+----------------+-----------+----------+------------------+--------+---------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 638.200M | **************** | 718 s | gather_pack_20220701183246/ob_log_192.168.2.12_20220701183918.zip | -+----------------+-----------+----------+------------------+--------+---------------------------------------------------------------------+ - -``` -注意:如果选择加密模式,对于收集到的日志压缩的时候进行了加密处理,Password是zip包解压缩的密码。默认是不加密的模式。 - diff --git a/docs/gather_ob_stack.md b/docs/gather_ob_stack.md deleted file mode 100644 index ebc0ddbb..00000000 --- a/docs/gather_ob_stack.md +++ /dev/null @@ -1,23 +0,0 @@ -## gather stack命令 - -收集observer的堆栈信息 -``` -$ obdiag gather stack [-h] - -Example: obdiag gather stack -``` - -执行结果 -```shell script -Example: obdiag gather stack - -Summary: -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+=======================================================================+ -| 192.168.2.11 | Completed | 19.926K | 10 
s | gather_pack_20220729163951/obstack2_192.168.2.11_20220729163951.zip | -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 22.803K | 12 s | gather_pack_20220729163951/obstack2_192.168.2.11_20220729163951.zip | -+----------------+-----------+---------+--------+-----------------------------------------------------------------------+ - -``` \ No newline at end of file diff --git a/docs/gather_obproxy_log.md b/docs/gather_obproxy_log.md deleted file mode 100644 index 76ea23ec..00000000 --- a/docs/gather_obproxy_log.md +++ /dev/null @@ -1,60 +0,0 @@ -## gather obproxy_log命令 -通过 gather obproxy_log命令,可以指定时间范围的来去搜集目标主机上的ObProxy日志。 -``` -$ obdiag gather obproxy_log -h -Usage: obdiag gather obproxy_log [options] - -Options: - --from=FROM specify the start of the time range. 'format: yyyy-mm- - dd hh:mm:ss' - --to=TO specify the end of the time range. 'format: yyyy-mm-dd - hh:mm:ss' - --since=SINCE Specify time range that from 'n' [d]ays, 'n' [h]ours - or 'n' [m]inutes. before to now. format: . - example: 1h. - --scope=SCOPE log type constrains, choices=[observer, election, - rootservice, all] - --grep=GREP specify keywords constrain - --encrypt=ENCRYPT Whether the returned results need to be encrypted, - choices=[true, false] - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` - -例子: -```shell script -$ obdiag gather obproxy_log --scope obproxy --from "2022-06-25 10:25:00" --to "2022-06-25 18:30:00" --encrypt true - -... -ZipFileInfo: -+----------------+-----------+ -| Node | LogSize | -+================+===========+ -| 192.168.2.11 | 36.184M | -+----------------+-----------+ -... 
- -ZipFileInfo: -+----------------+-----------+ -| Node | LogSize | -+================+===========+ -| 192.168.2.12 | 44.176M | -+----------------+-----------+ -... - -Gather ObProxy Log Summary: -+----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+ -| Node | Status | Size | Password | Time | PackPath | -+================+===========+==========+==================+========+==========================================================================+ -| 192.168.2.11 | Completed | 36.762M | **************** | 19 s | gather_pack_20220701183246/obproxy_log_192.168.2.11_20220701183247.zip | -+----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 638.200M | **************** | 718 s | gather_pack_20220701183246/obproxy_log_192.168.2.12_20220701183918.zip | -+----------------+-----------+----------+------------------+--------+--------------------------------------------------------------------------+ - -``` -注意:如果选择加密模式,对于收集到的日志压缩的时候进行了加密处理,Password是zip包解压缩的密码。默认是不加密的模式。 - diff --git a/docs/gather_perf.md b/docs/gather_perf.md deleted file mode 100644 index bbd217d1..00000000 --- a/docs/gather_perf.md +++ /dev/null @@ -1,47 +0,0 @@ -## gather perf命令 - -- 支持一键获取observer进程的"扁鹊图" ,ob运行态的调用关系,用"sample"代指 -- 支持一键获取observer进程的"perf 火焰图", 用"flame"代指 - -``` -$ obdiag gather perf [-h] - -Example: obdiag gather perf --scope all -``` - -执行结果 -```shell script -Example: obdiag gather perf --scope all - -Gather Perf Summary: -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+==========+========+===================================================================+ -| 192.168.2.11 | Completed | 368.178K | 90 s | 
gather_pack_20230117140836/perf_192.168.2.11_20230117140836.zip | -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 368.178K | 90 s | gather_pack_20230117140836/perf_192.168.2.12_20230117140836.zip | -+----------------+-----------+----------+--------+-------------------------------------------------------------------+ - -``` -scope 可选项: ["sample", "flame", "all"] -- sample:表示采集扁鹊图 -- flame: 表示采集ob的Perf火焰图 -- all: 表示扁鹊图、ob的perf火焰图,默认值 - -## 查看结果 -收集到的数据是通过perf工具进行采集的,可以通过Flame Graph中的工具对数据进行图形转化 - -### 图形转化步骤 - -1. 解压收集到的数据 -解压采集到的文件,例如解压:perf_192.168.2.11_20230117140836.zip,解压之后有如下文件 -```shell script -flame.viz flame.data sample.data sample.viz -``` -对应关系如下: -- flame.viz(perf 火焰图) -- sample.viz(扁鹊图) - - - - diff --git a/docs/gather_scene.md b/docs/gather_scene.md deleted file mode 100644 index 21c9adae..00000000 --- a/docs/gather_scene.md +++ /dev/null @@ -1,177 +0,0 @@ -## gather scenes 命令 - -该命令可以一键执行将某些问题场景所需要的排查信息统一捞回,解决分布式节点信息捞取难的通点 - -## 查看当前支持的场景 - -```shell script -obdiag gather scene list -``` - -```bash -obdiag gather scene list - -[Other Problem Gather Scenes]: ------------------------------------------------------------------------------------------- -command info_en info_cn ------------------------------------------------------------------------------------------- -obdiag gather scene run --scene=other.application_error [application error] [应用报错问题] ------------------------------------------------------------------------------------------- - -[Obproxy Problem Gather Scenes]: ----------------------------------------------------------------------------------- -command info_en info_cn ----------------------------------------------------------------------------------- -obdiag gather scene run --scene=obproxy.restart [obproxy restart] [obproxy无故重启] ----------------------------------------------------------------------------------- - -[Observer Problem Gather 
Scenes]: ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -command info_en info_cn ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -obdiag gather scene run --scene=observer.backup [backup problem] [数据备份问题] -obdiag gather scene run --scene=observer.backup_clean [backup clean] [备份清理问题] -obdiag gather scene run --scene=observer.clog_disk_full [clog disk full] [clog盘满] -obdiag gather scene run --scene=observer.compaction [compaction] [合并问题] -obdiag gather scene run --scene=observer.cpu_high [High CPU] [CPU高] -obdiag gather scene run --scene=observer.delay_of_primary_and_backup [delay of primary and backup] [主备库延迟] -obdiag gather scene run --scene=observer.log_archive [log archive] [日志归档问题] -obdiag gather scene run --scene=observer.long_transaction [long transaction] [长事务] -obdiag gather scene run --scene=observer.memory [memory problem] [内存问题] -obdiag gather scene run --scene=observer.perf_sql --env "{db_connect: '-hxx -Pxx -uxx -pxx -Dxx', trace_id: 'xx'}" [SQL performance problem] [SQL性能问题] -obdiag gather scene run --scene=observer.recovery [recovery] [数据恢复问题] -obdiag gather scene run --scene=observer.restart [restart] [observer无故重启] -obdiag gather scene run --scene=observer.rootservice_switch [rootservice switch] [有主改选或者无主选举的切主] -obdiag gather scene run --scene=observer.sql_err --env "{db_connect: '-hxx -Pxx -uxx -pxx -Dxx', trace_id: 'xx'}" [SQL execution error] [SQL 执行出错] -obdiag gather scene run --scene=observer.suspend_transaction [suspend transaction] [悬挂事务] -obdiag gather scene run --scene=observer.unit_data_imbalance [unit data imbalance] [unit迁移/缩小 副本不均衡问题] -obdiag gather scene run --scene=observer.unknown [unknown problem] [未能明确问题的场景] 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -``` - -## 快速使用 - -```shell script -obdiag gather scene run --scene={SceneName} -``` -### 关联动态可配参数: -```shell script ---scene={SceneName} - -SceneName是对需要执行收集的场景 - - -Example1: -obdiag gather scene run --scene=observer.unknown -``` - - -## task编写教程 -一个task表示一个独立的场景,可以理解为一个专业的,用yaml编写的,用obdiag识别的脚本文件。 - -### 开始编写前 -编写前需要确定yaml需要放在哪 - -可以先进入~/.obdiag/inner_config.yml文件中设置 gather.scenes_base_path 所标识的目录里,看下编写的采集场景是否属于已有的大类,若没有就创建一个文件夹用于声明这个大类 - -例: - - -```ssh script -#先进入${gather.scenes_base_path} ,并创建我们的示例文件test.yaml(以observer为测试目标) -cd ~/.obdiag/gather/tasks/observer -touch test.yaml -``` - -以上便完成了编写前的步骤 - - -### 开始编写 -开始编写就是开始编辑我们的test.yaml - -```yaml script -# 首先需要声明下这个场景的作用,为了让大家看得懂 - -info: "for test" -``` -简单的内容已经结束,开始复杂的编写,注意细节 - -#### task编写 - -task的作用是声明场景采集执行的步骤,其基础结构是一个list - - - - -为什么task是一个list? -- 是为了兼容不同版本可能导致的步骤的不同 - -task的一个元素的结构如下 - -| 参数名 | 是否必填 | | | | -|---------| --- | --- |----------------------------------------------------------| --- | -| version | 否 | 表示适用的版本,使用方式见下示例 | 用str的形式表示范围,需要完整的数字的版本号,3.x版本为三位,4.x版本为四位如:[3.1.1,3.2.0],版本支持遵循左开又闭的原则 | | -| steps | 是 | 所执行步骤 | 为list结构 | | - -如下就是一个示例 - -```yaml script -info: testinfo -task: - - version: "[3.1.0,3.2.4]" - steps: - {steps_object} - - version: [4.2.0.0,4.3.0.0] - steps: - {steps_object} - ``` -steps又是一个list,用来表示具体的多个执行流程 - -steps的一个元素的结构即单个流程,如下 - -| 参数名 | 是否必填 | | -|-----------|------|-------------------------------------------------------------------------------| -| type | 是 | 表示适用的执行类型,目前支持 ssh/sql/log/obproxy_log/sysstat, 后续会持续增加支持的类型 | -| {ssh/sql/log/obproxy_log/sysstat} | 是 | 根据所选的类型提供的参数,这块比较依赖代码里的对执行类型的逻辑说明,本章节后续会对支持的进行类型进行详细的使用说明 | -各种类型示例如下,"step:" 仅为一个标记,无实际作用 - - -##### ssh -远程执行指令并获取对应的返回值 -```yaml -step: - type: ssh - ssh: wc -l /proc/${task_OBServer_pid}/maps | awk '{print $1}' - -``` -##### sql 
-执行sql并获取对应的值 -```yaml -step: - type: sql - sql: select tenant_name from oceanbase.__all_tenant from where tenant_id=${taskTenantId}; -``` - -##### log -收集observer的日志 -```yaml -step: - type: log - grep: "" # 过滤字段 -``` - -##### obproxy_log -收集 obproxy 的日志 -```yaml -step: - type: obproxy_log - grep: "" # 过滤字段 -``` - -##### sysstat -收集主机的信息 -```yaml -step: - type: sysstat - sysstat: "" -``` \ No newline at end of file diff --git a/docs/gather_sql_plan_monitor.md b/docs/gather_sql_plan_monitor.md deleted file mode 100644 index 1bed755a..00000000 --- a/docs/gather_sql_plan_monitor.md +++ /dev/null @@ -1,16 +0,0 @@ -## gather plan_monitor命令 -```shell script -$ obdiag gather plan_monitor -h -Usage: obdiag gather plan_monitor [options] - -Options: - --trace_id=TRACE_ID sql trace id - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - --env=ENV env, eg: "{env1=xxx, env2=xxx}" - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. -``` - diff --git a/docs/gather_sysstat.md b/docs/gather_sysstat.md deleted file mode 100644 index 9a831975..00000000 --- a/docs/gather_sysstat.md +++ /dev/null @@ -1,31 +0,0 @@ -## gather sysstat命令 - -收集主机dmesg信息、主机cpu\内存信息 -``` -$ obdiag gather sysstat -h -Usage: obdiag gather sysstat [options] - -Options: - --store_dir=STORE_DIR - the dir to store gather result, current dir by - default. - -c C obdiag custom config - -h, --help Show help and exit. - -v, --verbose Activate verbose output. 
-``` - -执行结果: -```buildoutcfg -Example: obdiag gather sysstat - -结果: - -Summary: -+----------------+-----------+---------+--------+------------------------------------------------------------------------+ -| Node | Status | Size | Time | PackPath | -+================+===========+=========+========+========================================================================+ -| 192.168.2.11 | Completed | 45.209K | 5 s | gather_pack_20220729164233/sysstat_192.168.2.11_20220729164233.zip | -+----------------+-----------+---------+--------+------------------------------------------------------------------------+ -| 192.168.2.12 | Completed | 42.170K | 5 s | gather_pack_20220729164233/sysstat_192.168.2.12_20220729164233.zip | -+----------------+-----------+---------+--------+------------------------------------------------------------------------+ -``` \ No newline at end of file diff --git a/docs/rca.md b/docs/rca.md deleted file mode 100644 index 2a9c5435..00000000 --- a/docs/rca.md +++ /dev/null @@ -1,30 +0,0 @@ -## rca命令 - -通过内部对一些已知问题的归纳分析,通过对 - - -## 快速使用 - -```shell script -obdiag rca run --scene={scene_name} -obdiag rca list -``` -### 关联动态可配参数: -```shell script -scene_name是需要执行的根因分析场景的名称,可以通过obdiag rca list获取 - -``` - -### 关联持久化参数: -持久化参数主要是部分日常不会修改的参数,依赖于{obdiag安装目录}/conf/inner_config.yml - -若使用rpm方式进行安装,inner_config.yml位于 -```shell script -/user/local/oceanbase-diagnostic-tool/conf/config.yml -``` - -rca功能所关联的配置项在"rca"下,基本上的参数均无需变更或更改频率较低 -```yaml script -rca: - result_path: "./obdiag_rca/" # rca报告保存的地址 -``` diff --git a/plugins/check/tasks/observer/cluster/datafile_next.yaml b/plugins/check/tasks/observer/cluster/datafile_next.yaml new file mode 100644 index 00000000..d15ca6ea --- /dev/null +++ b/plugins/check/tasks/observer/cluster/datafile_next.yaml @@ -0,0 +1,14 @@ +info: "Check node's parameter 'datafile_maxsize'. When the datafile_maxsize is set and is greater than datafile_size, check if datafile_next is 0.
If this value is 0, then the data file will not grow. issue #573" +task: + - version: "[4.0.0.0,*]" + steps: + - type: sql + sql: "select value from oceanbase.gv$ob_parameters where name = 'datafile_next' and svr_ip = '#{remote_ip}' and svr_port = '#{remote_port}';" + result: + set_value: datafile_next + report_type: warning + verify_type: min + verify: 0 + err_msg: "node: #{remote_ip} datafile_next is 0, the data file will not grow. More info: https://github.com/oceanbase/obdiag/issues/573" + + diff --git a/plugins/check/tasks/observer/cluster/major.yaml b/plugins/check/tasks/observer/cluster/major.yaml index 6439301c..e5341e49 100644 --- a/plugins/check/tasks/observer/cluster/major.yaml +++ b/plugins/check/tasks/observer/cluster/major.yaml @@ -20,7 +20,7 @@ FROM ( set_value: major_hold_nu verify_type: equal verify: 0 - err_msg: 'major have hold' + err_msg: 'major have hold. please check it. And you can execute "obdiag rca run --scene=major_hold" to check it.' diff --git a/plugins/check/tasks/observer/err_code/find_err_4108.yaml b/plugins/check/tasks/observer/err_code/find_err_4108.yaml new file mode 100644 index 00000000..ed753d44 --- /dev/null +++ b/plugins/check/tasks/observer/err_code/find_err_4108.yaml @@ -0,0 +1,27 @@ +info: 'Check whether Error 4108 is reported when enable_sql_audit is set to True.' +task: + - version: "[4.0.0.0,*]" + steps: + - type: sql + sql: 'select count(0) from oceanbase.GV$OB_PARAMETERS where NAME="enable_sql_audit" and VALUE<>"True" ;' + result: + set_value: sql_audit + # report_type: warning + verify: "[ '0' == ${sql_audit} ]" + err_msg: 'Unable to proceed because enable_sql_audit is set to False' + - type: sql + sql: 'select count(0) from oceanbase.GV$OB_SQL_AUDIT where RET_CODE ="-4108";' + result: + set_value: err_4108 + verify_type: equal + verify: 0 + err_msg: 'number of sql_error_4108 is #{err_4108}. maybe the disk is damaged.' 
+ - type: ssh + ssh: "grep \"checksum error\" #{remote_home_path}/log/observer.log | awk '{print $1}'" + result: + set_value: checksum_error_log + verify: '[ -z "$checksum_error_log" ]' + err_msg: "node #{remote_ip} has checksum error log in #{remote_home_path}/log/observer.log . maybe the disk is damaged. please check it." + + + From 084e9b07e90382f15c6051b1e7e76eed406b075c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 20 Dec 2024 11:29:21 +0800 Subject: [PATCH 08/10] update log --- src/handler/gather/gather_component_log.py | 11 ++++++----- src/handler/rca/rca_handler.py | 4 +++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index ea9731b1..81e7cf03 100644 --- a/src/handler/gather/gather_component_log.py +++ b/src/handler/gather/gather_component_log.py @@ -230,11 +230,11 @@ def handle(self): new_context.stdio = self.stdio.sub_io() # use Process must delete ssh_client, and GatherLogOnNode will rebuild it. 
if "ssh_client" in node or "ssher" in node: - clear_node = copy.deepcopy(node) - if "ssh_client" in node: - del clear_node["ssh_client"] - if "ssher" in node: - del clear_node["ssher"] + clear_node = {} + for node_param in node: + if node_param == "ssh_client" or node_param == "ssher": + continue + clear_node[node_param] = node[node_param] tasks.append(GatherLogOnNode(new_context, clear_node, self.gather_log_conf_dict, semaphore)) else: tasks.append(GatherLogOnNode(new_context, node, self.gather_log_conf_dict, semaphore)) @@ -255,6 +255,7 @@ def handle(self): with open(os.path.join(self.store_dir, "result_summary.txt"), 'a', encoding='utf-8') as fileobj: fileobj.write(summary_tuples.get_string()) except Exception as e: + self.stdio.exception(e) self.stdio.verbose("gather log error: {0}".format(e)) finally: self.stdio.stop_loading("succeed") diff --git a/src/handler/rca/rca_handler.py b/src/handler/rca/rca_handler.py index 445109a2..240cf581 100644 --- a/src/handler/rca/rca_handler.py +++ b/src/handler/rca/rca_handler.py @@ -291,9 +291,11 @@ def __init__(self, context): self.scene = Util.get_option(self.context.options, "scene") self.version = "unknown" try: - if self.context.get_variable("ob_cluster").get("db_host") is not None or len(self.context.cluster_config.get("servers")) > 0: + ob_cluster = self.context.get_variable("ob_cluster") + if ob_cluster is not None and ob_cluster.get("db_host") is not None and len(self.context.cluster_config.get("servers")) > 0: self.version = get_version_by_type(self.context, "observer") except Exception as e: + self.stdio.exception(e) self.stdio.verbose("rca get obcluster version fail. Maybe the scene need not it, skip it. Exception: {0}".format(e)) self.stdio.warn("rca get obcluster version fail. 
if the scene need not it, skip it") From 14621753ddea798d79a17762070556ca4c3f129a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 20 Dec 2024 11:40:15 +0800 Subject: [PATCH 09/10] update log --- src/handler/gather/gather_component_log.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index 81e7cf03..c6ba2c35 100644 --- a/src/handler/gather/gather_component_log.py +++ b/src/handler/gather/gather_component_log.py @@ -256,12 +256,17 @@ def handle(self): fileobj.write(summary_tuples.get_string()) except Exception as e: self.stdio.exception(e) + self.stdio.verbose("gather log error: {0}".format(e)) finally: self.stdio.stop_loading("succeed") - last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) - self.stdio.print(last_info) + if os.path.exists(os.path.join(self.store_dir, "result_summary.txt")): + last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) + self.stdio.print(last_info) + else: + self.stdio.print("No log file is gathered, please check the gather log config") + return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed,please check the gather log config or check obdiag log") if self.redact and len(self.redact) > 0: self.stdio.start_loading("gather redact start") try: From 9b556b6b8ea8c50e5e1c531cdd91185814ecdc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=A0=E7=A3=8A?= Date: Fri, 20 Dec 2024 11:43:03 +0800 Subject: [PATCH 10/10] update log --- src/handler/gather/gather_component_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/handler/gather/gather_component_log.py b/src/handler/gather/gather_component_log.py index b2073584..3a87959d 100644 --- a/src/handler/gather/gather_component_log.py +++ 
b/src/handler/gather/gather_component_log.py @@ -264,7 +264,7 @@ def handle(self): last_info = "For result details, please run cmd \033[32m' cat {0} '\033[0m\n".format(os.path.join(self.store_dir, "result_summary.txt")) self.stdio.print(last_info) else: - self.stdio.print("No log file is gathered, please check the gather log config") + self.stdio.warn("No log file is gathered, please check the gather log config") return ObdiagResult(ObdiagResult.SERVER_ERROR_CODE, error_data="gather log failed,please check the gather log config or check obdiag log") if self.redact and len(self.redact) > 0: self.stdio.start_loading("gather redact start")