diff --git a/.gitignore b/.gitignore index 43bf1857bf..283192a180 100644 --- a/.gitignore +++ b/.gitignore @@ -82,9 +82,5 @@ MANIFEST docs/docs_generator/bin/ # Bazel -bazel-bin -bazel-out -bazel-testlogs -bazel-weave -bazel-deploy-agent +bazel-* MODULE.bazel* diff --git a/deploy-agent/README.md b/deploy-agent/README.md index 75cf171f0a..d99705e957 100644 --- a/deploy-agent/README.md +++ b/deploy-agent/README.md @@ -5,6 +5,7 @@ See https://github.com/pinterest/teletraan/wiki for more details. 1. Install [pre-commit](https://pre-commit.com/#install) ```bash +cd teletraan pip install pre-commit pre-commit install ``` @@ -15,6 +16,7 @@ Ensure that your python version is at least python3.8. ## Building ```bash +cd teletraan/deploy-agent/ sudo bazel build //deployd:deploy-agent ``` diff --git a/deploy-agent/deployd/__init__.py b/deploy-agent/deployd/__init__.py index 90d00dfb3a..b9b20ad3d8 100644 --- a/deploy-agent/deployd/__init__.py +++ b/deploy-agent/deployd/__init__.py @@ -27,4 +27,4 @@ # 2: puppet applied successfully with changes PUPPET_SUCCESS_EXIT_CODES = [0, 2] -__version__ = '1.2.65' +__version__ = '1.2.66' diff --git a/deploy-agent/deployd/client/client.py b/deploy-agent/deployd/client/client.py index 549833608c..f86e0e5383 100644 --- a/deploy-agent/deployd/client/client.py +++ b/deploy-agent/deployd/client/client.py @@ -19,6 +19,9 @@ import socket import traceback import json +from pathlib import Path +import re +import subprocess from deployd.client.base_client import BaseClient from deployd.client.restfulclient import RestfulClient @@ -32,6 +35,12 @@ log = logging.getLogger(__name__) +NORMANDIE_CERT_FILEPATH = "/var/lib/normandie/fuse/cert/generic" +SAN_URI_PATTERN = r"URI:(\S+),?" 
+STATUSERRNO_PATTERN = r"StatusErrno=(\d+)"
+ACTIVESTATE_PATTERN = r"ActiveState=(\S+)"
+SUBSTATE_PATTERN = r"SubState=(\S+)"
+
 
 class Client(BaseClient):
     def __init__(self, config=None, hostname=None, ip=None, hostgroup=None,
@@ -51,6 +60,8 @@ def __init__(self, config=None, hostname=None, ip=None, hostgroup=None,
         # keep trying to fetch it from facter every time
         self._stage_type_fetched = False
         self._account_id = None
+        self._normandie_status = None
+        self._knox_status = None
 
     def _read_host_info(self) -> bool:
         if self._use_facter:
@@ -196,10 +207,13 @@ def _read_host_info(self) -> bool:
                 info = json.loads(ec2_metadata)
                 self._account_id = info.get('AccountId', None)
 
+        self._normandie_status = self.get_normandie_status()
+        self._knox_status = self.get_knox_status()
+
         log.info("Host information is loaded. "
                  "Host name: {}, IP: {}, host id: {}, agent_version={}, autoscaling_group: {}, "
-                 "availability_zone: {}, ec2_tags: {}, stage_type: {}, group: {}, account id: {}".format(self._hostname, self._ip, self._id,
-                 self._agent_version, self._autoscaling_group, self._availability_zone, self._ec2_tags, self._stage_type, self._hostgroup, self._account_id))
+                 "availability_zone: {}, ec2_tags: {}, stage_type: {}, group: {}, account id: {}, normandie_status: {}, knox_status: {}".format(self._hostname, self._ip, self._id,
+                 self._agent_version, self._autoscaling_group, self._availability_zone, self._ec2_tags, self._stage_type, self._hostgroup, self._account_id, self._normandie_status, self._knox_status))
 
         if not self._availability_zone:
             log.error("Fail to read host info: availablity zone")
@@ -209,6 +223,78 @@ def _read_host_info(self) -> bool:
 
         return True
 
+    def get_normandie_status(self) -> Optional[str]:
+        """Report whether the Normandie certificate exposes a URI SAN.
+
+        Dumps the certificate at NORMANDIE_CERT_FILEPATH with ``openssl x509``
+        and searches the output with SAN_URI_PATTERN.
+
+        Returns:
+            'OK' when a URI entry is found; 'ERROR' when openssl cannot be
+            started, exits non-zero, or prints no URI entry.
+        """
+        path = Path(NORMANDIE_CERT_FILEPATH)
+        cmd = [
+            "openssl",
+            "x509",
+            "-in",
+            path.as_posix(),
+            "-noout",
+            "-text",
+            "-certopt",
+            "no_subject,no_header,no_version,no_serial,no_signame,no_validity,no_issuer,no_pubkey,no_sigdump,no_aux",
+        ]
+        try:
+            cert = subprocess.check_output(cmd).decode("utf-8")
+        except (OSError, subprocess.CalledProcessError) as e:
+            # OSError: e.g. the openssl binary is missing or not executable.
+            log.exception(f"failed to get spiffe id from normandie: {e}")
+            return 'ERROR'
+
+        # re.search returns None when the output has no URI entry.
+        matcher = re.search(SAN_URI_PATTERN, cert)
+        if matcher is not None and matcher.group(1):
+            return 'OK'
+        return 'ERROR'
+
+    def get_knox_status(self) -> Optional[str]:
+        """Report whether systemd shows the knox unit as active and running.
+
+        Returns:
+            "OK" only when ``systemctl show knox`` reports StatusErrno=0,
+            ActiveState=active and SubState=running; "ERROR" when systemctl
+            cannot be started, exits non-zero, or any of those properties is
+            missing or has another value.
+        """
+        cmd = [
+            "systemctl",
+            "show",
+            "knox",
+            "--property=Result",
+            "--property=StatusErrno",
+            "--property=ActiveState",
+            "--property=SubState"
+        ]
+        try:
+            status = subprocess.check_output(cmd).decode("utf-8")
+        except (OSError, subprocess.CalledProcessError) as e:
+            # OSError: e.g. systemctl is not installed on this host.
+            log.exception(f"failed to get knox service status from systemctl: {e}")
+            return 'ERROR'
+
+        # Match each property on its own so the order of the output lines does
+        # not matter; a property absent from the output counts as unhealthy.
+        expected = (
+            (STATUSERRNO_PATTERN, "0"),
+            (ACTIVESTATE_PATTERN, "active"),
+            (SUBSTATE_PATTERN, "running"),
+        )
+        for pattern, wanted in expected:
+            matcher = re.search(pattern, status)
+            if matcher is None or matcher.group(1) != wanted:
+                return "ERROR"
+        return "OK"
+
     def send_reports(self, env_reports=None) -> Optional[PingResponse]:
         try:
             if self._read_host_info():
@@ -229,7 +315,9 @@ def send_reports(self, env_reports=None) -> Optional[PingResponse]:
                                            availabilityZone=self._availability_zone,
                                            ec2Tags=self._ec2_tags,
                                            stageType=self._stage_type,
-                                           accountId=self._account_id)
+                                           accountId=self._account_id,
+                                           normandieStatus=self._normandie_status,
+                                           knoxStatus=self._knox_status)
 
                 with create_stats_timer('deploy.agent.request.latency',
                                         tags={'host': self._hostname}):
diff --git a/deploy-agent/deployd/types/ping_request.py b/deploy-agent/deployd/types/ping_request.py
index 88651b1434..76206cdaae 100644
--- a/deploy-agent/deployd/types/ping_request.py
+++ b/deploy-agent/deployd/types/ping_request.py
@@ -3,9 +3,9 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 #
you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,8 @@ class PingRequest(object): def __init__(self, hostId=None, hostName=None, hostIp=None, groups=None, reports=None, - agentVersion=None, autoscalingGroup=None, availabilityZone=None, ec2Tags=None, stageType=None, accountId=None): + agentVersion=None, autoscalingGroup=None, availabilityZone=None, ec2Tags=None, stageType=None, + accountId=None, normandieStatus=None, knoxStatus=None): self.hostId = hostId self.hostName = hostName self.hostIp = hostIp @@ -32,6 +33,8 @@ def __init__(self, hostId=None, hostName=None, hostIp=None, groups=None, reports self.ec2Tags = ec2Tags self.stageType = stageType self.accountId = accountId + self.normandieStatus = normandieStatus + self.knoxStatus = knoxStatus def to_json(self): ping_requests = {} @@ -52,6 +55,10 @@ def to_json(self): ping_requests["accountId"] = self.accountId if self.ec2Tags: ping_requests["ec2Tags"] = self.ec2Tags + if self.normandieStatus: + ping_requests["normandieStatus"] = self.normandieStatus + if self.knoxStatus: + ping_requests["knoxStatus"] = self.knoxStatus ping_requests["reports"] = [] for report in self.reports: @@ -76,16 +83,16 @@ def to_json(self): ping_report["deployAlias"] = report.deployAlias ping_report["containerHealthStatus"] = report.containerHealthStatus ping_report["agentState"] = report.state - + if report.extraInfo: ping_report["extraInfo"] = \ json.dumps(report.extraInfo, ensure_ascii=False).encode('utf8') - + ping_requests["reports"].append(ping_report) return ping_requests def __str__(self): return "PingRequest(hostId={}, hostName={}, hostIp={}, agentVersion={}, autoscalingGroup={}, " \ - 
"availabilityZone={}, ec2Tags={}, stageType={}, groups={}, accountId={}, reports={})".format(self.hostId, self.hostName, + "availabilityZone={}, ec2Tags={}, stageType={}, groups={}, accountId={}, normandieStatus={}, knoxStatus={}, reports={})".format(self.hostId, self.hostName, self.hostIp, self.agentVersion, self.autoscalingGroup, self.availabilityZone, self.ec2Tags, self.stageType, - self.groups, self.accountId, ",".join(str(v) for v in self.reports)) + self.groups, self.accountId, self.normandieStatus, self.knoxStatus, ",".join(str(v) for v in self.reports)) diff --git a/deploy-agent/tests/unit/deploy/client/test_client.py b/deploy-agent/tests/unit/deploy/client/test_client.py index 7dd5052723..13bf6996a2 100644 --- a/deploy-agent/tests/unit/deploy/client/test_client.py +++ b/deploy-agent/tests/unit/deploy/client/test_client.py @@ -23,6 +23,30 @@ def test_read_host_info(self): self.assertIsNotNone(client._ip) self.assertTrue(return_value) + def test_read_host_info_normandie(self): + client = Client(config=Config()) + client._ec2_tags = {} + client._availability_zone = "us-east-1" + return_value: bool = client._read_host_info() + self.assertTrue(return_value) + + # On a host with normandie, the normandie status should be set to OK + # On a host without, such as build agents, the normandie status should be ERROR + self.assertIsNotNone(client._normandie_status) + self.assertTrue(client._normandie_status == "OK" or client._normandie_status == "ERROR") + + def test_read_host_info_knox(self): + client = Client(config=Config()) + client._ec2_tags = {} + client._availability_zone = "us-east-1" + return_value: bool = client._read_host_info() + self.assertTrue(return_value) + + # On a host with knox, the knox status should be set to OK + # On a host without, such as build agents, the knox status should be ERROR + self.assertIsNotNone(client._knox_status) + self.assertTrue(client._knox_status == "OK" or client._knox_status == "ERROR") + def 
test_read_host_info_no_ec2_tags_provided(self): client = Client(config=Config()) with self.assertRaises(AttributeError):