Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CDP-8328: Emit normandie and knox statuses inside pings #1771

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,5 @@ MANIFEST
docs/docs_generator/bin/

# Bazel
bazel-bin
bazel-out
bazel-testlogs
bazel-weave
bazel-deploy-agent
bazel-*
MODULE.bazel*
2 changes: 2 additions & 0 deletions deploy-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ See https://github.com/pinterest/teletraan/wiki for more details.

1. Install [pre-commit](https://pre-commit.com/#install)
```bash
cd teletraan
pip install pre-commit
pre-commit install
```
Expand All @@ -15,6 +16,7 @@ Ensure that your python version is at least python3.8.

## Building
```bash
cd teletraan/deploy-agent/
sudo bazel build //deployd:deploy-agent
```

Expand Down
98 changes: 95 additions & 3 deletions deploy-agent/deployd/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import socket
import traceback
import json
from pathlib import Path
import re
import subprocess

from deployd.client.base_client import BaseClient
from deployd.client.restfulclient import RestfulClient
Expand All @@ -32,6 +35,12 @@

log = logging.getLogger(__name__)

# Certificate file that get_normandie_status() hands to `openssl x509 -in`.
NORMANDIE_CERT_FILEPATH = "/var/lib/normandie/fuse/cert/generic"
# Captures the first "URI:<value>" entry in the openssl text dump.
# NOTE(review): `\S+` is greedy, so the captured value may include a trailing
# comma; callers here only test for presence, so this does not affect the status.
SAN_URI_PATTERN = r"URI:(\S+),?"
# The three patterns below each capture one property value from the
# `systemctl show` output parsed by get_knox_status().
STATUSERRNO_PATTERN = r"StatusErrno=(\d+)"
ACTIVESTATE_PATTERN = r"ActiveState=(\S+)"
SUBSTATE_PATTERN = r"SubState=(\S+)"


class Client(BaseClient):
def __init__(self, config=None, hostname=None, ip=None, hostgroup=None,
Expand All @@ -51,6 +60,8 @@ def __init__(self, config=None, hostname=None, ip=None, hostgroup=None,
# keep trying to fetch it from facter every time
self._stage_type_fetched = False
self._account_id = None
self._normandie_status = None
self._knox_status = None

def _read_host_info(self) -> bool:
if self._use_facter:
Expand Down Expand Up @@ -196,10 +207,24 @@ def _read_host_info(self) -> bool:
info = json.loads(ec2_metadata)
self._account_id = info.get('AccountId', None)

# Retrieve Normandie Status, swallowing exceptions if any: Ping should always be sent.
try:
self._normandie_status = self.get_normandie_status()
except Exception as e:
log.exception(f"Failed to get normandie status.: {e}")
self._normandie_status = 'ERROR'

# Retrieve Knox Status, swallowing exceptions if any: Ping should always be sent.
try:
self._knox_status = self.get_knox_status()
except Exception as e:
log.exception(f"Failed to get knox status.: {e}")
self._knox_status = 'ERROR'

log.info("Host information is loaded. "
"Host name: {}, IP: {}, host id: {}, agent_version={}, autoscaling_group: {}, "
"availability_zone: {}, ec2_tags: {}, stage_type: {}, group: {}, account id: {}".format(self._hostname, self._ip, self._id,
self._agent_version, self._autoscaling_group, self._availability_zone, self._ec2_tags, self._stage_type, self._hostgroup, self._account_id))
"availability_zone: {}, ec2_tags: {}, stage_type: {}, group: {}, account id: {}, normandie_status: {}, knox_status: {}".format(self._hostname, self._ip, self._id,
self._agent_version, self._autoscaling_group, self._availability_zone, self._ec2_tags, self._stage_type, self._hostgroup, self._account_id, self._normandie_status, self._knox_status))

if not self._availability_zone:
log.error("Fail to read host info: availablity zone")
Expand All @@ -209,6 +234,71 @@ def _read_host_info(self) -> bool:

return True

def get_normandie_status(self, timeout_sec: float = 5.0) -> Optional[str]:
    """Report whether the Normandie certificate carries a SAN URI entry.

    Dumps the certificate at ``NORMANDIE_CERT_FILEPATH`` with ``openssl x509``
    (all sections except the extensions are suppressed via ``-certopt``) and
    searches the text for ``SAN_URI_PATTERN``.

    Args:
        timeout_sec: Maximum number of seconds to wait for openssl before
            giving up, so a stalled command cannot block the caller.

    Returns:
        ``'OK'`` when a URI entry is found in the certificate dump;
        ``'ERROR'`` when openssl is missing, exits non-zero, times out, or
        its output contains no URI entry.
    """
    cmd = [
        "openssl",
        "x509",
        "-in",
        Path(NORMANDIE_CERT_FILEPATH).as_posix(),
        "-noout",
        "-text",
        "-certopt",
        "no_subject,no_header,no_version,no_serial,no_signame,no_validity,no_issuer,no_pubkey,no_sigdump,no_aux",
    ]
    try:
        raw_output = subprocess.check_output(cmd, timeout=timeout_sec)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        # OSError covers a missing/non-executable openssl binary; TimeoutExpired
        # covers a command that never returns. Both are reported like a failed run.
        log.exception(f"failed to get spiffe id from normandie: {e}")
        return 'ERROR'

    # Undecodable bytes are replaced rather than raised: only the URI entry matters.
    cert_text = raw_output.decode("utf-8", errors="replace")

    # The pattern's capture group is `\S+`, so any match implies a non-empty id.
    if re.search(SAN_URI_PATTERN, cert_text) is None:
        return 'ERROR'
    return 'OK'

def get_knox_status(self, timeout_sec: float = 5.0) -> Optional[str]:
    """Report whether the ``knox`` systemd unit looks healthy.

    Runs ``systemctl show knox`` for a fixed set of properties and checks
    that ``StatusErrno`` is ``0``, ``ActiveState`` is ``active`` and
    ``SubState`` is ``running``.

    Args:
        timeout_sec: Maximum number of seconds to wait for systemctl before
            giving up, so a stalled command cannot block the caller.

    Returns:
        ``"OK"`` when all three properties are present with the expected
        values; ``"ERROR"`` when systemctl is missing, exits non-zero, times
        out, or any property is absent or has a different value.
    """
    cmd = [
        "systemctl",
        "show",
        "knox",
        "--property=Result",
        "--property=StatusErrno",
        "--property=ActiveState",
        "--property=SubState"
    ]
    try:
        raw_output = subprocess.check_output(cmd, timeout=timeout_sec)
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        # OSError covers hosts without a systemctl binary; TimeoutExpired covers
        # a command that never returns. Both are reported like a failed run.
        log.exception(f"failed to get knox service status from systemctl: {e}")
        return 'ERROR'

    status_text = raw_output.decode("utf-8", errors="replace")

    # Each property is searched for independently so that no assumption is
    # made about the order in which systemctl prints them.
    expected_values = (
        (STATUSERRNO_PATTERN, "0"),
        (ACTIVESTATE_PATTERN, "active"),
        (SUBSTATE_PATTERN, "running"),
    )
    for pattern, expected in expected_values:
        found = re.search(pattern, status_text)
        if found is None or found.group(1) != expected:
            return "ERROR"
    return "OK"

def send_reports(self, env_reports=None) -> Optional[PingResponse]:
try:
if self._read_host_info():
Expand All @@ -229,7 +319,9 @@ def send_reports(self, env_reports=None) -> Optional[PingResponse]:
availabilityZone=self._availability_zone,
ec2Tags=self._ec2_tags,
stageType=self._stage_type,
accountId=self._account_id)
accountId=self._account_id,
normandieStatus=self._normandie_status,
knoxStatus=self._knox_status)

with create_stats_timer('deploy.agent.request.latency',
tags={'host': self._hostname}):
Expand Down
13 changes: 10 additions & 3 deletions deploy-agent/deployd/types/ping_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
class PingRequest(object):

def __init__(self, hostId=None, hostName=None, hostIp=None, groups=None, reports=None,
agentVersion=None, autoscalingGroup=None, availabilityZone=None, ec2Tags=None, stageType=None, accountId=None):
agentVersion=None, autoscalingGroup=None, availabilityZone=None, ec2Tags=None, stageType=None,
accountId=None, normandieStatus=None, knoxStatus=None):
self.hostId = hostId
self.hostName = hostName
self.hostIp = hostIp
Expand All @@ -32,6 +33,8 @@ def __init__(self, hostId=None, hostName=None, hostIp=None, groups=None, reports
self.ec2Tags = ec2Tags
self.stageType = stageType
self.accountId = accountId
self.normandieStatus = normandieStatus
self.knoxStatus = knoxStatus

def to_json(self):
ping_requests = {}
Expand All @@ -52,6 +55,10 @@ def to_json(self):
ping_requests["accountId"] = self.accountId
if self.ec2Tags:
ping_requests["ec2Tags"] = self.ec2Tags
if self.normandieStatus:
ping_requests["normandieStatus"] = self.normandieStatus
if self.knoxStatus:
ping_requests["knoxStatus"] = self.knoxStatus

ping_requests["reports"] = []
for report in self.reports:
Expand Down Expand Up @@ -86,6 +93,6 @@ def to_json(self):

def __str__(self):
return "PingRequest(hostId={}, hostName={}, hostIp={}, agentVersion={}, autoscalingGroup={}, " \
"availabilityZone={}, ec2Tags={}, stageType={}, groups={}, accountId={}, reports={})".format(self.hostId, self.hostName,
"availabilityZone={}, ec2Tags={}, stageType={}, groups={}, accountId={}, normandieStatus={}, knoxStatus={}, reports={})".format(self.hostId, self.hostName,
self.hostIp, self.agentVersion, self.autoscalingGroup, self.availabilityZone, self.ec2Tags, self.stageType,
self.groups, self.accountId, ",".join(str(v) for v in self.reports))
self.groups, self.accountId, self.normandieStatus, self.knoxStatus, ",".join(str(v) for v in self.reports))
38 changes: 38 additions & 0 deletions deploy-agent/tests/unit/deploy/client/test_client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from unittest import mock
from tests import TestCase

from deployd.client.client import Client
Expand All @@ -23,6 +24,43 @@ def test_read_host_info(self):
self.assertIsNotNone(client._ip)
self.assertTrue(return_value)

def test_read_host_info_normandie(self):
    """_read_host_info() must leave a normandie status of "OK" or "ERROR"."""
    agent = Client(config=Config())
    agent._ec2_tags = {}
    agent._availability_zone = "us-east-1"

    loaded: bool = agent._read_host_info()
    self.assertTrue(loaded)

    # A host running normandie reports OK; a host without it (for example a
    # build agent) reports ERROR. Either way the status must be populated.
    status = agent._normandie_status
    self.assertIsNotNone(status)
    self.assertIn(status, ("OK", "ERROR"))

# When the subprocess output cannot be parsed, the normandie status must
# still be set, and it must be ERROR.
@mock.patch("subprocess.check_output")
def test_read_host_info_normandie_error(self, mock_check_output):
    """Unparseable subprocess output yields a normandie status of "ERROR"."""
    mock_check_output.return_value = b"not a parseable SAN URL"

    agent = Client(config=Config())
    agent._ec2_tags = {}
    agent._availability_zone = "us-east-1"

    loaded: bool = agent._read_host_info()
    self.assertTrue(loaded)

    status = agent._normandie_status
    self.assertIsNotNone(status)
    self.assertEqual(status, "ERROR")

def test_read_host_info_knox(self):
    """_read_host_info() must leave a knox status of "OK" or "ERROR"."""
    agent = Client(config=Config())
    agent._ec2_tags = {}
    agent._availability_zone = "us-east-1"

    loaded: bool = agent._read_host_info()
    self.assertTrue(loaded)

    # A host running knox reports OK; a host without it (for example a build
    # agent) reports ERROR. Either way the status must be populated.
    status = agent._knox_status
    self.assertIsNotNone(status)
    self.assertIn(status, ("OK", "ERROR"))

def test_read_host_info_no_ec2_tags_provided(self):
client = Client(config=Config())
with self.assertRaises(AttributeError):
Expand Down
Loading