From 5d11956bceabf4919fd6c85b20cd8f1d3b890ec6 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Mon, 28 Feb 2022 11:13:47 +0000 Subject: [PATCH 1/9] Add the lambda handler #1564 --- ingestion/monitoring/errorLogsToSlack.py | 64 ++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 ingestion/monitoring/errorLogsToSlack.py diff --git a/ingestion/monitoring/errorLogsToSlack.py b/ingestion/monitoring/errorLogsToSlack.py new file mode 100644 index 000000000..93c9e46c7 --- /dev/null +++ b/ingestion/monitoring/errorLogsToSlack.py @@ -0,0 +1,64 @@ +import json +import os +import requests +import re +from base64 import b64decode +from gzip import decompress +from io import StringIO + +def decode(event): + event_data = event["awslogs"]["data"] + compressed = b64decode(event_data) + uncompressed = decompress(compressed) + decoded = json.loads(uncompressed) + + return decoded + +def communicate(event): + slack_url = os.getenv("SLACK_WEBHOOK") + message_header = {'Content-Type': 'application/json'} + message = {'text': event} + response = requests.post(url=slack_url, data=json.dumps(message), headers=message_header) + if response.status_code != 200: + raise ValueError( + 'Request to slack returned an error %s, the response is:\n%s' + % (response.status_code, response.text) + ) + +def define_errortype(decoded): + max_memory = False + for e in decoded['logEvents']: + if "'dateRange': {'start':".lower() in e["message"].lower(): + communicate("BACKFILL <@U01F70GPXNW>") + for e in decoded['logEvents']: + communicate(e["message"]) + if "error" in e["message"].lower(): + communicate("ERROR TYPE: Parser <@U011A0TFM7X> <@U017KLSPEM7>") + print(e["message"]) + communicate(e["message"]) + return None + if "filtering cases" in e["message"].lower(): + filter_message = e["message"] + if "memory size" in e["message"].lower(): + memory_use = re.findall(r'\d* MB', e["message"]) + if len(set(memory_use)) == 1: + max_memory = True + if "timed out" in e["message"].lower(): + 
if max_memory == True: + communicate("ERROR TYPE: Time out, max memory reached <@U011A0TFM7X> <@U017KLSPEM7>") + print(e["message"]) + communicate(e["message"]) + return None + else: + communicate("ERROR TYPE: Time out, max memory NOT reached <@U01F70GPXNW>") + print(filter_message) + communicate(filter_message) + print(e["message"]) + communicate(e["message"]) + return None + +def lambda_handler(event, context): + decoded = decode(event) + print(decoded['logGroup']) + communicate(decoded['logGroup']) + define_errortype(decoded) From 765e24b823df25e7a8d1dcc9d27ab83a1d0bbd33 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Mon, 28 Feb 2022 14:43:35 +0000 Subject: [PATCH 2/9] Rewrite script to avoid dependency on lambda #1564 --- ingestion/monitoring/errorLogsToSlack.py | 119 +++++++++++----------- ingestion/monitoring/poetry.lock | 120 +++++++++++++++++++++++ ingestion/monitoring/pyproject.toml | 16 +++ 3 files changed, 200 insertions(+), 55 deletions(-) create mode 100644 ingestion/monitoring/poetry.lock create mode 100644 ingestion/monitoring/pyproject.toml diff --git a/ingestion/monitoring/errorLogsToSlack.py b/ingestion/monitoring/errorLogsToSlack.py index 93c9e46c7..194d1942c 100644 --- a/ingestion/monitoring/errorLogsToSlack.py +++ b/ingestion/monitoring/errorLogsToSlack.py @@ -1,64 +1,73 @@ import json +import logging import os import requests import re -from base64 import b64decode -from gzip import decompress -from io import StringIO +import sys -def decode(event): - event_data = event["awslogs"]["data"] - compressed = b64decode(event_data) - uncompressed = decompress(compressed) - decoded = json.loads(uncompressed) +import boto3 - return decoded -def communicate(event): - slack_url = os.getenv("SLACK_WEBHOOK") - message_header = {'Content-Type': 'application/json'} - message = {'text': event} - response = requests.post(url=slack_url, data=json.dumps(message), headers=message_header) - if response.status_code != 200: - raise ValueError( - 'Request to slack 
returned an error %s, the response is:\n%s' - % (response.status_code, response.text) - ) +class SlackHandler(logging.Handler): + def __init__(self, webhook_url, level=logging.NOTSET): + super().__init__(level) + self.slack_url = webhook_url + + def emit(self, record): + message_header = {'Content-Type': 'application/json'} + message = {'text': f"[{record.levelname}] {record.message}"} + response = requests.post(url=self.slack_url, data=json.dumps(message), headers=message_header) + if response.status_code != 200: + raise ValueError( + f"Request to slack returned an error {response.status_code}, the response is:\n{response.text}" + ) + + +def interpret(message): + lower = message.lower() + if "'dateRange': {'start':".lower() in lower: + return (logger.INFO, f"BACKFILL <@U01F70GPXNW>\n{message}") + if "error" in lower: + return (logger.ERROR, f"ERROR TYPE: Parser <@U011A0TFM7X> <@U017KLSPEM7>\n{message}") + if "timed out" in lower: + return (logger.ERROR, f"ERROR TYPE: Time out\n{message}") + return (logger.WARN, message) -def define_errortype(decoded): - max_memory = False - for e in decoded['logEvents']: - if "'dateRange': {'start':".lower() in e["message"].lower(): - communicate("BACKFILL <@U01F70GPXNW>") - for e in decoded['logEvents']: - communicate(e["message"]) - if "error" in e["message"].lower(): - communicate("ERROR TYPE: Parser <@U011A0TFM7X> <@U017KLSPEM7>") - print(e["message"]) - communicate(e["message"]) - return None - if "filtering cases" in e["message"].lower(): - filter_message = e["message"] - if "memory size" in e["message"].lower(): - memory_use = re.findall(r'\d* MB', e["message"]) - if len(set(memory_use)) == 1: - max_memory = True - if "timed out" in e["message"].lower(): - if max_memory == True: - communicate("ERROR TYPE: Time out, max memory reached <@U011A0TFM7X> <@U017KLSPEM7>") - print(e["message"]) - communicate(e["message"]) - return None - else: - communicate("ERROR TYPE: Time out, max memory NOT reached <@U01F70GPXNW>") - 
print(filter_message) - communicate(filter_message) - print(e["message"]) - communicate(e["message"]) - return None +def setup_logger(): + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + stdoutHandler = logging.StreamHandler(stream=sys.stdout) + stdoutHandler.setLevel(logging.DEBUG) + logger.addHandler(stdoutHandler) + slackHandler = SlackHandler(os.getenv('SLACK_WEBHOOK'), logging.DEBUG) + logger.addHandler(slackHandler) + return logger -def lambda_handler(event, context): - decoded = decode(event) - print(decoded['logGroup']) - communicate(decoded['logGroup']) - define_errortype(decoded) + +if __name__ == '__main__': + logger = setup_logger() + logGroup = os.getenv('INGESTION_LOG_GROUP') + logStream = os.getenv('INGESTION_LOG_STREAM') + if logGroup is None or logStream is None: + logger.critical(f"Cannot get messages from log group {logGroup} and stream {logStream}") + sys.exit(1) + logger.info(f"Output from {logGroup}/{logStream}:") + hasMore = True + oldNext = None + logClient = boto3.client('logs') + while hasMore: + response = logClient.get_log_events( + logGroupName=logGroup, + logStreamName=logStream, + startFromHead=True, + nextToken=oldNext + ) + newNext = response['nextForwardToken'] + if (not newNext) or (newNext == oldNext): + hasMore = False + else: + oldNext = newNext + for message in [e['message'] for e in response['events']]: + (severity, output) = interpret(message) + logger.log(severity, output) + logger.info(f"End of output from {logGroup}/{logStream}") diff --git a/ingestion/monitoring/poetry.lock b/ingestion/monitoring/poetry.lock new file mode 100644 index 000000000..7167cb99c --- /dev/null +++ b/ingestion/monitoring/poetry.lock @@ -0,0 +1,120 @@ +[[package]] +name = "boto3" +version = "1.21.8" +description = "The AWS SDK for Python" +category = "main" +optional = false +python-versions = ">= 3.6" + +[package.dependencies] +botocore = ">=1.24.8,<1.25.0" +jmespath = ">=0.7.1,<1.0.0" +s3transfer = ">=0.5.0,<0.6.0" + 
+[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.24.8" +description = "Low-level, data-driven core of boto 3." +category = "main" +optional = false +python-versions = ">= 3.6" + +[package.dependencies] +jmespath = ">=0.7.1,<1.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = ">=1.25.4,<1.27" + +[package.extras] +crt = ["awscrt (==0.12.5)"] + +[[package]] +name = "jmespath" +version = "0.10.0" +description = "JSON Matching Expressions" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "s3transfer" +version = "0.5.2" +description = "An Amazon S3 Transfer Manager" +category = "main" +optional = false +python-versions = ">= 3.6" + +[package.dependencies] +botocore = ">=1.12.36,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "urllib3" +version = "1.26.8" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" + +[package.extras] +brotli = ["brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.9" +content-hash = "f72750d640129cd8b3aba6390ef68790154dad2ab0a38307e1e39ac9f4686b72" + +[metadata.files] +boto3 = [ + {file = "boto3-1.21.8-py3-none-any.whl", hash = "sha256:9b6903fe9cc92d2f6111db28675263f1ab45adbcf1483025c82a304ce7790b71"}, + {file = "boto3-1.21.8.tar.gz", hash = "sha256:f2ce641957c1782e382548ced4a447189e45851bbe58c1f6752ff2b661527de7"}, +] +botocore = [ + {file = "botocore-1.24.8-py3-none-any.whl", hash = "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c"}, + {file = "botocore-1.24.8.tar.gz", hash = "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759"}, +] +jmespath = [ + {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, + {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +s3transfer = [ + {file = "s3transfer-0.5.2-py3-none-any.whl", hash = "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971"}, + {file = "s3transfer-0.5.2.tar.gz", hash = "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"}, +] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = 
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +urllib3 = [ + {file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"}, + {file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"}, +] diff --git a/ingestion/monitoring/pyproject.toml b/ingestion/monitoring/pyproject.toml new file mode 100644 index 000000000..fcc85dbaf --- /dev/null +++ b/ingestion/monitoring/pyproject.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "monitoring" +version = "0.1.0" +description = "Monitoring functions for Global.health ingestion" +authors = ["Global.health "] +license = "MIT" + +[tool.poetry.dependencies] +python = "^3.9" +boto3 = "^1.21.8" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" From f0e54046f8f6d571f7b2c22f762effd1abcc9475 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Mon, 28 Feb 2022 15:18:44 +0000 Subject: [PATCH 3/9] Improve formatting of messages #1564 Handle next token correctly Handle slack rate limiting us --- ingestion/monitoring/errorLogsToSlack.py | 43 +++++++++++----- ingestion/monitoring/poetry.lock | 63 +++++++++++++++++++++++- ingestion/monitoring/pyproject.toml | 1 + 3 files changed, 95 insertions(+), 12 deletions(-) diff --git a/ingestion/monitoring/errorLogsToSlack.py b/ingestion/monitoring/errorLogsToSlack.py index 194d1942c..4ffb4f7a0 100644 --- a/ingestion/monitoring/errorLogsToSlack.py +++ b/ingestion/monitoring/errorLogsToSlack.py @@ -4,6 +4,7 @@ import requests import re import sys +from time import sleep import boto3 @@ -17,25 +18,33 @@ def emit(self, record): message_header = {'Content-Type': 'application/json'} message = {'text': f"[{record.levelname}] {record.message}"} response = requests.post(url=self.slack_url, data=json.dumps(message), headers=message_header) - if response.status_code != 200: + if 
response.status_code == 429: + sleep(int(response.headers.get('Retry-After', 1))) + elif response.status_code != 200: raise ValueError( f"Request to slack returned an error {response.status_code}, the response is:\n{response.text}" ) def interpret(message): + graham = "<@U011A0TFM7X>" + abhishek = "<@U01F70FAJ6N>" + jim = "<@U01TAHDR4F7>" + engineers = f"{graham} {abhishek} {jim}" lower = message.lower() if "'dateRange': {'start':".lower() in lower: - return (logger.INFO, f"BACKFILL <@U01F70GPXNW>\n{message}") + return (logging.INFO, f"BACKFILL INITIATED\n{message}") if "error" in lower: - return (logger.ERROR, f"ERROR TYPE: Parser <@U011A0TFM7X> <@U017KLSPEM7>\n{message}") + return (logging.ERROR, f"PARSER ERROR: {engineers}\n{message}") if "timed out" in lower: - return (logger.ERROR, f"ERROR TYPE: Time out\n{message}") - return (logger.WARN, message) + return (logging.ERROR, f"TIME OUT: {engineers}\n{message}") + if lower.startswith('info:'): + return (logging.INFO, message) + return (logging.WARN, message) def setup_logger(): logger = logging.getLogger(__name__) - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.WARN) stdoutHandler = logging.StreamHandler(stream=sys.stdout) stdoutHandler.setLevel(logging.DEBUG) logger.addHandler(stdoutHandler) @@ -43,6 +52,11 @@ def setup_logger(): logger.addHandler(slackHandler) return logger +def log_messages(cloudwatch_response, logger): + for message in [e['message'] for e in cloudwatch_response['events']]: + (severity, output) = interpret(message) + logger.log(severity, output) + if __name__ == '__main__': logger = setup_logger() @@ -52,9 +66,18 @@ def setup_logger(): logger.critical(f"Cannot get messages from log group {logGroup} and stream {logStream}") sys.exit(1) logger.info(f"Output from {logGroup}/{logStream}:") - hasMore = True - oldNext = None + hasMore = False + oldNext = '' logClient = boto3.client('logs') + response = logClient.get_log_events( + logGroupName=logGroup, + 
logStreamName=logStream, + startFromHead=True + ) + log_messages(response, logger) + oldNext = response['nextForwardToken'] + if oldNext and len(oldNext) > 0: + hasMore = True while hasMore: response = logClient.get_log_events( logGroupName=logGroup, @@ -62,12 +85,10 @@ def setup_logger(): startFromHead=True, nextToken=oldNext ) + log_messages(response, logger) newNext = response['nextForwardToken'] if (not newNext) or (newNext == oldNext): hasMore = False else: oldNext = newNext - for message in [e['message'] for e in response['events']]: - (severity, output) = interpret(message) - logger.log(severity, output) logger.info(f"End of output from {logGroup}/{logStream}") diff --git a/ingestion/monitoring/poetry.lock b/ingestion/monitoring/poetry.lock index 7167cb99c..04f9e67c5 100644 --- a/ingestion/monitoring/poetry.lock +++ b/ingestion/monitoring/poetry.lock @@ -30,6 +30,33 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.12.5)"] +[[package]] +name = "certifi" +version = "2021.10.8" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "charset-normalizer" +version = "2.0.12" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.5.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + [[package]] name = "jmespath" version = "0.10.0" @@ -49,6 +76,24 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" [package.dependencies] six = ">=1.5" +[[package]] +name = "requests" +version = "2.27.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} +idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] + [[package]] name = "s3transfer" version = "0.5.2" @@ -87,7 +132,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "f72750d640129cd8b3aba6390ef68790154dad2ab0a38307e1e39ac9f4686b72" +content-hash = "60b861f5ce04f9e11a7be3886ba13e441bd4d943ed0a698db843dffa14d3b27b" [metadata.files] boto3 = [ @@ -98,6 +143,18 @@ botocore = [ {file = "botocore-1.24.8-py3-none-any.whl", hash = "sha256:9fbc5c57b31850c51c87abc3e166ed4e0f343665bec4e1a0ff814fbc9704642c"}, {file = "botocore-1.24.8.tar.gz", hash = "sha256:a5431d806dc75fb1844463d921759fcd8d387674443af8d7fd0867f296b02759"}, ] +certifi = [ + {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, + {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, +] +charset-normalizer = [ + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] jmespath = [ {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash 
= "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, @@ -106,6 +163,10 @@ python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] +requests = [ + {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, + {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, +] s3transfer = [ {file = "s3transfer-0.5.2-py3-none-any.whl", hash = "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971"}, {file = "s3transfer-0.5.2.tar.gz", hash = "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"}, diff --git a/ingestion/monitoring/pyproject.toml b/ingestion/monitoring/pyproject.toml index fcc85dbaf..fff3e6081 100644 --- a/ingestion/monitoring/pyproject.toml +++ b/ingestion/monitoring/pyproject.toml @@ -8,6 +8,7 @@ license = "MIT" [tool.poetry.dependencies] python = "^3.9" boto3 = "^1.21.8" +requests = "^2.27.1" [tool.poetry.dev-dependencies] From ce8a89a8616386bf6955a3960a51233817087419 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Mon, 28 Feb 2022 15:40:08 +0000 Subject: [PATCH 4/9] Document the new script #1564 --- ingestion/monitoring/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ingestion/monitoring/README.md b/ingestion/monitoring/README.md index 7e4f5d395..14d34caf6 100644 --- a/ingestion/monitoring/README.md +++ b/ingestion/monitoring/README.md @@ -1,3 +1,20 @@ +# Error monitoring + +The `errorLogsToSlack.py` script reads log messages from a given Cloudwatch stream +and posts any errors to Slack. 
It has three inputs, all passed via the environment: + + - `SLACK_WEBHOOK` is the webhook URL to post messages to Slack. + - `INGESTION_LOG_GROUP` is the Cloudwatch log group name. + - `INGESTION_LOG_STREAM` is the Cloudwatch log stream name. + +Typically, all would be set up by EventBridge in AWS when it's run in Batch. + +## To set up for a new instance + +1. see https://api.slack.com/messaging/webhooks for details on creating a Slack app and enabling web hooks. +2. change the Slack user IDs in the script to ones that represent users in your workspace (who should get notified on ingestion errors). +3. deploy to Batch + # Data monitoring Data monitoring scripts, currently there's a script to alert daily about From 29e601f7a9d10a24163a27ae0391dd6d57254c39 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Mon, 28 Feb 2022 16:03:58 +0000 Subject: [PATCH 5/9] Build Docker image for script #1564 --- ingestion/monitoring/Dockerfile | 71 +++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 ingestion/monitoring/Dockerfile diff --git a/ingestion/monitoring/Dockerfile b/ingestion/monitoring/Dockerfile new file mode 100644 index 000000000..aebc80c16 --- /dev/null +++ b/ingestion/monitoring/Dockerfile @@ -0,0 +1,71 @@ +# `python-base` sets up all our shared environment variables +FROM python:3.9-slim as python-base + +ENV PYTHONUNBUFFERED=1 \ + # prevents python creating .pyc files + PYTHONDONTWRITEBYTECODE=1 \ + \ + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on \ + PIP_DEFAULT_TIMEOUT=100 \ + \ + # https://python-poetry.org/docs/configuration/#using-environment-variables + POETRY_VERSION=1.1.5 \ + # make poetry install to this location + POETRY_HOME="/opt/poetry" \ + # make poetry create the virtual environment in the project's root + # it gets named `.venv` + POETRY_VIRTUALENVS_IN_PROJECT=true \ + # do not ask any interactive question + POETRY_NO_INTERACTION=1 \ + \ + # this is where our requirements + virtual environment will live + 
PYSETUP_PATH="/opt/pysetup" \ + VENV_PATH="/opt/pysetup/.venv" + +# prepend poetry and venv to path +ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH" + +# `builder-base` stage is used to build deps + create our virtual environment +FROM python-base as builder-base +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + curl \ + build-essential + +# install poetry - respects $POETRY_VERSION & $POETRY_HOME +RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python + +# copy project requirement files here to ensure they will be cached. +WORKDIR $PYSETUP_PATH +COPY poetry.lock pyproject.toml ./ + +ENV PATH="${PATH}:/root/.poetry/bin" + +# install runtime deps - uses $POETRY_VIRTUALENVS_IN_PROJECT internally +RUN poetry install --no-dev + +# `development` image is used during development / testing +FROM python-base as development + +RUN apt-get update && apt-get install --no-install-recommends -y curl \ + awscli + +WORKDIR $PYSETUP_PATH + +# copy in our built poetry + venv +COPY --from=builder-base $POETRY_HOME $POETRY_HOME +COPY --from=builder-base $PYSETUP_PATH $PYSETUP_PATH +ENV PATH="${PATH}:/root/.poetry/bin" + +# will become mountpoint of our code +WORKDIR /app + +COPY errorLogsToSlack.py poetry.lock pyproject.toml ./ + +# quicker install as runtime deps are already installed +RUN poetry install + +# notice I haven't set the environment variables here, these +# should be configured at invocation time +CMD ["poetry", "run", "python3", "./errorLogsToSlack.py"] From 9b9f8a586b6f9c97379bd08bf55b051841fa7263 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Tue, 1 Mar 2022 16:03:43 +0000 Subject: [PATCH 6/9] fix installation of poetry on python 3.10 #1564 --- ingestion/monitoring/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ingestion/monitoring/Dockerfile b/ingestion/monitoring/Dockerfile index aebc80c16..0ad95442d 100644 --- a/ingestion/monitoring/Dockerfile +++ b/ingestion/monitoring/Dockerfile @@ 
-1,5 +1,5 @@ # `python-base` sets up all our shared environment variables -FROM python:3.9-slim as python-base +FROM python:3.10-slim as python-base ENV PYTHONUNBUFFERED=1 \ # prevents python creating .pyc files @@ -34,7 +34,7 @@ RUN apt-get update \ build-essential # install poetry - respects $POETRY_VERSION & $POETRY_HOME -RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python +RUN curl -sSL https://install.python-poetry.org/ | python3 - # copy project requirement files here to ensure they will be cached. WORKDIR $PYSETUP_PATH From b420fe77820fb109c563883ca3060b98b20fc9e9 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Wed, 2 Mar 2022 14:58:43 +0000 Subject: [PATCH 7/9] Collect logstream details from args to support submission-time definition #1564 --- ingestion/monitoring/Dockerfile | 5 +++-- ingestion/monitoring/errorLogsToSlack.py | 9 +++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ingestion/monitoring/Dockerfile b/ingestion/monitoring/Dockerfile index 0ad95442d..812b1f437 100644 --- a/ingestion/monitoring/Dockerfile +++ b/ingestion/monitoring/Dockerfile @@ -66,6 +66,7 @@ COPY errorLogsToSlack.py poetry.lock pyproject.toml ./ # quicker install as runtime deps are already installed RUN poetry install -# notice I haven't set the environment variables here, these -# should be configured at invocation time +# notice I haven't set the environment variables or args here. 
+# the slack webhook should be configured in the job definition, +# and the args should be configured at submission time CMD ["poetry", "run", "python3", "./errorLogsToSlack.py"] diff --git a/ingestion/monitoring/errorLogsToSlack.py b/ingestion/monitoring/errorLogsToSlack.py index 4ffb4f7a0..c7cdf066a 100644 --- a/ingestion/monitoring/errorLogsToSlack.py +++ b/ingestion/monitoring/errorLogsToSlack.py @@ -1,3 +1,4 @@ +import argparse import json import logging import os @@ -60,8 +61,12 @@ def log_messages(cloudwatch_response, logger): if __name__ == '__main__': logger = setup_logger() - logGroup = os.getenv('INGESTION_LOG_GROUP') - logStream = os.getenv('INGESTION_LOG_STREAM') + parser = argparse.ArgumentParser() + parser.add_argument("group", help="AWS log group name for the failed parser") + parser.add_argument("stream", help="AWS log stream name for the failed parser") + args = parser.parse_args() + logGroup = args.group + logStream = args.stream if logGroup is None or logStream is None: logger.critical(f"Cannot get messages from log group {logGroup} and stream {logStream}") sys.exit(1) From c4e2138d13bdb506a05c5c308c189caaa144f454 Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Wed, 2 Mar 2022 16:11:10 +0000 Subject: [PATCH 8/9] Pin the installed version of poetry #1564 --- ingestion/monitoring/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingestion/monitoring/Dockerfile b/ingestion/monitoring/Dockerfile index 812b1f437..5c8b57216 100644 --- a/ingestion/monitoring/Dockerfile +++ b/ingestion/monitoring/Dockerfile @@ -34,7 +34,7 @@ RUN apt-get update \ build-essential # install poetry - respects $POETRY_VERSION & $POETRY_HOME -RUN curl -sSL https://install.python-poetry.org/ | python3 - +RUN curl -sSL https://install.python-poetry.org/ | python3 - --version 1.1.13 # copy project requirement files here to ensure they will be cached. 
WORKDIR $PYSETUP_PATH From 17a350f9db9df3da91e31725602aa337b8ad6bea Mon Sep 17 00:00:00 2001 From: Graham Lee Date: Wed, 2 Mar 2022 16:11:36 +0000 Subject: [PATCH 9/9] workflow to publish the error reporter to ECR #1564 --- .../ingestion-error-reporter-deploy.yml | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/ingestion-error-reporter-deploy.yml diff --git a/.github/workflows/ingestion-error-reporter-deploy.yml b/.github/workflows/ingestion-error-reporter-deploy.yml new file mode 100644 index 000000000..5b2e21171 --- /dev/null +++ b/.github/workflows/ingestion-error-reporter-deploy.yml @@ -0,0 +1,60 @@ +name: Ingestion error reporter deploy + +on: + push: + branches: [main, '*-stable'] + paths: + - '.github/workflows/ingestion-error-reporter-deploy.yml' + - 'ingestion/monitoring/errorLogsToSlack.py' + - 'ingestion/monitoring/pyproject.toml' + - 'ingestion/monitoring/poetry.lock' + # Build whenever a new tag is created. + tags: + - "*" + workflow_dispatch: + branches: [main, '*-stable'] + paths: + - '.github/workflows/ingestion-error-reporter-deploy.yml' + - 'ingestion/monitoring/errorLogsToSlack.py' + - 'ingestion/monitoring/pyproject.toml' + - 'ingestion/monitoring/poetry.lock' + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-1 + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v1 + + - name: Build, tag, and push image to Amazon ECR (latest) + if: ${{ github.ref == 'refs/heads/main' }} + working-directory: ingestion/functions + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: gdh-ingestor-error-reporter + IMAGE_TAG: ${{ github.sha }} + run: | + docker build -t 
$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -t $ECR_REGISTRY/$ECR_REPOSITORY -f Dockerfile-clean . + docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG + docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest + + - name: Build, tag, and push image to Amazon ECR (stable) + if: ${{ endsWith(github.ref, '-stable') }} + working-directory: ingestion/functions + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: gdh-ingestor-error-monitor + IMAGE_TAG: ${{ github.sha }} + run: | + docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG -t $ECR_REGISTRY/$ECR_REPOSITORY:stable -f Dockerfile-clean . + docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG + docker push $ECR_REGISTRY/$ECR_REPOSITORY:stable