Skip to content

Commit

Permalink
feat: create monitoring in cloudwatch for aws batch failures (#367)
Browse files Browse the repository at this point in the history
* feat: add eventbridge rule and sns topic

* fix: tflint and add data source

* fix: event pattern

* docs: update Terraform docs

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
Wi11Shell and github-actions[bot] authored Oct 8, 2024
1 parent 0a57d72 commit 58c076c
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 1 deletion.
3 changes: 3 additions & 0 deletions infra/terraform/modules/service/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
| <a name="module_ecs_cluster"></a> [ecs\_cluster](#module\_ecs\_cluster) | terraform-aws-modules/ecs/aws//modules/cluster | ~> 5.10 |
| <a name="module_ecs_service"></a> [ecs\_service](#module\_ecs\_service) | terraform-aws-modules/ecs/aws//modules/service | ~> 5.10 |
| <a name="module_eventbridge"></a> [eventbridge](#module\_eventbridge) | terraform-aws-modules/eventbridge/aws | ~> 3.7 |
| <a name="module_eventbridge_sns"></a> [eventbridge\_sns](#module\_eventbridge\_sns) | terraform-aws-modules/eventbridge/aws | ~> 3.7 |
| <a name="module_log_bucket"></a> [log\_bucket](#module\_log\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 4.0 |
| <a name="module_records"></a> [records](#module\_records) | terraform-aws-modules/route53/aws//modules/records | ~> 4.0 |
| <a name="module_route53_records"></a> [route53\_records](#module\_route53\_records) | terraform-aws-modules/acm/aws | ~> 5.0 |
| <a name="module_sns_batch_fail"></a> [sns\_batch\_fail](#module\_sns\_batch\_fail) | terraform-aws-modules/sns/aws | ~> 6.1 |

## Resources

Expand All @@ -34,6 +36,7 @@
| [aws_cloudwatch_log_group.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_lb_listener_rule.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_listener_rule) | resource |
| [aws_lb_target_group.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lb_target_group) | resource |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_canonical_user_id.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/canonical_user_id) | data source |
| [aws_cloudfront_log_delivery_canonical_user_id.cloudfront](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/cloudfront_log_delivery_canonical_user_id) | data source |
| [aws_route53_zone.public](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/route53_zone) | data source |
Expand Down
92 changes: 91 additions & 1 deletion infra/terraform/modules/service/batch.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Identity of the AWS account Terraform is running against. Its account_id is
# interpolated into the account-root principal ARN of the SNS topic policy.
data "aws_caller_identity" "current" {}

locals {
default_retry_policy = {
attempts = 1
Expand Down Expand Up @@ -155,9 +157,97 @@ module "eventbridge" {
}

schedules = local.schedules

}

# EventBridge rule that matches FAILED AWS Batch job state changes for this
# environment's jobs and forwards them to the batch-failure SNS topic.
module "eventbridge_sns" {
  source  = "terraform-aws-modules/eventbridge/aws"
  version = "~> 3.7"

  # Do not create an event bus in this module instance.
  create_bus = false
  role_name  = "vol-app-${var.environment}-batch-fail-role"

  rules = {
    batch-fail-sns = {
      name        = "${var.environment}-batch-fail-event"
      description = "Capture failed Batch Events sent to SNS"
      enabled     = true

      # Match only Batch job state changes whose status is FAILED and whose
      # job name starts with this environment's "vol-app-<env>-" prefix.
      event_pattern = jsonencode({
        source        = ["aws.batch"]
        "detail-type" = ["Batch Job State Change"]
        detail = {
          status  = ["FAILED"]
          jobName = [{ wildcard = "vol-app-${var.environment}-*" }]
        }
      })
    }
  }

  # Targets are keyed by the rule they belong to.
  targets = {
    batch-fail-sns = [{
      name = "batch-fail-event"
      arn  = module.sns_batch_fail.topic_arn
    }]
  }
}

# SNS topic that receives failed AWS Batch job events forwarded by the
# "batch-fail-sns" EventBridge rule in module.eventbridge_sns.
module "sns_batch_fail" {
  source  = "terraform-aws-modules/sns/aws"
  version = "~> 6.1"

  name            = "vol-app-${var.environment}-batch-fail-topic"
  use_name_prefix = true
  display_name    = "batch-event-failed"

  create_topic_policy         = true
  enable_default_topic_policy = true
  topic_policy_statements = {
    # Principals in this account may publish to the topic.
    pub = {
      actions = ["sns:Publish"]
      principals = [{
        type = "AWS"
        identifiers = [
          "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"
        ]
      }]
    },

    # EventBridge delivers to SNS targets through the topic's resource policy,
    # so the events service principal must be granted sns:Publish explicitly.
    # The source ARN EventBridge presents is the rule ARN (not the bus ARN);
    # restrict it to rules owned by this account.
    eventbridge = {
      actions = ["sns:Publish"]

      principals = [{
        type        = "Service"
        identifiers = ["events.amazonaws.com"]
      }]

      conditions = [{
        test     = "ArnLike"
        variable = "aws:SourceArn"
        values = [
          "arn:aws:events:*:${data.aws_caller_identity.current.account_id}:rule/*"
        ]
      }]
    },

    # NOTE(review): EventBridge publishes to topics; it does not subscribe or
    # receive. This statement is kept for backward compatibility, but it looks
    # unnecessary, and the bus ARN output may be empty when create_bus = false
    # - confirm and consider removing.
    sub = {
      actions = [
        "sns:Subscribe",
        "sns:Receive",
      ]

      principals = [{
        type        = "Service"
        identifiers = ["events.amazonaws.com"]
      }]

      conditions = [{
        test     = "ArnLike"
        variable = "aws:SourceArn"
        values   = [module.eventbridge_sns.eventbridge_bus_arn]
      }]
    }
  }

  tags = {
    "Name" = "vol-app-${var.environment}-aws-sns-batch-fail"
  }
}

# CloudWatch log group for this environment under the /aws/batch/ prefix.
resource "aws_cloudwatch_log_group" "this" {
  # NOTE(review): logs are kept for one day only - confirm this leaves enough
  # time to investigate a failed job after an alert.
  retention_in_days = 1
  name              = "/aws/batch/vol-app-${var.environment}"
}
}

0 comments on commit 58c076c

Please sign in to comment.