Skip to content

Commit

Permalink
feat: Add support for email based alerting (#591)
Browse files Browse the repository at this point in the history
  • Loading branch information
ps-occrp authored Apr 15, 2024
1 parent 55ca440 commit af8ec91
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 4 deletions.
7 changes: 7 additions & 0 deletions modules/backup/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,25 @@ fetch workflows.googleapis.com/Workflow

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| backup\_monitoring\_frequency | Timeframe in which there should be at least one successful backup | `string` | `"1d"` | no |
| backup\_retention\_time | The number of days backups should be kept | `number` | `30` | no |
| backup\_runs\_list\_max\_results | The max amount of backups to list when fetching internal backup runs for the instance. This number must be larger than the amount of backups you wish to keep. E.g. for a daily backup schedule and a backup\_retention\_time of 30 days, you'd need to set this to at least 31 for old backups to get deleted. | `number` | `31` | no |
| backup\_schedule | The cron schedule to execute the internal backup | `string` | `"45 2 * * *"` | no |
| compress\_export | Whether or not to compress the export when storing in the bucket; Only valid for MySQL and PostgreSQL | `bool` | `true` | no |
| connector\_params\_timeout | The end-to-end duration the connector call is allowed to run for before throwing a timeout exception. The default value is 1800 and this should be the maximum for connector methods that are not long-running operations. Otherwise, for long-running operations, the maximum timeout for a connector call is 31536000 seconds (one year). | `number` | `1800` | no |
| create\_email\_notification\_channel | Create email notification channel to send alerts | `bool` | `false` | no |
| email\_notification\_channel\_name | Name of email notification channel | `string` | `"Email Notification"` | no |
| enable\_backup\_monitoring | Whether to monitor backup workflows or not | `bool` | `false` | no |
| enable\_connector\_params | Whether to enable connector-specific parameters for Google Workflow SQL Export. | `bool` | `false` | no |
| enable\_export\_backup | Whether to create exports to GCS Buckets with this module | `bool` | `true` | no |
| enable\_export\_monitoring | Whether to monitor export workflows or not | `bool` | `false` | no |
| enable\_internal\_backup | Whether to create internal backups with this module | `bool` | `true` | no |
| export\_databases | The list of databases that should be exported - if is an empty set all databases will be exported | `set(string)` | `[]` | no |
| export\_monitoring\_frequency | Timeframe in which there should be at least one successful export | `string` | `"1d"` | no |
| export\_schedule | The cron schedule to execute the export to GCS | `string` | `"15 3 * * *"` | no |
| export\_uri | The bucket and path uri for exporting to GCS | `string` | n/a | yes |
| log\_db\_name\_to\_export | Whether or not to log database name in the export workflow | `bool` | `false` | no |
| monitoring\_email | Email address to send alerts | `string` | `null` | no |
| project\_id | The project ID | `string` | n/a | yes |
| region | The region where to run the workflow | `string` | `"us-central1"` | no |
| scheduler\_timezone | The Timezone in which the Scheduler Jobs are triggered | `string` | `"Etc/GMT"` | no |
Expand Down
71 changes: 67 additions & 4 deletions modules/backup/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
locals {
  # A dedicated service account is created only when the caller did not supply
  # one (null or empty string); otherwise the supplied account is used as-is.
  create_service_account = (var.service_account == null || var.service_account == "")
  service_account = (
    local.create_service_account
    ? google_service_account.sql_backup_serviceaccount[0].email
    : var.service_account
  )

  # Names shared by the workflow, its scheduler job and its alert policy.
  backup_name = "sql-backup-${var.sql_instance}${var.unique_suffix}"

  # The export name follows the replica instance when the exporter is
  # configured to run against the replica, and the primary instance otherwise.
  export_name = (
    var.use_sql_instance_replica_in_exporter
    ? "sql-export-${var.sql_instance_replica}${var.unique_suffix}"
    : "sql-export-${var.sql_instance}${var.unique_suffix}"
  )
}


Expand Down Expand Up @@ -52,14 +54,23 @@ data "google_sql_database_instance" "backup_instance" {
project = var.project_id
}

# Optional email notification channel that receives the backup/export alerts.
# It is only created when var.create_email_notification_channel is true.
# NOTE(review): no `project` is set here, unlike the other resources in this
# module, so the provider's default project applies — confirm that is intended.
resource "google_monitoring_notification_channel" "email" {
  count = var.create_email_notification_channel ? 1 : 0

  type         = "email"
  display_name = var.email_notification_channel_name

  labels = {
    # Recipient of the alert emails.
    email_address = var.monitoring_email
  }
}

################################
# #
# Internal Backups #
# #
################################
resource "google_workflows_workflow" "sql_backup" {
count = var.enable_internal_backup ? 1 : 0
name = "sql-backup-${var.sql_instance}${var.unique_suffix}"
name = local.backup_name
region = var.region
description = "Workflow for backing up the CloudSQL Instance "
project = var.project_id
Expand All @@ -74,7 +85,7 @@ resource "google_workflows_workflow" "sql_backup" {

resource "google_cloud_scheduler_job" "sql_backup" {
count = var.enable_internal_backup ? 1 : 0
name = "sql-backup-${var.sql_instance}${var.unique_suffix}"
name = local.backup_name
project = var.project_id
region = var.region
description = "Managed by Terraform - Triggers a SQL Backup via Workflows"
Expand All @@ -91,14 +102,40 @@ resource "google_cloud_scheduler_job" "sql_backup" {
}
}

# Alert when the backup workflow has not had at least one successful execution
# within var.backup_monitoring_frequency (default "1d").
# Only created when both internal backups and backup monitoring are enabled.
# NOTE(review): no `project` is set here, unlike the workflow and scheduler
# resources, so the provider's default project applies — confirm that is intended.
resource "google_monitoring_alert_policy" "sql_backup_workflow_success_alert" {
  count        = var.enable_internal_backup && var.enable_backup_monitoring ? 1 : 0
  display_name = "Failed workflow: ${local.backup_name}"
  combiner     = "OR"

  conditions {
    display_name = "Failed workflow: ${local.backup_name}"
    condition_monitoring_query_language {
      # Sums the SUCCEEDED executions of this workflow per monitoring window
      # and fires when that sum is below 1.
      query = <<-EOT
        fetch workflows.googleapis.com/Workflow
        | filter workflow_id == '${local.backup_name}'
        | metric 'workflows.googleapis.com/finished_execution_count'
        | filter metric.status == 'SUCCEEDED'
        | group_by ${var.backup_monitoring_frequency}, [value_finished_execution_count_sum: sum(value.finished_execution_count)]
        | every ${var.backup_monitoring_frequency}
        | condition val() < 1 '1'
      EOT
      duration = "3600s"
      trigger { count = 1 }
      # A window without any data points must also raise the alert: no
      # executions at all means no successful backup either.
      evaluation_missing_data = "EVALUATION_MISSING_DATA_ACTIVE"
    }
  }

  # Use a splat instead of indexing [0]: the email channel has count = 0 unless
  # var.create_email_notification_channel is true, and indexing an empty list
  # fails at plan time. The splat yields [] in that case, so the policy is
  # created without a notification channel.
  notification_channels = google_monitoring_notification_channel.email[*].id
}

################################
# #
# External Backups #
# #
################################
resource "google_workflows_workflow" "sql_export" {
count = var.enable_export_backup ? 1 : 0
name = var.use_sql_instance_replica_in_exporter ? "sql-export-${var.sql_instance_replica}${var.unique_suffix}" : "sql-export-${var.sql_instance}${var.unique_suffix}"
name = local.export_name
region = var.region
description = "Workflow for backing up the CloudSQL Instance"
project = var.project_id
Expand All @@ -120,7 +157,7 @@ resource "google_workflows_workflow" "sql_export" {

resource "google_cloud_scheduler_job" "sql_export" {
count = var.enable_export_backup ? 1 : 0
name = var.use_sql_instance_replica_in_exporter ? "sql-export-${var.sql_instance_replica}${var.unique_suffix}" : "sql-export-${var.sql_instance}${var.unique_suffix}"
name = local.export_name
project = var.project_id
region = var.region
description = "Managed by Terraform - Triggers a SQL Export via Workflows"
Expand All @@ -143,3 +180,29 @@ resource "google_storage_bucket_iam_member" "sql_instance_account" {
member = "serviceAccount:${data.google_sql_database_instance.backup_instance.service_account_email_address}"
role = "roles/storage.objectCreator"
}

# Alert when the export workflow has not had at least one successful execution
# within var.export_monitoring_frequency (default "1d").
# Only created when both export backups and export monitoring are enabled.
# NOTE(review): no `project` is set here, unlike the workflow and scheduler
# resources, so the provider's default project applies — confirm that is intended.
resource "google_monitoring_alert_policy" "sql_export_workflow_success_alert" {
  count        = var.enable_export_backup && var.enable_export_monitoring ? 1 : 0
  display_name = "Failed workflow: ${local.export_name}"
  combiner     = "OR"

  conditions {
    display_name = "Failed workflow: ${local.export_name}"
    condition_monitoring_query_language {
      # Sums the SUCCEEDED executions of this workflow per monitoring window
      # and fires when that sum is below 1.
      query = <<-EOT
        fetch workflows.googleapis.com/Workflow
        | filter workflow_id == '${local.export_name}'
        | metric 'workflows.googleapis.com/finished_execution_count'
        | filter metric.status == 'SUCCEEDED'
        | group_by ${var.export_monitoring_frequency}, [value_finished_execution_count_sum: sum(value.finished_execution_count)]
        | every ${var.export_monitoring_frequency}
        | condition val() < 1 '1'
      EOT
      duration = "3600s"
      trigger { count = 1 }
      # A window without any data points must also raise the alert: no
      # executions at all means no successful export either.
      evaluation_missing_data = "EVALUATION_MISSING_DATA_ACTIVE"
    }
  }

  # Use a splat instead of indexing [0]: the email channel has count = 0 unless
  # var.create_email_notification_channel is true, and indexing an empty list
  # fails at plan time. The splat yields [] in that case, so the policy is
  # created without a notification channel.
  notification_channels = google_monitoring_notification_channel.email[*].id
}
42 changes: 42 additions & 0 deletions modules/backup/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,45 @@ variable "use_serverless_export" {
type = bool
default = false
}

# Used as the email_address label of the email notification channel.
variable "monitoring_email" {
  type        = string
  description = "Email address to send alerts"
  default     = null
}

# Gates the backup alert policy together with enable_internal_backup.
variable "enable_backup_monitoring" {
  type        = bool
  description = "Whether to monitor backup workflows or not"
  default     = false
}

# Monitoring window for the backup alert policy. The value is interpolated
# verbatim into the policy's MQL query (`group_by` and `every`), so it must be
# a duration literal that MQL accepts, such as "1d".
variable "backup_monitoring_frequency" {
  description = "Timeframe in which there should be at least one successful backup"
  type        = string
  default     = "1d"
}

# Gates the export alert policy together with enable_export_backup.
variable "enable_export_monitoring" {
  type        = bool
  description = "Whether to monitor export workflows or not"
  default     = false
}

# Monitoring window for the export alert policy. The value is interpolated
# verbatim into the policy's MQL query (`group_by` and `every`), so it must be
# a duration literal that MQL accepts, such as "1d".
variable "export_monitoring_frequency" {
  description = "Timeframe in which there should be at least one successful export"
  type        = string
  default     = "1d"
}

# Controls whether the google_monitoring_notification_channel "email" resource
# is created (count = 1) or omitted (count = 0).
variable "create_email_notification_channel" {
  type        = bool
  description = "Create email notification channel to send alerts"
  default     = false
}

# Display name given to the email notification channel.
variable "email_notification_channel_name" {
  type        = string
  description = "Name of email notification channel"
  default     = "Email Notification"
}

0 comments on commit af8ec91

Please sign in to comment.