Skip to content

Commit

Permalink
fix(aws): CleanupAlarmsAgent cycle to catch exceptions (#6333)
Browse files (browse the repository at this point in the history)
  • Loading branch information
christosarvanitis authored Jan 22, 2025
1 parent fcc5394 commit c4df136
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 27 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -86,36 +86,38 @@ class CleanupAlarmsAgent implements RunnableAgent, CustomScheduledAgent {
getAccounts().each { NetflixAmazonCredentials credentials ->
credentials.regions.each { AmazonCredentials.AWSRegion region ->
log.info("Looking for alarms to delete")

def cloudWatch = amazonClientProvider.getCloudWatch(credentials, region.name)
Set<String> attachedAlarms = getAttachedAlarms(amazonClientProvider.getAutoScaling(credentials, region.name))
def describeAlarmsRequest = new DescribeAlarmsRequest().withStateValue(StateValue.INSUFFICIENT_DATA)

while (true) {
def result = cloudWatch.describeAlarms(describeAlarmsRequest)

List<MetricAlarm> alarmsToDelete = result.metricAlarms.findAll {
it.stateUpdatedTimestamp.before(DateTime.now().minusDays(daysToLeave).toDate()) &&
!attachedAlarms.contains(it.alarmName) &&
ALARM_NAME_PATTERN.matcher(it.alarmName).matches()
}

if (alarmsToDelete) {
// terminate up to 20 alarms at a time (avoids any AWS limits on # of concurrent deletes)
alarmsToDelete.collate(20).each {
log.info("Deleting ${it.size()} alarms in ${credentials.name}/${region.name} " +
"(alarms: ${it.alarmName.join(", ")})")
cloudWatch.deleteAlarms(new DeleteAlarmsRequest().withAlarmNames(it.alarmName))
Thread.sleep(500)
try {
def cloudWatch = amazonClientProvider.getCloudWatch(credentials, region.name)
Set<String> attachedAlarms = getAttachedAlarms(amazonClientProvider.getAutoScaling(credentials, region.name))
def describeAlarmsRequest = new DescribeAlarmsRequest().withStateValue(StateValue.INSUFFICIENT_DATA)

while (true) {
def result = cloudWatch.describeAlarms(describeAlarmsRequest)

List<MetricAlarm> alarmsToDelete = result.metricAlarms.findAll {
it.stateUpdatedTimestamp.before(DateTime.now().minusDays(daysToLeave).toDate()) &&
!attachedAlarms.contains(it.alarmName) &&
ALARM_NAME_PATTERN.matcher(it.alarmName).matches()
}

}
if (alarmsToDelete) {
// terminate up to 20 alarms at a time (avoids any AWS limits on # of concurrent deletes)
alarmsToDelete.collate(20).each {
log.info("Deleting ${it.size()} alarms in ${credentials.name}/${region.name} " +
"(alarms: ${it.alarmName.join(", ")})")
cloudWatch.deleteAlarms(new DeleteAlarmsRequest().withAlarmNames(it.alarmName))
Thread.sleep(500)
}
}

if (result.nextToken) {
describeAlarmsRequest.withNextToken(result.nextToken)
} else {
break
if (result.nextToken) {
describeAlarmsRequest.withNextToken(result.nextToken)
} else {
break
}
}
} catch (Exception e) {
log.error("Error occurred while processing alarms for ${credentials.name}/${region.name}: ${e.message}", e)
}
}
}
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -74,7 +74,7 @@ class CleanupDetachedInstancesAgent implements RunnableAgent, CustomScheduledAge
getAccounts().each { NetflixAmazonCredentials credentials ->
credentials.regions.each { AmazonCredentials.AWSRegion region ->
log.info("Looking for instances pending termination in ${credentials.name}:${region.name}")

try {
def amazonEC2 = amazonClientProvider.getAmazonEC2(credentials, region.name, true)
def describeInstancesRequest = new DescribeInstancesRequest().withFilters(
new Filter("tag-key", [DetachInstancesAtomicOperation.TAG_PENDING_TERMINATION])
Expand Down Expand Up @@ -103,6 +103,9 @@ class CleanupDetachedInstancesAgent implements RunnableAgent, CustomScheduledAge
break
}
}
} catch (Exception e) {
log.error("Error occurred while processing instances pending termination for ${credentials.name}/${region.name}: ${e.message}", e)
}
}
}
}
Expand Down

0 comments on commit c4df136

Please sign in to comment.