Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
clean completed jobs after 24h
Browse files Browse the repository at this point in the history
  • Loading branch information
suiguoxin committed Mar 19, 2021
1 parent c38d54c commit 68732d5
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/alert-manager/deploy/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rules:
verbs: ["patch"]
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["create"]
verbs: ["create", "list", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
Expand Down
41 changes: 40 additions & 1 deletion src/alert-manager/src/alert-handler/controllers/node.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ const getK8sV1Job = (jobName, nodeName, minorNumber) => {
name: jobName,
},
spec: {
ttlSecondsAfterFinished: 86400, // TODO: enable this feature when install k8s / delete the job elsewhere
// The TTL-after-finished feature is still alpha (as of Kubernetes 1.15).
// To avoid relying on this feature, jobs are cleaned up regularly by the function `cleanCompletedfixNvidiaGPULowPerfJobs`.
// ttlSecondsAfterFinished: 86400,
template: {
metadata: {
name: 'nvidia-gpu-low-perf-fixer',
Expand Down Expand Up @@ -170,8 +172,45 @@ const fixNvidiaGPULowPerf = (req, res) => {
});
};

// Clean up finished jobs that were created to fix the NvidiaGPULowPerf issue.
//
// Lists the jobs in the `default` namespace and deletes every job whose name
// starts with `nvidia-gpu-low-perf-fixer-`, that has succeeded or failed, and
// that finished more than 24 hours ago.
//
// `req` and `res` are accepted for signature compatibility but are not used.
//
// Returns a promise that settles once the listing and every delete attempt
// have finished. It never rejects: all errors are logged instead.
const cleanCompletedfixNvidiaGPULowPerfJobs = (req, res) => {
  logger.info(
    'Cleaning completed jobs which were used to fix NvidiaGPULowPerf issue...',
  );

  const jobNamePrefix = 'nvidia-gpu-low-perf-fixer-';
  const retentionMs = 24 * 60 * 60 * 1000; // keep finished jobs for 24h

  // Returns the time at which the job finished, or undefined if unknown.
  // NOTE(review): `completionTime` is presumably only populated for jobs that
  // succeeded, so for failed jobs the transition time of the `Failed`
  // condition is used instead -- confirm against the Kubernetes Job API.
  const getFinishTime = (status) => {
    if (status.completionTime) {
      return status.completionTime;
    }
    const failedCondition = (status.conditions || []).find(
      (condition) => condition.type === 'Failed' && condition.status === 'True',
    );
    return failedCondition ? failedCondition.lastTransitionTime : undefined;
  };

  // A job is expired when it is one of the fixer jobs, has finished, and its
  // finish time is older than the retention period.
  const isExpired = (job, now) => {
    const status = job.status || {};
    if (!job.metadata.name.startsWith(jobNamePrefix)) {
      return false;
    }
    if (status.succeeded !== 1 && status.failed !== 1) {
      return false;
    }
    const finishTime = getFinishTime(status);
    if (!finishTime) {
      return false;
    }
    // An unparsable time yields NaN, which compares as false (job is kept).
    return now - new Date(finishTime).getTime() > retentionMs;
  };

  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
  return k8sApi
    .listNamespacedJob('default')
    .then((response) => {
      logger.info(`Successfully get job list.`);
      const now = Date.now();
      const expiredJobNames = response.body.items
        .filter((job) => isExpired(job, now))
        .map((job) => job.metadata.name);

      // Each delete handles its own failure so that one failing job does not
      // prevent the remaining jobs from being cleaned.
      return Promise.all(
        expiredJobNames.map((jobName) =>
          k8sApi
            .deleteNamespacedJob(jobName, 'default')
            .then(() => {
              logger.info(`Successfully deleted job ${jobName}`);
            })
            .catch((error) => {
              logger.error(`Failed to delete job ${jobName}`, error);
            }),
        ),
      );
    })
    .catch((error) => {
      logger.error('Failed to list jobs:', error);
    });
};

// module exports
// Functions exposed by this module to the rest of the alert-handler.
// `cleanCompletedfixNvidiaGPULowPerfJobs` is exported so that it can be
// invoked from outside this file (e.g. on a timer).
module.exports = {
cordonNodes,
fixNvidiaGPULowPerf,
cleanCompletedfixNvidiaGPULowPerfJobs,
};
7 changes: 7 additions & 0 deletions src/alert-manager/src/alert-handler/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ require('module-alias/register');
const express = require('express');
const bearerToken = require('express-bearer-token');
const actions = require('@alert-handler/routes/actions');
const nodeController = require('@alert-handler/controllers/node');
const logger = require('@alert-handler/common/logger');

const app = express();
Expand All @@ -36,3 +37,9 @@ const port = parseInt(process.env.SERVER_PORT);
// Start the HTTP server and log the address once it is accepting connections.
const onListening = () => {
  logger.info(`alert-handler listening at http://localhost:${port}`);
};
app.listen(port, onListening);

// Once per hour, sweep the completed jobs that were created to fix the
// NvidiaGPULowPerf issue.
const CLEANUP_INTERVAL_MS = 60 * 60 * 1000;
setInterval(
  nodeController.cleanCompletedfixNvidiaGPULowPerfJobs,
  CLEANUP_INTERVAL_MS,
);

0 comments on commit 68732d5

Please sign in to comment.