Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Kubescape for AI cluster scan #45

Merged
merged 2 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions hack/assistant-setup/file-search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ export async function setupFileSearch(client: OpenAI): Promise<string> {
const vectorStore = await client.beta.vectorStores.create({
name: vectorStoreName,
});
console.log(
`Created vector store '${vectorStore.name}' (ID: ${vectorStore.id})`,
);

console.log(
"Uploading files to vector store and waiting for the file batch processing to complete. This might take a few minutes...",
Expand Down
2 changes: 1 addition & 1 deletion hack/assistant-setup/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ const instructions = dedent`

async function main() {
let cfg: Config = {
projectID: "",
projectID: undefined,
assistantID: "",
};
const assistantEnv = process.env["ASSISTANT_ENV"];
Expand Down
93 changes: 93 additions & 0 deletions hack/assistant-setup/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -237,5 +237,98 @@ export function setupTools(): Array<AssistantTool> {
},
},
},
{
type: "function",
function: {
name: "kubescapeScanCluster",
description: dedent`
It serves as an all-in-one tool for vulnerability and misconfiguration scanning for the whole Kubernetes cluster.
Kubescape includes misconfiguration and vulnerability scanning as well as risk analysis and security compliance indicators.
All results are presented in context and users get many cues on what to do based on scan results.
It saves Kubernetes users and admins precious time, effort, and resources.
`,
},
},
{
type: "function",
function: {
name: "kubescapeScanWorkload",
description: dedent`
Allows you to comprehensively report on the security posture of individual workloads running in a Kubernetes cluster.
This includes both misconfiguration and image vulnerability scanning.
This scan results in information that gives a 360-degree assessment of your workload's security posture.

Usage:
# Scan a workload
kubescape scan workload {kind}/{name}
# Scan a workload in a specific namespace
kubescape scan workload {kind}/{name} --namespace {namespace}
`,
parameters: {
type: "object",
properties: {
namespace: {
type: "string",
description: "Kubernetes namespace, e.g. kube-system",
},
resource_kind: {
type: "string",
description:
"Kubernetes workload kind, e.g. Deployment or StatefulSet.",
},
resource_name: {
type: "string",
description: "Kubernetes workload name, e.g. botkube-api-server.",
},
},
required: ["resource_kind", "resource_name"],
},
},
},
{
type: "function",
function: {
name: "kubescapeScanImage",
description: dedent`
Scan an image for vulnerabilities.

Usage:
kubescape scan image "nginx"
kubescape scan image "nginx:latest"
`,
parameters: {
type: "object",
properties: {
image: {
type: "string",
description: "Image name with tag, e.g. nginx:latest",
},
},
required: ["image"],
},
},
},
{
type: "function",
function: {
name: "kubescapeScanControl",
description: dedent`
Allows you to get details about a given Kubescape issue based on ID like "C-0188" or "C-0007".

Usage:
kubescape scan control {control ID}
`,
parameters: {
type: "object",
properties: {
control: {
type: "string",
description: "Control ID, e.g. C-0188.",
},
},
required: ["control"],
},
},
},
];
}
121 changes: 74 additions & 47 deletions internal/source/ai-brain/assistant.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"strings"
"time"

"github.com/MakeNowJust/heredoc"

"github.com/kubeshop/botkube/pkg/ptr"

"github.com/kubeshop/botkube-cloud-plugins/internal/otelx"
Expand All @@ -28,52 +30,61 @@ import (
)

const (
cacheTTL = 8 * time.Hour
openAIPollInterval = 2 * time.Second
maxToolExecutionRetries = 3
quotaExceededErrCode = "quota_exceeded"
tracerName = "source.aibrain"
serviceName = "botkube-plugins-source-ai-brain"
clusterScanSubcommandName = "scan"

clusterScanPrompt = `
Scan the Kubernetes cluster for critical issues that could significantly impact the cluster's health, stability, or security.
Focus on problems that may not be immediately apparent through events or standard monitoring.

Provide a concise overview of the scan results, including the total number of
critical issues found. If there were no issues found for a specific check, do
not include that section in the report. List the Kubernetes objects directly
affected by the issue. Make sure that your checks are relevant to the current
state of the cluster, do not include resources that no longer exist.

Summary section needs to be at the top of the report, followed by specific checks.

Specific Checks:

Pod Health:
Identify pods in a crash-loop backoff state with a high restart count.
Identify pods that have been OOMKilled (Out of Memory Killed) multiple times.
Look for pods stuck in a pending state for an extended period.
Resource Utilization:
Identify nodes or pods with critically high CPU or memory usage (e.g., above 90% of limits).
Check for critical resource starvation issues affecting multiple pods or namespaces.
Configuration:
Look for pods running with very insecure capabilities (e.g., ALL, NET_RAW, SYS_ADMIN).
Identify pods using deprecated or insecure container images.
Check for misconfigured network policies that could expose sensitive services.
Networking:
Identify pods or services experiencing significant network latency or packet loss.
Check for network partitions or connectivity issues between critical components.

Additional Guidance for the LLM Agent:

Prioritize issues that pose the most immediate threat to the cluster's stability, performance, or security.
Filter out informational or low-severity issues that are unlikely to cause major problems.
Be as specific as possible in the descriptions. Do not exceed 2000 characters in your response.
`
cacheTTL = 8 * time.Hour
openAIPollInterval = 2 * time.Second
maxToolExecutionRetries = 3
quotaExceededErrCode = "quota_exceeded"
serviceName = "botkube-plugins-source-ai-brain"
temperature float32 = 0.1
msgSplitPattern = "\n\n---\n\n"
clusterScanSubcommandName = "scan"
multipleMessagesDelay = 500 * time.Millisecond
)

var temperature float32 = 0.1
var (
clusterScanPrompt = heredoc.Doc(`
Scan the Kubernetes cluster for critical issues that could significantly impact the cluster's health, stability, or security.
Focus on problems that may not be immediately apparent through events or standard monitoring.
Use Kubescape and kubectl tools to scan the cluster, and then aggregate the results based on the instructions.
Prioritize Kubescape scan results over the kubectl tools results. Include links for Kubescape controls which you got them from Kubescape scan results.

Provide a concise overview of the scan results, including the total number of issues found.
If there were no issues found for a specific check, do not include that section in the report.
List the Kubernetes objects directly affected by the issue.
Make sure that your checks are relevant to the current state of the cluster, do not include resources that no longer exist.

Summary section needs to be at the top of the report, followed by specific checks.
Summary outlines what are the issues and how many of them were found, and one line sentence about the overall cluster state based on the results.
Use emojis for the severity of the issues in the summary (critical/high/medium/low), and also for the headlines of the checks to distinguish them.
Use a separator "\n\n---\n\n" to split the message into TWO logical sections, no more.

Specific checks:

Pod Health:
Identify pods in a crash-loop backoff state with a high restart count.
Identify pods that have been OOMKilled (Out of Memory Killed) multiple times.
Look for pods stuck in a pending state for an extended period.
Resource Utilization:
Identify nodes or pods with critically high CPU or memory usage. By critically high we mean over 90% or more.
Check for critical resource starvation issues affecting multiple pods or namespaces.
Configuration:
Look for pods running with very insecure capabilities (e.g., ALL, NET_RAW, SYS_ADMIN).
Identify pods using deprecated or insecure container images.
Check for misconfigured network policies that could expose sensitive services.
Networking:
Identify pods or services experiencing significant network latency or packet loss.
Check for network partitions or connectivity issues between critical components.
Security
Under this section, include Security posture from Kubescape scan.

Additional Guidance for the LLM Agent:

Prioritize issues that pose the most immediate threat to the cluster's stability, performance, or security.
Skip the check output if there are no issues found for a given check. Filter out informational issues.
Be as specific as possible in the descriptions. Do not exceed 3000 characters in your response.
Don't show kubescape commands.
At the end of the message, add "Feel free to ask me to provide additional details, or help on how to resolve found issues!", without a separator, in any form you like.`)
)

type tool func(ctx context.Context, args []byte, p *Payload) (string, error)

Expand Down Expand Up @@ -105,6 +116,7 @@ func newAssistant(cfg *Config, log logrus.FieldLogger, out chan source.Event, ku
tracer := otel.Tracer(serviceName)

kcRunner := NewKubectlRunner(kubeConfigPath, tracer)
ksRunner := NewKubescapeRunner(kubeConfigPath, tracer)
bkRunner, err := NewBotkubeRunner(tracer)
if err != nil {
return nil, fmt.Errorf("while creating Botkube runner: %w", err)
Expand Down Expand Up @@ -133,6 +145,10 @@ func newAssistant(cfg *Config, log logrus.FieldLogger, out chan source.Event, ku
"kubectlLogs": kcRunner.Logs,
"botkubeGetStartupAgentConfiguration": bkRunner.GetStartupAgentConfiguration,
"botkubeGetAgentStatus": bkRunner.GetAgentStatus,
"kubescapeScanCluster": ksRunner.ScanCluster,
"kubescapeScanWorkload": ksRunner.ScanWorkload,
"kubescapeScanControl": ksRunner.ScanControl,
"kubescapeScanImage": ksRunner.ScanImage,
},
vectorStoreIDForThread: cfg.VectorStoreIDForThread,
}, nil
Expand Down Expand Up @@ -226,7 +242,7 @@ func (i *assistant) handleThread(ctx context.Context, p *Payload) (err error) {
})
run, err := i.openaiClient.CreateRun(ctx, threadID, openai.RunRequest{
AssistantID: i.assistID,
Temperature: &temperature,
Temperature: ptr.FromType(temperature),
})
if err != nil {
return fmt.Errorf("while creating a thread run: %w", err)
Expand Down Expand Up @@ -359,8 +375,19 @@ func (i *assistant) handleStatusCompleted(ctx context.Context, run openai.Run, p

textValue := i.trimCitationsIfPresent(i.log, c.Text)

i.out <- source.Event{
Message: msgAIAnswer(run, p, textValue, toolCalls),
msgs := strings.Split(textValue, msgSplitPattern)
isMultiMessage := len(msgs) > 1
for j, msg := range msgs {
isLastMessage := j == len(msgs)-1
i.out <- source.Event{
Message: msgAIAnswer(run, p, msg, toolCalls, isLastMessage),
}

if isMultiMessage {
// Workaround: pause after sending each part of a multi-part answer so the parts arrive in order within the same thread.
// The out-of-order delivery is possibly Pub/Sub related (unconfirmed).
time.Sleep(multipleMessagesDelay)
madebyrogal marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

Expand Down
12 changes: 9 additions & 3 deletions internal/source/ai-brain/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,16 @@ func binaryDependencies() map[string]api.Dependency {
"darwin/amd64": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/darwin/amd64/kubectl", kubectlVersion),
"darwin/arm64": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/darwin/arm64/kubectl", kubectlVersion),
"linux/amd64": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/linux/amd64/kubectl", kubectlVersion),
"linux/s390x": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/linux/s390x/kubectl", kubectlVersion),
"linux/ppc64le": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/linux/ppc64le/kubectl", kubectlVersion),
"linux/arm64": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/linux/arm64/kubectl", kubectlVersion),
"linux/386": fmt.Sprintf("https://dl.k8s.io/release/%s/bin/linux/386/kubectl", kubectlVersion),
},
},
kubescapeBinaryName: {
URLs: map[string]string{
"windows/amd64": fmt.Sprintf("https://github.com/kubescape/kubescape/releases/download/%s/kubescape-arm64-macos-latest", kubescapeVersion),
pkosiec marked this conversation as resolved.
Show resolved Hide resolved
"darwin/amd64": fmt.Sprintf("https://github.com/kubescape/kubescape/releases/download/%s/kubescape-macos-latest", kubescapeVersion),
"darwin/arm64": fmt.Sprintf("https://github.com/kubescape/kubescape/releases/download/%s/kubescape-macos-latest", kubescapeVersion),
"linux/amd64": fmt.Sprintf("https://github.com/kubescape/kubescape/releases/download/%s/kubescape-ubuntu-latest", kubescapeVersion),
"linux/arm64": fmt.Sprintf("https://github.com/kubescape/kubescape/releases/download/%s/kubescape-arm64-ubuntu-latest", kubescapeVersion),
pkosiec marked this conversation as resolved.
Show resolved Hide resolved
},
},
}
Expand Down
Loading
Loading