Skip to content

Commit

Permalink
Merge pull request #692 from cybozu-go/repair-queue
Browse files Browse the repository at this point in the history
Implement repair queue
  • Loading branch information
morimoto-cybozu authored Jan 25, 2024
2 parents 8d30a89 + 60afa8c commit ad3cd0f
Show file tree
Hide file tree
Showing 44 changed files with 3,383 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
suite: [functions, robustness, operators, reboot]
suite: [functions, robustness, operators, reboot, repair]
env:
SUITE: ${{ matrix.suite }}
CLUSTER: "cke-cluster.yml"
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

## [Unreleased]

### Added

- Implement repair queue in [#692](https://github.com/cybozu-go/cke/pull/692)

## [1.27.3]

### Changed
Expand Down
36 changes: 36 additions & 0 deletions cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,41 @@ type Reboot struct {
const DefaultRebootEvictionTimeoutSeconds = 600
const DefaultMaxConcurrentReboots = 1

// Repair is the set of configurations that control the repair functionality
// (the repair queue). Field descriptions and defaults follow docs/cluster.md.
type Repair struct {
// RepairProcedures is the list of repair procedures.
RepairProcedures []RepairProcedure `json:"repair_procedures"`
// MaxConcurrentRepairs is the maximum number of machines to be repaired
// concurrently. Documented default: 1.
MaxConcurrentRepairs *int `json:"max_concurrent_repairs,omitempty"`
// ProtectedNamespaces is a label selector to protect namespaces.
ProtectedNamespaces *metav1.LabelSelector `json:"protected_namespaces,omitempty"`
// EvictRetries is the number of eviction retries, not including the
// initial attempt. Documented default: 0.
EvictRetries *int `json:"evict_retries,omitempty"`
// EvictInterval is the time between eviction retries in seconds.
// Documented default: 0.
EvictInterval *int `json:"evict_interval,omitempty"`
// EvictionTimeoutSeconds is the deadline for eviction in seconds.
// Documented as "must be positive", default 600 (10 minutes).
EvictionTimeoutSeconds *int `json:"eviction_timeout_seconds,omitempty"`
}

// RepairProcedure associates a set of machine types with the repair
// operations that can be applied to machines of those types.
type RepairProcedure struct {
// MachineTypes lists the type names of the target machines to be repaired
// by this procedure.
MachineTypes []string `json:"machine_types"`
// RepairOperations is the list of repair operations for these machine types.
RepairOperations []RepairOperation `json:"repair_operations"`
}

// RepairOperation is a named repair operation: a sequence of repair steps
// plus a command used to check the health of the repaired machine.
type RepairOperation struct {
// Operation is the name of the repair operation.
Operation string `json:"operation"`
// RepairSteps is the sequence of repair steps.
RepairSteps []RepairStep `json:"repair_steps"`
// HealthCheckCommand is a command (list of strings) to check the repaired
// machine's health.
HealthCheckCommand []string `json:"health_check_command"`
// CommandTimeoutSeconds is the deadline for health retrieval in seconds.
// Documented as: zero means infinity, default 30.
CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
}

// RepairStep is a single step of a repair operation. Field descriptions and
// defaults follow docs/cluster.md.
type RepairStep struct {
// RepairCommand is a command and its arguments to repair the target machine.
RepairCommand []string `json:"repair_command"`
// CommandTimeoutSeconds is the deadline for repairing in seconds.
// Documented as: zero means infinity, default 30.
CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
// CommandRetries is the number of repair retries, not including the
// initial attempt. Documented default: 0.
CommandRetries *int `json:"command_retries,omitempty"`
// CommandInterval is the interval between repair retries in seconds.
// Documented default: 0.
CommandInterval *int `json:"command_interval,omitempty"`
// NeedDrain, if true, requests a drain of Pods on the target machine prior
// to the execution of the repair command. Documented default: false.
NeedDrain bool `json:"need_drain,omitempty"`
// WatchSeconds is the follow-up duration in seconds to watch whether the
// machine becomes healthy after the repair command. Documented default: 0.
WatchSeconds *int `json:"watch_seconds,omitempty"`
}

// Default values for the optional repair settings.
const (
	// DefaultMaxConcurrentRepairs is the default number of machines
	// repaired concurrently.
	DefaultMaxConcurrentRepairs = 1

	// DefaultRepairEvictionTimeoutSeconds is the default eviction deadline
	// for repair, in seconds (10 minutes).
	DefaultRepairEvictionTimeoutSeconds = 600

	// DefaultRepairHealthCheckCommandTimeoutSeconds is the default deadline
	// for a health check command, in seconds.
	DefaultRepairHealthCheckCommandTimeoutSeconds = 30

	// DefaultRepairCommandTimeoutSeconds is the default deadline for a
	// repair command, in seconds.
	DefaultRepairCommandTimeoutSeconds = 30
)

// Options is a set of optional parameters for k8s components.
type Options struct {
Etcd EtcdParams `json:"etcd"`
Expand All @@ -307,6 +342,7 @@ type Cluster struct {
DNSServers []string `json:"dns_servers"`
DNSService string `json:"dns_service"`
Reboot Reboot `json:"reboot"`
Repair Repair `json:"repair"`
Options Options `json:"options"`
}

Expand Down
85 changes: 85 additions & 0 deletions cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cke

import (
"os"
"slices"
"testing"

"github.com/google/go-cmp/cmp"
Expand Down Expand Up @@ -131,6 +132,90 @@ func testClusterYAML(t *testing.T) {
if c.Reboot.ProtectedNamespaces.MatchLabels["app"] != "sample" {
t.Error(`c.Reboot.ProtectedNamespaces.MatchLabels["app"] != "sample"`)
}
if len(c.Repair.RepairProcedures) != 1 {
t.Fatal(`len(c.Repair.RepairProcedures) != 1`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].MachineTypes, []string{"Cray-1", "Cray-2"}) {
t.Error(`c.Repair.RepairProcedures[0].MachineTypes != {"Cray-1", "Cray-2"}`)
}
if len(c.Repair.RepairProcedures[0].RepairOperations) != 1 {
t.Fatal(`len(c.Repair.RepairProcedures[0].RepairOperations) != 1`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].Operation != "unreachable" {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].OperationName != "unreachable"`)
}
if len(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps) != 2 {
t.Fatal(`len(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps) != 2`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand, []string{"reset", "remotely"}) {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand != {"reset", "remotely"}`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds != 10 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds != 10`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries != 1 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries != 1`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval != 5 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval != 5`)
}
if !c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].NeedDrain {
t.Fatal(`!c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].NeedDrain`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds != 60 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds != 60`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].RepairOperations[0].HealthCheckCommand, []string{"knock"}) {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].HealthCheckCommand != {"knock"}`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds != 30 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds != 30`)
}
if c.Repair.MaxConcurrentRepairs == nil {
t.Fatal(`c.Repair.MaxConcurrentRepairs == nil`)
}
if *c.Repair.MaxConcurrentRepairs != 2 {
t.Error(`*c.Repair.MaxConcurrentRepairs != 2`)
}
if c.Repair.ProtectedNamespaces == nil {
t.Fatal(`c.Repair.ProtectedNamespaces == nil`)
}
if c.Repair.ProtectedNamespaces.MatchLabels["app"] != "protected" {
t.Error(`c.Repair.ProtectedNamespaces.MatchLabels["app"] != "protected"`)
}
if c.Repair.EvictRetries == nil {
t.Fatal(`c.Repair.EvictRetries == nil`)
}
if *c.Repair.EvictRetries != 3 {
t.Error(`*c.Repair.EvictRetries != 3`)
}
if c.Repair.EvictInterval == nil {
t.Fatal(`c.Repair.EvictInterval == nil`)
}
if *c.Repair.EvictInterval != 5 {
t.Error(`*c.Repair.EvictInterval != 5`)
}
if c.Repair.EvictionTimeoutSeconds == nil {
t.Fatal(`c.Repair.EvictionTimeoutSeconds == nil`)
}
if *c.Repair.EvictionTimeoutSeconds != 120 {
t.Error(`*c.Repair.EvictionTimeoutSeconds != 120`)
}
if c.Options.Etcd.VolumeName != "myetcd" {
t.Error(`c.Options.Etcd.VolumeName != "myetcd"`)
}
Expand Down
56 changes: 56 additions & 0 deletions docs/ckecli.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ $ ckecli [--config FILE] <subcommand> args...
- [`ckecli reboot-queue cancel INDEX`](#ckecli-reboot-queue-cancel-index)
- [`ckecli reboot-queue cancel-all`](#ckecli-reboot-queue-cancel-all)
- [`ckecli reboot-queue reset-backoff`](#ckecli-reboot-queue-reset-backoff)
- [`ckecli repair-queue`](#ckecli-repair-queue)
- [`ckecli repair-queue enable|disable`](#ckecli-repair-queue-enabledisable)
- [`ckecli repair-queue is-enabled`](#ckecli-repair-queue-is-enabled)
- [`ckecli repair-queue add OPERATION MACHINE_TYPE ADDRESS`](#ckecli-repair-queue-add-operation-machine_type-address)
- [`ckecli repair-queue list`](#ckecli-repair-queue-list)
- [`ckecli repair-queue delete INDEX`](#ckecli-repair-queue-delete-index)
- [`ckecli repair-queue delete-finished`](#ckecli-repair-queue-delete-finished)
- [`ckecli repair-queue delete-unfinished`](#ckecli-repair-queue-delete-unfinished)
- [`ckecli repair-queue reset-backoff`](#ckecli-repair-queue-reset-backoff)
- [`ckecli sabakan`](#ckecli-sabakan)
- [`ckecli sabakan enable|disable`](#ckecli-sabakan-enabledisable)
- [`ckecli sabakan is-enabled`](#ckecli-sabakan-is-enabled)
Expand Down Expand Up @@ -311,6 +320,53 @@ Cancel all the reboot queue entries.
Reset `drain_backoff_count` and `drain_backoff_expire` of the entries in reboot queue.
Resetting these values makes CKE try to reboot nodes again immediately.

## `ckecli repair-queue`

Control a queue of repair requests.

### `ckecli repair-queue enable|disable`

Enable/Disable processing repair queue entries.

### `ckecli repair-queue is-enabled`

Show whether the repair queue is enabled or disabled.
This displays `true` or `false`.

### `ckecli repair-queue add OPERATION MACHINE_TYPE ADDRESS`

Append a repair request to the repair queue.
The repair target is a machine with an IP address `ADDRESS` and a machine type `MACHINE_TYPE`.
The machine should be processed with an operation `OPERATION`.

### `ckecli repair-queue list`

List the entries in the repair queue.

### `ckecli repair-queue delete INDEX`

Delete the specified repair queue entry.
This has two effects: it clears up an old entry if the specified entry has finished, and it cancels an ongoing entry otherwise.

Unlike the reboot queue, repair queue entries remain in the queue even after they finish.

### `ckecli repair-queue delete-finished`

Delete all finished repair queue entries.
Entries in `succeeded` or `failed` status are deleted.
This displays the index numbers of deleted entries, one per line.

### `ckecli repair-queue delete-unfinished`

Delete all unfinished repair queue entries.
Entries not in `succeeded` or `failed` status are deleted.
This displays the index numbers of deleted entries, one per line.

### `ckecli repair-queue reset-backoff`

Reset `drain_backoff_count` and `drain_backoff_expire` of the entries in repair queue.
Resetting these values makes CKE try to drain machines again immediately.

## `ckecli sabakan`

Control [sabakan integration feature](sabakan-integration.md).
Expand Down
46 changes: 45 additions & 1 deletion docs/cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ a YAML or JSON object with these fields:
- [Node](#node)
- [Taint](#taint)
- [Reboot](#reboot)
- [Repair](#repair)
- [RepairProcedure](#repairprocedure)
- [Options](#options)
- [ServiceParams](#serviceparams)
- [Mount](#mount)
Expand All @@ -27,6 +29,7 @@ a YAML or JSON object with these fields:
| `dns_servers` | false | array | List of upstream DNS server IP addresses. |
| `dns_service` | false | string | Upstream DNS service name with namespace as `namespace/service`. |
| `reboot` | false | `Reboot` | See [Reboot](#reboot). |
| `repair` | false | `Repair` | See [Repair](#repair). |
| `options` | false | `Options` | See [Options](#options). |

* `control_plane_tolerations` is used in [sabakan integration](sabakan-integration.md#strategy).
Expand Down Expand Up @@ -68,7 +71,7 @@ Reboot
------

| Name | Required | Type | Description |
|----------------------------| -------- | -------------------------------- |-------------------------------------------------------------------------|
| -------------------------- | -------- | -------------------------------- | ----------------------------------------------------------------------- |
| `reboot_command` | true | array | A command to reboot. List of strings. |
| `boot_check_command` | true | array | A command to check nodes booted. List of strings. |
| `eviction_timeout_seconds` | false | *int | Deadline for eviction. Must be positive. Default: 600 (10 minutes). |
Expand Down Expand Up @@ -98,6 +101,47 @@ The Pods in the non-protected namespaces are also tried to be deleted gracefully

If `protected_namespaces` is not given, all namespaces are protected.

Repair
------

| Name | Required | Type | Description |
| -------------------------- | -------- | -------------------------------- | --------------------------------------------------------------------- |
| `repair_procedures` | true | `[]RepairProcedure` | List of [repair procedures](#repairprocedure). |
| `max_concurrent_repairs` | false | \*int | Maximum number of machines to be repaired concurrently. Default: 1 |
| `protected_namespaces` | false | [`LabelSelector`][LabelSelector] | A label selector to protect namespaces. |
| `evict_retries` | false | \*int | Number of eviction retries, not including initial attempt. Default: 0 |
| `evict_interval` | false | \*int | Interval of time between eviction retries in seconds. Default: 0 |
| `eviction_timeout_seconds` | false | \*int | Deadline for eviction. Must be positive. Default: 600 (10 minutes) |

The repair configurations control the [repair functionality](repair.md).

### RepairProcedure

| Name | Required | Type | Description |
| ------------------- | -------- | ------------------- | ------------------------------------------------------------------------------------ |
| `machine_types` | true | array | Type names of the target machines to be repaired by this procedure. List of strings. |
| `repair_operations` | true | `[]RepairOperation` | List of [repair operations](#repairoperation). |

#### RepairOperation

| Name | Required | Type | Description |
| ------------------------- | -------- | -------------- | --------------------------------------------------------------- |
| `operation` | true | string | Name of repair operation. |
| `repair_steps` | true | `[]RepairStep` | Sequence of [repair steps](#repairstep). |
| `health_check_command` | true | array | A command to check repaired machine's health. List of strings. |
| `command_timeout_seconds` | false | \*int | Deadline for health retrieval. Zero means infinity. Default: 30 |

##### RepairStep

| Name | Required | Type | Description |
| ------------------------- | -------- | ----- | -------------------------------------------------------------------------------------------------------------------------------- |
| `repair_command` | true | array | A command and its arguments to repair the target machine. List of strings. |
| `command_timeout_seconds` | false | \*int | Deadline for repairing. Zero means infinity. Default: 30 |
| `command_retries` | false | \*int | Number of repair retries, not including initial attempt. Default: 0 |
| `command_interval` | false | \*int | Interval of time between repair retries in seconds. Default: 0 |
| `need_drain` | false | bool | If true, perform drain of Pods on the target machine prior to the execution of the repair command. Default: false |
| `watch_seconds` | false | \*int | Follow-up duration in seconds to watch whether the machine becomes healthy after the execution of the repair command. Default: 0 |

Options
-------

Expand Down
Loading

0 comments on commit ad3cd0f

Please sign in to comment.