Skip to content

Commit

Permalink
feat(region,host): start pod when container exited (#21258)
Browse files Browse the repository at this point in the history
  • Loading branch information
zexi authored Oct 3, 2024
1 parent 12d77b0 commit 2e7b443
Show file tree
Hide file tree
Showing 23 changed files with 2,072 additions and 80 deletions.
56 changes: 34 additions & 22 deletions pkg/apis/compute/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package compute

import (
"reflect"
"time"

"yunion.io/x/jsonutils"
"yunion.io/x/pkg/gotypes"
Expand Down Expand Up @@ -55,34 +56,36 @@ const (
)

const (
CONTAINER_STATUS_PULLING_IMAGE = "pulling_image"
CONTAINER_STATUS_PULL_IMAGE_FAILED = "pull_image_failed"
CONTAINER_STATUS_PULLED_IMAGE = "pulled_image"
CONTAINER_STATUS_CREATING = "creating"
CONTAINER_STATUS_CREATE_FAILED = "create_failed"
CONTAINER_STATUS_SAVING_IMAGE = "saving_image"
CONTAINER_STATUS_SAVE_IMAGE_FAILED = "save_image_failed"
CONTAINER_STATUS_STARTING = "starting"
CONTAINER_STATUS_START_FAILED = "start_failed"
CONTAINER_STATUS_STOPPING = "stopping"
CONTAINER_STATUS_STOP_FAILED = "stop_failed"
CONTAINER_STATUS_SYNC_STATUS = "sync_status"
CONTAINER_STATUS_SYNC_STATUS_FAILED = "sync_status_failed"
CONTAINER_STATUS_UNKNOWN = "unknown"
CONTAINER_STATUS_CREATED = "created"
CONTAINER_STATUS_EXITED = "exited"
CONTAINER_STATUS_RUNNING = "running"
CONTAINER_STATUS_DELETING = "deleting"
CONTAINER_STATUS_DELETE_FAILED = "delete_failed"
CONTAINER_STATUS_COMMITTING = "committing"
CONTAINER_STATUS_COMMIT_FAILED = "commit_failed"
CONTAINER_STATUS_PULLING_IMAGE = "pulling_image"
CONTAINER_STATUS_PULL_IMAGE_FAILED = "pull_image_failed"
CONTAINER_STATUS_PULLED_IMAGE = "pulled_image"
CONTAINER_STATUS_CREATING = "creating"
CONTAINER_STATUS_CREATE_FAILED = "create_failed"
CONTAINER_STATUS_SAVING_IMAGE = "saving_image"
CONTAINER_STATUS_SAVE_IMAGE_FAILED = "save_image_failed"
CONTAINER_STATUS_STARTING = "starting"
CONTAINER_STATUS_START_FAILED = "start_failed"
CONTAINER_STATUS_STOPPING = "stopping"
CONTAINER_STATUS_STOP_FAILED = "stop_failed"
CONTAINER_STATUS_SYNC_STATUS = "sync_status"
CONTAINER_STATUS_SYNC_STATUS_FAILED = "sync_status_failed"
CONTAINER_STATUS_UNKNOWN = "unknown"
CONTAINER_STATUS_CREATED = "created"
CONTAINER_STATUS_EXITED = "exited"
CONTAINER_STATUS_CRASH_LOOP_BACK_OFF = "crash_loop_back_off"
CONTAINER_STATUS_RUNNING = "running"
CONTAINER_STATUS_DELETING = "deleting"
CONTAINER_STATUS_DELETE_FAILED = "delete_failed"
CONTAINER_STATUS_COMMITTING = "committing"
CONTAINER_STATUS_COMMIT_FAILED = "commit_failed"
// for health check
CONTAINER_STATUS_PROBING = "probing"
CONTAINER_STATUS_PROBE_FAILED = "probe_failed"
)

var (
ContainerRunningStatus = sets.NewString(CONTAINER_STATUS_RUNNING, CONTAINER_STATUS_PROBING)
ContainerExitedStatus = sets.NewString(CONTAINER_STATUS_EXITED, CONTAINER_STATUS_CRASH_LOOP_BACK_OFF)
)

const (
Expand Down Expand Up @@ -132,7 +135,9 @@ type ContainerStopInput struct {
}

type ContainerSyncStatusResponse struct {
Status string `json:"status"`
Status string `json:"status"`
StartedAt time.Time `json:"started_at"`
RestartCount int `json:"restart_count"`
}

type ContainerHostDevice struct {
Expand Down Expand Up @@ -232,3 +237,10 @@ type KubeServerContainerRegistryDetails struct {
Type string `json:"type"`
Config *KubeServerContainerRegistryConfig `json:"config"`
}

type ContainerPerformStatusInput struct {
apis.PerformStatusInput
RestartCount int `json:"restart_count"`
StartedAt *time.Time `json:"started_at"`
LastFinishedAt *time.Time `json:"last_finished_at"`
}
2 changes: 2 additions & 0 deletions pkg/apis/compute/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ const (
POD_STATUS_DELETE_CONTAINER_FAILED = "delete_container_failed"
POD_STATUS_SYNCING_CONTAINER_STATUS = "syncing_container_status"
POD_STATUS_SYNCING_CONTAINER_STATUS_FAILED = "sync_container_status_failed"
POD_STATUS_CRASH_LOOP_BACK_OFF = "crash_loop_back_off"
POD_STATUS_CONTAINER_EXITED = "container_exited"
)

const (
Expand Down
18 changes: 12 additions & 6 deletions pkg/apis/host/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
package host

import (
"time"

"yunion.io/x/onecloud/pkg/apis"
)

Expand Down Expand Up @@ -86,9 +88,10 @@ type ContainerDiskDevice struct {
}

type ContainerCreateInput struct {
Name string `json:"name"`
GuestId string `json:"guest_id"`
Spec *ContainerSpec `json:"spec"`
Name string `json:"name"`
GuestId string `json:"guest_id"`
Spec *ContainerSpec `json:"spec"`
RestartCount int `json:"restart_count"`
}

type ContainerPullImageInput struct {
Expand All @@ -103,9 +106,12 @@ type ContainerPushImageInput struct {
}

type ContainerDesc struct {
Id string `json:"id"`
Name string `json:"name"`
Spec *ContainerSpec `json:"spec"`
Id string `json:"id"`
Name string `json:"name"`
Spec *ContainerSpec `json:"spec"`
StartedAt time.Time `json:"started_at"`
LastFinishedAt time.Time `json:"last_finished_at"`
RestartCount int `json:"restart_count"`
}

type ContainerSaveVolumeMountToImageInput struct {
Expand Down
7 changes: 4 additions & 3 deletions pkg/compute/guestdrivers/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,9 +426,10 @@ func (p *SPodDriver) getContainerCreateInput(ctx context.Context, userCred mccli
return nil, errors.Wrap(err, "ToHostContainerSpec")
}
input := &hostapi.ContainerCreateInput{
Name: ctr.GetName(),
GuestId: ctr.GuestId,
Spec: spec,
Name: ctr.GetName(),
GuestId: ctr.GuestId,
Spec: spec,
RestartCount: ctr.RestartCount,
}
return input, nil
}
Expand Down
37 changes: 31 additions & 6 deletions pkg/compute/models/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ type SContainer struct {
GuestId string `width:"36" charset:"ascii" create:"required" list:"user" index:"true"`
// Spec stores all container running options
Spec *api.ContainerSpec `length:"long" create:"required" list:"user" update:"user"`

// 启动时间
StartedAt time.Time `nullable:"false" created_at:"false" index:"true" get:"user" list:"user" json:"started_at"`
// 上次退出时间
LastFinishedAt time.Time `nullable:"true" created_at:"false" index:"true" get:"user" list:"user" json:"last_finished_at"`

// 重启次数
RestartCount int `nullable:"true" list:"user"`
}

func (m *SContainerManager) CreateOnPod(
Expand Down Expand Up @@ -593,9 +601,12 @@ func (c *SContainer) GetJsonDescAtHost(ctx context.Context, userCred mcclient.To
return nil, errors.Wrap(err, "ToHostContainerSpec")
}
return &hostapi.ContainerDesc{
Id: c.GetId(),
Name: c.GetName(),
Spec: spec,
Id: c.GetId(),
Name: c.GetName(),
Spec: spec,
StartedAt: c.StartedAt,
LastFinishedAt: c.LastFinishedAt,
RestartCount: c.RestartCount,
}, nil
}

Expand Down Expand Up @@ -755,13 +766,27 @@ func (c *SContainer) GetReleasedDevices(ctx context.Context, userCred mcclient.T
return out, nil
}

func (c *SContainer) PerformStatus(ctx context.Context, userCred mcclient.TokenCredential, query jsonutils.JSONObject, input apis.PerformStatusInput) (jsonutils.JSONObject, error) {
if c.GetStatus() == api.CONTAINER_STATUS_EXITED {
func (c *SContainer) PerformStatus(ctx context.Context, userCred mcclient.TokenCredential, query jsonutils.JSONObject, input api.ContainerPerformStatusInput) (jsonutils.JSONObject, error) {
if api.ContainerExitedStatus.Has(c.GetStatus()) {
if input.Status == api.CONTAINER_STATUS_PROBE_FAILED {
return nil, httperrors.NewInputParameterError("can't set container status to %s when %s", input.Status, c.Status)
}
}
return c.SVirtualResourceBase.PerformStatus(ctx, userCred, query, input)
if _, err := db.Update(c, func() error {
if input.RestartCount > 0 {
c.RestartCount = input.RestartCount
}
if input.StartedAt != nil {
c.StartedAt = *input.StartedAt
}
if input.LastFinishedAt != nil {
c.LastFinishedAt = *input.LastFinishedAt
}
return nil
}); err != nil {
return nil, errors.Wrap(err, "Update container status")
}
return c.SVirtualResourceBase.PerformStatus(ctx, userCred, query, input.PerformStatusInput)
}

func (c *SContainer) getContainerHostCommitInput(ctx context.Context, userCred mcclient.TokenCredential, input *api.ContainerCommitInput) (*hostapi.ContainerCommitInput, error) {
Expand Down
2 changes: 1 addition & 1 deletion pkg/compute/models/guest_actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -3461,7 +3461,7 @@ func (self *SGuest) PerformStatus(ctx context.Context, userCred mcclient.TokenCr
func (self *SGuest) PerformStop(ctx context.Context, userCred mcclient.TokenCredential, query jsonutils.JSONObject,
input api.ServerStopInput) (jsonutils.JSONObject, error) {
// XXX if is force, force stop guest
if input.IsForce || utils.IsInStringArray(self.Status, []string{api.VM_RUNNING, api.VM_STOP_FAILED}) {
if input.IsForce || utils.IsInStringArray(self.Status, []string{api.VM_RUNNING, api.VM_STOP_FAILED, api.POD_STATUS_CRASH_LOOP_BACK_OFF, api.POD_STATUS_CONTAINER_EXITED}) {
if err := self.ValidateEncryption(ctx, userCred); err != nil {
return nil, errors.Wrap(httperrors.ErrForbidden, "encryption key not accessible")
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/compute/tasks/container_sync_status_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ func (t *ContainerSyncStatusTask) OnSyncStatus(ctx context.Context, container *m
resp := new(api.ContainerSyncStatusResponse)
data.Unmarshal(resp)
container.SetStatus(ctx, t.GetUserCred(), resp.Status, "")
if _, err := db.Update(container, func() error {
if resp.RestartCount > 0 {
container.RestartCount = resp.RestartCount
}
container.StartedAt = resp.StartedAt
return nil
}); err != nil {
log.Errorf("Update container started_at: %s", err)
}
t.SetStageComplete(ctx, nil)
}

Expand Down
8 changes: 5 additions & 3 deletions pkg/hostman/container/status/status_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ func (m *manager) SetContainerStartup(podId string, containerId string, started
if started {
status = computeapi.CONTAINER_STATUS_RUNNING
}
input := &apis.PerformStatusInput{
Status: status,
Reason: result.Reason,
input := &computeapi.ContainerPerformStatusInput{
PerformStatusInput: apis.PerformStatusInput{
Status: status,
Reason: result.Reason,
},
}
if _, err := hostutils.UpdateContainerStatus(context.Background(), containerId, input); err != nil {
log.Errorf("set container(%s/%s) status failed: %s", podId, containerId, err)
Expand Down
20 changes: 20 additions & 0 deletions pkg/hostman/guestman/guestman.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"yunion.io/x/jsonutils"
"yunion.io/x/log"
"yunion.io/x/pkg/errors"
"yunion.io/x/pkg/util/clock"
"yunion.io/x/pkg/util/seclib"
"yunion.io/x/pkg/utils"

Expand All @@ -44,6 +45,8 @@ import (
"yunion.io/x/onecloud/pkg/hostman/guestman/desc"
fwd "yunion.io/x/onecloud/pkg/hostman/guestman/forwarder"
fwdpb "yunion.io/x/onecloud/pkg/hostman/guestman/forwarder/api"
"yunion.io/x/onecloud/pkg/hostman/guestman/pod/pleg"
"yunion.io/x/onecloud/pkg/hostman/guestman/pod/runtime"
"yunion.io/x/onecloud/pkg/hostman/guestman/types"
deployapi "yunion.io/x/onecloud/pkg/hostman/hostdeployer/apis"
"yunion.io/x/onecloud/pkg/hostman/hostinfo/hostconsts"
Expand Down Expand Up @@ -111,6 +114,9 @@ type SGuestManager struct {
// container related members
containerProbeManager prober.Manager
enableDirtyRecoveryFeature bool
containerRuntimeManager runtime.Runtime
pleg pleg.PodLifecycleEventGenerator
podCache runtime.Cache
}

func NewGuestManager(host hostutils.IHost, serversPath string, workerCnt int) (*SGuestManager, error) {
Expand Down Expand Up @@ -138,6 +144,20 @@ func NewGuestManager(host hostutils.IHost, serversPath string, workerCnt int) (*
}
if manager.host.IsContainerHost() {
manager.startContainerProbeManager()
runtimeMan, err := runtime.NewRuntimeManager(manager.GetCRI())
if err != nil {
return nil, errors.Wrap(err, "new container runtime manager")
}
manager.podCache = runtime.NewCache()
manager.containerRuntimeManager = runtimeMan
manager.pleg = pleg.NewGenericPLEG(runtimeMan, pleg.ChannelCapacity, pleg.RelistPeriod, manager.podCache, clock.RealClock{})
manager.pleg.Start()
go func() {
manager.syncContainerLoop(manager.pleg.Watch())
}()
go func() {
manager.reconcileContainerLoop(manager.podCache)
}()
}
return manager, nil
}
Expand Down
Loading

0 comments on commit 2e7b443

Please sign in to comment.