Skip to content

Commit

Permalink
feat(host): restart pod and containers when crashed (#21348)
Browse files Browse the repository at this point in the history
  • Loading branch information
zexi authored Oct 6, 2024
1 parent 5a5631a commit cc94930
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 21 deletions.
6 changes: 6 additions & 0 deletions pkg/apis/host/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,9 @@ type ContainerCommitInput struct {
Repository string `json:"repository"`
Auth *apis.ContainerPullImageAuthConfig `json:"auth"`
}

// ContainerStopInput is the typed request body for stopping a container on
// the host. It replaces reading "timeout", "shm_size_mb" and "container_name"
// directly out of a raw JSON object.
type ContainerStopInput struct {
// Timeout overrides the stop timeout handed to the CRI when it is non-zero;
// a zero value leaves the timeout StopContainer already chose untouched.
Timeout int64 `json:"timeout"`
// ShmSizeMB is the container's shm size in MB. When it is greater than 64,
// StopContainer calls unmountDevShm for ContainerName before stopping.
ShmSizeMB int `json:"shm_size_mb"`
// ContainerName is the name passed to unmountDevShm. It must be non-empty
// when ShmSizeMB is greater than 64, otherwise StopContainer returns an error.
ContainerName string `json:"container_name"`
}
40 changes: 24 additions & 16 deletions pkg/hostman/guestman/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ type PodInstance interface {
DeleteContainer(ctx context.Context, cred mcclient.TokenCredential, id string) (jsonutils.JSONObject, error)
SyncStatus(reason string)
SyncContainerStatus(ctx context.Context, cred mcclient.TokenCredential, ctrId string) (jsonutils.JSONObject, error)
StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error)
StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerStopInput) (jsonutils.JSONObject, error)
PullImage(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerPullImageInput) (jsonutils.JSONObject, error)
SaveVolumeMountToImage(ctx context.Context, userCred mcclient.TokenCredential, input *hostapi.ContainerSaveVolumeMountToImageInput, ctrId string) (jsonutils.JSONObject, error)
ExecContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *computeapi.ContainerExecInput) (*url.URL, error)
Expand Down Expand Up @@ -707,6 +707,13 @@ func (s *sPodGuestInstance) StartLocalPod(ctx context.Context, userCred mcclient
return nil
}

// ShouldRestartPodOnCrash reports whether a crashed container should be
// recovered by restarting the whole pod rather than only that container.
// It returns true when the pod has at most one container.
func (s *sPodGuestInstance) ShouldRestartPodOnCrash() bool {
	containerCount := len(s.GetContainers())
	return containerCount <= 1
}

func (s *sPodGuestInstance) startPod(ctx context.Context, userCred mcclient.TokenCredential) (*computeapi.PodStartResponse, error) {
retries := 3
sec := 5 * time.Second
Expand Down Expand Up @@ -753,8 +760,10 @@ func (s *sPodGuestInstance) _startPod(ctx context.Context, userCred mcclient.Tok
LogDirectory: s.getPodLogDir(),
DnsConfig: nil,
PortMappings: nil,
Labels: nil,
Annotations: nil,
Labels: map[string]string{
runtime.PodUIDLabel: s.GetId(),
},
Annotations: nil,
Linux: &runtimeapi.LinuxPodSandboxConfig{
CgroupParent: s.getCgroupParent(),
SecurityContext: &runtimeapi.LinuxSandboxSecurityContext{
Expand Down Expand Up @@ -1091,7 +1100,7 @@ func (s *sPodGuestInstance) doContainerStartPostLifecycle(ctx context.Context, c
return nil
}

func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) {
func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerStopInput) (jsonutils.JSONObject, error) {
criId, err := s.getContainerCRIId(ctrId)
if err != nil {
if errors.Cause(err) == errors.ErrNotFound {
Expand All @@ -1104,19 +1113,18 @@ func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient

s.expectedStatus.SetContainerStatus(criId, ctrId, computeapi.CONTAINER_STATUS_EXITED)

if body.Contains("timeout") {
timeout, _ = body.Int("timeout")
if input.Timeout != 0 {
timeout = input.Timeout
}
if body.Contains("shm_size_mb") {
shmSizeMB, _ := body.Int("shm_size_mb")
if shmSizeMB > 64 {
name, err := body.GetString("container_name")
if err != nil {
return nil, errors.Wrapf(err, "not found name from body: %s", body)
}
if err := s.unmountDevShm(name); err != nil {
return nil, errors.Wrapf(err, "unmount shm %s", name)
}
shmSizeMB := input.ShmSizeMB

if shmSizeMB > 64 {
name := input.ContainerName
if name == "" {
return nil, errors.Wrapf(errors.ErrNotFound, "not found container_name from input: %s", jsonutils.Marshal(input))
}
if err := s.unmountDevShm(name); err != nil {
return nil, errors.Wrapf(err, "unmount shm %s", name)
}
}
if err := s.getCRI().StopContainer(ctx, criId, timeout); err != nil {
Expand Down
51 changes: 51 additions & 0 deletions pkg/hostman/guestman/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
package guestman

import (
"context"
"fmt"
"strings"

"github.com/shirou/gopsutil/v3/disk"
Expand All @@ -26,6 +28,7 @@ import (
hostapi "yunion.io/x/onecloud/pkg/apis/host"
"yunion.io/x/onecloud/pkg/hostman/container/volume_mount"
"yunion.io/x/onecloud/pkg/hostman/options"
"yunion.io/x/onecloud/pkg/mcclient"
"yunion.io/x/onecloud/pkg/util/pod/image"
"yunion.io/x/onecloud/pkg/util/pod/nerdctl"
)
Expand Down Expand Up @@ -148,3 +151,51 @@ func (s *sPodGuestInstance) getVolumeMountUsage(drv volume_mount.IUsageVolumeMou
drv.InjectUsageTags(usage, vol)
return usage, nil
}

// RestartLocalPodAndContainers builds a localPodRestartTask for this pod and
// submits it to the manager's GuestStartWorker. It returns nothing; failures
// inside the task are only logged by the task itself.
func (s *sPodGuestInstance) RestartLocalPodAndContainers(ctx context.Context, cred mcclient.TokenCredential) {
	task := newLocalPodRestartTask(ctx, cred, s)
	s.manager.GuestStartWorker.Run(task, nil, nil)
}

// localPodRestartTask is the unit of work that RestartLocalPodAndContainers
// hands to GuestStartWorker. Its Run method stops every container of the pod,
// starts the pod again and then starts each container.
type localPodRestartTask struct {
// ctx is passed to every stop/start call made by Run.
ctx context.Context
// userCred is the credential passed to every stop/start call made by Run.
userCred mcclient.TokenCredential
// pod is the pod instance being restarted.
pod *sPodGuestInstance
}

// newLocalPodRestartTask returns a restart task bound to the given context,
// credential and pod. It only stores its arguments; no work is started here.
func newLocalPodRestartTask(ctx context.Context, userCred mcclient.TokenCredential, pod *sPodGuestInstance) *localPodRestartTask {
	task := new(localPodRestartTask)
	task.ctx = ctx
	task.userCred = userCred
	task.pod = pod
	return task
}

// Run restarts the pod and all of its containers on this host.
//
// It works in three phases: stop every container, start the pod, then start
// every container. Errors while stopping or starting an individual container
// are logged and do not abort the loop. If starting the pod fails, Run logs
// the error and returns without starting containers or syncing status.
func (t *localPodRestartTask) Run() {
	pod := t.pod
	log.Infof("restart pod and containers locally (%s/%s)", pod.Id, pod.GetName())

	// Phase 1: stop all containers, best effort.
	for _, c := range pod.GetContainers() {
		log.Infof("stop container locally (%s/%s/%s/%s)", pod.Id, pod.GetName(), c.Id, c.Name)
		stopInput := &hostapi.ContainerStopInput{
			// Zero keeps the timeout StopContainer picks on its own.
			Timeout:       0,
			ShmSizeMB:     c.Spec.ShmSizeMB,
			ContainerName: c.Name,
		}
		_, stopErr := pod.StopContainer(t.ctx, t.userCred, c.Id, stopInput)
		if stopErr != nil {
			log.Errorf("stop container %s error: %v", c.Name, stopErr)
		}
	}

	// Phase 2: bring the pod back; nothing else can proceed without it.
	_, podErr := pod.startPod(t.ctx, t.userCred)
	if podErr != nil {
		log.Errorf("start pod(%s/%s) err: %s", pod.GetId(), pod.GetName(), podErr.Error())
		return
	}

	// Phase 3: start all containers, best effort.
	for _, c := range pod.GetContainers() {
		log.Infof("start container locally (%s/%s/%s/%s)", pod.Id, pod.GetName(), c.Id, c.Name)
		_, startErr := pod.StartLocalContainer(t.ctx, t.userCred, c.Id)
		if startErr != nil {
			log.Errorf("start container %s err: %s", c.Id, startErr.Error())
		}
	}

	pod.SyncStatus("sync status after pod and containers restart locally")
}

// Dump returns a short description of the task, identifying the pod by its
// id and name.
func (t *localPodRestartTask) Dump() string {
	id, name := t.pod.GetId(), t.pod.GetName()
	return fmt.Sprintf("pod restart task %s/%s", id, name)
}
12 changes: 8 additions & 4 deletions pkg/hostman/guestman/pod_sync_loop.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,15 @@ func (m *SGuestManager) startContainer(obj *sPodGuestInstance, ctr *hostapi.Cont
reason := fmt.Sprintf("start died container %s when exit code is %d", ctr.Id, cs.ExitCode)
ctx := context.Background()
userCred := hostutils.GetComputeSession(ctx).GetToken()
_, err := obj.StartLocalContainer(ctx, userCred, ctr.Id)
if err != nil {
return errors.Wrap(err, reason)
if obj.ShouldRestartPodOnCrash() {
obj.RestartLocalPodAndContainers(ctx, userCred)
} else {
log.Infof("%s: start local container (%s/%s) success", reason, obj.GetId(), ctr.Name)
_, err := obj.StartLocalContainer(ctx, userCred, ctr.Id)
if err != nil {
return errors.Wrap(err, reason)
} else {
log.Infof("%s: start local container (%s/%s) success", reason, obj.GetId(), ctr.Name)
}
}
return nil
}
Expand Down
6 changes: 5 additions & 1 deletion pkg/hostman/guestman/podhandlers/podhandlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ func startContainer(ctx context.Context, userCred mcclient.TokenCredential, pod
}

func stopContainer(ctx context.Context, userCred mcclient.TokenCredential, pod guestman.PodInstance, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) {
return pod.StopContainer(ctx, userCred, ctrId, body)
input := new(hostapi.ContainerStopInput)
if err := body.Unmarshal(input); err != nil {
return nil, errors.Wrapf(err, "unmarshal to ContainerStopInput: %s", body.String())
}
return pod.StopContainer(ctx, userCred, ctrId, input)
}

func deleteContainer(ctx context.Context, userCred mcclient.TokenCredential, pod guestman.PodInstance, containerId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) {
Expand Down

0 comments on commit cc94930

Please sign in to comment.