diff --git a/pkg/apis/host/container.go b/pkg/apis/host/container.go index dc09f9dff97..ecec721b51f 100644 --- a/pkg/apis/host/container.go +++ b/pkg/apis/host/container.go @@ -125,3 +125,9 @@ type ContainerCommitInput struct { Repository string `json:"repository"` Auth *apis.ContainerPullImageAuthConfig `json:"auth"` } + +type ContainerStopInput struct { + Timeout int64 `json:"timeout"` + ShmSizeMB int `json:"shm_size_mb"` + ContainerName string `json:"container_name"` +} diff --git a/pkg/hostman/guestman/pod.go b/pkg/hostman/guestman/pod.go index db4da9393f3..5f4ce5a5c03 100644 --- a/pkg/hostman/guestman/pod.go +++ b/pkg/hostman/guestman/pod.go @@ -116,7 +116,7 @@ type PodInstance interface { DeleteContainer(ctx context.Context, cred mcclient.TokenCredential, id string) (jsonutils.JSONObject, error) SyncStatus(reason string) SyncContainerStatus(ctx context.Context, cred mcclient.TokenCredential, ctrId string) (jsonutils.JSONObject, error) - StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) + StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerStopInput) (jsonutils.JSONObject, error) PullImage(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerPullImageInput) (jsonutils.JSONObject, error) SaveVolumeMountToImage(ctx context.Context, userCred mcclient.TokenCredential, input *hostapi.ContainerSaveVolumeMountToImageInput, ctrId string) (jsonutils.JSONObject, error) ExecContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *computeapi.ContainerExecInput) (*url.URL, error) @@ -707,6 +707,13 @@ func (s *sPodGuestInstance) StartLocalPod(ctx context.Context, userCred mcclient return nil } +func (s *sPodGuestInstance) ShouldRestartPodOnCrash() bool { + if len(s.GetContainers()) <= 1 { + return true + } + return false +} + func (s *sPodGuestInstance) startPod(ctx context.Context, userCred mcclient.TokenCredential) (*computeapi.PodStartResponse, error) { retries := 3 sec := 5 * time.Second @@ -753,8 +760,10 @@ func (s *sPodGuestInstance) _startPod(ctx context.Context, userCred mcclient.Tok LogDirectory: s.getPodLogDir(), DnsConfig: nil, PortMappings: nil, - Labels: nil, - Annotations: nil, + Labels: map[string]string{ + runtime.PodUIDLabel: s.GetId(), + }, + Annotations: nil, Linux: &runtimeapi.LinuxPodSandboxConfig{ CgroupParent: s.getCgroupParent(), SecurityContext: &runtimeapi.LinuxSandboxSecurityContext{ @@ -1091,7 +1100,7 @@ func (s *sPodGuestInstance) doContainerStartPostLifecycle(ctx context.Context, c return nil } -func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) { +func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient.TokenCredential, ctrId string, input *hostapi.ContainerStopInput) (jsonutils.JSONObject, error) { criId, err := s.getContainerCRIId(ctrId) if err != nil { if errors.Cause(err) == errors.ErrNotFound { @@ -1104,19 +1113,18 @@ func (s *sPodGuestInstance) StopContainer(ctx context.Context, userCred mcclient s.expectedStatus.SetContainerStatus(criId, ctrId, computeapi.CONTAINER_STATUS_EXITED) - if body.Contains("timeout") { - timeout, _ = body.Int("timeout") + if input.Timeout != 0 { + timeout = input.Timeout } - if body.Contains("shm_size_mb") { - shmSizeMB, _ := body.Int("shm_size_mb") - if shmSizeMB > 64 { - name, err := body.GetString("container_name") - if err != nil { - return nil, errors.Wrapf(err, "not found name from body: %s", body) - } - if err := s.unmountDevShm(name); err != nil { - return nil, errors.Wrapf(err, "unmount shm %s", name) - } + shmSizeMB := input.ShmSizeMB + + if shmSizeMB > 64 { + name := input.ContainerName + if name == "" { + return nil, errors.Wrapf(errors.ErrNotFound, "not found container_name from input: %s", jsonutils.Marshal(input)) + } + if err := s.unmountDevShm(name); err != nil { + return nil, errors.Wrapf(err, "unmount shm %s", name) } } if err := s.getCRI().StopContainer(ctx, criId, timeout); err != nil { diff --git a/pkg/hostman/guestman/pod_helper.go b/pkg/hostman/guestman/pod_helper.go index b566f2794c6..1836861dea4 100644 --- a/pkg/hostman/guestman/pod_helper.go +++ b/pkg/hostman/guestman/pod_helper.go @@ -15,6 +15,8 @@ package guestman import ( + "context" + "fmt" "strings" "github.com/shirou/gopsutil/v3/disk" @@ -26,6 +28,7 @@ import ( hostapi "yunion.io/x/onecloud/pkg/apis/host" "yunion.io/x/onecloud/pkg/hostman/container/volume_mount" "yunion.io/x/onecloud/pkg/hostman/options" + "yunion.io/x/onecloud/pkg/mcclient" "yunion.io/x/onecloud/pkg/util/pod/image" "yunion.io/x/onecloud/pkg/util/pod/nerdctl" ) @@ -148,3 +151,51 @@ func (s *sPodGuestInstance) getVolumeMountUsage(drv volume_mount.IUsageVolumeMou drv.InjectUsageTags(usage, vol) return usage, nil } + +func (s *sPodGuestInstance) RestartLocalPodAndContainers(ctx context.Context, cred mcclient.TokenCredential) { + s.manager.GuestStartWorker.Run(newLocalPodRestartTask(ctx, cred, s), nil, nil) +} + +type localPodRestartTask struct { + ctx context.Context + userCred mcclient.TokenCredential + pod *sPodGuestInstance +} + +func newLocalPodRestartTask(ctx context.Context, userCred mcclient.TokenCredential, pod *sPodGuestInstance) *localPodRestartTask { + return &localPodRestartTask{ + ctx: ctx, + userCred: userCred, + pod: pod, + } +} + +func (t *localPodRestartTask) Run() { + log.Infof("restart pod and containers locally (%s/%s)", t.pod.Id, t.pod.GetName()) + for _, ctr := range t.pod.GetContainers() { + log.Infof("stop container locally (%s/%s/%s/%s)", t.pod.Id, t.pod.GetName(), ctr.Id, ctr.Name) + if _, err := t.pod.StopContainer(t.ctx, t.userCred, ctr.Id, &hostapi.ContainerStopInput{ + Timeout: 0, + ShmSizeMB: ctr.Spec.ShmSizeMB, + ContainerName: ctr.Name, + }); err != nil { + log.Errorf("stop container %s error: %v", ctr.Name, err) + } + } + + if _, err := t.pod.startPod(t.ctx, t.userCred); err != nil { + log.Errorf("start pod(%s/%s) err: %s", t.pod.GetId(), t.pod.GetName(), err.Error()) + return + } + for _, ctr := range t.pod.GetContainers() { + log.Infof("start container locally (%s/%s/%s/%s)", t.pod.Id, t.pod.GetName(), ctr.Id, ctr.Name) + if _, err := t.pod.StartLocalContainer(t.ctx, t.userCred, ctr.Id); err != nil { + log.Errorf("start container %s err: %s", ctr.Id, err.Error()) + } + } + t.pod.SyncStatus("sync status after pod and containers restart locally") +} + +func (t *localPodRestartTask) Dump() string { + return fmt.Sprintf("pod restart task %s/%s", t.pod.GetId(), t.pod.GetName()) +} diff --git a/pkg/hostman/guestman/pod_sync_loop.go b/pkg/hostman/guestman/pod_sync_loop.go index d21ad9e9a46..251ca9f55af 100644 --- a/pkg/hostman/guestman/pod_sync_loop.go +++ b/pkg/hostman/guestman/pod_sync_loop.go @@ -88,11 +88,15 @@ func (m *SGuestManager) startContainer(obj *sPodGuestInstance, ctr *hostapi.Cont reason := fmt.Sprintf("start died container %s when exit code is %d", ctr.Id, cs.ExitCode) ctx := context.Background() userCred := hostutils.GetComputeSession(ctx).GetToken() - _, err := obj.StartLocalContainer(ctx, userCred, ctr.Id) - if err != nil { - return errors.Wrap(err, reason) + if obj.ShouldRestartPodOnCrash() { + obj.RestartLocalPodAndContainers(ctx, userCred) } else { - log.Infof("%s: start local container (%s/%s) success", reason, obj.GetId(), ctr.Name) + _, err := obj.StartLocalContainer(ctx, userCred, ctr.Id) + if err != nil { + return errors.Wrap(err, reason) + } else { + log.Infof("%s: start local container (%s/%s) success", reason, obj.GetId(), ctr.Name) + } } return nil } diff --git a/pkg/hostman/guestman/podhandlers/podhandlers.go b/pkg/hostman/guestman/podhandlers/podhandlers.go index 27d4493fd5e..f21b2952937 100644 --- a/pkg/hostman/guestman/podhandlers/podhandlers.go +++ b/pkg/hostman/guestman/podhandlers/podhandlers.go @@ -178,7 +178,11 @@ func startContainer(ctx context.Context, userCred mcclient.TokenCredential, pod } func stopContainer(ctx context.Context, userCred mcclient.TokenCredential, pod guestman.PodInstance, ctrId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) { - return pod.StopContainer(ctx, userCred, ctrId, body) + input := new(hostapi.ContainerStopInput) + if err := body.Unmarshal(input); err != nil { + return nil, errors.Wrapf(err, "unmarshal to ContainerStopInput: %s", body.String()) + } + return pod.StopContainer(ctx, userCred, ctrId, input) } func deleteContainer(ctx context.Context, userCred mcclient.TokenCredential, pod guestman.PodInstance, containerId string, body jsonutils.JSONObject) (jsonutils.JSONObject, error) {