How Kubernetes Restarts Containers: Kubelet's Container Hash Computation
If you run workloads on Kubernetes, you have probably received the occasional container-restart alert. Restarts caused by application-level problems are relatively easy to diagnose: the container's memory metrics tell us whether usage exceeded the configured limit, and the logs tell us whether the application panicked without recovering.
On an ordinary workday we suddenly received a burst of container-restart alerts, and they came from several different applications. Following the usual troubleshooting path, we checked the metrics first: memory looked normal and stayed below the limit the whole time. The logs showed no panics or other errors either. Looking more closely, all of the alerting applications belonged to the same cluster, so the cluster seemed like a likely suspect; but many other applications in that cluster had not restarted, so the cluster itself was probably not the problem. Could it be the machines? Filtering the node IPs of the restarted instances showed that the restarts were concentrated on a handful of nodes. On those nodes we inspected the kubelet process and found that kubelet had restarted right around the time of the alerts. That gave us the direct cause of the container restarts: kubelet had restarted. But we had not updated our instances, so why would a kubelet restart take our containers down with it? The rest of this post walks through the root cause: how kubelet computes a container's hash.
Every Kubernetes node runs a kubelet process, which manages the lifecycle of all Pods on that node. Let's look at the source code to see how kubelet decides to restart a container.
SyncPod
We start with the SyncPod method in https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/kuberuntime/kuberuntime_manager.go. SyncPod keeps the running Pod in sync with its desired configuration, in the following steps (the abridged source is shown right after the list):

1. Compute the actions to perform, based on the Pod Spec obtained from the API Server and the Pod's current Status
2. Kill the Pod's sandbox if necessary
3. Kill any containers in the Pod that should not be kept running (e.g. because they need to be restarted)
4. Create the Pod's sandbox if necessary
5. Start the next init container
6. Start the Pod's containers
```go
func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
	// Step 1: Compute sandbox and container changes.
	// Compute the actions needed for the pod.
	podContainerChanges := m.computePodActions(pod, podStatus)
	glog.V(3).Infof("computePodActions got %+v for pod %q", podContainerChanges, format.Pod(pod))
	if podContainerChanges.CreateSandbox {
		ref, err := ref.GetReference(legacyscheme.Scheme, pod)
		if err != nil {
			glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err)
		}
		if podContainerChanges.SandboxID != "" {
			m.recorder.Eventf(ref, v1.EventTypeNormal, events.SandboxChanged, "Pod sandbox changed, it will be killed and re-created.")
		} else {
			glog.V(4).Infof("SyncPod received new pod %q, will create a sandbox for it", format.Pod(pod))
		}
	}

	// Step 2: Kill the pod if the sandbox has changed.
	// The sandbox changed, so the whole pod has to be killed.
	if podContainerChanges.KillPod {
		...
		killResult := m.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(m.runtimeName, podStatus), nil)
		result.AddPodSyncResult(killResult)
		if killResult.Error() != nil {
			glog.Errorf("killPodWithSyncResult failed: %v", killResult.Error())
			return
		}

		if podContainerChanges.CreateSandbox {
			m.purgeInitContainers(pod, podStatus)
		}
	} else {
		// Step 3: kill any running containers in this pod which are not to keep.
		// Kill the containers in the pod that should not be kept.
		for containerID, containerInfo := range podContainerChanges.ContainersToKill {
			glog.V(3).Infof("Killing unwanted container %q(id=%q) for pod %q", containerInfo.name, containerID, format.Pod(pod))
			killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
			result.AddSyncResult(killContainerResult)
			if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
				killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
				glog.Errorf("killContainer %q(id=%q) for pod %q failed: %v", containerInfo.name, containerID, format.Pod(pod), err)
				return
			}
		}
	}
	...
	// Step 4: Create a sandbox for the pod if necessary.
	// Create the sandbox on demand.
	podSandboxID := podContainerChanges.SandboxID
	if podContainerChanges.CreateSandbox {
		var msg string
		var err error

		glog.V(4).Infof("Creating sandbox for pod %q", format.Pod(pod))
		createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod))
		result.AddSyncResult(createSandboxResult)
		podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
		...
	}
	...
	// Step 5: start the init container.
	// Start the next init container.
	if container := podContainerChanges.NextInitContainerToStart; container != nil {
		// Start the next init container.
		startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
		result.AddSyncResult(startContainerResult)
		...
		if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP, kubecontainer.ContainerTypeInit); err != nil {
			startContainerResult.Fail(err, msg)
			utilruntime.HandleError(fmt.Errorf("init container start failed: %v: %s", err, msg))
			return
		}

		// Successfully started the container; clear the entry in the failure
		glog.V(4).Infof("Completed init container %q for pod %q", container.Name, format.Pod(pod))
	}

	// Step 6: start containers in podContainerChanges.ContainersToStart.
	// Start the containers computed in step 1.
	for _, idx := range podContainerChanges.ContainersToStart {
		container := &pod.Spec.Containers[idx]
		startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
		result.AddSyncResult(startContainerResult)
		...
		glog.V(4).Infof("Creating container %+v in pod %v", container, format.Pod(pod))
		if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP, kubecontainer.ContainerTypeRegular); err != nil {
			...
		}
	}

	return
}
```
computePodActions
In SyncPod above, the computePodActions call in step 1 is the key call that decides whether a container needs to be restarted. Let's look at what this method actually does:
```go
// computePodActions checks whether the pod spec has changed and returns the changes if true.
func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *kubecontainer.PodStatus) podActions {
	glog.V(5).Infof("Syncing Pod %q: %+v", format.Pod(pod), pod)

	createPodSandbox, attempt, sandboxID := m.podSandboxChanged(pod, podStatus)
	changes := podActions{
		KillPod:           createPodSandbox,
		CreateSandbox:     createPodSandbox,
		SandboxID:         sandboxID,
		Attempt:           attempt,
		ContainersToStart: []int{},
		ContainersToKill:  make(map[kubecontainer.ContainerID]containerToKillInfo),
	}

	// Other parts are omitted here; below is the core logic that decides whether a container
	// needs to be restarted.

	// Number of running containers to keep.
	keepCount := 0
	// check the status of containers.
	for idx, container := range pod.Spec.Containers {
		containerStatus := podStatus.FindContainerStatusByName(container.Name)

		// Call internal container post-stop lifecycle hook for any non-running container so that any
		// allocated cpus are released immediately. If the container is restarted, cpus will be re-allocated
		// to it.
		if containerStatus != nil && containerStatus.State != kubecontainer.ContainerStateRunning {
			if err := m.internalLifecycle.PostStopContainer(containerStatus.ID.ID); err != nil {
				glog.Errorf("internal container post-stop lifecycle hook failed for container %v in pod %v with error %v",
					container.Name, pod.Name, err)
			}
		}

		// If container does not exist, or is not running, check whether we
		// need to restart it.
		if containerStatus == nil || containerStatus.State != kubecontainer.ContainerStateRunning {
			if kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
				message := fmt.Sprintf("Container %+v is dead, but RestartPolicy says that we should restart it.", container)
				glog.V(3).Infof(message)
				changes.ContainersToStart = append(changes.ContainersToStart, idx)
			}
			continue
		}
		// The container is running, but kill the container if any of the following condition is met.
		reason := ""
		restart := shouldRestartOnFailure(pod)
		// Compare the container's expected hash with its current hash to decide whether
		// it needs to be restarted.
		if expectedHash, actualHash, changed := containerChanged(&container, containerStatus); changed {
			reason = fmt.Sprintf("Container spec hash changed (%d vs %d).", actualHash, expectedHash)
			// Restart regardless of the restart policy because the container
			// spec changed.
			restart = true
		} else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
			// If the container failed the liveness probe, we should kill it.
			reason = "Container failed liveness probe."
		} else {
			// Keep the container.
			keepCount += 1
			continue
		}

		// We need to kill the container, but if we also want to restart the
		// container afterwards, make the intent clear in the message. Also do
		// not kill the entire pod since we expect container to be running eventually.
		message := reason
		// If the container should be restarted, record its index in the to-start slice.
		if restart {
			message = fmt.Sprintf("%s. Container will be killed and recreated.", message)
			changes.ContainersToStart = append(changes.ContainersToStart, idx)
		}

		// Record the container in the to-kill map.
		changes.ContainersToKill[containerStatus.ID] = containerToKillInfo{
			name:      containerStatus.Name,
			container: &pod.Spec.Containers[idx],
			message:   message,
		}
		glog.V(2).Infof("Container %q (%q) of pod %s: %s", container.Name, containerStatus.ID, format.Pod(pod), message)
	}

	if keepCount == 0 && len(changes.ContainersToStart) == 0 {
		changes.KillPod = true
	}

	return changes
}
```
containerChanged
In the method above, the call to containerChanged is what determines whether a running container needs to be restarted. Next, let's see how the container's hash is computed (containerStatus.Hash, the "actual" hash, is the value kubelet recorded when it created the container):
```go
func containerChanged(container *v1.Container, containerStatus *kubecontainer.ContainerStatus) (uint64, uint64, bool) {
	expectedHash := kubecontainer.HashContainer(container)
	return expectedHash, containerStatus.Hash, containerStatus.Hash != expectedHash
}
```
The hash computation itself lives in `kubernetes/pkg/kubelet/container/helpers.go`:
```go
// HashContainer returns the hash of the container. It is used to compare
// the running container with its desired spec.
func HashContainer(container *v1.Container) uint64 {
	hash := fnv.New32a()
	hashutil.DeepHashObject(hash, *container)
	return uint64(hash.Sum32())
}
```
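HashContainer delegates the real work to hashutil.DeepHashObject, which is not shown above. In the kubelet versions discussed here it essentially dumps the entire object into the hasher via github.com/davecgh/go-spew, so every exported field contributes to the result. A minimal sketch of the idea (not the exact upstream code):

```go
// Sketch of what hashutil.DeepHashObject does: write a deterministic deep dump
// of obj into the hasher, so the resulting hash depends on every exported field
// of obj - including fields the user never set explicitly.
package hashsketch

import (
	"hash"

	"github.com/davecgh/go-spew/spew"
)

func deepHashObject(hasher hash.Hash, obj interface{}) {
	hasher.Reset()
	printer := spew.ConfigState{
		Indent:         " ",
		SortKeys:       true, // deterministic map ordering
		DisableMethods: true, // dump raw field values, not Stringer output
		SpewKeys:       true,
	}
	printer.Fprintf(hasher, "%#v", obj)
}
```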
From the code above it is clear that a change to any single field of the v1.Container struct changes the expected container hash. In particular, if kubelet comes back up as a binary whose v1.Container struct carries new or differently defaulted fields, the expected hash computed from the unchanged pod spec no longer matches the hash recorded when the container was created, so the container is killed and recreated.
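To make this concrete, here is a small self-contained demo (the miniContainer type is a made-up stand-in for v1.Container, and fmt's %#v serialization stands in for the spew-based DeepHashObject): adding a single defaulted field to the struct is enough to change the FNV-32a hash and therefore trigger a restart.

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// miniContainer is a made-up stand-in for v1.Container.
type miniContainer struct {
	Name                   string
	Image                  string
	TerminationMessagePath string // imagine a field a newer kubelet fills with a default value
}

// hashContainer mimics HashContainer: hash the whole struct with FNV-32a.
func hashContainer(c miniContainer) uint64 {
	h := fnv.New32a()
	// Stand-in for hashutil.DeepHashObject: serialize every field into the hasher.
	fmt.Fprintf(h, "%#v", c)
	return uint64(h.Sum32())
}

func main() {
	// Spec as hashed when the container was created by the old kubelet.
	running := miniContainer{Name: "app", Image: "nginx:1.21"}
	// Same user-facing spec, as seen by a newer kubelet that defaults the extra field.
	desired := miniContainer{Name: "app", Image: "nginx:1.21", TerminationMessagePath: "/dev/termination-log"}

	fmt.Println(hashContainer(running) == hashContainer(desired)) // false -> kubelet kills and recreates the container
}
```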
The diagram below summarizes how kubelet restarts a container; reading it alongside the code above should give you a clear picture of the container-restart process in Kubernetes.

Original article: https://www.lxkaka.wang/kubelet-hash/
