The Garbage Collection of Pods

本文代码基于 Kubernetes v1.27 展开。

在 K8s 中，对于执行或调度失败的 Pods 来说，它的 API 对象还依然会存在于集群中。及时的清理掉这些对象以防止资源泄露，就变得尤其重要。K8s 中存在一个名为 Pod GC 的 controller 专门负责回收这种对象，在已终止 Pods 的数量达到 kube-controller-manager 设置的terminated-pod-gc-threshold阈值之后，Pod GC 便会开始清理工作，见gcTerminated。

另外，Pod GC 也会清理符合以下条件的任何 Pods：

是孤儿 Pods，即绑定到了一个已经不存在的 Node 上，见gcOrphaned
是未经调度过就终止的 Pods，见gcUnscheduledTerminating
是正在终止的 Pods，并绑定到了一个未 Ready 且带有node.kubernetes.io/out-of-service污点的 Node 上，见gcTerminating（启用NodeOutOfServiceVolumeDetach特性后）

// pkg/controller/podgc/gc_controller.go

// Pod GC controller 最终使用的方法
func (gcc *PodGCController) gc(ctx context.Context) {
	// 列举出当前集群中所有 pod 和 node 的资源
	pods, err := gcc.podLister.List(labels.Everything())
	nodes, err := gcc.nodeLister.List(labels.Everything())

	if gcc.terminatedPodThreshold > 0 { // 该阈值小于等于0，说明不启用 Pod GC，只进行一些其他的回收工作
		gcc.gcTerminated(ctx, pods)
	}
	if utilfeature.DefaultFeatureGate.Enabled(features.NodeOutOfServiceVolumeDetach) {
		gcc.gcTerminating(ctx, pods)
	}
	gcc.gcOrphaned(ctx, pods, nodes)
	gcc.gcUnscheduledTerminating(ctx, pods)
}

回收过程

gcTerminated

对于正常的 Pods 回收工作而言，需要关注的就是如何定义一个 Pod 的状态为已终止（terminated）？在 Pod GC 中，Pod 的已终止状态被描述为处于 Successed 或 Failed 阶段（phase）的 Pod。

func isPodTerminated(pod *v1.Pod) bool {
	if phase := pod.Status.Phase; phase != v1.PodPending && phase != v1.PodRunning && phase != v1.PodUnknown {
		return true
	}
	return false
}

在删除这些 Pod 对象时，每一个删除动作都由一个 goroutine 启动：

func (gcc *PodGCController) gcTerminated(ctx context.Context, pods []*v1.Pod) {
	terminatedPods := []*v1.Pod{}
	for _, pod := range pods {
		if isPodTerminated(pod) {
			terminatedPods = append(terminatedPods, pod)  // 收集所有处于已终止状态的 pods
		}
	}

	terminatedPodCount := len(terminatedPods)
	deleteCount := terminatedPodCount - gcc.terminatedPodThreshold
	if deleteCount <= 0 {  // 不及 pod 回收的阈值时，就终止此次回收
		return
	}

	sort.Sort(byEvictionAndCreationTimestamp(terminatedPods))  // 按驱逐状态和 pod 创建时间戳排序
	var wait sync.WaitGroup
	for i := 0; i < deleteCount; i++ {
		wait.Add(1)
		go func(pod *v1.Pod) {
			defer wait.Done()
			gcc.markFailedAndDeletePod(ctx, pod)  // 执行删除
		}(terminatedPods[i])
	}
	wait.Wait()
}

gcOrphaned

对于孤儿 Pods 的检测，实际上就是对 Pod spec 的NodeName是否被赋值、若赋值了是否属于已知 Node 的 Name 来进行检测的。那么对于含有未知NodeName的 Pods，Pod GC 并非直接认为这些 Pods 属于孤儿，而是在等待一个quarantineTime隔离周期（40s）之后，再去判断该NodeName还是否生效。若依旧不生效，才认为这些 Pods 为孤儿并进行删除。

Pod GC 引入一个隔离期的目的，其实就是为了防止 Node 不是真的不存在而是处于还未 Ready 状态的情况，避免有些 Pod 在 Node 进入 Ready 之前被误删。

func (gcc *PodGCController) gcOrphaned(ctx context.Context, pods []*v1.Pod, nodes []*v1.Node) {
	existingNodeNames := sets.NewString()
	for _, node := range nodes {
		existingNodeNames.Insert(node.Name)
	}
	// 将新找到的、未知的 node 进行隔离
	for _, pod := range pods {
		if pod.Spec.NodeName != "" && !existingNodeNames.Has(pod.Spec.NodeName) {
			gcc.nodeQueue.AddAfter(pod.Spec.NodeName, gcc.quarantineTime) // 在经过 quarantineTime 的隔离期之后再加入 node 队列
		}
	}
	// 检查 node 在隔离期之后是否还属于未知状态
	deletedNodesNames, quit := gcc.discoverDeletedNodes(ctx, existingNodeNames)
	if quit {
		return
	}

	for _, pod := range pods {
		if !deletedNodesNames.Has(pod.Spec.NodeName) {  // 将不属于任何 node 的 pod 删除
			continue
		}
		condition := corev1apply.PodCondition().
			WithType(v1.DisruptionTarget).
			WithStatus(v1.ConditionTrue).
			WithReason("DeletionByPodGC").
			WithMessage("PodGC: node no longer exists").
			WithLastTransitionTime(metav1.Now())
		gcc.markFailedAndDeletePodWithCondition(ctx, pod, condition)  // 执行删除
	}
}

在隔离期结束后，若NodeName仍然不属于任何的 Node，则考虑将属于该 Node 上的 Pod 进行删除：

func (gcc *PodGCController) discoverDeletedNodes(ctx context.Context, existingNodeNames sets.String) (sets.String, bool) {
	deletedNodesNames := sets.NewString()
	for gcc.nodeQueue.Len() > 0 {
		item, quit := gcc.nodeQueue.Get()
		if quit {
			return nil, true  // quit
		}
		nodeName := item.(string)
		if !existingNodeNames.Has(nodeName) {  // 仍然属于未知的 node 的话
			exists, err := gcc.checkIfNodeExists(ctx, nodeName) // 通过 kube-client 检查对应 node 是否真实存在
			switch {
			case err != nil:
				// ...
			case !exists:
				// 对于不存在的 node，加入到删除名单中
				deletedNodesNames.Insert(nodeName)
			}
		}
		gcc.nodeQueue.Done(item)
	}
	return deletedNodesNames, false
}

gcUnscheduledTerminating

这种情况的处理比较简单，可以直接判断出正处于终止中但还没有被调度到任何节点的 Pods：

func (gcc *PodGCController) gcUnscheduledTerminating(ctx context.Context, pods []*v1.Pod) {
	for _, pod := range pods {
		if pod.DeletionTimestamp == nil || len(pod.Spec.NodeName) > 0 {
			continue
		}
		gcc.markFailedAndDeletePod(ctx, pod)  // 执行删除
	}
}

gcTerminating

该特性由 KEP-2268 引入，主要是针对 Stateful 工作负载类型的考虑。让这些工作负载可以在源 Node 关停（shutdown）或进入到一种不可恢复状态时（比如硬件、OS 故障等）能够 failover 到另外一个不同的 Node 上去。

在该特性引入之前，若一个 Node 的关停没有被 kubelet 的 Node Shutdown Manager 检测到，则已关停 Node 上的 kubelet 是无法删除 Pods 的，这就会导致 StatefulSet 无法创建同名的新 Pods。若这些 Pods 拥有数据卷的挂载，则这些关联的数据卷也不会从原 Node 上删除，导致这些 Pods 并不能被绑定到一个新的 Node 上。只要关停的 Node 不被恢复，这些 Pods 就会永远卡在终止中（terminating）的状态，因为只有在 Node 恢复后，这些 Pods 才会被 kubelet 删除并创建到其他 Node 上去。

func (gcc *PodGCController) gcTerminating(ctx context.Context, pods []*v1.Pod) {
	terminatingPods := []*v1.Pod{}
	for _, pod := range pods {
		if isPodTerminating(pod) {  // => pod.ObjectMeta.DeletionTimestamp != nil
			node, err := gcc.nodeLister.Get(pod.Spec.NodeName)

			// 同时满足下列两个条件时，pod 才会被加入到 terminatingPods 列表中：
			// 1. Node 没有 ready
			// 2. 但是 Node 有 `node.kubernetes.io/out-of-service` 污点
			if !nodeutil.IsNodeReady(node) && taints.TaintKeyExists(node.Spec.Taints, v1.TaintNodeOutOfService) {
				terminatingPods = append(terminatingPods, pod)
			}
		}
	}

	deleteCount := len(terminatingPods)
	if deleteCount == 0 {
		return
	}

	sort.Sort(byEvictionAndCreationTimestamp(terminatingPods))  // 按驱逐状态和 pod 创建时间戳排序
	var wait sync.WaitGroup
	for i := 0; i < deleteCount; i++ {
		wait.Add(1)
		go func(pod *v1.Pod) {
			defer wait.Done()
			gcc.markFailedAndDeletePod(ctx, pod)  // 执行删除
		}(terminatingPods[i])
	}
	wait.Wait()
}

该特性要求用户手动为那些已经确定需要关停（并且短时间内不会恢复）的 Node 添加一个名为node.kubernetes.io/out-of-service的污点，该污点意味着 Pod 将会从 Node 上驱逐，若 Pod 不存在能容忍该污点的 toleration，则 Pod 就不会被再创建到已关停的 Node 上。

删除过程

上述回收过程的最后，其实都调用了执行删除的函数，该函数本质上为markFailedAndDeletePodWithCondition。除去PodDisruptionConditions特性之外，就是直接使用 kube-client 删除对应的 Pod：

func (gcc *PodGCController) markFailedAndDeletePod(ctx context.Context, pod *v1.Pod) error {
	return gcc.markFailedAndDeletePodWithCondition(ctx, pod, nil)
}

func (gcc *PodGCController) markFailedAndDeletePodWithCondition(ctx context.Context, pod *v1.Pod, condition *corev1apply.PodConditionApplyConfiguration) error {
	if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
		// 对于处于运行中阶段的 Pod，进行清楚原因的设置
		if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed {
			podApply := corev1apply.Pod(pod.Name, pod.Namespace).WithStatus(corev1apply.PodStatus())
			podApply.Status.WithPhase(v1.PodFailed)
			// 只有在 gcOrphaned 调用下该 condition 才不为 nil，传入的 condition 就是 `DelectionByPodGC`
			if condition != nil {
				podApply.Status.WithConditions(condition)
			}
			gcc.kubeClient.CoreV1().Pods(pod.Namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true})  // => fieldManager := "PodGC"
		}
	}
	return gcc.kubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0))
}

PodDisruptionConditions这个特性最初是由 KEP-3329 引入，其主要目的就是为失败的 Pod 提供一个对用户更加友好的状态解释。其将 Pod 的 Disruption 状态大致分为两种，即容器/程序本身的 bug 或基础设施层面的错误。对于后者来说，其规定了一系列的由基础设施引发的中断条件，Pod GC 也属于其中一个（DeletionByPodGC）。

Reference

PREVIOUSRaft 的成员变更与 Etcd 实现

NEXTCilium CNI 工作原理解析