kubernetes v1.12.1
This controller manages Node objects and covers two main functions:

- cluster-wide synchronization of Node information
- lifecycle management of individual Nodes

The Node Controller is responsible for discovering, managing, and monitoring every Node in the cluster. On startup the kubelet registers its node information through the API Server and then keeps sending node status updates periodically; the API Server writes this information into etcd. The information includes node health, node resources, node name, node addresses, OS version, docker version, and kubelet version, for example:
```yaml
nodeInfo:
  architecture: amd64
  bootID: 6eca61f9-80de-4f23-a3f8-388dba810a3f
  containerRuntimeVersion: docker://17.12.1-ce
  kernelVersion: 3.10.0-327.el7.x86_64
  kubeProxyVersion: v1.12.1
  kubeletVersion: v1.12.1
  machineID: 39619c0debc84e85a745c45f16fb13c0
  operatingSystem: linux
  osImage: CentOS Linux 7 (Core)
  systemUUID: 420300C1-B585-FCDF-B363-A6CEB34EA1EF
```

The controller also monitors node health. When a node becomes unreachable, the node controller updates the NodeReady condition in NodeStatus to ConditionUnknown and, if the node stays unreachable, subsequently evicts all pods from it (by default it waits 40s before starting to report ConditionUnknown and another 5m before starting to evict pods). The node controller checks the state of each node every --node-monitor-period seconds.
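To see what the kubelet reports, the node object can be read back through the API Server with client-go. The following is a minimal sketch (not from the controller source): the kubeconfig path and the node name "node-1" are placeholder assumptions, and the calls use the pre-context client-go API of the v1.12 era.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumes a kubeconfig at the default location (~/.kube/config).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// "node-1" is a placeholder node name.
	node, err := client.CoreV1().Nodes().Get("node-1", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}

	// nodeInfo as shown in the YAML above.
	fmt.Println("kubelet version:", node.Status.NodeInfo.KubeletVersion)
	fmt.Println("os image:", node.Status.NodeInfo.OSImage)

	// The Ready condition is what the node controller watches; its
	// LastHeartbeatTime reflects the kubelet's periodic status updates.
	for _, cond := range node.Status.Conditions {
		if cond.Type == v1.NodeReady {
			fmt.Printf("Ready=%s lastHeartbeat=%s\n", cond.Status, cond.LastHeartbeatTime)
		}
	}
}
```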
kube-controller-manager periodically checks the status of every node; when a node has been in the NotReady state for longer than the configured timeout, it evicts all pods on that node.
Path: pkg/controller/nodelifecycle/node_lifecycle_controller.go
```go
// Controller is the controller that manages node's life cycle.
type Controller struct {
	taintManager *scheduler.NoExecuteTaintManager

	podInformerSynced cache.InformerSynced
	cloud             cloudprovider.Interface
	kubeClient        clientset.Interface

	runTaintManager bool

	// if set to true Controller will taint Nodes with 'TaintNodeNotReady' and 'TaintNodeUnreachable'
	// taints instead of evicting Pods itself.
	useTaintBasedEvictions bool

	// if set to true, NodeController will taint Nodes based on its condition for 'NetworkUnavailable',
	// 'MemoryPressure', 'OutOfDisk' and 'DiskPressure'.
	taintNodeByCondition bool

	nodeUpdateQueue workqueue.Interface
}
```
The controller is registered as nodelifecycle: `controllers["nodelifecycle"] = startNodeLifecycleController`
```go
// NewControllerInitializers is a public map of named controller groups (you can start more than one in an init func)
// paired to their InitFunc. This allows for structured downstream composition and subdivision.
func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
	controllers := map[string]InitFunc{}
	controllers["nodeipam"] = startNodeIpamController
	controllers["nodelifecycle"] = startNodeLifecycleController
	return controllers
}
```

(Only the entries relevant here are shown.) startNodeLifecycleController calls NewNodeLifecycleController to initialize the controller:
- NewNodeLifecycleController — the initialization function (covered in Section 2)
- lifecycleController.Run — the main execution body (covered in Section 3)
```go
func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, error) {
	lifecycleController, err := lifecyclecontroller.NewNodeLifecycleController(
		ctx.InformerFactory.Core().V1().Pods(),
		ctx.InformerFactory.Core().V1().Nodes(),
		ctx.InformerFactory.Extensions().V1beta1().DaemonSets(),
		ctx.Cloud,
		ctx.ClientBuilder.ClientOrDie("node-controller"),
		ctx.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
		ctx.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration,
		ctx.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration,
		ctx.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration,
		ctx.ComponentConfig.NodeLifecycleController.NodeEvictionRate,
		ctx.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate,
		ctx.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold,
		ctx.ComponentConfig.NodeLifecycleController.UnhealthyZoneThreshold,
		ctx.ComponentConfig.NodeLifecycleController.EnableTaintManager,
		utilfeature.DefaultFeatureGate.Enabled(features.TaintBasedEvictions),
		utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition),
	)
	if err != nil {
		return nil, true, err
	}
	go lifecycleController.Run(ctx.Stop)
	return nil, true, nil
}
```

The function takes quite a few parameters; note in particular the pod and DaemonSet informers. Most of the parameters map directly to kube-controller-manager flags:
- --node-monitor-period duration: interval at which the NodeController syncs NodeStatus (default 5s)
- --node-startup-grace-period duration: amount of time a starting node is allowed to be unresponsive before it is marked unhealthy (default 1m0s)
- --node-monitor-grace-period duration: amount of time a running node is allowed to be unresponsive before it is marked unhealthy; must be N times the kubelet's nodeStatusUpdateFrequency, where N is the number of retries the kubelet gets to post node status (default 40s; with the kubelet's default nodeStatusUpdateFrequency of 10s, this allows roughly 4 missed updates)
- --pod-eviction-timeout duration: grace period for deleting pods on failed nodes (default 5m0s)
- --node-eviction-rate float32: rate at which pods are deleted from failed nodes when the zone is healthy (default 0.1)
- --secondary-node-eviction-rate float32: rate at which pods are deleted from failed nodes when the zone is unhealthy; implicitly set to 0 if the cluster size is smaller than --large-cluster-size-threshold (default 0.01)
- --large-cluster-size-threshold int32: number of nodes from which NodeController treats the cluster as large for the eviction logic purposes; --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller (default 50)
- --unhealthy-zone-threshold float32: fraction of not-ready nodes (at least 3) at which a zone is marked unhealthy (default 0.55)
- --enable-taint-manager: if true, enables NoExecute taints; pods that do not tolerate the taint are evicted from tainted nodes (default true)

```go
// NewNodeLifecycleController returns a new taint controller.
func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	daemonSetInformer extensionsinformers.DaemonSetInformer,
	cloud cloudprovider.Interface,
	kubeClient clientset.Interface,
	nodeMonitorPeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorGracePeriod time.Duration,
	podEvictionTimeout time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
	runTaintManager bool,
	useTaintBasedEvictions bool,
	taintNodeByCondition bool) (*Controller, error)
```

The constructor registers event handlers on the pod informer, including AddFunc, UpdateFunc, and DeleteFunc.
podInformerSynced is registered as podInformer.Informer().HasSynced:
```go
podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
	AddFunc: func(obj interface{}) {
		pod := obj.(*v1.Pod)
		if nc.taintManager != nil {
			nc.taintManager.PodUpdated(nil, pod)
		}
	},
	UpdateFunc: func(prev, obj interface{}) {
		prevPod := prev.(*v1.Pod)
		newPod := obj.(*v1.Pod)
		if nc.taintManager != nil {
			nc.taintManager.PodUpdated(prevPod, newPod)
		}
	},
	DeleteFunc: func(obj interface{}) {
		pod, isPod := obj.(*v1.Pod)
		// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
		if !isPod {
			deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
			if !ok {
				glog.Errorf("Received unexpected object: %v", obj)
				return
			}
			pod, ok = deletedState.Obj.(*v1.Pod)
			if !ok {
				glog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj)
				return
			}
		}
		if nc.taintManager != nil {
			nc.taintManager.PodUpdated(pod, nil)
		}
	},
})
nc.podInformerSynced = podInformer.Informer().HasSynced
```
Node informer event handlers are added as well; when the taint manager is enabled they call taintManager.NodeUpdated:
```go
if nc.runTaintManager {
	nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient)
	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: nodeutil.CreateAddNodeHandler(func(node *v1.Node) error {
			nc.taintManager.NodeUpdated(nil, node)
			return nil
		}),
		UpdateFunc: nodeutil.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error {
			nc.taintManager.NodeUpdated(oldNode, newNode)
			return nil
		}),
		DeleteFunc: nodeutil.CreateDeleteNodeHandler(func(node *v1.Node) error {
			nc.taintManager.NodeUpdated(node, nil)
			return nil
		}),
	})
}
```
There is a lot of logic here and it is deeply nested, which makes it somewhat painful to read.
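Two NoExecute taint templates show up repeatedly in the branches below. They are defined in node_lifecycle_controller.go roughly as follows (a sketch: upstream builds the keys from constants in the scheduler algorithm package, they are written out literally here, and v1 refers to k8s.io/api/core/v1):

```go
var (
	// UnreachableTaintTemplate is applied when the node's Ready condition is Unknown.
	UnreachableTaintTemplate = &v1.Taint{
		Key:    "node.kubernetes.io/unreachable",
		Effect: v1.TaintEffectNoExecute,
	}

	// NotReadyTaintTemplate is applied when the node's Ready condition is False.
	NotReadyTaintTemplate = &v1.Taint{
		Key:    "node.kubernetes.io/not-ready",
		Effect: v1.TaintEffectNoExecute,
	}
)
```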
When the observed node condition is NotReady (ConditionFalse):

- If useTaintBasedEvictions is true:
  - If the node is already tainted with UnreachableTaint, swap it to NotReadyTaint; otherwise add the node to the taint queue.
- If useTaintBasedEvictions is false:
  - Once decisionTimestamp > readyTransitionTimestamp + nc.podEvictionTimeout (default 5 minutes), enqueue the node into the PodEvictor, which handles the eviction.
```go
decisionTimestamp := nc.now()
if currentReadyCondition != nil {
	// Check eviction timeout against decisionTimestamp
	if observedReadyCondition.Status == v1.ConditionFalse {
		if nc.useTaintBasedEvictions {
			// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
			if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
				taintToAdd := *NotReadyTaintTemplate
				if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
					glog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
				}
			} else if nc.markNodeForTainting(node) {
				glog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
					node.Name,
					decisionTimestamp,
				)
			}
		} else {
			if decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
				if nc.evictPods(node) {
					glog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
						node.Name,
						decisionTimestamp,
						nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
						nc.podEvictionTimeout,
					)
				}
			}
		}
	}
```

When the observed node condition is Unknown:
- If useTaintBasedEvictions is true:
  - If the node is already tainted with NotReadyTaint, swap it to UnreachableTaint; otherwise add the node to the taint queue.
- If useTaintBasedEvictions is false:
  - Once decisionTimestamp > probeTimestamp + nc.podEvictionTimeout (default 5 minutes), enqueue the node into the PodEvictor, which handles the eviction.
```go
		if observedReadyCondition.Status == v1.ConditionUnknown {
			if nc.useTaintBasedEvictions {
				// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
				if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
					taintToAdd := *UnreachableTaintTemplate
					if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
						glog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
					}
				} else if nc.markNodeForTainting(node) {
					glog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
						node.Name,
						decisionTimestamp,
					)
				}
			} else {
				if decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
					if nc.evictPods(node) {
						glog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
							node.Name,
							decisionTimestamp,
							nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
							nc.podEvictionTimeout-gracePeriod,
						)
					}
				}
			}
		}
```

When the observed node condition is True (Ready):
- If useTaintBasedEvictions is true:
  - mark the node as reachable again (its NoExecute taints are removed).
- If useTaintBasedEvictions is false:
  - the node is Ready again, so cancel any pending pod eviction for it (cancelPodEviction removes the node from the eviction queue).
```go
		if observedReadyCondition.Status == v1.ConditionTrue {
			if nc.useTaintBasedEvictions {
				removed, err := nc.markNodeAsReachable(node)
				if err != nil {
					glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
				}
				if removed {
					glog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
				}
			} else {
				if nc.cancelPodEviction(node) {
					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
				}
			}
			// remove shutdown taint this is needed always depending do we use taintbased or not
			err := nc.markNodeAsNotShutdown(node)
			if err != nil {
				glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
			}
		}
```

Finally, report a NodeNotReady event and mark all pods on the node as not ready:
```go
		// Report node event.
		if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
			nodeutil.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady")
			if err = nodeutil.MarkAllPodsNotReady(nc.kubeClient, node); err != nil {
				utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
			}
		}
```
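MarkAllPodsNotReady comes from the node controller utilities. The sketch below shows the idea rather than the exact upstream code (the usual clientset/metav1/v1 imports are assumed): list every pod bound to the node and flip its PodReady condition to ConditionFalse through UpdateStatus.

```go
// Sketch of MarkAllPodsNotReady's behaviour (simplified; the error
// aggregation and logging of the real implementation are omitted).
func markAllPodsNotReady(kubeClient clientset.Interface, nodeName string) error {
	// Select only pods scheduled onto this node.
	opts := metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName}
	pods, err := kubeClient.CoreV1().Pods(metav1.NamespaceAll).List(opts)
	if err != nil {
		return err
	}
	for i := range pods.Items {
		pod := &pods.Items[i]
		for j := range pod.Status.Conditions {
			if pod.Status.Conditions[j].Type == v1.PodReady {
				// Mark the pod NotReady so traffic stops being routed to it.
				pod.Status.Conditions[j].Status = v1.ConditionFalse
				if _, err := kubeClient.CoreV1().Pods(pod.Namespace).UpdateStatus(pod); err != nil {
					return err
				}
				break
			}
		}
	}
	return nil
}
```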
2.1 WaitForCacheSync waits until the HasSynced functions of the PodInformer, NodeInformer, and DaemonSetInformer all return true, i.e. all caches have finished syncing.
```go
// WaitForCacheSync is a wrapper around cache.WaitForCacheSync that generates log messages
// indicating that the controller identified by controllerName is waiting for syncs, followed by
// either a successful or failed sync.
func WaitForCacheSync(controllerName string, stopCh <-chan struct{}, cacheSyncs ...cache.InformerSynced) bool {
	glog.Infof("Waiting for caches to sync for %s controller", controllerName)

	if !cache.WaitForCacheSync(stopCh, cacheSyncs...) {
		utilruntime.HandleError(fmt.Errorf("Unable to sync caches for %s controller", controllerName))
		return false
	}

	glog.Infof("Caches are synced for %s controller", controllerName)
	return true
}
```

cache.WaitForCacheSync polls every 100 milliseconds, running all the cacheSyncs functions until every one of them returns true, which means all caches are synced:
```go
const syncedPollPeriod = 100 * time.Millisecond

// WaitForCacheSync waits for caches to populate. It returns true if it was successful, false
// if the controller should shutdown
func WaitForCacheSync(stopCh <-chan struct{}, cacheSyncs ...InformerSynced) bool {
	err := wait.PollUntil(syncedPollPeriod,
		func() (bool, error) {
			for _, syncFunc := range cacheSyncs {
				if !syncFunc() {
					return false, nil
				}
			}
			return true, nil
		},
		stopCh)
	if err != nil {
		glog.V(2).Infof("stop requested")
		return false
	}

	glog.V(4).Infof("caches populated")
	return true
}
```
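In the node lifecycle controller, Run uses this wrapper to gate on all three informer caches before starting any workers; roughly as follows (the controller name string is illustrative, and the informer-synced field names other than podInformerSynced are assumptions based on the constructor parameters):

```go
// Run returns early if the caches never sync (e.g. the stop channel closes first).
if !controller.WaitForCacheSync("node lifecycle", stopCh,
	nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
	return
}
```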
Initialize the node manager structures, including the various worker (goroutine) counts.
Call WaitForCacheSync to wait for the caches to sync.
Start goroutines to monitor node status. Nodes are classified into added, deleted, and newZoneRepresentatives and handled accordingly, and the status of every node is updated. A rough outline of Run is sketched below.
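Putting the steps above together, the shape of Run looks roughly like this sketch. It is an approximation, not a verbatim copy: the helper names doNoExecuteTaintingPass/doEvictionPass and the period constant are assumptions, and the package aliases (controller, scheduler, wait, utilruntime, glog) match the ones used in the excerpts above.

```go
// Approximate outline of Controller.Run.
func (nc *Controller) Run(stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()

	// 1. Wait until the pod/node/daemonset informer caches are synced.
	if !controller.WaitForCacheSync("node lifecycle", stopCh,
		nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
		return
	}

	// 2. Start the NoExecute taint manager if --enable-taint-manager is set.
	if nc.runTaintManager {
		go nc.taintManager.Run(stopCh)
	}

	// 3. Start the eviction or tainting workers, depending on whether
	//    taint-based evictions are enabled (helper names are assumptions).
	if nc.useTaintBasedEvictions {
		go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, stopCh)
	} else {
		go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, stopCh)
	}

	// 4. Periodically run monitorNodeStatus, which classifies nodes
	//    (added / deleted / newZoneRepresentatives) and updates their status.
	go wait.Until(func() {
		if err := nc.monitorNodeStatus(); err != nil {
			glog.Errorf("Error monitoring node status: %v", err)
		}
	}, nc.nodeMonitorPeriod, stopCh)

	<-stopCh
}
```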