Slide 21
Slide 21 text
21
▸ 03:56:00 worker-2電源断
▸ 03:56:35 [node-healthcheck-controller-manager] Unhealthyノードの検出
▸ 04:01:35 SelfNodeRemediation作成
▸ 04:01:37 SelfNodeRemediation更新 (phase: Pre-Reboot-Completed)
▸ 04:04:37 SelfNodeRemediation更新 (phase: Reboot-Completed)
▸ 04:04:38 [self-node-remediation-controller] out-of-service taint add
▸ 04:04:47 [kube-controller-manager] force detaching
▸ 04:04:48 SelfNodeRemediation更新 (phase: Fencing-Completed)
▸ 04:04:48 [self-node-remediation-controller] out-of-service taint remove
ノード電源断
kube-controller-manager-master-2 kube-controller-manager I0328 04:04:47.598293
1 reconciler.go:277] "attacherDetacher.DetachVolume started: node has out-of-service
taint, force detaching" node="worker-2"
volumeName="kubernetes.io/csi/openshift-storage.rbd.csi.ceph.com^0001-0011-openshift-
storage-0000000000000002-67bda59f-cf8f-4ad3-9ad9-df681d058b9c"
kube-controller-manager-master-2 kube-controller-manager I0328 04:04:47.634199
1 reconciler.go:277] "attacherDetacher.DetachVolume started: node has out-of-service
taint, force detaching" node="worker-2"
volumeName="kubernetes.io/csi/openshift-storage.rbd.csi.ceph.com^0001-0011-openshift-
storage-0000000000000002-25426b29-4dfa-48bf-8099-e7827b7cdca1"
controllers.SelfNodeRemediation out-of-service taint added {"selfnoderemediation":
"openshift-workload-availability/worker-2", "new taints":
[{"key":"node.kubernetes.io/unreachable","effect":"NoSchedule","timeAdded":"2024-03-2
8T03:56:35Z"},{"key":"node.kubernetes.io/unreachable","effect":"NoExecute","timeAdded
":"2024-03-28T03:56:40Z"},{"key":"medik8s.io/remediation","value":"self-node-remediat
ion","effect":"NoExecute","timeAdded":"2024-03-28T04:01:36Z"},{"key":"node.kubernetes
.io/unschedulable","effect":"NoSchedule","timeAdded":"2024-03-28T04:01:36Z"},{"key":"
node.kubernetes.io/out-of-service","value":"nodeshutdown","effect":"NoExecute","timeA
dded":"2024-03-28T04:04:38Z"}]}
controllers.SelfNodeRemediation out-of-service taint removed
{"selfnoderemediation": "openshift-workload-availability/worker-2", "new taints":
[{"key":"node.kubernetes.io/unreachable","effect":"NoSchedule","timeAdded":"2024-03-2
8T03:56:35Z"},{"key":"node.kubernetes.io/unreachable","effect":"NoExecute","timeAdded
":"2024-03-28T03:56:40Z"},{"key":"medik8s.io/remediation","value":"self-node-remediat
ion","effect":"NoExecute","timeAdded":"2024-03-28T04:01:36Z"},{"key":"node.kubernetes
.io/unschedulable","effect":"NoSchedule","timeAdded":"2024-03-28T04:01:36Z"}]}
func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr
*v1alpha1.SelfNodeRemediation) (time.Duration, error) {
if err := r.addOutOfServiceTaint(node); err != nil {
return 0, err
}
// We can not control to delete node resources by the "out-of-service" tai
// So timer is used to avoid to keep waiting to complete
if !r.isResourceDeletionCompleted(node) {
isExpired, timeLeft := r.isResourceDeletionExpired(snr)
if !isExpired {
return timeLeft, nil
}
// if the timer is expired, exponential backoff is triggered
return 0, errors.New("Not ready to delete out-of-service taint")
}
if err := r.removeOutOfServiceTaint(node); err != nil {
return 0, err
}
return 0, nil
}
controllers.NodeHealthCheck Node is going to match unhealthy condition {"node":
"worker-2", "condition type": "Ready", "condition status": "Unknown", "duration
left": "4m59.920799888s"}
controllers.NodeHealthCheck.resource manager Creating a remediation CR {"CR
name": "worker-2", "CR kind": "SelfNodeRemediation", "namespace":
"openshift-workload-availability"}