[4.3] Add a "reboot request" annotation flow, and request-reboot command #926

Closed

Changes from all commits
41 changes: 41 additions & 0 deletions cmd/machine-config-daemon/request_reboot.go
@@ -0,0 +1,41 @@
package main

import (
	"flag"
	"fmt"
	"os"

	daemon "github.com/openshift/machine-config-operator/pkg/daemon"
	"github.com/spf13/cobra"
	"github.com/spf13/pflag"
)

var requestRebootCmd = &cobra.Command{
	Use:                   "request-reboot",
	DisableFlagsInUseLine: true,
	Short:                 "Request a reboot",
	Args:                  cobra.ExactArgs(1),
	Run:                   executeRequestReboot,
}

// init executes upon import
func init() {
	rootCmd.AddCommand(requestRebootCmd)
	pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
}

func runRequestReboot(_ *cobra.Command, args []string) error {
	flag.Set("logtostderr", "true")
	flag.Parse()

	return daemon.RequestReboot(args[0])
}

// executeRequestReboot runs the command and exits non-zero on error
func executeRequestReboot(cmd *cobra.Command, args []string) {
	err := runRequestReboot(cmd, args)
	if err != nil {
		fmt.Printf("error: %v\n", err)
		os.Exit(1)
	}
}
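The daemon-side `daemon.RequestReboot` that the command calls is not part of this diff. Below is a minimal sketch of what it presumably does, assuming the positional argument is the target node name and that the annotation is presence-based with an empty value (which is what the controller tests further down patch in). The helper name and clientset wiring are illustrative, not the PR's actual implementation; the context-free client-go signatures (`k8s.io/client-go/kubernetes`, `metav1`) match the vintage of this code base.

// requestRebootSketch is a hypothetical stand-in for daemon.RequestReboot:
// it marks a node as wanting a reboot, leaving it to the node controller to
// add the "approved" annotation once the pool has unavailability budget.
func requestRebootSketch(client kubernetes.Interface, nodeName string) error {
	node, err := client.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	newNode := node.DeepCopy()
	if newNode.Annotations == nil {
		newNode.Annotations = map[string]string{}
	}
	// Presence is the signal; the value stays empty.
	newNode.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation] = ""
	_, err = client.CoreV1().Nodes().Update(newNode)
	return err
}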
3 changes: 3 additions & 0 deletions pkg/apis/machineconfiguration.openshift.io/v1/types.go
@@ -256,6 +256,9 @@ type MachineConfigPoolStatus struct {
	// A node is marked degraded if applying a configuration failed.
	DegradedMachineCount int32 `json:"degradedMachineCount"`

	// RequestedRebootMachineCount is the number of machines which have a reboot requested annotation.
	RequestedRebootMachineCount int32 `json:"requestedRebootMachineCount"`

	// conditions represents the latest available observations of current state.
	// +optional
	Conditions []MachineConfigPoolCondition `json:"conditions"`
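The status-sync code that populates the new field is outside this excerpt. A plausible counting helper, mirroring how the other machine counts are derived from node annotations, might look like the sketch below; the function name is hypothetical, and the imports match those of node_controller.go.

// calcRequestedRebootMachineCount counts pool members carrying the
// reboot-requested annotation; the annotation is presence-based, so the
// value is ignored.
func calcRequestedRebootMachineCount(nodes []*corev1.Node) int32 {
	var count int32
	for _, node := range nodes {
		if _, ok := node.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]; ok {
			count++
		}
	}
	return count
}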
102 changes: 89 additions & 13 deletions pkg/controller/node/node_controller.go
@@ -379,6 +379,32 @@ func (ctrl *Controller) deleteMachineConfigPool(obj interface{}) {
	// TODO(abhinavdahiya): handle deletes.
}

// reconcileRebootApproval removes the approval annotation if there's no request
func (ctrl *Controller) reconcileRebootApproval(poolName string, node *corev1.Node) (*corev1.Node, error) {
	var retNode *corev1.Node

	// If there's a stray approval with no request, delete it
	_, rebootRequested := node.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
	_, rebootApproved := node.Annotations[daemonconsts.MachineConfigDaemonRebootApprovedAnnotation]
	if !rebootRequested && rebootApproved {
		glog.Infof("Pool %s: Removing stale reboot approval for node %s", poolName, node.Name)
		var err error
		retNode, err = internal.UpdateNodeRetry(ctrl.kubeClient.CoreV1().Nodes(), ctrl.nodeLister, node.Name, func(node *corev1.Node) {
			_, rebootRequested := node.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
			_, rebootApproved := node.Annotations[daemonconsts.MachineConfigDaemonRebootApprovedAnnotation]
			if !rebootRequested && rebootApproved {
				delete(node.Annotations, daemonconsts.MachineConfigDaemonRebootApprovedAnnotation)
			}
		})
		if err != nil {
			return nil, goerrs.Wrapf(err, "updating reboot annotation")
		}
		return retNode, nil
	}

	return nil, nil
}
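Note that the request/approval annotations are re-checked inside the mutation callback: `internal.UpdateNodeRetry` presumably re-fetches the node and retries on conflict, so the state observed before the call may be stale by the time the update lands. Re-testing inside the closure keeps the delete from clobbering an approval for a reboot that was re-requested in the meantime.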

func (ctrl *Controller) addNode(obj interface{}) {
	node := obj.(*corev1.Node)
	if node.DeletionTimestamp != nil {
@@ -395,6 +421,9 @@ func (ctrl *Controller) addNode(obj interface{}) {
		return
	}
	glog.V(4).Infof("Node %s added", node.Name)
	if _, err := ctrl.reconcileRebootApproval(pool.Name, node); err != nil {
		glog.Errorf("%v", err)
	}
	ctrl.enqueueMachineConfigPool(pool)
}

@@ -447,7 +476,25 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
		}
	}

	_, oldRebootRequested := oldNode.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
	_, newRebootRequested := curNode.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
	if oldRebootRequested != newRebootRequested {
		glog.Infof("Pool %s: node %s changed reboot request state: %v", pool.Name, curNode.Name, newRebootRequested)
		changed = true
	}

	newNode, err := ctrl.reconcileRebootApproval(pool.Name, curNode)
	if err != nil {
		glog.Errorf("%v", err)
		return
	}
	if newNode != nil {
		curNode = newNode
		changed = true
	}

	if !changed {
		glog.V(4).Infof("No relevant changes to pool %s node %s", pool.Name, curNode.Name)
		return
	}

@@ -677,8 +724,11 @@ func (ctrl *Controller) syncMachineConfigPool(key string) error {
	}

	candidates := getCandidateMachines(pool, nodes, maxunavail)
	if len(candidates) == 0 {
		glog.V(3).Infof("Pool %s: No candidates to update", pool.Name)
	}
	for _, node := range candidates {
-		if err := ctrl.setDesiredMachineConfigAnnotation(node.Name, pool.Spec.Configuration.Name); err != nil {
+		if err := ctrl.updateCandidateNode(pool.Name, node, pool.Spec.Configuration.Name); err != nil {
			return err
		}
	}
@@ -711,8 +761,16 @@ func (ctrl *Controller) getNodesForPool(pool *mcfgv1.MachineConfigPool) ([]*core
	return nodes, nil
}

-func (ctrl *Controller) setDesiredMachineConfigAnnotation(nodeName, currentConfig string) error {
-	glog.Infof("Setting node %s to desired config %s", nodeName, currentConfig)
+func (ctrl *Controller) updateCandidateNode(poolName string, node *corev1.Node, currentConfig string) error {
	nodeName := node.Name
	_, rebootRequested := node.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
	needsConfigChange := node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] != currentConfig

	if needsConfigChange {
		glog.Infof("Pool %s: Setting node %s to desired config %s", poolName, nodeName, currentConfig)
	} else if rebootRequested {
		glog.Infof("Pool %s: Approving node %s reboot", poolName, nodeName)
	}
	return clientretry.RetryOnConflict(nodeUpdateBackoff, func() error {
		oldNode, err := ctrl.kubeClient.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
		if err != nil {
@@ -722,16 +780,27 @@ func (ctrl *Controller) setDesiredMachineConfigAnnotation(nodeName, currentConfi
		if err != nil {
			return err
		}
		_, rebootRequested := oldNode.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
		_, rebootApproved := oldNode.Annotations[daemonconsts.MachineConfigDaemonRebootApprovedAnnotation]
		needsRebootApproval := rebootRequested && !rebootApproved
		needsConfigChange := oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] != currentConfig
		if !needsRebootApproval && !needsConfigChange {
			// The RebootRequested flag was most likely removed, nothing to do then.
			return nil
		}

-		newNode := oldNode.DeepCopy()
-		if newNode.Annotations == nil {
-			newNode.Annotations = map[string]string{}
+		curNode := oldNode.DeepCopy()
+		if curNode.Annotations == nil {
+			curNode.Annotations = map[string]string{}
		}
-		if newNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] == currentConfig {
-			return nil
+		if needsConfigChange {
+			curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] = currentConfig
		}
-		newNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] = currentConfig
-		newData, err := json.Marshal(newNode)
+		if rebootRequested {
+			curNode.Annotations[daemonconsts.MachineConfigDaemonRebootApprovedAnnotation] = ""
+		}

+		newData, err := json.Marshal(curNode)
		if err != nil {
			return err
		}
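The rest of the retry closure is collapsed in this view. Given that oldNode is marshalled to oldData in the folded lines just above this hunk, and that the tests below compare strategic-merge-patch bytes, the tail presumably keeps this function's existing pattern of diffing the two objects and issuing a patch. A sketch under that assumption, with k8s.io/apimachinery/pkg/util/strategicpatch and k8s.io/apimachinery/pkg/types imported:

		// Collapsed tail of the closure (sketch, not shown in this diff):
		// diff old vs. new and apply as a strategic merge patch.
		patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, corev1.Node{})
		if err != nil {
			return err
		}
		_, err = ctrl.kubeClient.CoreV1().Nodes().Patch(nodeName, types.StrategicMergePatchType, patchBytes)
		return err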
@@ -751,30 +820,37 @@ func getCandidateMachines(pool *mcfgv1.MachineConfigPool, nodesInPool []*corev1.
	unavail := getUnavailableMachines(nodesInPool)
	// If we're at capacity, there's nothing to do.
	if len(unavail) >= maxUnavailable {
		glog.V(3).Infof("Pool %s: No progress possible: unavail %v >= maxunavail %v", pool.Name, len(unavail), maxUnavailable)
		return nil
	}
	capacity := maxUnavailable - len(unavail)
	failingThisConfig := 0
-	// We only look at nodes which aren't already targeting our desired config
+	// We only look at nodes which aren't already targeting our desired config, or
+	// are requesting a reboot.
	var nodes []*corev1.Node
	for _, node := range nodesInPool {
-		if node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] == targetConfig {
+		_, rebootRequested := node.Annotations[daemonconsts.MachineConfigDaemonRebootRequestedAnnotation]
+		_, rebootApproved := node.Annotations[daemonconsts.MachineConfigDaemonRebootApprovedAnnotation]
+		needsRebootApproval := rebootRequested && !rebootApproved
+		targetingThisConfig := node.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] == targetConfig
+		if !needsRebootApproval && targetingThisConfig {
			if isNodeMCDFailing(node) {
				failingThisConfig++
			}
			continue
		}

		nodes = append(nodes, node)
	}

	// Nodes which are failing to target this config also count against
	// availability - it might be a transient issue, and if the issue
	// clears we don't want multiple to update at once.
	if failingThisConfig >= capacity {
		glog.V(3).Infof("Pool %s: No progress possible: failingThisConfig %v >= capacity %v", pool.Name, failingThisConfig, capacity)
		return nil
	}
	capacity -= failingThisConfig
	glog.V(3).Infof("Pool %s: Capacity %v with %v nodes to progress", pool.Name, capacity, len(nodes))

	if len(nodes) < capacity {
		return nodes
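To make the capacity arithmetic above concrete: with maxUnavailable = 2, one node already unavailable, and one node failing on the current config, capacity starts at 2 - 1 = 1 and is then reduced to 0 by the failing node, so getCandidateMachines returns no candidates even though other nodes may want a config change or a reboot approval.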
29 changes: 23 additions & 6 deletions pkg/controller/node/node_controller_test.go
@@ -176,7 +176,7 @@ func checkAction(expected, actual core.Action, t *testing.T) {
		e, _ := expected.(core.PatchAction)
		expPatch := e.GetPatch()
		patch := a.GetPatch()
-		assert.Equal(t, expPatch, patch)
+		assert.Equal(t, string(expPatch), string(patch))
	}
}

@@ -608,13 +608,12 @@ func assertPatchesNode0ToV1(t *testing.T, actions []core.Action) {
		t.Fatal(actions)
	}

-	expected := []byte(`{"metadata":{"annotations":{"machineconfiguration.openshift.io/desiredConfig":"v1"}}}`)
+	expected := fmt.Sprintf(`{"metadata":{"annotations":{"%s":"v1"}}}`, daemonconsts.DesiredMachineConfigAnnotationKey)
	actual := actions[1].(core.PatchAction).GetPatch()
-	assert.Equal(t, expected, actual)
+	assert.Equal(t, expected, string(actual))
}

-func TestSetDesiredMachineConfigAnnotation(t *testing.T) {
-
+func TestUpdateCandidateNode(t *testing.T) {
	tests := []struct {
		node       *corev1.Node
		extraannos map[string]string
@@ -676,6 +675,24 @@ func TestSetDesiredMachineConfigAnnotation(t *testing.T) {
				t.Fatal(actions)
			}
		},
	}, {
		node:       newNode("node-0", "v1", "v1"),
		extraannos: map[string]string{daemonconsts.MachineConfigDaemonRebootRequestedAnnotation: ""},
		verify: func(actions []core.Action, t *testing.T) {
			if !assert.Equal(t, 2, len(actions)) {
				t.Fatal("actions")
			}
			if !actions[0].Matches("get", "nodes") || actions[0].(core.GetAction).GetName() != "node-0" {
				t.Fatal(actions)
			}
			if !actions[1].Matches("patch", "nodes") {
				t.Fatal(actions)
			}

			expected := fmt.Sprintf(`{"metadata":{"annotations":{"%s":""}}}`, daemonconsts.MachineConfigDaemonRebootApprovedAnnotation)
			actual := actions[1].(core.PatchAction).GetPatch()
			assert.Equal(t, expected, string(actual))
		},
	}}

	for idx, test := range tests {
@@ -694,7 +711,7 @@ func TestSetDesiredMachineConfigAnnotation(t *testing.T) {

		c := f.newController()

-		err := c.setDesiredMachineConfigAnnotation(test.node.Name, "v1")
+		err := c.updateCandidateNode("piscine", test.node, "v1")
		if !assert.Nil(t, err) {
			return
		}