8000 Add multiple concurrent node reboot feature by trstringer · Pull Request #660 · kubereboot/kured · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add multiple concurrent node reboot feature #660

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions .github/workflows/on-pr.yaml
8000
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,90 @@ jobs:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh



# This ensures the latest code works with the manifests built from tree.
# It is useful for two things:
# - Test manifests changes (obviously), ensuring they don't break existing clusters
# - Ensure manifests work with the latest versions even with no manifest change
# (compared to helm charts, manifests cannot easily template changes based on versions)
# Helm charts are _trailing_ releases, while manifests are done during development.
# Concurrency = 2
e2e-manifests-concurent:
name: End-to-End test with kured with code and manifests from HEAD (concurrent)
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes:
- "1.25"
- "1.26"
- "1.27"
steps:
- uses: actions/checkout@v3
- name: Ensure go version
uses: actions/setup-go@v4
with:
go-version-file: 'go.mod'
check-latest: true
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Setup GoReleaser
run: make bootstrap-tools
- name: Find current tag version
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
id: tags
- name: Build artifacts
run: |
VERSION="${{ steps.tags.outputs.sha_short }}" make image
VERSION="${{ steps.tags.outputs.sha_short }}" make manifest

- name: Workaround "Failed to attach 1 to compat systemd cgroup /actions_job/..." on gh actions
run: |
sudo bash << EOF
cp /etc/docker/daemon.json /etc/docker/daemon.json.old
echo '{}' > /etc/docker/daemon.json
systemctl restart docker || journalctl --no-pager -n 500
systemctl status docker
EOF

# Default name for helm/kind-action kind clusters is "chart-testing"
- name: Create kind cluster with 5 nodes
uses: helm/kind-action@v1.8.0
with:
config: .github/kind-cluster-${{ matrix.kubernetes }}.yaml
version: v0.14.0

- name: Preload previously built images onto kind cluster
run: kind load docker-image ghcr.io/${{ github.repository }}:${{ steps.tags.outputs.sha_short }} --name chart-testing

- name: Do not wait for an hour before detecting the rebootSentinel
run: |
sed -i 's/#\(.*\)--period=1h/\1--period=30s/g' kured-ds.yaml
sed -i 's/#\(.*\)--concurrency=1/\1--concurrency=2/g' kured-ds.yaml

- name: Install kured with kubectl
run: |
kubectl apply -f kured-rbac.yaml && kubectl apply -f kured-ds.yaml

- name: Ensure kured is ready
uses: nick-invision/retry@v2.8.3
with:
timeout_minutes: 10
max_attempts: 10
retry_wait_seconds: 60
# DESIRED CURRENT READY UP-TO-DATE AVAILABLE should all be = to cluster_size
command: "kubectl get ds -n kube-system kured | grep -E 'kured.*5.*5.*5.*5.*5'"

- name: Create reboot sentinel files
run: |
./tests/kind/create-reboot-sentinels.sh

- name: Follow reboot until success
env:
DEBUG: true
run: |
./tests/kind/follow-coordinated-reboot.sh
46 changes: 36 additions & 10 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ var (
preRebootNodeLabels []string
postRebootNodeLabels []string
nodeID string
concurrency int

rebootDays []string
rebootStart string
Expand Down Expand Up @@ -167,6 +168,8 @@ func NewRootCommand() *cobra.Command {
"command for which a zero return code will trigger a reboot command")
rootCmd.PersistentFlags().StringVar(&rebootCommand, "reboot-command", "/bin/systemctl reboot",
"command to run when a reboot is required")
rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 1,
"amount of nodes to concurrently reboot. Defaults to 1")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notifications [deprecated in favor of --notify-url]")
Expand Down Expand Up @@ -421,8 +424,14 @@ func rebootBlocked(blockers ...RebootBlocker) bool {
return false
}

func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
holding, err := lock.Test(metadata)
func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}, isMultiLock bool) bool {
var holding bool
var err error
if isMultiLock {
holding, err = lock.TestMultiple()
} else {
holding, err = lock.Test(metadata)
}
if err != nil {
log.Fatalf("Error testing lock: %v", err)
}
Expand All @@ -432,8 +441,17 @@ func holding(lock *daemonsetlock.DaemonSetLock, metadata interface{}) bool {
return holding
}

func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration) bool {
holding, holder, err := lock.Acquire(metadata, TTL)
func acquire(lock *daemonsetlock.DaemonSetLock, metadata interface{}, TTL time.Duration, maxOwners int) bool {
var holding bool
var holder string
var err error
if maxOwners > 1 {
var holders []string
holding, holders, err = lock.AcquireMultiple(metadata, TTL, maxOwners)
holder = strings.Join(holders, ",")
} else {
holding, holder, err = lock.Acquire(metadata, TTL)
}
switch {
case err != nil:
log.Fatalf("Error acquiring lock: %v", err)
Expand All @@ -454,9 +472,16 @@ func throttle(releaseDelay time.Duration) {
}
}

func release(lock *daemonsetlock.DaemonSetLock) {
func release(lock *daemonsetlock.DaemonSetLock, isMultiLock bool) {
log.Infof("Releasing lock")
if err := lock.Release(); err != nil {

var err error
if isMultiLock {
err = lock.ReleaseMultiple()
} else {
err = lock.Release()
}
if err != nil {
log.Fatalf("Error releasing lock: %v", err)
}
}
Expand Down Expand Up @@ -641,7 +666,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
source := rand.NewSource(time.Now().UnixNano())
tick := delaytick.New(source, 1*time.Minute)
for range tick {
if holding(lock, &nodeMeta) {
if holding(lock, &nodeMeta, concurrency > 1) {
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Errorf("Error retrieving node object via k8s API: %v", err)
Expand Down Expand Up @@ -674,7 +699,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
}
}
throttle(releaseDelay)
release(lock)
release(lock, concurrency > 1)
break
} else {
break
Expand Down Expand Up @@ -732,7 +757,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
}
}

if !holding(lock, &nodeMeta) && !acquire(lock, &nodeMeta, TTL) {
if !holding(lock, &nodeMeta, concurrency > 1) && !acquire(lock, &nodeMeta, TTL, concurrency) {
// Prefer to not schedule pods onto this node to avoid draing the same pod multiple times.
preferNoScheduleTaint.Enable()
continue
Expand All @@ -754,7 +779,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
if err != nil {
if !forceReboot {
log.Errorf("Unable to cordon or drain %s: %v, will release lock and retry cordon and drain before rebooting when lock is next acquired", node.GetName(), err)
release(lock)
release(lock, concurrency > 1)
log.Infof("Performing a best-effort uncordon after failed cordon and drain")
uncordon(client, node)
continue
Expand Down Expand Up @@ -832,6 +857,7 @@ func root(cmd *cobra.Command, args []string) {
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot schedule: %v", window)
log.Infof("Reboot check command: %s every %v", sentinelCommand, period)
log.Infof("Concurrency: %v", concurrency)
log.Infof("Reboot command: %s", restartCommand)
if annotateNodes {
log.Infof("Will annotate nodes during kured reboot operations")
Expand Down
1 change: 1 addition & 0 deletions kured-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,4 @@ spec:
# - --log-format=text
# - --metrics-host=""
# - --metrics-port=8080
# - --concurrency=1
Loading
0