From c044b8aa263d9618c0096092558d98efe17d5b61 Mon Sep 17 00:00:00 2001 From: redscholar Date: Tue, 23 Sep 2025 17:26:29 +0800 Subject: [PATCH] fix: scaling down etcd Signed-off-by: redscholar --- builtin/core/playbooks/add_nodes.yaml | 81 ++++++++- builtin/core/playbooks/create_cluster.yaml | 3 +- builtin/core/playbooks/delete_cluster.yaml | 6 +- builtin/core/playbooks/delete_nodes.yaml | 100 +++++++++-- builtin/core/roles/defaults/vars/v1.31.yaml | 1 + .../etcd/{ => install}/files/backup.service | 0 .../etcd/{ => install}/files/etcd.service | 0 .../{ => install}/tasks/backup_service.yaml | 0 .../etcd/{ => install}/tasks/install.yaml | 9 +- .../core/roles/etcd/install/tasks/main.yaml | 4 + .../etcd/{ => install}/templates/backup.sh | 0 .../etcd/{ => install}/templates/backup.timer | 0 builtin/core/roles/etcd/meta/main.yaml | 23 +++ .../prepare.yaml => prepare/tasks/main.yaml} | 45 +++-- .../etcd/{ => prepare}/templates/etcd.env | 20 ++- .../roles/etcd/scaling_down/tasks/main.yaml | 165 ++++++++++++++++++ .../tasks/main.yaml} | 25 ++- builtin/core/roles/etcd/tasks/main.yaml | 20 --- .../upgrade.yaml => upgrade/tasks/main.yaml} | 0 .../core/roles/uninstall/etcd/tasks/main.yaml | 31 ---- .../kubernetes/tasks/kubernetes.yaml | 6 +- 21 files changed, 428 insertions(+), 111 deletions(-) rename builtin/core/roles/etcd/{ => install}/files/backup.service (100%) rename builtin/core/roles/etcd/{ => install}/files/etcd.service (100%) rename builtin/core/roles/etcd/{ => install}/tasks/backup_service.yaml (100%) rename builtin/core/roles/etcd/{ => install}/tasks/install.yaml (88%) create mode 100644 builtin/core/roles/etcd/install/tasks/main.yaml rename builtin/core/roles/etcd/{ => install}/templates/backup.sh (100%) rename builtin/core/roles/etcd/{ => install}/templates/backup.timer (100%) create mode 100644 builtin/core/roles/etcd/meta/main.yaml rename builtin/core/roles/etcd/{tasks/prepare.yaml => prepare/tasks/main.yaml} (60%) rename builtin/core/roles/etcd/{ => 
prepare}/templates/etcd.env (78%) create mode 100644 builtin/core/roles/etcd/scaling_down/tasks/main.yaml rename builtin/core/roles/etcd/{tasks/expansion.yaml => scaling_up/tasks/main.yaml} (54%) delete mode 100644 builtin/core/roles/etcd/tasks/main.yaml rename builtin/core/roles/etcd/{tasks/upgrade.yaml => upgrade/tasks/main.yaml} (100%) delete mode 100644 builtin/core/roles/uninstall/etcd/tasks/main.yaml diff --git a/builtin/core/playbooks/add_nodes.yaml b/builtin/core/playbooks/add_nodes.yaml index 8de4fae7..7282dfe9 100644 --- a/builtin/core/playbooks/add_nodes.yaml +++ b/builtin/core/playbooks/add_nodes.yaml @@ -36,7 +36,86 @@ - etcd gather_facts: true roles: - - etcd + - role: etcd + when: + - .etcd.deployment_type | eq "external" + +- hosts: + - kube_control_plane + tasks: + - name: AddNodes | Check if should update apiserver certificates + run_once: true + add_hostvars: + hosts: kube_control_plane + vars: + need_installed_etcd: >- + {{- $needInstalled := list -}} + {{- range .groups.etcd -}} + {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found") ($.delete_nodes | default list | has . | not) -}} + {{- $needInstalled = append $needInstalled . 
-}} + {{- end -}} + {{- end -}} + {{ $needInstalled | toJson }} + - name: AddNodes | Update apiserver etcd certificates + when: + - .need_installed_etcd | fromJson | empty | not + - .etcd.deployment_type | eq "external" + block: + - name: AddNodes | Copy etcd CA certificate to control plane node + copy: + src: >- + {{ .etcd.ca_file }} + dest: /etc/kubernetes/pki/etcd/ca.crt + - name: AddNodes | Copy etcd client certificate to control plane node + copy: + src: >- + {{ .etcd.cert_file }} + dest: /etc/kubernetes/pki/etcd/client.crt + - name: AddNodes | Copy etcd client key to control plane node + copy: + src: >- + {{ .etcd.key_file }} + dest: /etc/kubernetes/pki/etcd/client.key + - name: AddNodes | update ks-apiserver + command: | + {{- $endpoints := list -}} + {{- range .groups.etcd | default list -}} + {{- $endpoints = append $endpoints (printf "https://%s:2379" (index $.hostvars . "internal_ipv4")) -}} + {{- end -}} + ETCD_ENDPOINTS="{{ join "," $endpoints }}" + + if ! grep -q 'ClusterConfiguration' /etc/kubernetes/kubeadm-config.yaml 2>/dev/null; then + kubectl get cm kubeadm-config -n kube-system -o=jsonpath='{.data.ClusterConfiguration}' > /etc/kubernetes/kubeadm-config.yaml + fi + + awk -v ep="$ETCD_ENDPOINTS" ' + BEGIN { + n = split(ep, arr, ",") + for (i = 1; i <= n; i++) { + print " - " arr[i] + } + } + ' > /etc/kubernetes/kubeadm_new_endpoints.yaml + # delete old endpoint + sed -i '/^[[:space:]]*endpoints:/{ + :loop + N + s/\n[[:space:]]\+-.*//; t loop + s/\n[[:space:]]*\n/\n/g + P + D + }' /etc/kubernetes/kubeadm-config.yaml + # insert new endpoint + sed -i "/^[[:space:]]*endpoints:/r /etc/kubernetes/kubeadm_new_endpoints.yaml" /etc/kubernetes/kubeadm-config.yaml + rm /etc/kubernetes/kubeadm_new_endpoints.yaml + # update kubeadm-config + {{- if .kubernetes.kube_version | semverCompare "&2 exit 1 {{- end }} + tasks: + - name: DeleteNode | Update etcd certificate for kube_control_plane + when: + - .delete.etcd + - .etcd.deployment_type | eq "external" + 
block: + - name: DeleteNode | Check if should update apiserver certificates + run_once: true + add_hostvars: + hosts: kube_control_plane + vars: + need_uninstall_etcd: >- + {{- $needUnInstalled := list -}} + {{- range .groups.etcd -}} + {{- if $.delete_nodes | default list | has . -}} + {{- $needUnInstalled = append $needUnInstalled . -}} + {{- end -}} + {{- end -}} + {{ $needUnInstalled | toJson }} + - name: DeleteNode | Update apiserver etcd certificates + when: + - .need_uninstall_etcd | fromJson | empty | not + block: + - name: DeleteNode | Copy etcd CA certificate to control plane node + copy: + src: >- + {{ .etcd.ca_file }} + dest: /etc/kubernetes/pki/etcd/ca.crt + - name: DeleteNode | Copy etcd client certificate to control plane node + copy: + src: >- + {{ .etcd.cert_file }} + dest: /etc/kubernetes/pki/etcd/client.crt + - name: DeleteNode | Copy etcd client key to control plane node + copy: + src: >- + {{ .etcd.key_file }} + dest: /etc/kubernetes/pki/etcd/client.key + - name: DeleteNode | update ks-apiserver + command: | + {{- $endpoints := list -}} + {{- range .groups.etcd | default list -}} + {{- if $.need_uninstall_etcd | fromJson | has . | not -}} + {{- $endpoints = append $endpoints (printf "https://%s:2379" (index $.hostvars . "internal_ipv4")) -}} + {{- end -}} + {{- end -}} + ETCD_ENDPOINTS="{{ join "," $endpoints }}" + + if ! 
grep -q 'ClusterConfiguration' /etc/kubernetes/kubeadm-config.yaml 2>/dev/null; then + kubectl get cm kubeadm-config -n kube-system -o=jsonpath='{.data.ClusterConfiguration}' > /etc/kubernetes/kubeadm-config.yaml + fi + + awk -v ep="$ETCD_ENDPOINTS" ' + BEGIN { + n = split(ep, arr, ",") + for (i = 1; i <= n; i++) { + print " - " arr[i] + } + } + ' > /etc/kubernetes/kubeadm_new_endpoints.yaml + # delete old endpoint + sed -i '/^[[:space:]]*endpoints:/{ + :loop + N + s/\n[[:space:]]\+-.*//; t loop + s/\n[[:space:]]*\n/\n/g + P + D + }' /etc/kubernetes/kubeadm-config.yaml + # insert new endpoint + sed -i "/^[[:space:]]*endpoints:/r /etc/kubernetes/kubeadm_new_endpoints.yaml" /etc/kubernetes/kubeadm-config.yaml + rm /etc/kubernetes/kubeadm_new_endpoints.yaml + # update kubeadm-config + {{- if .kubernetes.kube_version | semverCompare "v%s" (index .etcd_install_version "stdout" "etcd Version")) + + - role: etcd/scaling_up + when: + - .installed_etcd | empty | not + - .need_installed_etcd | fromJson | empty | not + + - role: etcd/scaling_down + when: + - .need_uninstall_etcd | fromJson | empty | not + + - role: etcd/install + when: + - .etcd_install_LoadState.stdout | eq "not-found" + - .need_uninstall_etcd | fromJson | has .inventory_hostname | not + diff --git a/builtin/core/roles/etcd/tasks/prepare.yaml b/builtin/core/roles/etcd/prepare/tasks/main.yaml similarity index 60% rename from builtin/core/roles/etcd/tasks/prepare.yaml rename to builtin/core/roles/etcd/prepare/tasks/main.yaml index 628a32e4..e8daf19b 100644 --- a/builtin/core/roles/etcd/tasks/prepare.yaml +++ b/builtin/core/roles/etcd/prepare/tasks/main.yaml @@ -6,31 +6,39 @@ fail_msg: >- etcd service is installed but not running -- name: Prepare | Set etcd node parameters +- name: Prepare | Gather etcd node state and membership block: - - name: Prepare | Identify nodes with installed or missing etcd + - name: Prepare | Detect installed, to-install, and to-remove etcd nodes run_once: true add_hostvars: hosts: 
etcd vars: installed_etcd: >- - {{- $needInstalled := list -}} + {{- $installed := list -}} {{- range .groups.etcd -}} - {{- if (index $.hostvars . "etcd_install_LoadState" "stdout") | eq "loaded" -}} - {{- $needInstalled = append $needInstalled . -}} + {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "loaded") ($.delete_nodes | default list | has . | not) -}} + {{- $installed = append $installed . -}} {{- end -}} {{- end -}} - {{ $needInstalled | first | default "" }} + {{ $installed | first | default "" }} need_installed_etcd: >- {{- $needInstalled := list -}} {{- range .groups.etcd -}} - {{- if (index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found" -}} + {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found") ($.delete_nodes | default list | has . | not) -}} {{- $needInstalled = append $needInstalled . -}} {{- end -}} {{- end -}} {{ $needInstalled | toJson }} + need_uninstall_etcd: >- + {{- $needUnInstalled := list -}} + {{- range .groups.etcd -}} + {{- if $.delete_nodes | default list | has . -}} + {{- $needUnInstalled = append $needUnInstalled . 
-}} + {{- end -}} + {{- end -}} + {{ $needUnInstalled | toJson }} -- name: Prepare | Check installed etcd version +- name: Prepare | Validate installed etcd version when: .etcd_install_LoadState.stdout | eq "loaded" block: - name: Prepare | Ensure target etcd version is not lower than installed version @@ -40,29 +48,32 @@ fail_msg: >- Installed etcd version: {{ index .etcd_install_version "stdout" "etcd Version" }} is lower than target etcd version: {{ .etcd.etcd_version }} -- name: Prepare | Synchronize etcd package to node if new install or upgrade - when: - - or (.etcd_install_version.error | empty | not) (.etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version"))) +- name: Prepare | Distribute etcd package for install or upgrade + when: >- + or + (.etcd_install_version.error | empty | not) + (.etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version"))) block: - - name: Prepare | Copy etcd binary package to remote node + - name: Prepare | Copy etcd binary package to node copy: src: >- {{ .binary_dir }}/etcd/{{ .etcd.etcd_version }}/{{ .binary_type }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz dest: >- {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz - - name: Prepare | Extract etcd binary package to /usr/local/bin/ + - name: Prepare | Extract etcd binaries to /usr/local/bin/ command: | tar --strip-components=1 -C /usr/local/bin/ -xvf {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz \ --wildcards 'etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}/etcd*' -- name: Prepare | Synchronize certificates to node for new install or expansion +- name: Prepare | Synchronize certificates and etcd.env when changed when: >- or - (.etcd_install_version.error | empty | not) + (.etcd_install_LoadState.stdout | eq "not-found") (and (.installed_etcd | empty | not) (.need_installed_etcd | fromJson | empty | not) ) 
+ (.need_uninstall_etcd | fromJson | empty | not) block: - name: Prepare | Copy CA certificate to etcd node copy: @@ -79,3 +90,7 @@ src: >- {{ .etcd.key_file }} dest: /etc/ssl/etcd/ssl/server.key + - name: Prepare | Render /etc/etcd.env configuration file + template: + src: etcd.env + dest: /etc/etcd.env diff --git a/builtin/core/roles/etcd/templates/etcd.env b/builtin/core/roles/etcd/prepare/templates/etcd.env similarity index 78% rename from builtin/core/roles/etcd/templates/etcd.env rename to builtin/core/roles/etcd/prepare/templates/etcd.env index a9a74707..2524aba2 100644 --- a/builtin/core/roles/etcd/templates/etcd.env +++ b/builtin/core/roles/etcd/prepare/templates/etcd.env @@ -1,15 +1,17 @@ {{- $ips := list -}} {{- $state := "new" -}} {{- range .groups.etcd | default list -}} - {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" -}} - {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" -}} - {{- if $internalIPv4 | empty | not -}} - {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv4) -}} - {{- else if $internalIPv6 | empty | not }} - {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv6) -}} - {{- end -}} - {{ if index $.hostvars . "etcd_install_LoadState" "stdout" | eq "loaded" -}} - {{- $state := "existing" -}} + {{- if $.need_uninstall_etcd | fromJson | default list | has . | not -}} + {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" -}} + {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" -}} + {{- if $internalIPv4 | empty | not -}} + {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv4) -}} + {{- else if $internalIPv6 | empty | not }} + {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv6) -}} + {{- end -}} + {{ if index $.hostvars . 
"etcd_install_LoadState" "stdout" | eq "loaded" -}}
+      {{- $state = "existing" -}}
+    {{- end -}}
   {{- end -}}
 {{- end -}}
 ETCD_DATA_DIR={{ .etcd.env.data_dir }}
diff --git a/builtin/core/roles/etcd/scaling_down/tasks/main.yaml b/builtin/core/roles/etcd/scaling_down/tasks/main.yaml
new file mode 100644
index 00000000..ea38c76c
--- /dev/null
+++ b/builtin/core/roles/etcd/scaling_down/tasks/main.yaml
@@ -0,0 +1,165 @@
+---
+# Scale down etcd: deregister the members listed in need_uninstall_etcd, uninstall etcd from
+# those hosts (or from every host when the list is empty), then restart the remaining members.
+- name: ScalingDown | Execute actions on etcd nodes scheduled for removal
+  block:
+    - name: ScalingDown | Remove etcd member from cluster
+      # Executed once, delegated to the host named by installed_etcd.
+      run_once: true
+      delegate_to: "{{ .installed_etcd }}"
+      when:
+        - .need_uninstall_etcd | default "[]" | fromJson | empty | not
+      command: |
+        {{- $need_uninstall_etcd_hosts := list -}}
+        {{- range (.need_uninstall_etcd | default "[]" | fromJson) -}}
+          {{- $need_uninstall_etcd_hosts = append $need_uninstall_etcd_hosts (index $.hostvars . "hostname") -}}
+        {{- end -}}
+        for hostname in {{ $need_uninstall_etcd_hosts | join " " }};do
+          # Get the member ID of the node to be removed (exact match on the member name field)
+          MEMBER_ID=$(ETCDCTL_API=3 etcdctl \
+            --endpoints=https://localhost:2379 \
+            --cacert=/etc/ssl/etcd/ssl/ca.crt \
+            --cert=/etc/ssl/etcd/ssl/server.crt \
+            --key=/etc/ssl/etcd/ssl/server.key \
+            member list | awk -F',' -v name="$hostname" '{gsub(/^ +| +$/,"",$3)} $3 == name {print $1}')
+
+          if [ -z "$MEMBER_ID" ]; then
+            echo "Member does not exist, skipping removal."
+            # Keep going: an early exit would skip the remaining hosts in this loop.
+            continue
+          fi
+          echo "Removing member $MEMBER_ID"
+
+          # Remove the member from the etcd cluster
+          ETCDCTL_API=3 etcdctl \
+            --endpoints=https://localhost:2379 \
+            --cacert=/etc/ssl/etcd/ssl/ca.crt \
+            --cert=/etc/ssl/etcd/ssl/server.crt \
+            --key=/etc/ssl/etcd/ssl/server.key \
+            member remove "$MEMBER_ID"
+
+          # Wait for the member ID to disappear from the list (ensure removal has been committed)
+          echo "Waiting for member $MEMBER_ID to disappear from the cluster..."
+
+          for i in $(seq 1 60); do
+            STILL_PRESENT=$(ETCDCTL_API=3 etcdctl \
+              --endpoints=https://localhost:2379 \
+              --cacert=/etc/ssl/etcd/ssl/ca.crt \
+              --cert=/etc/ssl/etcd/ssl/server.crt \
+              --key=/etc/ssl/etcd/ssl/server.key \
+              member list | awk -F',' '{print $1}' | grep -w "$MEMBER_ID" || true)
+
+            if [ -z "$STILL_PRESENT" ]; then
+              echo "Member $MEMBER_ID successfully removed from the cluster."
+              break
+            fi
+
+            sleep 2
+          done
+
+          if [ -n "$STILL_PRESENT" ]; then
+            echo "ERROR: Timeout waiting for member $MEMBER_ID to be removed."
+            exit 1
+          fi
+
+          # Wait for an etcd leader to exist (ensure quorum has recovered)
+          echo "Waiting for etcd leader to be present..."
+          ALL_ENDPOINTS=$(ETCDCTL_API=3 etcdctl \
+            --endpoints=https://localhost:2379 \
+            --cacert=/etc/ssl/etcd/ssl/ca.crt \
+            --cert=/etc/ssl/etcd/ssl/server.crt \
+            --key=/etc/ssl/etcd/ssl/server.key \
+            member list | awk -F',' '{gsub(/^ +| +$/,"",$5); print $5}' | tr '\n' ',' | sed 's/,$//')
+
+          if [ -z "$ALL_ENDPOINTS" ]; then
+            echo "ERROR: Cannot get endpoints from etcd member list"
+            exit 1
+          fi
+
+          for i in $(seq 1 60); do
+            # Use field 8 of the first non-empty "endpoint status" row as the leader marker.
+            # NOTE(review): confirm field 8 is the intended column for the etcdctl release in use.
+            LEADER=$(ETCDCTL_API=3 etcdctl \
+              --endpoints="$ALL_ENDPOINTS" \
+              --cacert=/etc/ssl/etcd/ssl/ca.crt \
+              --cert=/etc/ssl/etcd/ssl/server.crt \
+              --key=/etc/ssl/etcd/ssl/server.key \
+              endpoint status | awk -F',' '{print $8}' | grep -v "^$" | head -n1)
+
+            if [ -n "$LEADER" ] && [ "$LEADER" != "0" ]; then
+              echo "Leader is present: $LEADER"
+              break
+            fi
+
+            sleep 2
+          done
+
+          if [ -z "$LEADER" ] || [ "$LEADER" = "0" ]; then
+            echo "ERROR: No leader found after member removal."
+            exit 1
+          fi
+
+          echo "ETCD member $MEMBER_ID removed and quorum is stable."
+        done
+    - name: ScalingDown | delete etcd
+      # If need_uninstall_etcd is empty, remove the entire cluster.
+      # If need_uninstall_etcd is not empty, remove only the specified node.
+      when: >-
+        or
+        (.need_uninstall_etcd | default "[]" | fromJson | empty)
+        (.need_uninstall_etcd | default "[]" | fromJson | has .inventory_hostname)
+      block:
+        - name: ScalingDown | Stop and disable the etcd systemd service
+          ignore_errors: true
+          command: |
+            systemctl stop etcd.service
+            systemctl disable etcd.service
+            rm -rf /etc/systemd/system/etcd.service*
+            systemctl daemon-reload
+            systemctl reset-failed etcd.service
+        - name: ScalingDown | Remove traffic prioritization rules for etcd ports
+          when: .etcd.traffic_priority
+          command: |
+            tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2379 0xffff
+            tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2380 0xffff
+        - name: ScalingDown | Delete all etcd data, configuration, and binaries
+          command: |
+            rm -rf {{ .etcd.env.data_dir }}
+            rm -rf /etc/ssl/etcd/
+            rm -rf /etc/etcd.env
+            rm -rf /usr/local/bin/etcd*
+            rm -rf /var/lib/etcd*
+        - name: ScalingDown | Remove backup-etcd timer, service, and backup scripts
+          ignore_errors: true
+          command: |
+            systemctl disable --now backup-etcd.timer
+            rm /etc/systemd/system/backup-etcd.timer
+            rm -rf /etc/systemd/system/backup-etcd.service*
+            rm /usr/local/bin/kube-scripts/backup_etcd.sh
+            systemctl daemon-reexec && systemctl daemon-reload
+
+- name: ScalingDown | Restart etcd cluster on remaining members
+  when:
+    - .need_uninstall_etcd | default "[]" | fromJson | empty | not
+    - .etcd_install_LoadState.stdout | eq "loaded"
+    - .need_uninstall_etcd | default "[]" | fromJson | has .inventory_hostname | not
+  block:
+    - name: ScalingDown | Restart the etcd service
+      command: |
+        systemctl restart etcd.service
+    - name: ScalingDown | Wait for etcd service to become healthy
+      command: |
+        for ((i=1; i<=12; i++)); do
+          if ETCDCTL_API=3 etcdctl \
+            --endpoints=https://localhost:2379 \
+            --cacert=/etc/ssl/etcd/ssl/ca.crt \
+            --cert=/etc/ssl/etcd/ssl/server.crt \
+            --key=/etc/ssl/etcd/ssl/server.key \
+            endpoint health >/dev/null 2>&1; then
+            echo "✅ etcd is healthy"
+            exit 0
+          fi
+          sleep 5
+        done
+        echo "❌ etcd did not become healthy within 1 minute"
+        exit 1
\ No newline at end of file
diff --git a/builtin/core/roles/etcd/tasks/expansion.yaml b/builtin/core/roles/etcd/scaling_up/tasks/main.yaml
similarity index 54%
rename from builtin/core/roles/etcd/tasks/expansion.yaml
rename to builtin/core/roles/etcd/scaling_up/tasks/main.yaml
index e2fbea33..7cca4f58 100644
--- a/builtin/core/roles/etcd/tasks/expansion.yaml
+++ b/builtin/core/roles/etcd/scaling_up/tasks/main.yaml
@@ -1,14 +1,11 @@
-- name: Expansion | Expand cluster on existing etcd nodes
-  when: .etcd_install_LoadState.stdout | eq "loaded"
+- name: ScalingUp | Restart etcd cluster on existing members
+  when:
+    - .etcd_install_LoadState.stdout | eq "loaded"
   block:
-    - name: Expansion | Update /etc/etcd.env configuration file
-      template:
-        src: etcd.env
-        dest: /etc/etcd.env
-    - name: Expansion | Restart etcd service
+    - name: ScalingUp | Restart the etcd service
       command: |
         systemctl restart etcd.service
-    - name: Expansion | Verify etcd service becomes healthy within 1 minute
+    - name: ScalingUp | Ensure etcd service becomes healthy
       command: |
         for ((i=1; i<=12; i++)); do
           if ETCDCTL_API=3 etcdctl \
@@ -17,16 +14,18 @@
             --cert=/etc/ssl/etcd/ssl/server.crt \
             --key=/etc/ssl/etcd/ssl/server.key \
             endpoint health >/dev/null 2>&1; then
-            echo "✅ etcd is health"
+            echo "✅ etcd is healthy"
             exit 0
           fi
           sleep 5
         done
-        echo "❌ etcd etcd is not health in 1 minute"
+        echo "❌ etcd did not become healthy within 1 minute"
         exit 1
 
-- name: Expansion | Add new etcd member from non-installed node
-  when: .etcd_install_LoadState.stdout | eq "not-found"
+- name: ScalingUp | Add new etcd member from a node where etcd is not yet installed
+  when:
+    - .etcd_install_LoadState.stdout | eq "not-found"
+    - .need_installed_etcd | fromJson | has .inventory_hostname
   delegate_to: "{{ .installed_etcd }}"
command: | ETCDCTL_API=3 etcdctl \ @@ -34,4 +33,4 @@ --cacert=/etc/ssl/etcd/ssl/ca.crt \ --cert=/etc/ssl/etcd/ssl/server.crt \ --key=/etc/ssl/etcd/ssl/server.key \ - member add {{ .inventory_hostname }} \ No newline at end of file + member add {{ .hostname }} \ No newline at end of file diff --git a/builtin/core/roles/etcd/tasks/main.yaml b/builtin/core/roles/etcd/tasks/main.yaml deleted file mode 100644 index 167fccc7..00000000 --- a/builtin/core/roles/etcd/tasks/main.yaml +++ /dev/null @@ -1,20 +0,0 @@ ---- -- include_tasks: prepare.yaml - -- name: ETCD | Upgrade etcd if a newer version is available - when: - - .etcd_install_LoadState.stdout | eq "loaded" - - .etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version")) - include_tasks: upgrade.yaml - -- name: ETCD | Expand the etcd cluster by adding new nodes if required - when: - - .installed_etcd | empty | not - - .need_installed_etcd | fromJson | empty | not - include_tasks: expansion.yaml - -- name: ETCD | Install etcd and set up the backup service if not already present - when: .etcd_install_LoadState.stdout | eq "not-found" - block: - - include_tasks: install.yaml - - include_tasks: backup_service.yaml diff --git a/builtin/core/roles/etcd/tasks/upgrade.yaml b/builtin/core/roles/etcd/upgrade/tasks/main.yaml similarity index 100% rename from builtin/core/roles/etcd/tasks/upgrade.yaml rename to builtin/core/roles/etcd/upgrade/tasks/main.yaml diff --git a/builtin/core/roles/uninstall/etcd/tasks/main.yaml b/builtin/core/roles/uninstall/etcd/tasks/main.yaml deleted file mode 100644 index b32dc350..00000000 --- a/builtin/core/roles/uninstall/etcd/tasks/main.yaml +++ /dev/null @@ -1,31 +0,0 @@ ---- -- name: ETCD | Completely uninstall the etcd service and remove all related files - block: - - name: ETCD | Stop and disable the etcd systemd service - ignore_errors: true - command: | - systemctl stop etcd.service - systemctl disable etcd.service - rm -rf 
/etc/systemd/system/etcd.service* - systemctl daemon-reload - systemctl reset-failed etcd.service - - name: ETCD | Remove traffic priority rules for etcd ports - command: | - tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2379 0xffff - tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2380 0xffff - when: .etcd.traffic_priority - - name: ETCD | Delete all etcd data, configuration, and binaries - command: | - rm -rf {{ .etcd.env.data_dir }} - rm -rf /etc/ssl/etcd/ - rm -rf /etc/etcd.env - rm -rf /usr/local/bin/etcd* - -- name: ETCD | Uninstall backup-etcd timer and service, and remove backup scripts - ignore_errors: true - command: | - systemctl disable --now backup-etcd.timer - rm /etc/systemd/system/backup-etcd.timer - rm -rf /etc/systemd/system/backup-etcd.service* - rm /usr/local/bin/kube-scripts/backup_etcd.sh - systemctl daemon-reexec && systemctl daemon-reload diff --git a/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml b/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml index b21504fa..a8378d6b 100644 --- a/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml +++ b/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml @@ -2,7 +2,8 @@ - name: Kubernetes | Completely reset the node using kubeadm ignore_errors: true command: | - kubeadm reset -f + # After Kubernetes v1.27.0, the remove-etcd-member phase will automatically clean up /var/lib/etcd + kubeadm reset -f {{ if .etcd.deployment_type | eq "external" }}--skip-phases remove-etcd-member{{ end }} - name: Kubernetes | Gracefully stop and disable the kubelet service ignore_errors: true @@ -23,5 +24,4 @@ rm -rf /var/log/pods/ rm -rf /etc/kubernetes/ rm -rf ~/.kube/config - rm -rf /root/.kube/config - rm -rf /var/lib/etcd \ No newline at end of file + rm -rf /root/.kube/config \ No newline at end of file