Upgrade a GKE cluster to a new Kubernetes version
Safely upgrade GKE clusters to new Kubernetes versions with automated validation, health checks, and staged control plane/node upgrades. Use before production upgrades to ensure compatibility and minimize downtime.
/plugin marketplace add kcns008/cluster-code
/plugin install kcns008-cloud-gcp-plugins-cloud-gcp@kcns008/cluster-code

Safely upgrade a GKE cluster (Standard or Autopilot) to a new Kubernetes version with automated validation and monitoring.
This command provides a comprehensive GKE cluster upgrade workflow with:
Prerequisites:
- gcloud CLI (`gcloud --version` >= 450.0.0)
- kubectl configured with cluster admin access
- An authenticated gcloud account (`gcloud auth list`)
- IAM permissions: `container.clusters.update`, `container.clusters.get`, `container.operations.get`

GKE offers three release channels that automatically manage upgrades:
Clusters in release channels auto-upgrade during maintenance windows.
Clusters not enrolled in a release channel can specify exact versions but require manual upgrade management.
# Upgrade parameters are supplied through the environment. Each one defaults
# to the empty string so the script also runs when `set -u` is in effect.
CLUSTER_NAME="${CLUSTER_NAME:-}"
GCP_PROJECT="${GCP_PROJECT:-}"
GCP_REGION="${GCP_REGION:-}"
GCP_ZONE="${GCP_ZONE:-}"
TARGET_VERSION="${TARGET_VERSION:-}"

# Fail fast on a missing cluster name; otherwise the first gcloud call would
# report a confusing "Cluster '' not found" error.
if [[ -z "$CLUSTER_NAME" ]]; then
  echo "ERROR: Cluster name must be specified"
  exit 1
fi

# Determine location type
# A region takes precedence over a zone when both are provided. LOCATION_TYPE
# is later spliced into gcloud flags as --region=... or --zone=...
if [[ -n "$GCP_REGION" ]]; then
  LOCATION="$GCP_REGION"
  LOCATION_TYPE="region"
elif [[ -n "$GCP_ZONE" ]]; then
  LOCATION="$GCP_ZONE"
  LOCATION_TYPE="zone"
else
  echo "โ ERROR: Either --region or --zone must be specified"
  exit 1
fi
echo "๐ Validating cluster upgrade: $CLUSTER_NAME"
echo ""

# Every field below is extracted with jq; without it each value would come
# back empty and the script would fail later with a misleading state error.
if ! command -v jq &>/dev/null; then
  echo "ERROR: 'jq' is required but was not found in PATH"
  exit 1
fi

# Set project
gcloud config set project "$GCP_PROJECT"

# Check if cluster exists and get cluster details
# One describe call serves both purposes: a non-zero exit means the cluster
# could not be read, and on success its JSON is reused for all fields below.
if ! CLUSTER_INFO=$(gcloud container clusters describe "$CLUSTER_NAME" \
  "--${LOCATION_TYPE}=${LOCATION}" --format=json 2>/dev/null); then
  echo "โ ERROR: Cluster '$CLUSTER_NAME' not found in $LOCATION"
  exit 1
fi

CURRENT_VERSION=$(jq -r '.currentMasterVersion' <<<"$CLUSTER_INFO")
CLUSTER_STATUS=$(jq -r '.status' <<<"$CLUSTER_INFO")
CLUSTER_MODE=$(jq -r '.autopilot.enabled // false' <<<"$CLUSTER_INFO")
RELEASE_CHANNEL=$(jq -r '.releaseChannel.channel // "UNSPECIFIED"' <<<"$CLUSTER_INFO")
NODE_COUNT=$(jq -r '.currentNodeCount // 0' <<<"$CLUSTER_INFO")

# MODE is the human-readable form of the autopilot flag; later steps branch on it.
if [[ "$CLUSTER_MODE" == "true" ]]; then
  MODE="Autopilot"
else
  MODE="Standard"
fi

echo "Cluster: $CLUSTER_NAME"
echo "Mode: $MODE"
echo "Current version: $CURRENT_VERSION"
echo "Status: $CLUSTER_STATUS"
echo "Release channel: $RELEASE_CHANNEL"
echo "Nodes: $NODE_COUNT"

# Verify cluster is in RUNNING state
if [[ "$CLUSTER_STATUS" != "RUNNING" ]]; then
  echo "โ ERROR: Cluster must be in RUNNING state for upgrade"
  echo " Current state: $CLUSTER_STATUS"
  exit 1
fi

echo "โ
Cluster is ready for upgrade"
echo ""
echo "๐ Determining target version..."

# Get available versions and upgrade information
if ! SERVER_CONFIG=$(gcloud container get-server-config \
  "--${LOCATION_TYPE}=${LOCATION}" --format=json); then
  echo "ERROR: Could not retrieve GKE server config for $LOCATION"
  exit 1
fi
VALID_VERSIONS=$(jq -r '.validMasterVersions[]' <<<"$SERVER_CONFIG")

if [[ -z "$TARGET_VERSION" ]]; then
  # Auto-select target version based on release channel or current version
  if [[ "$RELEASE_CHANNEL" == "UNSPECIFIED" || "$RELEASE_CHANNEL" == "null" ]]; then
    # No release channel - get next available version.
    # CURRENT_MINOR holds "<major>.<minor>" here (e.g. "1.29").
    CURRENT_MINOR=$(cut -d. -f1-2 <<<"$CURRENT_VERSION")

    # Match "<major>.<minor>." with the dots escaped and a trailing dot, so a
    # prefix such as "1.2" cannot match "1.20.x" or "1x2...". The first match
    # is taken, as in the original; this presumes the list is ordered
    # newest-first - TODO confirm.
    TARGET_VERSION=$(grep -E "^${CURRENT_MINOR//./\\.}\\." <<<"$VALID_VERSIONS" | head -1)

    if [[ -z "$TARGET_VERSION" ]]; then
      # Try next minor version
      NEXT_MINOR=$(awk -F. '{print $1"."$2+1}' <<<"$CURRENT_MINOR")
      TARGET_VERSION=$(grep -E "^${NEXT_MINOR//./\\.}\\." <<<"$VALID_VERSIONS" | head -1)
    fi

    if [[ -z "$TARGET_VERSION" ]]; then
      echo "โ ERROR: Could not determine target version"
      echo " Available versions:"
      echo "$VALID_VERSIONS" | sed 's/^/ - /'
      exit 1
    fi
    echo "Auto-selected version: $TARGET_VERSION (latest compatible)"
  else
    # Use release channel default.
    # "channels" is a JSON array of {channel, defaultVersion, ...} objects, so
    # the entry has to be selected by its "channel" field, not indexed by name.
    TARGET_VERSION=$(jq -r --arg channel "$RELEASE_CHANNEL" \
      '.channels[]? | select(.channel == $channel) | .defaultVersion // empty' \
      <<<"$SERVER_CONFIG")
    if [[ "$TARGET_VERSION" == "null" || -z "$TARGET_VERSION" ]]; then
      echo "โ ERROR: Could not determine default version for $RELEASE_CHANNEL channel"
      exit 1
    fi
    echo "Release channel ($RELEASE_CHANNEL) version: $TARGET_VERSION"
  fi
else
  echo "Using specified version: $TARGET_VERSION"
  # Verify version is available.
  # -F/-x compare the whole line literally, so the dots in the version are not
  # treated as regex wildcards; "--" protects against a leading dash.
  if ! grep -Fxq -- "$TARGET_VERSION" <<<"$VALID_VERSIONS"; then
    echo "โ ERROR: Version $TARGET_VERSION is not available in $LOCATION"
    echo ""
    echo "Available versions:"
    echo "$VALID_VERSIONS" | sed 's/^/ - /'
    exit 1
  fi
fi

echo "Target version: $TARGET_VERSION"
echo ""
echo "โ
Validating upgrade path..."

# Split both versions into numeric components. The patch field drops any
# suffix after the first dash (e.g. "4-gke" -> "4").
CURRENT_MAJOR=$(cut -d. -f1 <<<"$CURRENT_VERSION")
CURRENT_MINOR=$(cut -d. -f2 <<<"$CURRENT_VERSION")
CURRENT_PATCH=$(cut -d. -f3 <<<"$CURRENT_VERSION" | cut -d- -f1)
TARGET_MAJOR=$(cut -d. -f1 <<<"$TARGET_VERSION")
TARGET_MINOR=$(cut -d. -f2 <<<"$TARGET_VERSION")
TARGET_PATCH=$(cut -d. -f3 <<<"$TARGET_VERSION" | cut -d- -f1)

# Reject downgrades. Only major and minor are compared; patch is not.
IS_DOWNGRADE="false"
if [[ "$TARGET_MAJOR" -lt "$CURRENT_MAJOR" ]]; then
  IS_DOWNGRADE="true"
elif [[ "$TARGET_MAJOR" -eq "$CURRENT_MAJOR" && "$TARGET_MINOR" -lt "$CURRENT_MINOR" ]]; then
  IS_DOWNGRADE="true"
fi
if [[ "$IS_DOWNGRADE" == "true" ]]; then
  echo "โ ERROR: Downgrading is not supported"
  echo " Current: $CURRENT_VERSION"
  echo " Target: $TARGET_VERSION"
  exit 1
fi

# Nothing to do when the cluster already runs the requested version.
if [[ "$CURRENT_VERSION" == "$TARGET_VERSION" ]]; then
  echo "โน๏ธ Cluster is already at version $TARGET_VERSION"
  echo " No upgrade needed"
  exit 0
fi

# Warn (but do not stop) when more than one minor version is skipped.
# NOTE(review): confirm the "GKE allows this" claim against current GKE
# control plane upgrade rules.
MINOR_DIFF=$((TARGET_MINOR - CURRENT_MINOR))
if [[ $MINOR_DIFF -gt 1 ]]; then
  echo "โ ๏ธ WARNING: Skipping minor versions (from $CURRENT_MINOR to $TARGET_MINOR)"
  echo " GKE allows this, but it's recommended to upgrade one minor version at a time"
fi

echo "โ
Valid upgrade path: $CURRENT_VERSION โ $TARGET_VERSION"
echo ""
echo "๐ฅ Running pre-upgrade health check..."

# Get kubectl credentials
# Abort when this fails: otherwise every kubectl call below would run against
# whatever context is currently active, which may be a different cluster.
if ! gcloud container clusters get-credentials "$CLUSTER_NAME" \
  "--${LOCATION_TYPE}=${LOCATION}" \
  --project="$GCP_PROJECT"; then
  echo "ERROR: Could not fetch kubectl credentials for cluster '$CLUSTER_NAME'"
  exit 1
fi

# Check node health
echo " Checking node health..."
# List nodes once and derive both counters from the same snapshot. A kubectl
# failure is reported instead of being counted as "0 unhealthy nodes".
if NODE_LIST=$(kubectl get nodes --no-headers); then
  # printf '%s' emits nothing for an empty list, so both counts are 0 then.
  TOTAL_NODES=$(printf '%s' "$NODE_LIST" | grep -c '')
  UNHEALTHY_NODES=$(printf '%s' "$NODE_LIST" | grep -vc " Ready")
  if [[ $UNHEALTHY_NODES -gt 0 ]]; then
    echo "โ ๏ธ WARNING: $UNHEALTHY_NODES of $TOTAL_NODES nodes are not Ready"
    kubectl get nodes | grep -v " Ready"
  else
    echo "โ
All $TOTAL_NODES nodes are Ready"
  fi
else
  TOTAL_NODES=0
  UNHEALTHY_NODES=0
  echo "WARNING: Could not list nodes; node health is unknown"
fi

# Check pod health
echo " Checking pod health..."
# stderr is discarded on purpose: kubectl writes "No resources found" there.
FAILING_PODS=$(kubectl get pods --all-namespaces \
  --field-selector=status.phase!=Running,status.phase!=Succeeded \
  --no-headers 2>/dev/null | wc -l)
if [[ $FAILING_PODS -gt 0 ]]; then
  echo "โ ๏ธ WARNING: $FAILING_PODS pods are not Running/Succeeded"
  echo " Review: kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded"
else
  echo "โ
All pods are healthy"
fi

# Check for deprecated APIs
echo " Checking for deprecated APIs..."
if command -v pluto &>/dev/null; then
  # TARGET_MAJOR/TARGET_MINOR come from the upgrade path validation step.
  pluto detect-all-in-cluster --target-versions "k8s=v${TARGET_MAJOR}.${TARGET_MINOR}.0"
else
  echo "โน๏ธ Install 'pluto' for deprecated API detection:"
  echo " brew install FairwindsOps/tap/pluto"
fi

# Check PodDisruptionBudgets
PDB_COUNT=$(kubectl get pdb --all-namespaces --no-headers 2>/dev/null | wc -l)
if [[ $PDB_COUNT -gt 0 ]]; then
  echo "โน๏ธ Found $PDB_COUNT PodDisruptionBudgets (may affect node upgrade timing)"
fi

echo "โ
Pre-upgrade health check completed"
# Export cluster and workload configuration before upgrading, unless the
# caller opted out (SKIP_BACKUP=true) or this is a dry run.
if [[ "${SKIP_BACKUP}" != "true" && "${DRY_RUN}" != "true" ]]; then
  echo ""
  echo "๐พ Creating pre-upgrade backup..."
  BACKUP_DIR="./cluster-backup-upgrade-$CLUSTER_NAME-$(date +%Y%m%d-%H%M%S)"

  # The backup contains a dump of every Secret, so restrict the directory to
  # the current user before anything is written into it.
  if ! mkdir -p "$BACKUP_DIR" || ! chmod 700 "$BACKUP_DIR"; then
    echo "ERROR: Could not create backup directory $BACKUP_DIR"
    exit 1
  fi

  # Set to "false" when a required export fails, so the final message does
  # not claim a complete backup.
  BACKUP_OK="true"

  # Backup cluster configuration
  gcloud container clusters describe "$CLUSTER_NAME" \
    "--${LOCATION_TYPE}=${LOCATION}" \
    --format=json > "$BACKUP_DIR/cluster-config.json" || BACKUP_OK="false"

  # Backup node pool configurations (Standard mode only); best effort, as in
  # the original, so errors are discarded.
  if [[ "$MODE" == "Standard" ]]; then
    gcloud container node-pools list \
      --cluster="$CLUSTER_NAME" \
      "--${LOCATION_TYPE}=${LOCATION}" \
      --format=json > "$BACKUP_DIR/node-pools.json" 2>/dev/null
  fi

  # Backup Kubernetes resources
  kubectl get all --all-namespaces -o yaml > "$BACKUP_DIR/all-resources.yaml" || BACKUP_OK="false"
  kubectl get pv,pvc --all-namespaces -o yaml > "$BACKUP_DIR/volumes.yaml" || BACKUP_OK="false"
  kubectl get configmap,secret --all-namespaces -o yaml > "$BACKUP_DIR/configs-secrets.yaml" || BACKUP_OK="false"
  # CRD export stays best effort.
  kubectl get crds -o yaml > "$BACKUP_DIR/crds.yaml" 2>/dev/null

  if [[ "$BACKUP_OK" != "true" ]]; then
    echo "WARNING: One or more backup exports failed; the backup in $BACKUP_DIR is incomplete"
  fi
  echo "โ
Backup saved to: $BACKUP_DIR"
fi
echo ""
echo "๐ UPGRADE EXECUTION"
echo "===================="

# Dry run: describe what would happen and stop before any change is made.
if [[ "${DRY_RUN}" == "true" ]]; then
  echo ""
  echo "๐งช DRY RUN MODE - No changes will be made"
  echo ""
  echo "Would upgrade:"
  echo " Cluster: $CLUSTER_NAME"
  echo " Mode: $MODE"
  echo " From: $CURRENT_VERSION"
  echo " To: $TARGET_VERSION"
  echo " Location: $LOCATION"
  echo " Node pools: ${UPGRADE_NODES}"
  exit 0
fi

echo ""
echo "๐ Upgrade Summary:"
echo " Cluster: $CLUSTER_NAME"
echo " Mode: $MODE"
echo " Current Version: $CURRENT_VERSION"
echo " Target Version: $TARGET_VERSION"
echo " Location: $LOCATION"
echo " Release Channel: $RELEASE_CHANNEL"
echo ""
echo "Upgrade phases:"
echo " 1. Control plane upgrade (~5-10 minutes)"
if [[ "${UPGRADE_NODES}" == "true" && "$MODE" == "Standard" ]]; then
  echo " 2. Node pool upgrades (~10-20 minutes per pool)"
elif [[ "$MODE" == "Autopilot" ]]; then
  echo " 2. Node upgrade (automatic, managed by GKE)"
fi
echo ""
echo "Impact:"
echo " โข Control plane: Brief API unavailability during upgrade"
echo " โข Workloads: Continue running during control plane upgrade"
if [[ "${UPGRADE_NODES}" == "true" ]]; then
  echo " โข Nodes: Rolling replacement (pods will be rescheduled)"
fi
echo ""

# Only the literal answer "yes" proceeds. -r keeps backslashes in the answer
# literal; a closed or non-interactive stdin makes read fail, which is treated
# as "no answer" and therefore cancels the upgrade.
CONFIRM=""
read -r -p "Continue with upgrade? (yes/no): " CONFIRM || CONFIRM=""
if [[ "$CONFIRM" != "yes" ]]; then
  echo "โ Upgrade cancelled"
  exit 0
fi
echo ""
echo "Step 1: Upgrading control plane to $TARGET_VERSION..."
echo "โณ This may take 5-10 minutes..."
echo ""

# Start timestamp (epoch seconds); the final summary uses it to report the
# total duration.
UPGRADE_START=$(date +%s)

# Run the control plane upgrade and stop the script if gcloud reports failure.
if ! gcloud container clusters upgrade "$CLUSTER_NAME" \
  --master \
  --cluster-version="$TARGET_VERSION" \
  --$LOCATION_TYPE="$LOCATION" \
  --quiet; then
  echo "โ Control plane upgrade failed"
  exit 1
fi
echo "โ
Control plane upgraded successfully"

# Read the version back from the API and compare it with the requested one.
# A mismatch is reported but does not stop the script.
echo ""
echo "Verifying control plane version..."
NEW_MASTER_VERSION=$(gcloud container clusters describe "$CLUSTER_NAME" \
  --$LOCATION_TYPE="$LOCATION" \
  --format="value(currentMasterVersion)")

case "$NEW_MASTER_VERSION" in
  "$TARGET_VERSION")
    echo "โ
Control plane verified at version $NEW_MASTER_VERSION"
    ;;
  *)
    echo "โ ๏ธ Control plane version mismatch"
    echo " Expected: $TARGET_VERSION"
    echo " Actual: $NEW_MASTER_VERSION"
    ;;
esac
# Step 2: bring the nodes to the target version. Standard clusters are
# upgraded pool by pool; for Autopilot only guidance is printed.
if [[ "${UPGRADE_NODES}" == "true" ]]; then
  if [[ "$MODE" == "Standard" ]]; then
    echo ""
    echo "Step 2: Upgrading node pools..."

    # Get all node pools
    # A failed listing must not be mistaken for "cluster has no node pools",
    # otherwise the script would go on to report a successful upgrade.
    if ! NODE_POOLS=$(gcloud container node-pools list \
      --cluster="$CLUSTER_NAME" \
      "--${LOCATION_TYPE}=${LOCATION}" \
      --format="value(name)"); then
      echo "ERROR: Could not list node pools for cluster '$CLUSTER_NAME'"
      exit 1
    fi

    if [[ -z "$NODE_POOLS" ]]; then
      echo "โน๏ธ No node pools found"
    else
      # Read one pool name per line on fd 3 so that commands inside the loop
      # cannot consume the list through stdin.
      while IFS= read -r POOL <&3; do
        [[ -n "$POOL" ]] || continue
        echo ""
        echo " Upgrading node pool: $POOL"

        # Get current pool version
        POOL_VERSION=$(gcloud container node-pools describe "$POOL" \
          --cluster="$CLUSTER_NAME" \
          "--${LOCATION_TYPE}=${LOCATION}" \
          --format="value(version)")
        echo " Current version: $POOL_VERSION"
        echo " Target version: $TARGET_VERSION"

        if [[ "$POOL_VERSION" == "$TARGET_VERSION" ]]; then
          echo " โ
Already at target version"
          continue
        fi

        echo " โณ Upgrading (rolling node replacement)..."
        # Upgrade node pool; stop at the first failure so later pools are not
        # touched while the cluster is in a partially upgraded state.
        if gcloud container clusters upgrade "$CLUSTER_NAME" \
          --node-pool="$POOL" \
          --cluster-version="$TARGET_VERSION" \
          "--${LOCATION_TYPE}=${LOCATION}" \
          --quiet; then
          echo " โ
Node pool $POOL upgraded successfully"
        else
          echo " โ Node pool $POOL upgrade failed"
          exit 1
        fi

        # Brief pause between node pool upgrades
        sleep 5
      done 3<<<"$NODE_POOLS"
      echo ""
      echo "โ
All node pools upgraded"
    fi
  elif [[ "$MODE" == "Autopilot" ]]; then
    echo ""
    echo "Step 2: Node upgrade (Autopilot mode)"
    echo " Autopilot clusters automatically upgrade nodes"
    echo " Node upgrade will happen gradually over the next few hours"
    echo " Monitor progress: gcloud container operations list --filter='type=UPGRADE_NODES'"
  fi
else
  echo ""
  echo "โ ๏ธ Skipping node upgrade (--upgrade-nodes=false)"
  echo " Nodes are still running version $CURRENT_VERSION"
  echo ""
  echo "To upgrade nodes manually:"
  if [[ "$MODE" == "Standard" ]]; then
    echo " gcloud container clusters upgrade $CLUSTER_NAME \\"
    echo " --node-pool=<POOL_NAME> \\"
    echo " --cluster-version=$TARGET_VERSION \\"
    echo " --$LOCATION_TYPE=$LOCATION"
  else
    echo " Autopilot nodes will auto-upgrade during maintenance window"
  fi
fi
echo ""
echo "๐ POST-UPGRADE VERIFICATION"
echo "============================"

# Verify control plane version
echo ""
echo "Control Plane Version:"
FINAL_MASTER_VERSION=$(gcloud container clusters describe "$CLUSTER_NAME" \
  "--${LOCATION_TYPE}=${LOCATION}" \
  --format="value(currentMasterVersion)")
if [[ "$FINAL_MASTER_VERSION" == "$TARGET_VERSION" ]]; then
  echo "โ
$FINAL_MASTER_VERSION"
else
  echo "โ ๏ธ $FINAL_MASTER_VERSION (expected: $TARGET_VERSION)"
fi

# Check node versions
echo ""
echo "Node Versions:"
# The last column reports the status of the "Ready" condition (True/False/
# Unknown). Printing ".status.conditions[-1].type" would only show the name of
# whichever condition is listed last, not whether the node is ready. The spec
# is single-quoted so the shell does not interpret the brackets or quotes.
kubectl get nodes -o custom-columns='NAME:.metadata.name,VERSION:.status.nodeInfo.kubeletVersion,READY:.status.conditions[?(@.type=="Ready")].status'

# Count nodes by version
echo ""
echo "Node Version Distribution:"
kubectl get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' |
  sort | uniq -c | sed 's/^/ /'

# Check node health
echo ""
echo "Node Health:"
# Take one snapshot of the node list; a kubectl failure is reported rather
# than turning into a misleading "All 0 nodes are Ready".
if NODE_LIST=$(kubectl get nodes --no-headers); then
  # printf '%s' emits nothing for an empty list, so both counts are 0 then.
  READY_NODES=$(printf '%s' "$NODE_LIST" | grep -c " Ready")
  TOTAL_NODES=$(printf '%s' "$NODE_LIST" | grep -c '')
  if [[ $READY_NODES -eq $TOTAL_NODES ]]; then
    echo "โ
All $TOTAL_NODES nodes are Ready"
  else
    echo "โ ๏ธ Only $READY_NODES of $TOTAL_NODES nodes are Ready"
  fi
else
  READY_NODES=0
  TOTAL_NODES=0
  echo "WARNING: Could not list nodes; node health is unknown"
fi

# Check pod health
echo ""
echo "Pod Health:"
RUNNING_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l)
TOTAL_PODS=$(kubectl get pods --all-namespaces --no-headers 2>/dev/null | wc -l)
echo "Running Pods: $RUNNING_PODS / $TOTAL_PODS"

FAILING_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l)
if [[ $FAILING_PODS -gt 0 ]]; then
  echo "โ ๏ธ $FAILING_PODS pods not in Running/Succeeded state"
  echo ""
  echo "Review failing pods:"
  kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded
else
  echo "โ
All pods healthy"
fi

# Run cluster diagnostics (optional tool; output is capped at 20 lines)
echo ""
echo "Running cluster diagnostics..."
if command -v k8sgpt &>/dev/null; then
  k8sgpt analyze --explain --filter=Node,Pod | head -20
else
  echo "โน๏ธ Install k8sgpt for AI-powered diagnostics"
fi
# Calculate upgrade time
# Whole minutes elapsed since UPGRADE_START (epoch seconds recorded just
# before the control plane upgrade began).
UPGRADE_END=$(date +%s)
TOTAL_TIME=$(( (UPGRADE_END - UPGRADE_START) / 60 ))

echo ""
echo "โ
UPGRADE COMPLETED SUCCESSFULLY"
echo "=================================="
echo ""
echo "Cluster: $CLUSTER_NAME"
echo "Mode: $MODE"
echo "Old version: $CURRENT_VERSION"
echo "New version: $FINAL_MASTER_VERSION"
echo "Total time: $TOTAL_TIME minutes"
echo ""
echo "Next steps:"
echo " 1. Monitor workloads for any compatibility issues"
echo " 2. Test critical application flows"
echo " 3. Review Kubernetes $TARGET_VERSION release notes"
echo " 4. Update documentation and runbooks"

# The optional steps are numbered with a counter: both conditions can hold at
# once (Autopilot cluster without node upgrade), which would otherwise print
# two different steps labelled "5.".
STEP_NUM=5
if [[ "${UPGRADE_NODES}" != "true" ]]; then
  echo " $STEP_NUM. Schedule node pool upgrades"
  STEP_NUM=$((STEP_NUM + 1))
fi
if [[ "$MODE" == "Autopilot" ]]; then
  echo " $STEP_NUM. Monitor automatic node upgrades over next few hours"
  STEP_NUM=$((STEP_NUM + 1))
fi

echo ""
echo "GKE Console: https://console.cloud.google.com/kubernetes/clusters/$LOCATION_TYPE/$LOCATION/$CLUSTER_NAME?project=$GCP_PROJECT"
echo ""
GKE does not support direct rollback. If upgrade causes issues:
For critical issues, contact Google Cloud Support with:
| Aspect | Autopilot | Standard |
|---|---|---|
| Control Plane | Auto-upgrades in channel | Auto or manual |
| Nodes | Auto-upgrades gradually | Manual or auto |
| Timing | Maintenance window | On-demand or scheduled |
| Control | Limited (channel-based) | Full control |
| Disruption | Minimized by GKE | Depends on configuration |
Solution: This is expected. Wait 5-10 minutes for upgrade to complete.
Solution: Check node logs and events:
kubectl describe node <NODE_NAME>
gcloud logging read "resource.type=k8s_node AND resource.labels.node_name=<NODE_NAME>"
Solution: Check for deprecated API usage:
kubectl api-resources --verbs=list --namespaced -o name | \
xargs -n 1 kubectl get --show-kind --ignore-not-found -A
- gcp-cluster-delete: Delete clusters
- gcp-cluster-create: Create new clusters
- cluster-diagnose: Run comprehensive diagnostics
- backup-cluster: Create cluster backup
- node-drain: Manually drain nodes