# E2E workflow for Kubernetes 1.30.
# (Run context: PR #371 — "feat: add AlibabaCloud-AutoNLBs-V3 network plugin".)
# NOTE(review): this file was captured from the GitHub "Workflow file for this
# run" page; the non-YAML page residue that preceded the workflow has been
# replaced by this comment so the file parses as YAML.

name: E2E-1.30
on:
push:
branches:
- master
- release-*
pull_request: {}
workflow_dispatch: {}
env:
# Common versions
GO_VERSION: "1.23.4"
KIND_VERSION: "v0.22.0"
KIND_IMAGE: "kindest/node:v1.30.8"
KIND_CLUSTER_NAME: "ci-testing"
CERT_MANAGER_VERSION: "v1.18.2"
jobs:
game-kruise:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
fetch-tags: true
- name: Ensure tags are available
run: git fetch --force --tags
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: ${{ env.GO_VERSION }}
- name: Determine build metadata
run: |
echo "::group::Determine build metadata"
bash ./scripts/ci/determine-build-metadata.sh
echo "::endgroup::"
# Prepare audit policy before cluster creation so extraMounts can find it
- name: Prepare audit policy
run: |
echo "::group::Prepare audit policy"
bash ./scripts/ci/prepare-kind-audit.sh
echo "::endgroup::"
- name: Setup Kind Cluster
uses: helm/kind-action@v1.12.0
with:
node_image: ${{ env.KIND_IMAGE }}
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
config: ./test/kind-conf.yaml
version: ${{ env.KIND_VERSION }}
- name: Ensure audit log file exists and is world-readable
run: |
echo "::group::Ensure audit log file"
bash ./scripts/ci/ensure-audit-log.sh
echo "::endgroup::"
- name: Build image
run: |
echo "::group::Build manager image"
bash ./scripts/ci/build-manager-image.sh
echo "::endgroup::"
- name: Install Cert-Manager
run: |
echo "::group::Install Cert-Manager"
bash ./scripts/ci/install-cert-manager.sh
echo "::endgroup::"
- name: Deploy Observability Infrastructure
run: |
echo "::group::Deploy observability stack"
set -ex
echo "=== Deploying observability stack for tracing E2E tests ==="
cd test/e2e
# Deploy the stack (script will not exit on pod failures)
./setup-k8s-observability.sh deploy
echo ""
echo "=== Checking deployment status ==="
kubectl get pods -n observability -o wide
# Check if OTel Collector is running properly
OTEL_READY=$(kubectl get pods -n observability -l app=otel-collector -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
if [ "$OTEL_READY" != "true" ]; then
echo ""
echo "❌ ERROR: OTel Collector is not ready!"
echo ""
echo "=== Running comprehensive diagnostics ==="
./debug-otel-collector.sh observability || true
echo ""
echo "=== Extracting error keywords from logs ==="
kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 | grep -E -i "error|fatal|panic|fail|invalid" | head -50 || echo "No obvious errors found"
echo ""
echo "=== Checking previous logs if pod restarted ==="
kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 || echo "No previous logs available"
exit 1
fi
# Check other components (warnings only, don't fail)
for component in tempo loki prometheus; do
READY=$(kubectl get pods -n observability -l app=$component -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
if [ "$READY" != "true" ]; then
echo "⚠️ WARNING: $component is not ready, but continuing..."
else
echo "✅ $component is ready"
fi
done
echo ""
echo "=== Final observability stack status ==="
kubectl get pods -n observability
echo "✅ Observability stack deployment completed"
echo "::endgroup::"
- name: Install Kruise
run: |
echo "::group::Install Kruise"
bash ./scripts/ci/install-kruise.sh
echo "::endgroup::"
- name: Install Kruise Game in HA mode
run: |
echo "::group::Install Kruise Game"
set -ex
IMG=${E2E_IMAGE} \
ENABLE_HA=true \
ENABLE_TRACING=true \
OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
OTEL_SAMPLING_RATE=1.0 \
./scripts/deploy_kind.sh
# Wait for the controller manager to be ready at least 1 replica
for i in {1..30}; do
set +e
PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
set -e
if [ "$PODS" -eq "1" ]; then
break
fi
echo "Waiting for controller ready... ($i/10)"
sleep 3
done
# Verify that at least 2 controller replicas are running
PODS=$(kubectl get pod -n kruise-game-system --selector=control-plane=controller-manager -o jsonpath='{.items..metadata.name}' | wc -w)
if [ "$PODS" -lt "2" ]; then
echo "HA mode requires at least 2 controller replicas, but found $PODS"
kubectl get pod -n kruise-game-system
exit 1
fi
echo "Kruise Game installed in HA mode successfully"
echo "::endgroup::"
- name: Verify Kind Cluster
run: |
echo "::group::Verify Kind cluster"
bash ./scripts/ci/verify-kind-cluster.sh
echo "::endgroup::"
- name: Setup Port Forwards for Observability
run: |
echo "::group::Setup observability port-forwards"
set -x # Enable command echoing for debugging
echo "=== Setting up port forwards for Tempo and Loki ==="
# First, verify the services exist and have endpoints
echo "--- Checking Tempo service ---"
kubectl get svc -n observability tempo -o yaml || echo "❌ Tempo service not found"
kubectl get endpoints -n observability tempo || echo "❌ Tempo endpoints not found"
echo "--- Checking Loki service ---"
kubectl get svc -n observability loki -o yaml || echo "❌ Loki service not found"
kubectl get endpoints -n observability loki || echo "❌ Loki endpoints not found"
echo "--- Checking Tempo pod status ---"
kubectl get pods -n observability -l app.kubernetes.io/name=tempo || echo "❌ No Tempo pods"
echo "--- Checking Loki pod status ---"
kubectl get pods -n observability -l app.kubernetes.io/name=loki || echo "❌ No Loki pods"
# Port forward Tempo (background, with verbose output)
echo "--- Starting Tempo port-forward ---"
kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 &
TEMPO_PID=$!
echo $TEMPO_PID > /tmp/tempo-pf.pid
echo "Tempo port-forward PID: $TEMPO_PID"
# Port forward Loki (background, with verbose output)
echo "--- Starting Loki port-forward ---"
kubectl port-forward -n observability svc/loki 3100:3100 -v=6 &
LOKI_PID=$!
echo $LOKI_PID > /tmp/loki-pf.pid
echo "Loki port-forward PID: $LOKI_PID"
# Wait for port forwards to be ready
echo "--- Waiting for port forwards to establish ---"
sleep 10
# Check if processes are still running
echo "--- Checking port-forward processes ---"
if ps -p $TEMPO_PID > /dev/null; then
echo "✓ Tempo port-forward process is running"
else
echo "❌ Tempo port-forward process died"
cat /tmp/tempo-pf.pid
fi
if ps -p $LOKI_PID > /dev/null; then
echo "✓ Loki port-forward process is running"
else
echo "❌ Loki port-forward process died"
cat /tmp/loki-pf.pid
fi
# Check if ports are listening
echo "--- Checking listening ports ---"
netstat -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not listening"
ss -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not found by ss"
# Try to connect to the ports
echo "--- Testing connectivity ---"
echo "Testing Tempo (localhost:3200)..."
if curl -v --max-time 5 http://localhost:3200/ready 2>&1; then
echo "✓ Tempo /ready endpoint responded"
else
echo "❌ Tempo /ready endpoint failed"
fi
echo "Testing Tempo search API..."
if curl -v --max-time 5 "http://localhost:3200/api/search?tags=service.name=test&limit=1" 2>&1; then
echo "✓ Tempo /api/search endpoint responded"
else
echo "❌ Tempo /api/search endpoint failed"
fi
echo "Testing Loki (localhost:3100)..."
if curl -v --max-time 5 http://localhost:3100/ready 2>&1; then
echo "✓ Loki /ready endpoint responded"
else
echo "❌ Loki /ready endpoint failed"
fi
echo "--- Port forward setup complete ---"
echo "TEMPO_PID=$TEMPO_PID"
echo "LOKI_PID=$LOKI_PID"
echo "::endgroup::"
- name: Verify Tracing Configuration
run: |
echo "::group::Verify tracing configuration"
bash ./scripts/ci/verify-tracing-config.sh
echo "::endgroup::"
- name: Verify Controller Metrics Endpoint
run: |
echo "::group::Verify controller metrics"
set -euo pipefail
echo "=== Verifying controller metrics endpoint ==="
METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep metrics-service | head -n 1 || true)
if [ -z "$METRICS_SVC" ]; then
echo "❌ Could not find controller metrics Service"
kubectl get svc -n kruise-game-system
exit 1
fi
echo "Using metrics service: $METRICS_SVC"
echo "Waiting for metrics endpoints to be ready..."
for i in {1..12}; do
ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
if [ -n "$ENDPOINT_READY" ]; then
echo "Endpoints ready (IP=$ENDPOINT_READY)"
break
fi
echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
sleep 5
done
ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
if [ -z "$ENDPOINT_READY" ]; then
echo "❌ Metrics service has no ready endpoints"
kubectl describe svc "$METRICS_SVC" -n kruise-game-system
kubectl get pods -n kruise-game-system -l control-plane=controller-manager
exit 1
fi
echo "Attempting to query metrics via API server service proxy..."
set +e
PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err | head -n 200)
PROXY_STATUS=$?
set -e
if [ $PROXY_STATUS -ne 0 ]; then
echo "❌ Service proxy request failed:"
cat /tmp/proxy_err
echo "--- Service describe ---"
kubectl describe svc "$METRICS_SVC" -n kruise-game-system || true
echo "--- Endpoints ---"
kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml || true
echo "--- Controller pods ---"
kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide || true
echo "Attempting to read metrics directly from controller pod..."
CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
set +e
DIRECT_FULL_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err)
DIRECT_STATUS=$?
set -e
if [ $DIRECT_STATUS -ne 0 ]; then
echo "❌ Direct pod metrics request failed:"
cat /tmp/direct_err
exit 1
fi
DIRECT_OUTPUT=$(echo "$DIRECT_FULL_OUTPUT" | head -n 200)
echo "--- Sample metrics output (first 20 lines) ---"
echo "$DIRECT_OUTPUT" | head -n 20
if ! echo "$DIRECT_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
echo "❌ Expected controller-runtime metrics not found even via direct pod exec"
exit 1
fi
echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
else
echo "--- Sample metrics output (first 20 lines) ---"
echo "$PROXY_OUTPUT" | head -n 20
if ! echo "$PROXY_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
echo "❌ Expected controller-runtime metrics not found in /metrics output"
exit 1
fi
echo "✅ Controller metrics endpoint reachable via service proxy"
fi
echo "::endgroup::"
- name: Run E2E Tests before failover
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
E2E_ARTIFACT_SUFFIX: before-ha
E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
TEMPO_URL: http://localhost:3200
LOKI_URL: http://localhost:3100
E2E_OBSERVABILITY_DEBUG: "true"
run: |
echo "::group::Run E2E tests (before failover)"
bash ./scripts/ci/run-e2e-tests.sh
echo "::endgroup::"
- name: Test HA Failover
run: |
echo "::group::Test HA failover"
set -e
NAMESPACE=kruise-game-system
LEASE_NAME=game-kruise-manager
echo "--- Identifying initial leader ---"
LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
if [ -z "$LEADER_POD" ]; then
echo "Could not determine leader pod."
exit 1
fi
echo "Current leader is $LEADER_POD"
echo "--- Deleting leader pod to trigger failover ---"
kubectl delete pod $LEADER_POD -n $NAMESPACE
echo "--- Waiting for new leader to be elected ---"
for i in {1..30}; do
NEW_LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
if [ -n "$NEW_LEADER_POD" ] && [ "$NEW_LEADER_POD" != "$LEADER_POD" ]; then
echo "New leader elected: $NEW_LEADER_POD"
break
fi
echo "Waiting for new leader... ($i/30)"
sleep 5
done
if [ "$NEW_LEADER_POD" == "$LEADER_POD" ] || [ -z "$NEW_LEADER_POD" ]; then
echo "Failover failed. A new leader was not elected in time."
kubectl get lease $LEASE_NAME -n $NAMESPACE -o yaml
exit 1
fi
echo "--- Verifying all controller pods are ready after failover ---"
# Wait for the controller manager to be ready at least 1 replica
for i in {1..30}; do
set +e
PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
set -e
if [ "$PODS" -eq "1" ]; then
break
fi
echo "Waiting for controller ready... ($i/10)"
sleep 3
done
echo "HA Failover successful."
echo "::endgroup::"
- name: Restore Port Forwards after HA Failover
run: |
echo "::group::Restore observability port-forwards"
echo "=== Re-establishing port forwards after HA failover ==="
# Port forward Tempo (background)
echo "--- Starting Tempo port-forward ---"
kubectl port-forward -n observability svc/tempo 3200:3200 &
TEMPO_PID=$!
echo $TEMPO_PID > /tmp/tempo-pf.pid
echo "Tempo port-forward PID: $TEMPO_PID"
# Port forward Loki (background)
echo "--- Starting Loki port-forward ---"
kubectl port-forward -n observability svc/loki 3100:3100 &
LOKI_PID=$!
echo $LOKI_PID > /tmp/loki-pf.pid
echo "Loki port-forward PID: $LOKI_PID"
# Wait for port forwards to be ready
echo "--- Waiting for port forwards to establish ---"
sleep 10
# Verify connectivity
echo "--- Testing Tempo connectivity ---"
curl -s --max-time 5 http://localhost:3200/ready || echo "Warning: Tempo not immediately reachable"
echo "--- Port forwards restored ---"
echo "::endgroup::"
- name: Run E2E Tests after failover
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
E2E_ARTIFACT_SUFFIX: after-ha
E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
E2E_GINKGO_FLAGS: "-v"
E2E_MAX_RESTARTS: "1"
TEMPO_URL: http://localhost:3200
LOKI_URL: http://localhost:3100
E2E_OBSERVABILITY_DEBUG: "false"
run: |
echo "::group::Run E2E tests (after failover)"
bash ./scripts/ci/run-e2e-tests.sh
echo "::endgroup::"
- name: Collect Additional Diagnostics
if: always()
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
run: |
echo "::group::Collect E2E diagnostics"
bash ./scripts/ci/collect-e2e-artifacts.sh
echo "::endgroup::"
- name: Upload E2E Test Artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-test-artifacts-${{ env.KIND_VERSION }}
path: /tmp/e2e-artifacts
if-no-files-found: warn
retention-days: 7
compression-level: 6