# E2E workflow for Kubernetes 1.30.
# (Run context: PR #371 — "feat: add AlibabaCloud-AutoNLBs-V3 network plugin".)
# NOTE(review): this file was captured from the GitHub "Workflow file for this
# run" page; the non-YAML page residue that preceded the workflow has been
# replaced by this comment so the file parses as YAML.

name: E2E-1.30
on:
push:
branches:
- master
- release-*
pull_request: {}
workflow_dispatch: {}
env:
# Common versions
GO_VERSION: "1.23.4"
KIND_VERSION: "v0.22.0"
KIND_IMAGE: "kindest/node:v1.30.8"
KIND_CLUSTER_NAME: "ci-testing"
CERT_MANAGER_VERSION: "v1.18.2"
jobs:
game-kruise:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
fetch-tags: true
- name: Ensure tags are available
run: git fetch --force --tags
- name: Setup Go
uses: actions/setup-go@v3
with:
go-version: ${{ env.GO_VERSION }}
- name: Determine build metadata
run: |
echo "::group::Determine build metadata"
bash ./scripts/ci/determine-build-metadata.sh
echo "::endgroup::"
# Prepare audit policy before cluster creation so extraMounts can find it
- name: Prepare audit policy
run: |
echo "::group::Prepare audit policy"
bash ./scripts/ci/prepare-kind-audit.sh
echo "::endgroup::"
- name: Setup Kind Cluster
uses: helm/kind-action@v1.12.0
with:
node_image: ${{ env.KIND_IMAGE }}
cluster_name: ${{ env.KIND_CLUSTER_NAME }}
config: ./test/kind-conf.yaml
version: ${{ env.KIND_VERSION }}
- name: Ensure audit log file exists and is world-readable
run: |
echo "::group::Ensure audit log file"
bash ./scripts/ci/ensure-audit-log.sh
echo "::endgroup::"
- name: Build image
run: |
echo "::group::Build manager image"
bash ./scripts/ci/build-manager-image.sh
echo "::endgroup::"
- name: Install Cert-Manager
run: |
echo "::group::Install Cert-Manager"
bash ./scripts/ci/install-cert-manager.sh
echo "::endgroup::"
- name: Deploy Observability Infrastructure
run: |
echo "::group::Deploy observability stack"
set -ex
echo "=== Deploying observability stack for tracing E2E tests ==="
cd test/e2e
# Deploy the stack (script will not exit on pod failures)
./setup-k8s-observability.sh deploy
echo ""
echo "=== Checking deployment status ==="
kubectl get pods -n observability -o wide
# Check if OTel Collector is running properly
OTEL_READY=$(kubectl get pods -n observability -l app=otel-collector -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
if [ "$OTEL_READY" != "true" ]; then
echo ""
echo "❌ ERROR: OTel Collector is not ready!"
echo ""
echo "=== Running comprehensive diagnostics ==="
./debug-otel-collector.sh observability || true
echo ""
echo "=== Extracting error keywords from logs ==="
kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 | grep -E -i "error|fatal|panic|fail|invalid" | head -50 || echo "No obvious errors found"
echo ""
echo "=== Checking previous logs if pod restarted ==="
kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 || echo "No previous logs available"
exit 1
fi
# Check other components (warnings only, don't fail)
for component in tempo loki prometheus; do
READY=$(kubectl get pods -n observability -l app=$component -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
if [ "$READY" != "true" ]; then
echo "⚠️ WARNING: $component is not ready, but continuing..."
else
echo "✅ $component is ready"
fi
done
echo ""
echo "=== Final observability stack status ==="
kubectl get pods -n observability
echo "✅ Observability stack deployment completed"
echo "::endgroup::"
- name: Install Kruise
run: |
echo "::group::Install Kruise"
bash ./scripts/ci/install-kruise.sh
echo "::endgroup::"
- name: Install Kruise Game in HA mode
run: |
echo "::group::Install Kruise Game"
set -ex
IMG=${E2E_IMAGE} \
ENABLE_HA=true \
ENABLE_TRACING=true \
OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
OTEL_SAMPLING_RATE=1.0 \
./scripts/deploy_kind.sh
# Wait for the controller manager to be ready at least 1 replica
for i in {1..30}; do
set +e
PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
set -e
if [ "$PODS" -eq "1" ]; then
break
fi
echo "Waiting for controller ready... ($i/10)"
sleep 3
done
# Verify that at least 2 controller replicas are running
PODS=$(kubectl get pod -n kruise-game-system --selector=control-plane=controller-manager -o jsonpath='{.items..metadata.name}' | wc -w)
if [ "$PODS" -lt "2" ]; then
echo "HA mode requires at least 2 controller replicas, but found $PODS"
kubectl get pod -n kruise-game-system
exit 1
fi
echo "Kruise Game installed in HA mode successfully"
echo "::endgroup::"
- name: Verify Kind Cluster
run: |
echo "::group::Verify Kind cluster"
bash ./scripts/ci/verify-kind-cluster.sh
echo "::endgroup::"
- name: Setup Port Forwards for Observability
run: |
echo "::group::Setup observability port-forwards"
set -x # Enable command echoing for debugging
echo "=== Setting up port forwards for Tempo and Loki ==="
# First, verify the services exist and have endpoints
echo "--- Checking Tempo service ---"
kubectl get svc -n observability tempo -o yaml || echo "❌ Tempo service not found"
kubectl get endpoints -n observability tempo || echo "❌ Tempo endpoints not found"
echo "--- Checking Loki service ---"
kubectl get svc -n observability loki -o yaml || echo "❌ Loki service not found"
kubectl get endpoints -n observability loki || echo "❌ Loki endpoints not found"
echo "--- Checking Tempo pod status ---"
kubectl get pods -n observability -l app.kubernetes.io/name=tempo || echo "❌ No Tempo pods"
echo "--- Checking Loki pod status ---"
kubectl get pods -n observability -l app.kubernetes.io/name=loki || echo "❌ No Loki pods"
# Port forward Tempo (background, with verbose output)
echo "--- Starting Tempo port-forward ---"
kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 &
TEMPO_PID=$!
echo $TEMPO_PID > /tmp/tempo-pf.pid
echo "Tempo port-forward PID: $TEMPO_PID"
# Port forward Loki (background, with verbose output)
echo "--- Starting Loki port-forward ---"
kubectl port-forward -n observability svc/loki 3100:3100 -v=6 &
LOKI_PID=$!
echo $LOKI_PID > /tmp/loki-pf.pid
echo "Loki port-forward PID: $LOKI_PID"
# Wait for port forwards to be ready
echo "--- Waiting for port forwards to establish ---"
sleep 10
# Check if processes are still running
echo "--- Checking port-forward processes ---"
if ps -p $TEMPO_PID > /dev/null; then
echo "✓ Tempo port-forward process is running"
else
echo "❌ Tempo port-forward process died"
cat /tmp/tempo-pf.pid
fi
if ps -p $LOKI_PID > /dev/null; then
echo "✓ Loki port-forward process is running"
else
echo "❌ Loki port-forward process died"
cat /tmp/loki-pf.pid
fi
# Check if ports are listening
echo "--- Checking listening ports ---"
netstat -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not listening"
ss -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not found by ss"
# Try to connect to the ports
echo "--- Testing connectivity ---"
echo "Testing Tempo (localhost:3200)..."
if curl -v --max-time 5 http://localhost:3200/ready 2>&1; then
echo "✓ Tempo /ready endpoint responded"
else
echo "❌ Tempo /ready endpoint failed"
fi
echo "Testing Tempo search API..."
if curl -v --max-time 5 "http://localhost:3200/api/search?tags=service.name=test&limit=1" 2>&1; then
echo "✓ Tempo /api/search endpoint responded"
else
echo "❌ Tempo /api/search endpoint failed"
fi
echo "Testing Loki (localhost:3100)..."
if curl -v --max-time 5 http://localhost:3100/ready 2>&1; then
echo "✓ Loki /ready endpoint responded"
else
echo "❌ Loki /ready endpoint failed"
fi
echo "--- Port forward setup complete ---"
echo "TEMPO_PID=$TEMPO_PID"
echo "LOKI_PID=$LOKI_PID"
echo "::endgroup::"
- name: Verify Tracing Configuration
run: |
echo "::group::Verify tracing configuration"
bash ./scripts/ci/verify-tracing-config.sh
echo "::endgroup::"
- name: Verify Controller Metrics Endpoint
run: |
echo "::group::Verify controller metrics"
set -euo pipefail
echo "=== Verifying controller metrics endpoint ==="
METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep metrics-service | head -n 1 || true)
if [ -z "$METRICS_SVC" ]; then
echo "❌ Could not find controller metrics Service"
kubectl get svc -n kruise-game-system
exit 1
fi
echo "Using metrics service: $METRICS_SVC"
echo "Waiting for metrics endpoints to be ready..."
for i in {1..12}; do
ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
if [ -n "$ENDPOINT_READY" ]; then
echo "Endpoints ready (IP=$ENDPOINT_READY)"
break
fi
echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
sleep 5
done
ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
if [ -z "$ENDPOINT_READY" ]; then
echo "❌ Metrics service has no ready endpoints"
kubectl describe svc "$METRICS_SVC" -n kruise-game-system
kubectl get pods -n kruise-game-system -l control-plane=controller-manager
exit 1
fi
echo "Attempting to query metrics via API server service proxy..."
set +e
PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err | head -n 200)
PROXY_STATUS=$?
set -e
if [ $PROXY_STATUS -ne 0 ]; then
echo "❌ Service proxy request failed:"
cat /tmp/proxy_err
echo "--- Service describe ---"
kubectl describe svc "$METRICS_SVC" -n kruise-game-system || true
echo "--- Endpoints ---"
kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml || true
echo "--- Controller pods ---"
kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide || true
echo "Attempting to read metrics directly from controller pod..."
CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
set +e
DIRECT_FULL_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err)
DIRECT_STATUS=$?
set -e
if [ $DIRECT_STATUS -ne 0 ]; then
echo "❌ Direct pod metrics request failed:"
cat /tmp/direct_err
exit 1
fi
DIRECT_OUTPUT=$(echo "$DIRECT_FULL_OUTPUT" | head -n 200)
echo "--- Sample metrics output (first 20 lines) ---"
echo "$DIRECT_OUTPUT" | head -n 20
if ! echo "$DIRECT_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
echo "❌ Expected controller-runtime metrics not found even via direct pod exec"
exit 1
fi
echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
else
echo "--- Sample metrics output (first 20 lines) ---"
echo "$PROXY_OUTPUT" | head -n 20
if ! echo "$PROXY_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
echo "❌ Expected controller-runtime metrics not found in /metrics output"
exit 1
fi
echo "✅ Controller metrics endpoint reachable via service proxy"
fi
echo "::endgroup::"
- name: Run E2E Tests before failover
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
E2E_ARTIFACT_SUFFIX: before-ha
E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
TEMPO_URL: http://localhost:3200
LOKI_URL: http://localhost:3100
E2E_OBSERVABILITY_DEBUG: "true"
run: |
echo "::group::Run E2E tests (before failover)"
bash ./scripts/ci/run-e2e-tests.sh
echo "::endgroup::"
- name: Test HA Failover
run: |
echo "::group::Test HA failover"
set -e
NAMESPACE=kruise-game-system
LEASE_NAME=game-kruise-manager
echo "--- Identifying initial leader ---"
LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
if [ -z "$LEADER_POD" ]; then
echo "Could not determine leader pod."
exit 1
fi
echo "Current leader is $LEADER_POD"
echo "--- Deleting leader pod to trigger failover ---"
kubectl delete pod $LEADER_POD -n $NAMESPACE
echo "--- Waiting for new leader to be elected ---"
for i in {1..30}; do
NEW_LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
if [ -n "$NEW_LEADER_POD" ] && [ "$NEW_LEADER_POD" != "$LEADER_POD" ]; then
echo "New leader elected: $NEW_LEADER_POD"
break
fi
echo "Waiting for new leader... ($i/30)"
sleep 5
done
if [ "$NEW_LEADER_POD" == "$LEADER_POD" ] || [ -z "$NEW_LEADER_POD" ]; then
echo "Failover failed. A new leader was not elected in time."
kubectl get lease $LEASE_NAME -n $NAMESPACE -o yaml
exit 1
fi
echo "--- Verifying all controller pods are ready after failover ---"
# Wait for the controller manager to be ready at least 1 replica
for i in {1..30}; do
set +e
PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
set -e
if [ "$PODS" -eq "1" ]; then
break
fi
echo "Waiting for controller ready... ($i/10)"
sleep 3
done
echo "HA Failover successful."
echo "::endgroup::"
- name: Restore Port Forwards after HA Failover
run: |
echo "::group::Restore observability port-forwards"
echo "=== Re-establishing port forwards after HA failover ==="
# Port forward Tempo (background)
echo "--- Starting Tempo port-forward ---"
kubectl port-forward -n observability svc/tempo 3200:3200 &
TEMPO_PID=$!
echo $TEMPO_PID > /tmp/tempo-pf.pid
echo "Tempo port-forward PID: $TEMPO_PID"
# Port forward Loki (background)
echo "--- Starting Loki port-forward ---"
kubectl port-forward -n observability svc/loki 3100:3100 &
LOKI_PID=$!
echo $LOKI_PID > /tmp/loki-pf.pid
echo "Loki port-forward PID: $LOKI_PID"
# Wait for port forwards to be ready
echo "--- Waiting for port forwards to establish ---"
sleep 10
# Verify connectivity
echo "--- Testing Tempo connectivity ---"
curl -s --max-time 5 http://localhost:3200/ready || echo "Warning: Tempo not immediately reachable"
echo "--- Port forwards restored ---"
echo "::endgroup::"
- name: Run E2E Tests after failover
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
E2E_ARTIFACT_SUFFIX: after-ha
E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
E2E_GINKGO_FLAGS: "-v"
E2E_MAX_RESTARTS: "1"
TEMPO_URL: http://localhost:3200
LOKI_URL: http://localhost:3100
E2E_OBSERVABILITY_DEBUG: "false"
run: |
echo "::group::Run E2E tests (after failover)"
bash ./scripts/ci/run-e2e-tests.sh
echo "::endgroup::"
- name: Collect Additional Diagnostics
if: always()
env:
E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
run: |
echo "::group::Collect E2E diagnostics"
bash ./scripts/ci/collect-e2e-artifacts.sh
echo "::endgroup::"
- name: Upload E2E Test Artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-test-artifacts-${{ env.KIND_VERSION }}
path: /tmp/e2e-artifacts
if-no-files-found: warn
retention-days: 7
compression-level: 6