feat: add AlibabaCloud-AutoNLBs-V3 network plugin #371
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# E2E workflow: brings up a Kubernetes 1.30 kind cluster, deploys the
# observability stack (OTel Collector, Tempo, Loki, Prometheus), installs
# Kruise and Kruise Game in HA mode, runs the E2E suite before and after a
# forced HA leader failover, and uploads diagnostics as artifacts.
name: E2E-1.30

on:
  push:
    branches:
      - master
      - release-*
  pull_request: {}
  workflow_dispatch: {}

env:
  # Common versions
  GO_VERSION: "1.23.4"
  KIND_VERSION: "v0.22.0"
  KIND_IMAGE: "kindest/node:v1.30.8"
  KIND_CLUSTER_NAME: "ci-testing"
  CERT_MANAGER_VERSION: "v1.18.2"

jobs:
  game-kruise:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
          fetch-tags: true
      - name: Ensure tags are available
        run: git fetch --force --tags
      - name: Setup Go
        uses: actions/setup-go@v3
        with:
          go-version: ${{ env.GO_VERSION }}
      # Presumably exports E2E_IMAGE (used by the deploy step below) via
      # GITHUB_ENV — TODO confirm against the script.
      - name: Determine build metadata
        run: |
          echo "::group::Determine build metadata"
          bash ./scripts/ci/determine-build-metadata.sh
          echo "::endgroup::"
      # Prepare audit policy before cluster creation so extraMounts can find it
      - name: Prepare audit policy
        run: |
          echo "::group::Prepare audit policy"
          bash ./scripts/ci/prepare-kind-audit.sh
          echo "::endgroup::"
      - name: Setup Kind Cluster
        uses: helm/kind-action@v1.12.0
        with:
          node_image: ${{ env.KIND_IMAGE }}
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          config: ./test/kind-conf.yaml
          version: ${{ env.KIND_VERSION }}
      - name: Ensure audit log file exists and is world-readable
        run: |
          echo "::group::Ensure audit log file"
          bash ./scripts/ci/ensure-audit-log.sh
          echo "::endgroup::"
      - name: Build image
        run: |
          echo "::group::Build manager image"
          bash ./scripts/ci/build-manager-image.sh
          echo "::endgroup::"
      - name: Install Cert-Manager
        run: |
          echo "::group::Install Cert-Manager"
          bash ./scripts/ci/install-cert-manager.sh
          echo "::endgroup::"
      - name: Deploy Observability Infrastructure
        run: |
          echo "::group::Deploy observability stack"
          set -ex
          echo "=== Deploying observability stack for tracing E2E tests ==="
          cd test/e2e
          # Deploy the stack (script will not exit on pod failures)
          ./setup-k8s-observability.sh deploy
          echo ""
          echo "=== Checking deployment status ==="
          kubectl get pods -n observability -o wide
          # Check if OTel Collector is running properly
          OTEL_READY=$(kubectl get pods -n observability -l app=otel-collector -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
          if [ "$OTEL_READY" != "true" ]; then
            echo ""
            echo "❌ ERROR: OTel Collector is not ready!"
            echo ""
            echo "=== Running comprehensive diagnostics ==="
            ./debug-otel-collector.sh observability || true
            echo ""
            echo "=== Extracting error keywords from logs ==="
            kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 | grep -E -i "error|fatal|panic|fail|invalid" | head -50 || echo "No obvious errors found"
            echo ""
            echo "=== Checking previous logs if pod restarted ==="
            kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 || echo "No previous logs available"
            exit 1
          fi
          # Check other components (warnings only, don't fail)
          for component in tempo loki prometheus; do
            READY=$(kubectl get pods -n observability -l app=$component -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false")
            if [ "$READY" != "true" ]; then
              echo "⚠️ WARNING: $component is not ready, but continuing..."
            else
              echo "✅ $component is ready"
            fi
          done
          echo ""
          echo "=== Final observability stack status ==="
          kubectl get pods -n observability
          echo "✅ Observability stack deployment completed"
          echo "::endgroup::"
      - name: Install Kruise
        run: |
          echo "::group::Install Kruise"
          bash ./scripts/ci/install-kruise.sh
          echo "::endgroup::"
      - name: Install Kruise Game in HA mode
        run: |
          echo "::group::Install Kruise Game"
          set -ex
          IMG=${E2E_IMAGE} \
          ENABLE_HA=true \
          ENABLE_TRACING=true \
          OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
          OTEL_SAMPLING_RATE=1.0 \
          ./scripts/deploy_kind.sh
          # Wait for the controller manager to be ready at least 1 replica.
          # NOTE: -ge (not -eq) — in HA mode two replicas come up, so the
          # ready count can jump past 1 and an equality test would never match.
          for i in {1..30}; do
            set +e
            PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
            set -e
            if [ "$PODS" -ge "1" ]; then
              break
            fi
            echo "Waiting for controller ready... ($i/30)"
            sleep 3
          done
          # Verify that at least 2 controller replicas are running
          PODS=$(kubectl get pod -n kruise-game-system --selector=control-plane=controller-manager -o jsonpath='{.items..metadata.name}' | wc -w)
          if [ "$PODS" -lt "2" ]; then
            echo "HA mode requires at least 2 controller replicas, but found $PODS"
            kubectl get pod -n kruise-game-system
            exit 1
          fi
          echo "Kruise Game installed in HA mode successfully"
          echo "::endgroup::"
      - name: Verify Kind Cluster
        run: |
          echo "::group::Verify Kind cluster"
          bash ./scripts/ci/verify-kind-cluster.sh
          echo "::endgroup::"
      - name: Setup Port Forwards for Observability
        run: |
          echo "::group::Setup observability port-forwards"
          set -x # Enable command echoing for debugging
          echo "=== Setting up port forwards for Tempo and Loki ==="
          # First, verify the services exist and have endpoints
          echo "--- Checking Tempo service ---"
          kubectl get svc -n observability tempo -o yaml || echo "❌ Tempo service not found"
          kubectl get endpoints -n observability tempo || echo "❌ Tempo endpoints not found"
          echo "--- Checking Loki service ---"
          kubectl get svc -n observability loki -o yaml || echo "❌ Loki service not found"
          kubectl get endpoints -n observability loki || echo "❌ Loki endpoints not found"
          echo "--- Checking Tempo pod status ---"
          kubectl get pods -n observability -l app.kubernetes.io/name=tempo || echo "❌ No Tempo pods"
          echo "--- Checking Loki pod status ---"
          kubectl get pods -n observability -l app.kubernetes.io/name=loki || echo "❌ No Loki pods"
          # Port forward Tempo (background, with verbose output)
          echo "--- Starting Tempo port-forward ---"
          kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 &
          TEMPO_PID=$!
          echo $TEMPO_PID > /tmp/tempo-pf.pid
          echo "Tempo port-forward PID: $TEMPO_PID"
          # Port forward Loki (background, with verbose output)
          echo "--- Starting Loki port-forward ---"
          kubectl port-forward -n observability svc/loki 3100:3100 -v=6 &
          LOKI_PID=$!
          echo $LOKI_PID > /tmp/loki-pf.pid
          echo "Loki port-forward PID: $LOKI_PID"
          # Wait for port forwards to be ready
          echo "--- Waiting for port forwards to establish ---"
          sleep 10
          # Check if processes are still running
          echo "--- Checking port-forward processes ---"
          if ps -p $TEMPO_PID > /dev/null; then
            echo "✓ Tempo port-forward process is running"
          else
            echo "❌ Tempo port-forward process died"
            cat /tmp/tempo-pf.pid
          fi
          if ps -p $LOKI_PID > /dev/null; then
            echo "✓ Loki port-forward process is running"
          else
            echo "❌ Loki port-forward process died"
            cat /tmp/loki-pf.pid
          fi
          # Check if ports are listening
          echo "--- Checking listening ports ---"
          netstat -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not listening"
          ss -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not found by ss"
          # Try to connect to the ports
          echo "--- Testing connectivity ---"
          echo "Testing Tempo (localhost:3200)..."
          if curl -v --max-time 5 http://localhost:3200/ready 2>&1; then
            echo "✓ Tempo /ready endpoint responded"
          else
            echo "❌ Tempo /ready endpoint failed"
          fi
          echo "Testing Tempo search API..."
          if curl -v --max-time 5 "http://localhost:3200/api/search?tags=service.name=test&limit=1" 2>&1; then
            echo "✓ Tempo /api/search endpoint responded"
          else
            echo "❌ Tempo /api/search endpoint failed"
          fi
          echo "Testing Loki (localhost:3100)..."
          if curl -v --max-time 5 http://localhost:3100/ready 2>&1; then
            echo "✓ Loki /ready endpoint responded"
          else
            echo "❌ Loki /ready endpoint failed"
          fi
          echo "--- Port forward setup complete ---"
          echo "TEMPO_PID=$TEMPO_PID"
          echo "LOKI_PID=$LOKI_PID"
          echo "::endgroup::"
      - name: Verify Tracing Configuration
        run: |
          echo "::group::Verify tracing configuration"
          bash ./scripts/ci/verify-tracing-config.sh
          echo "::endgroup::"
      - name: Verify Controller Metrics Endpoint
        run: |
          echo "::group::Verify controller metrics"
          set -euo pipefail
          echo "=== Verifying controller metrics endpoint ==="
          METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep metrics-service | head -n 1 || true)
          if [ -z "$METRICS_SVC" ]; then
            echo "❌ Could not find controller metrics Service"
            kubectl get svc -n kruise-game-system
            exit 1
          fi
          echo "Using metrics service: $METRICS_SVC"
          echo "Waiting for metrics endpoints to be ready..."
          for i in {1..12}; do
            ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
            if [ -n "$ENDPOINT_READY" ]; then
              echo "Endpoints ready (IP=$ENDPOINT_READY)"
              break
            fi
            echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
            sleep 5
          done
          ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
          if [ -z "$ENDPOINT_READY" ]; then
            echo "❌ Metrics service has no ready endpoints"
            kubectl describe svc "$METRICS_SVC" -n kruise-game-system
            kubectl get pods -n kruise-game-system -l control-plane=controller-manager
            exit 1
          fi
          echo "Attempting to query metrics via API server service proxy..."
          set +e
          PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err | head -n 200)
          PROXY_STATUS=$?
          set -e
          if [ $PROXY_STATUS -ne 0 ]; then
            echo "❌ Service proxy request failed:"
            cat /tmp/proxy_err
            echo "--- Service describe ---"
            kubectl describe svc "$METRICS_SVC" -n kruise-game-system || true
            echo "--- Endpoints ---"
            kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml || true
            echo "--- Controller pods ---"
            kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide || true
            echo "Attempting to read metrics directly from controller pod..."
            CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
            set +e
            DIRECT_FULL_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err)
            DIRECT_STATUS=$?
            set -e
            if [ $DIRECT_STATUS -ne 0 ]; then
              echo "❌ Direct pod metrics request failed:"
              cat /tmp/direct_err
              exit 1
            fi
            DIRECT_OUTPUT=$(echo "$DIRECT_FULL_OUTPUT" | head -n 200)
            echo "--- Sample metrics output (first 20 lines) ---"
            echo "$DIRECT_OUTPUT" | head -n 20
            if ! echo "$DIRECT_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
              echo "❌ Expected controller-runtime metrics not found even via direct pod exec"
              exit 1
            fi
            echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
          else
            echo "--- Sample metrics output (first 20 lines) ---"
            echo "$PROXY_OUTPUT" | head -n 20
            if ! echo "$PROXY_OUTPUT" | grep -q "controller_runtime_webhook_requests_total"; then
              echo "❌ Expected controller-runtime metrics not found in /metrics output"
              exit 1
            fi
            echo "✅ Controller metrics endpoint reachable via service proxy"
          fi
          echo "::endgroup::"
      - name: Run E2E Tests before failover
        env:
          E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
          E2E_ARTIFACT_SUFFIX: before-ha
          E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
          TEMPO_URL: http://localhost:3200
          LOKI_URL: http://localhost:3100
          E2E_OBSERVABILITY_DEBUG: "true"
        run: |
          echo "::group::Run E2E tests (before failover)"
          bash ./scripts/ci/run-e2e-tests.sh
          echo "::endgroup::"
      - name: Test HA Failover
        run: |
          echo "::group::Test HA failover"
          set -e
          NAMESPACE=kruise-game-system
          LEASE_NAME=game-kruise-manager
          echo "--- Identifying initial leader ---"
          LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
          if [ -z "$LEADER_POD" ]; then
            echo "Could not determine leader pod."
            exit 1
          fi
          echo "Current leader is $LEADER_POD"
          echo "--- Deleting leader pod to trigger failover ---"
          kubectl delete pod $LEADER_POD -n $NAMESPACE
          echo "--- Waiting for new leader to be elected ---"
          for i in {1..30}; do
            NEW_LEADER_POD=$(kubectl get lease $LEASE_NAME -n $NAMESPACE -o jsonpath='{.spec.holderIdentity}' | awk -F'_' '{print $1}')
            if [ -n "$NEW_LEADER_POD" ] && [ "$NEW_LEADER_POD" != "$LEADER_POD" ]; then
              echo "New leader elected: $NEW_LEADER_POD"
              break
            fi
            echo "Waiting for new leader... ($i/30)"
            sleep 5
          done
          if [ "$NEW_LEADER_POD" == "$LEADER_POD" ] || [ -z "$NEW_LEADER_POD" ]; then
            echo "Failover failed. A new leader was not elected in time."
            kubectl get lease $LEASE_NAME -n $NAMESPACE -o yaml
            exit 1
          fi
          echo "--- Verifying all controller pods are ready after failover ---"
          # Wait for the controller manager to be ready at least 1 replica.
          # NOTE: -ge (not -eq) — with two HA replicas the ready count can be 2,
          # which would never satisfy an equality check against 1.
          for i in {1..30}; do
            set +e
            PODS=$(kubectl get pod -n kruise-game-system | grep '1/1' | wc -l)
            set -e
            if [ "$PODS" -ge "1" ]; then
              break
            fi
            echo "Waiting for controller ready... ($i/30)"
            sleep 3
          done
          echo "HA Failover successful."
          echo "::endgroup::"
      # The earlier port-forwards die when the job's step shell exits; restore
      # them so the post-failover E2E run can reach Tempo and Loki.
      - name: Restore Port Forwards after HA Failover
        run: |
          echo "::group::Restore observability port-forwards"
          echo "=== Re-establishing port forwards after HA failover ==="
          # Port forward Tempo (background)
          echo "--- Starting Tempo port-forward ---"
          kubectl port-forward -n observability svc/tempo 3200:3200 &
          TEMPO_PID=$!
          echo $TEMPO_PID > /tmp/tempo-pf.pid
          echo "Tempo port-forward PID: $TEMPO_PID"
          # Port forward Loki (background)
          echo "--- Starting Loki port-forward ---"
          kubectl port-forward -n observability svc/loki 3100:3100 &
          LOKI_PID=$!
          echo $LOKI_PID > /tmp/loki-pf.pid
          echo "Loki port-forward PID: $LOKI_PID"
          # Wait for port forwards to be ready
          echo "--- Waiting for port forwards to establish ---"
          sleep 10
          # Verify connectivity
          echo "--- Testing Tempo connectivity ---"
          curl -s --max-time 5 http://localhost:3200/ready || echo "Warning: Tempo not immediately reachable"
          echo "--- Port forwards restored ---"
          echo "::endgroup::"
      - name: Run E2E Tests after failover
        env:
          E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
          E2E_ARTIFACT_SUFFIX: after-ha
          E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
          E2E_GINKGO_FLAGS: "-v"
          E2E_MAX_RESTARTS: "1"
          TEMPO_URL: http://localhost:3200
          LOKI_URL: http://localhost:3100
          E2E_OBSERVABILITY_DEBUG: "false"
        run: |
          echo "::group::Run E2E tests (after failover)"
          bash ./scripts/ci/run-e2e-tests.sh
          echo "::endgroup::"
      - name: Collect Additional Diagnostics
        if: always()
        env:
          E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
        run: |
          echo "::group::Collect E2E diagnostics"
          bash ./scripts/ci/collect-e2e-artifacts.sh
          echo "::endgroup::"
      - name: Upload E2E Test Artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: e2e-test-artifacts-${{ env.KIND_VERSION }}
          path: /tmp/e2e-artifacts
          if-no-files-found: warn
          retention-days: 7
          compression-level: 6