# Skip to content
#
# Build and Test Docker Image #403
#
# Build and Test Docker Image
#
# Build and Test Docker Image #403

name: Build and Test Docker Image

# Runs only when dispatched manually.
on:
  workflow_dispatch:

# At most one run per ref: a newer run in the same group cancels the one
# still in progress.
concurrency:
  group: build-docker-image-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  packages: write

# Workflow-wide environment, visible to every step of every job below.
env:
  GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  VALIDATOR_LABELS: gcp-docker-validator
  # Quoted so the version stays a string.
  RUNNER_VERSION: "2.332.0"
  # Images are pushed as <IMAGE_NAME>:<IMAGE_TAG>-<variant>.
  IMAGE_NAME: ghcr.io/inclusionai/areal-runtime
  IMAGE_TAG: test

jobs:
start-builder:
  # Brings the self-hosted builder VM up (if needed) and blocks until its
  # GitHub runner reports online, so the build job has somewhere to run.
  name: Start areal-docker-builder instance
  runs-on: ubuntu-latest
  outputs:
    # 'true' when this run did not have to start the VM itself; the
    # stop-builder job reads this to decide whether to stop the instance.
    was_running: ${{ steps.start-instance.outputs.was_running }}
  env:
    INSTANCE_NAME: areal-docker-builder
    INSTANCE_ZONE: us-central1-f
  steps:
    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v3
      with:
        credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

    - name: Set up Google Cloud SDK
      uses: google-github-actions/setup-gcloud@v3

    - name: Start builder instance if stopped
      id: start-instance
      run: |
        set -euo pipefail

        # Any describe failure (missing instance, auth, network) collapses
        # to NOT_FOUND; gcloud's own stderr stays in the log for diagnosis.
        status=$(gcloud compute instances describe "$INSTANCE_NAME" \
          --project "$GCP_PROJECT_ID" \
          --zone "$INSTANCE_ZONE" \
          --format="get(status)" || echo "NOT_FOUND")

        case "$status" in
          NOT_FOUND)
            echo "Error: Instance $INSTANCE_NAME not found in zone $INSTANCE_ZONE" >&2
            exit 1
            ;;
          RUNNING)
            echo "Instance $INSTANCE_NAME is already running."
            echo "was_running=true" >> "$GITHUB_OUTPUT"
            ;;
          PROVISIONING|STAGING)
            # Something else is already booting the VM. This run did not
            # start it, so report it as not ours to stop; the next step
            # waits for the runner to come online either way.
            echo "Instance $INSTANCE_NAME is already starting ($status)."
            echo "was_running=true" >> "$GITHUB_OUTPUT"
            ;;
          TERMINATED)
            echo "Instance $INSTANCE_NAME is $status. Starting it..."
            gcloud compute instances start "$INSTANCE_NAME" \
              --project "$GCP_PROJECT_ID" \
              --zone "$INSTANCE_ZONE"
            echo "Instance started successfully."
            echo "was_running=false" >> "$GITHUB_OUTPUT"
            ;;
          SUSPENDED)
            # Suspended VMs are brought back with `resume`, not `start`.
            echo "Instance $INSTANCE_NAME is $status. Resuming it..."
            gcloud compute instances resume "$INSTANCE_NAME" \
              --project "$GCP_PROJECT_ID" \
              --zone "$INSTANCE_ZONE"
            echo "Instance resumed successfully."
            echo "was_running=false" >> "$GITHUB_OUTPUT"
            ;;
          *)
            echo "Instance $INSTANCE_NAME has unexpected status: $status" >&2
            exit 1
            ;;
        esac

    - name: Wait for builder runner to be online
      uses: actions/github-script@v8
      env:
        # Only used by the script's "is the secret configured" guard;
        # INSTANCE_NAME comes from the job-level env.
        GH_PAT: ${{ secrets.GH_PAT }}
      with:
        # The client is authenticated with the PAT here, so the script does
        # not need to set an authorization header on each request.
        github-token: ${{ secrets.GH_PAT }}
        script: |
          const instanceName = process.env.INSTANCE_NAME;
          const maxAttempts = 120;   // 120 attempts x 10 s = 20 minutes
          const delayMs = 10000;

          if (!process.env.GH_PAT) {
            core.setFailed('GH_PAT secret is not configured.');
            return;
          }

          const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

          for (let attempt = 1; attempt <= maxAttempts; attempt++) {
            try {
              const response = await github.rest.actions.listSelfHostedRunnersForRepo({
                owner: context.repo.owner,
                repo: context.repo.repo,
                per_page: 100,
              });
              // The runner is matched by name against the instance name.
              const found = response.data.runners.find((runner) => runner.name === instanceName);
              if (found && found.status === 'online') {
                core.info(`Builder runner ${instanceName} is online.`);
                return;
              }
              core.info(`Builder runner ${instanceName} not ready yet (attempt ${attempt}/${maxAttempts}).`);
            } catch (error) {
              // Credential/permission problems will not fix themselves:
              // fail immediately instead of polling for 20 minutes.
              if ([401, 403, 404].includes(error.status)) {
                throw error;
              }
              // Anything else (5xx, network blip) is treated as transient.
              core.warning(`Runner lookup failed (attempt ${attempt}/${maxAttempts}): ${error.message}`);
            }
            await wait(delayMs);
          }

          throw new Error(`Timed out waiting for builder runner ${instanceName} to come online.`);
build-and-push-images:
  # Builds the sglang and vllm variants from the same Dockerfile on the
  # self-hosted builder and pushes them as <IMAGE_NAME>:<IMAGE_TAG>-<variant>.
  needs:
    - start-builder
  name: Build and push Docker images
  runs-on: [self-hosted, areal-docker-builder]
  timeout-minutes: 240  # ~90 min/image x 2 + buffer for cache misses
  steps:
    - uses: actions/checkout@v6

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v4
      with:
        driver: docker

    - name: Log in to GitHub Container Registry
      uses: docker/login-action@v4
      with:
        registry: ghcr.io
        username: inclusionai
        password: ${{ secrets.GHCR_TOKEN }}

    - name: Build and push sglang image
      uses: docker/build-push-action@v7
      with:
        context: .
        file: ./Dockerfile
        push: true
        tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-sglang
        build-args: |
          VARIANT=sglang

    - name: Build and push vllm image
      uses: docker/build-push-action@v7
      with:
        context: .
        file: ./Dockerfile
        push: true
        tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-vllm
        build-args: |
          VARIANT=vllm

    - name: Image details
      # Expression values are handed to the script through environment
      # variables instead of being pasted into the script text, so a ref
      # name containing shell metacharacters is printed, never executed.
      # IMAGE_NAME and IMAGE_TAG are already exported by the workflow env.
      env:
        COMMIT_SHA: ${{ github.sha }}
        BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
      run: |
        echo "✅ Docker images built and pushed successfully!"
        echo "Images:"
        echo " - ${IMAGE_NAME}:${IMAGE_TAG}-sglang"
        echo " - ${IMAGE_NAME}:${IMAGE_TAG}-vllm"
        echo "Commit: ${COMMIT_SHA}"
        echo "Branch: ${BRANCH_NAME}"
run-tests-sglang:
  # Runs the reusable test workflow against the freshly pushed sglang image.
  name: Run tests with sglang image
  needs: build-and-push-images
  uses: ./.github/workflows/test-areal.yml
  secrets: inherit
  with:
    # Keep in sync with the workflow-level IMAGE_TAG used by the build job.
    image_tag: test
    variant: sglang
run-tests-vllm:
  # Runs the reusable test workflow against the freshly pushed vllm image.
  name: Run tests with vllm image
  needs: build-and-push-images
  uses: ./.github/workflows/test-areal.yml
  secrets: inherit
  with:
    # Keep in sync with the workflow-level IMAGE_TAG used by the build job.
    image_tag: test
    variant: vllm
promote-images:
  # After both test jobs pass, re-tags each variant's test image as
  # dev-<variant> and pushes it. One matrix leg per variant.
  name: Promote ${{ matrix.variant }} test image to dev
  needs:
    - run-tests-sglang
    - run-tests-vllm
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      variant: [sglang, vllm]
  steps:
    - name: Log in to GitHub Container Registry
      uses: docker/login-action@v4
      with:
        registry: ghcr.io
        username: inclusionai
        password: ${{ secrets.GHCR_TOKEN }}

    - name: Pull test image and push as dev
      env:
        VARIANT: ${{ matrix.variant }}
      run: |
        set -euo pipefail

        # IMAGE_NAME and IMAGE_TAG come from the workflow-level env, the
        # same values the build job tagged with, so the source reference
        # cannot drift from what was actually pushed.
        src="${IMAGE_NAME}:${IMAGE_TAG}-${VARIANT}"
        dst="${IMAGE_NAME}:dev-${VARIANT}"

        docker pull "$src"
        docker tag "$src" "$dst"
        docker push "$dst"
        echo "✅ ${VARIANT} image promoted from :${IMAGE_TAG}-${VARIANT} to :dev-${VARIANT}"
bake-gcp-image:
  # Calls the reusable bake workflow once the images carry the dev tag.
  name: Bake GCP CI image with promoted Docker images
  needs: promote-images
  uses: ./.github/workflows/bake-gcp-image.yml
  secrets: inherit
  with:
    image_tag: dev
cleanup-test-images:
  # Removes the temporary test-<variant> image version from GHCR. Runs
  # whenever the build succeeded, whatever the outcome of tests/promotion.
  name: Delete ${{ matrix.variant }} test image from registry
  needs:
    - build-and-push-images
    - run-tests-sglang
    - run-tests-vllm
    - promote-images
  if: always() && needs.build-and-push-images.result == 'success'
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      variant: [sglang, vllm]
  steps:
    - name: Delete test image from GHCR
      env:
        GH_TOKEN: ${{ secrets.GHCR_TOKEN }}
        VARIANT: ${{ matrix.variant }}
        TAG: test-${{ matrix.variant }}
      run: |
        set -euo pipefail

        PACKAGE_NAME="areal-runtime"
        API_URL="https://api.github.com/orgs/inclusionai/packages/container/$PACKAGE_NAME/versions"

        # -f turns HTTP errors into a non-zero exit, so an auth or API
        # failure stops the step instead of being read as "no such tag".
        versions=$(curl -fsS \
          -H "Authorization: Bearer $GH_TOKEN" \
          -H "Accept: application/vnd.github+json" \
          "$API_URL?per_page=100")

        # Id of the first listed version carrying the test tag (empty when
        # none does). The tag is passed with --arg rather than spliced
        # into the jq program text.
        PACKAGE_VERSION_ID=$(jq -r --arg tag "$TAG" \
          'first(.[] | select(any(.metadata.container.tags[]?; . == $tag)) | .id) // empty' \
          <<<"$versions")

        if [ -z "$PACKAGE_VERSION_ID" ]; then
          echo "⚠️ ${VARIANT} test image not found or already deleted"
          exit 0
        fi

        # The delete call below removes the whole version, i.e. every tag
        # attached to it. If promotion pushed the same manifest, the dev
        # tag lives on this version too, so skip rather than take the
        # promoted image down with the test tag.
        other_tags=$(jq -r --arg tag "$TAG" --argjson id "$PACKAGE_VERSION_ID" \
          '.[] | select(.id == $id) | .metadata.container.tags - [$tag] | join(", ")' \
          <<<"$versions")

        if [ -n "$other_tags" ]; then
          echo "⚠️ ${VARIANT} test image version is also tagged: $other_tags - skipping deletion"
          exit 0
        fi

        # With -f a rejected DELETE fails the step; the success line is
        # only reached when the registry accepted the request.
        curl -fsS -X DELETE \
          -H "Authorization: Bearer $GH_TOKEN" \
          -H "Accept: application/vnd.github+json" \
          "$API_URL/$PACKAGE_VERSION_ID"
        echo "✅ Deleted ${VARIANT} test image from registry"
stop-builder:
  # Final job: stops the builder VM again unless start-builder reported
  # that it was already running before this workflow touched it.
  name: Stop areal-docker-builder instance
  needs:
    - start-builder
    - build-and-push-images
    - run-tests-sglang
    - run-tests-vllm
    - promote-images
    - bake-gcp-image
    - cleanup-test-images
  # always(): run even when earlier jobs failed or were skipped. An empty
  # was_running (start-builder never set the output) also passes this test.
  if: always() && needs.start-builder.outputs.was_running != 'true'
  runs-on: ubuntu-latest
  env:
    INSTANCE_NAME: areal-docker-builder
    INSTANCE_ZONE: us-central1-f
  steps:
    - name: Authenticate to Google Cloud
      uses: google-github-actions/auth@v3
      with:
        credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

    - name: Set up Google Cloud SDK
      uses: google-github-actions/setup-gcloud@v3

    - name: Stop builder instance
      run: |
        set -euo pipefail

        # Check if instance is running. gcloud's stderr is left in the log
        # so a failed lookup (auth, network) can be told apart from a
        # genuinely missing instance.
        status=$(gcloud compute instances describe "$INSTANCE_NAME" \
          --project "$GCP_PROJECT_ID" \
          --zone "$INSTANCE_ZONE" \
          --format="get(status)" || echo "NOT_FOUND")

        if [ "$status" = "NOT_FOUND" ]; then
          # Stay best-effort (exit 0), but raise a run annotation: if the
          # lookup failed for another reason the VM may still be running.
          echo "::warning::Could not look up instance $INSTANCE_NAME in zone $INSTANCE_ZONE; it was not stopped and may still be running."
          echo "Warning: Instance $INSTANCE_NAME not found in zone $INSTANCE_ZONE"
          exit 0
        fi

        if [ "$status" = "RUNNING" ]; then
          echo "Stopping instance $INSTANCE_NAME..."
          gcloud compute instances stop "$INSTANCE_NAME" \
            --project "$GCP_PROJECT_ID" \
            --zone "$INSTANCE_ZONE"
          echo "Instance stopped successfully."
        else
          echo "Instance $INSTANCE_NAME is already in status: $status"
        fi