Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/recipe/criteria.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
return CriteriaAcceleratorAny, nil
case "h100":
return CriteriaAcceleratorH100, nil
case "h200":
return CriteriaAcceleratorH200, nil
case "gb200":
return CriteriaAcceleratorGB200, nil
case "b200":
Expand Down Expand Up @@ -216,7 +218,7 @@ type Criteria struct {
// Service is the Kubernetes service type (eks, gke, aks, oke, self-managed).
Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"`

// Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40).
// Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40).
Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"`

// Intent is the workload intent (training, inference).
Expand Down
2 changes: 2 additions & 0 deletions pkg/recipe/criteria_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) {
{"any", "any", CriteriaAcceleratorAny, false},
{"h100", "h100", CriteriaAcceleratorH100, false},
{"H100 uppercase", "H100", CriteriaAcceleratorH100, false},
{"h200", "h200", CriteriaAcceleratorH200, false},
{"H200 uppercase", "H200", CriteriaAcceleratorH200, false},
{"gb200", "gb200", CriteriaAcceleratorGB200, false},
{"b200", "b200", CriteriaAcceleratorB200, false},
{"a100", "a100", CriteriaAcceleratorA100, false},
Expand Down
2 changes: 2 additions & 0 deletions pkg/recipe/snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,8 @@ func TestMatchAccelerator(t *testing.T) {
}{
{"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100},
{"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100},
{"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200},
{"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200},
{"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100},
{"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200},
{"B200", "NVIDIA-B200", CriteriaAcceleratorB200},
Expand Down
44 changes: 44 additions & 0 deletions recipes/overlays/h200-aks-inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-inference

spec:
# Inherits from aks-inference recipe (AKS + inference settings)
base: aks-inference

criteria:
service: aks
accelerator: h200
intent: inference

# Specific constraints for H200 on AKS inference workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"

# Skyhook customizations are omitted because Skyhook packages do not support
# `service: aks`. The skyhook-operator itself is inherited from the base recipe
# and is still deployed. This follows the same pattern as the Kind overlays
# (h100-kind-inference.yaml), which also omit Skyhook tuning.
componentRefs:
- name: gpu-operator
type: Helm
dependencyRefs:
- cert-manager
- kube-prometheus-stack
48 changes: 48 additions & 0 deletions recipes/overlays/h200-aks-training.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-training

spec:
# Inherits from aks-training recipe (AKS + training settings)
base: aks-training

criteria:
service: aks
accelerator: h200
intent: training

# Specific constraints for H200 on AKS training workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"

# Skyhook customizations are omitted because Skyhook packages do not support
# `service: aks`. The skyhook-operator itself is inherited from the base recipe
# and is still deployed. This follows the same pattern as the Kind overlays
# (h100-kind-training.yaml), which also omit Skyhook tuning.
componentRefs:
# H200-specific GPU Operator overrides (inherits valuesFile from aks-training)
- name: gpu-operator
type: Helm
dependencyRefs:
- cert-manager
- kube-prometheus-stack
overrides:
gdrcopy:
enabled: true
92 changes: 92 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-inference-dynamo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-inference-dynamo

spec:
# Inherits from h200-aks-ubuntu-inference (H200 + Ubuntu inference settings)
# Adds Dynamo inference platform components.
base: h200-aks-ubuntu-inference

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: inference
platform: dynamo

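# Dynamic Resource Allocation (DRA) is GA as of Kubernetes 1.34, so this overlay
# requires 1.34 or newer (stricter than the ">= 1.32.4" declared by the base
# h200-aks-ubuntu-inference overlay).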
constraints:
- name: K8s.server.version
value: ">= 1.34"

componentRefs:
- name: nvidia-dra-driver-gpu
type: Helm
overrides:
gpuResourcesEnabledOverride: true

- name: dynamo-crds
type: Helm
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
version: "0.9.0"
valuesFile: components/dynamo-crds/values.yaml

- name: dynamo-platform
type: Helm
source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
version: "0.9.0"
valuesFile: components/dynamo-platform/values.yaml
dependencyRefs:
- dynamo-crds
- cert-manager
- kube-prometheus-stack
- kai-scheduler
overrides:
etcd:
persistence:
storageClass: managed-csi
nats:
config:
jetstream:
fileStore:
pvc:
storageClassName: managed-csi

validation:
deployment:
checks:
- operator-health
- expected-resources
- gpu-operator-version
- check-nvidia-smi
constraints:
- name: Deployment.gpu-operator.version
value: ">= v24.6.0"
conformance:
checks:
- platform-health
- gpu-operator-health
- dra-support
- accelerator-metrics
- ai-service-metrics
- inference-gateway
- gang-scheduling
- pod-autoscaling
- cluster-autoscaling
- robust-controller
- secure-accelerator-access
43 changes: 43 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-inference

spec:
# Inherits from h200-aks-inference recipe (H200 + AKS + inference settings)
# This overlay adds Ubuntu-specific configurations
base: h200-aks-inference

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: inference

# Constraints for H200 on AKS with Ubuntu for inference workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"
- name: OS.release.ID
value: ubuntu
- name: OS.release.VERSION_ID
value: "24.04"
- name: OS.sysctl./proc/sys/kernel/osrelease
value: ">= 6.8"

componentRefs: []
54 changes: 54 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-training-kubeflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-training-kubeflow

spec:
# Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings)
# This overlay adds Kubeflow Trainer (the kubeflow-trainer component) for distributed training with TrainJob
base: h200-aks-ubuntu-training

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: training
platform: kubeflow

# Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"
- name: OS.release.ID
value: ubuntu
- name: OS.release.VERSION_ID
value: "24.04"
- name: OS.sysctl./proc/sys/kernel/osrelease
value: ">= 6.8"

# Kubeflow Trainer (kubeflow-trainer) provides TrainJob support
componentRefs:
- name: kubeflow-trainer
type: Helm
valuesFile: components/kubeflow-trainer/values.yaml
manifestFiles:
- components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
dependencyRefs:
- cert-manager
- kube-prometheus-stack
- gpu-operator
72 changes: 72 additions & 0 deletions recipes/overlays/h200-aks-ubuntu-training.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

kind: RecipeMetadata
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
name: h200-aks-ubuntu-training

spec:
# Inherits from h200-aks-training recipe (H200 + AKS + training settings)
# This overlay adds Ubuntu-specific configurations
base: h200-aks-training

criteria:
service: aks
accelerator: h200
os: ubuntu
intent: training

# Constraints for H200 on AKS with Ubuntu for training workloads
# Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
constraints:
- name: K8s.server.version
value: ">= 1.32.4"
- name: OS.release.ID
value: ubuntu
- name: OS.release.VERSION_ID
value: "24.04"
- name: OS.sysctl./proc/sys/kernel/osrelease
value: ">= 6.8"

componentRefs: []

validation:
deployment:
checks:
- operator-health
- expected-resources
- gpu-operator-version
- check-nvidia-smi
constraints:
- name: Deployment.gpu-operator.version
value: ">= v24.6.0"
performance:
checks:
- nccl-all-reduce-bw
constraints:
- name: nccl-all-reduce-bw
value: ">= 100"
conformance:
checks:
- platform-health
- gpu-operator-health
- dra-support
- accelerator-metrics
- ai-service-metrics
- gang-scheduling
- pod-autoscaling
- cluster-autoscaling
- robust-controller
- secure-accelerator-access
Loading