NVIDIA · Jont828 · Mar 31, 2026 · Apr 3, 2026 · Apr 8, 2026 · Apr 14, 2026
@@ -94,6 +94,8 @@ func ParseCriteriaAcceleratorType(s string) (CriteriaAcceleratorType, error) {
 		return CriteriaAcceleratorAny, nil
 	case "h100":
 		return CriteriaAcceleratorH100, nil
+	case "h200":
+		return CriteriaAcceleratorH200, nil
 	case "gb200":
 		return CriteriaAcceleratorGB200, nil
 	case "b200":
@@ -216,7 +218,7 @@ type Criteria struct {
 	// Service is the Kubernetes service type (eks, gke, aks, oke, self-managed).
 	Service CriteriaServiceType `json:"service,omitempty" yaml:"service,omitempty"`
 
-	// Accelerator is the GPU/accelerator type (h100, gb200, b200, a100, l40).
+	// Accelerator is the GPU/accelerator type (h100, h200, gb200, b200, a100, l40).
 	Accelerator CriteriaAcceleratorType `json:"accelerator,omitempty" yaml:"accelerator,omitempty"`
 
 	// Intent is the workload intent (training, inference).

@@ -70,6 +70,8 @@ func TestParseCriteriaAcceleratorType(t *testing.T) {
 		{"any", "any", CriteriaAcceleratorAny, false},
 		{"h100", "h100", CriteriaAcceleratorH100, false},
 		{"H100 uppercase", "H100", CriteriaAcceleratorH100, false},
+		{"h200", "h200", CriteriaAcceleratorH200, false},
+		{"H200 uppercase", "H200", CriteriaAcceleratorH200, false},
 		{"gb200", "gb200", CriteriaAcceleratorGB200, false},
 		{"b200", "b200", CriteriaAcceleratorB200, false},
 		{"a100", "a100", CriteriaAcceleratorA100, false},

@@ -451,6 +451,8 @@ func TestMatchAccelerator(t *testing.T) {
 	}{
 		{"H100 uppercase", "NVIDIA H100 80GB HBM3", CriteriaAcceleratorH100},
 		{"H100 lowercase", "h100-sxm", CriteriaAcceleratorH100},
+		{"H200 uppercase", "NVIDIA H200 141GB HBM3e", CriteriaAcceleratorH200},
+		{"H200 lowercase", "h200-sxm", CriteriaAcceleratorH200},
 		{"A100", "A100-SXM4-80GB", CriteriaAcceleratorA100},
 		{"GB200", "NVIDIA GB200", CriteriaAcceleratorGB200},
 		{"B200", "NVIDIA-B200", CriteriaAcceleratorB200},

@@ -0,0 +1,44 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-inference
+
+spec:
+  # Inherits from aks-inference recipe (AKS + inference settings)
+  base: aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: inference
+
+  # Specific constraints for H200 on AKS inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations are omitted because Skyhook packages do not
+  # support service: aks. The skyhook-operator itself is still inherited
+  # from the base recipe and deployed. Kind overlays
+  # (h100-kind-inference.yaml) omit Skyhook tuning in the same way.
+  componentRefs:
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
@@ -0,0 +1,48 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-training
+
+spec:
+  # Inherits from aks-training recipe (AKS + training settings)
+  base: aks-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    intent: training
+
+  # Specific constraints for H200 on AKS training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  # Skyhook customizations are omitted because Skyhook packages do not
+  # support service: aks. The skyhook-operator itself is still inherited
+  # from the base recipe and deployed. Kind overlays
+  # (h100-kind-training.yaml) omit Skyhook tuning in the same way.
+  componentRefs:
+    # H200-specific GPU Operator override: enable GDRCopy (valuesFile is inherited from aks-training)
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+      overrides:
+        gdrcopy:
+          enabled: true
@@ -0,0 +1,92 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference-dynamo
+
+spec:
+  # Inherits from h200-aks-ubuntu-inference (H200 + Ubuntu inference settings)
+  # Adds Dynamo inference platform components.
+  base: h200-aks-ubuntu-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+    platform: dynamo
+
+  # Dynamic Resource Allocation (DRA) is GA as of Kubernetes 1.34, so this overlay requires 1.34+
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.34"
+
+  componentRefs:
+    - name: nvidia-dra-driver-gpu
+      type: Helm
+      overrides:
+        gpuResourcesEnabledOverride: true
+
+    - name: dynamo-crds
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-crds/values.yaml
+
+    - name: dynamo-platform
+      type: Helm
+      source: https://helm.ngc.nvidia.com/nvidia/ai-dynamo
+      version: "0.9.0"
+      valuesFile: components/dynamo-platform/values.yaml
+      dependencyRefs:
+        - dynamo-crds
+        - cert-manager
+        - kube-prometheus-stack
+        - kai-scheduler
+      overrides:
+        etcd:
+          persistence:
+            storageClass: managed-csi
+        nats:
+          config:
+            jetstream:
+              fileStore:
+                pvc:
+                  storageClassName: managed-csi
+
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - inference-gateway
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access
@@ -0,0 +1,43 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-inference
+
+spec:
+  # Inherits from h200-aks-inference recipe (H200 + AKS + inference settings)
+  # This overlay adds Ubuntu-specific configurations
+  base: h200-aks-inference
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: inference
+
+  # Constraints for H200 on AKS with Ubuntu for inference workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  componentRefs: []
@@ -0,0 +1,54 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-training-kubeflow
+
+spec:
+  # Inherits from h200-aks-ubuntu-training recipe (H200 + AKS + Ubuntu + training settings)
+  # This overlay adds Kubeflow Training Operator for distributed training with TrainJob
+  base: h200-aks-ubuntu-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: training
+    platform: kubeflow
+
+  # Constraints for H200 on AKS with Ubuntu for Kubeflow training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  # Kubeflow Training Operator for TrainJob support
+  componentRefs:
+    - name: kubeflow-trainer
+      type: Helm
+      valuesFile: components/kubeflow-trainer/values.yaml
+      manifestFiles:
+        - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+        - gpu-operator
@@ -0,0 +1,72 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-aks-ubuntu-training
+
+spec:
+  # Inherits from h200-aks-training recipe (H200 + AKS + training settings)
+  # This overlay adds Ubuntu-specific configurations
+  base: h200-aks-training
+
+  criteria:
+    service: aks
+    accelerator: h200
+    os: ubuntu
+    intent: training
+
+  # Constraints for H200 on AKS with Ubuntu for training workloads
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+    - name: OS.release.ID
+      value: ubuntu
+    - name: OS.release.VERSION_ID
+      value: "24.04"
+    - name: OS.sysctl./proc/sys/kernel/osrelease
+      value: ">= 6.8"
+
+  componentRefs: []
+
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    performance:
+      checks:
+        - nccl-all-reduce-bw
+      constraints:
+        - name: nccl-all-reduce-bw
+          value: ">= 100"
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access