Skip to content

Commit d48eecd

Browse files
authored
feat: default network policy per SandboxTemplate (kubernetes-sigs#287)
* feat: default network policy nit update update to fix e2e tests policy per template nit updated api comments update update update nit updated update update update nit nit nit nit update api update nit * creation of sandboxtemplate controller to handle networkpolicy lifecycle update nit
1 parent 1f3aaaa commit d48eecd

9 files changed

Lines changed: 719 additions & 152 deletions

cmd/agent-sandbox-controller/main.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func main() {
6060
var sandboxConcurrentWorkers int
6161
var sandboxClaimConcurrentWorkers int
6262
var sandboxWarmPoolConcurrentWorkers int
63+
var sandboxTemplateConcurrentWorkers int
6364
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
6465
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
6566
flag.BoolVar(&enableLeaderElection, "leader-elect", true,
@@ -84,6 +85,7 @@ func main() {
8485
flag.IntVar(&sandboxConcurrentWorkers, "sandbox-concurrent-workers", 1, "Max concurrent reconciles for the Sandbox controller")
8586
flag.IntVar(&sandboxClaimConcurrentWorkers, "sandbox-claim-concurrent-workers", 1, "Max concurrent reconciles for the SandboxClaim controller")
8687
flag.IntVar(&sandboxWarmPoolConcurrentWorkers, "sandbox-warm-pool-concurrent-workers", 1, "Max concurrent reconciles for the SandboxWarmPool controller")
88+
flag.IntVar(&sandboxTemplateConcurrentWorkers, "sandbox-template-concurrent-workers", 1, "Max concurrent reconciles for the SandboxTemplate controller")
8789
opts := zap.Options{
8890
Development: true,
8991
}
@@ -98,7 +100,7 @@ func main() {
98100
os.Exit(1)
99101
}
100102
// A logical maximum (too much will create unnecessary load on the API server)
101-
totalWorkers := sandboxConcurrentWorkers + sandboxClaimConcurrentWorkers + sandboxWarmPoolConcurrentWorkers
103+
totalWorkers := sandboxConcurrentWorkers + sandboxClaimConcurrentWorkers + sandboxWarmPoolConcurrentWorkers + sandboxTemplateConcurrentWorkers
102104
if totalWorkers > 1000 {
103105
setupLog.Info("Warning: total concurrent workers exceeds 1000, which could lead to resource exhaustion", "total", totalWorkers)
104106
}
@@ -220,6 +222,16 @@ func main() {
220222
os.Exit(1)
221223
}
222224

225+
if err = (&extensionscontrollers.SandboxTemplateReconciler{
226+
Client: mgr.GetClient(),
227+
Scheme: mgr.GetScheme(),
228+
Recorder: mgr.GetEventRecorderFor("sandboxtemplate-controller"),
229+
Tracer: instrumenter,
230+
}).SetupWithManager(mgr, sandboxTemplateConcurrentWorkers); err != nil {
231+
setupLog.Error(err, "unable to create controller", "controller", "SandboxTemplate")
232+
os.Exit(1)
233+
}
234+
223235
if err = (&extensionscontrollers.SandboxWarmPoolReconciler{
224236
Client: mgr.GetClient(),
225237
}).SetupWithManager(mgr, sandboxWarmPoolConcurrentWorkers); err != nil {

extensions/api/v1alpha1/sandboxtemplate_types.go

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,24 @@ import (
2323
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
2424
// Important: Run "make" to regenerate code after modifying this file
2525

26+
// NetworkPolicyManagement defines whether the controller automatically generates
27+
// and manages a shared NetworkPolicy for this template.
28+
type NetworkPolicyManagement string
29+
2630
const (
2731
// SandboxIDLabel is the label key applied to the Pod to identify the owning Claim UID.
2832
// The SandboxClaim controller injects this label into the Pod
2933
// System-injected labels/annotations shouldn't be touched
3034
SandboxIDLabel = "agents.x-k8s.io/claim-uid"
35+
36+
// NetworkPolicyManagementManaged means the controller will ensure a shared NetworkPolicy exists.
37+
// This shared NetworkPolicy will be either a user-provided one or a default controller-created policy.
38+
// This is the default behavior if the field is omitted.
39+
NetworkPolicyManagementManaged NetworkPolicyManagement = "Managed"
40+
41+
// NetworkPolicyManagementUnmanaged means the controller will skip NetworkPolicy
42+
// creation entirely, allowing external systems (like Cilium) to manage networking.
43+
NetworkPolicyManagementUnmanaged NetworkPolicyManagement = "Unmanaged"
3144
)
3245

3346
// NetworkPolicySpec defines the desired state of the NetworkPolicy.
@@ -55,7 +68,19 @@ type SandboxTemplateSpec struct {
5568
PodTemplate sandboxv1alpha1.PodTemplate `json:"podTemplate" protobuf:"bytes,3,opt,name=podTemplate"`
5669

5770
// NetworkPolicy defines the network policy to be applied to the sandboxes
58-
// created from this template.
71+
// created from this template. A single shared NetworkPolicy is created per Template.
72+
// Behavior is dictated by the NetworkPolicyManagement field:
73+
// - If Management is "Unmanaged": This field is completely ignored.
74+
// - If Management is "Managed" (default) and this field is omitted (nil): The controller
75+
// automatically applies a strict Secure Default policy:
76+
// * Ingress: Allow traffic only from the Sandbox Router.
77+
// * Egress: Allow Public Internet only. Blocks internal IPs (RFC1918), Metadata Server, etc.
78+
// - If Management is "Managed" and this field is provided: The controller applies your custom rules.
79+
// Update Behavior:
80+
// Because the NetworkPolicy is shared at the template level, any updates to these rules
81+
// will be applied to the single shared policy object. The underlying Kubernetes CNI will then
82+
// dynamically enforce the updated rules across all existing and future sandboxes
83+
// referencing this template.
5984
// NOTE: This is a restricted subset of the standard Kubernetes NetworkPolicySpec.
6085
// Fields like 'PodSelector' and 'PolicyTypes' are intentionally excluded because
6186
// they are managed by the controller to ensure strict isolation and default-deny posture.
@@ -66,6 +91,13 @@ type SandboxTemplateSpec struct {
6691
// otherwise the sidecars may fail health checks.
6792
// +optional
6893
NetworkPolicy *NetworkPolicySpec `json:"networkPolicy,omitempty"`
94+
95+
// NetworkPolicyManagement defines whether the controller manages the NetworkPolicy.
96+
// Valid values are "Managed" (default) or "Unmanaged".
97+
// +kubebuilder:validation:Enum=Managed;Unmanaged
98+
// +kubebuilder:default=Managed
99+
// +optional
100+
NetworkPolicyManagement NetworkPolicyManagement `json:"networkPolicyManagement,omitempty"`
69101
}
70102

71103
// SandboxTemplateStatus defines the observed state of SandboxTemplate.

extensions/controllers/sandboxclaim_controller.go

Lines changed: 25 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,11 @@ import (
1818
"context"
1919
"errors"
2020
"fmt"
21-
"reflect"
2221
"sort"
2322
"time"
2423

2524
corev1 "k8s.io/api/core/v1"
26-
networkingv1 "k8s.io/api/networking/v1"
25+
"k8s.io/apimachinery/pkg/api/equality"
2726
k8errors "k8s.io/apimachinery/pkg/api/errors"
2827
"k8s.io/apimachinery/pkg/api/meta"
2928
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -45,8 +44,9 @@ import (
4544

4645
// TODO: These constants should be imported from the main controller package Issue #216
4746
const (
48-
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
49-
poolNameNone = "none"
47+
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
48+
poolNameNone = "none"
49+
sandboxTemplateLabel = "agents.x-k8s.io/sandbox-template-ref-hash"
5050
)
5151

5252
// ErrTemplateNotFound is a sentinel error indicating a SandboxTemplate was not found.
@@ -66,7 +66,6 @@ type SandboxClaimReconciler struct {
6666
//+kubebuilder:rbac:groups=agents.x-k8s.io,resources=sandboxes,verbs=get;list;watch;create;update;patch;delete
6767
//+kubebuilder:rbac:groups=extensions.agents.x-k8s.io,resources=sandboxtemplates,verbs=get;list;watch
6868
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch
69-
//+kubebuilder:rbac:groups=networking.k8s.io,resources=networkpolicies,verbs=get;list;watch;create;update;patch;delete
7069
//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch
7170
//+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete
7271

@@ -194,13 +193,6 @@ func (r *SandboxClaimReconciler) reconcileActive(ctx context.Context, claim *ext
194193

195194
// Proceed when the template was found, or when the lookup failed only with NotFound.
196195
if templateErr == nil || k8errors.IsNotFound(templateErr) {
197-
// This ensures the firewall is up before the pod starts.
198-
if template != nil {
199-
if npErr := r.reconcileNetworkPolicy(ctx, claim, template); npErr != nil {
200-
return nil, fmt.Errorf("failed to reconcile network policy: %w", npErr)
201-
}
202-
}
203-
204196
// Try getting sandbox even if template is not found
205197
// It is possible that the template was deleted after the sandbox was created
206198
sandbox, err := r.getOrCreateSandbox(ctx, claim, template)
@@ -249,7 +241,7 @@ func (r *SandboxClaimReconciler) updateStatus(ctx context.Context, oldStatus *ex
249241
return claim.Status.Conditions[i].Type < claim.Status.Conditions[j].Type
250242
})
251243

252-
if reflect.DeepEqual(oldStatus, &claim.Status) {
244+
if equality.Semantic.DeepEqual(oldStatus, &claim.Status) {
253245
return nil
254246
}
255247

@@ -399,7 +391,6 @@ func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim
399391

400392
// Remove the pool labels
401393
delete(pod.Labels, poolLabel)
402-
delete(pod.Labels, sandboxTemplateRefHash)
403394

404395
// Remove existing owner references (from SandboxWarmPool)
405396
pod.OwnerReferences = nil
@@ -414,6 +405,8 @@ func (r *SandboxClaimReconciler) tryAdoptPodFromPool(ctx context.Context, claim
414405
// Label required by NetworkPolicy
415406
// We add the new label with the Claim UID for unique targeting.
416407
pod.Labels[extensionsv1alpha1.SandboxIDLabel] = string(claim.UID)
408+
// Adopted pods must have the template hash label to ensure they are selected by the correct NetworkPolicy.
409+
pod.Labels[sandboxTemplateLabel] = sandboxcontrollers.NameHash(claim.Spec.TemplateRef.Name)
417410

418411
// Update the pod
419412
if err := r.Update(ctx, pod); err != nil {
@@ -441,6 +434,11 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
441434
},
442435
}
443436

437+
// Determine if we are in "Secure By Default" mode
438+
management := template.Spec.NetworkPolicyManagement
439+
isManaged := management == "" || management == extensionsv1alpha1.NetworkPolicyManagementManaged
440+
isSecureByDefault := isManaged && template.Spec.NetworkPolicy == nil
441+
444442
// Propagate the trace context annotation to the Sandbox resource
445443
if sandbox.Annotations == nil {
446444
sandbox.Annotations = make(map[string]string)
@@ -460,10 +458,23 @@ func (r *SandboxClaimReconciler) createSandbox(ctx context.Context, claim *exten
460458
automount := false
461459
sandbox.Spec.PodTemplate.Spec.AutomountServiceAccountToken = &automount
462460
}
461+
// To prevent internal DNS enumeration while still allowing public domain resolution,
462+
// we explicitly override the Pod's DNS config to use external public resolvers.
463+
// We only inject this if using the strict "Secure by Default" policy. If the user
464+
// provides custom rules or is Unmanaged, we leave DNS alone for air-gapped/proxy compatibility.
465+
if isSecureByDefault && sandbox.Spec.PodTemplate.Spec.DNSPolicy == "" {
466+
sandbox.Spec.PodTemplate.Spec.DNSPolicy = corev1.DNSNone
467+
sandbox.Spec.PodTemplate.Spec.DNSConfig = &corev1.PodDNSConfig{
468+
Nameservers: []string{"8.8.8.8", "1.1.1.1"}, // Google & Cloudflare public DNS
469+
}
470+
}
471+
463472
if sandbox.Spec.PodTemplate.ObjectMeta.Labels == nil {
464473
sandbox.Spec.PodTemplate.ObjectMeta.Labels = make(map[string]string)
465474
}
466475
sandbox.Spec.PodTemplate.ObjectMeta.Labels[extensionsv1alpha1.SandboxIDLabel] = string(claim.UID)
476+
// Pods created from scratch (when the Warm Pool is empty or disabled) also need the template hash label so that they are selected by the template's NetworkPolicy.
477+
sandbox.Spec.PodTemplate.ObjectMeta.Labels[sandboxTemplateLabel] = sandboxcontrollers.NameHash(template.Name)
467478

468479
if err := controllerutil.SetControllerReference(claim, sandbox, r.Scheme); err != nil {
469480
err = fmt.Errorf("failed to set controller reference for sandbox: %w", err)
@@ -578,69 +589,6 @@ func (r *SandboxClaimReconciler) SetupWithManager(mgr ctrl.Manager, concurrentWo
578589
Complete(r)
579590
}
580591

581-
// reconcileNetworkPolicy ensures a NetworkPolicy exists for the claimed Sandbox.
582-
func (r *SandboxClaimReconciler) reconcileNetworkPolicy(ctx context.Context, claim *extensionsv1alpha1.SandboxClaim, template *extensionsv1alpha1.SandboxTemplate) error {
583-
logger := log.FromContext(ctx)
584-
585-
// 1. Cleanup Check: If missing, delete existing policy
586-
if template == nil || template.Spec.NetworkPolicy == nil {
587-
existingNP := &networkingv1.NetworkPolicy{
588-
ObjectMeta: metav1.ObjectMeta{
589-
Name: claim.Name + "-network-policy",
590-
Namespace: claim.Namespace,
591-
},
592-
}
593-
if err := r.Delete(ctx, existingNP); err != nil {
594-
if !k8errors.IsNotFound(err) {
595-
logger.Error(err, "Failed to clean up disabled NetworkPolicy")
596-
return err
597-
}
598-
} else {
599-
logger.Info("Deleted disabled NetworkPolicy", "name", existingNP.Name)
600-
}
601-
return nil
602-
}
603-
604-
np := &networkingv1.NetworkPolicy{
605-
ObjectMeta: metav1.ObjectMeta{
606-
Name: claim.Name + "-network-policy",
607-
Namespace: claim.Namespace,
608-
},
609-
}
610-
611-
_, err := controllerutil.CreateOrUpdate(ctx, r.Client, np, func() error {
612-
np.Spec.PodSelector = metav1.LabelSelector{
613-
MatchLabels: map[string]string{
614-
extensionsv1alpha1.SandboxIDLabel: string(claim.UID),
615-
},
616-
}
617-
np.Spec.PolicyTypes = []networkingv1.PolicyType{
618-
networkingv1.PolicyTypeIngress,
619-
networkingv1.PolicyTypeEgress,
620-
}
621-
622-
templateNP := template.Spec.NetworkPolicy
623-
624-
if len(templateNP.Ingress) > 0 {
625-
np.Spec.Ingress = templateNP.Ingress
626-
}
627-
628-
if len(templateNP.Egress) > 0 {
629-
np.Spec.Egress = templateNP.Egress
630-
}
631-
632-
return controllerutil.SetControllerReference(claim, np, r.Scheme)
633-
})
634-
635-
if err != nil {
636-
logger.Error(err, "Failed to create or update NetworkPolicy for claim")
637-
return err
638-
}
639-
640-
logger.Info("Successfully reconciled NetworkPolicy for claim", "NetworkPolicy.Name", np.Name)
641-
return nil
642-
}
643-
644592
// recordCreationLatencyMetric detects and records transitions to Ready state.
645593
func (r *SandboxClaimReconciler) recordCreationLatencyMetric(
646594
claim *extensionsv1alpha1.SandboxClaim,

0 commit comments

Comments
 (0)