From e934f0b570487f7047ec49866e40b82e71d86bf1 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 25 Jun 2026 16:18:23 -0700 Subject: [PATCH 1/7] feat(kubernetes): support agent-sandbox v1beta1 Signed-off-by: Taylor Mutch --- architecture/gateway.md | 11 +- crates/openshell-driver-kubernetes/README.md | 8 +- .../openshell-driver-kubernetes/src/driver.rs | 86 +++++++++-- crates/openshell-server/src/auth/k8s_sa.rs | 142 ++++++++++++++++-- docs/reference/sandbox-compute-drivers.mdx | 2 +- e2e/with-kube-gateway.sh | 7 +- tasks/scripts/helm-k3s-local.sh | 7 +- 7 files changed, 219 insertions(+), 44 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 979422d7e..2e0256249 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -66,10 +66,13 @@ token through `IssueSandboxToken`. The gateway validates that projected token with Kubernetes `TokenReview`, requires the configured sandbox service account, checks the returned pod binding against the live pod UID, and verifies the pod's controlling `Sandbox` ownerReference against the live Sandbox CR UID and -sandbox-id label before minting the gateway JWT. Supervisors renew gateway JWTs -in memory before expiry only while the sandbox record still exists. Older tokens -are not server-revoked; shared deployments bound replay exposure with short -`gateway_jwt.ttl_secs` lifetimes. The config default is +sandbox-id label before minting the gateway JWT. The bootstrap path accepts +both `agents.x-k8s.io/v1beta1` ownerReferences from newer Agent Sandbox +controllers and `agents.x-k8s.io/v1alpha1` ownerReferences from existing +deployments. Supervisors renew gateway JWTs in memory before expiry only while +the sandbox record still exists. Older tokens are not server-revoked; shared +deployments bound replay exposure with short `gateway_jwt.ttl_secs` lifetimes. +The config default is `gateway_jwt.ttl_secs = 0` for local single-player Docker, Podman, and VM gateways; those tokens carry `exp = 0` and do not expire. Kubernetes and other shared deployments should set a positive TTL. diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 3cdb9fa57..6db873ecb 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -15,9 +15,11 @@ credential injection, policy polling, logs, and the gateway relay. ## Sandbox Resource -The driver works with the `agents.x-k8s.io/v1alpha1` `Sandbox` custom resource. -Driver events map Kubernetes object state and platform events into the shared -compute-driver protobuf surface used by the gateway. +The driver works with the `agents.x-k8s.io` `Sandbox` custom resource. It +detects the served Sandbox API at runtime and uses the current API before +falling back to the `v1alpha1` API. Driver events map Kubernetes object state and +platform events into the shared compute-driver protobuf surface used by the +gateway. Kubernetes API calls use explicit timeouts so gRPC handlers do not block indefinitely when the API server is slow or unavailable. diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a32034f3a..2b1301652 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -80,7 +80,9 @@ impl From for openshell_core::ComputeDriverError { const KUBE_API_TIMEOUT: Duration = Duration::from_secs(30); const SANDBOX_GROUP: &str = "agents.x-k8s.io"; -const SANDBOX_VERSION: &str = "v1alpha1"; +const SANDBOX_VERSION_V1BETA1: &str = "v1beta1"; +const SANDBOX_VERSION_V1ALPHA1: &str = "v1alpha1"; +const SANDBOX_VERSIONS: &[&str] = &[SANDBOX_VERSION_V1BETA1, SANDBOX_VERSION_V1ALPHA1]; pub const SANDBOX_KIND: &str = "Sandbox"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; @@ -256,16 +258,53 @@ impl KubernetesComputeDriver { &self.config.ssh_socket_path } - fn watch_api(&self) -> Api { - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); + fn sandbox_api_for_version(&self, client: Client, version: &str) -> Api { + let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); - Api::namespaced_with(self.watch_client.clone(), &self.config.namespace, &resource) + Api::namespaced_with(client, &self.config.namespace, &resource) } - fn api(&self) -> Api { - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); - let resource = ApiResource::from_gvk(&gvk); - Api::namespaced_with(self.client.clone(), &self.config.namespace, &resource) + fn api_for_version(&self, version: &str) -> Api { + self.sandbox_api_for_version(self.client.clone(), version) + } + + fn watch_api_for_version(&self, version: &str) -> Api { + self.sandbox_api_for_version(self.watch_client.clone(), version) + } + + async fn supported_api_version(&self, watch_client: bool) -> Result<&'static str, String> { + let client = if watch_client { + self.watch_client.clone() + } else { + self.client.clone() + }; + for version in SANDBOX_VERSIONS { + let api = self.sandbox_api_for_version(client.clone(), version); + match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default().limit(1))) + .await + { + Ok(Ok(_)) => return Ok(version), + Ok(Err(err)) if should_try_next_sandbox_api_version(&err) => { + debug!( + namespace = %self.config.namespace, + sandbox_api_version = %version, + error = %err, + "Sandbox API version is not available; trying next supported version" + ); + } + Ok(Err(err)) => return Err(err.to_string()), + Err(_elapsed) => { + return Err(format!( + "timed out after {}s waiting for Kubernetes API", + KUBE_API_TIMEOUT.as_secs() + )); + } + } + } + Err(format!( + "no supported Agent Sandbox API version is available; tried {}", + SANDBOX_VERSIONS.join(", ") + )) } async fn has_gpu_capacity(&self) -> Result { @@ -306,7 +345,8 @@ impl KubernetesComputeDriver { "Fetching sandbox from Kubernetes" ); - let api = self.api(); + let version = self.supported_api_version(false).await?; + let api = self.api_for_version(version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { Ok(Ok(obj)) => sandbox_from_object(&self.config.namespace, obj).map(Some), Ok(Err(KubeError::Api(err))) if err.code == 404 => { @@ -341,7 +381,8 @@ impl KubernetesComputeDriver { "Listing sandboxes from Kubernetes" ); - let api = self.api(); + let version = self.supported_api_version(false).await?; + let api = self.api_for_version(version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default())).await { Ok(Ok(list)) => { let mut sandboxes = list @@ -396,7 +437,11 @@ impl KubernetesComputeDriver { "Creating sandbox in Kubernetes" ); - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); + let version = self + .supported_api_version(false) + .await + .map_err(KubernetesDriverError::Message)?; + let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); let mut obj = DynamicObject::new(name, &resource); obj.metadata = ObjectMeta { @@ -430,7 +475,7 @@ impl KubernetesComputeDriver { .provider_spiffe_workload_api_socket_path, }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); - let api = self.api(); + let api = self.api_for_version(version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await { @@ -473,7 +518,8 @@ impl KubernetesComputeDriver { "Deleting sandbox from Kubernetes" ); - let api = self.api(); + let version = self.supported_api_version(false).await?; + let api = self.api_for_version(version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.delete(name, &DeleteParams::default())) .await { @@ -508,7 +554,8 @@ impl KubernetesComputeDriver { } pub async fn sandbox_exists(&self, name: &str) -> Result { - let api = self.api(); + let version = self.supported_api_version(false).await?; + let api = self.api_for_version(version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { Ok(Ok(_)) => Ok(true), Ok(Err(KubeError::Api(err))) if err.code == 404 => Ok(false), @@ -524,7 +571,8 @@ impl KubernetesComputeDriver { #[allow(clippy::unused_async)] pub async fn watch_sandboxes(&self) -> Result { let namespace = self.config.namespace.clone(); - let sandbox_api = self.watch_api(); + let version = self.supported_api_version(true).await?; + let sandbox_api = self.watch_api_for_version(version); let event_api: Api = Api::namespaced(self.watch_client.clone(), &namespace); let mut sandbox_stream = watcher::watcher(sandbox_api, watcher::Config::default()).boxed(); let mut event_stream = watcher::watcher(event_api, watcher::Config::default()).boxed(); @@ -650,6 +698,14 @@ impl KubernetesComputeDriver { } } +fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { + matches!( + err, + KubeError::Api(api) + if api.code == 404 && api.message.contains("could not find the requested resource") + ) +} + fn validate_gpu_request( gpu_requirements: Option<&GpuResourceRequirements>, ) -> Result<(), tonic::Status> { diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs index 59f6651bb..5eab90446 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -23,6 +23,7 @@ use k8s_openapi::api::{ core::v1::Pod, }; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use kube::Error as KubeError; use kube::api::{Api, ApiResource, PostParams}; use kube::core::{DynamicObject, gvk::GroupVersionKind}; use std::sync::Arc; @@ -40,8 +41,10 @@ pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandbox /// include `patch pods` (see plan ยง11.8). pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; const SANDBOX_API_GROUP: &str = "agents.x-k8s.io"; -const SANDBOX_API_VERSION: &str = "v1alpha1"; -const SANDBOX_API_VERSION_FULL: &str = "agents.x-k8s.io/v1alpha1"; +const SANDBOX_API_VERSION_V1BETA1: &str = "v1beta1"; +const SANDBOX_API_VERSION_V1ALPHA1: &str = "v1alpha1"; +const SANDBOX_API_VERSION_FULL_V1BETA1: &str = "agents.x-k8s.io/v1beta1"; +const SANDBOX_API_VERSION_FULL_V1ALPHA1: &str = "agents.x-k8s.io/v1alpha1"; const SANDBOX_KIND: &str = "Sandbox"; const SANDBOX_ID_LABEL: &str = "openshell.ai/sandbox-id"; const POD_NAME_EXTRA: &str = "authentication.kubernetes.io/pod-name"; @@ -140,6 +143,7 @@ struct TokenReviewIdentity { #[derive(Debug, Clone, PartialEq, Eq)] struct SandboxOwnerReference { + api_version: String, name: String, uid: String, } @@ -149,7 +153,8 @@ struct SandboxOwnerReference { pub struct LiveK8sResolver { token_reviews_api: Api, pods_api: Api, - sandboxes_api: Api, + sandboxes_api_v1beta1: Api, + sandboxes_api_v1alpha1: Api, expected_audience: String, sandbox_namespace: String, expected_service_account: String, @@ -164,20 +169,51 @@ impl LiveK8sResolver { ) -> Self { let token_reviews_api: Api = Api::all(client.clone()); let pods_api: Api = Api::namespaced(client.clone(), namespace); - let sandbox_gvk = - GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION, SANDBOX_KIND); - let sandbox_resource = ApiResource::from_gvk(&sandbox_gvk); - let sandboxes_api: Api = - Api::namespaced_with(client, namespace, &sandbox_resource); + let sandbox_gvk_v1beta1 = + GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION_V1BETA1, SANDBOX_KIND); + let sandbox_resource_v1beta1 = ApiResource::from_gvk(&sandbox_gvk_v1beta1); + let sandbox_gvk_v1alpha1 = GroupVersionKind::gvk( + SANDBOX_API_GROUP, + SANDBOX_API_VERSION_V1ALPHA1, + SANDBOX_KIND, + ); + let sandbox_resource_v1alpha1 = ApiResource::from_gvk(&sandbox_gvk_v1alpha1); + let sandboxes_api_v1beta1: Api = + Api::namespaced_with(client.clone(), namespace, &sandbox_resource_v1beta1); + let sandboxes_api_v1alpha1: Api = + Api::namespaced_with(client, namespace, &sandbox_resource_v1alpha1); Self { token_reviews_api, pods_api, - sandboxes_api, + sandboxes_api_v1beta1, + sandboxes_api_v1alpha1, expected_audience, sandbox_namespace: namespace.to_string(), expected_service_account, } } + + async fn get_sandbox_cr_for_owner( + &self, + owner: &SandboxOwnerReference, + ) -> Result, KubeError> { + let apis = if owner.api_version == SANDBOX_API_VERSION_FULL_V1ALPHA1 { + [&self.sandboxes_api_v1alpha1, &self.sandboxes_api_v1beta1] + } else { + [&self.sandboxes_api_v1beta1, &self.sandboxes_api_v1alpha1] + }; + + for api in apis { + match api.get_opt(&owner.name).await { + Ok(Some(sandbox_cr)) => return Ok(Some(sandbox_cr)), + Ok(None) => {} + Err(err) if should_try_next_sandbox_api_version(&err) => {} + Err(err) => return Err(err), + } + } + + Ok(None) + } } #[async_trait] @@ -258,10 +294,11 @@ impl K8sIdentityResolver for LiveK8sResolver { let sandbox_id = pod_sandbox_id(&pod)?; let owner = sandbox_owner_reference(&pod)?; - let sandbox_cr = self.sandboxes_api.get_opt(&owner.name).await.map_err(|e| { + let sandbox_cr = self.get_sandbox_cr_for_owner(&owner).await.map_err(|e| { warn!( pod = %identity.pod_name, sandbox_owner = %owner.name, + sandbox_owner_api_version = %owner.api_version, error = %e, "failed to fetch owning Sandbox CR for pod identity validation" ); @@ -271,6 +308,7 @@ impl K8sIdentityResolver for LiveK8sResolver { warn!( pod = %identity.pod_name, sandbox_owner = %owner.name, + sandbox_owner_api_version = %owner.api_version, "pod ownerReference points to a Sandbox CR that does not exist" ); return Err(Status::permission_denied("sandbox owner not found")); @@ -370,10 +408,25 @@ fn pod_sandbox_id(pod: &Pod) -> Result { #[allow(clippy::result_large_err)] fn sandbox_owner_reference(pod: &Pod) -> Result { let owner_refs = pod.metadata.owner_references.as_deref().unwrap_or_default(); - let mut sandbox_refs = owner_refs.iter().filter(|owner| { - owner.api_version == SANDBOX_API_VERSION_FULL && owner.kind == SANDBOX_KIND - }); + let mut sandbox_refs = owner_refs + .iter() + .filter(|owner| is_supported_sandbox_owner_reference(owner)); let Some(owner) = sandbox_refs.next() else { + let unsupported_sandbox_api_versions = owner_refs + .iter() + .filter(|owner| owner.kind == SANDBOX_KIND) + .map(|owner| owner.api_version.as_str()) + .collect::>(); + if !unsupported_sandbox_api_versions.is_empty() { + warn!( + api_versions = ?unsupported_sandbox_api_versions, + supported_api_versions = ?[ + SANDBOX_API_VERSION_FULL_V1BETA1, + SANDBOX_API_VERSION_FULL_V1ALPHA1, + ], + "pod Sandbox ownerReference uses unsupported apiVersion" + ); + } return Err(Status::permission_denied( "pod is not controlled by an OpenShell Sandbox", )); @@ -394,11 +447,30 @@ fn sandbox_owner_reference(pod: &Pod) -> Result { )); } Ok(SandboxOwnerReference { + api_version: owner.api_version.clone(), name: owner.name.clone(), uid: owner.uid.clone(), }) } +fn is_supported_sandbox_owner_reference( + owner: &k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference, +) -> bool { + owner.kind == SANDBOX_KIND + && matches!( + owner.api_version.as_str(), + SANDBOX_API_VERSION_FULL_V1BETA1 | SANDBOX_API_VERSION_FULL_V1ALPHA1 + ) +} + +fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { + matches!( + err, + KubeError::Api(api) + if api.code == 404 && api.message.contains("could not find the requested resource") + ) +} + #[allow(clippy::result_large_err)] fn validate_sandbox_owner_reference( owner: &SandboxOwnerReference, @@ -515,8 +587,12 @@ mod tests { } fn sandbox_owner(name: &str, uid: &str) -> OwnerReference { + sandbox_owner_with_api_version(SANDBOX_API_VERSION_FULL_V1BETA1, name, uid) + } + + fn sandbox_owner_with_api_version(api_version: &str, name: &str, uid: &str) -> OwnerReference { OwnerReference { - api_version: SANDBOX_API_VERSION_FULL.to_string(), + api_version: api_version.to_string(), block_owner_deletion: None, controller: Some(true), kind: SANDBOX_KIND.to_string(), @@ -549,7 +625,7 @@ mod tests { fn sandbox_cr(name: &str, uid: &str, sandbox_id: &str) -> DynamicObject { let sandbox_gvk = - GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION, SANDBOX_KIND); + GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION_V1BETA1, SANDBOX_KIND); let sandbox_resource = ApiResource::from_gvk(&sandbox_gvk); let mut cr = DynamicObject::new(name, &sandbox_resource); cr.metadata.uid = Some(uid.to_string()); @@ -681,6 +757,27 @@ mod tests { assert_eq!( owner, SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1BETA1.to_string(), + name: "sandbox-a".to_string(), + uid: "cr-uid-a".to_string(), + } + ); + } + + #[test] + fn sandbox_owner_reference_accepts_v1alpha1_owner() { + let pod = pod_with_owner_refs(vec![sandbox_owner_with_api_version( + SANDBOX_API_VERSION_FULL_V1ALPHA1, + "sandbox-a", + "cr-uid-a", + )]); + + let owner = sandbox_owner_reference(&pod).expect("expected v1alpha1 Sandbox owner"); + + assert_eq!( + owner, + SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1ALPHA1.to_string(), name: "sandbox-a".to_string(), uid: "cr-uid-a".to_string(), } @@ -696,6 +793,20 @@ mod tests { assert_eq!(err.code(), tonic::Code::PermissionDenied); } + #[test] + fn sandbox_owner_reference_rejects_unsupported_sandbox_api_version() { + let pod = pod_with_owner_refs(vec![sandbox_owner_with_api_version( + "agents.x-k8s.io/v1", + "sandbox-a", + "cr-uid-a", + )]); + + let err = + sandbox_owner_reference(&pod).expect_err("unsupported apiVersion must fail closed"); + + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + #[test] fn sandbox_owner_reference_requires_controlling_owner() { let mut owner = sandbox_owner("sandbox-a", "cr-uid-a"); @@ -722,6 +833,7 @@ mod tests { #[test] fn validate_sandbox_owner_reference_requires_matching_cr_uid_and_label() { let owner = SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1BETA1.to_string(), name: "sandbox-a".to_string(), uid: "cr-uid-a".to_string(), }; diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index c81afd66d..faaa14530 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -283,7 +283,7 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. | -The Kubernetes driver creates namespaced `agents.x-k8s.io/v1alpha1` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. The Agent Sandbox controller turns those resources into sandbox pods and related storage. +The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime so supported Agent Sandbox installations work without version-specific operator configuration. The Agent Sandbox controller turns those resources into sandbox pods and related storage. `Sandbox.spec.volumeClaimTemplates` is immutable after creation. To change storage configuration, delete the sandbox and create a new one with the updated spec. diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index bea1c01d3..55fef08a3 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -51,9 +51,10 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" -# Upstream agent-sandbox release. Bump in lockstep with the supported Sandbox -# field set in crates/openshell-driver-kubernetes (see sandbox_to_k8s_spec). -AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.4.6}" +# Upstream agent-sandbox release. The Kubernetes driver supports the v1beta1 +# Sandbox API introduced in v0.5.0 and falls back to v1alpha1 for v0.4.6 +# clusters. Override this env var to exercise the v1alpha1 controller release. +AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.5.0}" e2e_preserve_mise_dirs e2e_align_docker_host_with_cli_context diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index 59d8939a6..f9ac186f5 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -33,9 +33,10 @@ DEFAULT_SANDBOX_PRELOAD_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base PRELOAD_SANDBOX_IMAGE="${HELM_K3S_PRELOAD_SANDBOX_IMAGE-${DEFAULT_SANDBOX_PRELOAD_IMAGE}}" # Upstream agent-sandbox release pinned for both CRDs/controller and extensions. -# Refresh this tag in lockstep with the supported field set in -# crates/openshell-driver-kubernetes (see sandbox_to_k8s_spec). -AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.4.6}" +# The Kubernetes driver supports the v1beta1 Sandbox API introduced in v0.5.0 +# and falls back to v1alpha1 for v0.4.6 clusters. Override this env var to +# exercise the v1alpha1 controller release. +AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.5.0}" default_kubeconfig="${ROOT}/kubeconfig" if [[ -n "${HELM_K3S_KUBECONFIG:-}" ]]; then From d0540b2de0a5cabc3c372ffdd2c7a70179214cde Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 25 Jun 2026 16:25:55 -0700 Subject: [PATCH 2/7] ci(kubernetes): test agent-sandbox api versions Signed-off-by: Taylor Mutch --- .github/workflows/branch-e2e.yml | 10 ++++++++++ .github/workflows/e2e-kubernetes-test.yml | 6 ++++++ tasks/test.toml | 12 ++++++++++++ 3 files changed, 28 insertions(+) diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index 23d7d1cf6..ebe783406 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -115,12 +115,22 @@ jobs: kubernetes-e2e: needs: [pr_metadata, build-gateway, build-supervisor] if: needs.pr_metadata.outputs.should_run == 'true' && needs.pr_metadata.outputs.run_core_e2e == 'true' + strategy: + fail-fast: false + matrix: + include: + - agent_sandbox_api: v1beta1 + agent_sandbox_version: v0.5.0 + - agent_sandbox_api: v1alpha1 + agent_sandbox_version: v0.4.6 permissions: contents: read packages: read uses: ./.github/workflows/e2e-kubernetes-test.yml with: image-tag: ${{ github.sha }} + job-name: Kubernetes E2E (Rust smoke, Agent Sandbox ${{ matrix.agent_sandbox_api }}) + agent-sandbox-version: ${{ matrix.agent_sandbox_version }} kubernetes-ha-e2e: needs: [pr_metadata, build-gateway, build-supervisor] diff --git a/.github/workflows/e2e-kubernetes-test.yml b/.github/workflows/e2e-kubernetes-test.yml index 5ff375922..f8c042b2a 100644 --- a/.github/workflows/e2e-kubernetes-test.yml +++ b/.github/workflows/e2e-kubernetes-test.yml @@ -32,6 +32,11 @@ on: required: false type: string default: "" + agent-sandbox-version: + description: "Agent Sandbox release to install before OpenShell" + required: false + type: string + default: "v0.5.0" mise-version: description: "mise version to install on the bare Kubernetes e2e runner" required: false @@ -114,6 +119,7 @@ jobs: - name: Run Kubernetes E2E (Rust smoke) env: + AGENT_SANDBOX_VERSION: ${{ inputs.agent-sandbox-version }} OPENSHELL_E2E_KUBE_CONTEXT: kind-${{ env.KIND_CLUSTER_NAME }} OPENSHELL_E2E_KUBE_EXTRA_VALUES: ${{ inputs.extra-helm-values }} OPENSHELL_E2E_KUBE_EXTERNAL_POSTGRES_SECRET: ${{ inputs.external-postgres-secret }} diff --git a/tasks/test.toml b/tasks/test.toml index 444ea15e1..1d0f97856 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -94,6 +94,18 @@ run = "e2e/rust/e2e-podman.sh" description = "Run Rust CLI e2e tests against an OpenShell gateway deployed on Kubernetes via Helm (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster; otherwise creates a local k3d cluster when k3d is installed; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" run = "e2e/rust/e2e-kubernetes.sh" +["e2e:kubernetes:v1alpha1"] +description = "Run Kubernetes e2e against Agent Sandbox v1alpha1" +env = { AGENT_SANDBOX_VERSION = "v0.4.6" } +run = "e2e/rust/e2e-kubernetes.sh" + +["e2e:kubernetes:agent-sandbox-versions"] +description = "Run Kubernetes e2e against Agent Sandbox v1beta1 and v1alpha1" +run = [ + "e2e/rust/e2e-kubernetes.sh", + "AGENT_SANDBOX_VERSION=v0.4.6 e2e/rust/e2e-kubernetes.sh", +] + ["e2e:kubernetes:db"] description = "Run Kubernetes e2e with all database backend scenarios (SQLite and external PostgreSQL with existingSecret)" env = { OPENSHELL_E2E_KUBE_DB_SCENARIOS = "1" } From 7d2940fae13851ba11da241ba1f9fb86650060fe Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 26 Jun 2026 00:18:41 -0700 Subject: [PATCH 3/7] fix(kubernetes): retry agent-sandbox raw 404s Signed-off-by: Taylor Mutch --- .../openshell-driver-kubernetes/src/driver.rs | 38 ++++++++++++++++--- crates/openshell-server/src/auth/k8s_sa.rs | 38 ++++++++++++++++--- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 2b1301652..6495916d5 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -699,11 +699,11 @@ impl KubernetesComputeDriver { } fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { - matches!( - err, - KubeError::Api(api) - if api.code == 404 && api.message.contains("could not find the requested resource") - ) + // Kubernetes returns a structured 404 for some missing API resources and a + // raw "404 page not found" body for others. Both mean the probed + // group/version is unavailable and the next supported Sandbox API version + // should be tried. + matches!(err, KubeError::Api(api) if api.code == 404) } fn validate_gpu_request( @@ -2069,6 +2069,34 @@ mod tests { } } + fn kube_api_error(code: u16, message: &str) -> KubeError { + KubeError::Api(kube::core::ErrorResponse { + status: if code == 404 { + "404 Not Found".to_string() + } else { + "Failure".to_string() + }, + message: message.to_string(), + reason: "Failed to parse error data".to_string(), + code, + }) + } + + #[test] + fn sandbox_api_version_probe_retries_on_structured_and_raw_404() { + let structured = kube_api_error(404, "could not find the requested resource"); + assert!(should_try_next_sandbox_api_version(&structured)); + + let raw = kube_api_error(404, "404 page not found\n"); + assert!(should_try_next_sandbox_api_version(&raw)); + } + + #[test] + fn sandbox_api_version_probe_keeps_non_404_errors() { + let err = kube_api_error(403, "sandboxes.agents.x-k8s.io is forbidden"); + assert!(!should_try_next_sandbox_api_version(&err)); + } + #[test] fn driver_config_rejects_invalid_shape() { let template = SandboxTemplate { diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs index 5eab90446..eed0e5f08 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -464,11 +464,11 @@ fn is_supported_sandbox_owner_reference( } fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { - matches!( - err, - KubeError::Api(api) - if api.code == 404 && api.message.contains("could not find the requested resource") - ) + // Kubernetes returns a structured 404 for some missing API resources and a + // raw "404 page not found" body for others. Both mean the probed + // group/version is unavailable and the next supported Sandbox API version + // should be tried. + matches!(err, KubeError::Api(api) if api.code == 404) } #[allow(clippy::result_large_err)] @@ -558,6 +558,34 @@ mod tests { h } + fn kube_api_error(code: u16, message: &str) -> KubeError { + KubeError::Api(kube::core::ErrorResponse { + status: if code == 404 { + "404 Not Found".to_string() + } else { + "Failure".to_string() + }, + message: message.to_string(), + reason: "Failed to parse error data".to_string(), + code, + }) + } + + #[test] + fn sandbox_api_version_probe_retries_on_structured_and_raw_404() { + let structured = kube_api_error(404, "could not find the requested resource"); + assert!(should_try_next_sandbox_api_version(&structured)); + + let raw = kube_api_error(404, "404 page not found\n"); + assert!(should_try_next_sandbox_api_version(&raw)); + } + + #[test] + fn sandbox_api_version_probe_keeps_non_404_errors() { + let err = kube_api_error(403, "sandboxes.agents.x-k8s.io is forbidden"); + assert!(!should_try_next_sandbox_api_version(&err)); + } + fn token_review_status( authenticated: bool, audiences: Vec<&str>, From bb610a539d7562e776558b5fb32c533cf067ed5a Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 26 Jun 2026 00:54:52 -0700 Subject: [PATCH 4/7] fix(kubernetes): harden agent-sandbox api setup Signed-off-by: Taylor Mutch --- .../openshell-driver-kubernetes/src/driver.rs | 65 +++++++++---------- e2e/with-kube-gateway.sh | 24 ++++++- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 6495916d5..a2defc00b 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -258,36 +258,23 @@ impl KubernetesComputeDriver { &self.config.ssh_socket_path } - fn sandbox_api_for_version(&self, client: Client, version: &str) -> Api { - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, version, SANDBOX_KIND); + fn sandbox_api(&self, client: Client, sandbox_api_version: &str) -> Api { + let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, sandbox_api_version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); Api::namespaced_with(client, &self.config.namespace, &resource) } - fn api_for_version(&self, version: &str) -> Api { - self.sandbox_api_for_version(self.client.clone(), version) - } - - fn watch_api_for_version(&self, version: &str) -> Api { - self.sandbox_api_for_version(self.watch_client.clone(), version) - } - - async fn supported_api_version(&self, watch_client: bool) -> Result<&'static str, String> { - let client = if watch_client { - self.watch_client.clone() - } else { - self.client.clone() - }; - for version in SANDBOX_VERSIONS { - let api = self.sandbox_api_for_version(client.clone(), version); + async fn supported_sandbox_api_version(&self, client: Client) -> Result<&'static str, String> { + for sandbox_api_version in SANDBOX_VERSIONS { + let api = self.sandbox_api(client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default().limit(1))) .await { - Ok(Ok(_)) => return Ok(version), + Ok(Ok(_)) => return Ok(sandbox_api_version), Ok(Err(err)) if should_try_next_sandbox_api_version(&err) => { debug!( namespace = %self.config.namespace, - sandbox_api_version = %version, + sandbox_api_version = %sandbox_api_version, error = %err, "Sandbox API version is not available; trying next supported version" ); @@ -345,8 +332,10 @@ impl KubernetesComputeDriver { "Fetching sandbox from Kubernetes" ); - let version = self.supported_api_version(false).await?; - let api = self.api_for_version(version); + let sandbox_api_version = self + .supported_sandbox_api_version(self.client.clone()) + .await?; + let api = self.sandbox_api(self.client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { Ok(Ok(obj)) => sandbox_from_object(&self.config.namespace, obj).map(Some), Ok(Err(KubeError::Api(err))) if err.code == 404 => { @@ -381,8 +370,10 @@ impl KubernetesComputeDriver { "Listing sandboxes from Kubernetes" ); - let version = self.supported_api_version(false).await?; - let api = self.api_for_version(version); + let sandbox_api_version = self + .supported_sandbox_api_version(self.client.clone()) + .await?; + let api = self.sandbox_api(self.client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default())).await { Ok(Ok(list)) => { let mut sandboxes = list @@ -437,11 +428,11 @@ impl KubernetesComputeDriver { "Creating sandbox in Kubernetes" ); - let version = self - .supported_api_version(false) + let sandbox_api_version = self + .supported_sandbox_api_version(self.client.clone()) .await .map_err(KubernetesDriverError::Message)?; - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, version, SANDBOX_KIND); + let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, sandbox_api_version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); let mut obj = DynamicObject::new(name, &resource); obj.metadata = ObjectMeta { @@ -475,7 +466,7 @@ impl KubernetesComputeDriver { .provider_spiffe_workload_api_socket_path, }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); - let api = self.api_for_version(version); + let api = self.sandbox_api(self.client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await { @@ -518,8 +509,10 @@ impl KubernetesComputeDriver { "Deleting sandbox from Kubernetes" ); - let version = self.supported_api_version(false).await?; - let api = self.api_for_version(version); + let sandbox_api_version = self + .supported_sandbox_api_version(self.client.clone()) + .await?; + let api = self.sandbox_api(self.client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.delete(name, &DeleteParams::default())) .await { @@ -554,8 +547,10 @@ impl KubernetesComputeDriver { } pub async fn sandbox_exists(&self, name: &str) -> Result { - let version = self.supported_api_version(false).await?; - let api = self.api_for_version(version); + let sandbox_api_version = self + .supported_sandbox_api_version(self.client.clone()) + .await?; + let api = self.sandbox_api(self.client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { Ok(Ok(_)) => Ok(true), Ok(Err(KubeError::Api(err))) if err.code == 404 => Ok(false), @@ -571,8 +566,10 @@ impl KubernetesComputeDriver { #[allow(clippy::unused_async)] pub async fn watch_sandboxes(&self) -> Result { let namespace = self.config.namespace.clone(); - let version = self.supported_api_version(true).await?; - let sandbox_api = self.watch_api_for_version(version); + let sandbox_api_version = self + .supported_sandbox_api_version(self.watch_client.clone()) + .await?; + let sandbox_api = self.sandbox_api(self.watch_client.clone(), sandbox_api_version); let event_api: Api = Api::namespaced(self.watch_client.clone(), &namespace); let mut sandbox_stream = watcher::watcher(sandbox_api, watcher::Config::default()).boxed(); let mut event_stream = watcher::watcher(event_api, watcher::Config::default()).boxed(); diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 55fef08a3..47b8730dc 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -89,6 +89,28 @@ kctl() { kubectl --context "${KUBE_CONTEXT}" "$@" } +wait_for_agent_sandbox_crd() { + local deadline + local established + + deadline=$(( $(date +%s) + 120 )) + while [ "$(date +%s)" -lt "${deadline}" ]; do + if kctl get crd/sandboxes.agents.x-k8s.io >/dev/null 2>&1; then + established="$(kctl get crd/sandboxes.agents.x-k8s.io \ + -o 'jsonpath={.status.conditions[?(@.type=="Established")].status}' \ + 2>/dev/null || true)" + if [ "${established}" = "True" ]; then + return 0 + fi + fi + sleep 2 + done + + echo "Timed out waiting for agent-sandbox Sandbox CRD to become Established" >&2 + kctl get crd/sandboxes.agents.x-k8s.io -o yaml >&2 || true + return 1 +} + helmctl() { helm --kube-context "${KUBE_CONTEXT}" "$@" } @@ -534,7 +556,7 @@ fi echo "Installing agent-sandbox CRDs and controller (${AGENT_SANDBOX_VERSION})..." _agent_sandbox_base="https://github.com/kubernetes-sigs/agent-sandbox/releases/download/${AGENT_SANDBOX_VERSION}" kctl apply -f "${_agent_sandbox_base}/manifest.yaml" -kctl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=120s +wait_for_agent_sandbox_crd kctl -n agent-sandbox-system rollout status deployment/agent-sandbox-controller --timeout=300s helm_extra_args=() From 647117357993b3e8be3bf64a124528c2a7efaf6f Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 26 Jun 2026 08:44:21 -0700 Subject: [PATCH 5/7] fix(kubernetes): cache agent-sandbox api version Signed-off-by: Taylor Mutch --- .../openshell-driver-kubernetes/src/driver.rs | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a2defc00b..e096b6867 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -34,8 +34,9 @@ use openshell_core::proto_struct::{struct_to_json_object, value_to_json}; use serde::Deserialize; use std::collections::BTreeMap; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::{OnceCell, mpsc}; use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, info, warn}; @@ -192,6 +193,7 @@ const WORKSPACE_SENTINEL: &str = ".workspace-initialized"; pub struct KubernetesComputeDriver { client: Client, watch_client: Client, + sandbox_api_version: Arc>, config: KubernetesComputeConfig, } @@ -234,6 +236,7 @@ impl KubernetesComputeDriver { Ok(Self { client, watch_client, + sandbox_api_version: Arc::new(OnceCell::new()), config, }) } @@ -265,12 +268,31 @@ impl KubernetesComputeDriver { } async fn supported_sandbox_api_version(&self, client: Client) -> Result<&'static str, String> { + self.sandbox_api_version + .get_or_try_init( + || async move { self.detect_supported_sandbox_api_version(client).await }, + ) + .await + .copied() + } + + async fn detect_supported_sandbox_api_version( + &self, + client: Client, + ) -> Result<&'static str, String> { for sandbox_api_version in SANDBOX_VERSIONS { let api = self.sandbox_api(client.clone(), sandbox_api_version); match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default().limit(1))) .await { - Ok(Ok(_)) => return Ok(sandbox_api_version), + Ok(Ok(_)) => { + debug!( + namespace = %self.config.namespace, + sandbox_api_version = %sandbox_api_version, + "Selected Agent Sandbox API version" + ); + return Ok(sandbox_api_version); + } Ok(Err(err)) if should_try_next_sandbox_api_version(&err) => { debug!( namespace = %self.config.namespace, From 7f9c1cac9dada9cb75de7f6da6db0cd5a6c23728 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 26 Jun 2026 09:10:31 -0700 Subject: [PATCH 6/7] docs(kubernetes): document agent-sandbox upgrade behavior Signed-off-by: Taylor Mutch --- crates/openshell-driver-kubernetes/README.md | 10 ++++++---- docs/kubernetes/setup.mdx | 4 ++++ docs/reference/sandbox-compute-drivers.mdx | 4 +++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 6db873ecb..831e4edf2 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -16,10 +16,12 @@ credential injection, policy polling, logs, and the gateway relay. ## Sandbox Resource The driver works with the `agents.x-k8s.io` `Sandbox` custom resource. It -detects the served Sandbox API at runtime and uses the current API before -falling back to the `v1alpha1` API. Driver events map Kubernetes object state and -platform events into the shared compute-driver protobuf surface used by the -gateway. +detects the served Sandbox API at runtime, caches the selected API version for +the gateway process, and uses `v1beta1` when available before falling back to +`v1alpha1`. Restart the gateway after an in-place Agent Sandbox upgrade so the +driver can detect served API versions again. Driver events map Kubernetes object +state and platform events into the shared compute-driver protobuf surface used +by the gateway. Kubernetes API calls use explicit timeouts so gRPC handlers do not block indefinitely when the API server is slow or unavailable. diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 0a25eceeb..5ab786519 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -50,6 +50,10 @@ kubectl -n agent-sandbox-system get pods The controller pod should reach `Running` status within a few seconds. For cluster-specific setup instructions, including KinD and GKE walkthroughs, refer to the [Agent Sandbox getting started guide](https://agent-sandbox.sigs.k8s.io/docs/getting_started/). +### Upgrade Agent Sandbox + +OpenShell detects the served Agent Sandbox `Sandbox` API when the Kubernetes gateway first needs it and caches that choice for the gateway process. If you upgrade Agent Sandbox in place, restart the OpenShell gateway after the Agent Sandbox controller and CRD rollout completes so the gateway can detect the served API versions again. Existing sandboxes keep running during the upgrade, and the restarted gateway can continue managing them. + ## Install OpenShell diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index faaa14530..ba13dcbd2 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -283,7 +283,9 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. | -The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime so supported Agent Sandbox installations work without version-specific operator configuration. The Agent Sandbox controller turns those resources into sandbox pods and related storage. +The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime, caches the selected API version for the gateway process, and uses `v1beta1` when available before falling back to `v1alpha1`, so supported Agent Sandbox installations work without version-specific operator configuration. The Agent Sandbox controller turns those resources into sandbox pods and related storage. + +If Agent Sandbox is upgraded in place, restart the OpenShell gateway after the controller and CRD rollout completes so the gateway can detect the served API versions again. `Sandbox.spec.volumeClaimTemplates` is immutable after creation. To change storage configuration, delete the sandbox and create a new one with the updated spec. From 13f02100520b36fc84355c3a282c6275a89a459a Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 26 Jun 2026 09:18:48 -0700 Subject: [PATCH 7/7] refactor(kubernetes): collapse agent-sandbox api selection Signed-off-by: Taylor Mutch --- .../openshell-driver-kubernetes/src/driver.rs | 86 +++++++++++-------- 1 file changed, 52 insertions(+), 34 deletions(-) diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index e096b6867..909568302 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -89,6 +89,11 @@ pub const SANDBOX_KIND: &str = "Sandbox"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const SPIFFE_WORKLOAD_API_VOLUME_NAME: &str = "spiffe-workload-api"; +struct AgentSandboxApi { + api: Api, + resource: ApiResource, +} + // This POC treats the selected Struct as a driver-local typed schema. Once the // Kubernetes shape stabilizes, these serde structs may move to driver-local // protobuf definitions, but the typed decode should stay inside this driver. @@ -261,10 +266,16 @@ impl KubernetesComputeDriver { &self.config.ssh_socket_path } - fn sandbox_api(&self, client: Client, sandbox_api_version: &str) -> Api { + fn agent_sandbox_api(&self, client: Client, sandbox_api_version: &str) -> AgentSandboxApi { let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, sandbox_api_version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); - Api::namespaced_with(client, &self.config.namespace, &resource) + let api = Api::namespaced_with(client, &self.config.namespace, &resource); + AgentSandboxApi { api, resource } + } + + async fn supported_agent_sandbox_api(&self, client: Client) -> Result { + let sandbox_api_version = self.supported_sandbox_api_version(client.clone()).await?; + Ok(self.agent_sandbox_api(client, sandbox_api_version)) } async fn supported_sandbox_api_version(&self, client: Client) -> Result<&'static str, String> { @@ -281,9 +292,12 @@ impl KubernetesComputeDriver { client: Client, ) -> Result<&'static str, String> { for sandbox_api_version in SANDBOX_VERSIONS { - let api = self.sandbox_api(client.clone(), sandbox_api_version); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default().limit(1))) - .await + let agent_sandbox_api = self.agent_sandbox_api(client.clone(), sandbox_api_version); + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.list(&ListParams::default().limit(1)), + ) + .await { Ok(Ok(_)) => { debug!( @@ -354,11 +368,10 @@ impl KubernetesComputeDriver { "Fetching sandbox from Kubernetes" ); - let sandbox_api_version = self - .supported_sandbox_api_version(self.client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) .await?; - let api = self.sandbox_api(self.client.clone(), sandbox_api_version); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { + match tokio::time::timeout(KUBE_API_TIMEOUT, agent_sandbox_api.api.get(name)).await { Ok(Ok(obj)) => sandbox_from_object(&self.config.namespace, obj).map(Some), Ok(Err(KubeError::Api(err))) if err.code == 404 => { debug!(sandbox_name = %name, "Sandbox not found in Kubernetes"); @@ -392,11 +405,15 @@ impl KubernetesComputeDriver { "Listing sandboxes from Kubernetes" ); - let sandbox_api_version = self - .supported_sandbox_api_version(self.client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) .await?; - let api = self.sandbox_api(self.client.clone(), sandbox_api_version); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default())).await { + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.list(&ListParams::default()), + ) + .await + { Ok(Ok(list)) => { let mut sandboxes = list .items @@ -450,13 +467,11 @@ impl KubernetesComputeDriver { "Creating sandbox in Kubernetes" ); - let sandbox_api_version = self - .supported_sandbox_api_version(self.client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) .await .map_err(KubernetesDriverError::Message)?; - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, sandbox_api_version, SANDBOX_KIND); - let resource = ApiResource::from_gvk(&gvk); - let mut obj = DynamicObject::new(name, &resource); + let mut obj = DynamicObject::new(name, &agent_sandbox_api.resource); obj.metadata = ObjectMeta { name: Some(name.to_string()), namespace: Some(self.config.namespace.clone()), @@ -488,9 +503,11 @@ impl KubernetesComputeDriver { .provider_spiffe_workload_api_socket_path, }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); - let api = self.sandbox_api(self.client.clone(), sandbox_api_version); - - match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.create(&PostParams::default(), &obj), + ) + .await { Ok(Ok(_result)) => { info!( @@ -531,12 +548,14 @@ impl KubernetesComputeDriver { "Deleting sandbox from Kubernetes" ); - let sandbox_api_version = self - .supported_sandbox_api_version(self.client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) .await?; - let api = self.sandbox_api(self.client.clone(), sandbox_api_version); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.delete(name, &DeleteParams::default())) - .await + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.delete(name, &DeleteParams::default()), + ) + .await { Ok(Ok(_response)) => { info!(sandbox_name = %name, "Sandbox deleted from Kubernetes"); @@ -569,11 +588,10 @@ impl KubernetesComputeDriver { } pub async fn sandbox_exists(&self, name: &str) -> Result { - let sandbox_api_version = self - .supported_sandbox_api_version(self.client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) .await?; - let api = self.sandbox_api(self.client.clone(), sandbox_api_version); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { + match tokio::time::timeout(KUBE_API_TIMEOUT, agent_sandbox_api.api.get(name)).await { Ok(Ok(_)) => Ok(true), Ok(Err(KubeError::Api(err))) if err.code == 404 => Ok(false), Ok(Err(err)) => Err(err.to_string()), @@ -588,12 +606,12 @@ impl KubernetesComputeDriver { #[allow(clippy::unused_async)] pub async fn watch_sandboxes(&self) -> Result { let namespace = self.config.namespace.clone(); - let sandbox_api_version = self - .supported_sandbox_api_version(self.watch_client.clone()) + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.watch_client.clone()) .await?; - let sandbox_api = self.sandbox_api(self.watch_client.clone(), sandbox_api_version); let event_api: Api = Api::namespaced(self.watch_client.clone(), &namespace); - let mut sandbox_stream = watcher::watcher(sandbox_api, watcher::Config::default()).boxed(); + let mut sandbox_stream = + watcher::watcher(agent_sandbox_api.api, watcher::Config::default()).boxed(); let mut event_stream = watcher::watcher(event_api, watcher::Config::default()).boxed(); let (tx, rx) = mpsc::channel(256);