From b48e0d39c5d26dc8e747ae5e33e9727a486a09b0 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 25 Jun 2026 22:07:29 +0100 Subject: [PATCH 1/3] fix(webapp): verify deployment image exists before finalizing --- ...verify-deployment-image-before-finalize.md | 6 + apps/webapp/app/env.server.ts | 3 + .../services/finalizeDeploymentV2.server.ts | 40 +++++ .../services/verifyDeploymentImage.server.ts | 164 ++++++++++++++++++ .../webapp/test/verifyDeploymentImage.test.ts | 147 ++++++++++++++++ 5 files changed, 360 insertions(+) create mode 100644 .server-changes/verify-deployment-image-before-finalize.md create mode 100644 apps/webapp/app/v3/services/verifyDeploymentImage.server.ts create mode 100644 apps/webapp/test/verifyDeploymentImage.test.ts diff --git a/.server-changes/verify-deployment-image-before-finalize.md b/.server-changes/verify-deployment-image-before-finalize.md new file mode 100644 index 00000000000..f821da49443 --- /dev/null +++ b/.server-changes/verify-deployment-image-before-finalize.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Verify a deployment's image exists in the registry before marking it deployed, so a deploy whose image wasn't pushed fails instead of silently breaking runs (can be turned off via `DEPLOY_IMAGE_VERIFICATION_ENABLED=0` for setups that push images out of band) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 2da52942477..e793cd086ba 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -553,6 +553,9 @@ const EnvironmentSchema = z // log-only mode before enforcement. DEPRECATE_V3_CLI_DEPLOYS_ENABLED: z.string().default("0"), + // Verify the deploy image exists before promoting. Disable for out-of-band/air-gapped push. ECR only. + DEPLOY_IMAGE_VERIFICATION_ENABLED: BoolEnv.default(true), + OBJECT_STORE_BASE_URL: z.string().optional(), OBJECT_STORE_BUCKET: z.string().optional(), OBJECT_STORE_ACCESS_KEY_ID: z.string().optional(), diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index 7ca8a379a3d..269d0cd8158 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -16,6 +16,7 @@ import { getEcrAuthToken, isEcrRegistry } from "../getDeploymentImageRef.server" import { tryCatch } from "@trigger.dev/core"; import { getRegistryConfig, type RegistryConfig } from "../registryConfig.server"; import { ComputeTemplateCreationService } from "./computeTemplateCreation.server"; +import { ecrImageExists } from "./verifyDeploymentImage.server"; export class FinalizeDeploymentV2Service extends BaseService { public async call( @@ -75,6 +76,11 @@ export class FinalizeDeploymentV2Service extends BaseService { }); } + // The CLI claims the image is already in the registry (local build, or a + // self-hosted setup). Verify before promoting so we never mark a + // deployment DEPLOYED when nothing was actually pushed. + await this.#assertImagePullable(deployment, body); + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); return finalizeService.call(authenticatedEnv, id, body); } @@ -142,10 +148,44 @@ export class FinalizeDeploymentV2Service extends BaseService { pushedImage: pushResult.image, }); + // Belt and suspenders: confirm the push actually landed before promoting. + await this.#assertImagePullable(deployment, body); + await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer); return finalizeService.call(authenticatedEnv, id, body); } + async #assertImagePullable( + deployment: { imageReference: string | null; type: string | null }, + body: FinalizeDeploymentRequestBody + ): Promise { + if (!env.DEPLOY_IMAGE_VERIFICATION_ENABLED) { + return; + } + + if (!deployment.imageReference) { + return; + } + + const registryConfig = getRegistryConfig(deployment.type === "MANAGED"); + + const result = await ecrImageExists({ + imageReference: deployment.imageReference, + imageDigest: body.imageDigest, + registryConfig, + }); + + if (result === "missing") { + throw new ServiceValidationError( + "Deployment image was not found in the registry. It may not have been pushed (for example a local build without a push, or a push to a different registry). Aborting the deploy to avoid promoting a version that cannot start." + ); + } + + // "unknown" (non-ECR registry, unparseable ref, or an API/permission error) + // is logged inside ecrImageExists - proceed, since a verifier failure must + // not become a deploy outage. + } + async #createTemplateIfNeeded( deployment: { imageReference: string | null; worker: { project: { id: string } } | null }, deploymentFriendlyId: string, diff --git a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts new file mode 100644 index 00000000000..dd26cd56922 --- /dev/null +++ b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts @@ -0,0 +1,164 @@ +import { BatchGetImageCommand, type BatchGetImageCommandOutput } from "@aws-sdk/client-ecr"; +import { tryCatch } from "@trigger.dev/core"; +import { logger } from "~/services/logger.server"; +import { + type AssumeRoleConfig, + createEcrClient, + isEcrRegistry, + parseEcrRegistryDomain, +} from "../getDeploymentImageRef.server"; +import { type RegistryConfig } from "../registryConfig.server"; + +const SHA256_DIGEST = /^sha256:[a-f0-9]{64}$/; + +export type ImageLookupResult = "found" | "missing" | "unknown"; + +/** + * Split a stored ECR image reference into repository + tag. + * + * Trust boundary: the ref is platform-generated, but we still bind the lookup to + * our configured host (region/account come from the env host) and only parse refs + * that sit under it. Returns null otherwise. + */ +export function parseEcrImageReference( + imageReference: string, + registryHost: string +): { repositoryName: string; tag: string } | null { + const prefix = `${registryHost}/`; + if (!imageReference.startsWith(prefix)) { + return null; + } + + // namespace/projectRef:tag, optionally @sha256:... which we drop here + const remainder = imageReference.slice(prefix.length).split("@")[0]; + const lastColon = remainder.lastIndexOf(":"); + + if (lastColon <= 0) { + return null; + } + + const repositoryName = remainder.slice(0, lastColon); + const tag = remainder.slice(lastColon + 1); + + if (!repositoryName || !tag || tag.includes("/")) { + return null; + } + + return { repositoryName, tag }; +} + +export function interpretBatchGetImageResponse( + response: BatchGetImageCommandOutput +): ImageLookupResult { + if (response.images && response.images.length > 0) { + return "found"; + } + + if (response.failures?.some((failure) => failure.failureCode === "ImageNotFound")) { + return "missing"; + } + + // No image and no explicit not-found failure (some other failure code) - + // we can't say it's missing, so don't block the deploy on it. + return "unknown"; +} + +type BatchGetImageInput = { + region: string; + assumeRole?: AssumeRoleConfig; + registryId?: string; + repositoryName: string; + imageIds: { imageTag?: string; imageDigest?: string }[]; +}; + +type BatchGetImageSender = (input: BatchGetImageInput) => Promise; + +const sendBatchGetImage: BatchGetImageSender = async ({ + region, + assumeRole, + registryId, + repositoryName, + imageIds, +}) => { + const ecr = await createEcrClient({ region, assumeRole }); + return ecr.send( + new BatchGetImageCommand({ + repositoryName, + registryId, + imageIds, + // We only care whether the manifest exists, not its contents. + acceptedMediaTypes: [ + "application/vnd.docker.distribution.manifest.v2+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.oci.image.index.v1+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + ], + }) + ); +}; + +/** + * Pre-promotion backstop: check the deployment image actually exists in ECR. + * + * Returns "unknown" for non-ECR registries or any error we can't read as a + * definitive miss - callers treat "unknown" as "proceed", so a verifier failure + * never becomes a deploy outage. `_send` is a test seam. + */ +export async function ecrImageExists( + { + imageReference, + imageDigest, + registryConfig, + }: { + imageReference: string; + imageDigest?: string; + registryConfig: RegistryConfig; + }, + _send: BatchGetImageSender = sendBatchGetImage +): Promise { + if (!isEcrRegistry(registryConfig.host)) { + return "unknown"; + } + + const parsed = parseEcrImageReference(imageReference, registryConfig.host); + + if (!parsed) { + logger.warn("Could not parse deployment image reference for verification", { imageReference }); + return "unknown"; + } + + const { accountId, region } = parseEcrRegistryDomain(registryConfig.host); + + // imageDigest is supplied by the CLI request body - validate before trusting it. + // Prefer it when valid (catches a tag that resolves to a different image), else + // fall back to the platform-generated tag. + const validDigest = + imageDigest && SHA256_DIGEST.test(imageDigest.trim()) ? imageDigest.trim() : undefined; + const imageId = validDigest ? { imageDigest: validDigest } : { imageTag: parsed.tag }; + + const [error, response] = await tryCatch( + _send({ + region, + assumeRole: registryConfig.ecrAssumeRoleArn + ? { + roleArn: registryConfig.ecrAssumeRoleArn, + externalId: registryConfig.ecrAssumeRoleExternalId, + } + : undefined, + registryId: accountId, + repositoryName: parsed.repositoryName, + imageIds: [imageId], + }) + ); + + if (error) { + logger.error("Failed to verify deployment image in ECR", { + imageReference, + repositoryName: parsed.repositoryName, + error: error.message, + }); + return "unknown"; + } + + return interpretBatchGetImageResponse(response); +} diff --git a/apps/webapp/test/verifyDeploymentImage.test.ts b/apps/webapp/test/verifyDeploymentImage.test.ts new file mode 100644 index 00000000000..cb869040799 --- /dev/null +++ b/apps/webapp/test/verifyDeploymentImage.test.ts @@ -0,0 +1,147 @@ +import { describe, expect, it } from "vitest"; +import { + ecrImageExists, + interpretBatchGetImageResponse, + parseEcrImageReference, +} from "~/v3/services/verifyDeploymentImage.server"; +import { type RegistryConfig } from "~/v3/registryConfig.server"; + +const ECR_HOST = "123456789012.dkr.ecr.us-east-1.amazonaws.com"; +const ecrConfig: RegistryConfig = { host: ECR_HOST, namespace: "deployments-test" }; + +describe("parseEcrImageReference", () => { + it("splits repository and tag for a ref under the configured host", () => { + const ref = `${ECR_HOST}/deployments-test/proj_abc:20240101.1.prod.a1b2c3d4`; + expect(parseEcrImageReference(ref, ECR_HOST)).toEqual({ + repositoryName: "deployments-test/proj_abc", + tag: "20240101.1.prod.a1b2c3d4", + }); + }); + + it("drops a trailing @sha256 digest", () => { + const ref = `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4@sha256:${"a".repeat(64)}`; + expect(parseEcrImageReference(ref, ECR_HOST)).toEqual({ + repositoryName: "deployments-test/proj_abc", + tag: "v1.prod.a1b2c3d4", + }); + }); + + it("returns null when the ref is not under the configured host (trust boundary)", () => { + const ref = "evil.example.com/whatever/proj_abc:v1"; + expect(parseEcrImageReference(ref, ECR_HOST)).toBeNull(); + }); + + it("returns null when there is no tag", () => { + expect(parseEcrImageReference(`${ECR_HOST}/deployments-test/proj_abc`, ECR_HOST)).toBeNull(); + }); + + it("returns null when the tag segment contains a slash", () => { + // a stray colon earlier in the path must not be treated as the tag separator + expect(parseEcrImageReference(`${ECR_HOST}/ns:weird/proj_abc`, ECR_HOST)).toBeNull(); + }); +}); + +describe("interpretBatchGetImageResponse", () => { + it("returns found when an image is present", () => { + expect(interpretBatchGetImageResponse({ images: [{}] } as any)).toBe("found"); + }); + + it("returns missing on an ImageNotFound failure", () => { + expect( + interpretBatchGetImageResponse({ failures: [{ failureCode: "ImageNotFound" }] } as any) + ).toBe("missing"); + }); + + it("returns unknown when there is neither an image nor a not-found failure", () => { + expect(interpretBatchGetImageResponse({ failures: [{ failureCode: "Other" }] } as any)).toBe( + "unknown" + ); + expect(interpretBatchGetImageResponse({} as any)).toBe("unknown"); + }); +}); + +describe("ecrImageExists", () => { + it("returns unknown for a non-ECR registry without calling the registry", async () => { + let called = false; + const result = await ecrImageExists( + { + imageReference: "registry.digitalocean.com/trigger-deployments/proj_abc:v1", + registryConfig: { host: "registry.digitalocean.com", namespace: "trigger-deployments" }, + }, + async () => { + called = true; + return {} as any; + } + ); + expect(result).toBe("unknown"); + expect(called).toBe(false); + }); + + it("returns found when the image exists", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => ({ images: [{}] }) as any + ); + expect(result).toBe("found"); + }); + + it("returns missing when the registry reports ImageNotFound", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => ({ failures: [{ failureCode: "ImageNotFound" }] }) as any + ); + expect(result).toBe("missing"); + }); + + it("returns unknown (fails open) when the registry call throws", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => { + throw new Error("AccessDenied"); + } + ); + expect(result).toBe("unknown"); + }); + + it("queries by digest when a valid digest is supplied", async () => { + const digest = `sha256:${"b".repeat(64)}`; + let seen: any; + await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + imageDigest: digest, + registryConfig: ecrConfig, + }, + async (input) => { + seen = input; + return { images: [{}] } as any; + } + ); + expect(seen.imageIds).toEqual([{ imageDigest: digest }]); + }); + + it("falls back to the tag when the supplied digest is malformed", async () => { + let seen: any; + await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + imageDigest: "not-a-digest", + registryConfig: ecrConfig, + }, + async (input) => { + seen = input; + return { images: [{}] } as any; + } + ); + expect(seen.imageIds).toEqual([{ imageTag: "v1.prod.a1b2c3d4" }]); + }); +}); From 3c7200f72ff25f34d2d95c016ca7534d8de3c829 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 25 Jun 2026 22:29:30 +0100 Subject: [PATCH 2/3] fix(webapp): fail closed on image verification + address review --- .../services/finalizeDeploymentV2.server.ts | 15 ++++++-- .../services/verifyDeploymentImage.server.ts | 34 +++++++++---------- .../webapp/test/verifyDeploymentImage.test.ts | 32 ++++++++++++++++- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts index 269d0cd8158..464dd8aabef 100644 --- a/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts +++ b/apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts @@ -169,6 +169,11 @@ export class FinalizeDeploymentV2Service extends BaseService { const registryConfig = getRegistryConfig(deployment.type === "MANAGED"); + // ECR-only: non-ECR (self-hosted) registries can't be checked this way, so skip. + if (!isEcrRegistry(registryConfig.host)) { + return; + } + const result = await ecrImageExists({ imageReference: deployment.imageReference, imageDigest: body.imageDigest, @@ -181,9 +186,13 @@ export class FinalizeDeploymentV2Service extends BaseService { ); } - // "unknown" (non-ECR registry, unparseable ref, or an API/permission error) - // is logged inside ecrImageExists - proceed, since a verifier failure must - // not become a deploy outage. + // Fail closed: if we can't confirm the image is present, don't promote a version + // that might not start. Set DEPLOY_IMAGE_VERIFICATION_ENABLED=0 for out-of-band pushes. + if (result === "unknown") { + throw new ServiceValidationError( + "Could not verify the deployment image exists in the registry. Aborting the deploy." + ); + } } async #createTemplateIfNeeded( diff --git a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts index dd26cd56922..9b625a2b4fa 100644 --- a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts +++ b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts @@ -1,4 +1,8 @@ -import { BatchGetImageCommand, type BatchGetImageCommandOutput } from "@aws-sdk/client-ecr"; +import { + BatchGetImageCommand, + type BatchGetImageCommandOutput, + RepositoryNotFoundException, +} from "@aws-sdk/client-ecr"; import { tryCatch } from "@trigger.dev/core"; import { logger } from "~/services/logger.server"; import { @@ -81,28 +85,17 @@ const sendBatchGetImage: BatchGetImageSender = async ({ imageIds, }) => { const ecr = await createEcrClient({ region, assumeRole }); - return ecr.send( - new BatchGetImageCommand({ - repositoryName, - registryId, - imageIds, - // We only care whether the manifest exists, not its contents. - acceptedMediaTypes: [ - "application/vnd.docker.distribution.manifest.v2+json", - "application/vnd.oci.image.manifest.v1+json", - "application/vnd.oci.image.index.v1+json", - "application/vnd.docker.distribution.manifest.list.v2+json", - ], - }) - ); + // No acceptedMediaTypes: only the single-manifest types are valid enum values, and + // we only care whether the image exists, not its manifest format. + return ecr.send(new BatchGetImageCommand({ repositoryName, registryId, imageIds })); }; /** * Pre-promotion backstop: check the deployment image actually exists in ECR. * - * Returns "unknown" for non-ECR registries or any error we can't read as a - * definitive miss - callers treat "unknown" as "proceed", so a verifier failure - * never becomes a deploy outage. `_send` is a test seam. + * "found"/"missing" are definitive (a nonexistent repo counts as missing). + * "unknown" means we couldn't determine it - non-ECR registry, unparseable ref, or + * an API error; the caller decides what to do with each. `_send` is a test seam. */ export async function ecrImageExists( { @@ -152,6 +145,11 @@ export async function ecrImageExists( ); if (error) { + // A missing repo is a definitive miss, not an ambiguous error. + if (error instanceof RepositoryNotFoundException) { + return "missing"; + } + logger.error("Failed to verify deployment image in ECR", { imageReference, repositoryName: parsed.repositoryName, diff --git a/apps/webapp/test/verifyDeploymentImage.test.ts b/apps/webapp/test/verifyDeploymentImage.test.ts index cb869040799..43508141cfc 100644 --- a/apps/webapp/test/verifyDeploymentImage.test.ts +++ b/apps/webapp/test/verifyDeploymentImage.test.ts @@ -1,3 +1,4 @@ +import { RepositoryNotFoundException } from "@aws-sdk/client-ecr"; import { describe, expect, it } from "vitest"; import { ecrImageExists, @@ -77,6 +78,22 @@ describe("ecrImageExists", () => { expect(called).toBe(false); }); + it("returns unknown for an unparseable ECR ref without calling the registry", async () => { + let called = false; + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc`, + registryConfig: ecrConfig, + }, + async () => { + called = true; + return {} as any; + } + ); + expect(result).toBe("unknown"); + expect(called).toBe(false); + }); + it("returns found when the image exists", async () => { const result = await ecrImageExists( { @@ -99,7 +116,7 @@ describe("ecrImageExists", () => { expect(result).toBe("missing"); }); - it("returns unknown (fails open) when the registry call throws", async () => { + it("returns unknown when the registry call throws an ambiguous error", async () => { const result = await ecrImageExists( { imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, @@ -112,6 +129,19 @@ describe("ecrImageExists", () => { expect(result).toBe("unknown"); }); + it("returns missing when the repository does not exist", async () => { + const result = await ecrImageExists( + { + imageReference: `${ECR_HOST}/deployments-test/proj_abc:v1.prod.a1b2c3d4`, + registryConfig: ecrConfig, + }, + async () => { + throw new RepositoryNotFoundException({ message: "not found", $metadata: {} }); + } + ); + expect(result).toBe("missing"); + }); + it("queries by digest when a valid digest is supplied", async () => { const digest = `sha256:${"b".repeat(64)}`; let seen: any; From d9d8b6b6259b776cf2639eb4ae505ecb6a78ce69 Mon Sep 17 00:00:00 2001 From: nicktrn <55853254+nicktrn@users.noreply.github.com> Date: Thu, 25 Jun 2026 22:41:15 +0100 Subject: [PATCH 3/3] fix(webapp): retry transient ECR verification failures before failing closed --- .../services/verifyDeploymentImage.server.ts | 47 ++++++++++++++----- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts index 9b625a2b4fa..7062f89d88e 100644 --- a/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts +++ b/apps/webapp/app/v3/services/verifyDeploymentImage.server.ts @@ -4,6 +4,7 @@ import { RepositoryNotFoundException, } from "@aws-sdk/client-ecr"; import { tryCatch } from "@trigger.dev/core"; +import pRetry, { AbortError } from "p-retry"; import { logger } from "~/services/logger.server"; import { type AssumeRoleConfig, @@ -129,19 +130,43 @@ export async function ecrImageExists( imageDigest && SHA256_DIGEST.test(imageDigest.trim()) ? imageDigest.trim() : undefined; const imageId = validDigest ? { imageDigest: validDigest } : { imageTag: parsed.tag }; + const assumeRole = registryConfig.ecrAssumeRoleArn + ? { + roleArn: registryConfig.ecrAssumeRoleArn, + externalId: registryConfig.ecrAssumeRoleExternalId, + } + : undefined; + + // Retry transient ECR failures (throttling/network) before giving up, so a blip + // doesn't fail an otherwise-fine deploy. A missing repo is definitive - don't retry. const [error, response] = await tryCatch( - _send({ - region, - assumeRole: registryConfig.ecrAssumeRoleArn - ? { - roleArn: registryConfig.ecrAssumeRoleArn, - externalId: registryConfig.ecrAssumeRoleExternalId, + pRetry( + () => + _send({ + region, + assumeRole, + registryId: accountId, + repositoryName: parsed.repositoryName, + imageIds: [imageId], + }).catch((err) => { + if (err instanceof RepositoryNotFoundException) { + throw new AbortError(err); } - : undefined, - registryId: accountId, - repositoryName: parsed.repositoryName, - imageIds: [imageId], - }) + throw err; + }), + { + retries: 2, + minTimeout: 200, + maxTimeout: 1000, + onFailedAttempt: (e) => { + logger.warn("Retrying ECR image verification", { + imageReference, + attempt: e.attemptNumber, + error: e.message, + }); + }, + } + ) ); if (error) {