Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .server-changes/verify-deployment-image-before-finalize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: fix
---

Verify a deployment's image exists in the registry before marking it deployed, so a deploy whose image wasn't pushed fails instead of silently breaking runs (can be turned off via `DEPLOY_IMAGE_VERIFICATION_ENABLED=0` for setups that push images out of band)
3 changes: 3 additions & 0 deletions apps/webapp/app/env.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,9 @@ const EnvironmentSchema = z
// log-only mode before enforcement.
DEPRECATE_V3_CLI_DEPLOYS_ENABLED: z.string().default("0"),

// Verify the deploy image exists before promoting. Disable for out-of-band/air-gapped push. ECR only.
DEPLOY_IMAGE_VERIFICATION_ENABLED: BoolEnv.default(true),
Comment thread
nicktrn marked this conversation as resolved.

OBJECT_STORE_BASE_URL: z.string().optional(),
OBJECT_STORE_BUCKET: z.string().optional(),
OBJECT_STORE_ACCESS_KEY_ID: z.string().optional(),
Expand Down
49 changes: 49 additions & 0 deletions apps/webapp/app/v3/services/finalizeDeploymentV2.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import { getEcrAuthToken, isEcrRegistry } from "../getDeploymentImageRef.server"
import { tryCatch } from "@trigger.dev/core";
import { getRegistryConfig, type RegistryConfig } from "../registryConfig.server";
import { ComputeTemplateCreationService } from "./computeTemplateCreation.server";
import { ecrImageExists } from "./verifyDeploymentImage.server";

export class FinalizeDeploymentV2Service extends BaseService {
public async call(
Expand Down Expand Up @@ -75,6 +76,11 @@ export class FinalizeDeploymentV2Service extends BaseService {
});
}

// The CLI claims the image is already in the registry (local build, or a
// self-hosted setup). Verify before promoting so we never mark a
// deployment DEPLOYED when nothing was actually pushed.
await this.#assertImagePullable(deployment, body);

await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer);
return finalizeService.call(authenticatedEnv, id, body);
}
Expand Down Expand Up @@ -142,10 +148,53 @@ export class FinalizeDeploymentV2Service extends BaseService {
pushedImage: pushResult.image,
});

// Belt and suspenders: confirm the push actually landed before promoting.
await this.#assertImagePullable(deployment, body);

await this.#createTemplateIfNeeded(deployment, id, authenticatedEnv, writer);
return finalizeService.call(authenticatedEnv, id, body);
}

async #assertImagePullable(
deployment: { imageReference: string | null; type: string | null },
body: FinalizeDeploymentRequestBody
): Promise<void> {
if (!env.DEPLOY_IMAGE_VERIFICATION_ENABLED) {
return;
}

if (!deployment.imageReference) {
return;
}

const registryConfig = getRegistryConfig(deployment.type === "MANAGED");

// ECR-only: non-ECR (self-hosted) registries can't be checked this way, so skip.
if (!isEcrRegistry(registryConfig.host)) {
return;
}

const result = await ecrImageExists({
imageReference: deployment.imageReference,
imageDigest: body.imageDigest,
registryConfig,
});

if (result === "missing") {
throw new ServiceValidationError(
"Deployment image was not found in the registry. It may not have been pushed (for example a local build without a push, or a push to a different registry). Aborting the deploy to avoid promoting a version that cannot start."
);
}

// Fail closed: if we can't confirm the image is present, don't promote a version
// that might not start. Set DEPLOY_IMAGE_VERIFICATION_ENABLED=0 for out-of-band pushes.
if (result === "unknown") {
throw new ServiceValidationError(
"Could not verify the deployment image exists in the registry. Aborting the deploy."
);
}
Comment thread
nicktrn marked this conversation as resolved.
}

async #createTemplateIfNeeded(
deployment: { imageReference: string | null; worker: { project: { id: string } } | null },
deploymentFriendlyId: string,
Expand Down
187 changes: 187 additions & 0 deletions apps/webapp/app/v3/services/verifyDeploymentImage.server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
import {
BatchGetImageCommand,
type BatchGetImageCommandOutput,
RepositoryNotFoundException,
} from "@aws-sdk/client-ecr";
import { tryCatch } from "@trigger.dev/core";
import pRetry, { AbortError } from "p-retry";
import { logger } from "~/services/logger.server";
import {
type AssumeRoleConfig,
createEcrClient,
isEcrRegistry,
parseEcrRegistryDomain,
} from "../getDeploymentImageRef.server";
import { type RegistryConfig } from "../registryConfig.server";

const SHA256_DIGEST = /^sha256:[a-f0-9]{64}$/;

export type ImageLookupResult = "found" | "missing" | "unknown";

/**
* Split a stored ECR image reference into repository + tag.
*
* Trust boundary: the ref is platform-generated, but we still bind the lookup to
* our configured host (region/account come from the env host) and only parse refs
* that sit under it. Returns null otherwise.
*/
export function parseEcrImageReference(
imageReference: string,
registryHost: string
): { repositoryName: string; tag: string } | null {
const prefix = `${registryHost}/`;
if (!imageReference.startsWith(prefix)) {
return null;
}

// namespace/projectRef:tag, optionally @sha256:... which we drop here
const remainder = imageReference.slice(prefix.length).split("@")[0];
const lastColon = remainder.lastIndexOf(":");

if (lastColon <= 0) {
return null;
}

const repositoryName = remainder.slice(0, lastColon);
const tag = remainder.slice(lastColon + 1);

if (!repositoryName || !tag || tag.includes("/")) {
return null;
}

return { repositoryName, tag };
}

export function interpretBatchGetImageResponse(
response: BatchGetImageCommandOutput
): ImageLookupResult {
if (response.images && response.images.length > 0) {
return "found";
}

if (response.failures?.some((failure) => failure.failureCode === "ImageNotFound")) {
return "missing";
}

// No image and no explicit not-found failure (some other failure code) -
// we can't say it's missing, so don't block the deploy on it.
return "unknown";
}

type BatchGetImageInput = {
region: string;
assumeRole?: AssumeRoleConfig;
registryId?: string;
repositoryName: string;
imageIds: { imageTag?: string; imageDigest?: string }[];
};

type BatchGetImageSender = (input: BatchGetImageInput) => Promise<BatchGetImageCommandOutput>;

const sendBatchGetImage: BatchGetImageSender = async ({
region,
assumeRole,
registryId,
repositoryName,
imageIds,
}) => {
const ecr = await createEcrClient({ region, assumeRole });
// No acceptedMediaTypes: only the single-manifest types are valid enum values, and
// we only care whether the image exists, not its manifest format.
return ecr.send(new BatchGetImageCommand({ repositoryName, registryId, imageIds }));
};

/**
* Pre-promotion backstop: check the deployment image actually exists in ECR.
*
* "found"/"missing" are definitive (a nonexistent repo counts as missing).
* "unknown" means we couldn't determine it - non-ECR registry, unparseable ref, or
* an API error; the caller decides what to do with each. `_send` is a test seam.
*/
export async function ecrImageExists(
{
imageReference,
imageDigest,
registryConfig,
}: {
imageReference: string;
imageDigest?: string;
registryConfig: RegistryConfig;
},
_send: BatchGetImageSender = sendBatchGetImage
): Promise<ImageLookupResult> {
if (!isEcrRegistry(registryConfig.host)) {
return "unknown";
}

const parsed = parseEcrImageReference(imageReference, registryConfig.host);

if (!parsed) {
logger.warn("Could not parse deployment image reference for verification", { imageReference });
return "unknown";
}

const { accountId, region } = parseEcrRegistryDomain(registryConfig.host);

// imageDigest is supplied by the CLI request body - validate before trusting it.
// Prefer it when valid (catches a tag that resolves to a different image), else
// fall back to the platform-generated tag.
const validDigest =
imageDigest && SHA256_DIGEST.test(imageDigest.trim()) ? imageDigest.trim() : undefined;
const imageId = validDigest ? { imageDigest: validDigest } : { imageTag: parsed.tag };

const assumeRole = registryConfig.ecrAssumeRoleArn
? {
roleArn: registryConfig.ecrAssumeRoleArn,
externalId: registryConfig.ecrAssumeRoleExternalId,
}
: undefined;

// Retry transient ECR failures (throttling/network) before giving up, so a blip
// doesn't fail an otherwise-fine deploy. A missing repo is definitive - don't retry.
const [error, response] = await tryCatch(
pRetry(
() =>
_send({
region,
assumeRole,
registryId: accountId,
repositoryName: parsed.repositoryName,
imageIds: [imageId],
}).catch((err) => {
if (err instanceof RepositoryNotFoundException) {
throw new AbortError(err);
}
throw err;
}),
{
retries: 2,
minTimeout: 200,
maxTimeout: 1000,
onFailedAttempt: (e) => {
logger.warn("Retrying ECR image verification", {
imageReference,
attempt: e.attemptNumber,
error: e.message,
});
},
}
)
);

if (error) {
// A missing repo is a definitive miss, not an ambiguous error.
if (error instanceof RepositoryNotFoundException) {
return "missing";
}

logger.error("Failed to verify deployment image in ECR", {
imageReference,
repositoryName: parsed.repositoryName,
error: error.message,
});
return "unknown";
}
Comment thread
nicktrn marked this conversation as resolved.

return interpretBatchGetImageResponse(response);
}
Loading