diff --git a/CHANGELOG.md b/CHANGELOG.md index 689718d36..e6d3a2c05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353) +- [EE] Added a context-window usage gauge to the Ask Sourcebot chat details, showing how much of the selected model's context window each turn occupies. Window sizes are resolved from the models.dev catalog. [#1370](https://github.com/sourcebot-dev/sourcebot/pull/1370) ### Fixed - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367) diff --git a/packages/web/src/app/api/(server)/ee/chat/route.ts b/packages/web/src/app/api/(server)/ee/chat/route.ts index 0f20ee8e3..cbb11f06e 100644 --- a/packages/web/src/app/api/(server)/ee/chat/route.ts +++ b/packages/web/src/app/api/(server)/ee/chat/route.ts @@ -6,6 +6,7 @@ import { additionalChatRequestParamsSchema } from "@/features/chat/types"; import { getLanguageModelKey } from "@/features/chat/utils"; import { checkAskEntitlement, getConfiguredLanguageModels, isOwnerOfChat, updateChatMessages } from "@/features/chat/utils.server"; import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server"; +import { resolveContextWindow } from "@/features/chat/modelContextWindow.server"; import { apiHandler } from "@/lib/apiHandler"; import { ErrorCode } from "@/lib/errorCodes"; import { captureEvent } from "@/lib/posthog"; @@ -89,6 +90,11 @@ export const POST = apiHandler(async (req: NextRequest) => { const { model, providerOptions, temperature } = await getAISDKLanguageModelAndOptions(languageModelConfig); + // Total context window for the selected model, used as the + // denominator for the UI's context-usage gauge. Undefined when + // unknown (e.g. self-hosted models). + const contextWindow = await resolveContextWindow(languageModelConfig); + // No-op for non-Anthropic providers / when caching is disabled, so // it never perturbs other providers' requests. const promptCacheStrategy = getPromptCacheStrategy( @@ -139,6 +145,7 @@ export const POST = apiHandler(async (req: NextRequest) => { disabledMcpServerIds, model, modelName: languageModelConfig.displayName ?? languageModelConfig.model, + contextWindow, promptCacheStrategy, modelProviderOptions: providerOptions, modelTemperature: temperature, diff --git a/packages/web/src/ee/features/chat/agent.ts b/packages/web/src/ee/features/chat/agent.ts index d2f3a4761..8f5daa749 100644 --- a/packages/web/src/ee/features/chat/agent.ts +++ b/packages/web/src/ee/features/chat/agent.ts @@ -54,6 +54,7 @@ interface CreateMessageStreamResponseProps { disabledMcpServerIds?: string[]; model: AISDKLanguageModelV3; modelName: string; + contextWindow?: number; promptCacheStrategy: PromptCacheStrategy; onFinish: UIMessageStreamOnFinishCallback; onError: (error: unknown) => string; @@ -73,6 +74,7 @@ export const createMessageStream = async ({ disabledMcpServerIds, model, modelName, + contextWindow, promptCacheStrategy, modelProviderOptions, modelTemperature, @@ -279,6 +281,7 @@ export const createMessageStream = async ({ // phases so earlier phases' steps are preserved in order. stepTokenUsage: [...(priorMetadata?.stepTokenUsage ?? []), ...stepTokenUsage], modelName, + contextWindow, traceId, } }); diff --git a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx index e95af69d4..7b6c7867f 100644 --- a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx +++ b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx @@ -86,6 +86,18 @@ const DetailsCardComponent = ({ ? Math.round((cacheReadTokens / inputTokens) * 100) : 0; + // Context-window usage gauge. "In use" is the input the model saw on its + // most recent step — i.e. the full accumulated prompt occupying the window + // right now — not the cumulative totalInputTokens. + const stepTokenUsage = metadata?.stepTokenUsage; + const currentContextTokens = stepTokenUsage && stepTokenUsage.length > 0 + ? stepTokenUsage[stepTokenUsage.length - 1].inputTokens + : undefined; + const contextWindow = metadata?.contextWindow; + const contextUsagePercent = currentContextTokens !== undefined && contextWindow !== undefined && contextWindow > 0 + ? Math.min(100, Math.round((currentContextTokens / contextWindow) * 100)) + : undefined; + const handleExpandedChanged = useCallback((next: boolean) => { captureEvent('wa_chat_details_card_toggled', { chatId, isExpanded: next }); onExpandedChanged(next); @@ -193,6 +205,23 @@ const DetailsCardComponent = ({ )} )} + {contextUsagePercent !== undefined && currentContextTokens !== undefined && contextWindow !== undefined && ( + + +
+ +
+
+ +
+ The most recent step's prompt used {currentContextTokens.toLocaleString()} of the model's {contextWindow.toLocaleString()}-token context window ({contextUsagePercent}%). +
+
+
+ )} {metadata?.totalResponseTimeMs && (
@@ -367,6 +396,61 @@ const StepTokenUsage = ({ usage, label = 'step' }: { usage: StepTokenUsageEntry, ); } + +const CONTEXT_USAGE_YELLOW_PERCENT = 50; +const CONTEXT_USAGE_RED_PERCENT = 80; + +const getContextUsageColorClass = (percent: number): string => { + if (percent >= CONTEXT_USAGE_RED_PERCENT) { + return "text-red-500"; + } + if (percent >= CONTEXT_USAGE_YELLOW_PERCENT) { + return "text-yellow-500"; + } + return "text-[#6cb38f]"; +}; + +const ContextWindowGauge = ({ total, percent }: { total: number, percent: number }) => { + const size = 14; + const strokeWidth = 2; + const radius = (size - strokeWidth) / 2; + const circumference = 2 * Math.PI * radius; + const dashOffset = circumference * (1 - Math.min(100, percent) / 100); + const colorClass = getContextUsageColorClass(percent); + + return ( +
+ + {/* Neutral gray track. */} + + {/* Progress arc. */} + + + {percent}% + of {getShortenedNumberDisplayString(total, 0).toUpperCase()} +
+ ); +} + type GuardedToolType = | 'tool-read_file' | 'tool-grep' diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts index 4b7cfb7b0..e2b234037 100644 --- a/packages/web/src/ee/features/mcp/askCodebase.ts +++ b/packages/web/src/ee/features/mcp/askCodebase.ts @@ -2,6 +2,7 @@ import { sew } from "@/middleware/sew"; import { getConfiguredLanguageModels, updateChatMessages, checkAskEntitlement } from "@/features/chat/utils.server"; import { generateChatNameFromMessage } from "@/ee/features/chat/llm.server"; import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server"; +import { resolveContextWindow } from "@/features/chat/modelContextWindow.server"; import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types"; import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils"; import { ErrorCode } from "@/lib/errorCodes"; @@ -84,6 +85,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise models.dev top-level catalog key. Only providers +// whose Sourcebot id differs from the models.dev id need an entry; everything +// else (anthropic, openai, azure, amazon-bedrock, mistral, deepseek, xai, +// openrouter, google-vertex, google-vertex-anthropic) matches 1:1. +const PROVIDER_ID_OVERRIDES: Record = { + 'google-generative-ai': 'google', +}; + +type ModelsDevModel = { + id: string; + limit?: { + context?: number; + output?: number; + }; +}; + +type ModelsDevProvider = { + id: string; + models?: Record; +}; + +export type ModelsDevCatalog = Record; + +// Last successfully-fetched catalog. Served while fresh, and kept as a fallback +// when a later refresh fails. `catalogFetchedAt` is when it was fetched (TTL), +// `lastFailedAt` the most recent fetch failure (negative-cache backoff), and +// `inFlightFetch` dedupes concurrent fetches. +let cachedCatalog: ModelsDevCatalog | null = null; +let catalogFetchedAt = 0; +let lastFailedAt = 0; +let inFlightFetch: Promise | null = null; + +const fetchCatalog = async (): Promise => { + try { + const response = await fetch(MODELS_DEV_API_URL, { + signal: AbortSignal.timeout(FETCH_TIMEOUT_MS), + }); + if (!response.ok) { + logger.warn(`Failed to fetch models.dev catalog: ${response.status} ${response.statusText}`); + return null; + } + return await response.json() as ModelsDevCatalog; + } catch (error) { + logger.warn(`Failed to fetch models.dev catalog: ${error}`); + return null; + } +}; + +const loadCatalog = async (): Promise => { + const now = Date.now(); + const isFresh = cachedCatalog !== null && now - catalogFetchedAt <= CATALOG_TTL_MS; + const isBackingOff = now - lastFailedAt < NEGATIVE_CACHE_MS; + + // Kick off a (deduped) refresh when the cache is stale/empty and we're not + // within the post-failure backoff window. On success it replaces the cache; + // on failure it only records the failure time, leaving the last-known-good + // catalog intact. + if (!isFresh && !isBackingOff && !inFlightFetch) { + inFlightFetch = fetchCatalog().then((catalog) => { + if (catalog) { + cachedCatalog = catalog; + catalogFetchedAt = Date.now(); + } else { + lastFailedAt = Date.now(); + } + inFlightFetch = null; + return catalog; + }); + } + + // Once a catalog has loaded once, never block the request path on the + // network: serve the last-known-good value (even if stale) and let any + // refresh settle in the background. Only the very first load awaits. + if (cachedCatalog !== null) { + return cachedCatalog; + } + return inFlightFetch ?? null; +}; + +/** + * Pure lookup of a model's context window in a models.dev catalog. Separated + * from the network fetch so it can be unit-tested directly. + * + * Returns the total context window in tokens, or `undefined` when the model + * isn't catalogued or has no usable window. + */ +export const lookupContextWindow = ( + catalog: ModelsDevCatalog | null, + config: Pick, +): number | undefined => { + if (!catalog) { + return undefined; + } + const providerId = PROVIDER_ID_OVERRIDES[config.provider] ?? config.provider; + const context = catalog[providerId]?.models?.[config.model]?.limit?.context; + // `limit` is schema-optional, and models.dev reports a 0 context window for + // non-text models (image/audio/etc.). Treat both as "unknown" so the UI + // gracefully omits the gauge rather than rendering a bogus denominator. + return typeof context === 'number' && context > 0 ? context : undefined; +}; + +export const resolveContextWindow = async ( + config: Pick, +): Promise => { + const catalog = await loadCatalog(); + return lookupContextWindow(catalog, config); +}; diff --git a/packages/web/src/features/chat/modelContextWindow.test.ts b/packages/web/src/features/chat/modelContextWindow.test.ts new file mode 100644 index 000000000..9476820ae --- /dev/null +++ b/packages/web/src/features/chat/modelContextWindow.test.ts @@ -0,0 +1,162 @@ +import { afterEach, describe, expect, test, vi } from 'vitest'; +import type { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type'; + +vi.mock('server-only', () => ({ default: vi.fn() })); + +vi.mock('@sourcebot/shared', () => ({ + createLogger: () => ({ + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + }), +})); + +import { lookupContextWindow, resolveContextWindow, type ModelsDevCatalog } from './modelContextWindow.server'; + +const catalog: ModelsDevCatalog = { + anthropic: { + id: 'anthropic', + models: { + 'claude-sonnet-4-5': { id: 'claude-sonnet-4-5', limit: { context: 200000, output: 64000 } }, + }, + }, + // models.dev keys Gemini under 'google', whereas Sourcebot's provider id is + // 'google-generative-ai' — exercises PROVIDER_ID_OVERRIDES. + google: { + id: 'google', + models: { + 'gemini-2.5-pro': { id: 'gemini-2.5-pro', limit: { context: 1048576, output: 65536 } }, + }, + }, + openai: { + id: 'openai', + models: { + 'gpt-4.1': { id: 'gpt-4.1', limit: { context: 1047576 } }, + // Non-text model: models.dev reports a 0 context window. + 'gpt-image-1': { id: 'gpt-image-1', limit: { context: 0, output: 0 } }, + // Catalogued model with no `limit` object at all. + 'no-limit-model': { id: 'no-limit-model' }, + }, + }, +}; + +const model = (provider: string, modelId: string) => + ({ provider, model: modelId }) as Pick; + +describe('lookupContextWindow', () => { + test('returns the context window for a direct provider/model hit', () => { + expect(lookupContextWindow(catalog, model('anthropic', 'claude-sonnet-4-5'))).toBe(200000); + expect(lookupContextWindow(catalog, model('openai', 'gpt-4.1'))).toBe(1047576); + }); + + test('maps google-generative-ai to the catalog\'s google key', () => { + expect(lookupContextWindow(catalog, model('google-generative-ai', 'gemini-2.5-pro'))).toBe(1048576); + }); + + test('returns undefined for an uncatalogued provider', () => { + expect(lookupContextWindow(catalog, model('mistral', 'mistral-large-latest'))).toBeUndefined(); + }); + + test('returns undefined for an uncatalogued model id (e.g. openai-compatible / self-hosted)', () => { + expect(lookupContextWindow(catalog, model('openai-compatible', 'my-local-model'))).toBeUndefined(); + expect(lookupContextWindow(catalog, model('anthropic', 'claude-unknown'))).toBeUndefined(); + }); + + test('treats a 0 context window (non-text models) as unknown', () => { + expect(lookupContextWindow(catalog, model('openai', 'gpt-image-1'))).toBeUndefined(); + }); + + test('treats a missing limit object as unknown', () => { + expect(lookupContextWindow(catalog, model('openai', 'no-limit-model'))).toBeUndefined(); + }); + + test('returns undefined when the catalog is null (fetch failed / unreachable)', () => { + expect(lookupContextWindow(null, model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined(); + }); +}); + +describe('resolveContextWindow', () => { + afterEach(() => { + vi.unstubAllGlobals(); + }); + + test('fetches the catalog once and resolves windows (incl. provider mapping)', async () => { + const fetchMock = vi.fn(async () => ({ + ok: true, + json: async () => catalog, + }) as unknown as Response); + vi.stubGlobal('fetch', fetchMock); + + expect(await resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000); + // Subsequent lookups reuse the cached catalog rather than refetching. + expect(await resolveContextWindow(model('google-generative-ai', 'gemini-2.5-pro'))).toBe(1048576); + expect(await resolveContextWindow(model('openai-compatible', 'my-local-model'))).toBeUndefined(); + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); +}); + +describe('resolveContextWindow resilience', () => { + afterEach(() => { + vi.unstubAllGlobals(); + vi.restoreAllMocks(); + vi.resetModules(); + }); + + // Re-import the module so each scenario starts with fresh internal cache state. + const importFresh = async () => { + vi.resetModules(); + return await import('./modelContextWindow.server'); + }; + + test('negative-caches failures instead of refetching on every call', async () => { + const fetchMock = vi.fn(async () => ({ + ok: false, + status: 503, + statusText: 'Service Unavailable', + }) as unknown as Response); + vi.stubGlobal('fetch', fetchMock); + + const mod = await importFresh(); + + expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined(); + expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined(); + expect(await mod.resolveContextWindow(model('openai', 'gpt-4.1'))).toBeUndefined(); + + // Only the first attempt hit the network; the rest were short-circuited + // by the negative-cache window, so chat sends don't repeatedly block. + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + test('preserves the last-known-good catalog when a refresh fails', async () => { + let nowMs = 1_700_000_000_000; + vi.spyOn(Date, 'now').mockImplementation(() => nowMs); + + let shouldFail = false; + const fetchMock = vi.fn(async () => (shouldFail + ? { ok: false, status: 503, statusText: 'Service Unavailable' } + : { ok: true, json: async () => catalog }) as unknown as Response); + vi.stubGlobal('fetch', fetchMock); + + const mod = await importFresh(); + + // First load populates the cache. + expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000); + expect(fetchMock).toHaveBeenCalledTimes(1); + + // Advance past the TTL and make every refresh fail. + nowMs += 7 * 60 * 60 * 1000; + shouldFail = true; + + // Stale-while-revalidate: serves the cached value and refreshes in the + // background (which fails). + expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000); + // Let the background refresh settle. + await new Promise((resolve) => setTimeout(resolve, 0)); + expect(fetchMock).toHaveBeenCalledTimes(2); + + // The failed refresh must not have discarded the good catalog. + expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000); + }); +}); diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts index 38a737a09..dc0758462 100644 --- a/packages/web/src/features/chat/types.ts +++ b/packages/web/src/features/chat/types.ts @@ -59,6 +59,7 @@ export const sbChatMessageMetadataSchema = z.object({ totalCacheReadTokens: z.number().optional(), totalCacheWriteTokens: z.number().optional(), totalResponseTimeMs: z.number().optional(), + contextWindow: z.number().optional(), feedback: z.array(z.object({ type: z.enum(['like', 'dislike']), timestamp: z.string(), // ISO date string