diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ed74cd94..4f12f6d0 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -112,6 +112,8 @@ jobs: env: RUNS: ${{ matrix.runs }} WARMUPS: ${{ matrix.warmups }} + SCENARIO_PROFILE: ${{ matrix.profile }} + STARTUP_RUNS: "3" CPUS: ${{ matrix.cpus }} MEMORY: ${{ matrix.memory }} run: | @@ -125,6 +127,8 @@ jobs: --target . \ --runs "${{ matrix.runs }}" \ --warmups "${{ matrix.warmups }}" \ + --scenario-profile "${{ matrix.profile }}" \ + --startup-runs 3 \ --tmp-dir "/tmp/codeclone-bench-${{ matrix.label }}" \ --output "$BENCH_JSON" @@ -143,21 +147,39 @@ jobs: raise SystemExit(1) payload = json.loads(report_path.read_text(encoding="utf-8")) + startup_probes = payload.get("startup_probes", []) scenarios = payload.get("scenarios", []) comparisons = payload.get("comparisons", {}) print("CodeClone benchmark summary") print(f"label={os.environ.get('RUNNER_OS','unknown').lower()} / {os.environ.get('GITHUB_JOB','benchmark')}") + if startup_probes: + print("startup probes:") + for probe in startup_probes: + name = str(probe.get("name", "unknown")) + stats = probe.get("stats_seconds", {}) + cpu_stats = probe.get("child_cpu_stats_seconds", {}) + print( + f"- {name:22s} median={float(stats.get('median', 0.0)):.4f}s " + f"first={float(probe.get('first_seconds', 0.0)):.4f}s " + f"cpu={float(cpu_stats.get('median', 0.0)):.4f}s" + ) for scenario in scenarios: name = str(scenario.get("name", "unknown")) stats = scenario.get("stats_seconds", {}) + cpu_stats = scenario.get("child_cpu_stats_seconds", {}) + inventory = scenario.get("inventory_sample", {}) median = float(stats.get("median", 0.0)) p95 = float(stats.get("p95", 0.0)) stdev = float(stats.get("stdev", 0.0)) digest = str(scenario.get("digest", "")) print( f"- {name:16s} median={median:.4f}s " - f"p95={p95:.4f}s stdev={stdev:.4f}s digest={digest}" + f"p95={p95:.4f}s stdev={stdev:.4f}s " + 
f"cpu={float(cpu_stats.get('median', 0.0)):.4f}s " + f"files={inventory.get('analyzed', 0)}/{inventory.get('cached', 0)} " + f"artifacts={float(scenario.get('artifact_total_kib_sample', 0.0)):.1f}KiB " + f"exit={scenario.get('exit_code_counts', {})} digest={digest}" ) if comparisons: @@ -174,24 +196,57 @@ jobs: "", f"- Tool: `{payload['tool']['name']} {payload['tool']['version']}`", f"- Target: `{payload['config']['target']}`", + f"- Scenario profile: `{payload['config'].get('scenario_profile', 'smoke')}`", f"- Runs: `{payload['config']['runs']}`", f"- Warmups: `{payload['config']['warmups']}`", + f"- Startup runs: `{payload['config'].get('startup_runs', 0)}`", f"- Generated: `{payload['generated_at_utc']}`", "", - "### Scenarios", - "", - "| Scenario | Median (s) | p95 (s) | Stdev (s) | Deterministic | Digest |", - "|---|---:|---:|---:|:---:|---|", ] + if startup_probes: + lines.extend( + [ + "### Startup / Import Probes", + "", + "| Probe | Median (s) | First (s) | CPU Median (s) |", + "|---|---:|---:|---:|", + ] + ) + for probe in startup_probes: + stats = probe.get("stats_seconds", {}) + cpu_stats = probe.get("child_cpu_stats_seconds", {}) + lines.append( + "| " + f"{probe.get('name', '')} | " + f"{float(stats.get('median', 0.0)):.4f} | " + f"{float(probe.get('first_seconds', 0.0)):.4f} | " + f"{float(cpu_stats.get('median', 0.0)):.4f} |" + ) + lines.append("") + + lines.extend( + [ + "### Scenarios", + "", + "| Scenario | Median (s) | p95 (s) | CPU Median (s) | Files A/C | Artifacts KiB | Exit | Deterministic | Digest |", + "|---|---:|---:|---:|---:|---:|---|:---:|---|", + ] + ) + for scenario in scenarios: stats = scenario.get("stats_seconds", {}) + cpu_stats = scenario.get("child_cpu_stats_seconds", {}) + inventory = scenario.get("inventory_sample", {}) lines.append( "| " f"{scenario.get('name', '')} | " f"{float(stats.get('median', 0.0)):.4f} | " f"{float(stats.get('p95', 0.0)):.4f} | " - f"{float(stats.get('stdev', 0.0)):.4f} | " + 
f"{float(cpu_stats.get('median', 0.0)):.4f} | " + f"{inventory.get('analyzed', 0)}/{inventory.get('cached', 0)} | " + f"{float(scenario.get('artifact_total_kib_sample', 0.0)):.1f} | " + f"{scenario.get('exit_code_counts', {})} | " f"{'yes' if bool(scenario.get('deterministic')) else 'no'} | " f"{scenario.get('digest', '')} |" ) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index acba6ccd..9bf842db 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -39,7 +39,7 @@ jobs: uses: actions/configure-pages@v5 - name: Build docs site - run: uv run --with zensical==0.0.43 zensical build --clean --strict + run: uv run --with zensical==0.0.46 zensical build --clean --strict - name: Generate sample report artifacts run: uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live diff --git a/.github/workflows/validation-corpus.yml b/.github/workflows/validation-corpus.yml new file mode 100644 index 00000000..540531a2 --- /dev/null +++ b/.github/workflows/validation-corpus.yml @@ -0,0 +1,92 @@ +name: validation-corpus +run-name: validation corpus • ${{ github.event_name }} • ${{ github.ref_name }} + +on: + push: + branches: [ "**" ] + pull_request: + workflow_dispatch: + inputs: + tier: + description: Corpus tier + required: true + default: all + type: choice + options: + - smoke + - gates + - full + - all + corpus-ref: + description: codeclone-validation-corpus ref + required: true + default: main + +permissions: + contents: read + +concurrency: + group: validation-corpus-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + validation-corpus: + name: corpus + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - name: Checkout CodeClone + uses: actions/checkout@v6.0.2 + + - name: Checkout validation corpus + uses: actions/checkout@v6.0.2 + with: + repository: orenlab/codeclone-validation-corpus + ref: ${{ github.event_name == 'workflow_dispatch' && 
inputs.corpus-ref || 'main' }} + path: validation-corpus + + - name: Set up Python + uses: actions/setup-python@v6.2.0 + with: + python-version: "3.14" + allow-prereleases: true + + - name: Set up uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Install CodeClone from this checkout + run: uv sync --all-extras + + - name: Install validation corpus dependencies + run: uv sync --project validation-corpus + + - name: Resolve corpus tier + shell: bash + run: | + tier="all" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + tier="${{ inputs.tier }}" + fi + echo "CORPUS_TIER=$tier" >> "$GITHUB_ENV" + + - name: Run validation corpus + run: | + uv run --project validation-corpus python -m corpus_tools.cli \ + --tier "$CORPUS_TIER" \ + --codeclone-command "$GITHUB_WORKSPACE/.venv/bin/python -m codeclone.main" \ + --work-root "$RUNNER_TEMP/codeclone-validation-corpus-work" + + - name: Write summary + if: always() + shell: bash + run: | + { + echo "## CodeClone validation corpus" + echo + echo "- Tier: \`${CORPUS_TIER:-unknown}\`" + echo "- Corpus: \`orenlab/codeclone-validation-corpus\`" + echo "- CodeClone source: current checkout" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.gitignore b/.gitignore index 4d93ff6f..67f79c9c 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,5 @@ extensions/vscode-codeclone/node_modules /coverage.json /benchmarks/memory_semantic_eval.md /scripts/commit_memory_phases.sh +/extensions/jetbrains-codeclone/ +/issues/ diff --git a/AGENTS.md b/AGENTS.md index 17ba6abd..9fec7fa1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -156,7 +156,7 @@ If you touched baseline/cache/report contracts or CLI/MCP audit surfaces, also e If you touched `docs/`, `zensical.toml`, docs publishing workflow, or sample-report generation, also run: ```bash -uv run --with zensical==0.0.43 zensical build --clean --strict +uv run --with zensical==0.0.46 zensical build --clean --strict ``` If you touched the MCP surface, also run: diff --git 
a/CHANGELOG.md b/CHANGELOG.md index f5ae6599..72a5f150 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,197 +1,78 @@ -Changelog - -[2.1.0a1] - Unreleased - -2.1.0a1 opens the CodeClone 2.1 alpha line with intent-first structural -change control, Engineering Memory, trajectory and experience layers, semantic -retrieval, Platform Observability, native agent integrations, and a reorganized -documentation site. - -Added - -* Structural Change Controller. The new - start_controlled_change / finish_controlled_change workflow reduces the - governed agent edit cycle from 7–11 MCP calls to 3–4. It combines workspace - checks, intent declaration, blast-radius mapping, bounded edit scope, patch - verification, review-claim validation, and deterministic review receipts. - CodeClone now exposes 33 agent-visible MCP tools by default. -* Live Implementation Context. The new read-only - get_implementation_context tool projects bounded structural facts for - repo-relative paths from one existing run. It reports workspace freshness, - cache origin, imports/importers, public surface, blast radius, and test - anchors, with separate deterministic digests for the off-report context - artifact and the exact bounded projection. Active intents add explicit - allowed/review/do-not-touch boundaries, while impact mode adds transitive - dependency context and baseline-sensitive findings. Engineering Memory, - tests, docs, trajectories, and Experiences remain lane-separated evidence. - Exact qualname subjects resolve through an off-report Unit and API-surface - location index, with unknown symbols reported explicitly rather than guessed. - Zero-argument queries now resolve active intent scope or bounded live dirty - paths, related module roles collapse with explicit relation tags, and one - safety-first global budget reports all ordinary and safety omissions. 
- Cache schema 2.9 adds a separate, rebuildable per-function relationship-fact - projection without changing Unit serialization or canonical report identity. - Cross-module calls and resolved non-call references are now attributed to - their caller with production/test lanes; conservative caller-scope shadow - guards keep ambiguous imported names as unresolved call observations. - Intra-module functions, same-module class methods, and self/cls receiver - methods now resolve against the enclosing module and class (keyed on the - actual first-parameter name, never a hardcoded self, and never for - staticmethods), only when the target definition exists; cache schema 2.10. - Per-function relationship facts now aggregate across files (cold and cached) - onto the analysis result and the MCP run record, off the canonical report. - get_implementation_context now projects call_context (callers, callees, - references, test_callers) from those facts with relation_kind x - resolution_status evidence tags, separate production and test caller lanes, - unresolved call observations, and a complete/partial/unavailable - call_graph_status; relationship records are bound into context_artifact_digest. - contract mode returns a truth-map (definition_sites, version_constants, - contract_tests, memory_conflicts) and persistence/serialization path callers - that are emitted only with a typed or memory-backed anchor and are otherwise - not_available rather than name- or directory-guessed. - Context evidence never authorizes edits; edit_allowed remains authoritative. -* Change-intent lifecycle and multi-agent coordination. - manage_change_intent supports declare, check, clear, queue, promote, and - recover operations. Renewable leases, ownership classification, optional - SQLite coordination, retention, workspace hygiene, and recoverable-intent - handling make concurrent agent work explicit and auditable. -* Engineering Memory. 
A local SQLite knowledge graph stores typed, - evidence-linked repository facts such as contracts, decisions, risks, test - anchors, prior changes, and git provenance. Agents receive ranked, - scope-aware context through get_relevant_memory and - query_engineering_memory; drafts remain human-governed and can be approved - through the CLI or VS Code Memory view. Memory never authorizes edits or - overrides the canonical report, gates, or Patch Trail. -* Trajectory Memory and Patch Trail. Audit-derived trajectories preserve - agent workflows, declared scope, actual changed paths, verification outcomes, - incidents, citations, and review evidence. The current trajectory-v3 - projection adds quality passports, complexity scoring, anomaly detection, - agent profiles, dashboards, semantic retrieval, and deterministic Patch Trail - summaries. Engineering Memory schema 1.7 persists trajectory and Patch - Trail evidence. -* Experience Layer. Deterministic experience-v1 patterns are distilled - from canonical trajectories across all outcomes and exposed through a - separate advisory retrieval lane. Experiences retain supporting evidence and - agent-diversity facets, but never become authority automatically; - promote_experience creates a human-governed memory draft. -* Semantic memory retrieval. Optional LanceDB-backed hybrid search combines - FTS5/BM25 and vector retrieval using deterministic Reciprocal Rank Fusion. - Local embeddings are available through codeclone[semantic-local] with - BAAI/bge-small-en-v1.5. Semantic indexing is lazy, failure-tolerant, and - eventually consistent rather than synchronously rebuilt after every finish. -* Platform Observability. Opt-in, development-only telemetry traces - CodeClone’s own CLI, MCP, analysis, database, semantic-index, and projection - worker activity. The local observer captures timings, RSS/CPU, MCP payload and - token pressure, DB query counts and shapes, causal worker chains, and costly - no-ops. 
JSON/HTML views provide a diagnostic cockpit, while - query_platform_observability exposes bounded MCP sections for development - agents. Observability never affects reports, gates, baselines, memory facts, - or edit authorization. -* IDE and agent integrations. The VS Code extension gains Engineering - Memory governance, trajectory dashboards, controller audit views, and - workspace session statistics. Native integrations are available for Claude - Desktop, Claude Code, Codex, and Cursor. Claude Code now has a dedicated - marketplace plugin and storefront, separate from the Desktop `.mcpb` bundle. - The Cursor plugin includes skills, rules, fail-closed preToolUse enforcement, - scoped workspace-intent checks, and a structural-review agent. -* Controller and diagnostic CLI surfaces. Added blast-radius, patch - verification, session statistics, controller audit, memory trajectory, - anomaly, agent-profile, semantic-search, and Platform Observability commands. -* Documentation and edition model. Documentation is reorganized into a - thematic 00–26 contract book with unified integration guides, dedicated - chapters for the Controller, Engineering Memory, trajectories, Experiences, - and Platform Observability, plus explicit Open Source / Team / Enterprise - retention and capability tiers. -* MCP schemas now include parameter-level descriptions and deterministic - next_tool guidance. Workspace hygiene warnings, audit events, token-budget - tracking, and documentation-contract linting were also added. -* **Corpus Analytics (intent lane, Slice 1).** Optional offline clustering of - historical change-control intents via `codeclone analytics …`. - Requires `codeclone[analytics]`. 
Reads audit + trajectory (+ optional registry - overlay), writes SQLite/LanceDB artifacts under `.codeclone/analytics/`, and - exports inspectable JSON/HTML with sweep comparison, cluster diagnostics, - noise exploration, explicit heuristic recommendation vs maintainer selection, - and runtime observability spans. Analytics embeddings and their lifecycle are - separate from the Engineering Memory semantic index; - `[tool.codeclone.analytics]` configures paths and clustering defaults. -* **Corpus Analytics interpretability (Slice 1.1).** JSON export schema `1.2` - and the self-contained HTML report now separate formal technical validity - from human interpretation. Valid runs expose dominant-cluster ratios, - bounded representative/boundary/noise previews, numeric summaries, - categorical correlations, small-cluster provenance completeness, and - explicit preview disclosure. Invalid and failed runs remain inspectable in a - limited diagnostic mode without partition metrics, previews, score, or rank; - sweep comparison includes every persisted candidate. Representation contract - `3` materializes explicit trajectory, Patch Trail, and registry-overlay - presence facts for new snapshots without adding live registry state to source +# Changelog + +## [2.1.0a1] - Unreleased + +CodeClone 2.1 introduces intent-first structural change control, persistent engineering context, agent workflow +evidence, platform self-observability, and broader IDE/agent integration. + +### Added + +- **Structural Change Controller** with `start_controlled_change` / `finish_controlled_change`, bounded edit scope, + blast-radius checks, patch verification, claim validation, multi-agent intent coordination, and deterministic review + receipts. +- **Live Implementation Context** via `get_implementation_context`, including bounded structural context, call + relationships, contract-oriented truth maps, freshness, test anchors, and active intent boundaries. 
Context remains + read-only and never authorizes edits. +- **Engineering Memory**, **Trajectory Memory**, **Patch Trail**, and **Experience Layer** for typed repository + knowledge, historical agent workflows, change evidence, reusable patterns, and human-governed promotion. +- **Semantic retrieval** with optional LanceDB hybrid search, FTS5/BM25, vector search, and deterministic Reciprocal + Rank Fusion. +- **Platform Observability** for development-time tracing of CLI, MCP, analysis phases, database activity, semantic + indexing, worker chains, memory/CPU use, MCP payload pressure, and costly no-ops. +- **Corpus Analytics** for offline intent clustering, interpretability, versioned profiles, sweep comparison, maintainer + selection, and inspectable JSON/HTML outputs. +- **Module Map** as a deterministic report-only package/module graph with cycle, hub, overloaded-module, and + unwind-candidate views. +- **Guided Finding Review** as a prioritized report-only review queue with shared finding cards, filters, progress + tracking, and reviewed-state persistence. +- **Native agent and IDE integrations** for VS Code, Claude Desktop, Claude Code, Codex, and Cursor, including + governance, audit, memory, trajectory, and structural-review workflows. +- Expanded controller, memory, trajectory, analytics, semantic-search, observability, blast-radius, patch-verification, + and diagnostic CLI/MCP surfaces. +- Reorganized documentation into a contract-focused 00–26 book with unified integration guidance and explicit edition + tiers. +- MCP schemas now include parameter descriptions, deterministic `next_tool` guidance, token-budget tracking, workspace + hygiene warnings, and documentation-contract linting. + +### Contract changes + +- Cache schema advanced to **2.9** for the rebuildable per-function relationship-fact projection and to **2.10** for + intra-module, class-method, and receiver-aware call resolution. 
+- Engineering Memory schema advanced to **1.7** for trajectory and Patch Trail evidence. +- Corpus Analytics store schema advanced to **1.2**. +- Corpus Analytics JSON export schema advanced through **1.2** and **1.3**. +- Corpus Analytics representation contract advanced to **3**. +- Corpus Analytics control-plane contract introduced at **1.0**. +- `derived.module_map` and `derived.review_queue` remain report-only projections excluded from the integrity digest; + they add no analysis pass, metrics family, or report schema bump. +- Live Implementation Context relationship facts remain off the canonical report and do not change canonical report identity. -* **Corpus Analytics profiles and selection control (Slice 1.2).** Added - versioned bundled and repository-local profile manifests, finite - profile-scoped sweeps, separate suitability and profile-aware ranking, - immutable profile batch receipts, configurable ordinary sweep grids, manual - clustering parameters, and append-only maintainer selection events. Store - schema is now `1.2`; JSON export schema `1.3` adds control-plane contract - `1.0`, profile context/summary, and active selection without changing - technical-validity semantics. - -Changed - -* The default project workspace moved from .cache/codeclone/ to - .codeclone/; legacy locations now produce a migration warning. -* Documentation builds now use Zensical with strict, clean builds. -* pydantic is now a base dependency. -* LCOM4 excludes Protocol methods and Pydantic validation/serialization hooks; - computed_field remains part of cohesion analysis. -* Repository test coverage is enforced at >=99%. - -Fixed - -* Durable memory writes. Engineering Memory now uses - synchronous=FULL, preserving committed drafts across unclean MCP process - exits. Intent and audit stores retain recovery-oriented - synchronous=NORMAL. -* Atomic memory ingestion. 
persist_batch no longer commits records and - subjects mid-batch; it now defers the commit so a later failure in the same - batch rolls back the whole ingestion instead of leaving half-written records - behind. Standalone store writes keep their previous commit-on-write behavior. -* Observable best-effort failures. The non-fatal audit-event writer and the - best-effort finish-payload memory proposer no longer swallow exceptions with - zero signal; each now increments an observability counter (audit.emit_dropped, - memory.propose_candidate_dropped) on its fallback path, so silent drops stay - countable in the cockpit. Both remain non-fatal and the telemetry never - re-raises. -* Compact implementation-context misses. get_implementation_context no longer - emits the full empty facet scaffolding (structural_context, budget_summary, - dataflow, call_context, uncertainties) when an explicit symbol query resolves - nothing. The subject_not_found response now returns only the unresolved - subject, a slim provenance block, the projection digest, and an actionable - next_steps list, so a miss does not burn agent context. -* Memory lifecycle correctness. Draft records are no longer marked stale - before human promotion. Trajectory rebuilds now deduplicate superseded - projections, repoint evidence, remove stale workflow rows, and preserve - bounded claim-validation citations. -* Workspace hygiene and intent attribution. Finish blocks only on missing - evidence or foreign dirty overlap. Out-of-scope dirt is advisory, - continue_own_wip supports resuming owned work, queued foreign intents no - longer create false overlaps, and recoverable intents do not grant foreign - attribution. -* Patch verification correctness. Identical before/after runs are rejected - for structural and governance profiles. Negative health deltas now surface a - regression advisory, and Claim Guard warns when review text overstates patch - quality. -* Semantic retrieval correctness and cost. 
Hybrid search now preserves - lexical and vector relevance through RRF instead of allowing metadata ranking - to suppress strong matches. Per-source vector retrieval prevents dense lanes - from crowding out other sources. Embedding providers load lazily, failures - preserve documented fallback behavior, and redundant projection jobs are - coalesced or deferred. -* Architecture and import boundaries. Blast-radius graph logic moved into - codeclone/analysis/blast_radius.py, removing the CLI-to-MCP dependency + +### Changed + +- Default project workspace moved from `.cache/codeclone/` to `.codeclone/`; legacy paths emit a migration warning. +- Documentation builds now use Zensical with strict clean builds. +- `pydantic` is now a base dependency. +- LCOM4 excludes Protocol methods and Pydantic validation/serialization hooks; `computed_field` remains included. +- Repository coverage is enforced at **>=99%**. + +### Fixed + +- Engineering Memory writes are durable and batch ingestion is atomic. +- Best-effort audit and memory-proposal failures are now observable instead of silently swallowed. +- Implementation-context misses return a compact actionable payload instead of empty scaffolding. +- Memory, trajectory, and Patch Trail lifecycle handling now avoids premature staleness, duplicate projections, stale + workflow rows, and broken evidence links. +- Workspace hygiene, intent attribution, continuation of owned work, queue handling, and recoverable-intent behavior + were corrected. +- Patch verification now rejects identical before/after runs where required, surfaces health regressions, and warns on + overstated review claims. +- Semantic retrieval now preserves lexical/vector relevance, avoids source crowding, loads embeddings lazily, and + coalesces redundant projection work. +- Blast-radius graph logic moved into `codeclone/analysis/blast_radius.py`, removing the CLI-to-MCP dependency violation. -* Regression accuracy. 
respect_pyproject=false no longer reports - golden-fixture clone groups as false new regressions. Documentation URLs, - plugin references, and contract tests were updated after the documentation - reorganization. +- `respect_pyproject=false` no longer reports golden-fixture clone groups as false new regressions. +- Documentation URLs, integration references, and contract tests were aligned with the reorganized site. ## [2.0.2] - 2026-05-19 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c1244803..7b26eb6c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -377,7 +377,7 @@ When changing the live sample report or its publication path: ```bash uv run python scripts/build_docs_example_report.py \ --output-dir site/examples/report/live -uv run --with zensical==0.0.43 zensical build --clean --strict +uv run --with zensical==0.0.46 zensical build --clean --strict ``` The generator runs CodeClone against the repository, stages its output in a @@ -526,7 +526,7 @@ uv run pytest -q tests/test_observability_*.py For documentation, navigation, publishing, or sample-report changes: ```bash -uv run --with zensical==0.0.43 zensical build --clean --strict +uv run --with zensical==0.0.46 zensical build --clean --strict ``` For VS Code extension changes: diff --git a/README.md b/README.md index 097ef4d0..e8a9a0a9 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,16 @@ > -

Structural Change Controller for AI-assisted Python development

+

Deterministic Structural Change Controller for AI-assisted Python development

Let agents move fast.
- Keep structural change explicit, bounded, remembered, and verifiable. + Keep structural change explicit, bounded, and verifiable.

-[![][pypi-shield]][pypi-link] [![][status-shield]][pypi-link] [![][downloads-shield]][pypi-link] [![][python-shield]][pypi-link] [![][license-shield]][license-link] +[![][pypi-shield]][pypi-link] [![][downloads-shield]][pypi-link] [![][python-shield]][pypi-link] [![][license-shield]][license-link] [![][discord-shield]][discord-link] [![][tests-shield]][tests-link] [![][benchmark-shield]][benchmark-link] @@ -34,241 +34,196 @@ --- > [!NOTE] -> This repository and the documentation site track the **unreleased v2.1.0 development line**. -> For the current stable release, use -> [CodeClone v2.0.2](https://github.com/orenlab/codeclone/tree/v2.0.2) -> or install [CodeClone 2.0.2 from PyPI](https://pypi.org/project/codeclone/2.0.2/). +> This README documents the unreleased **CodeClone 2.1 alpha line**. +> The basic analysis commands below work with the current stable release, +> [CodeClone 2.0.2](https://github.com/orenlab/codeclone/tree/v2.0.2). +> Agent change control and Engineering Memory require the 2.1 prerelease. -**CodeClone** is a deterministic **Structural Change Controller** for AI-assisted Python development, built on one -canonical structural analysis of the repository. +## What is CodeClone? -Before editing, an agent declares intent. CodeClone maps the structural blast radius, establishes explicit edit -boundaries, and exposes the regression budget. After editing, it compares the actual patch with the declared scope, -verifies structural changes, checks review claims against report facts, and leaves an auditable receipt. +CodeClone helps developers use AI coding agents without losing control of structural change. -```text -intent → blast radius → bounded edit → patch check → review receipt -``` +Before an agent edits code, CodeClone records the intended change, maps the structural blast radius, and establishes +explicit edit boundaries. 
After the edit, it compares the real patch with the declared scope, verifies structural +regressions, and leaves an auditable review receipt. -CodeClone does not use LLM judgment to classify structural regressions or authorize edits. Structural facts come -from deterministic analysis; the same facts serve agents, human reviewers, IDEs, and CI. +CodeClone makes silent scope expansion visible before it disappears inside an otherwise reasonable-looking diff. -## Install and try +CodeClone does not ask an LLM to decide whether a structural change is safe. It uses deterministic repository facts +shared across agents, human reviewers, IDEs, reports, and CI. -Stable release: +## Quick start -```bash -uv tool install codeclone -codeclone . -codeclone . --html --open-html-report -``` +### 1. Analyze a repository -Run without installing: +Run the stable release without installing anything: ```bash uvx codeclone@latest . ``` -Install the MCP server for local AI agents and IDE clients: +Prefer a browsable report? Generate the HTML view and open it: ```bash -uv tool install "codeclone[mcp]" -codeclone-mcp --transport stdio +uvx codeclone@latest . --html --open-html-report ``` -Install the in-development 2.1 line (alpha/beta prereleases). A plain install -resolves the latest stable release; add a prerelease flag to get 2.1: +Once you use it regularly, install it as a local tool: ```bash -uv tool install --prerelease allow "codeclone[mcp]" # uv -pip install --pre "codeclone[mcp]" # pip +uv tool install codeclone +codeclone . ``` -Run the current development line from source: +### 2. Record the current structural baseline + +Before asking an agent to change the repository, capture the accepted state once: ```bash -git clone https://github.com/orenlab/codeclone.git -cd codeclone -uv sync --all-extras -uv run codeclone . +codeclone . 
--update-baseline +git add codeclone.baseline.json +git commit -m "chore: add CodeClone structural baseline" ``` -## Why CodeClone +The baseline records the structural debt that already exists. Future analysis can then separate **new regressions** +from findings that were already present, so agents and reviewers can focus on what the current change introduced. -AI coding agents accelerate implementation, but they also make scope expansion easier to miss. A narrow task can -quietly spread into shared helpers, tests, public APIs, configuration, and unrelated modules while the final diff -still looks reasonable. +Updating the baseline is an explicit governance action. Do not regenerate it merely to make a failing check pass. -Most review tools start with the completed diff. CodeClone starts with the declared intent. +### 3. Set up CodeClone for your AI agent (2.1 alpha) -```text -declare intent - → inspect structural blast radius - → establish edit boundaries - → make the change - → compare declared and actual scope - → verify structural regressions - → record the outcome +Install the prerelease MCP server: + +```bash +uv tool install --prerelease allow "codeclone[mcp]" +codeclone-mcp --transport stdio ``` -The agent still writes the code. CodeClone makes the declared scope explicit before editing and exposes undeclared -expansion when the patch is verified. 
+Then wire it into your client: + +| Client | Setup | +|----------------|--------------------------------------------------------------------------------------------------------| +| VS Code | [Extension setup](https://orenlab.github.io/codeclone/guide/integrations/vscode/setup/) | +| Cursor | [Plugin and skills](https://orenlab.github.io/codeclone/guide/integrations/cursor/install-and-skills/) | +| Claude Code | [Plugin setup](https://orenlab.github.io/codeclone/guide/integrations/claude-code/setup/) | +| Codex | [Plugin setup](https://orenlab.github.io/codeclone/guide/integrations/codex/setup/) | +| Claude Desktop | [Bundle setup](https://orenlab.github.io/codeclone/guide/integrations/claude-desktop/setup/) | + +Every client uses the same MCP interface and the same canonical structural facts. -## Structural Change Controller +## How the controlled-change workflow works -The controller reduces the governed agent workflow to four steps: +For an agent, the normal workflow is: ```text analyze → start → edit → finish ``` -- **Start controlled change** — `start_controlled_change` checks workspace state, records intent, maps blast radius, - separates allowed paths from review context and do-not-touch boundaries, and returns the authoritative - `edit_allowed` permission. -- **Finish controlled change** — `finish_controlled_change` resolves the actual changed files once, checks scope, - verifies the patch against the canonical report, validates optional review claims, and produces a review receipt. -- **Patch Trail** — records declared, changed, untouched-in-declared, and boundary-held paths together with - verification and audit anchors. -- **Multi-agent coordination** — lease-bound intents, queues, recovery, and workspace hygiene make concurrent work - visible without treating advisory ownership as structural truth. +### Analyze -Host integrations can enforce the permission model before file edits where the host supports hooks. 
Regardless of -host enforcement, finish-time verification remains deterministic. +CodeClone builds one canonical structural report for the repository and compares it with the accepted baseline. -[Structural Change Controller documentation](https://orenlab.github.io/codeclone/book/12-structural-change-controller/) +### Start -## One canonical report, every structural surface +`start_controlled_change`: -CodeClone runs one deterministic structural analysis and renders its canonical report through CLI, HTML, JSON, -Markdown, SARIF, MCP, IDE integrations, GitHub Action, and CI. There is no separate analysis engine for agents. +- records the agent's intent; +- maps structural blast radius; +- separates editable paths from review context and do-not-touch boundaries; +- exposes the regression budget relative to the accepted baseline; +- returns the authoritative `edit_allowed` result. -The report covers: +### Edit -- function clones through CFG fingerprints; -- block clones through statement windows and report-only segment clones; -- clone-cohort drift, duplicated branch families, and guard/exit divergence; -- cyclomatic complexity, coupling, cohesion, dependency cycles, and dead code; -- overloaded-module and other report-only design context; -- type and docstring adoption; -- public API inventory and baseline-aware API break detection; -- external Cobertura coverage joined with structural hotspots; -- report-only security capability boundaries without vulnerability claims; -- deterministic structural health and review priorities. +The agent writes the code. CodeClone does not generate or rewrite source files. -```bash -codeclone . --json --html --md --sarif --text -``` +Where the host supports hooks, integrations can stop edits unless `edit_allowed=true`. 
-[How CodeClone works](https://orenlab.github.io/codeclone/guide/explanation/how-it-works/) · -[Canonical report contract](https://orenlab.github.io/codeclone/book/05-report/) +### Finish -## Baseline-aware CI +`finish_controlled_change`: -CodeClone separates accepted legacy debt from new structural regressions. +- resolves the actual changed files; +- checks declared scope against the real patch; +- verifies structural changes; +- validates optional review claims; +- records Patch Trail evidence; +- produces an auditable review receipt. -```bash -# Create and commit the project baseline once -codeclone . --update-baseline +The result is not an AI opinion about the patch. It is a deterministic comparison between declared intent, repository +structure, the accepted baseline, and the actual change. -# Gate future changes against that baseline -codeclone . --ci -``` +[Read the Structural Change Controller guide](https://orenlab.github.io/codeclone/book/12-structural-change-controller/) -The baseline is a versioned, integrity-checked contract. CI can reject newly introduced clones and baseline-aware -metric, API, and coverage regressions without requiring the existing codebase to be clean first. Absolute threshold -gates remain opt-in. +## What you get -```bash -codeclone . --fail-on-new-metrics -codeclone . --fail-complexity 20 --fail-coupling 10 --fail-cohesion 4 -codeclone . --fail-cycles --fail-dead-code -codeclone . --coverage coverage.xml --fail-on-untested-hotspots -codeclone . 
--api-surface --fail-on-api-break -``` +| Capability | What it provides | +|-----------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| **Structural Change Controller** | Intent-first change control, blast radius, explicit edit boundaries, patch verification, and review receipts | +| **Canonical structural analysis** | Clone detection, complexity, coupling, cohesion, dependency cycles, dead code, API inventory, coverage joins, and structural health | +| **Baseline-aware governance** | Records accepted legacy debt and separates it from regressions introduced by the current change | +| **Engineering Memory** | Local, typed, evidence-linked project knowledge and reusable histories of prior controlled changes | +| **Agent coordination** | Lease-bound intents, queues, conflicts, recovery, and workspace hygiene | +| **One report, many surfaces** | CLI, HTML, JSON, Markdown, SARIF, MCP, IDE integrations, and GitHub Actions from one canonical payload | -[Metrics and quality gates](https://orenlab.github.io/codeclone/book/16-metrics-and-quality-gates/) · -[Baseline contract](https://orenlab.github.io/codeclone/book/07-baseline/) +CodeClone requires no hosted service or cloud account. Analysis state, controller state, Engineering Memory, and +trajectories are stored locally. -## Engineering Memory +## Why intent comes before the diff -Engineering Memory gives agents durable, repository-specific context without treating model output as project truth. +Most review tools begin after the patch already exists. CodeClone begins earlier: -The local SQLite store contains typed, evidence-linked knowledge such as contracts, architecture decisions, risks, -test anchors, public surfaces, git provenance, and prior controlled changes. Scope-aware retrieval supports the -current change, while project-wide search can combine FTS5 with optional semantic retrieval. 
+```text +task request + → declared intent + → structural blast radius + → explicit boundary + → actual patch + → deterministic verification +``` -Audit-derived trajectories preserve how work actually unfolded. Trajectory passports, anomaly profiles, Patch Trail -evidence, and recurring advisory patterns called **Experiences** make previous successes and failures reusable. -Agent-created records remain drafts until a human approves them. +Agent scope expansion can look reasonable in the final diff. A narrow task may quietly spread into shared helpers, +tests, configuration, public APIs, or unrelated modules. -```bash -codeclone memory init --root . -codeclone memory search "baseline schema" --match all -codeclone memory approve mem-12345678 --i-know-what-im-doing -``` +By the time that expansion reaches the final diff, it already looks intentional. CodeClone catches it at the declared +boundary instead — by comparing what the agent said it would change with what it actually changed. -Memory can guide an agent. It cannot authorize edits, override blast radius, change a gate, or replace canonical -report facts. +## One canonical structural report -[Engineering Memory documentation](https://orenlab.github.io/codeclone/book/13-engineering-memory/) · -[Trajectories and Experiences](https://orenlab.github.io/codeclone/guide/memory/trajectories-and-experiences/) +CodeClone runs one deterministic analysis and renders the same canonical report through every supported surface. -## AI agents and IDE integrations +The report covers: -The MCP server is triage-first: analyze the repository, narrow the problem, inspect evidence, start a controlled -change, and finish with verification. `get_implementation_context` projects bounded, drift-aware structural context -for repo-relative paths from the existing run, with separate digests for the source artifact and exact response. -It is evidence for planning, never edit authorization. 
Bounded tools and resources keep the full report out of agent -context until deeper evidence is requested. +- function, block, and segment clones; +- clone drift and duplicated branch families; +- complexity, coupling, cohesion, dependency cycles, and dead code; +- public API inventory and baseline-aware API break detection; +- external coverage joined with structural hotspots; +- deterministic structural health and review priorities. ```bash -codeclone-mcp --transport stdio -codeclone-mcp --transport streamable-http +codeclone . --json --html --md --sarif --text ``` -Structural analysis tools do not mutate source files, baselines, generated reports, or analysis cache. Controller -and memory operations update only their explicit state stores. - -> [!WARNING] -> Analysis tools require an absolute repository root. Keep `stdio` as the default transport for local clients. -> Exposing HTTP beyond loopback requires explicit `--allow-remote`. - -| Surface | Install or source | Documentation | -|---------------------------|----------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------| -| **VS Code extension** | [VS Code Marketplace](https://marketplace.visualstudio.com/items?itemName=orenlab.codeclone) | [Setup](https://orenlab.github.io/codeclone/guide/integrations/vscode/setup/) | -| **Cursor plugin** | [Cursor storefront](https://github.com/orenlab/codeclone-cursor) | [Install](https://orenlab.github.io/codeclone/guide/integrations/cursor/install-and-skills/) | -| **Claude Code plugin** | [Claude Code marketplace](https://github.com/orenlab/codeclone-claude-code) | [Install](https://orenlab.github.io/codeclone/guide/integrations/claude-code/setup/) | -| **Codex plugin** | [Codex marketplace](https://github.com/orenlab/codeclone-codex) | [Install](https://orenlab.github.io/codeclone/guide/integrations/codex/setup/) | -| **Claude Desktop bundle** 
| [Bundle repository](https://github.com/orenlab/codeclone-claude-desktop) | [Setup](https://orenlab.github.io/codeclone/guide/integrations/claude-desktop/setup/) | - -Every client uses the same `codeclone-mcp` interface and canonical structural facts. +[How CodeClone works](https://orenlab.github.io/codeclone/guide/explanation/how-it-works/) · +[Canonical report contract](https://orenlab.github.io/codeclone/book/05-report/) -[MCP usage guide](https://orenlab.github.io/codeclone/guide/mcp/) · -[MCP interface contract](https://orenlab.github.io/codeclone/book/25-mcp-interface/) · -[Implementation-context tools](https://orenlab.github.io/codeclone/book/25-mcp-interface/tools/implementation-context/) +## Baseline-aware governance and CI -## Quick workflows +The baseline is a versioned, integrity-checked contract that records the accepted structural state of the repository. -Review only the current Git scope: +It lets CodeClone and connected agents distinguish: -```bash -codeclone . --changed-only --diff-against main -codeclone . --paths-from-git-diff HEAD~1 -``` +- findings that already existed; +- regressions introduced by the current change; +- deliberate baseline updates approved by the user. -Inspect structural blast radius or run a baseline-relative patch check: +Check future changes against the committed baseline: ```bash -codeclone . --blast-radius codeclone/analysis/parser.py -codeclone . --patch-verify +codeclone . --ci ``` -`--patch-verify` is a terminal-only controller query: it cannot combine with -`--changed-only`, `--diff-against`, or `--paths-from-git-diff`. Use changed-scope -flags for git-selected review; use `--patch-verify` alone for a trusted-baseline -budget check on the working tree. Patch-local before/after verification with -explicit changed-file evidence belongs in MCP change control (`check_patch_contract`). 
- Use CodeClone in GitHub Actions: ```yaml @@ -279,77 +234,98 @@ Use CodeClone in GitHub Actions: pr-comment: "true" ``` -The Action can run baseline-aware gating, publish SARIF to GitHub Code Scanning, upload reports, and maintain a PR -summary comment. +CI can reject newly introduced clones, metric regressions, API breaks, and coverage regressions without requiring the +existing repository to be clean first. +[Baseline contract](https://orenlab.github.io/codeclone/book/07-baseline/) · +[Metrics and quality gates](https://orenlab.github.io/codeclone/book/16-metrics-and-quality-gates/) · [GitHub Action documentation](https://orenlab.github.io/codeclone/getting-started/#github-action) -## Platform Observability +## Engineering Memory + +Engineering Memory gives agents durable, repository-specific context without treating model output as project truth. -Platform Observability is an opt-in diagnostics layer for developing CodeClone itself. It correlates CLI, MCP, -analysis, database, semantic-index, and projection-worker execution and exposes timings, RSS/CPU, query shapes, -payload pressure, causal worker chains, and costly no-ops. +The local SQLite store can contain: -It is disabled by default, stores no raw payload bodies, and cannot affect repository findings, gates, baselines, -memory facts, or edit authorization. +- architecture and contract notes; +- risks, test anchors, and public surfaces; +- git and change-control provenance; +- prior trajectories and Patch Trail evidence; +- recurring advisory patterns called **Experiences**. + +Agent-created records remain drafts until a human approves them. ```bash -CODECLONE_OBSERVABILITY_ENABLED=1 codeclone . -codeclone observability trace --root . --html /tmp/codeclone-observer.html +codeclone memory init --root . +codeclone memory search "baseline schema" --match all ``` -[Platform Observability documentation](https://orenlab.github.io/codeclone/book/26-platform-observability/) +Memory can guide an agent. 
It cannot authorize edits, override blast radius, change a gate, or replace canonical report +facts. -## Configuration +[Engineering Memory documentation](https://orenlab.github.io/codeclone/book/13-engineering-memory/) · +[Trajectories and Experiences](https://orenlab.github.io/codeclone/guide/memory/trajectories-and-experiences/) -Project configuration lives in `pyproject.toml`: +## Trust boundaries -```toml -[tool.codeclone] -baseline = "codeclone.baseline.json" +- Structural findings and gates come from deterministic analysis, not LLM judgment. +- `edit_allowed` is an explicit controller result; status or advisory ownership does not grant permission. +- Read-only analysis commands do not modify source code or project governance state. +- Baseline updates are explicit user-approved governance actions. +- Controller and memory operations write only to their explicit local state stores. +- Memory and trajectory evidence remain advisory. +- `stdio` is the recommended transport for local clients. +- Remote HTTP exposure requires explicit `--allow-remote`. -min_loc = 10 -min_stmt = 6 +## Development setup -block_min_loc = 20 -block_min_stmt = 8 -``` +Run the repository version from source: -Precedence is **CLI flags > `pyproject.toml` > built-in defaults**. +```bash +git clone https://github.com/orenlab/codeclone.git +cd codeclone +uv sync --all-extras +uv run codeclone . +``` -[Configuration reference](https://orenlab.github.io/codeclone/book/10-config-and-defaults/) · -[Inline suppressions](https://orenlab.github.io/codeclone/book/19-inline-suppressions/) +CodeClone 2.1 requires Python 3.10 or newer. 
## Documentation -The documentation site contains user guides, interface contracts, report and baseline schemas, configuration -reference, integration setup, and maintainer material: - **[orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/)** +- [Getting started](https://orenlab.github.io/codeclone/getting-started/) +- [Structural Change Controller](https://orenlab.github.io/codeclone/book/12-structural-change-controller/) +- [Engineering Memory](https://orenlab.github.io/codeclone/book/13-engineering-memory/) +- [MCP usage](https://orenlab.github.io/codeclone/guide/mcp/) +- [Configuration reference](https://orenlab.github.io/codeclone/book/10-config-and-defaults/) + ## License -- **Code:** MPL-2.0 (`LICENSE`) -- **Documentation and docs-site content:** MIT (`LICENSE-MIT`) +- **Code:** MPL-2.0 +- **Documentation:** MIT + +See [LICENSES.md](https://github.com/orenlab/codeclone/blob/main/LICENSES.md) for the license scope map. ## Links -- **Documentation:** - **PyPI:** +- **Discord:** - **Issues:** - **Discussions:** -- **Licenses:** [MPL-2.0](https://github.com/orenlab/codeclone/blob/main/LICENSE) · [MIT documentation license](https://github.com/orenlab/codeclone/blob/main/LICENSE-MIT) · [License scope map](https://github.com/orenlab/codeclone/blob/main/LICENSES.md) [pypi-shield]: https://img.shields.io/pypi/v/codeclone?style=flat-square&color=6366f1 -[status-shield]: https://img.shields.io/pypi/status/codeclone?style=flat-square&color=6366f1 [downloads-shield]: https://img.shields.io/pypi/dm/codeclone?style=flat-square&color=6366f1 [python-shield]: https://img.shields.io/pypi/pyversions/codeclone?style=flat-square&color=6366f1 [license-shield]: https://img.shields.io/badge/license-MPL--2.0-6366f1?style=flat-square [tests-shield]: https://img.shields.io/github/actions/workflow/status/orenlab/codeclone/tests.yml?branch=main&style=flat-square&label=tests -[benchmark-shield]: 
https://img.shields.io/github/actions/workflow/status/orenlab/codeclone/benchmark.yml?style=flat-square&label=benchmark +[benchmark-shield]: https://img.shields.io/github/actions/workflow/status/orenlab/codeclone/benchmark.yml?branch=main&style=flat-square&label=benchmark +[discord-shield]: https://img.shields.io/badge/Discord-Join%20community-5865F2?style=flat-square&logo=discord&logoColor=white + [pypi-link]: https://pypi.org/project/codeclone/ [license-link]: #license [tests-link]: https://github.com/orenlab/codeclone/actions/workflows/tests.yml [benchmark-link]: https://github.com/orenlab/codeclone/actions/workflows/benchmark.yml +[discord-link]: https://discord.com/invite/U72KmRvpUx diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index af04cf8a..c8037e04 100755 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -20,13 +20,21 @@ from datetime import datetime, timezone from pathlib import Path from statistics import fmean, median, pstdev -from typing import Literal +from typing import Literal, cast from codeclone import __version__ as codeclone_version from codeclone.baseline import current_python_tag -BENCHMARK_SCHEMA_VERSION = "1.0" +BENCHMARK_SCHEMA_VERSION = "1.1" BENCHMARK_CLI_MODULE = "codeclone.main" +BenchmarkProfile = Literal["smoke", "extended", "diagnostic"] +ReportFormat = Literal["html", "md", "sarif", "text"] +REPORT_FORMAT_SPECS: Mapping[ReportFormat, tuple[str, str]] = { + "html": ("--html", ".html"), + "md": ("--md", ".md"), + "sarif": ("--sarif", ".sarif"), + "text": ("--text", ".txt"), +} BENCHMARK_NEUTRAL_ARGS: tuple[str, ...] = ( "--no-fail-on-new", "--no-fail-on-new-metrics", @@ -59,17 +67,40 @@ class Scenario: name: str mode: Literal["cold", "warm"] - extra_args: tuple[str, ...] + extra_args: tuple[str, ...] = () + report_formats: tuple[ReportFormat, ...] = () + run_cap: int | None = None + warmup_cap: int | None = None + expected_exit_codes: tuple[int, ...] 
= (0,) @dataclass(frozen=True) class RunMeasurement: elapsed_seconds: float + child_user_seconds: float + child_system_seconds: float + exit_code: int digest: str files_found: int files_analyzed: int files_cached: int files_skipped: int + artifact_bytes: dict[str, int] + cache_bytes: int + + +@dataclass(frozen=True) +class StartupProbe: + name: str + args: tuple[str, ...] + + +@dataclass(frozen=True) +class ProbeMeasurement: + elapsed_seconds: float + child_user_seconds: float + child_system_seconds: float + exit_code: int def _percentile(sorted_values: list[float], q: float) -> float: @@ -105,6 +136,71 @@ def _stats(values: list[float]) -> dict[str, float]: } +def _as_float(value: object, default: float = 0.0) -> float: + if isinstance(value, bool): + return float(value) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return default + return default + + +def _resource_usage_seconds() -> tuple[float, float]: + try: + import resource + except ImportError: + return (0.0, 0.0) + usage = resource.getrusage(resource.RUSAGE_CHILDREN) + return (float(usage.ru_utime), float(usage.ru_stime)) + + +def _resource_delta( + before: tuple[float, float], + after: tuple[float, float], +) -> tuple[float, float]: + return ( + max(0.0, after[0] - before[0]), + max(0.0, after[1] - before[1]), + ) + + +def _normalized_env() -> dict[str, str]: + env = dict(os.environ) + env["PYTHONHASHSEED"] = "0" + env["LC_ALL"] = "C.UTF-8" + env["LANG"] = "C.UTF-8" + env["TZ"] = "UTC" + return env + + +def _artifact_paths( + *, + report_path: Path, + report_formats: tuple[ReportFormat, ...], +) -> dict[str, Path]: + paths: dict[str, Path] = {"json": report_path} + for report_format in report_formats: + _flag, suffix = REPORT_FORMAT_SPECS[report_format] + paths[report_format] = report_path.with_suffix(suffix) + return paths + + +def _artifact_size_map(paths: Mapping[str, Path]) -> dict[str, int]: + return { + 
name: path.stat().st_size + for name, path in sorted(paths.items()) + if path.exists() + } + + +def _file_size(path: Path) -> int: + return path.stat().st_size if path.exists() else 0 + + def _read_report(report_path: Path) -> tuple[str, dict[str, int]]: payload_obj: object = json.loads(report_path.read_text(encoding="utf-8")) if not isinstance(payload_obj, dict): @@ -155,21 +251,25 @@ def _run_cli_once( cache_path: Path, report_path: Path, extra_args: tuple[str, ...], + report_formats: tuple[ReportFormat, ...] = (), + expected_exit_codes: tuple[int, ...] = (0,), ) -> RunMeasurement: - env = dict(os.environ) - env["PYTHONHASHSEED"] = "0" - env["LC_ALL"] = "C.UTF-8" - env["LANG"] = "C.UTF-8" - env["TZ"] = "UTC" - + env = _normalized_env() + artifact_paths = _artifact_paths( + report_path=report_path, + report_formats=report_formats, + ) + report_args: list[str] = ["--json", str(report_path)] + for report_format in report_formats: + flag, _suffix = REPORT_FORMAT_SPECS[report_format] + report_args.extend([flag, str(artifact_paths[report_format])]) cmd = [ python_executable, "-m", BENCHMARK_CLI_MODULE, str(target), *BENCHMARK_NEUTRAL_ARGS, - "--json", - str(report_path), + *report_args, "--cache-path", str(cache_path), "--no-progress", @@ -177,6 +277,7 @@ def _run_cli_once( *extra_args, ] + usage_before = _resource_usage_seconds() start = time.perf_counter() completed = subprocess.run( cmd, @@ -186,7 +287,11 @@ def _run_cli_once( env=env, ) elapsed_seconds = time.perf_counter() - start - if completed.returncode != 0: + child_user_seconds, child_system_seconds = _resource_delta( + usage_before, + _resource_usage_seconds(), + ) + if completed.returncode not in expected_exit_codes: stderr_tail = "\n".join(completed.stderr.splitlines()[-20:]) stdout_tail = "\n".join(completed.stdout.splitlines()[-20:]) raise RuntimeError( @@ -197,11 +302,16 @@ def _run_cli_once( digest, files = _read_report(report_path) return RunMeasurement( elapsed_seconds=elapsed_seconds, + 
child_user_seconds=child_user_seconds, + child_system_seconds=child_system_seconds, + exit_code=completed.returncode, digest=digest, files_found=files["found"], files_analyzed=files["analyzed"], files_cached=files["cached"], files_skipped=files["skipped"], + artifact_bytes=_artifact_size_map(artifact_paths), + cache_bytes=_file_size(cache_path), ) @@ -252,6 +362,22 @@ def _print_bulleted_lines(header: str, lines: Sequence[str]) -> None: print(f"- {line}") +def _effective_count(requested: int, cap: int | None) -> int: + return min(requested, cap) if cap is not None else requested + + +def _exit_code_counts(measurements: Sequence[RunMeasurement]) -> dict[str, int]: + counts: dict[str, int] = {} + for measurement in measurements: + key = str(measurement.exit_code) + counts[key] = counts.get(key, 0) + 1 + return dict(sorted(counts.items(), key=lambda item: int(item[0]))) + + +def _artifact_total_kib(artifact_bytes: Mapping[str, int]) -> float: + return sum(artifact_bytes.values()) / 1024.0 + + def _scenario_result( *, scenario: Scenario, @@ -266,6 +392,8 @@ def _scenario_result( shutil.rmtree(scenario_dir) scenario_dir.mkdir(parents=True, exist_ok=True) + effective_runs = _effective_count(runs, scenario.run_cap) + effective_warmups = _effective_count(warmups, scenario.warmup_cap) warm_cache_path = scenario_dir / "shared-cache.json" cold_cache_path = scenario_dir / "cold-cache.json" @@ -276,9 +404,11 @@ def _scenario_result( cache_path=warm_cache_path, report_path=scenario_dir / "seed-report.json", extra_args=scenario.extra_args, + report_formats=scenario.report_formats, + expected_exit_codes=scenario.expected_exit_codes, ) - for idx in range(warmups): + for idx in range(effective_warmups): if scenario.mode == "warm": cache_path = warm_cache_path else: @@ -290,10 +420,12 @@ def _scenario_result( cache_path=cache_path, report_path=scenario_dir / f"warmup-report-{idx}.json", extra_args=scenario.extra_args, + report_formats=scenario.report_formats, + 
expected_exit_codes=scenario.expected_exit_codes, ) measurements: list[RunMeasurement] = [] - for idx in range(runs): + for idx in range(effective_runs): if scenario.mode == "warm": cache_path = warm_cache_path else: @@ -305,6 +437,8 @@ def _scenario_result( cache_path=cache_path, report_path=scenario_dir / f"run-report-{idx}.json", extra_args=scenario.extra_args, + report_formats=scenario.report_formats, + expected_exit_codes=scenario.expected_exit_codes, ) _validate_inventory_sample(scenario=scenario, measurement=measurement) measurements.append(measurement) @@ -318,26 +452,178 @@ def _scenario_result( ) timings = [m.elapsed_seconds for m in measurements] + child_user = [m.child_user_seconds for m in measurements] + child_system = [m.child_system_seconds for m in measurements] + child_cpu = [m.child_user_seconds + m.child_system_seconds for m in measurements] sample = measurements[0] return { "name": scenario.name, "mode": scenario.mode, "extra_args": list(scenario.extra_args), - "warmups": warmups, - "runs": runs, + "report_formats": list(scenario.report_formats), + "warmups": effective_warmups, + "runs": effective_runs, + "requested_warmups": warmups, + "requested_runs": runs, + "run_cap": scenario.run_cap, + "warmup_cap": scenario.warmup_cap, + "expected_exit_codes": list(scenario.expected_exit_codes), + "exit_code_counts": _exit_code_counts(measurements), "deterministic": deterministic, "digest": digests[0], "timings_seconds": timings, "stats_seconds": _stats(timings), + "child_user_stats_seconds": _stats(child_user), + "child_system_stats_seconds": _stats(child_system), + "child_cpu_stats_seconds": _stats(child_cpu), "inventory_sample": { "found": sample.files_found, "analyzed": sample.files_analyzed, "cached": sample.files_cached, "skipped": sample.files_skipped, }, + "artifact_bytes_sample": sample.artifact_bytes, + "artifact_total_kib_sample": _artifact_total_kib(sample.artifact_bytes), + "cache_bytes_sample": sample.cache_bytes, } +def _run_probe_once( + 
*, + python_executable: str, + probe: StartupProbe, +) -> ProbeMeasurement: + cmd = [python_executable, *probe.args] + usage_before = _resource_usage_seconds() + start = time.perf_counter() + completed = subprocess.run( + cmd, + check=False, + capture_output=True, + text=True, + env=_normalized_env(), + ) + elapsed_seconds = time.perf_counter() - start + child_user_seconds, child_system_seconds = _resource_delta( + usage_before, + _resource_usage_seconds(), + ) + if completed.returncode != 0: + stderr_tail = "\n".join(completed.stderr.splitlines()[-20:]) + stdout_tail = "\n".join(completed.stdout.splitlines()[-20:]) + raise RuntimeError( + f"startup probe {probe.name} failed with exit {completed.returncode}" + f"\nSTDOUT:\n{stdout_tail}\nSTDERR:\n{stderr_tail}" + ) + return ProbeMeasurement( + elapsed_seconds=elapsed_seconds, + child_user_seconds=child_user_seconds, + child_system_seconds=child_system_seconds, + exit_code=completed.returncode, + ) + + +def _startup_probes() -> tuple[StartupProbe, ...]: + return ( + StartupProbe(name="python_empty", args=("-c", "pass")), + StartupProbe(name="import_codeclone", args=("-c", "import codeclone")), + StartupProbe( + name="import_codeclone_main", + args=("-c", "import codeclone.main"), + ), + StartupProbe( + name="cli_version", + args=("-m", BENCHMARK_CLI_MODULE, "--version"), + ), + ) + + +def _probe_result( + *, + probe: StartupProbe, + python_executable: str, + runs: int, +) -> dict[str, object]: + measurements = [ + _run_probe_once(python_executable=python_executable, probe=probe) + for _idx in range(runs) + ] + timings = [m.elapsed_seconds for m in measurements] + child_user = [m.child_user_seconds for m in measurements] + child_system = [m.child_system_seconds for m in measurements] + child_cpu = [m.child_user_seconds + m.child_system_seconds for m in measurements] + return { + "name": probe.name, + "args": list(probe.args), + "runs": runs, + "timings_seconds": timings, + "stats_seconds": _stats(timings), + 
"first_seconds": timings[0] if timings else 0.0, + "subsequent_stats_seconds": _stats(timings[1:]) if len(timings) > 1 else None, + "child_user_stats_seconds": _stats(child_user), + "child_system_stats_seconds": _stats(child_system), + "child_cpu_stats_seconds": _stats(child_cpu), + "exit_code_counts": { + str(code): sum(1 for item in measurements if item.exit_code == code) + for code in sorted({item.exit_code for item in measurements}) + }, + } + + +def _scenario_profile(profile: BenchmarkProfile) -> tuple[Scenario, ...]: + core = ( + Scenario(name="cold_full", mode="cold"), + Scenario(name="warm_full", mode="warm"), + Scenario(name="warm_clones_only", mode="warm", extra_args=("--skip-metrics",)), + ) + report_scenarios = ( + Scenario( + name="cold_html", + mode="cold", + report_formats=("html",), + run_cap=3, + warmup_cap=1, + ), + Scenario( + name="warm_html", + mode="warm", + report_formats=("html",), + run_cap=5, + warmup_cap=1, + ), + Scenario( + name="cold_all_reports", + mode="cold", + report_formats=("html", "md", "sarif", "text"), + run_cap=3, + warmup_cap=1, + ), + Scenario( + name="warm_all_reports", + mode="warm", + report_formats=("html", "md", "sarif", "text"), + run_cap=5, + warmup_cap=1, + ), + ) + diagnostic_scenarios = ( + Scenario( + name="ci_cold_diagnostic", + mode="cold", + extra_args=("--ci",), + report_formats=("html",), + run_cap=3, + warmup_cap=0, + expected_exit_codes=(0, 2, 3), + ), + ) + if profile == "smoke": + return core + if profile == "extended": + return core + report_scenarios + return core + report_scenarios + diagnostic_scenarios + + def _cgroup_value(path: Path) -> str | None: try: content = path.read_text(encoding="utf-8").strip() @@ -396,12 +682,28 @@ def _median_for(name: str) -> float | None: cold_full = _median_for("cold_full") warm_full = _median_for("warm_full") warm_clones = _median_for("warm_clones_only") + cold_html = _median_for("cold_html") + warm_html = _median_for("warm_html") + cold_all_reports = 
_median_for("cold_all_reports") + warm_all_reports = _median_for("warm_all_reports") comparisons: dict[str, float] = {} if cold_full and warm_full: comparisons["warm_full_speedup_vs_cold_full"] = cold_full / warm_full if warm_full and warm_clones: comparisons["warm_clones_only_speedup_vs_warm_full"] = warm_full / warm_clones + if cold_full and cold_html: + comparisons["cold_html_overhead_vs_cold_full"] = cold_html / cold_full + if warm_full and warm_html: + comparisons["warm_html_overhead_vs_warm_full"] = warm_html / warm_full + if cold_full and cold_all_reports: + comparisons["cold_all_reports_overhead_vs_cold_full"] = ( + cold_all_reports / cold_full + ) + if warm_full and warm_all_reports: + comparisons["warm_all_reports_overhead_vs_warm_full"] = ( + warm_all_reports / warm_full + ) return comparisons @@ -470,7 +772,7 @@ def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( "Deterministic Docker-oriented benchmark for CodeClone CLI " - "(cold/warm cache scenarios)." + "(cold/warm cache, report, and startup scenarios)." ) ) parser.add_argument( @@ -502,6 +804,27 @@ def _parse_args() -> argparse.Namespace: default=int(os.environ.get("CODECLONE_BENCH_WARMUPS", "3")), help="Warmup runs per scenario", ) + parser.add_argument( + "--scenario-profile", + choices=("smoke", "extended", "diagnostic"), + default=os.environ.get("CODECLONE_BENCH_PROFILE", "smoke"), + help=( + "Scenario set: smoke keeps the historical core set; extended adds " + "report-format scenarios with per-scenario caps; diagnostic also " + "adds a CI-gate timing scenario that may exit non-zero." 
+ ), + ) + parser.add_argument( + "--startup-runs", + type=int, + default=int(os.environ.get("CODECLONE_BENCH_STARTUP_RUNS", "3")), + help="Measured runs per startup/import probe", + ) + parser.add_argument( + "--no-startup-probes", + action="store_true", + help="Skip Python/import startup probes.", + ) parser.add_argument( "--tmp-dir", type=Path, @@ -534,6 +857,8 @@ def main() -> int: raise SystemExit("--runs must be > 0") if args.warmups < 0: raise SystemExit("--warmups must be >= 0") + if args.startup_runs <= 0: + raise SystemExit("--startup-runs must be > 0") if args.max_regression_pct < 0: raise SystemExit("--max-regression-pct must be >= 0") target = args.target.resolve() @@ -547,11 +872,10 @@ def main() -> int: shutil.rmtree(workspace) workspace.mkdir(parents=True, exist_ok=True) - scenarios = [ - Scenario(name="cold_full", mode="cold", extra_args=()), - Scenario(name="warm_full", mode="warm", extra_args=()), - Scenario(name="warm_clones_only", mode="warm", extra_args=("--skip-metrics",)), - ] + scenario_profile = str(args.scenario_profile) + if scenario_profile not in {"smoke", "extended", "diagnostic"}: + raise SystemExit(f"unknown scenario profile: {scenario_profile}") + scenarios = _scenario_profile(cast(BenchmarkProfile, scenario_profile)) scenario_results = [ _scenario_result( scenario=scenario, @@ -563,6 +887,18 @@ def main() -> int: ) for scenario in scenarios ] + startup_probe_results = ( + [ + _probe_result( + probe=probe, + python_executable=args.python_executable, + runs=args.startup_runs, + ) + for probe in _startup_probes() + ] + if not args.no_startup_probes + else [] + ) comparisons = _comparison_metrics(scenario_results) @@ -577,9 +913,13 @@ def main() -> int: "target": str(target), "runs": args.runs, "warmups": args.warmups, + "scenario_profile": scenario_profile, + "startup_runs": args.startup_runs, + "startup_probes": not args.no_startup_probes, "python_executable": args.python_executable, }, "environment": _environment(), + 
"startup_probes": startup_probe_results, "scenarios": scenario_results, "comparisons": comparisons, "generated_at_utc": datetime.now(timezone.utc) @@ -612,17 +952,44 @@ def main() -> int: print("CodeClone Docker benchmark") print(f"target={target}") - print(f"runs={args.runs} warmups={args.warmups}") + print( + f"profile={scenario_profile} runs={args.runs} " + f"warmups={args.warmups} startup_runs={args.startup_runs}" + ) + if startup_probe_results: + print("startup probes:") + for probe in startup_probe_results: + name = str(probe["name"]) + stats = probe["stats_seconds"] + cpu_stats = probe["child_cpu_stats_seconds"] + assert isinstance(stats, dict) + assert isinstance(cpu_stats, dict) + print( + f"- {name:22s} median={_as_float(stats['median']):.4f}s " + f"first={_as_float(probe['first_seconds']):.4f}s " + f"cpu={_as_float(cpu_stats['median']):.4f}s" + ) for scenario in scenario_results: name = str(scenario["name"]) stats = scenario["stats_seconds"] + cpu_stats = scenario["child_cpu_stats_seconds"] + inventory = scenario["inventory_sample"] + exit_counts = scenario["exit_code_counts"] assert isinstance(stats, dict) + assert isinstance(cpu_stats, dict) + assert isinstance(inventory, dict) + assert isinstance(exit_counts, dict) median_s = float(stats["median"]) p95_s = float(stats["p95"]) stdev_s = float(stats["stdev"]) + cpu_median_s = float(cpu_stats["median"]) print( - f"- {name:16s} median={median_s:.4f}s " + f"- {name:20s} median={median_s:.4f}s " f"p95={p95_s:.4f}s stdev={stdev_s:.4f}s " + f"cpu={cpu_median_s:.4f}s " + f"files={inventory.get('analyzed', 0)}/{inventory.get('cached', 0)} " + f"artifacts={_as_float(scenario['artifact_total_kib_sample']):.1f}KiB " + f"exit={exit_counts} " f"digest={scenario['digest']}" ) _print_bulleted_lines( diff --git a/benchmarks/run_docker_benchmark.sh b/benchmarks/run_docker_benchmark.sh index c828a804..f4af6831 100755 --- a/benchmarks/run_docker_benchmark.sh +++ b/benchmarks/run_docker_benchmark.sh @@ -10,6 +10,8 @@ 
CPUS="${CPUS:-1.0}" MEMORY="${MEMORY:-2g}" RUNS="${RUNS:-12}" WARMUPS="${WARMUPS:-3}" +SCENARIO_PROFILE="${SCENARIO_PROFILE:-smoke}" +STARTUP_RUNS="${STARTUP_RUNS:-3}" HOST_UID="$(id -u)" HOST_GID="$(id -g)" CONTAINER_USER="${CONTAINER_USER:-${HOST_UID}:${HOST_GID}}" @@ -41,6 +43,8 @@ docker run \ --output "/bench-out/$OUTPUT_BASENAME" \ --runs "$RUNS" \ --warmups "$WARMUPS" \ + --scenario-profile "$SCENARIO_PROFILE" \ + --startup-runs "$STARTUP_RUNS" \ "$@" echo "[bench] results: $OUT_DIR/$OUTPUT_BASENAME" diff --git a/codeclone/analysis/blast_radius.py b/codeclone/analysis/blast_radius.py index a778c39d..c6c33620 100644 --- a/codeclone/analysis/blast_radius.py +++ b/codeclone/analysis/blast_radius.py @@ -14,6 +14,8 @@ from typing import Final, Literal from ..paths.workspace import FORBIDDEN_WORKSPACE_GLOBS +from ..utils.coerce import as_mapping as _as_mapping +from ..utils.coerce import as_sequence as _as_sequence BlastRadiusDepth = Literal["direct", "transitive"] @@ -66,16 +68,6 @@ class BlastRadiusResult: guardrails: tuple[str, ...] 
-def _as_mapping(value: object) -> Mapping[str, object]: - return value if isinstance(value, Mapping) else {} - - -def _as_sequence(value: object) -> Sequence[object]: - if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)): - return value - return () - - def _as_int(value: object, default: int = 0) -> int: if isinstance(value, bool): return int(value) diff --git a/codeclone/analysis/fingerprint.py b/codeclone/analysis/fingerprint.py index dff7dbcc..ee4638b3 100644 --- a/codeclone/analysis/fingerprint.py +++ b/codeclone/analysis/fingerprint.py @@ -16,6 +16,7 @@ NormalizationConfig, normalized_ast_dump_from_list, ) +from .phase_ledger import INERT_PHASE_LEDGER, AnalysisPhaseKey, PhaseLedger def sha1(s: str) -> str: @@ -37,6 +38,8 @@ def _cfg_fingerprint_and_complexity( node: _qualnames.FunctionNode, cfg: NormalizationConfig, qualname: str, + *, + phase_ledger: PhaseLedger = INERT_PHASE_LEDGER, ) -> tuple[str, int]: """ Generate a structural fingerprint for a function using CFG analysis. 
@@ -60,7 +63,8 @@ def _cfg_fingerprint_and_complexity( 40-character hex SHA-1 hash of the normalized CFG """ builder = CFGBuilder() - graph = builder.build(qualname, node) + with phase_ledger.phase(AnalysisPhaseKey.UNIT_CFG): + graph = builder.build(qualname, node) cfg_normalizer = AstNormalizer(cfg) # Use generator to avoid building large list of strings @@ -69,11 +73,12 @@ def _cfg_fingerprint_and_complexity( succ_ids = ",".join( str(s.id) for s in sorted(block.successors, key=lambda s: s.id) ) - block_dump = normalized_ast_dump_from_list( - block.statements, - cfg, - normalizer=cfg_normalizer, - ) + with phase_ledger.phase(AnalysisPhaseKey.UNIT_NORMALIZE_CFG): + block_dump = normalized_ast_dump_from_list( + block.statements, + cfg, + normalizer=cfg_normalizer, + ) parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}") return sha1("|".join(parts)), cyclomatic_complexity(graph) diff --git a/codeclone/analysis/phase_ledger.py b/codeclone/analysis/phase_ledger.py new file mode 100644 index 00000000..96ff1979 --- /dev/null +++ b/codeclone/analysis/phase_ledger.py @@ -0,0 +1,199 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from dataclasses import dataclass, fields +from enum import Enum +from time import perf_counter_ns +from types import TracebackType +from typing import Literal + + +class AnalysisPhaseKey(str, Enum): + PARSE = "parse" + QUALNAME = "qualname" + MODULE_WALK = "module_walk" + RELATIONSHIP = "relationship" + SUPPRESSIONS = "suppressions" + UNIT_CFG = "unit_cfg" + UNIT_NORMALIZE_CFG = "unit_normalize_cfg" + UNIT_STRUCTURAL = "unit_structural" + UNIT_NORMALIZE_STMT = "unit_normalize_stmt" + UNIT_BLOCKS = "unit_blocks" + UNIT_SEGMENTS = "unit_segments" + CLASS_METRICS = "class_metrics" + DEAD_CODE = "dead_code" + MODULE_PASSES = "module_passes" + + +class AnalysisVolumeKey(str, Enum): + FILES_TIMED = "files_timed" + UNITS_SEEN = "units_seen" + UNITS_ELIGIBLE = "units_eligible" + UNITS_FINGERPRINTED = "units_fingerprinted" + BLOCKS_EMITTED = "blocks_emitted" + SEGMENTS_EMITTED = "segments_emitted" + + +PHASE_US_COUNTER_SUFFIXES: tuple[str, ...] = tuple( + f"phase_{key.value}_us" for key in AnalysisPhaseKey +) +PHASE_VOLUME_COUNTER_SUFFIXES: tuple[str, ...] 
= tuple( + key.value for key in AnalysisVolumeKey +) + + +@dataclass(frozen=True, slots=True) +class PhaseTotals: + parse_ns: int = 0 + qualname_ns: int = 0 + module_walk_ns: int = 0 + relationship_ns: int = 0 + suppressions_ns: int = 0 + unit_cfg_ns: int = 0 + unit_normalize_cfg_ns: int = 0 + unit_structural_ns: int = 0 + unit_normalize_stmt_ns: int = 0 + unit_blocks_ns: int = 0 + unit_segments_ns: int = 0 + class_metrics_ns: int = 0 + dead_code_ns: int = 0 + module_passes_ns: int = 0 + + def merge(self, other: PhaseTotals) -> PhaseTotals: + return PhaseTotals( + **{ + field.name: getattr(self, field.name) + getattr(other, field.name) + for field in fields(self) + } + ) + + def counter_map_us(self) -> dict[str, int]: + return { + f"phase_{key.value}_us": getattr(self, f"{key.value}_ns") // 1000 + for key in AnalysisPhaseKey + } + + +@dataclass(frozen=True, slots=True) +class PhaseSnapshot: + totals: PhaseTotals + volumes: tuple[tuple[str, int], ...] + + @classmethod + def empty(cls) -> PhaseSnapshot: + return cls(totals=PhaseTotals(), volumes=()) + + def merge(self, other: PhaseSnapshot) -> PhaseSnapshot: + merged_volumes = self.volume_map() + for key, value in other.volumes: + merged_volumes[key] = merged_volumes.get(key, 0) + value + return PhaseSnapshot( + totals=self.totals.merge(other.totals), + volumes=tuple(sorted(merged_volumes.items())), + ) + + def volume_map(self) -> dict[str, int]: + return dict(self.volumes) + + +class _InertPhaseContext: + __slots__ = () + + def __enter__(self) -> None: + return None + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> Literal[False]: + return False + + +_INERT_PHASE_CONTEXT = _InertPhaseContext() + + +class _ActivePhaseContext: + __slots__ = ("_key", "_ledger", "_started_ns") + + def __init__(self, ledger: PhaseLedger, key: AnalysisPhaseKey) -> None: + self._ledger = ledger + self._key = key + self._started_ns: int | None = None + + def 
__enter__(self) -> None: + self._started_ns = perf_counter_ns() + return None + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + tb: TracebackType | None, + ) -> Literal[False]: + started = self._started_ns + if started is not None: + self._ledger._add_elapsed(self._key, perf_counter_ns() - started) + return False + + +class PhaseLedger: + __slots__ = ("_active", "_totals", "_volumes") + + def __init__(self, *, active: bool) -> None: + self._active = active + self._totals: dict[AnalysisPhaseKey, int] = {} + self._volumes: dict[AnalysisVolumeKey, int] = {} + + @property + def active(self) -> bool: + return self._active + + def phase(self, key: AnalysisPhaseKey) -> _InertPhaseContext | _ActivePhaseContext: + if not isinstance(key, AnalysisPhaseKey): + raise TypeError("phase key must be an AnalysisPhaseKey") + if not self._active: + return _INERT_PHASE_CONTEXT + return _ActivePhaseContext(self, key) + + def add_volume(self, key: AnalysisVolumeKey, value: int = 1) -> None: + if not isinstance(key, AnalysisVolumeKey): + raise TypeError("volume key must be an AnalysisVolumeKey") + if not self._active: + return + self._volumes[key] = self._volumes.get(key, 0) + value + + def snapshot(self) -> PhaseSnapshot: + totals = PhaseTotals( + **{f"{key.value}_ns": self._totals.get(key, 0) for key in AnalysisPhaseKey} + ) + return PhaseSnapshot( + totals=totals, + volumes=tuple( + sorted((key.value, value) for key, value in self._volumes.items()) + ), + ) + + def _add_elapsed(self, key: AnalysisPhaseKey, elapsed_ns: int) -> None: + self._totals[key] = self._totals.get(key, 0) + elapsed_ns + + +INERT_PHASE_LEDGER = PhaseLedger(active=False) + + +__all__ = [ + "INERT_PHASE_LEDGER", + "PHASE_US_COUNTER_SUFFIXES", + "PHASE_VOLUME_COUNTER_SUFFIXES", + "AnalysisPhaseKey", + "AnalysisVolumeKey", + "PhaseLedger", + "PhaseSnapshot", + "PhaseTotals", +] diff --git a/codeclone/analysis/reachability.py b/codeclone/analysis/reachability.py index 
81c04cb0..f381aa16 100644 --- a/codeclone/analysis/reachability.py +++ b/codeclone/analysis/reachability.py @@ -24,8 +24,11 @@ _FASTAPI_ROUTE_METHODS = { "api_route", "delete", + "exception_handler", "get", "head", + "middleware", + "on_event", "options", "patch", "post", @@ -126,6 +129,101 @@ "process_literal_param", "process_result_value", } +_PYDANTIC_JSON_SCHEMA_SYMBOLS = { + "pydantic.json_schema.GenerateJsonSchema", +} +_PYDANTIC_JSON_SCHEMA_HOOKS = { + "any_schema", + "arguments_schema", + "arguments_v3_schema", + "bool_schema", + "bytes_schema", + "call_schema", + "callable_schema", + "chain_schema", + "complex_schema", + "computed_field_schema", + "custom_error_schema", + "dataclass_args_schema", + "dataclass_field_schema", + "dataclass_schema", + "date_schema", + "datetime_schema", + "decimal_schema", + "default_schema", + "definition_ref_schema", + "definitions_schema", + "dict_schema", + "enum_schema", + "float_schema", + "frozenset_schema", + "function_after_schema", + "function_before_schema", + "function_plain_schema", + "function_wrap_schema", + "generator_schema", + "get_cache_defs_ref_schema", + "handle_invalid_for_json_schema", + "int_schema", + "invalid_schema", + "is_instance_schema", + "is_subclass_schema", + "json_or_python_schema", + "json_schema", + "kw_arguments_schema", + "lax_or_strict_schema", + "list_schema", + "literal_schema", + "missing_sentinel_schema", + "model_field_schema", + "model_fields_schema", + "model_schema", + "multi_host_url_schema", + "none_schema", + "nullable_schema", + "p_arguments_schema", + "resolve_ref_schema", + "ser_schema", + "set_schema", + "str_schema", + "tagged_union_schema", + "time_schema", + "timedelta_schema", + "tuple_positional_schema", + "tuple_schema", + "tuple_variable_schema", + "typed_dict_field_schema", + "typed_dict_schema", + "union_schema", + "url_schema", + "uuid_schema", +} +_STARLETTE_ROUTE_BASE_SYMBOLS = { + "starlette.routing.Route", +} +_FASTAPI_ROUTE_CLASS_BASE_SYMBOLS = { + 
"fastapi.routing.APIRoute", +} +_STARLETTE_ROUTE_HOOKS = { + "handle", + "matches", +} +_FASTAPI_ROUTE_CLASS_HOOKS = { + "get_path", + "get_response", + "handle", + "matches", +} +_STARLETTE_APP_BASE_SYMBOLS = { + "starlette.applications.Starlette", +} +_FASTAPI_APP_BASE_SYMBOLS = { + "fastapi.FastAPI", + "fastapi.applications.FastAPI", +} +_FRAMEWORK_APP_HOOKS = { + "build_middleware_stack", +} _TYPING_CAST_SYMBOLS = { "typing.cast", "typing_extensions.cast", @@ -627,6 +725,9 @@ def visit_ClassDef(self, node: ast.ClassDef) -> None: self._handle_dependency_injector_container(node) self._handle_starlette_base_http_middleware(node) self._handle_sqlalchemy_type_decorator(node) + self._handle_pydantic_generate_json_schema(node) + self._handle_starlette_route_subclass(node) + self._handle_framework_application_subclass(node) self.generic_visit(node) def visit_Assign(self, node: ast.Assign) -> None: @@ -964,12 +1065,7 @@ def _handle_starlette_base_http_middleware(self, node: ast.ClassDef) -> None: else f"{self._module_name}:{node.name}", ) - def _handle_sqlalchemy_type_decorator(self, node: ast.ClassDef) -> None: - if not any( - _resolve_symbol(base, self._aliases) in _SQLALCHEMY_TYPE_DECORATOR_SYMBOLS - for base in node.bases - ): - return + def _class_hook_context(self, node: ast.ClassDef) -> tuple[str, str]: class_target = self._class_targets.get(id(node)) class_qualname = ( class_target.qualname.split(":", 1)[-1] @@ -981,20 +1077,101 @@ def _handle_sqlalchemy_type_decorator(self, node: ast.ClassDef) -> None: if class_target is not None else f"{self._module_name}:{node.name}" ) + return class_qualname, source_qualname + + def _emit_subclass_runtime_hooks( + self, + *, + node: ast.ClassDef, + hook_names: set[str], + framework: RuntimeReachabilityFramework, + evidence: str, + evidence_symbol_prefix: str, + ) -> None: + class_qualname, source_qualname = self._class_hook_context(node) for method in self._methods_by_class.get(class_qualname, []): method_name = 
method.qualname.rsplit(".", 1)[-1] - if method_name not in _SQLALCHEMY_TYPE_DECORATOR_HOOKS: + if method_name not in hook_names: continue self._emit( target=method, - framework="sqlalchemy", + framework=framework, edge_kind="runtime_hook", confidence="medium", - evidence="SQLAlchemy TypeDecorator hook", - evidence_symbol=f"TypeDecorator.{method_name}", + evidence=evidence, + evidence_symbol=f"{evidence_symbol_prefix}.{method_name}", source_qualname=source_qualname, ) + def _handle_sqlalchemy_type_decorator(self, node: ast.ClassDef) -> None: + if not any( + _resolve_symbol(base, self._aliases) in _SQLALCHEMY_TYPE_DECORATOR_SYMBOLS + for base in node.bases + ): + return + self._emit_subclass_runtime_hooks( + node=node, + hook_names=set(_SQLALCHEMY_TYPE_DECORATOR_HOOKS), + framework="sqlalchemy", + evidence="SQLAlchemy TypeDecorator hook", + evidence_symbol_prefix="TypeDecorator", + ) + + def _handle_pydantic_generate_json_schema(self, node: ast.ClassDef) -> None: + if not any( + _resolve_symbol(base, self._aliases) in _PYDANTIC_JSON_SCHEMA_SYMBOLS + for base in node.bases + ): + return + self._emit_subclass_runtime_hooks( + node=node, + hook_names=set(_PYDANTIC_JSON_SCHEMA_HOOKS), + framework="pydantic", + evidence="Pydantic GenerateJsonSchema hook", + evidence_symbol_prefix="GenerateJsonSchema", + ) + + def _handle_starlette_route_subclass(self, node: ast.ClassDef) -> None: + resolved_bases = [_resolve_symbol(base, self._aliases) for base in node.bases] + is_starlette_route = any( + base in _STARLETTE_ROUTE_BASE_SYMBOLS for base in resolved_bases + ) + is_fastapi_route = any( + base in _FASTAPI_ROUTE_CLASS_BASE_SYMBOLS for base in resolved_bases + ) + if not is_starlette_route and not is_fastapi_route: + return + hook_names = set(_STARLETTE_ROUTE_HOOKS) + if is_fastapi_route: + hook_names |= _FASTAPI_ROUTE_CLASS_HOOKS + base_symbol = "APIRoute" if is_fastapi_route else "Route" + self._emit_subclass_runtime_hooks( + node=node, + hook_names=hook_names, + 
framework="fastapi" if is_fastapi_route else "starlette", + evidence=f"{base_symbol} runtime hook", + evidence_symbol_prefix=base_symbol, + ) + + def _handle_framework_application_subclass(self, node: ast.ClassDef) -> None: + resolved_bases = [_resolve_symbol(base, self._aliases) for base in node.bases] + is_starlette_app = any( + base in _STARLETTE_APP_BASE_SYMBOLS for base in resolved_bases + ) + is_fastapi_app = any( + base in _FASTAPI_APP_BASE_SYMBOLS for base in resolved_bases + ) + if not is_starlette_app and not is_fastapi_app: + return + base_symbol = "FastAPI" if is_fastapi_app else "Starlette" + self._emit_subclass_runtime_hooks( + node=node, + hook_names=set(_FRAMEWORK_APP_HOOKS), + framework="fastapi" if is_fastapi_app else "starlette", + evidence=f"{base_symbol} runtime hook", + evidence_symbol_prefix=base_symbol, + ) + def _handle_dependency_injector_container(self, node: ast.ClassDef) -> None: if not any( self._is_dependency_injector_container_base(base) for base in node.bases diff --git a/codeclone/analysis/units.py b/codeclone/analysis/units.py index b4fcf7d2..e6f2b791 100644 --- a/codeclone/analysis/units.py +++ b/codeclone/analysis/units.py @@ -7,7 +7,10 @@ from __future__ import annotations import ast +from collections.abc import Callable +from functools import partial from hashlib import sha1 as _sha1 +from typing import TypeVar from .. 
import qualnames as _qualnames from ..blocks import extract_blocks, extract_segments @@ -43,11 +46,19 @@ from .fingerprint import _cfg_fingerprint_and_complexity, bucket_loc from .normalizer import NormalizationConfig, stmt_hashes from .parser import PARSE_TIMEOUT_SECONDS, _parse_with_limits +from .phase_ledger import ( + INERT_PHASE_LEDGER, + AnalysisPhaseKey, + AnalysisVolumeKey, + PhaseLedger, +) from .reachability import collect_runtime_reachability from .security_surfaces import collect_security_surfaces __all__ = ["extract_units_and_stats_from_source"] +_TCloneUnit = TypeVar("_TCloneUnit", BlockUnit, SegmentUnit) + def _stmt_count(node: ast.AST) -> int: body = getattr(node, "body", None) @@ -86,6 +97,19 @@ def _eligible_unit_shape( return start, end, loc, stmt_count +def _collect_timed_clone_units( + *, + phase_ledger: PhaseLedger, + phase_key: AnalysisPhaseKey, + volume_key: AnalysisVolumeKey, + collect: Callable[[], list[_TCloneUnit]], +) -> list[_TCloneUnit]: + with phase_ledger.phase(phase_key): + items = collect() + phase_ledger.add_volume(volume_key, len(items)) + return items + + def extract_units_and_stats_from_source( source: str, filepath: str, @@ -101,6 +125,7 @@ def extract_units_and_stats_from_source( collect_structural_findings: bool = True, collect_api_surface: bool = False, api_include_private_modules: bool = False, + phase_ledger: PhaseLedger = INERT_PHASE_LEDGER, ) -> tuple[ list[Unit], list[BlockUnit], @@ -110,26 +135,29 @@ def extract_units_and_stats_from_source( list[StructuralFindingGroup], ]: try: - tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) + with phase_ledger.phase(AnalysisPhaseKey.PARSE): + tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) except SyntaxError as e: raise ParseError(f"Failed to parse {filepath}: {e}") from e if not isinstance(tree, ast.Module): raise ParseError(f"Failed to parse {filepath}: expected module AST root") collector = _qualnames.QualnameCollector() - collector.visit(tree) + with 
phase_ledger.phase(AnalysisPhaseKey.QUALNAME): + collector.visit(tree) source_lines = source.splitlines() source_line_count = len(source_lines) is_test_file = is_test_filepath(filepath) # Single-pass AST walk replaces 3 separate functions / 4 walks. - _walk = _collect_module_walk_data( - tree=tree, - module_name=module_name, - collector=collector, - collect_referenced_names=not is_test_file, - ) + with phase_ledger.phase(AnalysisPhaseKey.MODULE_WALK): + _walk = _collect_module_walk_data( + tree=tree, + module_name=module_name, + collector=collector, + collect_referenced_names=not is_test_file, + ) import_names = _walk.import_names module_deps = _walk.module_deps referenced_names = _walk.referenced_names @@ -139,20 +167,22 @@ def extract_units_and_stats_from_source( non_runtime_decorator_aliases = _walk.non_runtime_decorator_aliases pydantic_module_aliases = _walk.pydantic_module_aliases cohesion_ignored_decorator_aliases = _walk.cohesion_ignored_decorator_aliases - function_relationship_facts = _collect_function_relationship_facts( - tree=tree, - module_name=module_name, - filepath=filepath, - collector=collector, - origin_lane="test" if is_test_file else "production", - ) + with phase_ledger.phase(AnalysisPhaseKey.RELATIONSHIP): + function_relationship_facts = _collect_function_relationship_facts( + tree=tree, + module_name=module_name, + filepath=filepath, + collector=collector, + origin_lane="test" if is_test_file else "production", + ) - suppression_index = _build_suppression_index_for_source( - source=source, - filepath=filepath, - module_name=module_name, - collector=collector, - ) + with phase_ledger.phase(AnalysisPhaseKey.SUPPRESSIONS): + suppression_index = _build_suppression_index_for_source( + source=source, + filepath=filepath, + module_name=module_name, + collector=collector, + ) class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) module_import_names = set(import_names) module_class_names = set(class_names) @@ -164,6 
+194,7 @@ def extract_units_and_stats_from_source( structural_findings: list[StructuralFindingGroup] = [] for local_name, node in collector.units: + phase_ledger.add_volume(AnalysisVolumeKey.UNITS_SEEN) unit_shape = _eligible_unit_shape( node, min_loc=min_loc, @@ -171,16 +202,24 @@ def extract_units_and_stats_from_source( ) if unit_shape is None: continue + phase_ledger.add_volume(AnalysisVolumeKey.UNITS_ELIGIBLE) start, end, loc, stmt_count = unit_shape qualname = f"{module_name}:{local_name}" - fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname) - structure_facts = scan_function_structure( + fingerprint, complexity = _cfg_fingerprint_and_complexity( node, - filepath, + cfg, qualname, - collect_findings=collect_structural_findings, + phase_ledger=phase_ledger, ) + phase_ledger.add_volume(AnalysisVolumeKey.UNITS_FINGERPRINTED) + with phase_ledger.phase(AnalysisPhaseKey.UNIT_STRUCTURAL): + structure_facts = scan_function_structure( + node, + filepath, + qualname, + collect_findings=collect_structural_findings, + ) depth = structure_facts.nesting_depth risk = risk_level(complexity) raw_hash = _raw_source_hash_for_range(source_lines, start, end) @@ -223,11 +262,16 @@ def extract_units_and_stats_from_source( body = getattr(node, "body", None) hashes: list[str] | None = None if isinstance(body, list): - hashes = stmt_hashes(body, cfg) + with phase_ledger.phase(AnalysisPhaseKey.UNIT_NORMALIZE_STMT): + hashes = stmt_hashes(body, cfg) if needs_blocks: - block_units.extend( - extract_blocks( + blocks = _collect_timed_clone_units( + phase_ledger=phase_ledger, + phase_key=AnalysisPhaseKey.UNIT_BLOCKS, + volume_key=AnalysisVolumeKey.BLOCKS_EMITTED, + collect=partial( + extract_blocks, node, filepath=filepath, qualname=qualname, @@ -235,12 +279,17 @@ def extract_units_and_stats_from_source( block_size=4, max_blocks=15, precomputed_hashes=hashes, - ) + ), ) + block_units.extend(blocks) if needs_segments: - segment_units.extend( - extract_segments( + 
segments = _collect_timed_clone_units( + phase_ledger=phase_ledger, + phase_key=AnalysisPhaseKey.UNIT_SEGMENTS, + volume_key=AnalysisVolumeKey.SEGMENTS_EMITTED, + collect=partial( + extract_segments, node, filepath=filepath, qualname=qualname, @@ -248,42 +297,45 @@ def extract_units_and_stats_from_source( window_size=6, max_segments=60, precomputed_hashes=hashes, - ) + ), ) + segment_units.extend(segments) if collect_structural_findings: structural_findings.extend(structure_facts.structural_findings) - for class_qualname, class_node in collector.class_nodes: - cohesion_ignored_methods = _cohesion_ignored_method_names( - class_node, + with phase_ledger.phase(AnalysisPhaseKey.CLASS_METRICS): + for class_qualname, class_node in collector.class_nodes: + cohesion_ignored_methods = _cohesion_ignored_method_names( + class_node, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, + pydantic_module_aliases=pydantic_module_aliases, + cohesion_ignored_decorator_aliases=cohesion_ignored_decorator_aliases, + ) + class_metric = _class_metrics_for_node( + module_name=module_name, + class_qualname=class_qualname, + class_node=class_node, + filepath=filepath, + module_import_names=module_import_names, + module_class_names=module_class_names, + cohesion_ignored_methods=cohesion_ignored_methods, + ) + if class_metric is not None: + class_metrics.append(class_metric) + + with phase_ledger.phase(AnalysisPhaseKey.DEAD_CODE): + dead_candidates = _collect_dead_candidates( + filepath=filepath, + module_name=module_name, + collector=collector, protocol_symbol_aliases=protocol_symbol_aliases, protocol_module_aliases=protocol_module_aliases, + non_runtime_decorator_aliases=non_runtime_decorator_aliases, pydantic_module_aliases=pydantic_module_aliases, - cohesion_ignored_decorator_aliases=cohesion_ignored_decorator_aliases, - ) - class_metric = _class_metrics_for_node( - module_name=module_name, - class_qualname=class_qualname, - 
class_node=class_node, - filepath=filepath, - module_import_names=module_import_names, - module_class_names=module_class_names, - cohesion_ignored_methods=cohesion_ignored_methods, + suppression_rules_by_target=suppression_index, ) - if class_metric is not None: - class_metrics.append(class_metric) - - dead_candidates = _collect_dead_candidates( - filepath=filepath, - module_name=module_name, - collector=collector, - protocol_symbol_aliases=protocol_symbol_aliases, - protocol_module_aliases=protocol_module_aliases, - non_runtime_decorator_aliases=non_runtime_decorator_aliases, - pydantic_module_aliases=pydantic_module_aliases, - suppression_rules_by_target=suppression_index, - ) sorted_class_metrics = tuple( sorted( @@ -296,34 +348,35 @@ def extract_units_and_stats_from_source( ), ) ) - typing_coverage, docstring_coverage = collect_module_adoption( - tree=tree, - module_name=module_name, - filepath=filepath, - collector=collector, - imported_names=import_names, - ) - api_surface = None - if collect_api_surface: - api_surface = collect_module_api_surface( + with phase_ledger.phase(AnalysisPhaseKey.MODULE_PASSES): + typing_coverage, docstring_coverage = collect_module_adoption( tree=tree, module_name=module_name, filepath=filepath, collector=collector, imported_names=import_names, - include_private_modules=api_include_private_modules, ) - security_surfaces = collect_security_surfaces( - tree=tree, - module_name=module_name, - filepath=filepath, - ) - runtime_reachability = collect_runtime_reachability( - tree=tree, - module_name=module_name, - filepath=filepath, - collector=collector, - ) + api_surface = None + if collect_api_surface: + api_surface = collect_module_api_surface( + tree=tree, + module_name=module_name, + filepath=filepath, + collector=collector, + imported_names=import_names, + include_private_modules=api_include_private_modules, + ) + security_surfaces = collect_security_surfaces( + tree=tree, + module_name=module_name, + filepath=filepath, + ) + 
runtime_reachability = collect_runtime_reachability( + tree=tree, + module_name=module_name, + filepath=filepath, + collector=collector, + ) return ( units, diff --git a/codeclone/audit/__init__.py b/codeclone/audit/__init__.py index 7d475d64..ae0f3283 100644 --- a/codeclone/audit/__init__.py +++ b/codeclone/audit/__init__.py @@ -12,6 +12,7 @@ AUDIT_EVENT_CORE_VERSION, EVENT_ANALYSIS_COMPLETED, EVENT_BASELINE_ABUSE, + EVENT_BLAST_ARTIFACT_CREATED, EVENT_BLAST_RADIUS, EVENT_CLAIM_COMPLETED, EVENT_CLAIM_VIOLATED, @@ -27,6 +28,7 @@ EVENT_INTENT_VIOLATED, EVENT_PATCH_BUDGET, EVENT_PATCH_EXPIRED, + EVENT_PATCH_TRAIL_COMPUTED, EVENT_PATCH_VERIFIED, EVENT_PATCH_VIOLATED, EVENT_RECEIPT_CREATED, @@ -67,6 +69,7 @@ "DEFAULT_AUDIT_TOKEN_ESTIMATOR", "EVENT_ANALYSIS_COMPLETED", "EVENT_BASELINE_ABUSE", + "EVENT_BLAST_ARTIFACT_CREATED", "EVENT_BLAST_RADIUS", "EVENT_CLAIM_COMPLETED", "EVENT_CLAIM_VIOLATED", @@ -82,6 +85,7 @@ "EVENT_INTENT_VIOLATED", "EVENT_PATCH_BUDGET", "EVENT_PATCH_EXPIRED", + "EVENT_PATCH_TRAIL_COMPUTED", "EVENT_PATCH_VERIFIED", "EVENT_PATCH_VIOLATED", "EVENT_RECEIPT_CREATED", diff --git a/codeclone/audit/events.py b/codeclone/audit/events.py index 30a55c15..52801a7c 100644 --- a/codeclone/audit/events.py +++ b/codeclone/audit/events.py @@ -35,6 +35,7 @@ EVENT_WORKSPACE_CONFLICT = "workspace.conflict_detected" EVENT_WORKSPACE_GC = "workspace.gc_completed" EVENT_BLAST_RADIUS = "blast_radius.computed" +EVENT_BLAST_ARTIFACT_CREATED = "blast_artifact.created" EVENT_PATCH_BUDGET = "patch_budget.computed" EVENT_PATCH_VERIFIED = "patch_contract.verified" EVENT_PATCH_VIOLATED = "patch_contract.violated" @@ -66,6 +67,7 @@ EVENT_WORKSPACE_CONFLICT, EVENT_WORKSPACE_GC, EVENT_BLAST_RADIUS, + EVENT_BLAST_ARTIFACT_CREATED, EVENT_PATCH_BUDGET, EVENT_PATCH_VERIFIED, EVENT_PATCH_VIOLATED, @@ -83,6 +85,17 @@ # Compact mode keeps the intent description as a bounded forensic field. 
_COMPACT_TEXT_LIMIT = 500 + +# Forensic-retention policy: payload compaction never strips these +# event types. They are durable evidence that must survive auto_clear and stay +# exactly retrievable after the run/intent is cleared (review-receipt drill-down +# via get_review_receipt; full forensic patch trail via get_patch_trail; start +# blast drill-down via get_blast_artifact). Their complete payload is preserved +# under every payload mode; only the separately bounded event-core/replay +# projection applies. +_FULL_PAYLOAD_EVENT_TYPES: frozenset[str] = frozenset( + {EVENT_RECEIPT_CREATED, EVENT_PATCH_TRAIL_COMPUTED, EVENT_BLAST_ARTIFACT_CREATED} +) _EVENT_CORE_SCOPE_PATH_LIMIT = 50 _EVENT_CORE_CITATION_LIMIT = 32 _PROJECTION_SUPPLEMENT_FACT_KEYS = frozenset( @@ -202,6 +215,10 @@ def compact_payload_for_event( ) -> dict[str, object]: if payload is None: return {} + if event_type in _FULL_PAYLOAD_EVENT_TYPES: + # Forensic-retention policy: preserve the complete payload (e.g. the full + # typed review receipt) so it stays exactly retrievable post-clear. 
+ return dict(payload) if event_type in _INTENT_PAYLOAD_EVENTS: return _compact_intent_payload(payload) if event_type == EVENT_INTENT_QUEUE_BLOCKED: @@ -235,6 +252,8 @@ def compact_payload_for_event( } if event_type == EVENT_BLAST_RADIUS: return _compact_blast_radius_payload(payload) + if event_type == EVENT_BLAST_ARTIFACT_CREATED: + return _compact_blast_artifact_payload(payload) if event_type == EVENT_ANALYSIS_COMPLETED: return _compact_analysis_completed_payload(payload) if event_type == EVENT_PATCH_BUDGET: @@ -252,18 +271,6 @@ def compact_payload_for_event( "violations": len(_sequence(payload.get("violations"))), "warnings": len(_sequence(payload.get("warnings"))), } - if event_type == EVENT_RECEIPT_CREATED: - receipt = _mapping(payload.get("receipt")) - return { - "format": str(payload.get("format", "")), - "verdict": str(receipt.get("verdict", "")), - "human_decisions": _sequence_field_count( - receipt, - "human_decision_points", - ), - } - if event_type == EVENT_PATCH_TRAIL_COMPUTED: - return _compact_patch_trail_payload(payload) return _compact_identifiers(payload) @@ -313,6 +320,8 @@ def _event_core_facts( }, False if event_type == EVENT_BLAST_RADIUS: return _compact_blast_radius_payload(payload), False + if event_type == EVENT_BLAST_ARTIFACT_CREATED: + return _blast_artifact_event_core_facts(payload) if event_type == EVENT_ANALYSIS_COMPLETED: return _compact_analysis_completed_payload(payload), False if event_type == EVENT_PATCH_BUDGET: @@ -447,11 +456,19 @@ def _summary_receipt_created(payload: Mapping[str, object]) -> str: return f"review receipt: {verdict}" +def _summary_blast_artifact_created(payload: Mapping[str, object]) -> str: + blast = _mapping(payload.get("blast_radius")) + artifact_id = str(payload.get("blast_artifact_id", "")).strip() or "unknown" + radius_level = str(blast.get("radius_level", "")).strip() or "unknown" + return f"blast artifact: {artifact_id}; radius={radius_level}" + + # Incident events whose summary needs bespoke per-type 
field extraction. _INCIDENT_BUILDERS: dict[str, Callable[[Mapping[str, object]], str]] = { EVENT_PATCH_VIOLATED: _summary_patch_violated, EVENT_BASELINE_ABUSE: _summary_baseline_abuse, EVENT_RECEIPT_CREATED: _summary_receipt_created, + EVENT_BLAST_ARTIFACT_CREATED: _summary_blast_artifact_created, } @@ -570,6 +587,31 @@ def _compact_blast_radius_payload(payload: Mapping[str, object]) -> dict[str, ob } +def _compact_blast_artifact_payload( + payload: Mapping[str, object], +) -> dict[str, object]: + blast = _mapping(payload.get("blast_radius")) + projection = _mapping(payload.get("projection_digest")) + return { + "blast_artifact_id": str(payload.get("blast_artifact_id", "")), + "run_id": str(payload.get("run_id", "")), + "projection_digest": str(projection.get("value", "")), + "detail_contract_version": str(payload.get("detail_contract_version", "")), + "radius_level": str(blast.get("radius_level", "")), + "direct_dependents": len(_sequence(blast.get("direct_dependents"))), + "transitive_dependents": len(_sequence(blast.get("transitive_dependents"))), + "clone_cohort_members": len(_sequence(blast.get("clone_cohort_members"))), + "do_not_touch": len(_sequence(blast.get("do_not_touch"))), + "review_context": len(_sequence(blast.get("review_context"))), + } + + +def _blast_artifact_event_core_facts( + payload: Mapping[str, object], +) -> tuple[dict[str, object], bool]: + return _compact_blast_artifact_payload(payload), False + + def _compact_budget_payload(payload: Mapping[str, object]) -> dict[str, object]: blast = _mapping(payload.get("blast_radius_summary")) gate = _mapping(payload.get("gate_preview")) @@ -582,22 +624,6 @@ def _compact_budget_payload(payload: Mapping[str, object]) -> dict[str, object]: } -def _compact_patch_trail_payload(payload: Mapping[str, object]) -> dict[str, object]: - counts = _patch_trail_counts(payload) - truncation = _mapping(payload.get("truncation")) - return { - "patch_trail_digest": str(payload.get("patch_trail_digest", "")), - 
"scope_check_status": str(payload.get("scope_check_status", "")), - "verification_status": str(payload.get("verification_status", "")), - "declared": _int_value(counts.get("declared")), - "changed": _int_value(counts.get("changed")), - "untouched_in_declared": _int_value(counts.get("untouched_in_declared")), - "unexpected": _int_value(counts.get("unexpected")), - "forbidden_touched": _int_value(counts.get("forbidden_touched")), - "truncation": bool(any(bool(value) for value in truncation.values())), - } - - def _patch_trail_event_core_facts( payload: Mapping[str, object], ) -> tuple[dict[str, object], bool]: @@ -774,6 +800,7 @@ def _payload_source(payload: Mapping[str, object] | None) -> str: "AUDIT_EVENT_CORE_VERSION", "EVENT_ANALYSIS_COMPLETED", "EVENT_BASELINE_ABUSE", + "EVENT_BLAST_ARTIFACT_CREATED", "EVENT_BLAST_RADIUS", "EVENT_CLAIM_COMPLETED", "EVENT_CLAIM_VIOLATED", diff --git a/codeclone/audit/reader.py b/codeclone/audit/reader.py index c0ba8c52..c47c1c6a 100644 --- a/codeclone/audit/reader.py +++ b/codeclone/audit/reader.py @@ -8,20 +8,26 @@ import json import sqlite3 -from collections.abc import Mapping +from collections.abc import Callable, Mapping from dataclasses import dataclass from pathlib import Path +from typing import TypeVar from ..utils.utc_timestamps import age_seconds_since_utc_timestamp from .events import ( ANALYSIS_SOURCE_CLI, ANALYSIS_SOURCE_MCP, EVENT_ANALYSIS_COMPLETED, + EVENT_BLAST_ARTIFACT_CREATED, + EVENT_PATCH_TRAIL_COMPUTED, + EVENT_RECEIPT_CREATED, repo_root_digest, ) from .schema import get_meta, open_audit_db_readonly from .validation import AuditReadError, AuditSchemaError +_ArtifactT = TypeVar("_ArtifactT") + @dataclass(frozen=True, slots=True) class AnalysisRunSnapshot: @@ -60,6 +66,408 @@ class AuditRecord: payload_json: str | None = None +@dataclass(frozen=True, slots=True) +class StoredReviewReceipt: + """A review receipt recovered from the durable audit trail. 
+ + It is the exact payload persisted when the receipt was created + (``controller_events`` ``review_receipt.created``); it survives + ``auto_clear`` and is never re-derived from current state. + """ + + run_id: str | None + receipt_digest: str | None + verdict: str | None + receipt_version: str | None + receipt_format: str | None + created_at_utc: str + payload: Mapping[str, object] + + +@dataclass(frozen=True, slots=True) +class ReviewReceiptLookup: + """Fail-closed result of a durable review-receipt lookup.""" + + status: str + receipt: StoredReviewReceipt | None = None + match_count: int = 0 + + +@dataclass(frozen=True, slots=True) +class StoredPatchTrail: + """A patch trail recovered from the durable audit trail. + + It is the exact full forensic payload persisted when the trail was computed + (``controller_events`` ``patch_trail.computed``); it survives ``auto_clear`` + and is never re-derived from current state. + """ + + run_id: str | None + patch_trail_digest: str | None + scope_check_status: str | None + verification_status: str | None + schema_version: str | None + created_at_utc: str + payload: Mapping[str, object] + + +@dataclass(frozen=True, slots=True) +class PatchTrailLookup: + """Fail-closed result of a durable patch-trail lookup.""" + + status: str + patch_trail: StoredPatchTrail | None = None + match_count: int = 0 + + +@dataclass(frozen=True, slots=True) +class StoredBlastArtifact: + """A start-time blast artifact recovered from the durable audit trail. + + It is the exact full blast projection persisted when start created its + safety-complete summary. It is never recomputed from the current run/state. 
+ """ + + run_id: str | None + blast_artifact_id: str + projection_digest: str | None + detail_contract_version: str | None + radius_level: str | None + created_at_utc: str + payload: Mapping[str, object] + + +@dataclass(frozen=True, slots=True) +class BlastArtifactLookup: + """Fail-closed result of a durable blast-artifact lookup.""" + + status: str + blast_artifact: StoredBlastArtifact | None = None + match_count: int = 0 + + +def lookup_review_receipt( + db_path: Path, + *, + run_id: str | None = None, + receipt_digest: str | None = None, +) -> ReviewReceiptLookup: + """Look up a durably stored review receipt by run id and/or receipt digest. + + Read-only and exact: returns the receipt payload exactly as persisted in the + audit trail, never re-derives it from current state. ``run_id`` matches the + stored short id or a full id that starts with it; ``receipt_digest`` is an + exact (non-prefix) match. Fail-closed: ``ambiguous`` when more than one + receipt matches, ``digest_mismatch`` when the run has receipts but none with + the requested digest, ``malformed_stored_receipt`` when the only matching + rows cannot be parsed. + """ + status, receipt, match_count = _lookup_audit_event_artifact( + db_path, + event_type=EVENT_RECEIPT_CREATED, + run_id=run_id, + digest=receipt_digest, + build=_build_review_receipt, + digest_of=_receipt_digest_value, + malformed_status="malformed_stored_receipt", + ) + return ReviewReceiptLookup(status=status, receipt=receipt, match_count=match_count) + + +def lookup_patch_trail( + db_path: Path, + *, + run_id: str | None = None, + patch_trail_digest: str | None = None, +) -> PatchTrailLookup: + """Look up a durably stored patch trail by run id and/or patch-trail digest. + + Read-only and exact, mirroring :func:`lookup_review_receipt`: returns the + full forensic patch-trail payload exactly as persisted in the audit trail + (``patch_trail.computed``), never re-derived. 
``run_id`` matches the stored + short id or a full id that starts with it; ``patch_trail_digest`` is an exact + match. Fail-closed: ``ambiguous`` when more than one trail matches, + ``digest_mismatch`` when the run has trails but none with the requested + digest, ``malformed_stored_patch_trail`` when the only matching rows cannot + be parsed. + """ + status, trail, match_count = _lookup_audit_event_artifact( + db_path, + event_type=EVENT_PATCH_TRAIL_COMPUTED, + run_id=run_id, + digest=patch_trail_digest, + build=_build_patch_trail, + digest_of=_patch_trail_digest_value, + malformed_status="malformed_stored_patch_trail", + ) + return PatchTrailLookup(status=status, patch_trail=trail, match_count=match_count) + + +def lookup_blast_artifact( + db_path: Path, + *, + run_id: str | None = None, + blast_artifact_id: str | None = None, + projection_digest: str | None = None, +) -> BlastArtifactLookup: + """Look up a durably stored start blast artifact by stable identity. + + Read-only and exact: returns the full blast projection exactly as persisted + in the audit trail (``blast_artifact.created``), never recomputed from the + current run. ``run_id`` matches the stored short id or a full id that starts + with it; ``blast_artifact_id`` and ``projection_digest`` are exact matches. + Fail-closed: ``ambiguous`` when the provided keys identify multiple + artifacts, ``digest_mismatch`` or ``artifact_id_mismatch`` when a run has + blast artifacts but none with the requested key, and + ``malformed_stored_blast_artifact`` when matching rows cannot be parsed. 
+ """ + status, artifact, match_count = _lookup_blast_artifact( + db_path, + run_id=run_id, + blast_artifact_id=blast_artifact_id, + projection_digest=projection_digest, + ) + return BlastArtifactLookup( + status=status, + blast_artifact=artifact, + match_count=match_count, + ) + + +def _lookup_audit_event_artifact( + db_path: Path, + *, + event_type: str, + run_id: str | None, + digest: str | None, + build: Callable[[object, object, Mapping[str, object]], _ArtifactT | None], + digest_of: Callable[[Mapping[str, object]], str | None], + malformed_status: str, +) -> tuple[str, _ArtifactT | None, int]: + """Fail-closed lookup of a durable audit-event artifact by run id and digest. + + Shared by the typed-artifact retrieval tools (review receipt, patch trail). + ``build`` turns one stored ``(run_id, created_at, payload)`` row into the + typed artifact (or ``None`` when the row is malformed); ``digest_of`` reads + the artifact's exact digest from its payload. Returns + ``(status, artifact, match_count)`` with ``status`` one of ``ok``, + ``not_found``, ``ambiguous``, ``digest_mismatch`` or ``malformed_status``. 
+ """ + status, parsed, malformed = _read_filtered_artifacts( + db_path, + event_type=event_type, + run_id=run_id, + build=build, + ) + if status is not None: + return ("not_found", None, 0) + if digest is not None: + candidates = [item for payload, item in parsed if digest_of(payload) == digest] + else: + candidates = [item for _payload, item in parsed] + if len(candidates) == 1: + return ("ok", candidates[0], 1) + if len(candidates) > 1: + return ("ambiguous", None, len(candidates)) + if digest is not None and parsed: + return ("digest_mismatch", None, 0) + if malformed and not parsed: + return (malformed_status, None, 0) + return ("not_found", None, 0) + + +def _lookup_blast_artifact( + db_path: Path, + *, + run_id: str | None, + blast_artifact_id: str | None, + projection_digest: str | None, +) -> tuple[str, StoredBlastArtifact | None, int]: + status, parsed_rows, malformed = _read_filtered_artifacts( + db_path, + event_type=EVENT_BLAST_ARTIFACT_CREATED, + run_id=run_id, + build=_build_blast_artifact, + ) + if status is not None: + return ("not_found", None, 0) + parsed = [artifact for _payload, artifact in parsed_rows] + candidates = parsed + if blast_artifact_id is not None: + candidates = [ + item for item in candidates if item.blast_artifact_id == blast_artifact_id + ] + if projection_digest is not None: + candidates = [ + item for item in candidates if item.projection_digest == projection_digest + ] + if len(candidates) == 1: + return ("ok", candidates[0], 1) + if len(candidates) > 1: + return ("ambiguous", None, len(candidates)) + if projection_digest is not None and parsed: + return ("digest_mismatch", None, 0) + if blast_artifact_id is not None and parsed: + return ("artifact_id_mismatch", None, 0) + if malformed and not parsed: + return ("malformed_stored_blast_artifact", None, 0) + return ("not_found", None, 0) + + +def _read_filtered_artifacts( + db_path: Path, + *, + event_type: str, + run_id: str | None, + build: Callable[[object, object, 
Mapping[str, object]], _ArtifactT | None], +) -> tuple[str | None, list[tuple[Mapping[str, object], _ArtifactT]], int]: + if not db_path.exists(): + return ("not_found", [], 0) + rows = _read_event_payload_rows(db_path, event_type) + filtered = [ + row for row in rows if run_id is None or _run_id_matches(row[0], run_id) + ] + if not filtered: + return ("not_found", [], 0) + parsed, malformed = _parsed_artifacts(filtered, build) + return (None, parsed, malformed) + + +def _parsed_artifacts( + rows: list[tuple[object, object, object]], + build: Callable[[object, object, Mapping[str, object]], _ArtifactT | None], +) -> tuple[list[tuple[Mapping[str, object], _ArtifactT]], int]: + parsed: list[tuple[Mapping[str, object], _ArtifactT]] = [] + malformed = 0 + for stored_run_id, created_at, payload_json in rows: + payload = _parse_payload_mapping(payload_json) + artifact = ( + None if payload is None else build(stored_run_id, created_at, payload) + ) + if payload is None or artifact is None: + malformed += 1 + continue + parsed.append((payload, artifact)) + return parsed, malformed + + +def _read_event_payload_rows( + db_path: Path, event_type: str +) -> list[tuple[object, object, object]]: + try: + conn = open_audit_db_readonly(db_path) + except (sqlite3.Error, AuditSchemaError, OSError) as exc: + raise AuditReadError(f"cannot open audit database: {exc}") from exc + try: + return conn.execute( + "SELECT run_id, created_at_utc, payload_json FROM controller_events " + "WHERE event_type = ? 
" + "ORDER BY created_at_utc DESC, id DESC", + (event_type,), + ).fetchall() + except (sqlite3.Error, AuditSchemaError) as exc: + raise AuditReadError(f"cannot read audit database: {exc}") from exc + finally: + conn.close() + + +def _run_id_matches(stored: object, requested: str) -> bool: + stored_id = _str_or_none(stored) + if not stored_id: + return False + return stored_id == requested or requested.startswith(stored_id) + + +def _parse_payload_mapping(payload_json: object) -> Mapping[str, object] | None: + if not isinstance(payload_json, str) or not payload_json: + return None + try: + payload = json.loads(payload_json) + except (ValueError, TypeError): + return None + if not isinstance(payload, Mapping): + return None + return payload + + +def _build_review_receipt( + stored_run_id: object, + created_at: object, + payload: Mapping[str, object], +) -> StoredReviewReceipt | None: + typed = payload.get("receipt") + if not isinstance(typed, Mapping): + return None + return StoredReviewReceipt( + run_id=_str_or_none(payload.get("run_id")) or _str_or_none(stored_run_id), + receipt_digest=_receipt_digest_value(payload), + verdict=_str_or_none(payload.get("verdict")) + or _str_or_none(typed.get("verdict")), + receipt_version=_str_or_none(payload.get("receipt_version")) + or _str_or_none(typed.get("receipt_version")), + receipt_format=_str_or_none(payload.get("format")), + created_at_utc=_str_or_none(created_at) or "", + payload=payload, + ) + + +def _build_patch_trail( + stored_run_id: object, + created_at: object, + payload: Mapping[str, object], +) -> StoredPatchTrail | None: + digest = _str_or_none(payload.get("patch_trail_digest")) + if digest is None: + return None + return StoredPatchTrail( + run_id=_str_or_none(stored_run_id), + patch_trail_digest=digest, + scope_check_status=_str_or_none(payload.get("scope_check_status")), + verification_status=_str_or_none(payload.get("verification_status")), + schema_version=_str_or_none(payload.get("schema_version")), + 
created_at_utc=_str_or_none(created_at) or "", + payload=payload, + ) + + +def _build_blast_artifact( + stored_run_id: object, + created_at: object, + payload: Mapping[str, object], +) -> StoredBlastArtifact | None: + artifact_id = _str_or_none(payload.get("blast_artifact_id")) + blast = payload.get("blast_radius") + if artifact_id is None or not isinstance(blast, Mapping): + return None + return StoredBlastArtifact( + run_id=_str_or_none(payload.get("run_id")) or _str_or_none(stored_run_id), + blast_artifact_id=artifact_id, + projection_digest=_blast_artifact_digest_value(payload), + detail_contract_version=_str_or_none(payload.get("detail_contract_version")), + radius_level=_str_or_none(blast.get("radius_level")), + created_at_utc=_str_or_none(created_at) or "", + payload=payload, + ) + + +def _receipt_digest_value(payload: Mapping[str, object]) -> str | None: + digest = payload.get("receipt_digest") + if isinstance(digest, Mapping): + return _str_or_none(digest.get("value")) + return _str_or_none(digest) + + +def _patch_trail_digest_value(payload: Mapping[str, object]) -> str | None: + return _str_or_none(payload.get("patch_trail_digest")) + + +def _blast_artifact_digest_value(payload: Mapping[str, object]) -> str | None: + digest = payload.get("projection_digest") + if isinstance(digest, Mapping): + return _str_or_none(digest.get("value")) + return _str_or_none(digest) + + @dataclass(frozen=True, slots=True) class TypeTokenProfile: """Token stats for one event type.""" @@ -767,11 +1175,20 @@ def _short_run_id(run_id: str | None, payload: Mapping[str, object]) -> str | No "AnalysisRunSnapshot", "AuditRecord", "AuditSummary", + "BlastArtifactLookup", + "PatchTrailLookup", "PayloadFootprint", + "ReviewReceiptLookup", + "StoredBlastArtifact", + "StoredPatchTrail", + "StoredReviewReceipt", "TopPayload", "TypeTokenProfile", "WorkflowTokenProfile", "count_audit_event_core_gaps", + "lookup_blast_artifact", + "lookup_patch_trail", + "lookup_review_receipt", 
"payload_footprint_to_dict", "read_audit_event_core_records", "read_audit_summary", diff --git a/codeclone/cache/entries.py b/codeclone/cache/entries.py index e8e0db9e..c8466a1e 100644 --- a/codeclone/cache/entries.py +++ b/codeclone/cache/entries.py @@ -393,6 +393,7 @@ def _as_runtime_reachability_framework(value: object) -> str | None: | "dependency_injector" | "django" | "fastapi" + | "pydantic" | "sqlalchemy" | "starlette" | "typer" diff --git a/codeclone/config/memory.py b/codeclone/config/memory.py index 3fd264c6..0a993c34 100644 --- a/codeclone/config/memory.py +++ b/codeclone/config/memory.py @@ -76,7 +76,7 @@ class SemanticConfig(BaseModel): - """Validated semantic-retrieval config (Phase 20). + """Validated semantic-retrieval config. The single validation authority for ``[tool.codeclone.memory.semantic]``: ``frozen`` + ``extra="forbid"`` reject unknown keys, bad literals, and @@ -125,7 +125,7 @@ def _apply_provider_defaults(cls, data: object) -> object: class IngestConfig(BaseModel): - """Validated memory ingest path config (Phase 18+). + """Validated memory ingest path config. Empty ``contract_constants_paths`` / ``document_link_paths`` enable registry-aware auto-discovery. MCP tool-count contradiction checks run diff --git a/codeclone/config/memory_defaults.py b/codeclone/config/memory_defaults.py index f9291f55..f587b76f 100644 --- a/codeclone/config/memory_defaults.py +++ b/codeclone/config/memory_defaults.py @@ -58,7 +58,7 @@ DEFAULT_INGEST_MCP_TOOL_SCHEMA_SNAPSHOT_PATH: Final[str | None] = None DEFAULT_INGEST_MCP_TOOL_COUNT_DOC_PATHS: Final[tuple[str, ...]] = () -# Semantic retrieval index (Phase 20). Default OFF + "diagnostic" keep the +# Semantic retrieval index. Default OFF + "diagnostic" keep the # community default zero-extra-dependency and offline; a real recall model is # opt-in (fastembed/community local, api/paid later). 
DEFAULT_SEMANTIC_ENABLED: Final = False diff --git a/codeclone/config/observability.py b/codeclone/config/observability.py index e9574ebf..84b15902 100644 --- a/codeclone/config/observability.py +++ b/codeclone/config/observability.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Platform observability configuration (Phase 29, Track A). +"""Platform observability configuration. Env-first resolution. Default OFF — when disabled, this does the minimal env check and never imports psutil, never opens a store, never parses a pyproject diff --git a/codeclone/contracts/__init__.py b/codeclone/contracts/__init__.py index 083c00e6..7aa6ee97 100644 --- a/codeclone/contracts/__init__.py +++ b/codeclone/contracts/__init__.py @@ -16,11 +16,23 @@ REPORT_SCHEMA_VERSION: Final = "2.11" METRICS_BASELINE_SCHEMA_VERSION: Final = "1.2" ENGINEERING_MEMORY_SCHEMA_VERSION: Final = "1.7" -# Semantic retrieval index (Phase 20). Derived, rebuildable sidecar — NOT +# Semantic retrieval index. Derived, rebuildable sidecar — NOT # covered by ENGINEERING_MEMORY_SCHEMA_VERSION. Bump to invalidate the index # on an incompatible projection/row-format change (forces a rebuild, not a -# SQLite migration). -SEMANTIC_INDEX_FORMAT_VERSION: Final = "2" +# SQLite migration). v3 (Stage 2) adds the ``source_revision`` row column; the +# one-time full rebuild is actually forced by the backend schema check, not by +# this constant — it stays in sync so the reported ``index_version`` is honest. +SEMANTIC_INDEX_FORMAT_VERSION: Final = "3" +# Global escape hatch for the cheap per-row ``source_revision`` key (Stage 2 +# incremental sourcing). Bump to invalidate EVERY semantic lane at once on a +# cross-cutting projection/row-format change. Per-source projection versions are +# folded into each source's content token instead, so a single-lane projector +# change re-embeds only that lane (see ``memory.semantic.projection``). 
+SEMANTIC_PROJECTION_REVISION_VERSION: Final = "1" +# Per-source projection versions folded into each source's ``source_revision`` +# content token. Trajectory reuses TRAJECTORY_PROJECTION_VERSION below. +MEMORY_PROJECTION_VERSION: Final = "memory-v1" +AUDIT_PROJECTION_VERSION: Final = "audit-v1" PATCH_TRAIL_SCHEMA_VERSION: Final = "1" # Platform observability sqlite store (.codeclone/db/platform_observability.sqlite3): # a runtime-profiling plane separate from audit/memory. Bump on an incompatible @@ -129,6 +141,7 @@ def cli_help_epilog() -> str: __all__ = [ + "AUDIT_PROJECTION_VERSION", "BASELINE_FINGERPRINT_VERSION", "BASELINE_SCHEMA_VERSION", "CACHE_VERSION", @@ -180,11 +193,13 @@ def cli_help_epilog() -> str: "HEALTH_WEIGHTS", "IDE_GOVERNANCE_PROTOCOL_VERSION", "ISSUES_URL", + "MEMORY_PROJECTION_VERSION", "METRICS_BASELINE_SCHEMA_VERSION", "PATCH_TRAIL_SCHEMA_VERSION", "REPORT_SCHEMA_VERSION", "REPOSITORY_URL", "SEMANTIC_INDEX_FORMAT_VERSION", + "SEMANTIC_PROJECTION_REVISION_VERSION", "TRAJECTORY_PROJECTION_VERSION", "TRAJECTORY_PROJECTION_VERSION_V1", "TRAJECTORY_QUALITY_SCORE_VERSION", diff --git a/codeclone/core/_types.py b/codeclone/core/_types.py index 6b7dd25a..5660085c 100644 --- a/codeclone/core/_types.py +++ b/codeclone/core/_types.py @@ -8,9 +8,10 @@ from argparse import Namespace from collections.abc import Mapping -from dataclasses import dataclass +from dataclasses import dataclass, field from hashlib import sha256 from pathlib import Path +from typing import TYPE_CHECKING import orjson @@ -43,6 +44,9 @@ ) from ..utils.coerce import as_int, as_mapping, as_str +if TYPE_CHECKING: + from ..analysis.phase_ledger import PhaseSnapshot + MAX_FILE_SIZE = 10 * 1024 * 1024 DEFAULT_BATCH_SIZE = 100 PARALLEL_MIN_FILES_PER_WORKER = 8 @@ -115,6 +119,11 @@ class FileProcessResult: error_kind: str | None = None file_metrics: FileMetrics | None = None structural_findings: list[StructuralFindingGroup] | None = None + phase_snapshot: PhaseSnapshot | None = field( + 
default=None, + compare=False, + repr=False, + ) @dataclass(frozen=True, slots=True) @@ -143,6 +152,11 @@ class ProcessingResult: structural_findings: tuple[StructuralFindingGroup, ...] = () function_relationship_facts: tuple[FunctionRelationshipFacts, ...] = () source_stats_by_file: tuple[tuple[str, int, int, int, int], ...] = () + phase_snapshot: PhaseSnapshot | None = field( + default=None, + compare=False, + repr=False, + ) @dataclass(frozen=True, slots=True) diff --git a/codeclone/core/parallelism.py b/codeclone/core/parallelism.py index b3bcbfe1..4e969d02 100644 --- a/codeclone/core/parallelism.py +++ b/codeclone/core/parallelism.py @@ -9,6 +9,7 @@ from collections.abc import Callable, Sequence from concurrent.futures import ProcessPoolExecutor, as_completed +from ..analysis.phase_ledger import PhaseLedger, PhaseSnapshot from ..cache.entries import SourceStatsDict from ..cache.store import Cache from ..models import ( @@ -161,8 +162,16 @@ def process( block_min_stmt = int(boot.args.block_min_stmt) segment_min_loc = int(boot.args.segment_min_loc) segment_min_stmt = int(boot.args.segment_min_stmt) + from codeclone.observability.runtime import is_observability_enabled + + collect_phases = is_observability_enabled() + batch_snapshot = PhaseSnapshot.empty() + + def _phase_ledger_for_file() -> PhaseLedger | None: + return PhaseLedger(active=True) if collect_phases else None def _accept_result(result: FileProcessResult) -> None: + nonlocal batch_snapshot nonlocal files_analyzed nonlocal files_skipped nonlocal analyzed_lines @@ -171,6 +180,8 @@ def _accept_result(result: FileProcessResult) -> None: nonlocal analyzed_classes if result.success and result.stat is not None: + if result.phase_snapshot is not None: + batch_snapshot = batch_snapshot.merge(result.phase_snapshot) source_stats_payload = SourceStatsDict( lines=result.lines, functions=result.functions, @@ -271,6 +282,7 @@ def _run_sequential(files: Sequence[str]) -> None: block_min_stmt=block_min_stmt, 
segment_min_loc=segment_min_loc, segment_min_stmt=segment_min_stmt, + phase_ledger=_phase_ledger_for_file(), ) ) if on_advance is not None: @@ -296,6 +308,7 @@ def _run_sequential(files: Sequence[str]) -> None: block_min_stmt=block_min_stmt, segment_min_loc=segment_min_loc, segment_min_stmt=segment_min_stmt, + phase_ledger=_phase_ledger_for_file(), ) for filepath in batch ] @@ -321,6 +334,9 @@ def _run_sequential(files: Sequence[str]) -> None: else: _run_sequential(files_to_process) + volumes = batch_snapshot.volume_map() + phase_snapshot = batch_snapshot if volumes.get("files_timed", 0) > 0 else None + return ProcessingResult( units=tuple(sorted(all_units, key=_group_item_sort_key)), blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)), @@ -388,4 +404,5 @@ def _run_sequential(files: Sequence[str]) -> None: (filepath, *stats) for filepath, stats in sorted(source_stats_by_file.items()) ), + phase_snapshot=phase_snapshot, ) diff --git a/codeclone/core/worker.py b/codeclone/core/worker.py index 4e45a0ec..39474be0 100644 --- a/codeclone/core/worker.py +++ b/codeclone/core/worker.py @@ -12,6 +12,11 @@ from functools import lru_cache from ..analysis.normalizer import NormalizationConfig +from ..analysis.phase_ledger import ( + INERT_PHASE_LEDGER, + AnalysisVolumeKey, + PhaseLedger, +) from ..analysis.units import extract_units_and_stats_from_source from ..cache.entries import FileStat from ..contracts import ( @@ -37,6 +42,7 @@ def process_file( block_min_stmt: int = DEFAULT_BLOCK_MIN_STMT, segment_min_loc: int = DEFAULT_SEGMENT_MIN_LOC, segment_min_stmt: int = DEFAULT_SEGMENT_MIN_STMT, + phase_ledger: PhaseLedger = INERT_PHASE_LEDGER, ) -> FileProcessResult: try: resolved = resolved_path_under_root(filepath, root) @@ -102,8 +108,13 @@ def process_file( collect_structural_findings=collect_structural_findings, collect_api_surface=collect_api_surface, api_include_private_modules=api_include_private_modules, + phase_ledger=phase_ledger, ) ) + phase_snapshot = None + 
if phase_ledger.active: + phase_ledger.add_volume(AnalysisVolumeKey.FILES_TIMED) + phase_snapshot = phase_ledger.snapshot() return FileProcessResult( filepath=filepath, success=True, @@ -117,6 +128,7 @@ def process_file( stat=stat, file_metrics=file_metrics, structural_findings=structural_findings, + phase_snapshot=phase_snapshot, ) except Exception as exc: # pragma: no cover - defensive shell around workers return FileProcessResult( @@ -141,6 +153,7 @@ def _invoke_process_file( block_min_stmt: int, segment_min_loc: int, segment_min_stmt: int, + phase_ledger: PhaseLedger | None = None, ) -> FileProcessResult: optional_kwargs: dict[str, object] = { "collect_structural_findings": collect_structural_findings, @@ -151,6 +164,8 @@ def _invoke_process_file( "segment_min_loc": segment_min_loc, "segment_min_stmt": segment_min_stmt, } + if phase_ledger is not None: + optional_kwargs["phase_ledger"] = phase_ledger process_callable: Callable[..., FileProcessResult] = process_file supported_names = _supported_process_file_kwarg_names(process_callable) if supported_names is None: diff --git a/codeclone/domain/findings.py b/codeclone/domain/findings.py index 0bdb392f..ebe1534c 100644 --- a/codeclone/domain/findings.py +++ b/codeclone/domain/findings.py @@ -48,7 +48,7 @@ DESIGN_KIND_INSTANCE_INDEPENDENT_METHOD: Final = "instance_independent_method" -# Classifications for instance-independent method occurrences (Phase 21). +# Classifications for instance-independent method occurrences. # Only ``candidate`` is a default-surfaced signal; the rest are context or # suppressed so default payloads avoid noisy contract methods. 
IIM_CLASSIFICATION_CANDIDATE: Final = "candidate" diff --git a/codeclone/findings/design/__init__.py b/codeclone/findings/design/__init__.py index ae0aa738..c118b75d 100644 --- a/codeclone/findings/design/__init__.py +++ b/codeclone/findings/design/__init__.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Report-only structural design signals (Phase 21). +"""Report-only structural design signals. Design signals are advisory review context. They never affect gates, health, baseline, fingerprints, patch verification acceptance, ``edit_allowed``, diff --git a/codeclone/findings/design/instance_methods.py b/codeclone/findings/design/instance_methods.py index f965d150..53fb8dd1 100644 --- a/codeclone/findings/design/instance_methods.py +++ b/codeclone/findings/design/instance_methods.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Instance-independent method detection (Phase 21, report-only design signal). +"""Instance-independent method detection for report-only design signals. A method is *instance-independent* when it declares ``self`` but its executable body never reads the instance receiver. This is a deterministic AST signal: diff --git a/codeclone/findings/structural/detectors.py b/codeclone/findings/structural/detectors.py index 0ebe1380..dd9b147b 100644 --- a/codeclone/findings/structural/detectors.py +++ b/codeclone/findings/structural/detectors.py @@ -6,7 +6,7 @@ """CodeClone — structural code quality analysis for Python. -Structural findings extraction layer (Phase 1: duplicated_branches). +Structural findings extraction layer for duplicated branch facts. This module is report-only: findings do not affect clone detection, fingerprints, baseline semantics, exit codes, or health scores. 
diff --git a/codeclone/memory/embedding/__init__.py b/codeclone/memory/embedding/__init__.py index 8e851236..fb9d75a7 100644 --- a/codeclone/memory/embedding/__init__.py +++ b/codeclone/memory/embedding/__init__.py @@ -153,7 +153,7 @@ def resolve_embedding_provider(config: SemanticConfig) -> EmbeddingProvider: "embedding_provider='fastembed' for community local semantic search" ) raise MemorySemanticUnavailableError( - "api embedding provider is not available yet (Phase 20.6); " + "api embedding provider is not available yet; " "use embedding_provider='diagnostic'" ) diff --git a/codeclone/memory/experience/store.py b/codeclone/memory/experience/store.py index d2132c60..3895ca9e 100644 --- a/codeclone/memory/experience/store.py +++ b/codeclone/memory/experience/store.py @@ -11,8 +11,10 @@ from __future__ import annotations import sqlite3 -from collections.abc import Sequence +from collections.abc import Callable, Sequence +from typing import TypeVar +from ...utils.iterutils import chunked from .models import ( Experience, ExperienceEvidence, @@ -21,6 +23,21 @@ ExperienceStatus, ) +_SQLITE_IN_QUERY_BATCH = 500 +_T = TypeVar("_T") +_FACETS_BATCH_SQL = ( + "SELECT experience_id, facet_kind, facet_value, count " + "FROM memory_experience_facets " + "WHERE experience_id IN ({placeholders}) " + "ORDER BY experience_id ASC, facet_kind ASC, facet_value ASC" +) +_EVIDENCE_BATCH_SQL = ( + "SELECT experience_id, trajectory_id, outcome, finished_at_utc " + "FROM memory_experience_evidence " + "WHERE experience_id IN ({placeholders}) " + "ORDER BY experience_id ASC, finished_at_utc ASC, trajectory_id ASC" +) + def _use_row_factory(conn: sqlite3.Connection) -> None: conn.row_factory = sqlite3.Row @@ -33,15 +50,133 @@ def replace_experiences( experiences: Sequence[Experience], ) -> int: """Replace all experiences for a project with the distilled set.""" - conn.execute("DELETE FROM memory_experiences WHERE project_id=?", (project_id,)) - for experience in experiences: - 
_insert_experience(conn, experience) + if not experiences: + conn.execute("DELETE FROM memory_experiences WHERE project_id=?", (project_id,)) + conn.commit() + return 0 + + new_by_digest = { + experience.experience_digest: experience for experience in experiences + } + stored_by_digest = _experiences_by_digest(conn, project_id=project_id) + existing_digests = set(stored_by_digest) + new_digests = set(new_by_digest) + + remove_digests = existing_digests - new_digests + refresh: list[Experience] = [] + for digest in sorted(new_digests): + incoming = new_by_digest[digest] + stored = stored_by_digest.get(digest) + if stored is None: + refresh.append(incoming) + continue + if _experience_content_key(stored) != _experience_content_key(incoming): + remove_digests.add(digest) + refresh.append(incoming) + + if not remove_digests and not refresh: + return len(experiences) + + for batch in chunked(tuple(sorted(remove_digests)), _SQLITE_IN_QUERY_BATCH): + placeholders = ", ".join("?" for _ in batch) + conn.execute( + f"DELETE FROM memory_experiences WHERE project_id=? 
" + f"AND experience_digest IN ({placeholders})", + (project_id, *batch), + ) + if refresh: + _batch_insert_experiences(conn, refresh) conn.commit() return len(experiences) -def _insert_experience(conn: sqlite3.Connection, experience: Experience) -> None: - conn.execute( +def _experiences_by_digest( + conn: sqlite3.Connection, + *, + project_id: str, +) -> dict[str, Experience]: + _use_row_factory(conn) + rows = conn.execute( + "SELECT * FROM memory_experiences WHERE project_id=?", + (project_id,), + ).fetchall() + if not rows: + return {} + return { + experience.experience_digest: experience + for experience in _hydrate_experience_rows(conn, rows) + } + + +def _group_rows_by_experience_id( + conn: sqlite3.Connection, + *, + ids: Sequence[str], + sql: str, + build: Callable[[sqlite3.Row], _T], +) -> dict[str, list[_T]]: + grouped: dict[str, list[_T]] = {experience_id: [] for experience_id in ids} + for batch in chunked(tuple(ids), _SQLITE_IN_QUERY_BATCH): + placeholders = ", ".join("?" for _ in batch) + rows = conn.execute(sql.format(placeholders=placeholders), batch).fetchall() + for row in rows: + grouped.setdefault(str(row["experience_id"]), []).append(build(row)) + return grouped + + +def _hydrate_experience_rows( + conn: sqlite3.Connection, + rows: Sequence[sqlite3.Row], +) -> list[Experience]: + experience_ids = [str(row["id"]) for row in rows] + facets_by_id = _group_rows_by_experience_id( + conn, + ids=experience_ids, + sql=_FACETS_BATCH_SQL, + build=_row_to_facet, + ) + evidence_by_id = _group_rows_by_experience_id( + conn, + ids=experience_ids, + sql=_EVIDENCE_BATCH_SQL, + build=_row_to_evidence, + ) + return [ + _row_to_experience( + row, + facets=tuple(facets_by_id.get(str(row["id"]), [])), + evidence=tuple(evidence_by_id.get(str(row["id"]), [])), + ) + for row in rows + ] + + +def _experience_content_key(experience: Experience) -> tuple[object, ...]: + """Comparable payload excluding distill timestamps refreshed every run.""" + return ( + 
experience.id, + experience.repo_root_digest, + experience.subject_family, + experience.signal, + experience.outcome_class, + experience.support, + experience.quality_min, + experience.information_value, + experience.status, + experience.statement, + experience.distillation_version, + experience.first_observed_at_utc, + experience.last_observed_at_utc, + experience.facets, + experience.evidence, + ) + + +def _batch_insert_experiences( + conn: sqlite3.Connection, + experiences: Sequence[Experience], +) -> None: + conn.executemany( """ INSERT INTO memory_experiences( id, project_id, repo_root_digest, subject_family, signal, @@ -51,42 +186,52 @@ def _insert_experience(conn: sqlite3.Connection, experience: Experience) -> None updated_at_utc ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, - ( - experience.id, - experience.project_id, - experience.repo_root_digest, - experience.subject_family, - experience.signal, - experience.outcome_class, - experience.support, - experience.quality_min, - experience.information_value, - experience.status, - experience.statement, - experience.experience_digest, - experience.distillation_version, - experience.first_observed_at_utc, - experience.last_observed_at_utc, - experience.distilled_at_utc, - experience.updated_at_utc, - ), - ) - conn.executemany( - "INSERT INTO memory_experience_facets(" - "experience_id, facet_kind, facet_value, count) VALUES (?, ?, ?, ?)", [ - (experience.id, facet.facet_kind, facet.facet_value, facet.count) - for facet in experience.facets - ], - ) - conn.executemany( - "INSERT INTO memory_experience_evidence(" - "experience_id, trajectory_id, outcome, finished_at_utc) VALUES (?, ?, ?, ?)", - [ - (experience.id, item.trajectory_id, item.outcome, item.finished_at_utc) - for item in experience.evidence + ( + experience.id, + experience.project_id, + experience.repo_root_digest, + experience.subject_family, + experience.signal, + experience.outcome_class, + experience.support, + 
experience.quality_min, + experience.information_value, + experience.status, + experience.statement, + experience.experience_digest, + experience.distillation_version, + experience.first_observed_at_utc, + experience.last_observed_at_utc, + experience.distilled_at_utc, + experience.updated_at_utc, + ) + for experience in experiences ], ) + facet_rows = [ + (experience.id, facet.facet_kind, facet.facet_value, facet.count) + for experience in experiences + for facet in experience.facets + ] + if facet_rows: + conn.executemany( + "INSERT INTO memory_experience_facets(" + "experience_id, facet_kind, facet_value, count) VALUES (?, ?, ?, ?)", + facet_rows, + ) + evidence_rows = [ + (experience.id, item.trajectory_id, item.outcome, item.finished_at_utc) + for experience in experiences + for item in experience.evidence + ] + if evidence_rows: + conn.executemany( + "INSERT INTO memory_experience_evidence(" + "experience_id, trajectory_id, outcome, finished_at_utc) " + "VALUES (?, ?, ?, ?)", + evidence_rows, + ) def count_experiences(conn: sqlite3.Connection, *, project_id: str) -> int: @@ -108,7 +253,7 @@ def list_experiences( "ORDER BY subject_family ASC, signal ASC, outcome_class ASC", (project_id,), ).fetchall() - return [_row_to_experience(conn, row) for row in rows] + return _hydrate_experience_rows(conn, rows) def list_experiences_for_subject_family( @@ -123,7 +268,7 @@ def list_experiences_for_subject_family( "ORDER BY signal ASC, outcome_class ASC", (project_id, subject_family), ).fetchall() - return [_row_to_experience(conn, row) for row in rows] + return _hydrate_experience_rows(conn, rows) def find_experience( @@ -136,45 +281,24 @@ def find_experience( "SELECT * FROM memory_experiences WHERE id=?", (experience_id,), ).fetchone() - return _row_to_experience(conn, row) if row is not None else None + if row is None: + return None + return _hydrate_experience_rows(conn, [row])[0] -def _facets_for_experience( - conn: sqlite3.Connection, - experience_id: str, -) -> 
tuple[ExperienceFacet, ...]: - rows = conn.execute( - "SELECT facet_kind, facet_value, count FROM memory_experience_facets " - "WHERE experience_id=? ORDER BY facet_kind ASC, facet_value ASC", - (experience_id,), - ).fetchall() - return tuple( - ExperienceFacet( - facet_kind=_facet_kind(str(row["facet_kind"])), - facet_value=str(row["facet_value"]), - count=int(row["count"]), - ) - for row in rows +def _row_to_facet(row: sqlite3.Row) -> ExperienceFacet: + return ExperienceFacet( + facet_kind=_facet_kind(str(row["facet_kind"])), + facet_value=str(row["facet_value"]), + count=int(row["count"]), ) -def _evidence_for_experience( - conn: sqlite3.Connection, - experience_id: str, -) -> tuple[ExperienceEvidence, ...]: - rows = conn.execute( - "SELECT trajectory_id, outcome, finished_at_utc " - "FROM memory_experience_evidence WHERE experience_id=? " - "ORDER BY finished_at_utc ASC, trajectory_id ASC", - (experience_id,), - ).fetchall() - return tuple( - ExperienceEvidence( - trajectory_id=str(row["trajectory_id"]), - outcome=str(row["outcome"]), - finished_at_utc=str(row["finished_at_utc"]), - ) - for row in rows +def _row_to_evidence(row: sqlite3.Row) -> ExperienceEvidence: + return ExperienceEvidence( + trajectory_id=str(row["trajectory_id"]), + outcome=str(row["outcome"]), + finished_at_utc=str(row["finished_at_utc"]), ) @@ -185,7 +309,12 @@ def _facet_kind(value: str) -> ExperienceFacetKind: raise ValueError(msg) -def _row_to_experience(conn: sqlite3.Connection, row: sqlite3.Row) -> Experience: +def _row_to_experience( + row: sqlite3.Row, + *, + facets: tuple[ExperienceFacet, ...] | None = None, + evidence: tuple[ExperienceEvidence, ...] 
| None = None, +) -> Experience: experience_id = str(row["id"]) return Experience( id=experience_id, @@ -205,8 +334,8 @@ def _row_to_experience(conn: sqlite3.Connection, row: sqlite3.Row) -> Experience last_observed_at_utc=str(row["last_observed_at_utc"]), distilled_at_utc=str(row["distilled_at_utc"]), updated_at_utc=str(row["updated_at_utc"]), - facets=_facets_for_experience(conn, experience_id), - evidence=_evidence_for_experience(conn, experience_id), + facets=facets if facets is not None else (), + evidence=evidence if evidence is not None else (), ) diff --git a/codeclone/memory/ide_governance.py b/codeclone/memory/ide_governance.py index ab21287d..1f21a432 100644 --- a/codeclone/memory/ide_governance.py +++ b/codeclone/memory/ide_governance.py @@ -24,7 +24,7 @@ IDE_GOVERNANCE_TICKET_TTL_SECONDS = 120 IDE_GOVERNANCE_MIN_KEY_BYTES = 32 IDE_GOVERNANCE_MAX_COMMIT_ATTEMPTS = 100 -IDE_GOVERNANCE_ALLOWED_CLIENTS = frozenset({"CodeClone VS Code"}) +IDE_GOVERNANCE_ALLOWED_CLIENTS = frozenset({"CodeClone VS Code", "CodeClone JetBrains"}) GovernanceDecision = Literal["approve", "reject", "archive"] GovernanceAction = Literal[ @@ -34,11 +34,12 @@ ] GOVERNANCE_MODE_UNAVAILABLE_MESSAGE = ( - "This action is only available through the CodeClone VS Code IDE governance " - "channel." + "This action is only available through a CodeClone IDE governance channel " + "(VS Code or JetBrains plugin)." ) GOVERNANCE_MODE_UNAVAILABLE_NEXT_STEP = ( - "Use the Memory view in the CodeClone extension to approve or reject draft records." + "Use the CodeClone Memory view in VS Code or the JetBrains plugin Memory tab " + "to approve or reject draft records." ) @@ -267,7 +268,7 @@ def _require_governance_channel( reason="governance_key_missing", message=( "IDE governance channel is active but no session key is registered. " - "Reconnect the CodeClone VS Code extension." + "Reconnect the CodeClone IDE plugin (VS Code or JetBrains)." 
), ) if ( diff --git a/codeclone/memory/retrieval/__init__.py b/codeclone/memory/retrieval/__init__.py index c6b7e953..040bc610 100644 --- a/codeclone/memory/retrieval/__init__.py +++ b/codeclone/memory/retrieval/__init__.py @@ -6,6 +6,7 @@ from .service import ( QUERY_MODES, + get_memory_projection_page, get_relevant_memory, path_has_memory, query_engineering_memory, @@ -14,6 +15,7 @@ __all__ = [ "QUERY_MODES", + "get_memory_projection_page", "get_relevant_memory", "path_has_memory", "query_engineering_memory", diff --git a/codeclone/memory/retrieval/continuation.py b/codeclone/memory/retrieval/continuation.py new file mode 100644 index 00000000..c1ade62b --- /dev/null +++ b/codeclone/memory/retrieval/continuation.py @@ -0,0 +1,318 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Digest-bound continuation cursors for memory retrieval lanes.""" + +from __future__ import annotations + +import base64 +import hashlib +from collections.abc import Mapping, Sequence +from typing import Final + +import orjson + +from ..exceptions import MemoryContractError + +MEMORY_CONTINUATION_CURSOR_VERSION: Final = "2" +MEMORY_CONTINUATION_CURSOR_VERSION_LEGACY: Final = "1" +MEMORY_CONTINUATION_PROJECTION_KIND: Final = "memory_retrieval_lane_projection_v1" +MEMORY_CONTINUATION_ORDERING_VERSION: Final = "memory_retrieval_lane_order_v1" +DEFAULT_MEMORY_CONTINUATION_PAGE_SIZE: Final = 20 +MAX_MEMORY_CONTINUATION_PAGE_SIZE: Final = 50 +MEMORY_CONTINUATION_LANES: Final[frozenset[str]] = frozenset( + {"records", "trajectories", "experiences"} +) + + +def memory_lane_item_ids( + lane: str, + items: Sequence[Mapping[str, object]], +) -> list[str]: + """Return the stable item identities for a memory retrieval lane.""" + + if lane == "records": + key = "id" + 
elif lane == "trajectories": + key = "trajectory_id" + elif lane == "experiences": + key = "id" + else: + raise MemoryContractError(f"unknown memory continuation lane: {lane}") + ids: list[str] = [] + for item in items: + value = item.get(key) + if not isinstance(value, str) or not value: + raise MemoryContractError(f"memory continuation lane {lane} lacks {key}") + ids.append(value) + return ids + + +def memory_lane_identity_digest( + lane: str, + items: Sequence[Mapping[str, object]], +) -> dict[str, str]: + """Digest the exact ordered identities of a memory retrieval lane.""" + + return _digest( + { + "projection_kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "ordering_version": MEMORY_CONTINUATION_ORDERING_VERSION, + "lane": lane, + "ids": memory_lane_item_ids(lane, items), + } + ) + + +def memory_projection_request_digest( + request: Mapping[str, object], +) -> dict[str, str]: + """Return the canonical digest for a memory retrieval projection request.""" + + return _digest({"request": dict(request)}) + + +def build_memory_continuation_cursor( + *, + project_id: str, + lane: str, + request: Mapping[str, object], + items: Sequence[Mapping[str, object]], + offset: int, +) -> dict[str, object]: + """Build a deterministic cursor envelope for the next page of *lane*.""" + + if lane not in MEMORY_CONTINUATION_LANES: + raise MemoryContractError(f"unknown memory continuation lane: {lane}") + total = len(items) + if offset < 0 or offset > total: + raise MemoryContractError("memory continuation offset is out of bounds") + payload: dict[str, object] = { + "cursor_version": MEMORY_CONTINUATION_CURSOR_VERSION, + "projection_kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "ordering_version": MEMORY_CONTINUATION_ORDERING_VERSION, + "project_id": project_id, + "lane": lane, + "offset": offset, + "total": total, + "request_digest": memory_projection_request_digest(request), + "lane_identity_digest": memory_lane_identity_digest(lane, items), + } + payload["cursor_digest"] = 
_digest(payload) + return { + "cursor": _encode_cursor(payload), + "cursor_digest": payload["cursor_digest"], + "projection_kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "ordering_version": MEMORY_CONTINUATION_ORDERING_VERSION, + "offset": offset, + "total": total, + } + + +def rebase_memory_continuation_cursor( + cursor: str, + *, + offset: int, +) -> dict[str, object]: + """Return the same digest-bound cursor envelope at a smaller shown offset.""" + + payload = decode_memory_continuation_cursor(cursor) + total = payload.get("total") + if not isinstance(total, int): + raise MemoryContractError("memory continuation total is invalid") + if offset < 0 or offset > total: + raise MemoryContractError("memory continuation offset is out of bounds") + payload = dict(payload) + payload["offset"] = offset + payload.pop("cursor_digest", None) + payload["cursor_digest"] = _digest(payload) + return { + "cursor": _encode_cursor(payload), + "cursor_digest": payload["cursor_digest"], + "projection_kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "ordering_version": MEMORY_CONTINUATION_ORDERING_VERSION, + "offset": offset, + "total": total, + } + + +def decode_memory_continuation_cursor(cursor: str) -> dict[str, object]: + """Decode and validate a memory continuation cursor.""" + + if not cursor.strip(): + raise MemoryContractError("memory continuation cursor is required") + try: + raw = base64.urlsafe_b64decode(_padded_base64(cursor.strip())) + payload = orjson.loads(raw) + except (ValueError, orjson.JSONDecodeError) as exc: + raise MemoryContractError("memory continuation cursor is invalid") from exc + if not isinstance(payload, dict): + raise MemoryContractError("memory continuation cursor payload is invalid") + expected_digest = payload.get("cursor_digest") + if not isinstance(expected_digest, dict): + raise MemoryContractError("memory continuation cursor digest is missing") + without_digest = dict(payload) + without_digest.pop("cursor_digest", None) + if _digest(without_digest) 
!= expected_digest: + raise MemoryContractError("memory continuation cursor digest mismatch") + if payload.get("cursor_version") not in { + MEMORY_CONTINUATION_CURSOR_VERSION, + MEMORY_CONTINUATION_CURSOR_VERSION_LEGACY, + }: + raise MemoryContractError("memory continuation cursor version is unsupported") + if payload.get("projection_kind") != MEMORY_CONTINUATION_PROJECTION_KIND: + raise MemoryContractError("memory continuation projection kind is unsupported") + if payload.get("ordering_version") != MEMORY_CONTINUATION_ORDERING_VERSION: + raise MemoryContractError("memory continuation ordering version is unsupported") + lane = payload.get("lane") + if not isinstance(lane, str) or lane not in MEMORY_CONTINUATION_LANES: + raise MemoryContractError("memory continuation lane is invalid") + _validate_cursor_request_binding(payload) + return payload + + +def resolve_memory_continuation_request( + cursor_payload: Mapping[str, object], + *, + resolve_request: object | None = None, +) -> dict[str, object] | None: + """Resolve the full projection request from a decoded cursor payload.""" + + version = str(cursor_payload.get("cursor_version", "")) + if version == MEMORY_CONTINUATION_CURSOR_VERSION_LEGACY: + request = cursor_payload.get("request") + return dict(request) if isinstance(request, Mapping) else None + request_digest = cursor_payload.get("request_digest") + if not isinstance(request_digest, Mapping): + return None + digest_value = request_digest.get("value") + if not isinstance(digest_value, str) or not digest_value.strip(): + return None + if not callable(resolve_request): + return None + resolved = resolve_request(digest_value) + return dict(resolved) if isinstance(resolved, Mapping) else None + + +def _validate_cursor_request_binding(payload: Mapping[str, object]) -> None: + version = str(payload.get("cursor_version", "")) + if version == MEMORY_CONTINUATION_CURSOR_VERSION_LEGACY: + request = payload.get("request") + if not isinstance(request, dict): + raise 
MemoryContractError("memory continuation request is invalid") + return + if version == MEMORY_CONTINUATION_CURSOR_VERSION: + request_digest = payload.get("request_digest") + if not isinstance(request_digest, Mapping): + raise MemoryContractError("memory continuation request_digest is invalid") + if not isinstance(request_digest.get("value"), str): + raise MemoryContractError("memory continuation request_digest is invalid") + if payload.get("request") is not None: + raise MemoryContractError( + "memory continuation cursor must not embed request" + ) + return + raise MemoryContractError("memory continuation cursor version is unsupported") + + +def memory_continuation_page( + *, + cursor_payload: Mapping[str, object], + items: Sequence[Mapping[str, object]], + page_size: int, + request: Mapping[str, object], +) -> dict[str, object]: + """Return an exact continuation page or a fail-closed mismatch payload.""" + + lane = str(cursor_payload["lane"]) + raw_offset = cursor_payload.get("offset") + if not isinstance(raw_offset, int): + raise MemoryContractError("memory continuation offset is invalid") + offset = raw_offset + expected = cursor_payload.get("lane_identity_digest") + actual = memory_lane_identity_digest(lane, items) + if actual != expected: + return { + "status": "snapshot_mismatch", + "reason": "memory_projection_changed", + "lane": lane, + "expected_lane_identity_digest": expected, + "actual_lane_identity_digest": actual, + } + bounded_size = bounded_memory_continuation_page_size(page_size) + total = len(items) + page_items = list(items[offset : offset + bounded_size]) + next_offset = offset + len(page_items) + payload: dict[str, object] = { + "status": "ok", + "projection_kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "ordering_version": MEMORY_CONTINUATION_ORDERING_VERSION, + "lane": lane, + "offset": offset, + "page_size": bounded_size, + "returned": len(page_items), + "total": total, + "response_complete": next_offset >= total, + "items": page_items, + 
"lane_identity_digest": actual, + } + if next_offset < total: + payload["next"] = build_memory_continuation_cursor( + project_id=str(cursor_payload["project_id"]), + lane=lane, + request=request, + items=items, + offset=next_offset, + ) + return payload + + +def bounded_memory_continuation_page_size(value: int) -> int: + """Normalize continuation page size without allowing unbounded pages.""" + + if value < 1: + raise MemoryContractError("memory continuation page_size must be >= 1") + return min(value, MAX_MEMORY_CONTINUATION_PAGE_SIZE) + + +def _encode_cursor(payload: Mapping[str, object]) -> str: + raw = orjson.dumps(payload, option=orjson.OPT_SORT_KEYS) + return base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=") + + +def _padded_base64(value: str) -> bytes: + padding = "=" * (-len(value) % 4) + return f"{value}{padding}".encode("ascii") + + +def _digest(payload: object) -> dict[str, str]: + raw = orjson.dumps(payload, option=orjson.OPT_SORT_KEYS) + return { + "kind": MEMORY_CONTINUATION_PROJECTION_KIND, + "algorithm": "sha256", + "digest_version": MEMORY_CONTINUATION_CURSOR_VERSION, + "value": hashlib.sha256(raw).hexdigest(), + } + + +__all__ = [ + "DEFAULT_MEMORY_CONTINUATION_PAGE_SIZE", + "MAX_MEMORY_CONTINUATION_PAGE_SIZE", + "MEMORY_CONTINUATION_CURSOR_VERSION", + "MEMORY_CONTINUATION_CURSOR_VERSION_LEGACY", + "MEMORY_CONTINUATION_LANES", + "MEMORY_CONTINUATION_ORDERING_VERSION", + "MEMORY_CONTINUATION_PROJECTION_KIND", + "bounded_memory_continuation_page_size", + "build_memory_continuation_cursor", + "decode_memory_continuation_cursor", + "memory_continuation_page", + "memory_lane_identity_digest", + "memory_lane_item_ids", + "memory_projection_request_digest", + "rebase_memory_continuation_cursor", + "resolve_memory_continuation_request", +] diff --git a/codeclone/memory/retrieval/service.py b/codeclone/memory/retrieval/service.py index 3663e4d5..b42b7d58 100644 --- a/codeclone/memory/retrieval/service.py +++ b/codeclone/memory/retrieval/service.py 
@@ -50,6 +50,13 @@ trajectory_subject_keys, ) from .context_coverage import build_context_coverage +from .continuation import ( + DEFAULT_MEMORY_CONTINUATION_PAGE_SIZE, + build_memory_continuation_cursor, + decode_memory_continuation_cursor, + memory_continuation_page, + resolve_memory_continuation_request, +) from .ranking import ( RankingContext, reciprocal_rank_fusion, @@ -77,6 +84,7 @@ "trajectory_status", "trajectory_search", "trajectory_get", + "experience_get", "trajectory_anomalies", "trajectory_agents", "trajectory_dashboard", @@ -94,6 +102,7 @@ "trajectory_status", "trajectory_search", "trajectory_get", + "experience_get", "trajectory_anomalies", "trajectory_agents", "trajectory_dashboard", @@ -229,6 +238,7 @@ def _retrieval_policy(*, include_drafts: bool) -> dict[str, object]: DEFAULT_EXPERIENCE_PREVIEW_LIMIT = 10 +MEMORY_RETRIEVAL_PROJECTION_CANDIDATE_LIMIT = 5000 COMPACT_MEMORY_SUBJECT_LIMIT = 6 _MEMORY_SUBJECT_KIND_ORDER = { @@ -540,17 +550,16 @@ def _apply_conflict_penalty( return score -def _rank_records( +def _rank_record_summaries( store: SqliteEngineeringMemoryStore, *, project_id: str, candidates: Sequence[MemoryRecord], context: RankingContext, - max_records: int, detail_level: MemoryDetailLevel, lexical_ranks: Mapping[str, int] | None = None, vector_ranks: Mapping[str, int] | None = None, -) -> tuple[list[dict[str, object]], bool]: +) -> list[dict[str, object]]: # Fusion mode (hybrid search) supplies the lexical (BM25) and/or vector # rankings. 
def _rank_records(
    store: SqliteEngineeringMemoryStore,
    *,
    project_id: str,
    candidates: Sequence[MemoryRecord],
    context: RankingContext,
    max_records: int,
    detail_level: MemoryDetailLevel,
    lexical_ranks: Mapping[str, int] | None = None,
    vector_ranks: Mapping[str, int] | None = None,
) -> tuple[list[dict[str, object]], bool]:
    """Rank ``candidates`` and cut the ranking down to ``max_records``.

    Returns the leading ``max_records`` summaries produced by
    ``_rank_record_summaries`` together with a flag that is true when the
    full ranking held more than ``max_records`` entries.
    """
    summaries = _rank_record_summaries(
        store,
        project_id=project_id,
        candidates=candidates,
        context=context,
        detail_level=detail_level,
        lexical_ranks=lexical_ranks,
        vector_ranks=vector_ranks,
    )
    page = summaries[:max_records]
    was_truncated = len(summaries) > max_records
    return page, was_truncated
trajectories_truncated = rank_trajectories_for_scope( + ranked_trajectories, _all_trajectories_truncated = rank_trajectories_for_scope( trajectory_candidates, scope_paths=normalized_scope, symbols=tuple(normalized_symbols), - max_results=min(max_records, DEFAULT_TRAJECTORY_PREVIEW_LIMIT), + max_results=max(1, len(trajectory_candidates)), include_routine=include_routine, patch_trails=patch_trails, detail_level=normalized_detail, ) + trajectory_limit = min(max_records, DEFAULT_TRAJECTORY_PREVIEW_LIMIT) + trajectories_payload = ranked_trajectories[:trajectory_limit] + trajectories_truncated = len(ranked_trajectories) > len(trajectories_payload) matching_experiences = _matching_experiences( store, project_id=project_id, families=_scope_families(normalized_scope), ) - experiences_payload = _serialize_relevant_experiences( + ranked_experiences = _serialize_relevant_experiences( matching_experiences, - max_results=min(max_records, DEFAULT_EXPERIENCE_PREVIEW_LIMIT), + max_results=len(matching_experiences), detail_level=normalized_detail, ) + experience_limit = min(max_records, DEFAULT_EXPERIENCE_PREVIEW_LIMIT) + experiences_payload = ranked_experiences[:experience_limit] coverage: dict[str, object] if normalized_scope: coverage = build_context_coverage( @@ -755,9 +793,228 @@ def get_relevant_memory( "detail_level": normalized_detail, "retrieval_policy": _retrieval_policy(include_drafts=effective_include_drafts), } + projection_request = _memory_projection_request( + scope_paths=normalized_scope, + symbols=tuple(sorted(normalized_symbols)), + blast_dependents=tuple(sorted(normalized_blast)), + scope_resolved_from=scope_resolved_from, + include_stale=include_stale, + include_drafts=effective_include_drafts, + include_routine=include_routine, + detail_level=normalized_detail, + ) + continuation = _memory_retrieval_continuation( + project_id=project_id, + request=projection_request, + lanes={ + "records": ranked_records, + "trajectories": ranked_trajectories, + "experiences": 
def get_memory_projection_page(
    store: SqliteEngineeringMemoryStore,
    *,
    project_id: str,
    cursor: str,
    page_size: int = DEFAULT_MEMORY_CONTINUATION_PAGE_SIZE,
    resolve_request: object | None = None,
) -> dict[str, object]:
    """Return a digest-bound continuation page for a memory retrieval lane.

    The cursor is decoded, checked against ``project_id`` and resolved back to
    its projection request; the lanes are then recomputed from the store and
    the lane named by the cursor is handed to ``memory_continuation_page``.

    Returns:
        The page payload, or a ``snapshot_mismatch`` payload when the cursor
        was issued for another project or its request cannot be resolved.

    Raises:
        MemoryContractError: if the cursor names a lane this projection does
            not produce.
    """
    cursor_payload = decode_memory_continuation_cursor(cursor)
    if cursor_payload.get("project_id") != project_id:
        return {
            "status": "snapshot_mismatch",
            "reason": "project_identity_mismatch",
            "expected_project_id": cursor_payload.get("project_id"),
            "actual_project_id": project_id,
        }
    request = resolve_memory_continuation_request(
        cursor_payload,
        resolve_request=resolve_request,
    )
    if request is None:
        return {
            "status": "snapshot_mismatch",
            "reason": "memory_continuation_request_unavailable",
            "lane": cursor_payload.get("lane"),
            "request_digest": cursor_payload.get("request_digest"),
        }
    lanes = _memory_projection_lanes(
        store,
        project_id=project_id,
        request=request,
    )
    # A missing or unknown lane is a contract violation of the cursor, not an
    # internal lookup failure, so report it with the module's contract error
    # instead of letting a bare KeyError escape.
    lane = str(cursor_payload.get("lane"))
    items = lanes.get(lane)
    if items is None:
        raise MemoryContractError(f"memory continuation lane {lane!r} is invalid")
    return memory_continuation_page(
        cursor_payload=cursor_payload,
        items=items,
        page_size=page_size,
        request=request,
    )


def _memory_projection_lanes(
    store: SqliteEngineeringMemoryStore,
    *,
    project_id: str,
    request: Mapping[str, object],
) -> dict[str, list[dict[str, object]]]:
    """Recompute the full ranked lanes for a stored projection request.

    Returns the untruncated ``records``, ``trajectories`` and ``experiences``
    lanes. Continuation cursors are digest-bound, so each lane has to be
    produced the same way the first page produced it.

    Raises:
        MemoryContractError: if a list field of ``request`` is malformed.
    """
    normalized_scope = tuple(_string_list(request, "scope_paths"))
    normalized_symbols = tuple(_string_list(request, "symbols"))
    normalized_blast = tuple(_string_list(request, "blast_dependents"))
    normalized_detail = _normalize_detail_level(str(request.get("detail_level", "")))
    include_stale = bool(request.get("include_stale"))
    include_drafts = bool(request.get("include_drafts"))
    include_routine = bool(request.get("include_routine"))
    context = RankingContext.from_scope(
        scope_paths=normalized_scope,
        symbols=normalized_symbols,
        blast_dependents=normalized_blast,
    )
    statuses = _default_statuses(
        include_stale=include_stale,
        include_drafts=include_drafts,
    )
    # NOTE(review): the first page's record query is not visible from here;
    # confirm it uses the same candidate limit, or the records lane can drift.
    records = store.query_records(
        MemoryQuery(
            project_id=project_id,
            statuses=statuses,
            limit=MEMORY_RETRIEVAL_PROJECTION_CANDIDATE_LIMIT,
        )
    )
    visible = [
        record
        for record in records
        if _record_visible(
            record,
            include_stale=include_stale,
            include_drafts=include_drafts,
        )
    ]
    ranked_records = _rank_record_summaries(
        store,
        project_id=project_id,
        candidates=visible,
        context=context,
        detail_level=normalized_detail,
    )
    trajectory_candidates = store.list_trajectories_for_subjects(
        project_id=project_id,
        subjects=trajectory_subject_keys(
            scope_paths=normalized_scope,
            symbols=normalized_symbols,
        ),
        limit=MEMORY_RETRIEVAL_PROJECTION_CANDIDATE_LIMIT,
    )
    patch_trails = _load_patch_trails_for_trajectories(
        store,
        trajectory_ids=tuple(item.id for item in trajectory_candidates),
    )
    ranked_trajectories, _truncated = rank_trajectories_for_scope(
        trajectory_candidates,
        scope_paths=normalized_scope,
        symbols=normalized_symbols,
        max_results=max(1, len(trajectory_candidates)),
        include_routine=include_routine,
        patch_trails=patch_trails,
        detail_level=normalized_detail,
    )
    matching_experiences = _matching_experiences(
        store,
        project_id=project_id,
        families=_scope_families(normalized_scope),
    )
    # get_relevant_memory serializes every matching experience
    # (max_results=len(matching_experiences)); use the same bound here so the
    # recomputed lane cannot be shorter than the one the cursor was cut from.
    ranked_experiences = _serialize_relevant_experiences(
        matching_experiences,
        max_results=len(matching_experiences),
        detail_level=normalized_detail,
    )
    return {
        "records": ranked_records,
        "trajectories": ranked_trajectories,
        "experiences": ranked_experiences,
    }


def _memory_projection_request(
    *,
    scope_paths: Sequence[str],
    symbols: Sequence[str],
    blast_dependents: Sequence[str],
    scope_resolved_from: str,
    include_stale: bool,
    include_drafts: bool,
    include_routine: bool,
    detail_level: MemoryDetailLevel,
) -> dict[str, object]:
    """Bundle the retrieval inputs into a plain dict request.

    The sequence arguments are copied into lists, which is the shape
    ``_string_list`` accepts when the request is read back.
    """
    return {
        "scope_paths": list(scope_paths),
        "symbols": list(symbols),
        "blast_dependents": list(blast_dependents),
        "scope_resolved_from": scope_resolved_from,
        "include_stale": include_stale,
        "include_drafts": include_drafts,
        "include_routine": include_routine,
        "detail_level": detail_level,
    }


def _memory_retrieval_continuation(
    *,
    project_id: str,
    request: Mapping[str, object],
    lanes: Mapping[str, Sequence[dict[str, object]]],
    shown: Mapping[str, int],
) -> dict[str, object]:
    """Describe how to continue every lane that has items left to show.

    A lane is included only when it holds more items than ``shown`` reports
    for it; its cursor starts at the first omitted item. Returns ``{}`` when
    no lane has omitted items.
    """
    lane_payloads: dict[str, object] = {}
    # Lanes are visited in name order so the payload is built deterministically.
    for lane, items in sorted(lanes.items()):
        shown_count = shown[lane]
        total = len(items)
        omitted = max(0, total - shown_count)
        if omitted == 0:
            continue
        lane_payloads[lane] = {
            "status": "available",
            "total": total,
            "shown": shown_count,
            "omitted": omitted,
            "page": build_memory_continuation_cursor(
                project_id=project_id,
                lane=lane,
                request=request,
                items=items,
                offset=shown_count,
            ),
        }
    if not lane_payloads:
        return {}
    # NOTE(review): these identifiers look like they mirror the continuation
    # module's projection-kind / ordering-version constants — confirm they match.
    return {
        "projection_kind": "memory_retrieval_lane_projection_v1",
        "ordering_version": "memory_retrieval_lane_order_v1",
        "cursor_policy": "digest_bound_recompute_or_fail_closed",
        "lanes": lane_payloads,
    }


def _string_list(payload: Mapping[str, object], key: str) -> list[str]:
    """Return ``payload[key]`` as a new list of strings.

    Raises:
        MemoryContractError: if the value is missing, is not a list, or holds
            a non-string item.
    """
    value = payload.get(key)
    if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
        raise MemoryContractError(f"memory continuation request {key} is invalid")
    return list(value)
def _handle_experience_get_mode(
    store: SqliteEngineeringMemoryStore,
    *,
    mode: str,
    project_id: str,
    record_id: str | None,
) -> dict[str, object]:
    """Look up one experience by id and return it at full detail.

    ``record_id`` carries the experience id. The response has status
    ``not_found`` when the store has no such experience or when the experience
    belongs to a different project.
    """
    experience_id = _require_query_field(
        record_id,
        mode=mode,
        field="record_id containing experience_id",
    )
    found = store.find_experience(experience_id)
    if found is None or found.project_id != project_id:
        missing: dict[str, object] = {
            "mode": mode,
            "status": "not_found",
            "payload": {"experience_id": experience_id},
        }
        return missing
    serialized = _serialize_experience(
        found,
        detail_level="full",
    )
    return {
        "mode": mode,
        "status": "ok",
        "detail_level": "full",
        "payload": {"experience": serialized},
    }
) + resolved_scope = scope + if mode == "coverage" and not resolved_scope and path is not None: + resolved_scope = (path,) + normalized_detail = _normalize_detail_level(detail_level) effective_include_drafts = include_drafts or mode in {"for_path", "for_symbol"} @@ -1681,7 +1981,7 @@ def query_engineering_memory( store, mode=mode, project_id=project_id, - scope=scope, + scope=resolved_scope, ) if mode == "trajectory_status": return _handle_trajectory_status_mode( @@ -1695,6 +1995,14 @@ def query_engineering_memory( mode=mode, project_id=project_id, record_id=record_id, + detail_level=normalized_detail, + ) + if mode == "experience_get": + return _handle_experience_get_mode( + store, + mode=mode, + project_id=project_id, + record_id=record_id, ) filter_types, filter_statuses, filter_confidences, match_mode, include_routine = ( @@ -1807,6 +2115,7 @@ def query_engineering_memory( "QUERY_MODES", "MemoryDetailLevel", "QueryMode", + "get_memory_projection_page", "get_relevant_memory", "normalize_repo_path", "path_has_memory", diff --git a/codeclone/memory/semantic/__init__.py b/codeclone/memory/semantic/__init__.py index a01eb8f8..7ac1ce08 100644 --- a/codeclone/memory/semantic/__init__.py +++ b/codeclone/memory/semantic/__init__.py @@ -18,6 +18,7 @@ trajectory_chunk_row_id, ) from .models import ( + ExistingSourceRevision, SemanticHit, SemanticIndexStatus, SemanticProjection, @@ -85,8 +86,15 @@ def delete(self, ids: Sequence[str]) -> None: ... def known_ids(self) -> set[str]: ... def row_fingerprints(self, ids: Sequence[str]) -> dict[str, SemanticRowFingerprint]: - """Stored (text_hash, embedding_model) for the given ids, vectors not - loaded. Missing ids are omitted; empty ``ids`` returns ``{}``.""" + """Stored (text_hash, embedding_model, source_revision) for the given ids, + vectors not loaded. Missing ids are omitted; empty ``ids`` returns ``{}``.""" + ... 
+ + def existing_revisions(self) -> dict[str, ExistingSourceRevision]: + """Every stored row grouped by source id -> (lane, source_revision, + row_ids), from one metadata scan. The rebuild diffs this against each + source's cheap ``scan`` to project only changed sources; vectors are + never loaded.""" ... @@ -140,10 +148,11 @@ def close_semantic_index(index: object | None) -> None: def resolve_semantic_index(config: SemanticConfig) -> SemanticIndex: """Resolve the semantic index for the given config. - Null when disabled; otherwise the backend. The LanceDB backend is wired in - Phase 20.2 via a lazy import inside this function (so absence never crashes - the import of the memory package). Until then an enabled index degrades to - Unavailable — read paths stay empty and explicit commands fail clear. + Null when disabled; otherwise the backend. The LanceDB backend is loaded + lazily inside this function, so absence never crashes the import of the + memory package. When the optional backend is unavailable, an enabled index + degrades to Unavailable — read paths stay empty and explicit commands fail + clear. """ if not config.enabled: return NullSemanticIndex() @@ -188,6 +197,7 @@ def _resolve_backend( "INDEXED_MEMORY_TYPES", "SEMANTIC_CHUNK_STRATEGY_VERSION", "AuditIndexSource", + "ExistingSourceRevision", "IndexSource", "MemoryIndexSource", "NullSemanticIndex", diff --git a/codeclone/memory/semantic/chunking.py b/codeclone/memory/semantic/chunking.py index 02c68fc1..fe4b204c 100644 --- a/codeclone/memory/semantic/chunking.py +++ b/codeclone/memory/semantic/chunking.py @@ -32,6 +32,9 @@ class IndexedSemanticUnit: status: str | None text: str text_hash: str + # Per-source revision (Stage 2); identical across every chunk of one source, + # so the row's stored revision groups unambiguously back to its source. 
+ source_revision: str = "" @runtime_checkable @@ -84,6 +87,7 @@ def expand_projection( status=projection.status, text=chunk, text_hash=text_hash(chunk), + source_revision=projection.source_revision, ) for index, chunk in enumerate(chunks) ) @@ -126,6 +130,7 @@ def _single_unit( status=projection.status, text=text, text_hash=text_hash(text), + source_revision=projection.source_revision, ) diff --git a/codeclone/memory/semantic/lancedb_backend.py b/codeclone/memory/semantic/lancedb_backend.py index ec382e4c..ced160a2 100644 --- a/codeclone/memory/semantic/lancedb_backend.py +++ b/codeclone/memory/semantic/lancedb_backend.py @@ -7,6 +7,7 @@ from __future__ import annotations import importlib +from collections import defaultdict from collections.abc import Sequence from pathlib import Path from types import ModuleType @@ -14,6 +15,7 @@ from ...utils.iterutils import chunked from .models import ( + ExistingSourceRevision, SemanticHit, SemanticIndexStatus, SemanticRow, @@ -102,6 +104,7 @@ def _schema(pa: ModuleType, dimension: int) -> object: pa.field("status", pa.string()), pa.field("text_hash", pa.string()), pa.field("embedding_model", pa.string()), + pa.field("source_revision", pa.string()), pa.field("vector", pa.list_(pa.float32(), dimension)), ] ) @@ -113,6 +116,9 @@ def _schema_matches(table: _LanceTable, *, dimension: int) -> bool: parent_field = table.schema.field("parent_id") chunk_index_field = table.schema.field("chunk_index") chunk_count_field = table.schema.field("chunk_count") + # source_revision (format v3): a pre-Stage-2 table lacks it, so the + # mismatch drives the deliberate one-time drop + full rebuild. 
def existing_revisions(self) -> dict[str, ExistingSourceRevision]:
    """Group every stored row by its source object, from one metadata scan.

    Only the id, parent id, lane, revision and embedding-model columns are
    selected, so vectors are never loaded. A row belongs to the source named
    by its ``parent_id`` when that is set, otherwise to its own ``id``.

    Rows of one source are expected to share a revision and an embedding
    model. When they disagree the grouped value is reported as ``""`` so the
    source cannot be mistaken for an unchanged one, whatever order the scan
    returned the rows in.

    Returns:
        Mapping of source id to its stored revision state; empty when there
        is no table or the table has no rows.
    """
    if self._table is None:
        return {}
    total = self._table.count_rows()
    if total == 0:
        return {}
    arrow = (
        self._table.search()
        .select(["id", "parent_id", "source", "source_revision", "embedding_model"])
        .limit(total)
        .to_arrow()
    )
    ids = arrow.column("id").to_pylist()
    parents = arrow.column("parent_id").to_pylist()
    sources = arrow.column("source").to_pylist()
    revisions = arrow.column("source_revision").to_pylist()
    models = arrow.column("embedding_model").to_pylist()
    row_ids_by_source: dict[str, set[str]] = defaultdict(set)
    revision_by_source: dict[str, str] = {}
    lane_by_source: dict[str, str] = {}
    model_by_source: dict[str, str] = {}
    for row_id, parent_id, source, revision, model in zip(
        ids, parents, sources, revisions, models, strict=True
    ):
        source_id = str(parent_id) if parent_id is not None else str(row_id)
        stored_revision = "" if revision is None else str(revision)
        stored_model = "" if model is None else str(model)
        row_ids_by_source[source_id].add(str(row_id))
        # NOTE(review): the lane is still last-row-wins; assumes ids never
        # collide across lanes — confirm.
        lane_by_source[source_id] = str(source)
        # setdefault keeps the first value seen for the source. Any later row
        # that disagrees collapses the group to "", and "" then stays sticky
        # because it differs from every real revision / model name.
        if revision_by_source.setdefault(source_id, stored_revision) != stored_revision:
            revision_by_source[source_id] = ""
        if model_by_source.setdefault(source_id, stored_model) != stored_model:
            model_by_source[source_id] = ""
    return {
        source_id: ExistingSourceRevision(
            source=cast(SemanticSource, lane_by_source[source_id]),
            source_revision=revision_by_source[source_id],
            embedding_model=model_by_source[source_id],
            row_ids=frozenset(row_ids),
        )
        for source_id, row_ids in row_ids_by_source.items()
    }
arrow.column("embedding_model").to_pylist() - for row_id, text_hash, model in zip(row_ids, hashes, models, strict=True): + revisions = arrow.column("source_revision").to_pylist() + for row_id, text_hash, model, revision in zip( + row_ids, hashes, models, revisions, strict=True + ): result[str(row_id)] = SemanticRowFingerprint( id=str(row_id), text_hash=str(text_hash), embedding_model=str(model), + source_revision="" if revision is None else str(revision), ) return result diff --git a/codeclone/memory/semantic/models.py b/codeclone/memory/semantic/models.py index 3124dab9..910c1a0f 100644 --- a/codeclone/memory/semantic/models.py +++ b/codeclone/memory/semantic/models.py @@ -30,6 +30,11 @@ class SemanticProjection(BaseModel): status: str | None = None text: str = Field(min_length=1) text_hash: str = Field(min_length=1) + # Cheap, projection-free revision key (Stage 2). Derived identically by the + # inventory scan and the full projection, so an unchanged source row's stored + # revision always equals its freshly scanned one. Default "" = legacy/unknown + # = always changed; a real projection always sets a non-empty revision. + source_revision: str = "" class SemanticRow(BaseModel): @@ -54,14 +59,17 @@ class SemanticRow(BaseModel): status: str | None = None text_hash: str = Field(min_length=1) embedding_model: str = Field(min_length=1) + source_revision: str = "" vector: tuple[float, ...] class SemanticRowFingerprint(BaseModel): """Identity of a stored row without its vector. - The incremental rebuild fetches these (id + ``text_hash`` + model) to decide - what to re-embed, so it never loads vectors to check freshness. + The incremental rebuild fetches these (id + ``text_hash`` + model + + ``source_revision``) to decide what to re-embed, so it never loads vectors to + check freshness. ``source_revision`` guarantees a revision-changed row is + re-embedded (and its new revision persisted) even when its text is unchanged. 
""" model_config = ConfigDict(frozen=True, extra="forbid") @@ -69,6 +77,29 @@ class SemanticRowFingerprint(BaseModel): id: str = Field(min_length=1) text_hash: str = Field(min_length=1) embedding_model: str = Field(min_length=1) + source_revision: str = "" + + +class ExistingSourceRevision(BaseModel): + """Stored revision state for one source object, grouped from its index rows. + + The incremental rebuild reads these once (a single metadata scan, no vectors) + to partition each lane's source ids into new / unchanged / changed / deleted. + ``source_id`` is the row ``parent_id`` for chunked trajectories, else the row + ``id``; every row of one source shares the same ``source_revision``, so the + grouped value is unambiguous. ``row_ids`` are all index rows for that source + (every chunk), used to keep unchanged rows in ``seen_ids`` during reconcile. + """ + + model_config = ConfigDict(frozen=True, extra="forbid") + + source: SemanticSource + source_revision: str = "" + # Embedding model the stored rows were built with; a source whose revision is + # unchanged but whose model differs from the current provider is still stale + # (a model swap must re-embed every lane), so the partition checks both. 
+ embedding_model: str = "" + row_ids: frozenset[str] class SemanticHit(BaseModel): @@ -120,6 +151,7 @@ class SemanticSearchResult(BaseModel): __all__ = [ + "ExistingSourceRevision", "SemanticHit", "SemanticIndexStatus", "SemanticProjection", diff --git a/codeclone/memory/semantic/projection.py b/codeclone/memory/semantic/projection.py index 5767cd57..b77fe00c 100644 --- a/codeclone/memory/semantic/projection.py +++ b/codeclone/memory/semantic/projection.py @@ -9,14 +9,25 @@ import hashlib from collections.abc import Iterable +from ...contracts import ( + AUDIT_PROJECTION_VERSION, + MEMORY_PROJECTION_VERSION, + SEMANTIC_PROJECTION_REVISION_VERSION, + TRAJECTORY_PROJECTION_VERSION, +) from ..models import MemoryRecord from ..trajectory.models import Trajectory from ..trajectory.retrieval import trajectory_semantic_text_parts from .models import SemanticProjection +# Field separator for the source_revision payload — an ASCII unit separator that +# cannot appear in version tags, source ids, or content tokens, so the joined +# fields can never collide. +_REVISION_FIELD_SEP = "\x1f" + # Prose/decision subset only. Structural records (module_role, test_anchor, # document_link, public_surface, stale_marker) are served by exact subject -# match and are NOT semantically indexed (Phase 20 spec §6.1). +# match and are NOT semantically indexed. INDEXED_MEMORY_TYPES: frozenset[str] = frozenset( { "contract_note", @@ -29,7 +40,7 @@ } ) -# Forensically useful audit incidents (Phase 20 spec §6.2). Projected from the +# Forensically useful audit incidents. Projected from the # bounded controller_events.summary column only — never payload_json. INDEXED_AUDIT_EVENTS: frozenset[str] = frozenset( { @@ -48,6 +59,41 @@ def text_hash(text: str) -> str: return hashlib.sha256(text.encode("utf-8")).hexdigest() +def source_revision(*, source_kind: str, source_id: str, content_token: str) -> str: + """Cheap, projection-free revision key for one source row (Stage 2). 
+ + Folds the global ``SEMANTIC_PROJECTION_REVISION_VERSION`` escape hatch with the + source kind/id and a per-source content token (which carries that source's + projection version). It is derivable identically from the cheap inventory + scan and from the full projection, so an unchanged source row always hashes to + the same revision through both paths — that equality is what lets the rebuild + skip re-projecting it. It is NOT the projected text hash: computing the text + hash needs the expensive projection, which is exactly what this avoids. + """ + payload = _REVISION_FIELD_SEP.join( + (SEMANTIC_PROJECTION_REVISION_VERSION, source_kind, source_id, content_token) + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def memory_content_token(record: MemoryRecord) -> str: + """Cheap change token for a memory record: its projection version, mutation + timestamp, and status. No statement/subjects read required.""" + return f"{MEMORY_PROJECTION_VERSION}:{record.updated_at_utc}:{record.status}" + + +def audit_content_token() -> str: + """Audit events are immutable, so the token is the projection version alone: + a new event_id is the only thing that changes a row's revision.""" + return AUDIT_PROJECTION_VERSION + + +def trajectory_content_token(*, trajectory_digest: str) -> str: + """Trajectory change token: projection version plus the content-addressed + trajectory digest (the cheap list scan already returns it).""" + return f"{TRAJECTORY_PROJECTION_VERSION}:{trajectory_digest}" + + def is_indexed_memory_type(record_type: str) -> bool: return record_type in INDEXED_MEMORY_TYPES @@ -82,6 +128,11 @@ def project_memory_record( status=record.status, text=text, text_hash=text_hash(text), + source_revision=source_revision( + source_kind="memory", + source_id=record.id, + content_token=memory_content_token(record), + ), ) @@ -107,6 +158,11 @@ def project_audit_event( status=None, text=text, text_hash=text_hash(text), + source_revision=source_revision( + 
source_kind="audit", + source_id=event_id, + content_token=audit_content_token(), + ), ) @@ -129,6 +185,13 @@ def project_trajectory( status=trajectory.outcome, text=text, text_hash=text_hash(text), + source_revision=source_revision( + source_kind="trajectory", + source_id=trajectory.id, + content_token=trajectory_content_token( + trajectory_digest=trajectory.trajectory_digest + ), + ), ) @@ -142,10 +205,14 @@ def _primary_trajectory_path(trajectory: Trajectory) -> str | None: __all__ = [ "INDEXED_AUDIT_EVENTS", "INDEXED_MEMORY_TYPES", + "audit_content_token", "is_indexed_audit_event", "is_indexed_memory_type", + "memory_content_token", "project_audit_event", "project_memory_record", "project_trajectory", + "source_revision", "text_hash", + "trajectory_content_token", ] diff --git a/codeclone/memory/semantic/rebuild.py b/codeclone/memory/semantic/rebuild.py index 12e6b311..ee162ec2 100644 --- a/codeclone/memory/semantic/rebuild.py +++ b/codeclone/memory/semantic/rebuild.py @@ -27,7 +27,8 @@ expand_projection, resolve_passage_chunker, ) -from .models import SemanticRow, SemanticRowFingerprint +from .models import ExistingSourceRevision, SemanticRow, SemanticRowFingerprint +from .sources import SourceScanError if TYPE_CHECKING: from ..embedding import EmbeddingProvider @@ -42,13 +43,19 @@ @dataclass(frozen=True, slots=True) class RebuildReport: """Outcome of a semantic rebuild: indexed total, deletions, and the - embedded vs hash-skipped split (per source).""" + embedded vs skipped-unchanged split (per source). + + ``incomplete_lanes`` names sources whose scan was degraded this cycle; those + lanes were preserved (no pruning) instead of reconciled, so the rebuild is + advisory-degraded rather than authoritative for them. + """ indexed: int deleted: int = 0 embedded: int = 0 skipped_unchanged: int = 0 by_source: dict[str, int] = field(default_factory=dict) + incomplete_lanes: tuple[str, ...] 
= () @dataclass(frozen=True, slots=True) @@ -56,6 +63,7 @@ class _SourceIndexStats: seen_ids: set[str] embedded: int skipped_unchanged: int + document_count: int = 0 def rebuild_semantic_index( @@ -65,70 +73,188 @@ def rebuild_semantic_index( sources: Sequence[IndexSource], embed_batch_limits: EmbedBatchLimits | None = None, ) -> RebuildReport: - """Reconcile the semantic index against its sources by content hash. + """Reconcile the semantic index against its sources by source revision. - A row is re-embedded only when its projection ``text_hash`` (or the - embedding model) differs from the stored fingerprint; unchanged rows are - skipped without loading their vectors, so an unchanged corpus never loads - the embedding model. The index is a derived, rebuildable sidecar, never - updated on the write hot path. + Each source is scanned for a cheap ``source_revision`` per current row (no + projection, no full hydration). Only sources whose revision differs from the + stored one are re-projected and re-embedded; an unchanged source is never + sourced past the scan, so an unchanged corpus does no projection, embedding, + or row writes. A source whose scan is degraded preserves its lane (its stored + rows are kept and never pruned). The index is a derived, rebuildable sidecar, + never updated on the write hot path. """ limits = embed_batch_limits or EmbedBatchLimits() chunker = resolve_passage_chunker(provider) - by_source: dict[str, int] = {} - seen_ids: set[str] = set() - embedded = 0 - skipped = 0 + # One metadata scan of the stored rows, diffed per source against each + # source's cheap revision scan. A degraded source (SourceScanError or a + # non-complete scan status) preserves its lane: pruning, which is + # destructive, is gated off for the whole cycle and the lane's stored rows + # stay in seen_ids — a transient failure must never masquerade as an empty + # source and delete still-live rows. 
+ existing = writer.existing_revisions() + scanned: list[tuple[str, _SourceIndexStats]] = [] + incomplete_lanes: list[str] = [] for source in sources: if not source.available(): continue + stats = _scan_source( + source, + writer=writer, + provider=provider, + chunker=chunker, + existing=existing, + embed_batch_limits=limits, + ) + if stats is None: + incomplete_lanes.append(source.name()) + else: + scanned.append((source.name(), stats)) + seen_ids = {row_id for _, stats in scanned for row_id in stats.seen_ids} + seen_ids |= _preserved_lane_rows(existing, incomplete_lanes) + deleted = _reconcile(writer, seen_ids=seen_ids, prune=not incomplete_lanes) + return RebuildReport( + indexed=len(seen_ids), + deleted=deleted, + embedded=sum(stats.embedded for _, stats in scanned), + skipped_unchanged=sum(stats.skipped_unchanged for _, stats in scanned), + by_source={ + name: stats.document_count + for name, stats in scanned + if stats.document_count + }, + incomplete_lanes=tuple(incomplete_lanes), + ) + + +def _preserved_lane_rows( + existing: dict[str, ExistingSourceRevision], + incomplete_lanes: Sequence[str], +) -> set[str]: + """Every stored row id of a degraded lane, so reconcile keeps it this cycle.""" + if not incomplete_lanes: + return set() + lanes = set(incomplete_lanes) + return { + row_id + for ex in existing.values() + if ex.source in lanes + for row_id in ex.row_ids + } + + +def _scan_source( + source: IndexSource, + *, + writer: SemanticIndexWriter, + provider: EmbeddingProvider, + chunker: PassageChunker, + existing: dict[str, ExistingSourceRevision], + embed_batch_limits: EmbedBatchLimits, +) -> _SourceIndexStats | None: + """Index one source, or None when its scan was degraded (lane preserved).""" + try: with span(name=f"memory.semantic.source.{source.name()}"): - stats = _index_source( + return _index_source( source, writer=writer, provider=provider, chunker=chunker, - embed_batch_limits=limits, + existing=existing, + embed_batch_limits=embed_batch_limits, 
) - if stats.seen_ids: - by_source[source.name()] = _count_source_documents(source) - seen_ids |= stats.seen_ids - embedded += stats.embedded - skipped += stats.skipped_unchanged - deleted = 0 + except SourceScanError: + return None + + +def _reconcile( + writer: SemanticIndexWriter, + *, + seen_ids: set[str], + prune: bool, +) -> int: + """Delete rows absent from this rebuild. Pruning is skipped when ``prune`` is + false (a source could not be read), so a transient read failure never deletes + still-live rows; the deletions are simply deferred to the next clean rebuild.""" with span(name="memory.semantic.reconcile") as reconcile_span: - stale = writer.known_ids() - seen_ids - if stale: - writer.delete(sorted(stale)) - deleted = len(stale) + deleted = 0 + if prune: + stale = writer.known_ids() - seen_ids + if stale: + writer.delete(sorted(stale)) + deleted = len(stale) if is_observability_enabled(): reconcile_span.set_counter("indexed", len(seen_ids)) reconcile_span.set_counter("deleted", deleted) - return RebuildReport( - indexed=len(seen_ids), - deleted=deleted, - embedded=embedded, - skipped_unchanged=skipped, - by_source=by_source, - ) + return deleted -def _count_source_documents(source: IndexSource) -> int: - return sum(1 for _ in source.iter_projections()) +def _index_source( + source: IndexSource, + *, + writer: SemanticIndexWriter, + provider: EmbeddingProvider, + chunker: PassageChunker, + existing: dict[str, ExistingSourceRevision], + embed_batch_limits: EmbedBatchLimits, +) -> _SourceIndexStats | None: + scan = source.scan() + if scan.status != "complete": + return None + lane = source.name() + stored = {sid: ex for sid, ex in existing.items() if ex.source == lane} + seen: set[str] = set() + changed_ids: list[str] = [] + unchanged_rows = 0 + for source_id, revision in scan.revisions.items(): + ex = stored.get(source_id) + # NEW (absent) / CHANGED (revision differs) / legacy ("" stored revision) + # / model-swapped (rows built with a different embedding 
model) are all + # re-projected; an unchanged source contributes its stored rows to + # seen_ids without any projection or embedding. + if ( + ex is None + or ex.source_revision == "" + or ex.source_revision != revision + or ex.embedding_model != provider.model_id + ): + changed_ids.append(source_id) + else: + seen.update(ex.row_ids) + unchanged_rows += len(ex.row_ids) + changed_seen, embedded, skipped = _project_and_embed( + source, + changed_ids, + writer=writer, + provider=provider, + chunker=chunker, + embed_batch_limits=embed_batch_limits, + ) + seen |= changed_seen + return _SourceIndexStats( + seen_ids=seen, + embedded=embedded, + skipped_unchanged=unchanged_rows + skipped, + document_count=len(scan.revisions), + ) -def _index_source( +def _project_and_embed( source: IndexSource, + source_ids: Sequence[str], *, writer: SemanticIndexWriter, provider: EmbeddingProvider, chunker: PassageChunker, embed_batch_limits: EmbedBatchLimits, -) -> _SourceIndexStats: +) -> tuple[set[str], int, int]: + """Project + embed only the changed ``source_ids``; returns their row ids + (for seen_ids), the embedded count, and the text-hash/model skip count. 
Each + batch is upserted (source_revision + vector together) before reconcile runs, + so a crash mid-cycle leaves rows the next rebuild re-converges.""" seen: set[str] = set() embedded = 0 skipped = 0 - for page in chunked(source.iter_projections(), _FINGERPRINT_PAGE_SIZE): + for page in chunked(source.project(source_ids), _FINGERPRINT_PAGE_SIZE): units: list[IndexedSemanticUnit] = [] for projection in page: units.extend(expand_projection(projection, chunker)) @@ -138,11 +264,7 @@ def _index_source( changed = [ unit for unit in units - if _needs_embed( - fingerprints.get(unit.row_id), - unit, - provider.model_id, - ) + if _needs_embed(fingerprints.get(unit.row_id), unit, provider.model_id) ] skipped += len(units) - len(changed) embedded += _embed_and_upsert( @@ -151,11 +273,7 @@ def _index_source( provider=provider, embed_batch_limits=embed_batch_limits, ) - return _SourceIndexStats( - seen_ids=seen, - embedded=embedded, - skipped_unchanged=skipped, - ) + return seen, embedded, skipped def _embed_and_upsert( @@ -233,6 +351,7 @@ def _needs_embed( return ( fingerprint.text_hash != unit.text_hash or fingerprint.embedding_model != model_id + or fingerprint.source_revision != unit.source_revision ) @@ -253,6 +372,7 @@ def _row( status=unit.status, text_hash=unit.text_hash, embedding_model=model_id, + source_revision=unit.source_revision, vector=tuple(vector), ) diff --git a/codeclone/memory/semantic/sources.py b/codeclone/memory/semantic/sources.py index 8d8b079b..304c02d1 100644 --- a/codeclone/memory/semantic/sources.py +++ b/codeclone/memory/semantic/sources.py @@ -7,9 +7,10 @@ from __future__ import annotations import sqlite3 -from collections.abc import Iterator, Sequence +from collections.abc import Iterable, Iterator, Sequence +from dataclasses import dataclass from pathlib import Path -from typing import Protocol +from typing import Literal, Protocol from ...audit.schema import open_audit_db_readonly from ...audit.validation import AuditSchemaError @@ -18,10 
+19,14 @@ from .models import SemanticProjection from .projection import ( INDEXED_AUDIT_EVENTS, + audit_content_token, is_indexed_memory_type, + memory_content_token, project_audit_event, project_memory_record, project_trajectory, + source_revision, + trajectory_content_token, ) # Live, retrievable statuses. rejected/archived/superseded are not surfaced by @@ -29,6 +34,35 @@ _INDEXED_STATUSES: frozenset[str] = frozenset({"active", "draft", "stale"}) _PAGE_SIZE = 200 +# How completely a source could enumerate its current rows this cycle. Only a +# ``complete`` scan lets the rebuild prune that lane's stale rows; a degraded +# scan preserves the lane (Stage 1's all-or-nothing scan_failed gate, per lane). +ScanStatus = Literal["complete", "partial", "failed"] + + +class SourceScanError(Exception): + """A source could not enumerate its current rows (a transient read failure). + + The rebuild treats the lane as *incomplete* and preserves it: a failed read + must never masquerade as an empty source, or reconcile would delete the + whole lane. ``available()`` returning False is a different, deliberate state + (the source is off) and is reconciled as a complete-empty lane. + """ + + +@dataclass(frozen=True, slots=True) +class SourceScan: + """A lane's full revision inventory plus how complete the scan was. + + ``revisions`` maps every current source id to its cheap ``source_revision`` + (no projection built, no full hydration). The rebuild diffs this against the + stored revisions to project only what changed. ``status`` gates destructive + reconcile: a degraded scan preserves the lane instead of pruning it. + """ + + revisions: dict[str, str] + status: ScanStatus = "complete" + def _primary_path(subjects: Sequence[MemorySubject]) -> str | None: for subject in subjects: @@ -40,8 +74,10 @@ def _primary_path(subjects: Sequence[MemorySubject]) -> str | None: class IndexSource(Protocol): """A source of deterministic projections to feed the semantic index. 
- Each source reports availability and yields projections (or nothing); a - rebuild iterates the available sources. + Each source reports availability, scans a cheap revision inventory, and + projects either every row (``iter_projections``, used by the projection + probe) or only a changed subset (``project``, used by the incremental + rebuild). """ def name(self) -> str: ... @@ -50,6 +86,18 @@ def available(self) -> bool: ... def iter_projections(self) -> Iterator[SemanticProjection]: ... + def scan(self) -> SourceScan: + """Cheap full inventory of current source ids -> ``source_revision``, + with no projection built and no full hydration. A read failure degrades + the returned ``status`` instead of raising, so the rebuild preserves the + lane rather than pruning it.""" + ... + + def project(self, source_ids: Sequence[str]) -> Iterator[SemanticProjection]: + """Build deterministic projections for ``source_ids`` only (the changed + subset from the revision partition). Empty ids yields nothing.""" + ... + class _MemoryReadStore(Protocol): """Minimal read surface MemoryIndexSource needs from the memory store.""" @@ -91,6 +139,32 @@ def available(self) -> bool: return True def iter_projections(self) -> Iterator[SemanticProjection]: + for records in self._iter_pages(): + yield from self._project_page(records) + + def scan(self) -> SourceScan: + # Cheap inventory: single-table record scan, no subjects join and no + # projection text built. Only the indexed prose/decision subset gets a + # revision; everything else is not embedded, so it has no row to track. 
+ revisions: dict[str, str] = {} + for records in self._iter_pages(): + for record in records: + if self._is_indexed(record): + revisions[record.id] = source_revision( + source_kind="memory", + source_id=record.id, + content_token=memory_content_token(record), + ) + return SourceScan(revisions=revisions) + + def project(self, source_ids: Sequence[str]) -> Iterator[SemanticProjection]: + wanted = set(source_ids) + if not wanted: + return + for records in self._iter_pages(): + yield from self._project_page(records, wanted=wanted) + + def _iter_pages(self) -> Iterator[Sequence[MemoryRecord]]: offset = 0 while True: records = self._store.query_records( @@ -100,25 +174,39 @@ def iter_projections(self) -> Iterator[SemanticProjection]: offset=offset, ) ) - indexed = [ - record - for record in records - if is_indexed_memory_type(record.type) - and record.status in _INDEXED_STATUSES - ] - # One batched subject load per page instead of a query per record. - subjects_by_id = self._store.list_subjects_for_memories( - [record.id for record in indexed] - ) - for record in indexed: - yield project_memory_record( - record, - subject_path=_primary_path(subjects_by_id.get(record.id, [])), - ) + yield records if len(records) < _PAGE_SIZE: return offset += _PAGE_SIZE + @staticmethod + def _is_indexed(record: MemoryRecord) -> bool: + return ( + is_indexed_memory_type(record.type) and record.status in _INDEXED_STATUSES + ) + + def _project_page( + self, + records: Sequence[MemoryRecord], + *, + wanted: set[str] | None = None, + ) -> Iterator[SemanticProjection]: + indexed = [ + record + for record in records + if self._is_indexed(record) and (wanted is None or record.id in wanted) + ] + # One batched subject load per page instead of a query per record; only + # the projected (changed) subset pays the subjects join. 
+ subjects_by_id = self._store.list_subjects_for_memories( + [record.id for record in indexed] + ) + for record in indexed: + yield project_memory_record( + record, + subject_path=_primary_path(subjects_by_id.get(record.id, [])), + ) + class TrajectoryIndexSource: """Trajectory memory as a semantic source. @@ -138,6 +226,33 @@ def available(self) -> bool: return True def iter_projections(self) -> Iterator[SemanticProjection]: + for page in self._iter_list_pages(): + yield from self._hydrate_and_project(item.id for item in page) + + def scan(self) -> SourceScan: + # Cheap inventory: the list scan already returns each trajectory_digest, + # so a revision is derivable without hydrating (no find_trajectories) — + # this is the lever that removes the full-hydration cost on an unchanged + # corpus. + revisions: dict[str, str] = {} + for page in self._iter_list_pages(): + for item in page: + revisions[item.id] = source_revision( + source_kind="trajectory", + source_id=item.id, + content_token=trajectory_content_token( + trajectory_digest=item.trajectory_digest + ), + ) + return SourceScan(revisions=revisions) + + def project(self, source_ids: Sequence[str]) -> Iterator[SemanticProjection]: + wanted = list(source_ids) + if not wanted: + return + yield from self._hydrate_and_project(wanted) + + def _iter_list_pages(self) -> Iterator[Sequence[TrajectoryListItem]]: offset = 0 while True: items = self._store.list_trajectories( @@ -145,13 +260,18 @@ def iter_projections(self) -> Iterator[SemanticProjection]: limit=_PAGE_SIZE + offset, ) page = items[offset : offset + _PAGE_SIZE] - # Batch-hydrate the page instead of one find_trajectory per item. 
- for trajectory in self._store.find_trajectories([item.id for item in page]): - yield project_trajectory(trajectory) + yield page if len(page) < _PAGE_SIZE: return offset += _PAGE_SIZE + def _hydrate_and_project( + self, trajectory_ids: Iterable[str] + ) -> Iterator[SemanticProjection]: + # Batch-hydrate instead of one find_trajectory per id. + for trajectory in self._store.find_trajectories(list(trajectory_ids)): + yield project_trajectory(trajectory) + class AuditIndexSource: """Audit trail as an availability-gated semantic index source. @@ -175,28 +295,45 @@ def available(self) -> bool: def iter_projections(self) -> Iterator[SemanticProjection]: if not self.available(): return - yield from self._read_projections() + yield from self._iter_event_projections(only_ids=None) - def _read_projections(self) -> Iterator[SemanticProjection]: - event_types = tuple(sorted(INDEXED_AUDIT_EVENTS)) - placeholders = ", ".join("?" for _ in event_types) - try: - conn = open_audit_db_readonly(self._db_path) - except (sqlite3.Error, AuditSchemaError, OSError): - return + def scan(self) -> SourceScan: + # Audit events are immutable, so a row's revision is just the projection + # version: a new event_id is the only thing that changes the lane. An + # unavailable source is a deliberate complete-empty lane (off, not + # failed); a transient read failure degrades to ``failed`` (preserve). 
+ if not self.available(): + return SourceScan(revisions={}) try: - rows = conn.execute( - "SELECT event_id, event_type, summary FROM controller_events " - "WHERE summary IS NOT NULL AND summary != '' " - f"AND event_type IN ({placeholders}) " - "ORDER BY created_at_utc ASC, id ASC", - event_types, - ).fetchall() - except (sqlite3.Error, AuditSchemaError): + rows = self._fetch_event_rows( + columns=("event_id", "summary"), only_ids=None + ) + except SourceScanError: + return SourceScan(revisions={}, status="failed") + revisions: dict[str, str] = {} + for event_id, summary in rows: + if not isinstance(summary, str) or not summary.strip(): + continue + eid = str(event_id) + revisions[eid] = source_revision( + source_kind="audit", + source_id=eid, + content_token=audit_content_token(), + ) + return SourceScan(revisions=revisions) + + def project(self, source_ids: Sequence[str]) -> Iterator[SemanticProjection]: + wanted = set(source_ids) + if not wanted or not self.available(): return - finally: - conn.close() - for event_id, event_type, summary in rows: + yield from self._iter_event_projections(only_ids=wanted) + + def _iter_event_projections( + self, *, only_ids: set[str] | None + ) -> Iterator[SemanticProjection]: + for event_id, event_type, summary in self._fetch_event_rows( + columns=("event_id", "event_type", "summary"), only_ids=only_ids + ): if not isinstance(summary, str) or not summary.strip(): continue yield project_audit_event( @@ -205,10 +342,47 @@ def _read_projections(self) -> Iterator[SemanticProjection]: summary=summary, ) + def _fetch_event_rows( + self, + *, + columns: Sequence[str], + only_ids: set[str] | None, + ) -> list[tuple[object, ...]]: + """Open the audit DB read-only and fetch the forensic event rows, raising + ``SourceScanError`` on any read failure. 
``columns`` is a fixed internal + allow-list (never user input); ``only_ids`` adds an ``event_id IN (...)`` + filter for the changed-subset projection.""" + event_types = tuple(sorted(INDEXED_AUDIT_EVENTS)) + type_placeholders = ", ".join("?" for _ in event_types) + sql = ( + f"SELECT {', '.join(columns)} FROM controller_events " + "WHERE summary IS NOT NULL AND summary != '' " + f"AND event_type IN ({type_placeholders}) " + ) + params: list[object] = list(event_types) + if only_ids is not None: + id_placeholders = ", ".join("?" for _ in only_ids) + sql += f"AND event_id IN ({id_placeholders}) " + params.extend(sorted(only_ids)) + sql += "ORDER BY created_at_utc ASC, id ASC" + try: + conn = open_audit_db_readonly(self._db_path) + except (sqlite3.Error, AuditSchemaError, OSError) as exc: + raise SourceScanError("audit source could not open its database") from exc + try: + return conn.execute(sql, params).fetchall() + except (sqlite3.Error, AuditSchemaError) as exc: + raise SourceScanError("audit source could not read its events") from exc + finally: + conn.close() + __all__ = [ "AuditIndexSource", "IndexSource", "MemoryIndexSource", + "ScanStatus", + "SourceScan", + "SourceScanError", "TrajectoryIndexSource", ] diff --git a/codeclone/memory/sqlite_store.py b/codeclone/memory/sqlite_store.py index 0089721a..7bda8bd1 100644 --- a/codeclone/memory/sqlite_store.py +++ b/codeclone/memory/sqlite_store.py @@ -225,6 +225,22 @@ def load_trajectory_patch_trails( trajectory_ids=trajectory_ids, ) + def find_trajectory_patch_trails_for_lookup( + self, + *, + project_id: str, + patch_trail_digest: str | None = None, + run_id: str | None = None, + ) -> tuple[list[dict[str, object]], int]: + from .trajectory.store import find_trajectory_patch_trails_for_lookup + + return find_trajectory_patch_trails_for_lookup( + self._conn, + project_id=project_id, + patch_trail_digest=patch_trail_digest, + run_id=run_id, + ) + def list_canonical_trajectories_for_export( self, *, diff --git 
a/codeclone/memory/trajectory/models.py b/codeclone/memory/trajectory/models.py index e1b457c5..73b1fc51 100644 --- a/codeclone/memory/trajectory/models.py +++ b/codeclone/memory/trajectory/models.py @@ -145,6 +145,10 @@ class TrajectoryListItem: started_at_utc: str finished_at_utc: str summary: str + # Content-addressed digest of the trajectory (Stage 2): lets the semantic + # source compute a cheap revision token from the list scan alone, without + # hydrating each trajectory. Defaults "" so older callers stay valid. + trajectory_digest: str = "" __all__ = [ diff --git a/codeclone/memory/trajectory/patch_trail_projector.py b/codeclone/memory/trajectory/patch_trail_projector.py index 3ba6142a..1f1300ac 100644 --- a/codeclone/memory/trajectory/patch_trail_projector.py +++ b/codeclone/memory/trajectory/patch_trail_projector.py @@ -28,7 +28,7 @@ PatchTrailInputs, VerifySnapshot, ) -from .patch_trail import PatchTrail, compute_patch_trail +from .patch_trail import PatchTrail, compute_patch_trail, patch_trail_from_mapping from .projector import TrajectoryProjectionError @@ -69,6 +69,9 @@ def project_patch_trail_from_audit( if not workflow_id.startswith("intent:"): return None ordered = tuple(sorted(records, key=_record_order_key)) + stored = _patch_trail_from_computed_event(ordered) + if stored is not None: + return stored state = _WorkflowAuditState() for record in ordered: _apply_audit_record(state, record) @@ -115,6 +118,31 @@ def project_patch_trail_from_audit( return compute_patch_trail(inputs) +def _patch_trail_from_computed_event( + records: Sequence[AuditRecord], +) -> PatchTrail | None: + for record in reversed(records): + if record.event_type != EVENT_PATCH_TRAIL_COMPUTED: + continue + payload = _audit_payload_mapping(record.payload_json) + if payload is None: + continue + trail = patch_trail_from_mapping(payload) + if trail is not None: + return trail + return None + + +def _audit_payload_mapping(payload_json: str | None) -> Mapping[str, object] | None: + if 
not payload_json or payload_json == "{}": + return None + try: + loaded = orjson.loads(payload_json) + except orjson.JSONDecodeError: + return None + return loaded if isinstance(loaded, dict) else None + + def _apply_audit_record(state: _WorkflowAuditState, record: AuditRecord) -> None: if record.audit_sequence is None: return diff --git a/codeclone/memory/trajectory/retrieval.py b/codeclone/memory/trajectory/retrieval.py index f0533603..7d10afca 100644 --- a/codeclone/memory/trajectory/retrieval.py +++ b/codeclone/memory/trajectory/retrieval.py @@ -144,7 +144,10 @@ def serialize_trajectory_preview( } if relevance_score is not None: payload["relevance_score"] = round(relevance_score, 3) - summary = serialize_patch_trail_summary(patch_trail_payload) + summary = serialize_patch_trail_summary( + patch_trail_payload, + run_id=trajectory.primary_run_id, + ) if summary is not None: payload["patch_trail_summary"] = summary _add_quality_fields( @@ -218,6 +221,8 @@ def _add_quality_fields( def serialize_patch_trail_summary( payload: Mapping[str, object] | None, + *, + run_id: str | None = None, ) -> dict[str, object] | None: if payload is None: return None @@ -225,13 +230,30 @@ def serialize_patch_trail_summary( if trail is None: return None summary_payload = trail.to_payload(detail_level="summary") - return { + summary: dict[str, object] = { "summary_line": patch_trail_summary_line(trail), "patch_trail_digest": trail.patch_trail_digest, "counts": summary_payload.get("counts", {}), "scope_check_status": trail.scope_check_status, "verification_status": trail.verification_status, } + effective_run_id = _clean_run_id(run_id) + if effective_run_id and trail.patch_trail_digest: + summary["patch_trail_retrieval"] = { + "tool": "get_patch_trail", + "route": ("get_patch_trail(root=..., run_id=..., patch_trail_digest=...)"), + "run_id": effective_run_id, + "patch_trail_digest": trail.patch_trail_digest, + "snapshot_identity": "patch_trail_digest + optional run_id", + } + return 
summary + + +def _clean_run_id(value: str | None) -> str | None: + if not isinstance(value, str): + return None + stripped = value.strip() + return stripped or None def serialize_trajectory_detail( diff --git a/codeclone/memory/trajectory/store.py b/codeclone/memory/trajectory/store.py index 49faf5f7..c1421452 100644 --- a/codeclone/memory/trajectory/store.py +++ b/codeclone/memory/trajectory/store.py @@ -483,7 +483,7 @@ def list_trajectories( rows = conn.execute( """ SELECT id, workflow_id, outcome, quality_tier, quality_score, event_count, - started_at_utc, finished_at_utc, summary + started_at_utc, finished_at_utc, summary, trajectory_digest FROM memory_trajectories WHERE project_id=? ORDER BY finished_at_utc DESC, id ASC @@ -502,6 +502,7 @@ def list_trajectories( started_at_utc=str(row["started_at_utc"]), finished_at_utc=str(row["finished_at_utc"]), summary=str(row["summary"]), + trajectory_digest=str(row["trajectory_digest"]), ) for row in rows ] @@ -1012,10 +1013,103 @@ def load_trajectory_patch_trails( return loaded_by_id +def find_trajectory_patch_trails_for_lookup( + conn: sqlite3.Connection, + *, + project_id: str, + patch_trail_digest: str | None = None, + run_id: str | None = None, +) -> tuple[list[dict[str, object]], int]: + """Return stored trajectory patch trails for exact drill-down lookup. + + The memory projection store keeps immutable patch-trail payloads keyed by + trajectory, while MCP drill-down addresses them by digest and optional run. + This helper bridges those contracts without rebuilding current state. 
+ """ + clauses = ["t.project_id=?"] + params: list[object] = [project_id] + digest = _optional_text(patch_trail_digest) + if digest is not None: + clauses.append("pt.patch_trail_digest=?") + params.append(digest) + rows = conn.execute( + f""" + SELECT + t.primary_run_id, + t.first_run_id, + t.last_run_id, + pt.trajectory_id, + pt.patch_trail_digest, + pt.patch_trail_json, + pt.projected_at_utc + FROM memory_trajectory_patch_trails AS pt + JOIN memory_trajectories AS t ON t.id = pt.trajectory_id + WHERE {" AND ".join(clauses)} + ORDER BY pt.projected_at_utc DESC, pt.trajectory_id ASC + """, + tuple(params), + ).fetchall() + requested_run_id = _optional_text(run_id) + payloads: list[dict[str, object]] = [] + malformed = 0 + for row in rows: + if requested_run_id is not None and not _trajectory_run_id_matches( + row, + requested_run_id, + ): + continue + try: + loaded = orjson.loads(str(row["patch_trail_json"])) + except orjson.JSONDecodeError: + malformed += 1 + continue + if not isinstance(loaded, dict): + malformed += 1 + continue + payloads.append( + { + "run_id": _lookup_run_id(row), + "patch_trail_digest": str(row["patch_trail_digest"]), + "created_at_utc": str(row["projected_at_utc"]), + "payload": loaded, + } + ) + return payloads, malformed + + +def _trajectory_run_id_matches(row: sqlite3.Row, requested: str) -> bool: + return any( + _run_id_token_matches(row[key], requested) + for key in ("primary_run_id", "first_run_id", "last_run_id") + ) + + +def _run_id_token_matches(stored: object, requested: str) -> bool: + stored_id = _optional_text(stored) + requested_id = _optional_text(requested) + if stored_id is None or requested_id is None: + return False + return ( + stored_id == requested_id + or stored_id.startswith(requested_id) + or requested_id.startswith(stored_id) + ) + + +def _lookup_run_id(row: sqlite3.Row) -> str: + return ( + _optional_text(row["primary_run_id"]) + or _optional_text(row["last_run_id"]) + or _optional_text(row["first_run_id"]) + or 
"" + ) + + __all__ = [ "count_trajectories", "find_trajectories_by_ids", "find_trajectory", + "find_trajectory_patch_trails_for_lookup", "latest_projection_run", "list_trajectories", "list_trajectories_for_intent_id", diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py index 573cc9e2..f73d3ded 100644 --- a/codeclone/metrics/dependencies.py +++ b/codeclone/metrics/dependencies.py @@ -10,9 +10,10 @@ from typing import TYPE_CHECKING from ..models import DepGraph, ModuleDep +from ..utils import coerce if TYPE_CHECKING: - from collections.abc import Iterable, Sequence + from collections.abc import Callable, Iterable, Sequence DepAdjacency = dict[str, set[str]] @@ -270,3 +271,80 @@ def build_dep_graph(*, modules: Iterable[str], deps: Sequence[ModuleDep]) -> Dep p95_depth=p95_depth, longest_chains=chains, ) + + +def select_dependency_graph_nodes( + edges: Sequence[tuple[str, str]], + *, + dep_cycles: Sequence[object], + longest_chains: Sequence[object], + max_nodes: int, + max_edges: int, + node_id_fn: Callable[[str], str] | None = None, +) -> tuple[list[str], list[tuple[str, str]], dict[str, object]]: + """Deterministic subgraph sample over directed import edges. + + Seeds cycle members, then longest-chain members, then fills remaining slots + by descending node degree (tie-break id ascending), so structurally + important nodes survive downsampling. ``node_id_fn`` maps a module name to + the active zoom id (package prefix or identity) before membership checks + when seeding from cycles and chains. Returns the shown nodes, the induced + (edge-capped) edges, and a truncation metadata mapping. 
+ """ + all_nodes = sorted({part for edge in edges for part in edge}) + node_universe_count = len(all_nodes) + edge_universe_count = len(edges) + if node_universe_count > max_nodes: + degree_count: dict[str, int] = dict.fromkeys(all_nodes, 0) + for source, target in edges: + degree_count[source] = degree_count.get(source, 0) + 1 + degree_count[target] = degree_count.get(target, 0) + 1 + all_node_set = set(all_nodes) + nodes: list[str] = [] + node_set: set[str] = set() + + def _seed_node(node: object) -> None: + node_name = str(node).strip() + if node_id_fn is not None: + node_name = node_id_fn(node_name) + if ( + not node_name + or node_name not in all_node_set + or node_name in node_set + or len(nodes) >= max_nodes + ): + return + nodes.append(node_name) + node_set.add(node_name) + + for cycle in dep_cycles: + for node in coerce.as_sequence(cycle): + _seed_node(node) + for chain in longest_chains: + for node in coerce.as_sequence(chain): + _seed_node(node) + for node in sorted( + all_nodes, key=lambda item: (-degree_count.get(item, 0), item) + ): + _seed_node(node) + if len(nodes) >= max_nodes: + break + nodes.sort() + else: + nodes = list(all_nodes) + node_set = set(nodes) + filtered = [ + (source, target) + for source, target in edges + if source in node_set and target in node_set + ][:max_edges] + truncation: dict[str, object] = { + "truncated": len(nodes) < node_universe_count + or len(filtered) < edge_universe_count, + "node_universe_count": node_universe_count, + "node_shown_count": len(nodes), + "edge_universe_count": edge_universe_count, + "edge_shown_count": len(filtered), + "seed_policy": "cycles_then_chains_then_degree", + } + return nodes, filtered, truncation diff --git a/codeclone/models.py b/codeclone/models.py index dc5abb18..d94e47db 100644 --- a/codeclone/models.py +++ b/codeclone/models.py @@ -152,6 +152,7 @@ class DeadCandidate: "django", "fastapi", "flask", + "pydantic", "sqlalchemy", "starlette", "typer", diff --git 
a/codeclone/observability/__init__.py b/codeclone/observability/__init__.py index f9f0b3a8..589c95f6 100644 --- a/codeclone/observability/__init__.py +++ b/codeclone/observability/__init__.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Platform observability (Phase 29). +"""Platform observability. A runtime-profiling plane separate from audit truth, the analysis report, and the memory store: operations and stage spans for CLI / MCP / projection workers. @@ -15,12 +15,13 @@ from __future__ import annotations from .runtime import ( + DB_COUNTER_VERSION, OperationHandle, SpanHandle, bind_root, bootstrap, + counting_connection_factory, current_operation_context, - instrument_db_connection, is_observability_enabled, operation, payload_capture_enabled, @@ -32,12 +33,13 @@ ) __all__ = [ + "DB_COUNTER_VERSION", "OperationHandle", "SpanHandle", "bind_root", "bootstrap", + "counting_connection_factory", "current_operation_context", - "instrument_db_connection", "is_observability_enabled", "operation", "payload_capture_enabled", diff --git a/codeclone/observability/analysis_phases.py b/codeclone/observability/analysis_phases.py new file mode 100644 index 00000000..3732becf --- /dev/null +++ b/codeclone/observability/analysis_phases.py @@ -0,0 +1,34 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from ..analysis.phase_ledger import ( + PHASE_US_COUNTER_SUFFIXES, + PHASE_VOLUME_COUNTER_SUFFIXES, + PhaseSnapshot, +) +from .runtime import SpanHandle + + +def apply_pipeline_process_phase_counters( + span: SpanHandle, + *, + phase_snapshot: PhaseSnapshot | None, +) -> None: + if phase_snapshot is None: + return + + phase_counters = phase_snapshot.totals.counter_map_us() + for key in PHASE_US_COUNTER_SUFFIXES: + span.set_counter(key, phase_counters.get(key, 0)) + + volumes = phase_snapshot.volume_map() + for key in PHASE_VOLUME_COUNTER_SUFFIXES: + span.set_counter(key, volumes.get(key, 0)) + + +__all__ = ["apply_pipeline_process_phase_counters"] diff --git a/codeclone/observability/db_fingerprint.py b/codeclone/observability/db_fingerprint.py index b9ce496e..ec493162 100644 --- a/codeclone/observability/db_fingerprint.py +++ b/codeclone/observability/db_fingerprint.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""SQL statement fingerprinting for DB observability (Phase 29.DB, Track B). +"""SQL statement fingerprinting for DB observability. Performance-truth only: reduce a SQL statement to its normalized *shape* so the cockpit can turn "1892 queries" into "1200x SELECT evidence by trajectory_id". diff --git a/codeclone/observability/profile.py b/codeclone/observability/profile.py index 8636bd26..766a997d 100644 --- a/codeclone/observability/profile.py +++ b/codeclone/observability/profile.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""psutil resource sampling for observability profiling (Phase 29, profile=true). +"""psutil resource sampling for observability profiling. psutil is an optional dependency (``codeclone[perf]``) imported lazily inside the capture functions, so a disabled or non-profiling process never loads it. 
Every diff --git a/codeclone/observability/query.py b/codeclone/observability/query.py index 674c0e37..3ac58bec 100644 --- a/codeclone/observability/query.py +++ b/codeclone/observability/query.py @@ -5,7 +5,7 @@ # Copyright (c) 2026 Den Rozhnovskiy """``query_platform_observability`` — a sectioned, read-only diagnostics slicer -over the Phase 29 runtime telemetry (RFC specs/rfc-29-observability-query-tool). +over runtime telemetry. A **slicer, not a trace export API**: each call returns one bounded *section* projected from the already-computed ``AggregatesView``; no response embeds the @@ -21,6 +21,7 @@ from pathlib import Path from ..config.observability import resolve_observability_config +from .runtime import DB_COUNTER_VERSION from .store.reader import build_trace_view, open_observability_store_readonly from .views import AggregatesView, OperationView, TraceView @@ -36,7 +37,8 @@ _DB_CHATTY_QPC = 200 _CONTEXT_HEAVY_PCT = 25 _MEMORY_HEAVY_MB = 200.0 -_CONTEXT_PRESSURE_TOKENS = 8000 +_CONTEXT_PRESSURE_UNITS = 8000 +_ANALYSIS_HEAVY_WORKER_MS = 2000.0 _AGGREGATE_SECTIONS = ( "summary", @@ -48,6 +50,7 @@ "correlated_chains", "costly_noops", "pipeline", + "analysis_phase_cost", ) @@ -156,6 +159,7 @@ def _db_cost(agg: AggregatesView, cap: int) -> list[dict[str, object]]: "calls": r.span_count, "queries": r.total_queries, "writes": r.total_writes, + "rows": r.total_rows, "queries_per_call": per_call, "verdict": "query_chatty" if per_call >= _DB_CHATTY_QPC else "ok", } @@ -206,13 +210,18 @@ def _pipeline(agg: AggregatesView, cap: int) -> list[dict[str, object]]: def _agent_context_body(agg: AggregatesView, cap: int) -> dict[str, object]: agent = agg.agent if agent is None: - return {"total_response_tokens": 0, "rows": []} + return { + "total_response_tokens": 0, + "total_response_context_units": 0, + "rows": [], + } total = agent.response_tokens rows = [ { "tool": c.name, "calls": c.calls, "response_tokens": c.response_tokens, + "response_context_units": 
c.response_tokens, "context_percent": round(100 * c.response_tokens / total) if total else 0, "verdict": ( "context_heavy" @@ -222,7 +231,37 @@ def _agent_context_body(agg: AggregatesView, cap: int) -> dict[str, object]: } for c in agent.consumers[:cap] ] - return {"total_response_tokens": total, "rows": rows} + return { + "total_response_tokens": total, + "total_response_context_units": total, + "rows": rows, + } + + +def _analysis_phase_body(agg: AggregatesView, cap: int) -> dict[str, object]: + rows = [ + { + "phase": row.phase, + "worker_elapsed_ms": row.worker_elapsed_ms, + "share_permille": row.share_permille, + "verdict": row.verdict, + } + for row in agg.analysis_phases[:cap] + ] + body: dict[str, object] = { + "phase_worker_elapsed_total_ms": (agg.analysis_phase_worker_elapsed_total_ms), + "pipeline_process_wall_ms": agg.analysis_phase_pipeline_wall_ms, + "source_spans": agg.analysis_phase_source_spans, + "files_timed": agg.analysis_phase_files_timed, + "units_eligible": agg.analysis_phase_units_eligible, + "rows": rows, + } + if not rows: + body["message"] = ( + "no analysis phase counters in window; run with " + "CODECLONE_OBSERVABILITY_ENABLED=1 and a full analyze." + ) + return body def _chain_descendant_names(op: OperationView) -> list[str]: @@ -324,7 +363,22 @@ def _context_diagnostic(agg: AggregatesView) -> dict[str, object] | None: return None return { "kind": "context", - "message": f"{lead.name} consumed {pct}% of returned tokens.", + "message": f"{lead.name} consumed {pct}% of returned context units.", + } + + +def _analysis_diagnostic(agg: AggregatesView) -> dict[str, object] | None: + if not agg.analysis_phases: + return None + top = agg.analysis_phases[0] + if top.verdict != "phase_heavy": + return None + return { + "kind": "analysis", + "message": ( + f"{top.phase} consumed {top.share_permille / 10:.0f}% of measured " + f"extract time ({top.worker_elapsed_ms:.0f} ms)." 
+ ), } @@ -333,20 +387,35 @@ def _top_diagnostics(agg: AggregatesView) -> list[dict[str, object]]: _memory_diagnostic(agg), _db_diagnostic(agg), _context_diagnostic(agg), + _analysis_diagnostic(agg), ) return [d for d in candidates if d is not None][:_MAX_DIAGNOSTICS] def _summary_body(trace: TraceView) -> dict[str, object]: agg = trace.aggregates - return { + body: dict[str, object] = { "operations": agg.operation_count, + "db_counter_version": DB_COUNTER_VERSION, "peak_rss_delta_mb": _round1(agg.max_rss_delta_mb), "peak_rss_mb": _round1(agg.max_peak_rss_mb), "context_pressure_tokens": agg.agent.response_tokens if agg.agent else 0, + "context_pressure_units": agg.agent.response_tokens if agg.agent else 0, "costly_noops": sum(1 for s in agg.semantic_costs if s.no_op), "top_diagnostics": _top_diagnostics(agg), } + if agg.analysis_phases: + body["analysis_phase_worker_elapsed_total_ms"] = ( + agg.analysis_phase_worker_elapsed_total_ms + ) + body["top_analysis_phases"] = [ + { + "phase": row.phase, + "share_permille": row.share_permille, + } + for row in agg.analysis_phases[:_MAX_DIAGNOSTICS] + ] + return body def _recommended_next_sections( @@ -364,14 +433,24 @@ def _recommended_next_sections( "reason": f"high query count in {top.span_name}.", } ) - if agg.agent and agg.agent.response_tokens >= _CONTEXT_PRESSURE_TOKENS: + if agg.agent and agg.agent.response_tokens >= _CONTEXT_PRESSURE_UNITS: recs.append( - {"section": "agent_context", "reason": "high context-token pressure."} + {"section": "agent_context", "reason": "high context-unit pressure."} ) if any(s.no_op for s in agg.semantic_costs): recs.append( {"section": "costly_noops", "reason": "a span ran but produced nothing."} ) + if ( + agg.analysis_phase_worker_elapsed_total_ms is not None + and agg.analysis_phase_worker_elapsed_total_ms >= _ANALYSIS_HEAVY_WORKER_MS + ) or any(row.verdict == "phase_heavy" for row in agg.analysis_phases): + recs.append( + { + "section": "analysis_phase_cost", + "reason": 
"pipeline.process phase breakdown available.", + } + ) return recs @@ -432,6 +511,8 @@ def query_platform_observability( response.update(_summary_body(trace)) elif section == "agent_context": response.update(_agent_context_body(agg, row_cap)) + elif section == "analysis_phase_cost": + response.update(_analysis_phase_body(agg, row_cap)) elif section == "correlated_chains": response["rows"] = _correlated_chains(trace, row_cap) else: diff --git a/codeclone/observability/render_html.py b/codeclone/observability/render_html.py index 2e09cbc3..741aad22 100644 --- a/codeclone/observability/render_html.py +++ b/codeclone/observability/render_html.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Branded HTML renderer for the observability ``TraceView`` (Phase 29 output). +"""Branded HTML renderer for the observability ``TraceView``. A single self-contained page rendered as a *runtime-diagnosis cockpit*, not a data dump. It is laid out for a top-down reading trajectory that answers the @@ -20,12 +20,14 @@ from __future__ import annotations -from collections.abc import Mapping +from collections.abc import Iterable, Mapping +from datetime import datetime from html import escape from .views import ( AgentTokenRow, AggregatesView, + AnalysisPhaseRow, DbCostRow, DbFingerprintRow, McpToolAggregate, @@ -42,6 +44,22 @@ # A no-op span only deserves a "costly" warning once it has actually spent time. 
_NOOP_COSTLY_MS = 50.0 _KNOWN_SURFACES = frozenset({"mcp", "cli", "memory"}) +_ANALYSIS_PHASE_LABELS = { + "parse": "Parse (ast.parse)", + "qualname": "Qualname index", + "module_walk": "Module walk", + "relationship": "Relationship facts", + "suppressions": "Suppressions", + "unit_cfg": "CFG build", + "unit_normalize_cfg": "Normalize (CFG blocks)", + "unit_structural": "Structural scan", + "unit_normalize_stmt": "Normalize (statements)", + "unit_blocks": "Block extract", + "unit_segments": "Segment extract", + "class_metrics": "Class metrics", + "dead_code": "Dead-code collect", + "module_passes": "Module passes", +} # Reuse of the CodeClone brand mark (report/html/widgets/icons.py:BRAND_LOGO). _LOGO = ( @@ -68,6 +86,7 @@ --font:"Inter","Inter Variable",-apple-system,BlinkMacSystemFont,"Segoe UI", Roboto,sans-serif; --mono:"JetBrains Mono",ui-monospace,SFMono-Regular,Menlo,Consolas,monospace; +--radius-sm:4px;--radius-md:6px;--radius-lg:8px;--radius-xl:12px; } @media (prefers-color-scheme:light){:root{ --bg:oklch(98.5% 0.006 275);--surface:#fff;--surface-2:oklch(97.3% 0.006 275); @@ -93,14 +112,14 @@ color:var(--mute);font-weight:600;margin:0 0 4px 2px} .shint{color:var(--mute);font-size:12px;margin:0 0 11px 2px} .panel{background:var(--surface);border:1px solid var(--border); -border-radius:11px;overflow:hidden} +border-radius:var(--radius-xl);overflow:hidden} .grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(148px,1fr)); gap:10px;margin-bottom:12px} .stats{display:grid;grid-template-columns:repeat(5,minmax(0,1fr));gap:10px; margin-bottom:12px} @media (max-width:760px){.stats{grid-template-columns:repeat(2,minmax(0,1fr))}} .card{background:var(--surface);border:1px solid var(--border); -border-radius:11px;padding:14px 16px} +border-radius:var(--radius-xl);padding:14px 16px} .card .v{font-size:24px;font-weight:600;letter-spacing:-0.02em; font-family:var(--mono)} .card .l{color:var(--mute);font-size:10.5px;text-transform:uppercase; @@ -133,16 +152,17 
@@ .lin{color:var(--mute);font-size:11.5px;font-family:var(--mono)} .lmetric{font-family:var(--mono);font-size:14px;font-weight:600;white-space:nowrap} .badge{font-size:10px;font-weight:600;font-family:var(--mono);padding:2px 7px; -border-radius:5px;text-transform:uppercase;letter-spacing:0.03em;flex-shrink:0; +border-radius:var(--radius-sm);text-transform:uppercase;letter-spacing:0.03em;flex-shrink:0; background:color-mix(in oklch,var(--c,var(--accent)) 16%,transparent); color:var(--c,var(--accent))} .surf-mcp{--c:var(--mcp)}.surf-cli{--c:var(--cli)}.surf-memory{--c:var(--memory)} -.chip{font-size:10.5px;font-family:var(--mono);padding:1px 8px;border-radius:20px; +.chip{font-size:10.5px;font-family:var(--mono);padding:1px 8px; +border-radius:var(--radius-sm); background:var(--surface-2);color:var(--dim);border:1px solid var(--border); white-space:nowrap} .chip.warn{color:var(--warn);border-color:transparent;background:var(--warn-soft); font-weight:600} -.bar{display:block;width:100%;height:7px} +.bar{display:block;width:100%;height:6px} .dur{font-family:var(--mono);font-size:12.5px;text-align:right;white-space:nowrap; color:var(--dim)} .mem{font-family:var(--mono);font-size:11.5px;color:var(--warn);font-weight:550; @@ -158,7 +178,7 @@ .crumb .cname{font-family:var(--mono);font-size:12px;color:var(--text)} .crumb .arrow{color:var(--mute);font-size:13px} .oprow,.spanrow{display:grid; -grid-template-columns:minmax(0,1fr) 140px 56px 70px 120px; +grid-template-columns:minmax(0,1fr) 104px 56px 70px 120px; align-items:center;column-gap:13px;row-gap:2px;padding:5px 0} .lead-cell{display:flex;align-items:center;gap:9px;min-width:0} .opname{font-family:var(--mono);font-size:13px;font-weight:550;overflow:hidden; @@ -167,8 +187,11 @@ text-overflow:ellipsis;white-space:nowrap} .tick{color:var(--accent);opacity:0.6;font-size:11px;flex-shrink:0} .spanrow .counters{grid-column:2/-1;font-family:var(--mono);font-size:10.5px; -color:var(--mute);display:flex;flex-wrap:wrap;gap:0 
15px} -.counters b{color:var(--dim);font-weight:550;margin-right:4px} +color:var(--mute);display:flex;flex-direction:column;gap:2px;margin-top:4px; +padding-left:17px} +.cgroup{display:block;line-height:1.55} +.cgroup>b{display:inline-block;min-width:54px;margin-right:8px;color:var(--mute); +font-weight:700;text-transform:uppercase;letter-spacing:0.04em;font-size:9px} .spans{padding-left:17px} .kids{margin-left:13px;padding-left:17px;border-left:2px solid var(--accent-soft)} .wf{padding:8px 16px 12px} @@ -177,16 +200,16 @@ .wf-cap{display:flex;align-items:center;gap:8px;margin-bottom:9px; font-family:var(--mono);font-size:11px;color:var(--mute)} .wf-cap b{color:var(--dim);font-weight:600} -.wf-row{display:grid;grid-template-columns:minmax(150px,238px) minmax(0,1fr) 58px; +.wf-row{display:grid;grid-template-columns:minmax(150px,220px) minmax(0,520px) 58px; align-items:center;column-gap:12px;padding:2px 0} .wf-label{font-family:var(--mono);font-size:11.5px;overflow:hidden; text-overflow:ellipsis;white-space:nowrap} .wf-label.op{color:var(--text);font-weight:550} .wf-label.span{color:var(--dim)} -.wf-track{position:relative;height:14px;background:var(--track);border-radius:4px} -.wf-bar{position:absolute;top:2px;height:10px;border-radius:3px; -background:var(--c,var(--accent))} -.wf-bar.span{top:3px;height:8px;opacity:0.8} +.wf-track{position:relative;height:6px;background:var(--track);border-radius:2px} +.wf-bar{position:absolute;top:0;height:6px;border-radius:2px; +background:var(--accent)} +.wf-bar.span{opacity:0.6} .wf-dur{font-family:var(--mono);font-size:11px;color:var(--mute);text-align:right; white-space:nowrap} table{width:100%;border-collapse:collapse;font-size:12.5px} @@ -197,14 +220,64 @@ white-space:nowrap} td.t{font-family:var(--font)} th.r,td.r{text-align:right} +/* "Most expensive" — one delicate idiom: accent left rule + faint tint */ +tr.lead td{background:color-mix(in oklch,var(--accent) 7%,transparent)} +tr.lead td:first-child{box-shadow:inset 2px 
0 0 var(--accent)} .shape{font-family:var(--font);font-size:12.5px} .sqlraw{font-family:var(--mono);font-size:11px;color:var(--mute);max-width:440px; overflow:hidden;text-overflow:ellipsis;white-space:nowrap;margin-top:3px} tr.flag td{background:var(--warn-soft)} .muted{color:var(--mute)} +/* Analysis micro-phases — ranked share bars (core's most important timings) */ +.ph{padding:6px 16px 14px} +.ph-row{display:grid; +grid-template-columns:minmax(150px,210px) minmax(0,360px) 66px 50px auto; +align-items:center;column-gap:14px;padding:8px 0 8px 12px; +border-top:1px solid var(--border);border-left:2px solid transparent} +.ph-row:first-child{border-top:none} +.ph-namecell{display:flex;flex-direction:column;min-width:0} +.ph-name{font-family:var(--font);font-size:13px;color:var(--text);overflow:hidden; +text-overflow:ellipsis;white-space:nowrap} +.ph-row.lead{border-left-color:var(--accent)} +.ph-row.lead .ph-name{font-weight:600} +.ph-raw{font-family:var(--mono);font-size:10.5px;color:var(--mute);overflow:hidden; +text-overflow:ellipsis;white-space:nowrap} +.ph-dur{font-family:var(--mono);font-size:12.5px;color:var(--dim);text-align:right; +white-space:nowrap} +.ph-share{font-family:var(--mono);font-size:13px;font-weight:600;text-align:right; +white-space:nowrap} +.ph-row.lead .ph-share{color:var(--accent)} +.ph-sig{display:flex;justify-content:flex-end} .empty{padding:30px;text-align:center;color:var(--mute);font-size:13px} .foot{margin-top:38px;color:var(--mute);font-size:11px;text-align:center; font-family:var(--mono)} +/* Tabbed information architecture — CSS-only, radio-driven (no JS) */ +.obs-tab-input{position:absolute;width:1px;height:1px;opacity:0;pointer-events:none} +.obs-tabs{display:flex;flex-wrap:wrap;gap:2px;margin:0 0 24px; +border-bottom:1px solid var(--border);position:sticky;top:0;z-index:5; +background:var(--bg);padding-top:6px} +.obs-tab{padding:9px 15px;font-size:12.5px;font-weight:550;color:var(--mute); +cursor:pointer;border-bottom:2px solid 
transparent;margin-bottom:-1px; +border-radius:var(--radius-sm) var(--radius-sm) 0 0;user-select:none; +transition:color 0.15s,border-color 0.15s} +.obs-tab:hover{color:var(--text)} +.obs-tab-input:focus-visible+.obs-tab{outline:2px solid var(--accent); +outline-offset:-2px} +.obs-panel{display:none} +.obs-panel>section:first-child{margin-top:0} +.obs-lead{font-size:13px;color:var(--dim);line-height:1.55; +margin:2px 0 22px;max-width:74ch} +#t-overview:checked~.obs-tabs .obs-tab[for="t-overview"], +#t-timeline:checked~.obs-tabs .obs-tab[for="t-timeline"], +#t-operations:checked~.obs-tabs .obs-tab[for="t-operations"], +#t-cost:checked~.obs-tabs .obs-tab[for="t-cost"], +#t-phases:checked~.obs-tabs .obs-tab[for="t-phases"]{ +color:var(--accent);border-bottom-color:var(--accent)} +#t-overview:checked~.obs-panels #p-overview, +#t-timeline:checked~.obs-panels #p-timeline, +#t-operations:checked~.obs-panels #p-operations, +#t-cost:checked~.obs-panels #p-cost, +#t-phases:checked~.obs-panels #p-phases{display:block} """ @@ -212,6 +285,16 @@ def _esc(value: object) -> str: return escape(str(value)) +def _epoch_ms(value: str) -> float | None: + """Parse an ISO-8601 UTC timestamp to epoch milliseconds, or None.""" + if not value: + return None + try: + return datetime.fromisoformat(value).timestamp() * 1000.0 + except ValueError: + return None + + def _ms(value: float) -> str: return f"{value / 1000:.2f}s" if value >= 1000 else f"{value:.0f}ms" @@ -232,20 +315,33 @@ def _bytes(value: int | None) -> str: return f"{value} B" -def _tokens(value: int | None) -> str: +def _context_units(value: int | None) -> str: if not value: return "—" - return f"{value / 1000:.1f}k" if value >= 1000 else str(value) + amount = f"{value / 1000:.1f}k" if value >= 1000 else str(value) + return f"{amount} cu" -def _bar(value: float, maximum: float, *, color: str = "var(--accent)") -> str: - frac = value / maximum if maximum > 0 else 0.0 - fill = max(1.5, round(frac * 100, 1)) +def _bar( + value: 
float, + maximum: float, + *, + color: str = "var(--accent)", + offset_ms: float = 0.0, +) -> str: + """A thin meter bar. ``offset_ms`` shifts the fill rightward (as a fraction + of *maximum*) so sequential items cascade into a start-offset staircase; + the default 0 keeps the fill left-aligned (pure magnitude).""" + span = maximum if maximum > 0 else 1.0 + start = round(min(offset_ms / span * 100, 99.0), 1) if offset_ms > 0 else 0.0 + fill = max(1.5, round(value / span * 100, 1)) + if start + fill > 100: + fill = max(1.5, round(100 - start, 1)) return ( - '' - '' - f'' + '' + f'' ) @@ -261,14 +357,79 @@ def _reason_chip(reason_kind: str | None) -> str: return f'{_esc(reason_kind)}' +# Operations shows the FULL span counter set, grouped + formatted (never an +# alphabetical raw dump, never silently dropped). Each group: (label, ((key, +# short-label), …)). phase_* microsecond timings are converted to ms and ranked; +# any key not mapped below still appears under "other" so nothing is lost. +_COUNTER_GROUPS: tuple[tuple[str, tuple[tuple[str, str], ...]], ...] 
= ( + ( + "files", + ( + ("files_analyzed", "analyzed"), + ("files_timed", "timed"), + ("failed_files", "failed"), + ), + ), + ( + "units", + ( + ("units_seen", "seen"), + ("units_eligible", "eligible"), + ("units_fingerprinted", "fingerprinted"), + ), + ), + ("output", (("blocks_emitted", "blocks"), ("segments_emitted", "segments"))), + ("db", (("db_queries", "reads"), ("db_writes", "writes"))), +) + + +def _us_ms(micros: int) -> str: + ms = micros / 1000 + return f"{ms:.0f}ms" if ms >= 10 else f"{ms:.1f}ms" + + +def _counter_group(label: str, pairs: list[str]) -> str: + if not pairs: + return "" + return f'{_esc(label)}{" · ".join(pairs)}' + + def _counters(counters: Mapping[str, int]) -> str: if not counters: return "" - items = "".join( - f"{_esc(key)}{value}" - for key, value in sorted(counters.items()) + seen: set[str] = set() + groups: list[str] = [] + for label, keys in _COUNTER_GROUPS: + pairs: list[str] = [] + for key, short in keys: + if key in counters: + seen.add(key) + pairs.append(f"{short} {counters[key]:,}") + groups.append(_counter_group(label, pairs)) + phases = sorted( + ((key, value) for key, value in counters.items() if key.startswith("phase_")), + key=lambda kv: kv[1], + reverse=True, ) - return f'{items}' + if phases: + seen.update(key for key, _ in phases) + groups.append( + _counter_group( + "phases", + [ + f"{_esc(key.removeprefix('phase_').removesuffix('_us'))} " + f"{_us_ms(value)}" + for key, value in phases + ], + ) + ) + other = sorted((key, value) for key, value in counters.items() if key not in seen) + if other: + groups.append( + _counter_group("other", [f"{_esc(key)} {value:,}" for key, value in other]) + ) + body = "".join(group for group in groups if group) + return f'{body}' if body else "" def _rss_text( @@ -317,7 +478,7 @@ def _header(trace: TraceView) -> str: ) digest = f" · repo {_esc(trace.repo_root_digest)}" if trace.repo_root_digest else "" return ( - f'
{_LOGO}

Platform Observability

' + f'
{_LOGO}

CodeClone Platform Observability

' f'

{agg.operation_count} operations · ' f"{window}{digest}

" ) @@ -451,6 +612,20 @@ def _highlights(agg: AggregatesView) -> str: metric_html=f"{_esc(_ms(cpu_ms))} · {ratio:.1f}x wall", ) ) + if agg.analysis_phases: + top = agg.analysis_phases[0] + rows.append( + _highlight_row( + "Hottest extract phase", + badge_html="", + primary=top.phase, + context=_ANALYSIS_PHASE_LABELS.get(top.phase, top.phase), + metric_html=( + f"{_esc(_ms(top.worker_elapsed_ms))} · " + f"{top.share_permille / 10:.1f}%" + ), + ) + ) return f'
{"".join(rows)}
' if rows else "" @@ -534,13 +709,23 @@ def _op_row(op: OperationView, group_max: float) -> str: ) -def _span_row(span: SpanView, op_duration: float) -> str: +def _span_offset_ms(op_start_ms: float | None, span: SpanView) -> float: + """Span start relative to its operation's start (ms), for the staircase.""" + if op_start_ms is None: + return 0.0 + span_start = _epoch_ms(span.started_at_utc) + if span_start is None: + return 0.0 + return max(0.0, span_start - op_start_ms) + + +def _span_row(span: SpanView, op_duration: float, *, offset_ms: float = 0.0) -> str: color = "var(--warn)" if span.reason_kind == "unknown" else "var(--accent)" return ( '
' f'' f'{_esc(span.name)}' - f"{_bar(span.duration_ms, op_duration, color=color)}" + f"{_bar(span.duration_ms, op_duration, color=color, offset_ms=offset_ms)}" f'{_ms(span.duration_ms)}' f'{_view_rss_text(span)}' f'{_reason_chip(span.reason_kind)}' @@ -550,7 +735,11 @@ def _span_row(span: SpanView, op_duration: float) -> str: def _op_block(op: OperationView, group_max: float) -> str: op_duration = op.duration_ms or 1.0 - spans = "".join(_span_row(span, op_duration) for span in op.spans) + op_start = _epoch_ms(op.started_at_utc) + spans = "".join( + _span_row(span, op_duration, offset_ms=_span_offset_ms(op_start, span)) + for span in op.spans + ) spans_block = f'
{spans}
' if spans else "" kids = "".join(_op_block(child, group_max) for child in op.children) kids_block = f'
{kids}
' if kids else "" @@ -578,11 +767,13 @@ def _chain(trace: TraceView) -> str: return _section( "Correlated event chains", f'
{groups}
', - subtitle="What triggered what, across processes — finish → spawned worker.", + subtitle="What triggered what, across processes — finish → spawned worker. " + "Span bars cascade by start offset within their operation (the staircase is " + "the real order); width is duration.", ) -def _semantic_row(span: SpanCostView) -> str: +def _semantic_row(span: SpanCostView, *, lead: bool) -> str: costly = span.no_op and span.duration_ms >= _NOOP_COSTLY_MS if costly: verdict = 'no-op · costly' @@ -593,8 +784,9 @@ def _semantic_row(span: SpanCostView) -> str: reason = ( _esc(span.reason_kind) if span.reason_kind else '' ) + cls = " ".join(name for name, on in (("flag", costly), ("lead", lead)) if on) return ( - f'' + f'' f'{_esc(span.name)}' f'{_esc(span.operation_name)}' f"{reason}" @@ -609,7 +801,15 @@ def _semantic_row(span: SpanCostView) -> str: def _semantic(agg: AggregatesView) -> str: if not agg.semantic_costs: return "" - rows = "".join(_semantic_row(span) for span in agg.semantic_costs) + lead = max( + range(len(agg.semantic_costs)), + key=lambda i: agg.semantic_costs[i].duration_ms, + default=-1, + ) + rows = "".join( + _semantic_row(span, lead=(i == lead)) + for i, span in enumerate(agg.semantic_costs) + ) headers = ( ("Span", False), ("Operation", False), @@ -628,22 +828,27 @@ def _semantic(agg: AggregatesView) -> str: ) -def _mcp_row(tool: McpToolAggregate) -> str: +def _mcp_row(tool: McpToolAggregate, *, lead: bool) -> str: return ( - f'{_esc(tool.name)}' + f'{_esc(tool.name)}' f'{tool.count}' f'{_ms(tool.p50_duration_ms)}' f'{_ms(tool.p95_duration_ms)}' f'{_bytes(tool.p95_request_bytes)}' f'{_bytes(tool.p95_response_bytes)}' - f'{_tokens(tool.p95_response_tokens)}' + f'{_context_units(tool.p95_response_tokens)}' ) def _mcp(tools: tuple[McpToolAggregate, ...]) -> str: if not tools: return "" - rows = "".join(_mcp_row(tool) for tool in tools) + lead = max( + range(len(tools)), + key=lambda i: tools[i].p95_response_bytes or 0, + default=-1, + ) + rows = 
"".join(_mcp_row(tool, lead=(i == lead)) for i, tool in enumerate(tools)) headers = ( ("Tool", False), ("Calls", True), @@ -651,13 +856,13 @@ def _mcp(tools: tuple[McpToolAggregate, ...]) -> str: ("p95", True), ("↑ req p95", True), ("↓ resp p95", True), - ("resp tok p95", True), + ("resp ctx p95", True), ) return _section( "MCP tool matrix", _table(headers, rows), subtitle="Per-tool latency and payload — spot tools that flood request " - "or response bytes.", + "or response context units.", ) @@ -666,13 +871,12 @@ def _wf_bar(row: WaterfallRow, total_ms: float) -> str: left = round(min(row.offset_ms / span * 100, 99.0), 2) width = max(0.6, round(row.duration_ms / span * 100, 2)) kind = "op" if row.kind == "operation" else "span" - surf = f"surf-{row.surface}" if row.surface in _KNOWN_SURFACES else "" tick = '' if kind == "span" else "" return ( '
' f'' f"{tick}{_esc(row.label)}" - f'
' f'{_ms(row.duration_ms)}
' ) @@ -700,13 +904,13 @@ def _waterfall(trace: TraceView) -> str: ) -def _agent_row(row: AgentTokenRow, total_response: int) -> str: +def _agent_row(row: AgentTokenRow, total_response: int, *, lead: bool) -> str: share = round(row.response_tokens / total_response * 100) if total_response else 0 return ( - f'{_esc(row.name)}' + f'{_esc(row.name)}' f'{row.calls}' - f'{_tokens(row.request_tokens)}' - f'{_tokens(row.response_tokens)}' + f'{_context_units(row.request_tokens)}' + f'{_context_units(row.response_tokens)}' f'{share}%' ) @@ -717,32 +921,40 @@ def _agent(agg: AggregatesView) -> str: return "" cards = ( '
' - + _stat(_tokens(view.response_tokens), "context pressure (tok)", "accent") - + _stat(_tokens(view.request_tokens), "sent (tok)") + + _stat(_context_units(view.response_tokens), "context pressure", "accent") + + _stat(_context_units(view.request_tokens), "sent context") + _stat(str(view.mcp_calls), "mcp calls") + _stat(str(len(view.consumers)), "tools") + "
" ) - rows = "".join(_agent_row(row, view.response_tokens) for row in view.consumers) + lead = max( + range(len(view.consumers)), + key=lambda i: view.consumers[i].response_tokens, + default=-1, + ) + rows = "".join( + _agent_row(row, view.response_tokens, lead=(i == lead)) + for i, row in enumerate(view.consumers) + ) headers = ( ("Tool", False), ("Calls", True), - ("↑ tok", True), - ("↓ tok", True), + ("↑ ctx", True), + ("↓ ctx", True), ("Context %", True), ) return _section( "Agent context", cards + _table(headers, rows), - subtitle="Tokens MCP tools push back into the agent's context — the real " - "per-call cost for an LLM. The top row is your biggest context consumer.", + subtitle="Estimated context units MCP tools push back into the agent's " + "context. The top row is your biggest context consumer.", ) -def _db_row(row: DbCostRow) -> str: +def _db_row(row: DbCostRow, *, lead: bool) -> str: per_call = round(row.total_queries / row.span_count) if row.span_count else 0 return ( - f'{_esc(row.span_name)}' + f'{_esc(row.span_name)}' f'{row.span_count}' f'{row.total_queries}' f'{row.total_writes}' @@ -754,7 +966,12 @@ def _db_row(row: DbCostRow) -> str: def _db_cost(agg: AggregatesView) -> str: if not agg.db_costs: return "" - rows = "".join(_db_row(row) for row in agg.db_costs) + lead = max( + range(len(agg.db_costs)), + key=lambda i: agg.db_costs[i].total_queries, + default=-1, + ) + rows = "".join(_db_row(row, lead=(i == lead)) for i, row in enumerate(agg.db_costs)) headers = ( ("Span", False), ("Spans", True), @@ -771,12 +988,12 @@ def _db_cost(agg: AggregatesView) -> str: ) -def _db_fingerprint_row(row: DbFingerprintRow) -> str: +def _db_fingerprint_row(row: DbFingerprintRow, *, lead: bool) -> str: table = _esc(row.table_hint) if row.table_hint else "—" shape = _esc(row.summary) if row.summary else "—" raw = _esc(row.fingerprint) return ( - f'{_esc(row.span_name)}' + f'{_esc(row.span_name)}' f"{table}" f'{_esc(row.kind.upper())}' f'{row.count}' @@ -788,7 
+1005,15 @@ def _db_fingerprint_row(row: DbFingerprintRow) -> str: def _db_fingerprints(agg: AggregatesView) -> str: if not agg.db_fingerprints: return "" - rows = "".join(_db_fingerprint_row(row) for row in agg.db_fingerprints) + lead = max( + range(len(agg.db_fingerprints)), + key=lambda i: agg.db_fingerprints[i].count, + default=-1, + ) + rows = "".join( + _db_fingerprint_row(row, lead=(i == lead)) + for i, row in enumerate(agg.db_fingerprints) + ) headers = ( ("Span", False), ("Table", False), @@ -804,9 +1029,9 @@ def _db_fingerprints(agg: AggregatesView) -> str: ) -def _pipeline_row(group: PipelineGroup) -> str: +def _pipeline_row(group: PipelineGroup, *, lead: bool) -> str: return ( - f'{_esc(group.name)}' + f'{_esc(group.name)}' f'{group.op_count}' f'{_ms(group.duration_ms)}' f'{_ms(group.cpu_ms)}' @@ -816,7 +1041,14 @@ def _pipeline_row(group: PipelineGroup) -> str: def _pipeline_section(agg: AggregatesView) -> str: if not agg.pipeline: return "" - rows = "".join(_pipeline_row(group) for group in agg.pipeline) + lead = max( + range(len(agg.pipeline)), + key=lambda i: agg.pipeline[i].duration_ms, + default=-1, + ) + rows = "".join( + _pipeline_row(group, lead=(i == lead)) for i, group in enumerate(agg.pipeline) + ) headers = (("Subsystem", False), ("Ops", True), ("Wall", True), ("CPU", True)) return _section( "Pipeline", @@ -825,25 +1057,194 @@ def _pipeline_section(agg: AggregatesView) -> str: ) +def _analysis_phase_row(row: AnalysisPhaseRow, max_permille: int, *, lead: bool) -> str: + label = _ANALYSIS_PHASE_LABELS.get(row.phase, row.phase) + sig = 'peak' if lead else "" + return ( + f'
' + f'{_esc(label)}' + f'{_esc(row.phase)}' + f"{_bar(row.share_permille, max_permille)}" + f'{_esc(_ms(row.worker_elapsed_ms))}' + f'{row.share_permille / 10:.1f}%' + f'{sig}
' + ) + + +def _iter_operation_tree(ops: tuple[OperationView, ...]) -> Iterable[OperationView]: + for op in ops: + yield op + yield from _iter_operation_tree(op.children) + + +def _pipeline_process_spans(trace: TraceView) -> tuple[SpanView, ...]: + roots = trace.operation_tree or trace.correlated_operations + spans: list[SpanView] = [] + seen: set[str] = set() + for op in _iter_operation_tree(roots): + for span in op.spans: + if span.name == "pipeline.process" and span.span_id not in seen: + spans.append(span) + seen.add(span.span_id) + return tuple(spans) + + +def _empty_analysis_phase_section(trace: TraceView) -> str: + process_spans = _pipeline_process_spans(trace) + if not process_spans: + return "" + files_analyzed = sum( + span.counters.get("files_analyzed", 0) for span in process_spans + ) + failed_files = sum(span.counters.get("failed_files", 0) for span in process_spans) + if files_analyzed == 0: + reason = ( + "No uncached files were processed in this window; the analysis was " + "served from cache, so file extraction micro-stages did not run. " + "Use a cold cache or changed files to capture phase timings." + ) + else: + reason = ( + "pipeline.process ran, but no analysis phase counters were recorded. " + "Restart the producing process with CODECLONE_OBSERVABILITY_ENABLED=1 " + "and analysis phase instrumentation." + ) + counters = ( + f"pipeline.process files_analyzed={files_analyzed} · " + f"failed_files={failed_files}" + ) + body = ( + '
' + f"{_esc(reason)}" + f'
{_esc(counters)}
' + "
" + ) + return _section( + "Analysis extract phases", + body, + subtitle=( + "Summed per-file worker elapsed time inside pipeline.process " + "(parse, walk, CFG, normalize). Dev-only; not repository quality." + ), + ) + + +def _analysis_phases_section(trace: TraceView) -> str: + agg = trace.aggregates + if not agg.analysis_phases: + return _empty_analysis_phase_section(trace) + max_permille = max((row.share_permille for row in agg.analysis_phases), default=1) + max_permille = max_permille or 1 + lead_idx = max( + range(len(agg.analysis_phases)), + key=lambda i: agg.analysis_phases[i].share_permille, + default=-1, + ) + rows = "".join( + _analysis_phase_row(row, max_permille, lead=(i == lead_idx)) + for i, row in enumerate(agg.analysis_phases) + ) + footer = ( + f"Worker elapsed (summed): " + f"{_ms(agg.analysis_phase_worker_elapsed_total_ms or 0.0)} · " + f"pipeline.process wall: {_ms(agg.analysis_phase_pipeline_wall_ms or 0.0)} · " + f"files timed: {agg.analysis_phase_files_timed} · " + f"units eligible: {agg.analysis_phase_units_eligible}" + ) + body = f'
{rows}

{_esc(footer)}

' + return _section( + "Analysis extract phases", + body, + subtitle=( + "Where the core spends its per-file extract time, ranked by share — " + "bars are scaled to the heaviest phase. Summed worker elapsed inside " + "pipeline.process; dev-only, not repository quality, and may exceed " + "parent pipeline wall under parallel execution." + ), + ) + + +_TABS: tuple[tuple[str, str], ...] = ( + ("overview", "Overview"), + ("timeline", "Timeline"), + ("operations", "Operations"), + ("cost", "Cost"), + ("phases", "Phases"), +) + +# One plain-language lead per tab: what the view answers, what to look at first. +_TAB_LEADS: Mapping[str, str] = { + "overview": "Start here — what this run did, and where its time and memory " + "actually went.", + "timeline": "When everything happened — operations and their spans on one " + "shared time axis.", + "operations": "What ran — the finish→worker causality chains, nested by call " + "depth.", + "cost": "What it cost — context units, MCP payloads, and database work.", + "phases": "Inside analysis — pipeline stages and per-phase extract cost.", +} + + +def _tab_shell(panels: Mapping[str, str]) -> str: + """Wrap the section panels in CSS-only radio tabs. + + The radio inputs are emitted first so the ``:checked ~`` sibling selectors + can light the active tab label and reveal the matching panel without any + script. An empty panel falls back to a placeholder so a view is never blank. + """ + inputs = "".join( + f'' + for idx, (tid, _) in enumerate(_TABS) + ) + nav = ( + '" + ) + sections: list[str] = [] + for tid, label in _TABS: + inner = panels.get(tid, "") + if not inner.strip(): + inner = ( + f'
No {_esc(label.lower())} data ' + f"recorded for this window.
" + ) + lead = _TAB_LEADS.get(tid, "") + lead_html = f'

{_esc(lead)}

' if lead else "" + sections.append( + f'
{lead_html}{inner}
' + ) + return f'{inputs}{nav}
{"".join(sections)}
' + + def render_trace_html(trace: TraceView) -> str: """Render a ``TraceView`` as a self-contained, branded diagnosis cockpit.""" + agg = trace.aggregates foot = f"CodeClone · platform observability · schema {_esc(trace.schema_version)}" + panels = { + "overview": _summary(trace) + _waste_section(agg), + "timeline": _waterfall(trace), + "operations": _chain(trace), + "cost": ( + _semantic(agg) + + _db_cost(agg) + + _db_fingerprints(agg) + + _agent(agg) + + _mcp(agg.mcp_tools) + ), + "phases": _pipeline_section(agg) + _analysis_phases_section(trace), + } return ( '' '' "CodeClone · Platform Observability" f'
' + _header(trace) - + _summary(trace) - + _waste_section(trace.aggregates) - + _waterfall(trace) - + _chain(trace) - + _semantic(trace.aggregates) - + _db_cost(trace.aggregates) - + _db_fingerprints(trace.aggregates) - + _agent(trace.aggregates) - + _mcp(trace.aggregates.mcp_tools) - + _pipeline_section(trace.aggregates) + + _tab_shell(panels) + f'

{foot}

' + "
" ) diff --git a/codeclone/observability/render_json.py b/codeclone/observability/render_json.py index b00d3217..c7536747 100644 --- a/codeclone/observability/render_json.py +++ b/codeclone/observability/render_json.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""JSON renderer for the observability ``TraceView`` (Phase 29 output). +"""JSON renderer for the observability ``TraceView``. Deterministic: sorted keys, stable indentation. The read model is the source of truth; this is a faithful projection of it. diff --git a/codeclone/observability/runtime.py b/codeclone/observability/runtime.py index 2bb19fc4..17d3ad3e 100644 --- a/codeclone/observability/runtime.py +++ b/codeclone/observability/runtime.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Observability write API (Phase 29 §4.3). +"""Observability write API. ``bootstrap`` freezes the enabled decision once per process. When disabled, ``operation``/``span`` yield a cheap inert handle and return immediately — no @@ -18,7 +18,7 @@ import sqlite3 import time import uuid -from collections.abc import Iterator +from collections.abc import Iterable, Iterator, Mapping, Sequence from contextlib import contextmanager from contextvars import ContextVar from datetime import datetime, timezone @@ -458,6 +458,15 @@ def record_elapsed_span( _DB_WRITE_KINDS = frozenset({"insert", "update", "delete", "replace"}) +# Counter-semantics version. v1 counted db_queries/db_writes per *row*: +# sqlite3.set_trace_callback fires once per executemany row, so a single batched +# executemany was indistinguishable from an N+1 loop and tripped false +# query_chatty verdicts. v2 (the _CountingConnection below) counts logical +# *statements* — db_queries/db_writes are execute/executemany calls and db_rows +# is the row volume. 
Bump on any counter-meaning change; old observer DBs carry +# the previous semantics and are disposable (delete to avoid mixed history). +DB_COUNTER_VERSION = 2 + def _classify_sql(sql: str) -> str: stripped = sql.lstrip() @@ -466,15 +475,17 @@ def _classify_sql(sql: str) -> str: return stripped.split(None, 1)[0].lower() -def record_db_query(sql: str) -> None: - """Trace-callback sink: attribute one SQL statement to the active span as a - ``db_queries`` counter (plus ``db_writes`` for mutations). No-op outside a - span. Performance telemetry only — never audit or contract truth. +def _record_db_statement(sql: str, *, rows: int) -> None: + """Attribute one logical SQL statement to the active span: ``db_queries`` +1 + (``db_writes`` +1 for mutations) and ``db_rows`` += ``rows`` (1 for + ``execute``, len(params) for ``executemany``). No-op outside a span. + Performance telemetry only — never audit or contract truth. """ span_handle = _CURRENT_SPAN.get() if span_handle is None: return span_handle.add_counter("db_queries", 1) + span_handle.add_counter("db_rows", rows) if _classify_sql(sql) in _DB_WRITE_KINDS: span_handle.add_counter("db_writes", 1) fingerprint = fingerprint_sql(sql).fingerprint @@ -482,6 +493,16 @@ def record_db_query(sql: str) -> None: span_handle.add_db_fingerprint(fingerprint) +def record_db_query(sql: str) -> None: + """Record one logical query (1 statement, 1 row) on the active span. + + Retained as the manual entry point for code that does DB work the counting + connection does not see (and used by tests). Equivalent to a single + ``execute`` for counting purposes. + """ + _record_db_statement(sql, rows=1) + + def record_counter(key: str, value: int = 1) -> None: """Add ``value`` to the named counter on the active span. No-op outside a span (or when disabled). 
Companion to ``record_db_query`` for non-SQL @@ -494,21 +515,53 @@ def record_counter(key: str, value: int = 1) -> None: span_handle.add_counter(key, value) -def instrument_db_connection(conn: sqlite3.Connection) -> None: - """Attach the per-span DB-query counter to ``conn``. No-op (and no per-query - trace overhead) when observability is disabled for this process. +_SqlParams = Sequence[object] | Mapping[str, object] + + +class _CountingConnection(sqlite3.Connection): + """``sqlite3.Connection`` that counts logical statements on the active span. + + Overriding ``execute``/``executemany`` — instead of ``set_trace_callback``, + which fires once per executemany *row* — is what makes ``db_queries`` a true + statement count: one batched ``executemany`` is one query over many rows, + distinguishable from an N+1 loop. All store access goes through these entry + points (no bare cursors), so nothing escapes the count. Counting no-ops + outside a span, so connection open (pragmas, schema) is not attributed. + """ + + def execute( # type: ignore[override] + self, sql: str, parameters: _SqlParams = () + ) -> sqlite3.Cursor: + _record_db_statement(sql, rows=1) + return super().execute(sql, parameters) + + def executemany( # type: ignore[override] + self, sql: str, parameters: Iterable[_SqlParams] + ) -> sqlite3.Cursor: + materialized = list(parameters) + _record_db_statement(sql, rows=len(materialized)) + return super().executemany(sql, materialized) + + def executescript(self, sql_script: str) -> sqlite3.Cursor: + _record_db_statement(sql_script, rows=1) + return super().executescript(sql_script) + + +def counting_connection_factory() -> type[sqlite3.Connection] | None: + """Return the per-span counting connection class when observability is + enabled, else ``None`` so callers open a plain connection with no overhead. 
""" - if _ENABLED: - conn.set_trace_callback(record_db_query) + return _CountingConnection if _ENABLED else None __all__ = [ + "DB_COUNTER_VERSION", "OperationHandle", "SpanHandle", "bind_root", "bootstrap", + "counting_connection_factory", "current_operation_context", - "instrument_db_connection", "is_observability_enabled", "operation", "payload_capture_enabled", diff --git a/codeclone/observability/sqlite_access.py b/codeclone/observability/sqlite_access.py index 372ea71b..5e385662 100644 --- a/codeclone/observability/sqlite_access.py +++ b/codeclone/observability/sqlite_access.py @@ -22,16 +22,15 @@ def open_instrumented_sqlite_db( foreign_keys: bool = False, synchronous: str | None = None, ) -> sqlite3.Connection: - conn = open_sqlite_db( + from codeclone.observability.runtime import counting_connection_factory + + return open_sqlite_db( path, ensure_schema=ensure_schema, foreign_keys=foreign_keys, synchronous=synchronous, + factory=counting_connection_factory(), ) - from codeclone.observability.runtime import instrument_db_connection - - instrument_db_connection(conn) - return conn def open_instrumented_sqlite_db_readonly( @@ -39,11 +38,13 @@ def open_instrumented_sqlite_db_readonly( *, validate_schema: Callable[[sqlite3.Connection], None], ) -> sqlite3.Connection: - conn = open_sqlite_db_readonly(path, validate_schema=validate_schema) - from codeclone.observability.runtime import instrument_db_connection + from codeclone.observability.runtime import counting_connection_factory - instrument_db_connection(conn) - return conn + return open_sqlite_db_readonly( + path, + validate_schema=validate_schema, + factory=counting_connection_factory(), + ) __all__ = [ diff --git a/codeclone/observability/store/reader.py b/codeclone/observability/store/reader.py index 5ef6f15d..9b3000e8 100644 --- a/codeclone/observability/store/reader.py +++ b/codeclone/observability/store/reader.py @@ -14,18 +14,24 @@ import sqlite3 from collections import defaultdict +from dataclasses 
import dataclass from datetime import datetime from pathlib import Path from typing import cast import orjson +from ...analysis.phase_ledger import ( + PHASE_US_COUNTER_SUFFIXES, + PHASE_VOLUME_COUNTER_SUFFIXES, +) from ...contracts import PLATFORM_OBSERVABILITY_SCHEMA_VERSION from ..db_fingerprint import describe_fingerprint from ..views import ( AgentTokenRow, AgentView, AggregatesView, + AnalysisPhaseRow, DbCostRow, DbFingerprintRow, McpToolAggregate, @@ -48,12 +54,14 @@ _MEMORY_PIPELINE_PREFIX = "memory." _SEMANTIC_COST_LIMIT = 8 _DB_FINGERPRINT_ROW_LIMIT = 15 +_PIPELINE_PROCESS_SPAN = "pipeline.process" +_PHASE_HEAVY_PERMILLE = 250 # Waste thresholds: a no-op span is only worth flagging once it has spent time; # an MCP response is "heavy" past these payload sizes. _WASTE_NOOP_MS = 50.0 _HIGH_PAYLOAD_BYTES = 16 * 1024 -_HIGH_PAYLOAD_TOKENS = 4000 +_HIGH_PAYLOAD_CONTEXT_UNITS = 4000 def open_observability_store_readonly(root: Path) -> sqlite3.Connection | None: @@ -332,13 +340,13 @@ def _waste( surface="mcp", detail=( f"p95 {tool.p95_response_bytes / 1024:.0f} KB resp · " - f"{tool.p95_response_tokens} tok" + f"{tool.p95_response_tokens} cu" ), severity=float(tool.p95_response_bytes), ) for tool in mcp_tools if tool.p95_response_bytes >= _HIGH_PAYLOAD_BYTES - or tool.p95_response_tokens >= _HIGH_PAYLOAD_TOKENS + or tool.p95_response_tokens >= _HIGH_PAYLOAD_CONTEXT_UNITS ) items.sort(key=lambda w: (-w.severity, w.kind, w.subject)) return tuple(items) @@ -428,6 +436,7 @@ def _db_costs(flat: list[OperationView]) -> tuple[DbCostRow, ...]: span_count=len(spans), total_queries=sum(s.counters.get("db_queries", 0) for s in spans), total_writes=sum(s.counters.get("db_writes", 0) for s in spans), + total_rows=sum(s.counters.get("db_rows", 0) for s in spans), max_queries=max(s.counters.get("db_queries", 0) for s in spans), ) for name, spans in grouped.items() @@ -466,6 +475,73 @@ def _db_fingerprints(flat: list[OperationView]) -> tuple[DbFingerprintRow, ...]: return 
tuple(rows[:_DB_FINGERPRINT_ROW_LIMIT]) +@dataclass(frozen=True, slots=True) +class _AnalysisPhaseBundle: + rows: tuple[AnalysisPhaseRow, ...] + worker_elapsed_total_ms: float | None + pipeline_wall_ms: float | None + source_spans: int + files_timed: int + units_eligible: int + + +def _phase_name_from_counter(counter: str) -> str: + return counter[len("phase_") : -len("_us")] + + +def _analysis_phase_bundle(flat: list[OperationView]) -> _AnalysisPhaseBundle: + pipeline_spans = [ + span for op in flat for span in op.spans if span.name == _PIPELINE_PROCESS_SPAN + ] + contributing_spans = [ + span + for span in pipeline_spans + if any(key in span.counters for key in PHASE_US_COUNTER_SUFFIXES) + ] + if not contributing_spans: + return _AnalysisPhaseBundle( + rows=(), + worker_elapsed_total_ms=None, + pipeline_wall_ms=None, + source_spans=0, + files_timed=0, + units_eligible=0, + ) + + phase_us = { + key: sum(span.counters.get(key, 0) for span in contributing_spans) + for key in PHASE_US_COUNTER_SUFFIXES + } + volume_totals = { + key: sum(span.counters.get(key, 0) for span in contributing_spans) + for key in PHASE_VOLUME_COUNTER_SUFFIXES + } + total_us = sum(phase_us.values()) + rows = [ + AnalysisPhaseRow( + phase=_phase_name_from_counter(key), + worker_elapsed_ms=round(value / 1000, 1), + share_permille=round(1000 * value / total_us) if total_us else 0, + verdict=( + "phase_heavy" + if total_us and round(1000 * value / total_us) >= _PHASE_HEAVY_PERMILLE + else "ok" + ), + ) + for key, value in phase_us.items() + if value + ] + rows.sort(key=lambda row: (-row.worker_elapsed_ms, row.phase)) + return _AnalysisPhaseBundle( + rows=tuple(rows), + worker_elapsed_total_ms=round(total_us / 1000, 1), + pipeline_wall_ms=round(sum(span.duration_ms for span in contributing_spans), 1), + source_spans=len(contributing_spans), + files_timed=volume_totals.get("files_timed", 0), + units_eligible=volume_totals.get("units_eligible", 0), + ) + + def _aggregates( flat: list[OperationView], 
spans_by_op: dict[str, tuple[SpanView, ...]] ) -> AggregatesView: @@ -532,6 +608,7 @@ def _aggregates( mcp_tools = _mcp_tool_aggregates(flat) cpu_ranked = sorted(flat, key=lambda v: (-_cpu_ms(v), v.operation_id)) heaviest_cpu = cpu_ranked[0] if cpu_ranked and _cpu_ms(cpu_ranked[0]) > 0 else None + analysis_phase_bundle = _analysis_phase_bundle(flat) return AggregatesView( operation_count=len(flat), slowest=slowest, @@ -551,6 +628,14 @@ def _aggregates( heaviest_cpu=heaviest_cpu, pipeline=_pipeline(flat), db_fingerprints=_db_fingerprints(flat), + analysis_phases=analysis_phase_bundle.rows, + analysis_phase_worker_elapsed_total_ms=( + analysis_phase_bundle.worker_elapsed_total_ms + ), + analysis_phase_pipeline_wall_ms=analysis_phase_bundle.pipeline_wall_ms, + analysis_phase_source_spans=analysis_phase_bundle.source_spans, + analysis_phase_files_timed=analysis_phase_bundle.files_timed, + analysis_phase_units_eligible=analysis_phase_bundle.units_eligible, ) diff --git a/codeclone/observability/store/schema.py b/codeclone/observability/store/schema.py index 3f0cf22f..63814f50 100644 --- a/codeclone/observability/store/schema.py +++ b/codeclone/observability/store/schema.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Observability sqlite schema (Phase 29 §4.5). +"""Observability sqlite schema. Two tables — operations (surface-level) and spans (stage/subsystem) — plus a meta row carrying the schema version. Profile columns are nullable diff --git a/codeclone/observability/store/writer.py b/codeclone/observability/store/writer.py index c8cb53a5..195415a4 100644 --- a/codeclone/observability/store/writer.py +++ b/codeclone/observability/store/writer.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Bounded, batched observability writer (Phase 29 §4.5). +"""Bounded, batched observability writer. 
A whole operation — its row plus every span — is persisted in a single sqlite transaction. We do NOT copy the audit per-emit commit-per-row pattern. diff --git a/codeclone/observability/views.py b/codeclone/observability/views.py index 66b9e5a9..36cc95c6 100644 --- a/codeclone/observability/views.py +++ b/codeclone/observability/views.py @@ -4,7 +4,7 @@ # SPDX-License-Identifier: MPL-2.0 # Copyright (c) 2026 Den Rozhnovskiy -"""Read-model views (Phase 29 §4.6). +"""Read-model views. ``TraceView`` is the primary artifact; JSON/text/HTML renderers are projections over it and must not drive the schema. Pure data, built by ``store/reader.py``. @@ -102,15 +102,17 @@ class McpToolAggregate: class DbCostRow: """SQLite work attributed to a span class (performance-truth, not audit). - Aggregated from span db_queries/db_writes counters; ``max_queries`` is the - worst single instance and ``queries`` ÷ a per-row productive count exposes - N+1-shaped access (many reads, little produced).""" + Aggregated from span db_queries/db_writes/db_rows counters (v2 semantics: + logical statements, not per-row trace fires). ``max_queries`` is the worst + single instance; ``total_rows`` exposes executemany amplification, and a + high statement count with little produced is the N+1 shape.""" span_name: str surface: str span_count: int total_queries: int total_writes: int + total_rows: int max_queries: int @@ -131,9 +133,21 @@ class DbFingerprintRow: summary: str = "" +@dataclass(frozen=True, slots=True) +class AnalysisPhaseRow: + phase: str + worker_elapsed_ms: float + share_permille: int + verdict: str + + @dataclass(frozen=True, slots=True) class AgentTokenRow: - """One MCP tool's cumulative token economics across the window.""" + """One MCP tool's cumulative context-unit economics across the window. + + Field names keep the historical ``*_tokens`` spelling for storage/query + compatibility; values are deterministic context-unit estimates. 
+ """ name: str calls: int @@ -143,9 +157,9 @@ class AgentTokenRow: @dataclass(frozen=True, slots=True) class AgentView: - """Agentic context economics: how many tokens MCP tools pushed back into - the agent's context (``response_tokens`` = context pressure), ranked by the - biggest consumer. Built only when MCP operations are present.""" + """Agentic context economics: context units MCP tools pushed back into the + agent context (``response_tokens`` = legacy field for context pressure), + ranked by the biggest consumer. Built only when MCP operations are present.""" mcp_calls: int = 0 request_tokens: int = 0 @@ -197,6 +211,12 @@ class AggregatesView: heaviest_cpu: OperationView | None = None pipeline: tuple[PipelineGroup, ...] = () db_fingerprints: tuple[DbFingerprintRow, ...] = () + analysis_phases: tuple[AnalysisPhaseRow, ...] = () + analysis_phase_worker_elapsed_total_ms: float | None = None + analysis_phase_pipeline_wall_ms: float | None = None + analysis_phase_source_spans: int = 0 + analysis_phase_files_timed: int = 0 + analysis_phase_units_eligible: int = 0 @dataclass(frozen=True, slots=True) @@ -244,6 +264,7 @@ class TraceView: "AgentTokenRow", "AgentView", "AggregatesView", + "AnalysisPhaseRow", "DbCostRow", "DbFingerprintRow", "McpToolAggregate", diff --git a/codeclone/report/document/builder.py b/codeclone/report/document/builder.py index 9d22dfaa..b9239c98 100644 --- a/codeclone/report/document/builder.py +++ b/codeclone/report/document/builder.py @@ -23,7 +23,12 @@ ) from ._common import _collect_report_file_list -from .derived import _build_derived_overview, _build_derived_suggestions +from .derived import ( + _build_derived_module_map, + _build_derived_overview, + _build_derived_review_queue, + _build_derived_suggestions, +) from .findings import _build_findings_payload from .integrity import _build_integrity_payload from .inventory import ( @@ -95,6 +100,8 @@ def build_report_document( "suggestions": _build_derived_suggestions(suggestions), 
"overview": overview_payload, "hotlists": hotlists_payload, + "module_map": _build_derived_module_map(metrics_payload), + "review_queue": _build_derived_review_queue(findings_payload, suggestions), } integrity_payload = _build_integrity_payload( report_schema_version=report_schema_version, diff --git a/codeclone/report/document/derived.py b/codeclone/report/document/derived.py index add20423..23f209fa 100644 --- a/codeclone/report/document/derived.py +++ b/codeclone/report/document/derived.py @@ -7,8 +7,8 @@ from __future__ import annotations from collections import Counter -from collections.abc import Mapping, Sequence -from typing import TYPE_CHECKING +from collections.abc import Callable, Mapping, Sequence +from typing import TYPE_CHECKING, Final from ...domain.findings import ( CATEGORY_COHESION, @@ -45,6 +45,8 @@ design_group_id, structural_group_id, ) +from ...metrics.dependencies import select_dependency_graph_nodes +from ...metrics.overloaded_modules import _score_quantile from ...utils.coerce import as_float as _as_float from ...utils.coerce import as_int as _as_int from ...utils.coerce import as_mapping as _as_mapping @@ -396,11 +398,12 @@ def _suggestion_finding_id(suggestion: Suggestion) -> str: ) -def _build_derived_suggestions( +def _sorted_suggestions( suggestions: Sequence[Suggestion] | None, -) -> list[dict[str, object]]: - suggestion_rows = list(suggestions or ()) - suggestion_rows.sort( +) -> list[Suggestion]: + """Deterministic priority order shared by every suggestion-derived view.""" + rows = list(suggestions or ()) + rows.sort( key=lambda suggestion: ( -suggestion.priority, SEVERITY_ORDER.get(suggestion.severity, 9), @@ -408,6 +411,12 @@ def _build_derived_suggestions( _suggestion_finding_id(suggestion), ) ) + return rows + + +def _build_derived_suggestions( + suggestions: Sequence[Suggestion] | None, +) -> list[dict[str, object]]: return [ { "id": f"suggestion:{_suggestion_finding_id(suggestion)}", @@ -421,5 +430,687 @@ def 
_build_derived_suggestions( "steps": list(suggestion.steps), }, } - for suggestion in suggestion_rows + for suggestion in _sorted_suggestions(suggestions) + ] + + +_REVIEW_QUEUE_SCHEMA_VERSION: Final = "2" +_REVIEW_SEVERITIES: Final = ("critical", "warning", "info") +_REVIEW_FAMILIES: Final = ("clones", "structural", "dead_code", "design") +_REVIEW_FAMILY_BY_FINDING: Final = { + FAMILY_CLONE: "clones", + FAMILY_STRUCTURAL: "structural", + FAMILY_DEAD_CODE: "dead_code", + FAMILY_DESIGN: "design", +} +_REVIEW_FAMILY_BY_SUGGESTION: Final = { + FAMILY_CLONES: "clones", + FAMILY_STRUCTURAL: "structural", +} +_CLONE_REVIEW_TITLES: Final = { + CLONE_KIND_FUNCTION: "Function clone group", + CLONE_KIND_BLOCK: "Block clone group", + CLONE_KIND_SEGMENT: "Segment clone group", +} + + +def _humanize(value: str) -> str: + text = value.replace("_", " ").strip() + return text[:1].upper() + text[1:] if text else text + + +def _flatten_finding_groups( + findings: Mapping[str, object], +) -> list[Mapping[str, object]]: + """Canonical findings across families, flattened (mirrors overview).""" + groups = _as_mapping(findings.get("groups")) + clones = _as_mapping(groups.get(FAMILY_CLONES)) + flat: list[Mapping[str, object]] = [ + _as_mapping(group) + for key in ("functions", "blocks", "segments") + for group in _as_sequence(clones.get(key)) + ] + for family_key in (FAMILY_STRUCTURAL, FAMILY_DEAD_CODE, "design"): + flat.extend( + _as_mapping(group) + for group in _as_sequence(_as_mapping(groups.get(family_key)).get("groups")) + ) + return flat + + +def _finding_first_item(group: Mapping[str, object]) -> Mapping[str, object]: + items = _as_sequence(group.get("items")) + return _as_mapping(items[0]) if items else {} + + +def _finding_review_title(group: Mapping[str, object]) -> str: + family = str(group.get("family")) + category = str(group.get("category")) + qualname = str(_finding_first_item(group).get("qualname", "")).strip() + if family == FAMILY_CLONE: + base = 
_CLONE_REVIEW_TITLES.get(category, "Clone group") + return f"{base} ({_as_int(group.get('count'))} occurrences)" + if family == FAMILY_DEAD_CODE: + return f"Unused {category}: {qualname}" if qualname else f"Unused {category}" + if family == FAMILY_DESIGN: + return f"{_humanize(category)}: {qualname}" if qualname else _humanize(category) + return _humanize(category) + + +def _finding_review_location(group: Mapping[str, object]) -> str: + first = _finding_first_item(group) + # `path` is "" for absolute paths, so we never surface an absolute path — + # we fall back to the qualified name instead. + path = _safe_relative_path(first) + qualname = str(first.get("qualname", "")).strip() + line = _as_int(first.get("start_line")) + base = (f"{path}:{line}" if line else path) if path else qualname + extra = _as_int(group.get("count")) - 1 + if extra > 0: + return f"{base} +{extra} more" if base else f"{extra + 1} locations" + return base + + +def _finding_review_summary(group: Mapping[str, object]) -> str: + count = _as_int(group.get("count")) + spread = _as_mapping(group.get("spread")) + files = _as_int(spread.get("files")) + functions = _as_int(spread.get("functions")) + scope = str(_as_mapping(group.get("source_scope")).get("dominant_kind", "")).strip() + parts = [f"{count} occurrence{'s' if count != 1 else ''}"] + if functions or files: + parts.append( + f"{functions} function{'s' if functions != 1 else ''}" + f" / {files} file{'s' if files != 1 else ''}" + ) + if scope: + parts.append(scope) + return " · ".join(parts) + + +def _safe_relative_path(item: Mapping[str, object]) -> str: + """Relative path only — never surface an absolute path in the payload.""" + path = str(item.get("relative_path", "")).strip() + return path if path and not _is_absolute_path(path) else "" + + +def _finding_representative_rows( + group: Mapping[str, object], +) -> list[dict[str, object]]: + rows = [ + { + "relative_path": _safe_relative_path(item), + "start_line": 
_as_int(item.get("start_line")), + "end_line": _as_int(item.get("end_line")), + "qualname": str(item.get("qualname", "")), + "source_kind": str(item.get("source_kind", "")), + } + for item in (_as_mapping(row) for row in _as_sequence(group.get("items"))) + ] + rows.sort( + key=lambda row: ( + str(row["relative_path"]), + _as_int(row["start_line"]), + str(row["qualname"]), + ) + ) + return rows[:3] + + +def _suggestion_review_fields(suggestion: Suggestion) -> dict[str, object]: + """Remediation fields shared by every suggestion-backed review item.""" + return { + "source_kind": suggestion.source_kind, + "title": suggestion.title, + "summary": suggestion.fact_summary, + "location": suggestion.location_label or suggestion.location, + "representative_locations": _representative_location_rows(suggestion), + "effort": suggestion.effort, + "steps": list(suggestion.steps), + "has_action": True, + } + + +def _finding_identity(group: Mapping[str, object]) -> dict[str, object]: + finding_id = str(group.get("id")) + return { + "id": finding_id, + "finding_id": finding_id, + "family": _REVIEW_FAMILY_BY_FINDING.get(str(group.get("family")), "design"), + "category": str(group.get("category", "")), + "severity": str(group.get("severity", SEVERITY_INFO)), + "priority": _as_float(group.get("priority")), + "novelty": str(group.get("novelty") or "known"), + } + + +def _finding_review_item( + group: Mapping[str, object], + suggestion: Suggestion | None, +) -> dict[str, object]: + item = _finding_identity(group) + if suggestion is not None: + item.update(_suggestion_review_fields(suggestion)) + else: + item.update( + { + "source_kind": str( + _as_mapping(group.get("source_scope")).get("dominant_kind", "") + ), + "title": _finding_review_title(group), + "summary": _finding_review_summary(group), + "location": _finding_review_location(group), + "representative_locations": _finding_representative_rows(group), + "effort": "", + "steps": [], + "has_action": False, + } + ) + return item + + 
+def _suggestion_review_item(suggestion: Suggestion) -> dict[str, object]: + finding_id = _suggestion_finding_id(suggestion) + family = _REVIEW_FAMILY_BY_SUGGESTION.get(suggestion.finding_family) or ( + "dead_code" if suggestion.category == CATEGORY_DEAD_CODE else "design" + ) + return { + "id": finding_id, + "finding_id": finding_id, + "family": family, + "category": suggestion.category, + "severity": suggestion.severity, + "priority": suggestion.priority, + "novelty": "known", + **_suggestion_review_fields(suggestion), + } + + +def _review_sort_key(item: Mapping[str, object]) -> tuple[float, int, str, str]: + return ( + -_as_float(item.get("priority")), + SEVERITY_ORDER.get(str(item.get("severity")), 9), + str(item.get("title")), + str(item.get("finding_id")), + ) + + +def _dedup_append( + items: list[dict[str, object]], + seen: set[str], + finding_id: str, + item: dict[str, object], +) -> None: + """Append a review item once per finding id (first writer wins).""" + if finding_id in seen: + return + seen.add(finding_id) + items.append(item) + + +def _review_summary(items: Sequence[Mapping[str, object]]) -> dict[str, object]: + by_severity = dict.fromkeys(_REVIEW_SEVERITIES, 0) + by_family = dict.fromkeys(_REVIEW_FAMILIES, 0) + by_novelty = {"new": 0, "known": 0} + actionable = 0 + for item in items: + severity = str(item.get("severity")) + if severity in by_severity: + by_severity[severity] += 1 + family = str(item.get("family")) + by_family[family] = by_family.get(family, 0) + 1 + by_novelty["new" if str(item.get("novelty")) == "new" else "known"] += 1 + if item.get("has_action"): + actionable += 1 + return { + "total": len(items), + "reviewed": 0, + "actionable": actionable, + "by_severity": by_severity, + "by_family": {key: count for key, count in sorted(by_family.items()) if count}, + "by_novelty": by_novelty, + "top_priority": max( + (_as_float(item.get("priority")) for item in items), default=0.0 + ), + } + + +def _build_derived_review_queue( + findings: 
Mapping[str, object], + suggestions: Sequence[Suggestion] | None, +) -> dict[str, object]: + """Prioritised cross-family review queue projected over canonical findings. + + Every finding in ``findings.groups`` (clones, structural, dead-code, design) + becomes one review item, enriched with the matching suggestion's remediation + steps when one exists (the suggestion wins on title/summary/location). + Findings without a suggestion carry ``has_action=False``. The summary carries + the severity/family/novelty counts the review hub needs; ``reviewed`` starts + at 0 — the HTML tracks per-finding review state client-side. + """ + suggestion_by_id: dict[str, Suggestion] = {} + for suggestion in suggestions or (): + suggestion_by_id.setdefault(_suggestion_finding_id(suggestion), suggestion) + + items: list[dict[str, object]] = [] + seen: set[str] = set() + for group in _flatten_finding_groups(findings): + finding_id = str(group.get("id")) + _dedup_append( + items, + seen, + finding_id, + _finding_review_item(group, suggestion_by_id.get(finding_id)), + ) + for finding_id, suggestion in suggestion_by_id.items(): + _dedup_append(items, seen, finding_id, _suggestion_review_item(suggestion)) + + items.sort(key=_review_sort_key) + return { + "schema_version": _REVIEW_QUEUE_SCHEMA_VERSION, + "scope": "report_only", + "summary": _review_summary(items), + "items": items, + } + + +_MODULE_MAP_SCHEMA_VERSION: Final = "1" +_MODULE_MAP_MAX_PACKAGE_NODES: Final = 28 +_MODULE_MAP_MAX_MODULE_NODES: Final = 40 +_MODULE_MAP_MAX_EDGES: Final = 120 +_MODULE_MAP_UNWIND_CANDIDATE_CAP: Final = 25 +_MODULE_MAP_OVERMERGE_MODULE_FLOOR: Final = 80 +_MODULE_MAP_MONOLITH_PACKAGE_CEILING: Final = 2 +_MODULE_MAP_OVERMERGE_PACKAGE_CEILING: Final = 3 +_MODULE_MAP_CANDIDATE: Final = "candidate" +_MODULE_MAP_RANKED_ONLY: Final = "ranked_only" +_MODULE_MAP_NON_CANDIDATE: Final = "non_candidate" +_MODULE_MAP_SEED_POLICY: Final = "cycles_then_chains_then_degree" + + +def _module_prefix(module: str, depth: int) 
-> str: + parts = module.split(".") + if len(parts) <= depth: + return module + return ".".join(parts[:depth]) + + +def _package_node_id(depth: int) -> Callable[[str], str]: + def _to_package(module: str) -> str: + return _module_prefix(module, depth) + + return _to_package + + +def _module_edges_from_items(edge_items: Sequence[object]) -> list[tuple[str, str]]: + edges: list[tuple[str, str]] = [] + for item in edge_items: + mapping = _as_mapping(item) + source = str(mapping.get("source", "")).strip() + target = str(mapping.get("target", "")).strip() + if source and target: + edges.append((source, target)) + return edges + + +def _string_paths(raw: Sequence[object]) -> list[list[str]]: + return [[str(node) for node in _as_sequence(path)] for path in raw] + + +def _module_map_unavailable_shell(reason: str) -> dict[str, object]: + def _empty_truncation() -> dict[str, object]: + return { + "truncated": False, + "node_universe_count": 0, + "node_shown_count": 0, + "edge_universe_count": 0, + "edge_shown_count": 0, + "seed_policy": _MODULE_MAP_SEED_POLICY, + } + + return { + "schema_version": _MODULE_MAP_SCHEMA_VERSION, + "scope": "report_only", + "default_zoom": "packages", + "summary": { + "available": False, + "reason": reason, + "module_count": 0, + "package_count_depth2": 0, + "edge_count": 0, + "unwind_candidate_count": 0, + "overloaded_candidate_count": 0, + "overloaded_population_status": "limited", + }, + "graph_packages": { + "zoom": "packages", + "package_depth": None, + "truncation": _empty_truncation(), + "nodes": [], + "edges": [], + }, + "graph_modules": { + "zoom": "modules", + "package_depth": None, + "truncation": _empty_truncation(), + "nodes": [], + "edges": [], + }, + "unwind_candidates": [], + } + + +def _module_map_zoom_decision( + modules: Sequence[str], module_count: int +) -> tuple[str, int]: + if module_count <= _MODULE_MAP_MAX_MODULE_NODES: + return "modules", 2 + p1 = len({_module_prefix(module, 1) for module in modules}) + p2 = 
len({_module_prefix(module, 2) for module in modules}) + if p1 <= _MODULE_MAP_MONOLITH_PACKAGE_CEILING: + return "packages", 2 + if ( + p2 <= _MODULE_MAP_OVERMERGE_PACKAGE_CEILING + and module_count > _MODULE_MAP_OVERMERGE_MODULE_FLOOR + ): + return "packages", 3 + if p2 <= _MODULE_MAP_MAX_PACKAGE_NODES: + return "packages", 2 + if p1 <= _MODULE_MAP_MAX_PACKAGE_NODES: + return "packages", 1 + return "packages", 2 + + +def _aggregate_node_overlay( + members: Sequence[str], + *, + overloaded_by_module: Mapping[str, Mapping[str, object]], + cycle_modules: frozenset[str], +) -> dict[str, object]: + scores: list[float] = [] + statuses: set[str] = set() + reasons: set[str] = set() + source_kinds: set[str] = set() + fan_in = 0 + fan_out = 0 + in_cycle = False + for module in members: + item = overloaded_by_module.get(module, {}) + scores.append(_as_float(item.get("score"))) + statuses.add(str(item.get("candidate_status", _MODULE_MAP_NON_CANDIDATE))) + reasons.update( + str(reason) for reason in _as_sequence(item.get("candidate_reasons")) + ) + source_kinds.add(str(item.get("source_kind", ""))) + fan_in += _as_int(item.get("fan_in")) + fan_out += _as_int(item.get("fan_out")) + in_cycle = in_cycle or module in cycle_modules + if _MODULE_MAP_CANDIDATE in statuses: + candidate_status = _MODULE_MAP_CANDIDATE + elif _MODULE_MAP_RANKED_ONLY in statuses: + candidate_status = _MODULE_MAP_RANKED_ONLY + else: + candidate_status = _MODULE_MAP_NON_CANDIDATE + return { + "fan_in": fan_in, + "fan_out": fan_out, + "source_kinds": sorted(source_kinds), + "in_cycle": in_cycle, + "overloaded": { + "score": max(scores) if scores else 0.0, + "candidate_status": candidate_status, + "candidate_reasons": sorted(reasons), + }, + } + + +def _module_map_node( + node_id: str, + *, + package_depth: int | None, + overloaded_by_module: Mapping[str, Mapping[str, object]], + cycle_modules: frozenset[str], +) -> dict[str, object]: + if package_depth is not None: + members = sorted( + module + for module 
in overloaded_by_module + if _module_prefix(module, package_depth) == node_id + ) + overlay = _aggregate_node_overlay( + members, + overloaded_by_module=overloaded_by_module, + cycle_modules=cycle_modules, + ) + fan_in = _as_int(overlay["fan_in"]) + fan_out = _as_int(overlay["fan_out"]) + source_kinds: object = overlay["source_kinds"] + in_cycle = bool(overlay["in_cycle"]) + overloaded: object = overlay["overloaded"] + else: + item = overloaded_by_module.get(node_id, {}) + fan_in = _as_int(item.get("fan_in")) + fan_out = _as_int(item.get("fan_out")) + source_kinds = sorted({str(item.get("source_kind", ""))}) if item else [] + in_cycle = node_id in cycle_modules + overloaded = { + "score": _as_float(item.get("score")), + "candidate_status": str( + item.get("candidate_status", _MODULE_MAP_NON_CANDIDATE) + ), + "candidate_reasons": sorted( + str(reason) for reason in _as_sequence(item.get("candidate_reasons")) + ), + } + return { + "id": node_id, + "label": node_id, + "fan_in": fan_in, + "fan_out": fan_out, + "total_degree": fan_in + fan_out, + "source_kinds": source_kinds, + "in_cycle": in_cycle, + "overloaded": overloaded, + } + + +def _build_module_graph_view( + module_edges: Sequence[tuple[str, str]], + *, + zoom: str, + package_depth: int | None, + dep_cycles: Sequence[Sequence[str]], + longest_chains: Sequence[Sequence[str]], + max_nodes: int, + overloaded_by_module: Mapping[str, Mapping[str, object]], + cycle_modules: frozenset[str], +) -> dict[str, object]: + weights: Counter[tuple[str, str]] = Counter() + node_id_fn: Callable[[str], str] | None + if package_depth is not None: + node_id_fn = _package_node_id(package_depth) + for source, target in module_edges: + edge = ( + _module_prefix(source, package_depth), + _module_prefix(target, package_depth), + ) + if edge[0] != edge[1]: + weights[edge] += 1 + else: + node_id_fn = None + for source, target in module_edges: + if source != target: + weights[(source, target)] += 1 + nodes, sampled_edges, truncation = 
select_dependency_graph_nodes( + sorted(weights), + dep_cycles=dep_cycles, + longest_chains=longest_chains, + max_nodes=max_nodes, + max_edges=_MODULE_MAP_MAX_EDGES, + node_id_fn=node_id_fn, + ) + return { + "zoom": zoom, + "package_depth": package_depth, + "truncation": truncation, + "nodes": [ + _module_map_node( + node_id, + package_depth=package_depth, + overloaded_by_module=overloaded_by_module, + cycle_modules=cycle_modules, + ) + for node_id in nodes + ], + "edges": [ + {"source": source, "target": target, "weight": weights[(source, target)]} + for source, target in sampled_edges + ], + } + + +def _unwind_signals( + item: Mapping[str, object], + *, + chain_modules: frozenset[str], + p90_fan_in: float, +) -> list[str]: + reasons = {str(reason) for reason in _as_sequence(item.get("candidate_reasons"))} + fan_in = _as_int(item.get("fan_in")) + fan_out = _as_int(item.get("fan_out")) + instability = _as_float(item.get("instability")) + signals: list[str] = [] + if "dependency_pressure" in reasons: + signals.append("dependency_pressure") + if "hub_like_shape" in reasons: + signals.append("hub_like_shape") + if "repeated_import_pressure" in reasons: + signals.append("repeated_import_pressure") + if str(item.get("module")) in chain_modules: + signals.append("chain_bottleneck") + if instability >= 0.75 and fan_out >= 3: + signals.append("high_instability") + if fan_in >= p90_fan_in and fan_in > 2 * fan_out + 1: + signals.append("central_sink") + return signals + + +def _module_map_unwind_candidates( + overloaded_items: Sequence[Mapping[str, object]], + *, + longest_chains: Sequence[Sequence[str]], +) -> list[dict[str, object]]: + chain_modules = frozenset(str(node) for chain in longest_chains for node in chain) + fan_in_sorted = sorted(_as_int(item.get("fan_in")) for item in overloaded_items) + p90_fan_in = ( + _score_quantile([float(value) for value in fan_in_sorted], 0.9) + if fan_in_sorted + else 0.0 + ) + rows: list[dict[str, object]] = [] + for item in 
overloaded_items: + signals = _unwind_signals( + item, chain_modules=chain_modules, p90_fan_in=p90_fan_in + ) + candidate_status = str(item.get("candidate_status", _MODULE_MAP_NON_CANDIDATE)) + emit = bool(signals) and ( + candidate_status == _MODULE_MAP_CANDIDATE + or "chain_bottleneck" in signals + or "high_instability" in signals + or "central_sink" in signals + ) + if not emit: + continue + rows.append( + { + "module": str(item.get("module")), + "filepath": str(item.get("filepath", "")), + "source_kind": str(item.get("source_kind", "")), + "fan_in": _as_int(item.get("fan_in")), + "fan_out": _as_int(item.get("fan_out")), + "score": _as_float(item.get("score")), + "dependency_score": _as_float(item.get("dependency_score")), + "candidate_status": candidate_status, + "signals": signals, + } + ) + rows.sort( + key=lambda row: ( + -len(_as_sequence(row["signals"])), + -_as_float(row["dependency_score"]), + -_as_int(row["fan_in"]), + -_as_int(row["fan_out"]), + str(row["module"]), + ) + ) + return rows[:_MODULE_MAP_UNWIND_CANDIDATE_CAP] + + +def _build_derived_module_map( + metrics_payload: Mapping[str, object], +) -> dict[str, object]: + families = _as_mapping(metrics_payload.get("families")) + dependencies = _as_mapping(families.get("dependencies")) + module_edges = _module_edges_from_items(_as_sequence(dependencies.get("items"))) + if not dependencies or not module_edges: + return _module_map_unavailable_shell("dependencies_skipped") + modules = sorted({node for edge in module_edges for node in edge}) + module_count = len(modules) + dep_cycles = _string_paths(_as_sequence(dependencies.get("cycles"))) + longest_chains = _string_paths(_as_sequence(dependencies.get("longest_chains"))) + cycle_modules = frozenset(node for cycle in dep_cycles for node in cycle) + overloaded = _as_mapping(families.get("overloaded_modules")) + overloaded_items = [ + _as_mapping(item) for item in _as_sequence(overloaded.get("items")) ] + overloaded_summary = 
_as_mapping(overloaded.get("summary")) + population_status = str(overloaded_summary.get("population_status") or "ok") + overloaded_by_module: dict[str, Mapping[str, object]] = { + str(item.get("module")): item for item in overloaded_items + } + zoom, package_depth = _module_map_zoom_decision(modules, module_count) + unwind = _module_map_unwind_candidates( + overloaded_items, longest_chains=longest_chains + ) + overloaded_candidate_count = sum( + 1 + for item in overloaded_items + if str(item.get("candidate_status")) == _MODULE_MAP_CANDIDATE + ) + return { + "schema_version": _MODULE_MAP_SCHEMA_VERSION, + "scope": "report_only", + "default_zoom": zoom, + "summary": { + "available": True, + "module_count": module_count, + "package_count_depth2": len( + {_module_prefix(module, 2) for module in modules} + ), + "edge_count": len(set(module_edges)), + "unwind_candidate_count": len(unwind), + "overloaded_candidate_count": overloaded_candidate_count, + "overloaded_population_status": population_status, + }, + "graph_packages": _build_module_graph_view( + module_edges, + zoom="packages", + package_depth=package_depth, + dep_cycles=dep_cycles, + longest_chains=longest_chains, + max_nodes=_MODULE_MAP_MAX_PACKAGE_NODES, + overloaded_by_module=overloaded_by_module, + cycle_modules=cycle_modules, + ), + "graph_modules": _build_module_graph_view( + module_edges, + zoom="modules", + package_depth=None, + dep_cycles=dep_cycles, + longest_chains=longest_chains, + max_nodes=_MODULE_MAP_MAX_MODULE_NODES, + overloaded_by_module=overloaded_by_module, + cycle_modules=cycle_modules, + ), + "unwind_candidates": unwind, + } diff --git a/codeclone/report/html/assemble.py b/codeclone/report/html/assemble.py index dacfd710..b0573d25 100644 --- a/codeclone/report/html/assemble.py +++ b/codeclone/report/html/assemble.py @@ -40,8 +40,10 @@ TAB_DEAD_CODE, TAB_DEPENDENCIES, TAB_FINDINGS, + TAB_MODULE_MAP, TAB_OVERVIEW, TAB_QUALITY, + TAB_REVIEW, TAB_SUGGESTIONS, TABLIST_ARIA_LABEL, 
THEME_BUTTON_TEXT, @@ -56,7 +58,9 @@ from .sections._dead_code import render_dead_code_panel from .sections._dependencies import render_dependencies_panel from .sections._meta import build_topbar_provenance_summary, render_meta_panel +from .sections._module_map import render_module_map_panel from .sections._overview import render_overview_panel +from .sections._review import render_review_panel from .sections._structural import render_structural_panel from .sections._suggestions import render_suggestions_panel from .template import FONT_CSS_URL, REPORT_TEMPLATE @@ -111,8 +115,10 @@ def build_html_report( # -- Render sections -- overview_html = render_overview_panel(ctx) + review_html = render_review_panel(ctx) clones_html, _novelty_enabled, _total_new, _total_known = render_clones_panel(ctx) quality_html = render_quality_panel(ctx) + module_map_html = render_module_map_panel(ctx) dependencies_html = render_dependencies_panel(ctx) dead_code_html = render_dead_code_panel(ctx) suggestions_html = render_suggestions_panel(ctx) @@ -136,6 +142,15 @@ def build_html_report( == CONFIDENCE_HIGH ) dep_cycles = len(_as_sequence(ctx.dependencies_map.get("cycles"))) + module_map_summary = _as_mapping( + _as_mapping(ctx.derived_map.get("module_map")).get("summary") + ) + module_map_unwind = _as_int(module_map_summary.get("unwind_candidate_count")) + review_total = _as_int( + _as_mapping( + _as_mapping(ctx.derived_map.get("review_queue")).get("summary") + ).get("total") + ) structural_count = len( tuple(normalize_structural_findings(ctx.structural_findings)) ) @@ -152,23 +167,23 @@ def build_html_report( _as_int(_as_mapping(ctx.complexity_map.get("summary")).get("high_risk")) + _as_int(_as_mapping(ctx.coupling_map.get("summary")).get("high_risk")) + _as_int(_as_mapping(ctx.cohesion_map.get("summary")).get("low_cohesion")) - + _as_int( - _as_mapping(ctx.overloaded_modules_map.get("summary")).get("candidates") - ) + coverage_review_items + 
_as_int(_as_mapping(ctx.security_surfaces_map.get("summary")).get("items")) ) - def _tab_badge(count: int) -> str: + def _tab_badge(count: int, unit: str) -> str: if count == 0: return "" - return f'{count}' + title = f"{count} {unit}" + return f'{count}' # -- Main tab navigation -- tab_icon_keys: dict[str, str] = { "overview": "overview", + "review": "review", "clones": "clones", "quality": "quality", + "module-map": "module-map", "dependencies": "dependencies", "dead-code": "dead-code", "suggestions": "suggestions", @@ -176,21 +191,48 @@ def _tab_badge(count: int) -> str: } tab_defs = [ ("overview", TAB_OVERVIEW, overview_html, ""), - ("clones", TAB_CLONES, clones_html, _tab_badge(ctx.clone_groups_total)), - ("quality", TAB_QUALITY, quality_html, _tab_badge(quality_issues)), - ("dependencies", TAB_DEPENDENCIES, dependencies_html, _tab_badge(dep_cycles)), - ("dead-code", TAB_DEAD_CODE, dead_code_html, _tab_badge(dead_high_conf)), + ( + "review", + TAB_REVIEW, + review_html, + _tab_badge(review_total, "findings to review"), + ), + ( + "clones", + TAB_CLONES, + clones_html, + _tab_badge(ctx.clone_groups_total, "clone groups"), + ), + ("quality", TAB_QUALITY, quality_html, _tab_badge(quality_issues, "issues")), + ( + "module-map", + TAB_MODULE_MAP, + module_map_html, + _tab_badge(module_map_unwind, "unwind candidates"), + ), + ( + "dependencies", + TAB_DEPENDENCIES, + dependencies_html, + _tab_badge(dep_cycles, "dependency cycles"), + ), + ( + "dead-code", + TAB_DEAD_CODE, + dead_code_html, + _tab_badge(dead_high_conf, "high-confidence dead-code items"), + ), ( "suggestions", TAB_SUGGESTIONS, suggestions_html, - _tab_badge(len(ctx.suggestions)), + _tab_badge(len(ctx.suggestions), "suggestions"), ), ( "structural-findings", TAB_FINDINGS, structural_html, - _tab_badge(structural_count), + _tab_badge(structural_count, "structural findings"), ), ] diff --git a/codeclone/report/html/assets/css.py b/codeclone/report/html/assets/css.py index af4776dd..4b08eb99 100644 --- 
a/codeclone/report/html/assets/css.py +++ b/codeclone/report/html/assets/css.py @@ -28,9 +28,9 @@ --bg-overlay:oklch(29% 0.033 275); --bg-subtle:oklch(34% 0.038 275); - /* Border — same hue, higher chroma for legibility */ - --border:oklch(32% 0.035 275); - --border-strong:oklch(44% 0.045 275); + /* Border — quiet hairlines; -strong only for hover/emphasis */ + --border:oklch(28% 0.018 275); + --border-strong:oklch(40% 0.028 275); /* Text — muted greys keep a trace of indigo so they feel alive */ --text-primary:oklch(95% 0.010 275); @@ -44,7 +44,8 @@ --accent-soft:oklch(30% 0.12 275); /* Semantic — brand-adjacent, hue-rotated so they read as siblings - of the indigo instead of raw Tailwind defaults */ + of the indigo instead of raw Tailwind defaults. Light-mode lightness is + tuned so severity badge text clears WCAG AA (>=4.5:1) on its muted bg. */ --success:oklch(74% 0.15 162); --success-muted:color-mix(in oklch,oklch(74% 0.15 162) 18%,transparent); --warning:oklch(80% 0.15 82); @@ -55,18 +56,34 @@ --info:oklch(72% 0.13 238); --info-muted:color-mix(in oklch,oklch(72% 0.13 238) 18%,transparent); - /* elevation */ - --shadow-sm:0 1px 2px rgba(0,0,0,.25); - --shadow-md:0 2px 8px rgba(0,0,0,.3); - --shadow-lg:0 4px 16px rgba(0,0,0,.35); - --shadow-xl:0 8px 32px rgba(0,0,0,.4); + /* elevation — soft, diffuse, layered */ + --shadow-sm:0 1px 2px rgba(0,0,0,.18); + --shadow-md:0 4px 14px -3px rgba(0,0,0,.34); + --shadow-lg:0 10px 30px -8px rgba(0,0,0,.44); + --shadow-xl:0 20px 50px -14px rgba(0,0,0,.55); - /* radii */ + /* radii — crisp, not bubbly */ --radius-sm:4px; --radius-md:6px; --radius-lg:8px; --radius-xl:12px; + /* page-background glow (dark only; light overrides to transparent) */ + --bg-glow:color-mix(in oklch,var(--accent-primary) 9%,transparent); + + /* badge design code — one scale for every read-only label badge */ + --badge-font:var(--font-sans); + --badge-size:.68rem; + --badge-weight:600; + --badge-tracking:.015em; + --badge-pad:2px var(--sp-2); + 
--badge-radius:var(--radius-sm); + + /* count sort — tabular numerals shared by counts and micro-stats */ + --count-font:var(--font-numeric); + --count-size:.64rem; + --count-weight:700; + /* spacing */ --sp-1:4px;--sp-2:8px;--sp-3:12px;--sp-4:16px;--sp-5:20px;--sp-6:24px;--sp-8:32px;--sp-10:40px; @@ -90,34 +107,34 @@ so the whole theme feels like one family in both modes. */ @media(prefers-color-scheme:light){ :root:not([data-theme]){ - --bg-body:oklch(98.5% 0.006 275);--bg-surface:#ffffff; + --bg-body:oklch(98.5% 0.006 275);--bg-surface:#ffffff;--bg-glow:transparent; --bg-raised:oklch(97% 0.010 275);--bg-overlay:oklch(93% 0.015 275);--bg-subtle:oklch(88% 0.020 275); - --border:oklch(88% 0.020 275);--border-strong:oklch(78% 0.028 275); + --border:oklch(92% 0.010 275);--border-strong:oklch(85% 0.016 275); --text-primary:oklch(22% 0.040 275);--text-secondary:oklch(42% 0.048 275);--text-muted:oklch(58% 0.040 275); --accent-primary:#4f46e5;--accent-hover:#6366f1;--accent-muted:color-mix(in oklch,#4f46e5 12%,transparent); --accent-soft:oklch(94% 0.045 275); - --success:oklch(52% 0.16 162);--success-muted:color-mix(in oklch,oklch(52% 0.16 162) 12%,transparent); - --warning:oklch(60% 0.15 65);--warning-muted:color-mix(in oklch,oklch(60% 0.15 65) 12%,transparent); - --error:oklch(55% 0.22 20);--error-muted:color-mix(in oklch,oklch(55% 0.22 20) 12%,transparent); - --danger:oklch(55% 0.22 20);--info:oklch(52% 0.18 238);--info-muted:color-mix(in oklch,oklch(52% 0.18 238) 12%,transparent); - --shadow-sm:0 1px 2px rgba(0,0,0,.06);--shadow-md:0 2px 8px rgba(0,0,0,.08); - --shadow-lg:0 4px 16px rgba(0,0,0,.1);--shadow-xl:0 8px 32px rgba(0,0,0,.12); + --success:oklch(47% 0.16 162);--success-muted:color-mix(in oklch,oklch(52% 0.16 162) 12%,transparent); + --warning:oklch(51.5% 0.15 65);--warning-muted:color-mix(in oklch,oklch(60% 0.15 65) 12%,transparent); + --error:oklch(50.5% 0.22 20);--error-muted:color-mix(in oklch,oklch(55% 0.22 20) 12%,transparent); + --danger:oklch(50.5% 
0.22 20);--info:oklch(48.5% 0.18 238);--info-muted:color-mix(in oklch,oklch(52% 0.18 238) 12%,transparent); + --shadow-sm:0 1px 2px rgba(17,20,38,.05);--shadow-md:0 4px 14px -3px rgba(17,20,38,.08); + --shadow-lg:0 12px 30px -8px rgba(17,20,38,.12);--shadow-xl:0 22px 50px -14px rgba(17,20,38,.16); color-scheme:light; } } [data-theme="light"]{ - --bg-body:oklch(98.5% 0.006 275);--bg-surface:#ffffff; + --bg-body:oklch(98.5% 0.006 275);--bg-surface:#ffffff;--bg-glow:transparent; --bg-raised:oklch(97% 0.010 275);--bg-overlay:oklch(93% 0.015 275);--bg-subtle:oklch(88% 0.020 275); - --border:oklch(88% 0.020 275);--border-strong:oklch(78% 0.028 275); + --border:oklch(92% 0.010 275);--border-strong:oklch(85% 0.016 275); --text-primary:oklch(22% 0.040 275);--text-secondary:oklch(42% 0.048 275);--text-muted:oklch(58% 0.040 275); --accent-primary:#4f46e5;--accent-hover:#6366f1;--accent-muted:color-mix(in oklch,#4f46e5 12%,transparent); --accent-soft:oklch(94% 0.045 275); - --success:oklch(52% 0.16 162);--success-muted:color-mix(in oklch,oklch(52% 0.16 162) 12%,transparent); - --warning:oklch(60% 0.15 65);--warning-muted:color-mix(in oklch,oklch(60% 0.15 65) 12%,transparent); - --error:oklch(55% 0.22 20);--error-muted:color-mix(in oklch,oklch(55% 0.22 20) 12%,transparent); - --danger:oklch(55% 0.22 20);--info:oklch(52% 0.18 238);--info-muted:color-mix(in oklch,oklch(52% 0.18 238) 12%,transparent); - --shadow-sm:0 1px 2px rgba(0,0,0,.06);--shadow-md:0 2px 8px rgba(0,0,0,.08); - --shadow-lg:0 4px 16px rgba(0,0,0,.1);--shadow-xl:0 8px 32px rgba(0,0,0,.12); + --success:oklch(47% 0.16 162);--success-muted:color-mix(in oklch,oklch(52% 0.16 162) 12%,transparent); + --warning:oklch(51.5% 0.15 65);--warning-muted:color-mix(in oklch,oklch(60% 0.15 65) 12%,transparent); + --error:oklch(50.5% 0.22 20);--error-muted:color-mix(in oklch,oklch(55% 0.22 20) 12%,transparent); + --danger:oklch(50.5% 0.22 20);--info:oklch(48.5% 0.18 238);--info-muted:color-mix(in oklch,oklch(52% 0.18 238) 
12%,transparent); + --shadow-sm:0 1px 2px rgba(17,20,38,.05);--shadow-md:0 4px 14px -3px rgba(17,20,38,.08); + --shadow-lg:0 12px 30px -8px rgba(17,20,38,.12);--shadow-xl:0 22px 50px -14px rgba(17,20,38,.16); color-scheme:light; } """ @@ -131,7 +148,10 @@ html{-webkit-text-size-adjust:100%;text-size-adjust:100%;-webkit-font-smoothing:antialiased; -moz-osx-font-smoothing:grayscale;scroll-behavior:smooth;scrollbar-gutter:stable} body{font-family:var(--font-sans);font-size:14px;line-height:1.6;color:var(--text-primary); - background:var(--bg-body);overflow-x:hidden; + background: + radial-gradient(1100px 460px at 50% -12%,var(--bg-glow),transparent 70%), + var(--bg-body); + background-attachment:fixed;overflow-x:hidden; /* Inter stylistic alternates: zero — slashed zero (disambiguates 0 from O in metric values) ss02 — disambiguation set (I/l/1/0 clear apart) @@ -202,13 +222,18 @@ border:none;cursor:pointer;font-size:.85rem;font-weight:500;color:var(--text-muted); white-space:nowrap;border-radius:var(--radius-md);transition:all var(--dur-fast) var(--ease)} .main-tab:hover{color:var(--text-primary);background:var(--bg-raised)} -.main-tab[aria-selected="true"]{color:var(--accent-primary);background:var(--accent-muted)} +.main-tab[aria-selected="true"]{color:#fff;background:var(--accent-primary); + box-shadow:0 1px 4px color-mix(in oklch,var(--accent-primary) 42%,transparent)} +.main-tab[aria-selected="true"]:hover{background:var(--accent-hover)} .main-tab-icon{flex-shrink:0;opacity:.72} +.main-tab[aria-selected="true"] .main-tab-icon{opacity:1} .main-tab-label{display:inline-flex;align-items:center} .tab-count{display:inline-flex;align-items:center;justify-content:center;min-width:18px; - height:18px;padding:0 5px;font-size:.68rem;font-weight:700;border-radius:var(--radius-sm); + height:18px;padding:0 5px;border-radius:var(--radius-sm); + font-family:var(--count-font);font-size:var(--count-size);font-weight:var(--count-weight); + font-variant-numeric:tabular-nums; 
background:var(--bg-overlay);color:var(--text-muted);margin-left:var(--sp-1)} -.main-tab[aria-selected="true"] .tab-count{background:var(--accent-primary); +.main-tab[aria-selected="true"] .tab-count{background:rgba(255,255,255,.24); color:#fff} /* Tab panels */ @@ -257,21 +282,38 @@ .btn.ghost:hover{background:var(--bg-raised);border-color:var(--border)} .btn.btn-icon{padding:var(--sp-1);min-width:28px;justify-content:center} .btn svg{width:14px;height:14px} +.btn:hover{box-shadow:var(--shadow-sm)} + +/* Smart controls — one accent focus ring for every button + tactile press */ +button:focus-visible,a:focus-visible,summary:focus-visible{ + outline:2px solid var(--accent-primary);outline-offset:2px} +.btn:active,.prov-pill:active,.theme-toggle:active,.badge-btn:active,.badge-tab:active, +.review-launchpad-cta:active,.review-toggle:active,.review-chip:active, +.clone-nav-btn:active{transform:translateY(.5px) scale(.985)} +@media(prefers-reduced-motion:reduce){ + .btn:active,.prov-pill:active,.theme-toggle:active,.badge-btn:active,.badge-tab:active, + .review-launchpad-cta:active,.review-toggle:active,.review-chip:active, + .clone-nav-btn:active{transform:none} +} /* Inputs */ input[type="text"]{padding:var(--sp-1) var(--sp-3);font-size:.85rem;border:1px solid var(--border); border-radius:var(--radius-md);background:var(--bg-body);color:var(--text-primary);outline:none; - transition:border-color var(--dur-fast) var(--ease)} -input[type="text"]:focus{border-color:var(--accent-primary);box-shadow:0 0 0 2px var(--accent-muted)} + transition:border-color var(--dur-fast) var(--ease),box-shadow var(--dur-fast) var(--ease)} +input[type="text"]:hover{border-color:var(--border-strong)} +input[type="text"]:focus{border-color:var(--accent-primary);box-shadow:0 0 0 3px var(--accent-muted)} input[type="text"]::placeholder{color:var(--text-muted)} /* Selects */ .select{padding:var(--sp-1) var(--sp-3);padding-right:var(--sp-6);font-size:.8rem; border:1px solid 
var(--border);border-radius:var(--radius-md);background:var(--bg-raised); - color:var(--text-secondary);cursor:pointer;appearance:none; + color:var(--text-secondary);cursor:pointer;appearance:none;outline:none; + transition:border-color var(--dur-fast) var(--ease),box-shadow var(--dur-fast) var(--ease), + color var(--dur-fast) var(--ease); background-image:url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' fill='none' stroke='%236b6f88' stroke-width='2'%3E%3Cpath d='M3 4.5l3 3 3-3'/%3E%3C/svg%3E"); background-repeat:no-repeat;background-position:right 8px center} -.select:focus{border-color:var(--accent-primary);outline:none} +.select:hover{border-color:var(--border-strong);color:var(--text-primary)} +.select:focus{border-color:var(--accent-primary);box-shadow:0 0 0 3px var(--accent-muted)} /* Checkbox labels */ .inline-check{display:inline-flex;align-items:center;gap:var(--sp-1);font-size:.8rem; @@ -316,7 +358,7 @@ .filters-btn{display:inline-flex;align-items:center;gap:var(--sp-1);white-space:nowrap} .filters-btn-ico{flex:none} .filters-count{display:inline-flex;align-items:center;justify-content:center; - min-width:18px;height:18px;padding:0 5px;border-radius:999px; + min-width:18px;height:18px;padding:0 5px;border-radius:var(--radius-sm); background:var(--accent-primary);color:#fff;font-size:.68rem;font-weight:600; line-height:1} .filters-btn[aria-expanded="true"]{border-color:var(--accent-primary); @@ -341,7 +383,7 @@ border-color:var(--border-strong)} /* Suggestions count pill (right side of the shared toolbar). 
*/ -.suggestions-count-label{font-size:.8rem;color:var(--text-muted);font-weight:500; +.suggestions-count-label,.toolbar-count-label{font-size:.8rem;color:var(--text-muted);font-weight:500; font-variant-numeric:tabular-nums;white-space:nowrap} """ @@ -350,16 +392,23 @@ # --------------------------------------------------------------------------- _INSIGHT = """\ -.insight-banner{padding:var(--sp-3) var(--sp-4);border-radius:var(--radius-md); - margin-bottom:var(--sp-4);border-left:3px solid var(--border);background:none} -.insight-question{font-size:.78rem;font-weight:500;color:var(--text-muted); - text-transform:uppercase;letter-spacing:.03em;margin-bottom:2px} -.insight-answer{font-size:.82rem;color:var(--text-secondary);line-height:1.5} - -.insight-ok{border-left-color:var(--success);background:var(--success-muted)} -.insight-warn{border-left-color:var(--warning);background:var(--warning-muted)} -.insight-risk{border-left-color:var(--error);background:var(--error-muted)} -.insight-info{border-left-color:var(--info);background:var(--info-muted)} +.insight-banner{position:relative;padding:var(--sp-4) var(--sp-5); + border-radius:var(--radius-lg);margin-bottom:var(--sp-5); + border:1px solid var(--border);background:var(--bg-surface);overflow:hidden} +.insight-banner::before{content:"";position:absolute;inset:0 auto 0 0;width:3px; + background:var(--border-strong)} +.insight-question{font-size:.72rem;font-weight:600;color:var(--text-muted); + text-transform:uppercase;letter-spacing:.06em;margin-bottom:5px} +.insight-answer{font-size:.88rem;color:var(--text-secondary);line-height:1.55} + +.insight-ok::before{background:var(--success)} +.insight-ok{background:color-mix(in oklch,var(--success-muted) 55%,var(--bg-surface))} +.insight-warn::before{background:var(--warning)} +.insight-warn{background:color-mix(in oklch,var(--warning-muted) 55%,var(--bg-surface))} +.insight-risk::before{background:var(--error)} +.insight-risk{background:color-mix(in 
oklch,var(--error-muted) 55%,var(--bg-surface))} +.insight-info::before{background:var(--info)} +.insight-info{background:color-mix(in oklch,var(--info-muted) 55%,var(--bg-surface))} .insight-banner .overview-summary-grid{margin:0} .insight-banner .overview-summary-item{background:none;border:none;border-radius:0;padding:0} .insight-banner .overview-summary-label{font-size:.76rem;margin-bottom:var(--sp-2); @@ -380,10 +429,11 @@ linear-gradient(to right,rgba(0,0,0,.15),transparent) left center / 14px 100% no-repeat scroll, linear-gradient(to left,rgba(0,0,0,.15),transparent) right center / 14px 100% no-repeat scroll} .table{inline-size:max-content;min-inline-size:100%;border-collapse:collapse;font-size:.82rem; - font-family:var(--font-mono)} + font-family:var(--font-sans)} .table th{position:sticky;top:0;z-index:2;padding:var(--sp-2) var(--sp-3);text-align:left;font-family:var(--font-sans); - font-weight:600;font-size:.75rem;text-transform:uppercase;letter-spacing:.05em; - color:var(--text-muted);background:var(--bg-overlay);border-bottom:1px solid var(--border); + font-weight:600;font-size:.72rem;text-transform:uppercase;letter-spacing:.06em; + color:var(--text-secondary);background:var(--bg-overlay); + border-bottom:2px solid color-mix(in oklch,var(--accent-primary) 30%,var(--border)); white-space:nowrap;cursor:default;user-select:none} .table th[data-sortable]{cursor:pointer} .table th[data-sortable]:hover{color:var(--text-primary)} @@ -391,17 +441,53 @@ .table th[aria-sort] .sort-icon{opacity:1;color:var(--accent-primary)} .table td{padding:var(--sp-2) var(--sp-3);border-bottom:1px solid var(--border);color:var(--text-secondary); vertical-align:top} -.table tr:last-child td{border-bottom:none} -.table tr:hover td{background:var(--bg-raised)} +.table tbody tr:nth-child(even) td{background:color-mix(in oklch,var(--bg-raised) 45%,transparent)} +.table tbody tr:last-child td{border-bottom:none} +.table tbody tr:hover td{background:var(--accent-muted)} .table 
.col-name{font-weight:500;color:var(--text-primary);max-width:360px;overflow:hidden; text-overflow:ellipsis;white-space:nowrap} .table .col-file,.table .col-path{color:var(--text-muted);max-width:240px;overflow:hidden; text-overflow:ellipsis;white-space:nowrap} -.table .col-number,.table .col-num{font-variant-numeric:tabular-nums;text-align:right;white-space:nowrap} +.table .col-number,.table .col-num{font-family:var(--font-numeric); + font-variant-numeric:tabular-nums;text-align:right;white-space:nowrap;color:var(--text-primary)} .table .col-risk,.table .col-badge,.table .col-cat{white-space:nowrap} .table .col-steps{max-width:120px;word-break:break-word} .table .col-wide{max-width:320px;word-break:break-all} +.table .col-score{min-width:130px;white-space:nowrap} +.table .col-chips{max-width:300px} .table-empty{padding:var(--sp-8);text-align:center;color:var(--text-muted);font-size:.9rem} + +/* Typed table cells: score bar, status pill, chips (shared badge vocabulary) */ +.score-bar{display:inline-flex;align-items:center;gap:7px;min-width:110px} +.score-bar-track{flex:1;height:5px;border-radius:3px;background:var(--accent-muted);overflow:hidden} +.score-bar-fill{display:block;height:100%;border-radius:3px;background:var(--accent-primary)} +.score-bar--strong .score-bar-fill{background:var(--accent-hover)} +.score-bar-val{font-family:var(--font-numeric);font-variant-numeric:tabular-nums; + font-size:.78rem;color:var(--text-secondary)} +.score-bar--strong .score-bar-val{color:var(--accent-primary);font-weight:600} +.metric-meter{display:inline-flex;align-items:center;gap:8px;width:100%; + flex-direction:row-reverse;justify-content:flex-start} +.metric-meter-track{flex:1;max-width:60px;height:5px;border-radius:3px; + background:var(--bg-overlay);overflow:hidden} +.metric-meter-fill{display:block;height:100%;border-radius:3px; + background:color-mix(in oklch,var(--accent-primary) 70%,var(--text-muted))} 
+.metric-meter-val{font-family:var(--font-numeric);font-variant-numeric:tabular-nums; + font-size:.8rem;color:var(--text-primary);min-width:22px;text-align:right} +.metric-meter--mid .metric-meter-fill{background:var(--warning)} +.metric-meter--mid .metric-meter-val{color:var(--warning)} +.metric-meter--high .metric-meter-fill{background:var(--error)} +.metric-meter--high .metric-meter-val{color:var(--error);font-weight:600} +.status-pill--candidate{background:var(--accent-muted);color:var(--accent-primary)} +.status-pill--ranked{background:var(--bg-overlay);color:var(--text-secondary)} +.status-pill--neutral{background:var(--bg-overlay);color:var(--text-muted)} +.chip{margin:1px 3px 1px 0;background:var(--bg-overlay);color:var(--text-secondary); + border:1px solid var(--border)} +/* Code sort: identifiers / globs in mono, distinct from sans label badges */ +.code-chip{display:inline-flex;align-items:center;max-width:100%;font-family:var(--font-mono); + font-size:.72rem;padding:2px var(--sp-2);border-radius:var(--radius-sm); + background:var(--bg-overlay);color:var(--text-secondary);border:1px solid var(--border); + white-space:nowrap;overflow:hidden;text-overflow:ellipsis} +.table .col-code{max-width:240px} """ # --------------------------------------------------------------------------- @@ -429,8 +515,8 @@ _SECTIONS = """\ .section{margin-bottom:var(--sp-6)} -.subsection-title{font-size:1rem;font-weight:600;color:var(--text-primary); - margin-bottom:var(--sp-3);padding-bottom:var(--sp-2);border-bottom:1px solid var(--border)} +.subsection-title{font-size:.95rem;font-weight:600;color:var(--text-primary); + margin:var(--sp-4) 0 var(--sp-2);padding-bottom:var(--sp-2);border-bottom:1px solid var(--border)} .section-body{display:flex;flex-direction:column;gap:var(--sp-3)} /* Clone groups */ @@ -514,8 +600,16 @@ # --------------------------------------------------------------------------- _BADGES = """\ 
-.risk-badge,.severity-badge{display:inline-flex;align-items:center;font-size:.68rem;font-weight:600; - padding:2px var(--sp-2);border-radius:var(--radius-sm);text-transform:uppercase;letter-spacing:.02em} +/* One typographic scale for every read-only label badge; color/background and + any per-variant tweaks (uppercase, etc.) live in the modifiers below. */ +.risk-badge,.severity-badge,.source-kind-badge,.status-pill, +.finding-meta-badge,.suggestion-chip,.chip,.launchpad-sev{ + display:inline-flex;align-items:center;white-space:nowrap;line-height:1.2; + font-family:var(--badge-font);font-size:var(--badge-size); + font-weight:var(--badge-weight);letter-spacing:var(--badge-tracking); + padding:var(--badge-pad);border-radius:var(--badge-radius); + font-variant-numeric:tabular-nums} +.risk-badge,.severity-badge{text-transform:uppercase} .risk-critical,.severity-critical{background:var(--error-muted);color:var(--error)} .risk-high,.severity-high{background:var(--error-muted);color:var(--error)} .risk-warning,.severity-warning{background:var(--warning-muted);color:var(--warning)} @@ -523,8 +617,7 @@ .risk-low,.severity-low{background:var(--success-muted);color:var(--success)} .risk-info,.severity-info{background:var(--info-muted);color:var(--info)} -.source-kind-badge{display:inline-flex;align-items:center;font-size:.68rem;font-weight:500; - padding:2px var(--sp-2);border-radius:var(--radius-sm);background:var(--bg-overlay);color:var(--text-muted)} +.source-kind-badge{background:var(--bg-overlay);color:var(--text-muted)} .source-kind-production{background:var(--error-muted);color:var(--error)} .source-kind-test,.source-kind-test_util{background:var(--info-muted);color:var(--info)} .source-kind-fixture,.source-kind-conftest{background:var(--warning-muted);color:var(--warning)} @@ -554,10 +647,14 @@ .overview-kpi-grid--with-health .meta-item{min-width:0} .overview-kpi-grid--with-health .meta-item{min-height:0} .overview-kpi-cards 
.meta-item{display:grid;grid-template-rows:auto 1fr auto; - align-items:start;padding:var(--sp-3) var(--sp-4);gap:var(--sp-2);min-height:0} -.overview-kpi-cards .meta-item .meta-label{font-size:.75rem;min-height:18px} + align-items:start;padding:var(--sp-3) var(--sp-4);gap:var(--sp-2);min-height:0; + box-shadow:var(--shadow-sm);transition:border-color var(--dur-fast) var(--ease), + box-shadow var(--dur-normal) var(--ease),transform var(--dur-fast) var(--ease)} +.overview-kpi-cards .meta-item:hover{box-shadow:var(--shadow-md);transform:translateY(-1px)} +.overview-kpi-cards .meta-item .meta-label{font-size:.68rem;min-height:18px; + text-transform:uppercase;letter-spacing:.05em;font-weight:600} .overview-kpi-cards .meta-item .meta-value{display:flex;align-items:center; - font-size:1.55rem;line-height:1;padding:var(--sp-1) 0} + font-size:1.85rem;line-height:1;padding:var(--sp-1) 0;letter-spacing:-0.02em} .overview-kpi-cards .kpi-detail{margin-top:0;gap:4px;align-self:end} .overview-kpi-cards .kpi-micro{padding:2px 6px;font-size:.65rem} .overview-kpi-grid--with-health .overview-health-card{padding:var(--sp-2)} @@ -596,10 +693,11 @@ transition:stroke-dashoffset 1s var(--ease)} .health-ring-label{position:absolute;inset:0;display:flex;flex-direction:column; align-items:center;justify-content:center} -.health-ring-score{font-family:var(--font-numeric);font-size:1.85rem;font-weight:680; +.health-ring-score{font-family:var(--font-numeric);font-size:2.15rem;font-weight:700; color:var(--text-primary);font-variant-numeric:tabular-nums;line-height:1; - letter-spacing:-0.018em} -.health-ring-grade{font-size:.72rem;font-weight:500;color:var(--text-muted);margin-top:3px} + letter-spacing:-0.022em} +.health-ring-grade{font-size:.7rem;font-weight:600;color:var(--text-muted);margin-top:4px; + text-transform:uppercase;letter-spacing:.06em} .health-ring-delta{font-size:.65rem;font-weight:600;margin-top:3px} .health-ring-delta--up{color:var(--success)} 
.health-ring-delta--down{color:var(--error)} @@ -664,13 +762,20 @@ .meta-item .meta-value--bad{color:var(--error)} .meta-item .meta-value--warn{color:var(--warning)} .meta-item .meta-value--muted{color:var(--text-muted)} +.meta-item .meta-value--accent{color:var(--accent-primary)} +.meta-item .meta-value-sec{font-family:var(--font-numeric);font-size:.9rem;font-weight:500; + color:var(--text-muted);margin-left:5px;letter-spacing:0} +.meta-item .meta-subtext{font-family:var(--font-sans);font-size:.7rem;color:var(--text-muted); + margin-top:3px;line-height:1.35} +.meta-item--accent{border-color:var(--accent-primary)} +.meta-item--accent:hover{border-color:var(--accent-primary)} .kpi-detail{display:flex;flex-wrap:wrap;gap:3px;margin-top:2px} .kpi-detail code{font-size:.78rem} -.kpi-micro{display:inline-flex;align-items:center;gap:3px;font-size:.62rem; +.kpi-micro{display:inline-flex;align-items:center;gap:3px;font-size:var(--count-size); padding:1px 5px;border-radius:var(--radius-sm);background:var(--bg-raised); - white-space:nowrap;line-height:1.3;font-family:inherit} -.kpi-micro-val{font-family:inherit;font-weight:500;font-variant-numeric:tabular-nums; - color:var(--text-muted)} + white-space:nowrap;line-height:1.3;font-family:var(--font-sans)} +.kpi-micro-val{font-family:var(--count-font);font-weight:var(--count-weight); + font-variant-numeric:tabular-nums;color:var(--text-muted)} .kpi-micro-lbl{font-weight:400;color:var(--text-muted);text-transform:lowercase} .kpi-micro--baselined{color:var(--success);font-weight:500;font-size:.6rem} .kpi-delta{font-size:.62rem;font-weight:700;margin-left:auto; @@ -700,7 +805,8 @@ .overview-cluster-copy{font-size:.82rem;color:var(--text-muted);margin-top:2px} .overview-cluster-empty{display:flex;flex-direction:column;align-items:center;gap:var(--sp-2); padding:var(--sp-5);text-align:center;color:var(--text-muted);font-size:.85rem} -.empty-icon{color:var(--success);opacity:.35;width:32px;height:32px;flex-shrink:0} 
+.empty-icon{color:var(--success);opacity:.35;width:32px;height:32px;flex-shrink:0; + margin-bottom:var(--sp-3)} .overview-list{display:grid;grid-template-columns:repeat(2,1fr);gap:var(--sp-2)} /* Overview rows */ @@ -771,7 +877,7 @@ .breakdown-row .source-kind-badge{justify-content:center;min-width:0;width:100%;text-align:center} .breakdown-count{font-size:.8rem;font-weight:600;font-variant-numeric:tabular-nums; color:var(--text-primary);text-align:right} -.breakdown-bar-track{height:6px;border-radius:3px;background:var(--bg-raised);overflow:hidden} +.breakdown-bar-track{height:6px;border-radius:3px;background:var(--bg-raised);overflow:hidden;display:flex} .breakdown-bar-fill{display:block;height:100%;border-radius:3px; background:var(--accent-primary);transition:width .6s var(--ease)} /* Directory hotspot entries */ @@ -828,7 +934,6 @@ .families-label{font-size:.75rem;font-weight:500;color:var(--text-secondary);text-align:right} .families-count{font-size:.8rem;font-weight:600;font-variant-numeric:tabular-nums; color:var(--text-primary);text-align:right} -.breakdown-bar-track{display:flex} .breakdown-bar-fill--baselined{opacity:.5} .breakdown-bar-fill--new{border-radius:0 3px 3px 0} .families-delta{font-size:.65rem;font-weight:600;font-variant-numeric:tabular-nums;white-space:nowrap} @@ -848,12 +953,23 @@ .stat-cards .meta-item .meta-value,.dep-stats .meta-item .meta-value{display:flex;align-items:center} .stat-cards .kpi-detail,.dep-stats .kpi-detail{margin-top:0;align-self:end} .dep-graph-wrap{overflow:hidden;margin-bottom:var(--sp-4);border:1px solid var(--border); - border-radius:var(--radius-lg);background:var(--bg-surface);padding:var(--sp-4)} -.dep-graph-svg{display:block;width:100%;height:auto;max-height:680px;margin:0 auto} -.dep-graph-svg text{fill:var(--text-secondary);font-family:var(--font-mono)} -.dep-node{transition:fill-opacity var(--dur-fast) var(--ease)} -.dep-edge{transition:stroke-opacity var(--dur-fast) var(--ease)} -.dep-label{transition:fill 
var(--dur-fast) var(--ease)} + border-radius:var(--radius-lg); + background:linear-gradient(180deg,var(--bg-surface),var(--bg-raised)); + padding:var(--sp-5)} +.dep-graph-svg{display:block;height:auto;margin:0 auto;overflow:visible} +.dep-graph-svg text{font-family:var(--font-mono)} +.dep-edge{transition:stroke-opacity var(--dur-fast) var(--ease),stroke-width var(--dur-fast) var(--ease)} +.block-node{transition:opacity var(--dur-fast) var(--ease),filter var(--dur-fast) var(--ease); + vector-effect:non-scaling-stroke} +.block-node-label{font-size:12px;font-weight:600;pointer-events:none; + letter-spacing:.01em;transition:opacity var(--dur-fast) var(--ease)} +.block-node-ring{pointer-events:none;transition:opacity var(--dur-fast) var(--ease); + vector-effect:non-scaling-stroke} +.dep-graph-svg[data-graph-density="wide"] .block-node-label{font-size:12.5px} +.dep-graph-svg .block-node:hover{filter:brightness(1.08) drop-shadow(0 2px 6px rgb(79 70 229 / .18))} +.mm-truncation-notice{margin-bottom:var(--sp-4);padding:var(--sp-2) var(--sp-4); + font-size:.8rem;color:var(--text-muted);background:var(--bg-raised); + border:1px solid var(--border);border-radius:var(--radius-lg)} /* Hub bar */ .dep-hub-bar{display:flex;align-items:center;gap:var(--sp-2);flex-wrap:wrap; @@ -864,7 +980,8 @@ .dep-hub-pill{display:inline-flex;align-items:center;gap:var(--sp-1);padding:var(--sp-1) var(--sp-2); border-radius:var(--radius-sm);background:var(--bg-overlay);font-size:.8rem} .dep-hub-name{color:var(--text-primary);font-family:var(--font-mono);font-size:.8rem} -.dep-hub-deg{font-size:.68rem;font-weight:600;color:var(--accent-primary); +.dep-hub-deg{font-family:var(--count-font);font-size:var(--count-size); + font-weight:var(--count-weight);font-variant-numeric:tabular-nums;color:var(--accent-primary); background:var(--accent-muted);padding:2px var(--sp-2);border-radius:var(--radius-sm)} /* Legend */ @@ -920,39 +1037,114 @@ /* List layout */ 
.suggestions-list{display:flex;flex-direction:column;gap:var(--sp-2)} -/* Card — full-width row */ -.suggestion-card{background:var(--bg-surface);border:1px solid var(--border);border-radius:var(--radius-lg); - overflow:hidden;transition:border-color var(--dur-fast) var(--ease),box-shadow var(--dur-fast) var(--ease)} -.suggestion-card:hover{border-color:var(--border-strong);box-shadow:var(--shadow-sm)} -.suggestion-card[data-severity="critical"]{border-left:3px solid var(--error)} -.suggestion-card[data-severity="warning"]{border-left:3px solid var(--warning)} -.suggestion-card[data-severity="info"]{border-left:3px solid var(--info)} - -/* Header row: severity pill · title · meta badges */ -.suggestion-head{padding:var(--sp-3) var(--sp-4);display:flex;align-items:center; - gap:var(--sp-2);flex-wrap:wrap} -.suggestion-sev{font-size:.68rem;font-weight:600;text-transform:uppercase;letter-spacing:.04em; - padding:2px var(--sp-2);border-radius:var(--radius-sm);white-space:nowrap} -.suggestion-sev--critical{background:var(--error-muted);color:var(--error)} -.suggestion-sev--warning{background:var(--warning-muted);color:var(--warning)} -.suggestion-sev--info{background:var(--info-muted);color:var(--info)} +/* Finding / review card — shared chrome (severity stripe · head · meta · body). + One source of truth for findings, suggestions, and the review queue. 
*/ +.finding-card{position:relative;display:flex;background:var(--bg-surface); + border:1px solid var(--border);border-radius:var(--radius-lg);overflow:hidden; + box-shadow:var(--shadow-sm); + transition:border-color var(--dur-fast) var(--ease), + box-shadow var(--dur-normal) var(--ease),transform var(--dur-fast) var(--ease)} +.finding-card:hover{border-color:var(--border-strong);box-shadow:var(--shadow-md); + transform:translateY(-1px)} +.finding-card-stripe{flex:0 0 4px;align-self:stretch;background:var(--border-strong)} +.finding-card--critical{border-color:color-mix(in oklch,var(--error) 22%,var(--border))} +.finding-card--critical .finding-card-stripe{background:var(--error)} +.finding-card--warning{border-color:color-mix(in oklch,var(--warning) 16%,var(--border))} +.finding-card--warning .finding-card-stripe{background:var(--warning)} +.finding-card--info .finding-card-stripe{background:var(--info)} +.finding-card-main{flex:1;min-width:0;padding:var(--sp-3) var(--sp-4)} +.finding-card-head{display:flex;justify-content:space-between;gap:var(--sp-3); + align-items:flex-start} +.finding-card-headings{min-width:0} +.finding-card-eyebrow{font-size:.66rem;text-transform:uppercase;letter-spacing:.04em; + color:var(--text-muted)} +.finding-card-title{display:flex;align-items:center;gap:var(--sp-2);margin-top:2px} +.finding-card-title-text{font-size:.9rem;font-weight:600;color:var(--text-primary); + min-width:0;overflow:hidden;text-overflow:ellipsis} +.finding-card-loc{font-family:var(--font-mono);font-size:.74rem;color:var(--text-secondary); + margin-top:4px;word-break:break-all} +.finding-card-actions{flex-shrink:0} +.finding-card-meta{display:flex;flex-wrap:wrap;gap:6px;margin-top:9px} +.finding-meta-badge{background:var(--bg-overlay);color:var(--text-muted)} +.finding-meta-badge--easy{color:var(--success);background:var(--success-muted, rgba(34,197,94,.1))} +.finding-meta-badge--moderate{color:var(--warning);background:var(--warning-muted)} 
+.finding-meta-badge--hard{color:var(--error);background:var(--error-muted)} +.finding-meta-badge--new{color:var(--accent-primary);background:var(--accent-muted); + text-transform:uppercase;letter-spacing:.04em} .suggestion-sev-inline{font-size:.68rem;font-weight:600;padding:2px var(--sp-2); border-radius:var(--radius-sm)} -.suggestion-title{font-weight:600;font-size:.85rem;color:var(--text-primary);flex:1;min-width:0} -.suggestion-meta{display:flex;align-items:center;gap:var(--sp-2);flex-shrink:0;flex-wrap:wrap} -.suggestion-meta-badge{font-size:.68rem;font-weight:600;padding:2px var(--sp-2); - border-radius:var(--radius-sm);background:var(--bg-overlay);color:var(--text-muted); - white-space:nowrap;line-height:1.2;font-variant-numeric:tabular-nums} -.suggestion-effort--easy{color:var(--success);background:var(--success-muted, rgba(34,197,94,.1))} -.suggestion-effort--moderate{color:var(--warning);background:var(--warning-muted)} -.suggestion-effort--hard{color:var(--error);background:var(--error-muted)} /* Body — context + summary */ -.suggestion-body{padding:0 var(--sp-4) var(--sp-3);display:flex;flex-direction:column;gap:var(--sp-1)} +.finding-card-body{margin-top:9px;display:flex;flex-direction:column;gap:var(--sp-1)} + +/* Overview launchpad: entry banner into the Review hub */ +.review-launchpad{display:flex;align-items:center;justify-content:space-between;gap:var(--sp-4); + flex-wrap:wrap;margin-bottom:var(--sp-4);padding:var(--sp-3) var(--sp-4); + border:1px solid var(--accent-primary);border-radius:var(--radius-lg); + background:var(--accent-muted)} +.review-launchpad-title{font-size:.95rem;font-weight:600;color:var(--text-primary)} +.review-launchpad-sevs{display:flex;flex-wrap:wrap;gap:6px;margin-top:5px} +.launchpad-sev{color:var(--text-secondary);background:var(--bg-overlay)} +.launchpad-sev--critical{color:var(--danger); + background:color-mix(in oklch,var(--danger) 14%,transparent)} +.launchpad-sev--warning{color:var(--warning); + 
background:color-mix(in oklch,var(--warning) 14%,transparent)} +.launchpad-sev--info{color:var(--info); + background:color-mix(in oklch,var(--info) 14%,transparent)} +.review-launchpad-cta{display:inline-flex;align-items:center;gap:7px;flex-shrink:0; + font-size:.82rem;font-weight:600;font-family:var(--font-sans);cursor:pointer; + padding:9px 16px;border-radius:var(--radius-md);border:0; + color:#fff;background:var(--accent-primary); + transition:background var(--dur-fast) var(--ease)} +.review-launchpad-cta:hover{background:var(--accent-hover)} + +/* Review hub: progress · filters · queue · per-item reviewed toggle */ +.review-progress{background:var(--bg-surface);border:1px solid var(--border); + border-radius:var(--radius-lg);padding:var(--sp-3) var(--sp-4);margin-bottom:var(--sp-4)} +.review-progress-head{display:flex;justify-content:space-between;align-items:baseline; + font-size:.78rem;color:var(--text-secondary);margin-bottom:7px} +.review-progress-title{font-weight:500} +.review-progress-label b{color:var(--text-primary);font-variant-numeric:tabular-nums} +.review-progress-track{height:7px;border-radius:4px;background:var(--bg-overlay);overflow:hidden} +.review-progress-bar{height:100%;border-radius:4px;background:var(--accent-primary); + transition:width var(--dur-base) var(--ease)} +.review-queue{display:flex;flex-direction:column;gap:9px} +.review-card[data-filter-hidden="true"]{display:none} + +/* Shared filter system — inline density: one-click toggle chips */ +.toolbar--filters{margin-bottom:var(--sp-4)} +.filter-chips{display:flex;flex-wrap:wrap;gap:6px} +.filter-reset{font-size:.72rem;padding:var(--sp-1) var(--sp-3);margin-right:var(--sp-2)} +.filter-chip{display:inline-flex;align-items:center;gap:6px;cursor:pointer; + font-family:var(--font-sans);font-size:.72rem;font-weight:500; + padding:4px 10px;border-radius:var(--radius-md); + background:var(--bg-overlay);color:var(--text-secondary);border:1px solid var(--border); + transition:border-color 
var(--dur-fast) var(--ease),color var(--dur-fast) var(--ease), + background var(--dur-fast) var(--ease)} +.filter-chip:hover{border-color:var(--border-strong);color:var(--text-primary)} +.filter-chip[aria-pressed="true"]{border-color:var(--accent-primary); + color:var(--accent-primary);background:var(--accent-muted)} +.filter-chip--critical[aria-pressed="true"]{border-color:var(--danger);color:var(--danger); + background:color-mix(in oklch,var(--danger) 16%,transparent)} +.filter-chip--warning[aria-pressed="true"]{border-color:var(--warning);color:var(--warning); + background:color-mix(in oklch,var(--warning) 16%,transparent)} +.filter-chip--info[aria-pressed="true"]{border-color:var(--info);color:var(--info); + background:color-mix(in oklch,var(--info) 16%,transparent)} +.filter-chip-count{font-family:var(--count-font);font-size:var(--count-size); + font-weight:var(--count-weight);font-variant-numeric:tabular-nums;opacity:.85} +.review-toggle{display:inline-flex;align-items:center;justify-content:center; + width:30px;height:30px;border-radius:8px;cursor:pointer;color:var(--text-muted); + background:transparent;border:1px solid var(--border); + transition:border-color var(--dur-fast) var(--ease),color var(--dur-fast) var(--ease)} +.review-toggle:hover{border-color:var(--accent-primary);color:var(--accent-primary)} +.review-card.is-reviewed{opacity:.55} +.review-card.is-reviewed .finding-card-title-text{text-decoration:line-through; + text-decoration-color:var(--text-muted)} +.review-card.is-reviewed .review-toggle{background:var(--accent-primary); + border-color:var(--accent-primary);color:#fff} .suggestion-context{display:flex;gap:var(--sp-1);flex-wrap:wrap} -.suggestion-chip{font-size:.68rem;font-weight:500;padding:2px var(--sp-2);border-radius:var(--radius-sm); - background:var(--bg-overlay);color:var(--text-muted);white-space:nowrap} -.suggestion-summary{font-size:.8rem;font-family:var(--font-mono);color:var(--text-secondary);line-height:1.5} 
+.suggestion-chip{background:var(--bg-overlay);color:var(--text-muted)} +.suggestion-summary{font-size:.8rem;font-family:var(--font-sans);color:var(--text-secondary);line-height:1.5} .suggestion-action{display:flex;align-items:center;gap:var(--sp-1); font-size:.8rem;font-weight:500;color:var(--accent-primary);margin-top:var(--sp-1)} .suggestion-action-icon{flex-shrink:0;color:var(--accent-primary)} @@ -1003,22 +1195,8 @@ _STRUCTURAL = """\ /* Structural findings — list layout */ .sf-list{display:flex;flex-direction:column;gap:var(--sp-2)} -.sf-card{background:var(--bg-surface);border:1px solid var(--border);border-left:3px solid var(--info); - border-radius:var(--radius-lg); - overflow:hidden;transition:border-color var(--dur-fast) var(--ease),box-shadow var(--dur-fast) var(--ease)} -.sf-card:hover{border-color:var(--border-strong);box-shadow:var(--shadow-sm)} - -/* Header row */ -.sf-head{padding:var(--sp-3) var(--sp-4);display:flex;align-items:center;gap:var(--sp-2);flex-wrap:wrap} -.sf-kind-badge{font-size:.68rem;font-weight:600;text-transform:uppercase;letter-spacing:.03em; - padding:2px var(--sp-2);border-radius:var(--radius-sm);white-space:nowrap; - background:var(--info-muted);color:var(--info)} -.sf-title{font-weight:600;font-size:.85rem;color:var(--text-primary);flex:1;min-width:0} -.sf-meta{display:flex;align-items:center;gap:var(--sp-1);flex-shrink:0;flex-wrap:wrap} +/* Card chrome is the shared .finding-card; only structural content rules below. 
*/ .sf-why-btn{font-size:.72rem;color:var(--accent-primary);font-weight:500} - -/* Body */ -.sf-body{padding:0 var(--sp-4) var(--sp-3);display:flex;flex-direction:column;gap:var(--sp-2)} .sf-chips{display:flex;flex-wrap:wrap;gap:var(--sp-1)} .sf-scope-text{font-size:.8rem;font-family:var(--font-mono);color:var(--text-secondary)} .sf-inline-action{display:flex;align-items:flex-start;gap:var(--sp-2);padding:var(--sp-2) var(--sp-3); @@ -1040,7 +1218,6 @@ .sf-table{table-layout:fixed} .sf-kind-meta{font-weight:normal;font-size:.8rem;color:var(--text-muted)} -.subsection-title{font-size:.95rem;margin:var(--sp-4) 0 var(--sp-2)} .finding-occurrences-more summary{font-size:.8rem;color:var(--accent-primary);cursor:pointer; padding:var(--sp-1) var(--sp-3)} .sf-card[data-filter-hidden="true"]{display:none} @@ -1078,7 +1255,7 @@ border:1px solid var(--border); box-shadow:0 1px 2px color-mix(in srgb,var(--text-primary) 3%,transparent)} .prov-section:last-child{margin-bottom:0} -.prov-section-title{font-size:.66rem;font-weight:700;text-transform:uppercase;letter-spacing:.09em; +.prov-section-title{font-size:.66rem;font-weight:600;text-transform:uppercase;letter-spacing:.06em; color:var(--text-secondary);margin:0 calc(-1*var(--sp-4)) var(--sp-2); padding:0 var(--sp-4) var(--sp-2);border:none; border-bottom:1px solid color-mix(in srgb,var(--border) 60%,transparent); @@ -1096,11 +1273,12 @@ /* Provenance summary badges */ .prov-summary{display:flex;flex-wrap:wrap;align-items:center;gap:6px; padding:var(--sp-2) var(--sp-4);border-top:1px solid var(--border)} -.prov-badge{display:inline-flex;align-items:center;gap:4px;font-size:.68rem; - padding:2px var(--sp-2);border-radius:var(--radius-sm);background:var(--bg-raised); +.prov-badge{display:inline-flex;align-items:center;gap:4px;font-size:var(--badge-size); + padding:2px var(--sp-2);border-radius:var(--badge-radius);background:var(--bg-raised); white-space:nowrap;line-height:1.3;border:1px solid color-mix(in srgb,var(--border) 
55%,transparent); - font-family:var(--font-mono);letter-spacing:.005em} -.prov-badge-val{font-weight:600;font-variant-numeric:tabular-nums;color:var(--text-primary)} + font-family:var(--badge-font);letter-spacing:var(--badge-tracking)} +.prov-badge-val{font-family:var(--count-font);font-weight:var(--count-weight); + font-variant-numeric:tabular-nums;color:var(--text-primary)} .prov-badge-lbl{font-weight:400;color:var(--text-muted);text-transform:lowercase} .prov-badge--inline{padding:2px 8px} .prov-badge--inline .prov-badge-val{font-weight:500} @@ -1172,7 +1350,6 @@ _EMPTY = """\ .empty{display:flex;align-items:center;justify-content:center;padding:var(--sp-10)} .empty-card{text-align:center;max-width:400px} -.empty-icon{margin-bottom:var(--sp-3);color:var(--success)} .empty-icon svg{width:40px;height:40px} .empty-card h2{margin-bottom:var(--sp-2)} .empty-card p{color:var(--text-secondary);font-size:.9rem} @@ -1258,7 +1435,7 @@ color-mix(in srgb,var(--bg-raised) 55%,transparent) 0%, var(--bg-surface) 100%)} .prov-hero-badge{display:inline-flex;align-items:center;gap:7px; - padding:6px 12px 6px 10px;border-radius:999px;font-weight:700;font-size:.78rem; + padding:6px 12px 6px 10px;border-radius:var(--radius-md);font-weight:700;font-size:.78rem; letter-spacing:.005em;white-space:nowrap;flex-shrink:0; border:1px solid var(--border);background:var(--bg-surface)} .prov-hero-icon{flex-shrink:0} @@ -1318,8 +1495,6 @@ .overview-row-spread{margin-left:0;width:100%} .suggestion-head{flex-direction:column;align-items:flex-start} .suggestion-facts{grid-template-columns:1fr} - .sf-head{flex-direction:column;align-items:flex-start} - .sf-meta{width:100%} .dir-hotspot-head{flex-wrap:wrap;align-items:flex-start} .dir-hotspot-detail{flex-wrap:wrap;align-items:flex-start} .dir-hotspot-bar-track{width:min(148px,42%);min-width:96px} diff --git a/codeclone/report/html/assets/js.py b/codeclone/report/html/assets/js.py index e4c948e5..11bed3e1 100644 --- 
a/codeclone/report/html/assets/js.py +++ b/codeclone/report/html/assets/js.py @@ -91,6 +91,14 @@ tabs.forEach(t=>t.addEventListener('click',()=>activate(t.dataset.tab))); + // Cross-tab jump buttons (e.g. Overview launchpad -> Review) + $$('[data-goto-tab]').forEach(el=>{ + el.addEventListener('click',()=>{ + const id=el.dataset.gotoTab; + if(tabs.some(t=>t.dataset.tab===id)){activate(id);window.scrollTo(0,0)} + }); + }); + // Keyboard: arrow left/right const tabList=$('[role="tablist"].main-tabs'); if(tabList){ @@ -463,33 +471,51 @@ _DEP_GRAPH = """\ (function initDepGraph(){ - const svg=$('.dep-graph-svg'); - if(!svg)return; - const nodes=$$('.dep-node'); - const labels=$$('.dep-label'); - const edges=$$('.dep-edge'); - - function highlight(name){ - nodes.forEach(n=>{n.style.fillOpacity=n.dataset.node===name?'1':'0.15'}); - labels.forEach(l=>{l.style.fill=l.dataset.node===name?'var(--text-primary)':'var(--text-muted)'; - l.style.fillOpacity=l.dataset.node===name?'1':'0.3'}); + $$('.dep-graph-svg').forEach(svg=>{ + const q=s=>[...svg.querySelectorAll(s)]; + const nodes=q('.block-node'); + const labels=q('.block-node-label'); + const rings=q('.block-node-ring'); + const edges=q('.dep-edge'); + if(!nodes.length)return; + + const adj={}; edges.forEach(e=>{ - const connected=e.dataset.source===name||e.dataset.target===name; - e.style.strokeOpacity=connected?'0.8':'0.05'; - e.style.strokeWidth=connected?'2':'1'; + const s=e.dataset.source,t=e.dataset.target; + e.dataset.baseWidth=e.getAttribute('stroke-width')||'1'; + e.dataset.baseMarker=e.getAttribute('marker-end')||''; + (adj[s]=adj[s]||new Set()).add(t); + (adj[t]=adj[t]||new Set()).add(s); }); - } - function reset(){ - nodes.forEach(n=>{n.style.fillOpacity=''}); - labels.forEach(l=>{l.style.fill='';l.style.fillOpacity=''}); - edges.forEach(e=>{e.style.strokeOpacity='';e.style.strokeWidth=''}); - } + function highlight(name){ + const near=adj[name]||new Set(); + const on=n=>n===name||near.has(n); + 
[...nodes,...labels,...rings].forEach(el=>{ + el.style.opacity=on(el.dataset.node)?'1':'0.16'; + }); + edges.forEach(e=>{ + const connected=e.dataset.source===name||e.dataset.target===name; + e.style.strokeOpacity=connected?'0.9':'0.06'; + e.style.strokeWidth=connected?String(Number(e.dataset.baseWidth||1)+0.7):e.dataset.baseWidth; + e.setAttribute('marker-end',connected?e.dataset.baseMarker:'none'); + }); + } - [...nodes,...labels].forEach(el=>{ - el.addEventListener('mouseenter',()=>highlight(el.dataset.node)); - el.addEventListener('mouseleave',reset); - el.style.cursor='pointer'; + function reset(){ + [...nodes,...labels,...rings].forEach(el=>{el.style.opacity=''}); + edges.forEach(e=>{ + e.style.strokeOpacity=''; + e.style.strokeWidth=e.dataset.baseWidth||''; + e.setAttribute('marker-end',e.dataset.baseMarker||''); + }); + } + + [...nodes,...labels].forEach(el=>{ + el.addEventListener('mouseenter',()=>highlight(el.dataset.node)); + el.addEventListener('mouseleave',reset); + el.style.cursor='pointer'; + }); }); })(); """ @@ -850,6 +876,78 @@ })(); """ +# --------------------------------------------------------------------------- +# Review hub: per-finding reviewed state (localStorage) + progress + filters +# --------------------------------------------------------------------------- + +_REVIEW = """\ +(function initReview(){ + const panel=$('[data-review-panel]'); + if(!panel)return; + const KEY='codeclone-reviewed'; + function load(){try{return new Set(JSON.parse(localStorage.getItem(KEY)||'[]'))}catch(e){return new Set()}} + function save(s){try{localStorage.setItem(KEY,JSON.stringify([...s]))}catch(e){}} + const reviewed=load(); + const cards=$$('[data-review-card]'); + const total=cards.length; + const bar=$('[data-review-progress-bar]'); + const label=$('[data-review-progress-label]'); + function refresh(){ + let done=0; + cards.forEach(c=>{ + const on=reviewed.has(c.dataset.findingId); + c.classList.toggle('is-reviewed',on); + const 
btn=c.querySelector('[data-review-toggle]'); + if(btn)btn.setAttribute('aria-pressed',on?'true':'false'); + if(on)done++; + }); + if(bar)bar.style.width=(total?Math.round(done/total*100):0)+'%'; + if(label)label.textContent=done+' / '+total; + } + panel.addEventListener('click',function(e){ + const btn=e.target.closest('[data-review-toggle]'); + if(!btn)return; + const card=btn.closest('[data-review-card]'); + if(!card)return; + const id=card.dataset.findingId; + if(reviewed.has(id))reviewed.delete(id);else reviewed.add(id); + save(reviewed);refresh(); + }); + const chips=$$('[data-filter-dim]'); + const countLabel=$('[data-review-count]'); + const resetBtn=$('[data-filter-reset]'); + function applyFilters(){ + const active={}; + chips.forEach(ch=>{ + if(ch.getAttribute('aria-pressed')==='true'){ + (active[ch.dataset.filterDim]=active[ch.dataset.filterDim]||new Set()) + .add(ch.dataset.filterValue); + } + }); + const dims=Object.keys(active); + let shown=0; + cards.forEach(c=>{ + const hide=dims.some(dim=>!active[dim].has(c.dataset[dim])); + c.setAttribute('data-filter-hidden',hide?'true':'false'); + if(!hide)shown++; + }); + if(countLabel)countLabel.textContent=shown+' shown'; + if(resetBtn)resetBtn.hidden=dims.length===0; + } + chips.forEach(ch=>ch.addEventListener('click',function(){ + ch.setAttribute('aria-pressed', + ch.getAttribute('aria-pressed')==='true'?'false':'true'); + applyFilters(); + })); + if(resetBtn)resetBtn.addEventListener('click',function(){ + chips.forEach(ch=>ch.setAttribute('aria-pressed','false')); + applyFilters(); + }); + applyFilters(); + refresh(); +})(); +""" + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -865,6 +963,7 @@ _MODALS, _SUGGESTIONS, _DEP_GRAPH, + _REVIEW, _META_PANEL, _EXPORT, _CMD_PALETTE, diff --git a/codeclone/report/html/primitives/filters.py b/codeclone/report/html/primitives/filters.py index 
f578b163..daec53a5 100644 --- a/codeclone/report/html/primitives/filters.py +++ b/codeclone/report/html/primitives/filters.py @@ -57,3 +57,26 @@ def _render_select( ) parts.append("") return "".join(parts) + + +def _render_filter_chips( + *, + dim: str, + options: Sequence[tuple[str, str, int]], +) -> str: + """Render the inline density of the shared filter system. + + One toggle chip per value of dimension *dim*. Each option is + ``(value, label, count)``. Chips carry ``data-filter-dim``/``data-filter-value`` + and ``aria-pressed`` so the shared filter JS can toggle them; the value also + gets a ``filter-chip--`` modifier for severity coloring. + """ + chips = "".join( + f'' + for value, label, count in options + ) + return f'
{chips}
' diff --git a/codeclone/report/html/sections/_clones.py b/codeclone/report/html/sections/_clones.py index b18624fb..0e0e27c5 100644 --- a/codeclone/report/html/sections/_clones.py +++ b/codeclone/report/html/sections/_clones.py @@ -247,6 +247,12 @@ def _render_suppressed_clone_panel( headers=("Kind", "Group", "File", "Type", "Occurrences", "Rule", "Pattern"), rows=rows, empty_message="No suppressed clone groups.", + column_types={ + "Kind": "chips", + "Type": "chips", + "Rule": "code", + "Pattern": "code", + }, ctx=ctx, ) diff --git a/codeclone/report/html/sections/_coupling.py b/codeclone/report/html/sections/_coupling.py index 23a860d3..faa2aa1a 100644 --- a/codeclone/report/html/sections/_coupling.py +++ b/codeclone/report/html/sections/_coupling.py @@ -212,72 +212,15 @@ def _cohesion_cards(summary: Mapping[str, object]) -> str: return f'
{"".join(cards)}
' -def _overloaded_cards( - summary: Mapping[str, object], - rows_data: Sequence[object], -) -> str: - candidates = _as_int(summary.get("candidates")) - total_modules = _as_int(summary.get("total")) - critical = sum( - 1 - for r in rows_data - if str(_as_mapping(r).get("candidate_status", "")).strip().lower() == "critical" - ) - scores = [ - _as_int(_as_mapping(r).get("score")) - for r in rows_data - if _as_int(_as_mapping(r).get("score")) > 0 - ] - max_score = max(scores) if scores else 0 - locs = [ - _as_int(_as_mapping(r).get("loc")) - for r in rows_data - if _as_int(_as_mapping(r).get("loc")) > 0 - ] - avg_loc = int(sum(locs) / len(locs)) if locs else 0 - cards = [ - _stat_card( - "Overloaded", - candidates, - detail=_micro_badges(("total analyzed", total_modules)), - value_tone="bad" if candidates > 0 else "good", - glossary_tip_fn=glossary_tip, - ), - _stat_card( - "Critical", - critical, - value_tone="bad" if critical > 0 else "good", - glossary_tip_fn=glossary_tip, - ), - _stat_card( - "Max score", - max_score, - detail=_micro_badges(("threshold", summary.get("threshold", "n/a"))), - value_tone="warn" if max_score > 0 else "muted", - glossary_tip_fn=glossary_tip, - ), - _stat_card( - "Avg LOC", - avg_loc, - detail=_micro_badges(("modules", len(locs))), - value_tone="muted", - glossary_tip_fn=glossary_tip, - ), - ] - return f'
{"".join(cards)}
' - - def render_quality_panel(ctx: ReportContext) -> str: """Build the unified Quality tab (Complexity + Coupling + Cohesion sub-tabs).""" coupling_summary = _as_mapping(ctx.coupling_map.get("summary")) cohesion_summary = _as_mapping(ctx.cohesion_map.get("summary")) complexity_summary = _as_mapping(ctx.complexity_map.get("summary")) - overloaded_modules_summary = _as_mapping(ctx.overloaded_modules_map.get("summary")) coverage_join_summary = coverage_join_quality_summary(ctx) coupling_high_risk = _as_int(coupling_summary.get("high_risk")) cohesion_low = _as_int(cohesion_summary.get("low_cohesion")) complexity_high_risk = _as_int(complexity_summary.get("high_risk")) - overloaded_module_candidates = _as_int(overloaded_modules_summary.get("candidates")) coverage_review_items = coverage_join_quality_count(ctx) security_surface_items = security_surfaces_quality_count(ctx) coverage_hotspots = _as_int(coverage_join_summary.get("coverage_hotspots")) @@ -296,7 +239,6 @@ def render_quality_panel(ctx: ReportContext) -> str: f"High-complexity: {complexity_high_risk}; " f"high-coupling: {coupling_high_risk}; " f"low-cohesion: {cohesion_low}; " - f"overloaded modules: {overloaded_module_candidates}; " f"security surfaces: {security_surface_items}; " f"max CC {cc_max}; " f"max CBO {coupling_summary.get('max', 'n/a')}; " @@ -310,9 +252,7 @@ def render_quality_panel(ctx: ReportContext) -> str: ) else: answer += " Coverage join unavailable." 
- if overloaded_module_candidates > 0 or ( - coupling_high_risk > 0 and cohesion_low > 0 - ): + if coupling_high_risk > 0 and cohesion_low > 0: tone = "risk" elif ( coupling_high_risk > 0 @@ -343,6 +283,7 @@ def render_quality_panel(ctx: ReportContext) -> str: headers=("Function", "File", "CC", "Nesting", "Risk"), rows=cx_rows, empty_message="Complexity metrics are not available.", + column_types={"CC": "meter", "Nesting": "meter"}, ctx=ctx, ) @@ -366,6 +307,7 @@ def render_quality_panel(ctx: ReportContext) -> str: rows=cp_rows, empty_message="Coupling metrics are not available.", raw_html_headers=("Coupled classes",), + column_types={"CBO": "meter"}, ctx=ctx, ) @@ -389,40 +331,7 @@ def render_quality_panel(ctx: ReportContext) -> str: headers=("Class", "File", "LCOM4", "Risk", "Methods", "Fields"), rows=ch_rows, empty_message="Cohesion metrics are not available.", - ctx=ctx, - ) - - gm_rows_data = _as_sequence(ctx.overloaded_modules_map.get("items")) - gm_rows = [ - ( - str(_as_mapping(r).get("module", "")), - str( - _as_mapping(r).get("relative_path") - or _as_mapping(r).get("filepath") - or "" - ), - str(_as_mapping(r).get("score", "")), - str(_as_mapping(r).get("candidate_status", "")), - str(_as_mapping(r).get("loc", "")), - f"{_as_mapping(r).get('fan_in', '')}/{_as_mapping(r).get('fan_out', '')}", - str(_as_mapping(r).get("complexity_total", "")), - ) - for r in gm_rows_data[:50] - ] - gm_panel = _overloaded_cards( - overloaded_modules_summary, gm_rows_data - ) + render_rows_table( - headers=( - "Module", - "File", - "Score", - "Status", - "LOC", - "Fan-in/out", - "Complexity total", - ), - rows=gm_rows, - empty_message="Overloaded-module profiling is not available.", + column_types={"LCOM4": "meter", "Methods": "meter", "Fields": "meter"}, ctx=ctx, ) @@ -430,12 +339,6 @@ def render_quality_panel(ctx: ReportContext) -> str: ("complexity", "Complexity", complexity_high_risk, cx_panel), ("coupling", "Coupling (CBO)", coupling_high_risk, cp_panel), ("cohesion", 
"Cohesion (LCOM4)", cohesion_low, ch_panel), - ( - "overloaded-modules", - "Overloaded Modules", - overloaded_module_candidates, - gm_panel, - ), ] coverage_join_panel = render_coverage_join_panel(ctx) if coverage_join_panel: diff --git a/codeclone/report/html/sections/_coverage_join.py b/codeclone/report/html/sections/_coverage_join.py index 48218504..f0fba344 100644 --- a/codeclone/report/html/sections/_coverage_join.py +++ b/codeclone/report/html/sections/_coverage_join.py @@ -85,6 +85,7 @@ def render_coverage_join_panel(ctx: ReportContext) -> str: empty_message=_coverage_join_empty_message(), empty_description=_coverage_join_empty_description(), raw_html_headers=("Location",), + column_types={"CC": "meter", "Status": "chips"}, ctx=ctx, ) ) diff --git a/codeclone/report/html/sections/_dead_code.py b/codeclone/report/html/sections/_dead_code.py index ffdad1d9..10c4e59b 100644 --- a/codeclone/report/html/sections/_dead_code.py +++ b/codeclone/report/html/sections/_dead_code.py @@ -102,6 +102,7 @@ def render_dead_code_panel(ctx: ReportContext) -> str: headers=("Name", "File", "Line", "Kind", "Confidence", "Rule", "Source"), rows=suppressed_rows, empty_message="No suppressed dead-code candidates.", + column_types={"Source": "source_kind"}, ctx=ctx, ) diff --git a/codeclone/report/html/sections/_dependencies.py b/codeclone/report/html/sections/_dependencies.py index c3fbbfbe..483da055 100644 --- a/codeclone/report/html/sections/_dependencies.py +++ b/codeclone/report/html/sections/_dependencies.py @@ -8,10 +8,10 @@ from __future__ import annotations -import math -from collections.abc import Mapping, Sequence +from collections.abc import Sequence from typing import TYPE_CHECKING +from codeclone.metrics.dependencies import select_dependency_graph_nodes from codeclone.utils import coerce as _coerce from ..primitives.escape import _escape_html @@ -23,6 +23,14 @@ _tab_empty, ) from ..widgets.components import Tone, insight_block +from ..widgets.dep_graph_layout import ( + 
BlockNodeStyle, + _build_cycle_edges, + _build_degree_maps, + _hub_threshold, + block_node_style_for, + render_block_diagram, +) from ..widgets.glossary import glossary_tip from ..widgets.tables import render_rows_table @@ -41,327 +49,33 @@ def _select_dep_nodes( dep_cycles: Sequence[object], longest_chains: Sequence[object], ) -> tuple[list[str], list[tuple[str, str]]]: - all_nodes = sorted({part for edge in edges for part in edge}) - if len(all_nodes) > 20: - degree_count: dict[str, int] = dict.fromkeys(all_nodes, 0) - for source, target in edges: - degree_count[source] = degree_count.get(source, 0) + 1 - degree_count[target] = degree_count.get(target, 0) + 1 - all_node_set = set(all_nodes) - nodes: list[str] = [] - node_set: set[str] = set() - - def _seed_node(node: object) -> None: - node_name = str(node).strip() - if ( - not node_name - or node_name not in all_node_set - or node_name in node_set - or len(nodes) >= 20 - ): - return - nodes.append(node_name) - node_set.add(node_name) - - # Keep the visual graph aligned with the dependency tables. When we - # downsample a large graph, cycle members and longest-chain nodes must - # remain visible instead of being dropped behind high-degree hubs. - for cycle in dep_cycles: - for node in _as_sequence(cycle): - _seed_node(node) - for chain in longest_chains: - for node in _as_sequence(chain): - _seed_node(node) - - for node in sorted( - all_nodes, key=lambda item: (-degree_count.get(item, 0), item) - ): - _seed_node(node) - if len(nodes) >= 20: - break - nodes.sort() - else: - nodes = all_nodes - node_set = set(nodes) - filtered = [ - (source, target) - for source, target in edges - if source in node_set and target in node_set - ][:100] + # Shared deterministic sampler (metrics.dependencies). Dependencies tab keeps + # its historical caps (20 nodes / 100 edges) and module-level identity zoom. 
+ nodes, filtered, _truncation = select_dependency_graph_nodes( + edges, + dep_cycles=dep_cycles, + longest_chains=longest_chains, + max_nodes=20, + max_edges=100, + ) return nodes, filtered -def _build_degree_maps( - nodes: Sequence[str], - edges: Sequence[tuple[str, str]], -) -> tuple[dict[str, int], dict[str, int]]: - in_degree: dict[str, int] = dict.fromkeys(nodes, 0) - out_degree: dict[str, int] = dict.fromkeys(nodes, 0) - for source, target in edges: - in_degree[target] += 1 - out_degree[source] += 1 - return in_degree, out_degree - - -def _build_layer_groups( - nodes: Sequence[str], - edges: Sequence[tuple[str, str]], - in_degree: Mapping[str, int], - out_degree: Mapping[str, int], -) -> dict[int, list[str]]: - children: dict[str, list[str]] = {node: [] for node in nodes} - for source, target in edges: - children[source].append(target) - - layers: dict[str, int] = {} - roots = sorted(node for node in nodes if in_degree[node] == 0) - if not roots: - roots = sorted(nodes, key=lambda node: -out_degree.get(node, 0))[:1] - queue = list(roots) - for node in queue: - layers.setdefault(node, 0) - while queue: - node = queue.pop(0) - for child in children.get(node, []): - if child in layers: - continue - layers[child] = layers[node] + 1 - queue.append(child) - - max_layer = max(layers.values(), default=0) - for node in nodes: - if node not in layers: - layers[node] = max_layer + 1 - - layer_groups: dict[int, list[str]] = {} - for node, layer in layers.items(): - layer_groups.setdefault(layer, []).append(node) - for layer in layer_groups: - layer_groups[layer].sort() - return layer_groups - - -def _layout_dep_graph( - layer_groups: Mapping[int, Sequence[str]], +def _dep_node_style( + node: str, *, - in_degree: Mapping[str, int], - out_degree: Mapping[str, int], -) -> tuple[int, int, int, dict[str, tuple[float, float]]]: - num_layers = max(layer_groups.keys(), default=0) + 1 - max_per_layer = max((len(members) for members in layer_groups.values()), default=1) - pad_x, 
pad_y = 56.0, 36.0 - prefer_horizontal = num_layers >= 6 and num_layers > max_per_layer + 2 - - def _ordered_members(members: Sequence[str]) -> list[str]: - if not prefer_horizontal or len(members) < 3: - return list(members) - ranked = sorted( - members, - key=lambda node: ( - -(in_degree.get(node, 0) + out_degree.get(node, 0)), - node, - ), - ) - center = (len(ranked) - 1) / 2 - slot_order = sorted( - range(len(ranked)), - key=lambda index: (abs(index - center), index), - ) - ordered = [""] * len(ranked) - for node, slot in zip(ranked, slot_order, strict=False): - ordered[slot] = node - return ordered - - if prefer_horizontal: - width = max(920, min(1600, num_layers * 118 + max_per_layer * 28 + 180)) - height = max(300, max_per_layer * 84 + 104) - else: - width = max(600, min(1200, max_per_layer * 70 + 140)) - height = max(260, num_layers * 80 + 80) - - positions: dict[str, tuple[float, float]] = {} - for layer_index in range(num_layers): - members = layer_groups.get(layer_index, []) - count = len(members) - if prefer_horizontal: - members = _ordered_members(members) - layer_step = (width - 2 * pad_x) / max(1, num_layers - 1) - x = pad_x + layer_index * layer_step - fan = min(14.0, layer_step * 0.12) - offset_unit = fan / max(1, count - 1) - center = (count - 1) / 2 - for index, node in enumerate(members): - y = pad_y + (index + 0.5) * ((height - 2 * pad_y) / max(1, count)) - positions[node] = (x + (index - center) * offset_unit, y) - continue - - y = pad_y + layer_index * ((height - 2 * pad_y) / max(1, num_layers - 1)) - for index, node in enumerate(members): - x = pad_x + (index + 0.5) * ((width - 2 * pad_x) / max(1, count)) - positions[node] = (x, y) - return width, height, max_per_layer, positions - - -def _hub_threshold( - nodes: Sequence[str], in_degree: Mapping[str, int], out_degree: Mapping[str, int] -) -> int: - degrees = [in_degree.get(node, 0) + out_degree.get(node, 0) for node in nodes] - if not degrees: - return 99 - degrees_sorted = sorted(degrees, 
reverse=True) - return int(degrees_sorted[max(0, len(degrees_sorted) // 5)]) - - -def _build_node_radii( - nodes: Sequence[str], - in_degree: Mapping[str, int], - out_degree: Mapping[str, int], - cycle_node_set: set[str], + degree: int, hub_threshold: int, -) -> dict[str, float]: - node_radii: dict[str, float] = {} - for node in nodes: - degree = in_degree.get(node, 0) + out_degree.get(node, 0) - if node in cycle_node_set: - node_radii[node] = min(8.0, max(5.0, 3.5 + degree * 0.4)) - elif degree >= hub_threshold and degree > 2: - node_radii[node] = min(10.0, max(6.0, 4.0 + degree * 0.5)) - elif degree <= 1: - node_radii[node] = 3.0 - else: - node_radii[node] = min(6.0, max(3.5, 3.0 + degree * 0.3)) - return node_radii - - -def _build_svg_defs() -> str: - return ( - "" - '' - '' - '' - '' - '' - '' - "" + in_cycle: bool, +) -> BlockNodeStyle: + return block_node_style_for( + in_cycle=in_cycle, + is_hub=degree >= hub_threshold and degree > 2, + is_leaf=degree <= 1, + title=node, ) -def _build_cycle_edges(dep_cycles: Sequence[object]) -> set[tuple[str, str]]: - cycle_edges: set[tuple[str, str]] = set() - for cycle in dep_cycles: - parts = [str(part) for part in _as_sequence(cycle)] - for index in range(len(parts)): - cycle_edges.add((parts[index], parts[(index + 1) % len(parts)])) - return cycle_edges - - -def _render_dep_edges( - edges: Sequence[tuple[str, str]], - positions: Mapping[str, tuple[float, float]], - node_radii: Mapping[str, float], - cycle_edges: set[tuple[str, str]], -) -> list[str]: - rendered: list[str] = [] - for source, target in edges: - x1, y1 = positions[source] - x2, y2 = positions[target] - source_radius, target_radius = node_radii[source], node_radii[target] - dx, dy = x2 - x1, y2 - y1 - distance = math.sqrt(dx * dx + dy * dy) or 1.0 - ux, uy = dx / distance, dy / distance - x1a, y1a = x1 + ux * (source_radius + 2), y1 + uy * (source_radius + 2) - x2a, y2a = x2 - ux * (target_radius + 4), y2 - uy * (target_radius + 4) - mx = (x1a + x2a) / 2 - 
(y2a - y1a) * 0.06 - my = (y1a + y2a) / 2 + (x2a - x1a) * 0.06 - is_cycle = (source, target) in cycle_edges - stroke = "var(--danger)" if is_cycle else "var(--border-strong)" - opacity = "0.6" if is_cycle else "0.3" - marker = "dep-arrow-cycle" if is_cycle else "dep-arrow" - rendered.append( - f'' - ) - return rendered - - -def _render_dep_nodes_and_labels( - nodes: Sequence[str], - *, - positions: Mapping[str, tuple[float, float]], - node_radii: Mapping[str, float], - in_degree: Mapping[str, int], - out_degree: Mapping[str, int], - cycle_node_set: set[str], - hub_threshold: int, - max_per_layer: int, - prefer_horizontal: bool, -) -> tuple[list[str], list[str]]: - nodes_svg: list[str] = [] - labels_svg: list[str] = [] - rotate_labels = prefer_horizontal or max_per_layer > 6 - - for node in nodes: - x, y = positions[node] - radius = node_radii[node] - degree = in_degree.get(node, 0) + out_degree.get(node, 0) - label = _short_label(node) - is_cycle = node in cycle_node_set - is_hub = degree >= hub_threshold and degree > 2 - is_secondary = not is_hub and not is_cycle - - if is_cycle: - fill, fill_opacity, extra = ( - "var(--danger)", - "0.85", - 'stroke="var(--danger)" stroke-width="1.5" stroke-dasharray="3,2"', - ) - elif is_hub: - fill, fill_opacity, extra = ( - "var(--accent-primary)", - "1", - 'filter="url(#glow)"', - ) - elif degree <= 1: - fill, fill_opacity, extra = "var(--text-muted)", "0.4", "" - else: - fill, fill_opacity, extra = "var(--accent-primary)", "0.7", "" - - nodes_svg.append( - f'' - ) - - font_size = "10" if is_hub else ("8" if is_secondary else "9") - if rotate_labels: - label_x = ( - x + radius + (4 if is_secondary else 6 if prefer_horizontal else 0) - ) - label_y = ( - y - radius - (1 if is_secondary else 2 if prefer_horizontal else 6) - ) - labels_svg.append( - f'' - f"{_escape_html(node)}{_escape_html(label)}" - ) - continue - - labels_svg.append( - f'' - f"{_escape_html(node)}{_escape_html(label)}" - ) - - return nodes_svg, labels_svg - - 
def _render_dep_svg( edges: Sequence[tuple[str, str]], cycle_node_set: set[str], @@ -377,50 +91,23 @@ def _render_dep_svg( longest_chains=longest_chains, ) in_degree, out_degree = _build_degree_maps(nodes, filtered_edges) - layer_groups = _build_layer_groups(nodes, filtered_edges, in_degree, out_degree) - width, height, max_per_layer, positions = _layout_dep_graph( - layer_groups, - in_degree=in_degree, - out_degree=out_degree, - ) - prefer_horizontal = width > height hub_threshold = _hub_threshold(nodes, in_degree, out_degree) - node_radii = _build_node_radii( - nodes, - in_degree, - out_degree, - cycle_node_set, - hub_threshold, - ) cycle_edges = _build_cycle_edges(dep_cycles) - defs = _build_svg_defs() - edge_svg = _render_dep_edges(filtered_edges, positions, node_radii, cycle_edges) - node_svg, label_svg = _render_dep_nodes_and_labels( - nodes, - positions=positions, - node_radii=node_radii, - in_degree=in_degree, - out_degree=out_degree, - cycle_node_set=cycle_node_set, - hub_threshold=hub_threshold, - max_per_layer=max_per_layer, - prefer_horizontal=prefer_horizontal, - ) - label_pad = 44 if prefer_horizontal else (50 if max_per_layer > 6 else 0) - label_pad_x = 52 if prefer_horizontal else (28 if max_per_layer > 6 else 0) - vb_x = -label_pad_x - vb_y = -label_pad - vb_w = width + label_pad_x * 2 - vb_h = height + label_pad + def _style(node: str) -> BlockNodeStyle: + return _dep_node_style( + node, + degree=in_degree.get(node, 0) + out_degree.get(node, 0), + hub_threshold=hub_threshold, + in_cycle=node in cycle_node_set, + ) - return ( - '
' - f'' - f"{defs}{''.join(edge_svg)}{''.join(node_svg)}{''.join(label_svg)}" - "
" + return render_block_diagram( + nodes, + filtered_edges, + style_fn=_style, + aria_label="Module dependency graph", + danger_edges=cycle_edges, ) @@ -530,16 +217,19 @@ def render_dependencies_panel(ctx: ReportContext) -> str: else "" ) - # Legend + # Legend (box swatches matching the block-diagram nodes) legend = ( '
' '' - ' Hub' + ' Hub' '' - ' Leaf' + ' Leaf' '' - ' Cycle
' + ' ' + "Cycle
" ) # Tables @@ -581,6 +271,7 @@ def render_dependencies_panel(ctx: ReportContext) -> str: rows=dep_chain_rows, empty_message="No dependency chains detected.", raw_html_headers=("Longest chain",), + column_types={"Length": "meter"}, ctx=ctx, ) + '

Detected cycles

' diff --git a/codeclone/report/html/sections/_module_map.py b/codeclone/report/html/sections/_module_map.py new file mode 100644 index 00000000..6cde35b4 --- /dev/null +++ b/codeclone/report/html/sections/_module_map.py @@ -0,0 +1,399 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Module map panel renderer. + +Render-only: draws the precomputed ``derived.module_map`` graph (sampled +packages/modules), unwind-candidate triage, and a top-overloaded slice. No +projection math lives here — the graph, truncation, and unwind rows are +computed once in ``report.document.derived``. +""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING + +from codeclone.utils import coerce as _coerce + +from ..widgets.badges import _micro_badges, _stat_card, _tab_empty +from ..widgets.components import Tone, insight_block +from ..widgets.dep_graph_layout import ( + BlockNodeStyle, + _hub_threshold, + block_node_style_for, + render_block_diagram, +) +from ..widgets.glossary import glossary_tip +from ..widgets.tables import render_rows_table +from ..widgets.tabs import render_split_tabs + +if TYPE_CHECKING: + from .._context import ReportContext + +_as_int = _coerce.as_int +_as_float = _coerce.as_float +_as_mapping = _coerce.as_mapping +_as_sequence = _coerce.as_sequence + +_CANDIDATE = "candidate" +_OVERLOADED_TABLE_CAP = 50 +_OVERLOADED_HEADING = "Overloaded Modules" +_EMPTY_GRAPH_MESSAGE = "Dependency graph is not available." +_OVERLOADED_EMPTY_MESSAGE = "Overloaded-module profiling is not available." +_METRICS_SKIPPED = "Metrics are skipped for this run." + +# Mandatory honesty copy (spec §11): report-only, sampled SVG, full tables. 
+_MODULE_MAP_INSIGHT = ( + "Report-only import-graph signals for refactor triage. Not CI gates. The SVG " + "may show a deterministic sample of packages/modules on large repos; unwind " + "and overload tables list module-level facts for the full codebase. Verify in " + "source before editing." +) + +_MM_LEGEND = ( + '
' + '' + ' Hub' + '' + '' + ' Overload candidate' + '' + ' ' + "In cycle" + '' + ' Leaf
' +) + + +def _mm_node_title(node: Mapping[str, object], overloaded: Mapping[str, object]) -> str: + reasons = ", ".join( + str(reason) for reason in _as_sequence(overloaded.get("candidate_reasons")) + ) + title = ( + f"{node.get('id')} · in {_as_int(node.get('fan_in'))} · " + f"out {_as_int(node.get('fan_out'))} · " + f"score {_as_float(overloaded.get('score')):.2f}" + ) + if reasons: + title = f"{title} · {reasons}" + return title + + +def _mm_node_style(node: Mapping[str, object], *, hub_threshold: int) -> BlockNodeStyle: + total_degree = _as_int(node.get("total_degree")) + overloaded = _as_mapping(node.get("overloaded")) + is_candidate = str(overloaded.get("candidate_status")) == _CANDIDATE + is_tests = [str(k) for k in _as_sequence(node.get("source_kinds"))] == ["tests"] + return block_node_style_for( + in_cycle=bool(node.get("in_cycle")), + is_hub=total_degree >= hub_threshold and total_degree > 2, + is_leaf=total_degree <= 1, + ring="var(--warning)" if is_candidate else "", + dashed=is_tests, + title=_mm_node_title(node, overloaded), + ) + + +def _render_module_map_svg(graph: Mapping[str, object]) -> str: + nodes = [_as_mapping(node) for node in _as_sequence(graph.get("nodes"))] + if not nodes: + return _tab_empty(_EMPTY_GRAPH_MESSAGE) + node_ids = [str(node.get("id")) for node in nodes] + by_id = {str(node.get("id")): node for node in nodes} + edge_rows = [_as_mapping(edge) for edge in _as_sequence(graph.get("edges"))] + edges = [(str(e.get("source")), str(e.get("target"))) for e in edge_rows] + weights = { + (str(e.get("source")), str(e.get("target"))): _as_int(e.get("weight")) + for e in edge_rows + } + total_degree = {nid: _as_int(by_id[nid].get("total_degree")) for nid in node_ids} + hub_threshold = _hub_threshold(node_ids, total_degree, dict.fromkeys(node_ids, 0)) + + def _style(node_id: str) -> BlockNodeStyle: + return _mm_node_style(by_id[node_id], hub_threshold=hub_threshold) + + return render_block_diagram( + node_ids, + edges, + style_fn=_style, + 
aria_label="Module map graph", + edge_weight_fn=lambda edge: weights.get(edge, 1), + ) + + +def _mm_stat_cards( + summary: Mapping[str, object], active_graph: Mapping[str, object] +) -> str: + truncation = _as_mapping(active_graph.get("truncation")) + node_total = _as_int(truncation.get("node_universe_count")) + edge_total = _as_int(truncation.get("edge_universe_count")) + graph_subtext = ( + "deterministic sample" if bool(truncation.get("truncated")) else "full graph" + ) + cards = [ + _stat_card( + "Nodes shown", + _as_int(truncation.get("node_shown_count")), + secondary=f"/ {node_total}", + subtext=graph_subtext, + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Edges shown", + _as_int(truncation.get("edge_shown_count")), + secondary=f"/ {edge_total}", + subtext=graph_subtext, + css_class="meta-item", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Unwind candidates", + _as_int(summary.get("unwind_candidate_count")), + subtext=( + f"of {_as_int(summary.get('module_count'))} modules · " + f"{_as_int(summary.get('package_count_depth2'))} packages" + ), + value_tone="accent", + css_class="meta-item meta-item--accent", + glossary_tip_fn=glossary_tip, + ), + ] + return "".join(cards) + + +def _mm_truncation_notice(active_graph: Mapping[str, object]) -> str: + truncation = _as_mapping(active_graph.get("truncation")) + if not bool(truncation.get("truncated")): + return "" + return ( + '
' + f"Showing {_as_int(truncation.get('node_shown_count'))} of " + f"{_as_int(truncation.get('node_universe_count'))} nodes and " + f"{_as_int(truncation.get('edge_shown_count'))} of " + f"{_as_int(truncation.get('edge_universe_count'))} edges — a deterministic " + "sample seeded by cycles, then chains, then degree. Tables below are full." + "
" + ) + + +def _mm_zoom_toggle( + default_zoom: str, + graph_packages: Mapping[str, object], + graph_modules: Mapping[str, object], +) -> str: + package_count = len(_as_sequence(graph_packages.get("nodes"))) + module_count = len(_as_sequence(graph_modules.get("nodes"))) + return render_split_tabs( + group_id="module-map-zoom", + active_id=default_zoom, + tabs=[ + ( + "packages", + "Packages", + package_count, + _render_module_map_svg(graph_packages), + ), + ( + "modules", + "Modules", + module_count, + _render_module_map_svg(graph_modules), + ), + ], + ) + + +def _mm_unwind_table(unwind_candidates: Sequence[object], ctx: ReportContext) -> str: + rows = [ + ( + str(_as_mapping(row).get("module")), + str(_as_int(_as_mapping(row).get("fan_in"))), + str(_as_int(_as_mapping(row).get("fan_out"))), + f"{_as_float(_as_mapping(row).get('score')):.2f}", + str(_as_mapping(row).get("candidate_status")), + ", ".join(str(s) for s in _as_sequence(_as_mapping(row).get("signals"))), + ) + for row in unwind_candidates + ] + return render_rows_table( + headers=("Module", "Fan-in", "Fan-out", "Score", "Status", "Signals"), + rows=rows, + empty_message="No unwind candidates detected.", + column_types={ + "Fan-in": "meter", + "Fan-out": "meter", + "Score": "score", + "Status": "status", + "Signals": "chips", + }, + ctx=ctx, + ) + + +def _overloaded_cards( + summary: Mapping[str, object], + rows_data: Sequence[object], +) -> str: + candidates = _as_int(summary.get("candidates")) + total_modules = _as_int(summary.get("total")) + ranked_only = sum( + 1 + for r in rows_data + if str(_as_mapping(r).get("candidate_status", "")).strip().lower() + == "ranked_only" + ) + population_status = str(summary.get("population_status", "")).strip().lower() + max_score = _as_float(summary.get("top_score")) + if max_score <= 0.0: + row_scores = [_as_float(_as_mapping(r).get("score")) for r in rows_data] + max_score = max(row_scores) if row_scores else 0.0 + cutoff = 
_as_float(summary.get("candidate_score_cutoff")) + locs = [ + _as_int(_as_mapping(r).get("loc")) + for r in rows_data + if _as_int(_as_mapping(r).get("loc")) > 0 + ] + avg_loc = int(sum(locs) / len(locs)) if locs else 0 + cards = [ + _stat_card( + "Overloaded", + candidates, + detail=_micro_badges(("total analyzed", total_modules)), + value_tone="bad" if candidates > 0 else "good", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Ranked only", + ranked_only, + detail=_micro_badges(("population", population_status)) + if population_status + else "", + value_tone=( + "warn" + if population_status == "limited" + else ("muted" if ranked_only else "good") + ), + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Max score", + f"{max_score:.2f}", + detail=_micro_badges(("cutoff", f"{cutoff:.2f}")) if cutoff > 0.0 else "", + value_tone="warn" if max_score > 0 else "muted", + glossary_tip_fn=glossary_tip, + ), + _stat_card( + "Avg LOC", + avg_loc, + detail=_micro_badges(("modules", len(locs))), + value_tone="muted", + glossary_tip_fn=glossary_tip, + ), + ] + return f'
{"".join(cards)}
' + + +def _render_overloaded_modules_section(ctx: ReportContext) -> str: + """Render the full overloaded-modules profile (cards + table). + + Driven by ``metrics.families.overloaded_modules`` directly, so it renders + independently of dependency-graph availability — overloaded responsibility + is module-level and belongs in the Module map regardless of graph sampling. + """ + overloaded = _as_mapping(ctx.overloaded_modules_map) + if not overloaded: + return "" + summary = _as_mapping(overloaded.get("summary")) + rows_data = _as_sequence(overloaded.get("items")) + rows = [ + ( + str(_as_mapping(r).get("module", "")), + str( + _as_mapping(r).get("relative_path") + or _as_mapping(r).get("filepath") + or "" + ), + str(_as_mapping(r).get("score", "")), + str(_as_mapping(r).get("candidate_status", "")), + str(_as_mapping(r).get("loc", "")), + f"{_as_mapping(r).get('fan_in', '')}/{_as_mapping(r).get('fan_out', '')}", + str(_as_mapping(r).get("complexity_total", "")), + ) + for r in rows_data[:_OVERLOADED_TABLE_CAP] + ] + return ( + f'

{_OVERLOADED_HEADING}

' + + _overloaded_cards(summary, rows_data) + + render_rows_table( + headers=( + "Module", + "File", + "Score", + "Status", + "LOC", + "Fan-in/out", + "Complexity total", + ), + rows=rows, + empty_message=_OVERLOADED_EMPTY_MESSAGE, + column_types={ + "Score": "score", + "Status": "status", + "LOC": "meter", + "Complexity total": "meter", + }, + ctx=ctx, + ) + ) + + +def _render_graph_block(ctx: ReportContext, module_map: Mapping[str, object]) -> str: + summary = _as_mapping(module_map.get("summary")) + if not module_map or not bool(summary.get("available")): + return _tab_empty(_EMPTY_GRAPH_MESSAGE) + + default_zoom = str(module_map.get("default_zoom") or "packages") + graph_packages = _as_mapping(module_map.get("graph_packages")) + graph_modules = _as_mapping(module_map.get("graph_modules")) + active_graph = graph_packages if default_zoom == "packages" else graph_modules + + return ( + _mm_truncation_notice(active_graph) + + f'
{_mm_stat_cards(summary, active_graph)}
' + + _mm_zoom_toggle(default_zoom, graph_packages, graph_modules) + + _MM_LEGEND + + '

Unwind candidates

' + + _mm_unwind_table(_as_sequence(module_map.get("unwind_candidates")), ctx) + ) + + +def render_module_map_panel(ctx: ReportContext) -> str: + module_map = _as_mapping(ctx.derived_map.get("module_map")) + + answer = _MODULE_MAP_INSIGHT if ctx.metrics_available else _METRICS_SKIPPED + tone: Tone = "info" + insight = insight_block( + question="Where should refactoring unwind dependencies?", + answer=answer, + tone=tone, + ) + + # The import graph + unwind triage need the derived projection; the + # overloaded-modules profile is a module-level metrics view that renders + # independently (it moved here from the Quality tab — single home for + # module responsibility). + return ( + insight + + _render_graph_block(ctx, module_map) + + _render_overloaded_modules_section(ctx) + ) diff --git a/codeclone/report/html/sections/_overview.py b/codeclone/report/html/sections/_overview.py index 26a78b23..b7e7978d 100644 --- a/codeclone/report/html/sections/_overview.py +++ b/codeclone/report/html/sections/_overview.py @@ -786,6 +786,45 @@ def _overloaded_modules_section(ctx: ReportContext) -> str: ) +_LAUNCHPAD_SEVERITIES = ( + ("critical", "critical"), + ("warning", "warning"), + ("info", "info"), +) +_LAUNCHPAD_ARROW = ( + '' +) + + +def _review_launchpad_html(ctx: ReportContext) -> str: + """Entry banner: surface the review queue and jump into the Review tab.""" + derived = _as_mapping(getattr(ctx, "derived_map", {})) + summary = _as_mapping(_as_mapping(derived.get("review_queue")).get("summary")) + total = _as_int(summary.get("total")) + if total <= 0: + return "" + by_severity = _as_mapping(summary.get("by_severity")) + chips = "".join( + f'' + f"{_as_int(by_severity.get(key))} {label}" + for key, label in _LAUNCHPAD_SEVERITIES + if _as_int(by_severity.get(key)) > 0 + ) + noun = "finding" if total == 1 else "findings" + return ( + '
' + '
' + f'
{total} {noun} ready to review
' + f'
{chips}
' + "
" + '" + "
" + ) + + def render_overview_panel(ctx: ReportContext) -> str: """Build the Overview tab panel HTML.""" complexity_summary = _as_mapping(ctx.complexity_map.get("summary")) @@ -1042,6 +1081,7 @@ def _baselined_detail( answer=overview_answer, tone=overview_tone, ) + + _review_launchpad_html(ctx) + '
' + health_gauge + '
' diff --git a/codeclone/report/html/sections/_review.py b/codeclone/report/html/sections/_review.py new file mode 100644 index 00000000..9e391b45 --- /dev/null +++ b/codeclone/report/html/sections/_review.py @@ -0,0 +1,162 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Review hub panel — the prioritized, cross-family finding-review queue. + +Render-only: reads the precomputed ``derived.review_queue`` and draws each +actionable item with the shared :func:`finding_card`. A per-item reviewed toggle +and the progress bar are wired client-side (localStorage keyed by finding id); +no projection logic lives here. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from codeclone.utils import coerce as _coerce + +from ..primitives.escape import _escape_html +from ..primitives.filters import _render_filter_chips +from ..widgets.badges import _tab_empty +from ..widgets.cards import finding_card, meta_badge_html +from ..widgets.components import Tone, insight_block + +if TYPE_CHECKING: + from collections.abc import Mapping + + from .._context import ReportContext + +_as_int = _coerce.as_int +_as_float = _coerce.as_float +_as_mapping = _coerce.as_mapping +_as_sequence = _coerce.as_sequence + +_EMPTY_MESSAGE = "No findings to review." +_METRICS_SKIPPED = "Metrics are skipped for this run." +_REVIEW_INSIGHT = ( + "Findings to review, highest priority first. Mark items reviewed as you go — " + "progress is saved in your browser. Report-only triage: verify in source " + "before editing." 
+) +_FAMILY_LABELS = { + "clones": "Clones", + "structural": "Structural", + "dead_code": "Dead code", + "design": "Quality", + "metrics": "Quality", +} +_SEVERITIES = ("critical", "warning", "info") + +_REVIEW_TOGGLE = ( + '" +) + + +def _family_label(family: str) -> str: + return _FAMILY_LABELS.get(family, family or "other") + + +def _render_review_item(item: Mapping[str, object]) -> str: + finding_id = str(item.get("finding_id")) + family = str(item.get("family")) + severity = str(item.get("severity")) + effort = str(item.get("effort")) + novelty = str(item.get("novelty")) + meta_badges = [meta_badge_html(f"priority {_as_float(item.get('priority')):.2f}")] + if effort: + meta_badges.append(meta_badge_html(effort, tone=effort)) + meta_badges.append(meta_badge_html(_family_label(family))) + if novelty == "new": + meta_badges.append(meta_badge_html("new", tone="new")) + data_attrs = ( + ' data-review-card="true" ' + f'data-finding-id="{_escape_html(finding_id)}" ' + f'data-severity="{_escape_html(severity)}" ' + f'data-family="{_escape_html(family)}" ' + f'data-novelty="{_escape_html(novelty)}"' + ) + return finding_card( + severity=severity, + title=str(item.get("title")), + eyebrow=f"{_family_label(family)} · {item.get('source_kind')}", + location=str(item.get("location")), + meta_badges=tuple(meta_badges), + body_html=_escape_html(str(item.get("summary"))), + actions_html=_REVIEW_TOGGLE, + card_class="review-card", + data_attrs=data_attrs, + ) + + +def _review_progress(total: int) -> str: + return ( + '
' + '
' + 'Progress' + '' + f"0 / {total} reviewed
" + '
' + '
' + ) + + +def _review_toolbar(summary: Mapping[str, object], total: int) -> str: + """Inline density of the shared filter system: one-click chips + count.""" + by_severity = _as_mapping(summary.get("by_severity")) + by_family = _as_mapping(summary.get("by_family")) + sev_opts = tuple( + (severity, severity.title(), _as_int(by_severity.get(severity))) + for severity in _SEVERITIES + if _as_int(by_severity.get(severity)) > 0 + ) + fam_opts = tuple( + (family, _family_label(family), _as_int(count)) + for family, count in sorted(by_family.items()) + ) + return ( + '" + ) + + +def render_review_panel(ctx: ReportContext) -> str: + queue = _as_mapping(ctx.derived_map.get("review_queue")) + summary = _as_mapping(queue.get("summary")) + items = [_as_mapping(item) for item in _as_sequence(queue.get("items"))] + + answer = _REVIEW_INSIGHT if ctx.metrics_available else _METRICS_SKIPPED + tone: Tone = "info" + insight = insight_block( + question="What needs review, and in what order?", + answer=answer, + tone=tone, + ) + if not items: + return insight + _tab_empty(_EMPTY_MESSAGE) + + cards = "".join(_render_review_item(item) for item in items) + return ( + "
" + + insight + + _review_progress(len(items)) + + _review_toolbar(summary, len(items)) + + f'
{cards}
' + ) diff --git a/codeclone/report/html/sections/_structural.py b/codeclone/report/html/sections/_structural.py index 24b8db63..aa9775dc 100644 --- a/codeclone/report/html/sections/_structural.py +++ b/codeclone/report/html/sections/_structural.py @@ -34,6 +34,7 @@ ) from ..primitives.escape import _escape_html from ..widgets.badges import _source_kind_badge_html, _tab_empty +from ..widgets.cards import finding_card, meta_badge_html from ..widgets.snippets import _FileCache, _render_code_block from ..widgets.tabs import render_split_tabs @@ -401,44 +402,49 @@ def _render_finding_card( func_word = "function" if spread["functions"] == 1 else "functions" file_word = "file" if spread["files"] == 1 else "files" kind_label = _KIND_LABEL.get(group.finding_kind, group.finding_kind) - source_chip = _escape_html(source_kind_label(source_kind)) - finding_kind_chip = _escape_html(group.finding_kind.replace("_", " ")) - context_chips = ( - f'{source_chip}' - f'{finding_kind_chip}' - ) scope_text = _finding_scope_text(deduped_items) finding_id = structural_group_id(group.finding_kind, group.finding_key) chips_html = _signature_chips_html(group.signature) - return ( - f'
' - '
' - 'info' - f'{_escape_html(kind_label)}' - '' - f'' - f"{spread['functions']} {func_word} \u00b7 {spread['files']} {file_word}" - f'' - "
" - '
' - f'
{context_chips}
' + ) + spread_badge = meta_badge_html( + f"{spread['functions']} {func_word} \u00b7 {spread['files']} {file_word}" + ) + body_html = ( f'
{chips_html}
' f'
{_escape_html(scope_text)}
' f"{inline_action_html}" - "
" + ) + details_html = ( '
' f"Occurrences ({count})" f'
{table_html}
' "
" - "
", + ) + data_attrs = ( + f' id="finding-{_escape_html(finding_id)}"' + f' data-finding-id="{_escape_html(finding_id)}"' + ' data-sf-group="true"' + f' data-source-kind="{_escape_html(source_kind)}"' + f' data-spread-bucket="{_escape_html(spread_bucket)}"' + ) + + return ( + finding_card( + severity="info", + title=kind_label, + eyebrow=source_kind_label(source_kind), + meta_badges=(spread_badge,), + body_html=body_html, + details_html=details_html, + actions_html=why_button, + card_class="sf-card", + data_attrs=data_attrs, + ), source_kind, ) diff --git a/codeclone/report/html/sections/_suggestions.py b/codeclone/report/html/sections/_suggestions.py index f5ad11b2..749509be 100644 --- a/codeclone/report/html/sections/_suggestions.py +++ b/codeclone/report/html/sections/_suggestions.py @@ -31,6 +31,7 @@ from ..primitives.escape import _escape_html from ..primitives.filters import SPREAD_OPTIONS, _render_select from ..widgets.badges import _micro_badges, _stat_card, _tab_empty +from ..widgets.cards import finding_card, meta_badge_html from ..widgets.components import insight_block from ..widgets.glossary import glossary_tip @@ -136,14 +137,18 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: else "" ) - # Effort badge — color-coded - effort_cls = f" suggestion-effort--{_escape_html(s.effort)}" + # Meta badges — effort is colour-coded by the shared meta-badge tone. effort_label = s.effort.title() priority_label = _priority_badge_label(s.priority) spread_label = _spread_label( spread_functions=s.spread_functions, spread_files=s.spread_files, ) + meta_badges = ( + meta_badge_html(effort_label, tone=s.effort), + meta_badge_html(priority_label), + meta_badge_html(spread_label), + ) # Locations inside details locs_html = "" @@ -178,26 +183,8 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: f"{_escape_html(s.severity)}" ) - return ( - f'
" - # -- header row -- - '
' - f'{_escape_html(s.severity)}' - f'{_escape_html(s.title)}' - '' - f'{_escape_html(effort_label)}' - f'{_escape_html(priority_label)}' - f'{_escape_html(spread_label)}' - "
" - # -- body -- - '
' - f"{ctx_html}" - f"{_render_fact_summary(s.fact_summary)}" - f"{next_step_html}" - "
" - # -- expandable details -- + body_html = f"{ctx_html}{_render_fact_summary(s.fact_summary)}{next_step_html}" + details_html = ( '
' "Details" '
' @@ -222,7 +209,28 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: f"{locs_html}" f"{steps_html}" "
" - "
" + ) + data_attrs = _build_data_attrs( + { + "data-suggestion-card": "true", + "data-severity": s.severity, + "data-category": s.category, + "data-family": s.finding_family, + "data-source-kind": s.source_kind, + "data-clone-type": s.clone_type, + "data-actionable": actionable, + "data-spread-bucket": spread_bucket, + "data-count": str(s.fact_count), + } + ) + return finding_card( + severity=s.severity, + title=s.title, + meta_badges=meta_badges, + body_html=body_html, + details_html=details_html, + card_class="suggestion-card", + data_attrs=data_attrs, ) diff --git a/codeclone/report/html/widgets/badges.py b/codeclone/report/html/widgets/badges.py index ba77e0a3..a974ec49 100644 --- a/codeclone/report/html/widgets/badges.py +++ b/codeclone/report/html/widgets/badges.py @@ -36,13 +36,16 @@ __all__ = [ "CHECK_CIRCLE_SVG", "INFO_CIRCLE_SVG", + "_chips_html", "_inline_empty", "_micro_badges", "_quality_badge_html", "_render_chain_flow", + "_score_bar_html", "_short_label", "_source_kind_badge_html", "_stat_card", + "_status_pill_html", "_tab_empty", "_tab_empty_info", ] @@ -111,6 +114,81 @@ def _source_kind_badge_html(source_kind: str) -> str: ) +_STATUS_PILL_CLASSES: dict[str, str] = { + "candidate": "status-pill--candidate", + "ranked_only": "status-pill--ranked", + "non_candidate": "status-pill--neutral", +} + + +def _status_pill_html(status: str) -> str: + """Render a candidate-status value as a coloured pill.""" + key = status.strip().lower() + if not key: + return "" + cls = _STATUS_PILL_CLASSES.get(key, "status-pill--neutral") + return ( + f'{_escape_html(key.replace("_", " "))}' + ) + + +def _score_bar_html(value: str) -> str: + """Render a 0..1 score as an indigo progress bar plus its rounded value.""" + try: + score = float(value) + except (TypeError, ValueError): + return _escape_html(str(value)) + pct = max(0, min(100, round(score * 100))) + strong = " score-bar--strong" if score >= 0.8 else "" + return ( + f'' + f'' + f'' + f'{score:.2f}' + ) + + +def 
_metric_meter_html(value: str, *, fraction: float) -> str: + """Render a numeric metric as its value plus a magnitude bar. + + *fraction* (0..1) is the value's share of the column maximum; the bar fills + to that share and tints by band (low/mid/high) so table magnitudes read at a + glance without altering the underlying number. + """ + text = str(value).strip() + try: + float(text) + except (TypeError, ValueError): + return _escape_html(text) + pct = max(0, min(100, round(fraction * 100))) + if fraction >= 0.66: + band = " metric-meter--high" + elif fraction >= 0.33: + band = " metric-meter--mid" + else: + band = "" + return ( + f'' + f'' + f'' + f'{_escape_html(text)}' + ) + + +def _chips_html(text: str) -> str: + """Render a comma-separated string as a row of compact chips.""" + parts = [part.strip() for part in str(text).split(",") if part.strip()] + return "".join(f'{_escape_html(part)}' for part in parts) + + +def _code_chip_html(text: str) -> str: + """Render an identifier / glob value as a compact monospace code chip.""" + value = str(text).strip() + if not value or value == "-": + return _escape_html(value) + return f'{_escape_html(value)}' + + _INLINE_EMPTY_ICONS: dict[str, str] = { "good": ( '?' + secondary_html = ( + f'{_escape_html(secondary)}' + if secondary + else "" + ) + subtext_html = ( + f'
{_escape_html(subtext)}
' if subtext else "" + ) detail_html = "" if detail: detail_html = f'
{detail}
' @@ -267,7 +361,8 @@ def _stat_card( return ( f'
' f'
{_escape_html(label)}{tip_html}{delta_html}
' - f'
{_escape_html(str(value))}
' - f"{detail_html}" + f'
' + f"{_escape_html(str(value))}{secondary_html}
" + f"{subtext_html}{detail_html}" "
" ) diff --git a/codeclone/report/html/widgets/cards.py b/codeclone/report/html/widgets/cards.py new file mode 100644 index 00000000..b51753b1 --- /dev/null +++ b/codeclone/report/html/widgets/cards.py @@ -0,0 +1,97 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Shared finding/review card — one card chrome for the whole report. + +``finding_card`` is the single source of truth for the visual shell of an +actionable item (a finding, a suggestion, a review-queue entry): a severity +stripe + severity badge, a title, optional eyebrow/location, a meta-badge row, +and slots for body, expandable details, and right-aligned actions. Each surface +supplies its own slot content, so the chrome stays identical everywhere without +duplicating markup. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..primitives.escape import _escape_html +from .badges import _quality_badge_html + +if TYPE_CHECKING: + from collections.abc import Sequence + +_SEVERITIES = ("critical", "warning", "info") + + +def severity_key(severity: str) -> str: + """Normalise an arbitrary severity string to a known stripe key.""" + key = severity.strip().lower() + return key if key in _SEVERITIES else "info" + + +def meta_badge_html(text: str, *, tone: str = "") -> str: + """A compact monospace meta badge (effort, priority, spread, signals…).""" + tone_cls = f" finding-meta-badge--{_escape_html(tone)}" if tone else "" + return f'{_escape_html(text)}' + + +def finding_card( + *, + severity: str, + title: str, + eyebrow: str = "", + location: str = "", + meta_badges: Sequence[str] = (), + body_html: str = "", + details_html: str = "", + actions_html: str = "", + card_class: str = "", + data_attrs: str = "", +) -> str: + """Render the shared 
card shell. ``data_attrs`` is inserted verbatim and is + expected to carry its own leading space when non-empty.""" + sev = severity_key(severity) + extra_class = f" {card_class}" if card_class else "" + eyebrow_html = ( + f'
{_escape_html(eyebrow)}
' + if eyebrow + else "" + ) + location_html = ( + f'
{_escape_html(location)}
' + if location + else "" + ) + meta_html = ( + f'
{"".join(meta_badges)}
' + if meta_badges + else "" + ) + actions = ( + f'
{actions_html}
' + if actions_html + else "" + ) + body = f'
{body_html}
' if body_html else "" + return ( + f'
' + '' + '
' + '
' + '
' + f"{eyebrow_html}" + '
' + f"{_quality_badge_html(sev)}" + f'{_escape_html(title)}' + "
" + f"{location_html}" + "
" + f"{actions}" + "
" + f"{meta_html}{body}{details_html}" + "
" + ) diff --git a/codeclone/report/html/widgets/dep_graph_layout.py b/codeclone/report/html/widgets/dep_graph_layout.py new file mode 100644 index 00000000..8e9dac32 --- /dev/null +++ b/codeclone/report/html/widgets/dep_graph_layout.py @@ -0,0 +1,577 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. +# SPDX-License-Identifier: MPL-2.0 +# Copyright (c) 2026 Den Rozhnovskiy + +"""Shared block-diagram SVG layout for module/dependency graphs. + +Renders a layered flowchart: rectangular nodes with the label inside, stacked +top→bottom by topological depth, joined by lane-aware curved connectors whose +arrows point in import direction (``source`` → ``target``). Both the +Dependencies tab and the Module map tab draw through +:func:`render_block_diagram`, passing a per-node :class:`BlockNodeStyle` +callback, so the geometry stays identical and each tab only owns its own node +accents. +""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from hashlib import sha1 +from typing import TYPE_CHECKING + +from codeclone.utils.coerce import as_sequence + +from ..primitives.escape import _escape_html +from .badges import _short_label + +if TYPE_CHECKING: + from collections.abc import Callable, Mapping, Sequence + from collections.abc import Set as AbstractSet + +_BOX_H = 32 +_BOX_W_MIN = 76 +_BOX_W_MAX = 184 +_BOX_CHAR_W = 8 +_BOX_PAD_X = 30 +_LABEL_PAD_X = 28 +_ROW_GAP = 92 +_COL_GAP = 30 +_BLOCK_PAD = 34 +_LABEL_MAX = 20 +_MAX_ROW_WIDTH = 980 +_WRAPPED_ROW_GAP = 54 +# Fan endpoints spread across this fraction of a box edge so converging arrows +# enter/leave at distinct points instead of clumping at the centre. 
+_FAN_SPREAD_FRAC = 0.70 +_FAN_SPREAD_STEP = 17.0 +_LANE_STEP = 10.0 +_COMPACT_NODE_LIMIT = 8 +_WIDE_NODE_LIMIT = 18 +_COMPACT_RENDER_MAX = 820 +_COMFORTABLE_RENDER_MAX = 1320 +_WIDE_RENDER_MAX = 1180 + + +@dataclass(frozen=True, slots=True) +class BlockNodeStyle: + """Per-node visual accent for a block-diagram node. + + ``ring`` draws an outer halo (overload candidate); ``dashed`` dashes the box + border (test-only modules); empty strings/False mean "no accent". + """ + + fill: str + text_fill: str + stroke: str = "var(--border)" + ring: str = "" + dashed: bool = False + title: str = "" + + +def block_node_style_for( + *, + in_cycle: bool, + is_hub: bool, + is_leaf: bool, + ring: str = "", + dashed: bool = False, + title: str = "", +) -> BlockNodeStyle: + """Shared node palette for both graph tabs (single visual vocabulary). + + Precedence: cycle (danger, dashed) → hub (indigo fill) → leaf (muted) → + ordinary. ``ring`` (overload candidate) and ``dashed`` (test-only modules) + are independent accents the caller opts into. 
+ """ + if in_cycle: + return BlockNodeStyle( + fill="var(--bg-surface)", + text_fill="var(--danger)", + stroke="var(--danger)", + ring=ring, + dashed=True, + title=title, + ) + if is_hub: + return BlockNodeStyle( + fill="var(--accent-primary)", + text_fill="#fff", + stroke="var(--accent-primary)", + ring=ring, + title=title, + ) + if is_leaf: + return BlockNodeStyle( + fill="var(--bg-surface)", + text_fill="var(--text-muted)", + stroke="var(--border)", + ring=ring, + title=title, + ) + return BlockNodeStyle( + fill="var(--bg-overlay)", + text_fill="var(--text-secondary)", + stroke="var(--border-strong)", + ring=ring, + dashed=dashed, + title=title, + ) + + +def _build_degree_maps( + nodes: Sequence[str], + edges: Sequence[tuple[str, str]], +) -> tuple[dict[str, int], dict[str, int]]: + in_degree: dict[str, int] = dict.fromkeys(nodes, 0) + out_degree: dict[str, int] = dict.fromkeys(nodes, 0) + for source, target in edges: + in_degree[target] += 1 + out_degree[source] += 1 + return in_degree, out_degree + + +def _build_layer_groups( + nodes: Sequence[str], + edges: Sequence[tuple[str, str]], + in_degree: Mapping[str, int], + out_degree: Mapping[str, int], +) -> dict[int, list[str]]: + children: dict[str, list[str]] = {node: [] for node in nodes} + for source, target in edges: + children[source].append(target) + + layers: dict[str, int] = {} + roots = sorted(node for node in nodes if in_degree[node] == 0) + if not roots: + roots = sorted(nodes, key=lambda node: -out_degree.get(node, 0))[:1] + queue = list(roots) + for node in queue: + layers.setdefault(node, 0) + while queue: + node = queue.pop(0) + for child in children.get(node, []): + if child in layers: + continue + layers[child] = layers[node] + 1 + queue.append(child) + + max_layer = max(layers.values(), default=0) + for node in nodes: + if node not in layers: + layers[node] = max_layer + 1 + + layer_groups: dict[int, list[str]] = {} + for node, layer in layers.items(): + layer_groups.setdefault(layer, 
[]).append(node) + for layer in layer_groups: + layer_groups[layer].sort() + return layer_groups + + +def _hub_threshold( + nodes: Sequence[str], in_degree: Mapping[str, int], out_degree: Mapping[str, int] +) -> int: + degrees = [in_degree.get(node, 0) + out_degree.get(node, 0) for node in nodes] + if not degrees: + return 99 + degrees_sorted = sorted(degrees, reverse=True) + return int(degrees_sorted[max(0, len(degrees_sorted) // 5)]) + + +def _build_cycle_edges(dep_cycles: Sequence[object]) -> set[tuple[str, str]]: + cycle_edges: set[tuple[str, str]] = set() + for cycle in dep_cycles: + parts = [str(part) for part in as_sequence(cycle)] + for index in range(len(parts)): + cycle_edges.add((parts[index], parts[(index + 1) % len(parts)])) + return cycle_edges + + +def _box_width(label: str) -> int: + return min(_BOX_W_MAX, max(_BOX_W_MIN, len(label) * _BOX_CHAR_W + _BOX_PAD_X)) + + +def _label_fit_attrs(label: str, width: int) -> str: + """Clamp long SVG text to the node's inner width across browser fonts.""" + max_text_width = max(18.0, width - _LABEL_PAD_X) + if len(label) * _BOX_CHAR_W <= max_text_width: + return "" + return f' textLength="{max_text_width:.1f}" lengthAdjust="spacingAndGlyphs"' + + +def _edge_stroke_width(weight: int) -> int: + if weight <= 1: + return 1 + return 1 + min(2, math.floor(math.log2(weight))) + + +def _layout_block_diagram( + layer_groups: Mapping[int, Sequence[str]], + box_widths: Mapping[str, int], + degree: Mapping[str, int] | None = None, +) -> tuple[int, int, dict[str, tuple[float, float]]]: + """Place each node box centre-aligned per topological layer (top→bottom).""" + degree = degree or {} + num_layers = max(layer_groups.keys(), default=0) + 1 + + def _ordered_members(members: Sequence[str]) -> list[str]: + if len(members) < 3: + return list(members) + ranked = sorted(members, key=lambda node: (-degree.get(node, 0), node)) + center = (len(ranked) - 1) / 2 + slots = sorted(range(len(ranked)), key=lambda idx: (abs(idx - center), 
idx)) + ordered = [""] * len(ranked) + for node, slot in zip(ranked, slots, strict=False): + ordered[slot] = node + return ordered + + def _row_width(members: Sequence[str]) -> int: + if not members: + return 0 + return sum(box_widths[m] for m in members) + _COL_GAP * (len(members) - 1) + + def _wrapped_rows(members: Sequence[str]) -> list[list[str]]: + rows: list[list[str]] = [] + current: list[str] = [] + current_width = 0 + for member in members: + member_width = box_widths[member] + next_width = ( + member_width if not current else current_width + _COL_GAP + member_width + ) + if current and next_width > _MAX_ROW_WIDTH: + rows.append(current) + current = [member] + current_width = member_width + continue + current.append(member) + current_width = next_width + if current: + rows.append(current) + return rows or [[]] + + visual_rows: list[list[str]] = [] + row_logical_layers: list[int] = [] + for layer in range(num_layers): + rows = _wrapped_rows(_ordered_members(layer_groups.get(layer, []))) + visual_rows.extend(rows) + row_logical_layers.extend([layer] * len(rows)) + row_widths = [_row_width(row) for row in visual_rows] + canvas_width = max(row_widths, default=0) + + positions: dict[str, tuple[float, float]] = {} + current_y = _BOX_H / 2 + previous_layer: int | None = None + for visual_index, members in enumerate(visual_rows): + layer = row_logical_layers[visual_index] + if visual_index > 0: + current_y += _WRAPPED_ROW_GAP if previous_layer == layer else _ROW_GAP + cursor = (canvas_width - row_widths[visual_index]) / 2 + for member in members: + width = box_widths[member] + positions[member] = (cursor + width / 2, current_y) + cursor += width + _COL_GAP + previous_layer = layer + canvas_height = int( + (max((pos[1] for pos in positions.values()), default=_BOX_H / 2)) + _BOX_H / 2 + ) + return canvas_width, canvas_height, positions + + +def _marker_suffix( + nodes: Sequence[str], edges: Sequence[tuple[str, str]], aria_label: str +) -> str: + payload = "\n".join( 
+ [aria_label, *nodes, *[f"{source}->{target}" for source, target in edges]] + ) + return sha1(payload.encode("utf-8")).hexdigest()[:10] + + +def _block_diagram_defs(marker_suffix: str) -> str: + arrow_id = f"block-arrow-{marker_suffix}" + danger_id = f"block-arrow-danger-{marker_suffix}" + return ( + "" + f'' + '' + f'' + '' + "" + ) + + +def _marker_url(*, marker_suffix: str, danger: bool) -> str: + marker = "block-arrow-danger" if danger else "block-arrow" + return f"url(#{marker}-{marker_suffix})" + + +def _spread_x(center_x: float, box_width: int, rank: int, count: int) -> float: + """Distribute *count* edge endpoints across a box edge, ordered by *rank*.""" + if count <= 1: + return center_x + span = min(box_width * _FAN_SPREAD_FRAC, _FAN_SPREAD_STEP * (count - 1)) + return center_x - span / 2 + rank * (span / (count - 1)) + + +def _rank_endpoints( + edges: Sequence[tuple[str, str]], + positions: Mapping[str, tuple[float, float]], + *, + by_key: int, + sort_key: int, +) -> tuple[dict[tuple[str, str], int], dict[str, int]]: + groups: dict[str, list[tuple[str, str]]] = {} + for edge in edges: + groups.setdefault(edge[by_key], []).append(edge) + for group in groups.values(): + group.sort(key=lambda edge: positions[edge[sort_key]][0]) + rank = { + edge: index for group in groups.values() for index, edge in enumerate(group) + } + count = {node: len(group) for node, group in groups.items()} + return rank, count + + +def _rank_lanes( + edges: Sequence[tuple[str, str]], + positions: Mapping[str, tuple[float, float]], +) -> tuple[dict[tuple[str, str], int], dict[tuple[int, int], int]]: + groups: dict[tuple[int, int], list[tuple[str, str]]] = {} + for edge in edges: + sy = round(positions[edge[0]][1]) + ty = round(positions[edge[1]][1]) + groups.setdefault((sy, ty), []).append(edge) + for group in groups.values(): + group.sort( + key=lambda edge: (positions[edge[0]][0], positions[edge[1]][0], edge) + ) + rank = { + edge: index for group in groups.values() for index, 
edge in enumerate(group) + } + count = {key: len(group) for key, group in groups.items()} + return rank, count + + +def _lane_offset(rank: int, count: int) -> float: + if count <= 1: + return 0.0 + return (rank - (count - 1) / 2) * _LANE_STEP + + +def _curved_vertical_path( + exit_x: float, + exit_y: float, + entry_x: float, + entry_y: float, + *, + lane: float, +) -> str: + mid = (exit_y + entry_y) / 2 + lane + return ( + f"M{exit_x:.1f},{exit_y:.1f} " + f"C{exit_x:.1f},{mid:.1f} {entry_x:.1f},{mid:.1f} " + f"{entry_x:.1f},{entry_y:.1f}" + ) + + +def _same_layer_path( + source_x: float, + source_y: float, + target_x: float, + target_y: float, + source_width: int, + target_width: int, + *, + lane: float, +) -> str: + side = 1 if target_x >= source_x else -1 + exit_x = source_x + side * source_width / 2 + entry_x = target_x - side * target_width / 2 + lift = _BOX_H * 1.75 + abs(lane) + bend_y = min(source_y, target_y) - lift + return ( + f"M{exit_x:.1f},{source_y:.1f} " + f"C{exit_x + side * 24:.1f},{bend_y:.1f} " + f"{entry_x - side * 24:.1f},{bend_y:.1f} " + f"{entry_x:.1f},{target_y:.1f}" + ) + + +def _render_block_edges( + edges: Sequence[tuple[str, str]], + positions: Mapping[str, tuple[float, float]], + box_widths: Mapping[str, int], + box_heights: Mapping[str, int], + *, + danger_edges: AbstractSet[tuple[str, str]], + weight_fn: Callable[[tuple[str, str]], int] | None, + marker_suffix: str, +) -> list[str]: + out_rank, out_count = _rank_endpoints(edges, positions, by_key=0, sort_key=1) + in_rank, in_count = _rank_endpoints(edges, positions, by_key=1, sort_key=0) + lane_rank, lane_count = _rank_lanes(edges, positions) + rendered: list[str] = [] + for source, target in edges: + sx, sy = positions[source] + tx, ty = positions[target] + lane_key = (round(sy), round(ty)) + lane = _lane_offset(lane_rank[(source, target)], lane_count[lane_key]) + if ty > sy + box_heights[source]: + exit_x = _spread_x( + sx, box_widths[source], out_rank[(source, target)], 
out_count[source] + ) + entry_x = _spread_x( + tx, box_widths[target], in_rank[(source, target)], in_count[target] + ) + path = _curved_vertical_path( + exit_x, + sy + box_heights[source] / 2, + entry_x, + ty - box_heights[target] / 2, + lane=lane, + ) + elif ty < sy - box_heights[source]: + exit_x = _spread_x( + sx, box_widths[source], out_rank[(source, target)], out_count[source] + ) + entry_x = _spread_x( + tx, box_widths[target], in_rank[(source, target)], in_count[target] + ) + path = _curved_vertical_path( + exit_x, + sy - box_heights[source] / 2, + entry_x, + ty + box_heights[target] / 2, + lane=lane, + ) + else: + path = _same_layer_path( + sx, + sy, + tx, + ty, + box_widths[source], + box_widths[target], + lane=lane, + ) + is_danger = (source, target) in danger_edges + stroke = "var(--danger)" if is_danger else "var(--border-strong)" + opacity = "0.66" if is_danger else "0.34" + weight = weight_fn((source, target)) if weight_fn is not None else 1 + marker_url = _marker_url(marker_suffix=marker_suffix, danger=is_danger) + rendered.append( + f'' + "" + f"{_escape_html(source)} → {_escape_html(target)}" + ) + return rendered + + +def _render_block_nodes( + nodes: Sequence[str], + positions: Mapping[str, tuple[float, float]], + box_widths: Mapping[str, int], + style_fn: Callable[[str], BlockNodeStyle], +) -> list[str]: + rendered: list[str] = [] + for node in nodes: + cx, cy = positions[node] + width = box_widths[node] + x = cx - width / 2 + y = cy - _BOX_H / 2 + style = style_fn(node) + label = _short_label(node, _LABEL_MAX) + parts: list[str] = [] + if style.ring: + parts.append( + f'' + ) + dash = ' stroke-dasharray="4,3"' if style.dashed else "" + parts.append( + f'' + ) + parts.append( + f'" + f"{_escape_html(style.title or node)}" + f"{_escape_html(label)}" + ) + rendered.append("".join(parts)) + return rendered + + +def render_block_diagram( + nodes: Sequence[str], + edges: Sequence[tuple[str, str]], + *, + style_fn: Callable[[str], BlockNodeStyle], + 
aria_label: str, + danger_edges: AbstractSet[tuple[str, str]] = frozenset(), + edge_weight_fn: Callable[[tuple[str, str]], int] | None = None, +) -> str: + """Render a layered block diagram for *nodes* / *edges* as a single SVG.""" + in_degree, out_degree = _build_degree_maps(nodes, edges) + layer_groups = _build_layer_groups(nodes, edges, in_degree, out_degree) + box_widths = {node: _box_width(_short_label(node, _LABEL_MAX)) for node in nodes} + box_heights = dict.fromkeys(nodes, _BOX_H) + degree = {node: in_degree.get(node, 0) + out_degree.get(node, 0) for node in nodes} + width, height, positions = _layout_block_diagram( + layer_groups, box_widths, degree=degree + ) + marker_suffix = _marker_suffix(nodes, edges, aria_label) + + edge_svg = _render_block_edges( + edges, + positions, + box_widths, + box_heights, + danger_edges=danger_edges, + weight_fn=edge_weight_fn, + marker_suffix=marker_suffix, + ) + node_svg = _render_block_nodes(nodes, positions, box_widths, style_fn) + vb_w = width + _BLOCK_PAD * 2 + vb_h = height + _BLOCK_PAD * 2 + if len(nodes) >= _WIDE_NODE_LIMIT or vb_w >= 980: + density = "wide" + render_width = min(max(round(vb_w * 1.08), 1040), _WIDE_RENDER_MAX) + svg_style = f"width:100%;max-width:{render_width}px" + elif len(nodes) > _COMPACT_NODE_LIMIT: + density = "comfortable" + render_width = min(max(round(vb_w * 1.18), 900), _COMFORTABLE_RENDER_MAX) + svg_style = f"width:100%;max-width:{render_width}px" + else: + density = "compact" + render_width = min(round(vb_w * 1.45), _COMPACT_RENDER_MAX) + svg_style = f"width:100%;max-width:{render_width}px" + return ( + '
' + f'' + f"{_block_diagram_defs(marker_suffix)}{''.join(edge_svg)}{''.join(node_svg)}" + "
" + ) diff --git a/codeclone/report/html/widgets/icons.py b/codeclone/report/html/widgets/icons.py index 12dde5c1..49dbbaa5 100644 --- a/codeclone/report/html/widgets/icons.py +++ b/codeclone/report/html/widgets/icons.py @@ -115,6 +115,11 @@ def _svg_with_class(size: int, sw: str, body: str, *, class_name: str = "") -> s '' '', ), + "review": ( + "2", + '' + '', + ), "clones": ( "2", '' @@ -132,6 +137,13 @@ def _svg_with_class(size: int, sw: str, body: str, *, class_name: str = "") -> s '' '', ), + "module-map": ( + "2", + '' + '' + '' + '', + ), "dead-code": ( "2", '' diff --git a/codeclone/report/html/widgets/tables.py b/codeclone/report/html/widgets/tables.py index 59cbf0e6..c5ee3559 100644 --- a/codeclone/report/html/widgets/tables.py +++ b/codeclone/report/html/widgets/tables.py @@ -8,11 +8,20 @@ from __future__ import annotations -from collections.abc import Collection, Sequence +from collections.abc import Collection, Mapping, Sequence from typing import TYPE_CHECKING from ..primitives.escape import _escape_html -from .badges import _quality_badge_html, _tab_empty +from .badges import ( + _chips_html, + _code_chip_html, + _metric_meter_html, + _quality_badge_html, + _score_bar_html, + _source_kind_badge_html, + _status_pill_html, + _tab_empty, +) from .glossary import glossary_tip if TYPE_CHECKING: @@ -39,6 +48,11 @@ "kind": "76px", "steps": "120px", "coupled classes": "360px", + "fan-in": "96px", + "fan-out": "100px", + "loc": "100px", + "complexity total": "136px", + "source": "104px", } _COL_CLS: dict[str, str] = {} @@ -67,6 +81,31 @@ _COL_CLS["steps"] = "col-steps" +_CELL_RENDERERS = { + "score": _score_bar_html, + "status": _status_pill_html, + "chips": _chips_html, + "source_kind": _source_kind_badge_html, + "code": _code_chip_html, +} + + +def _safe_abs_float(value: object) -> float: + try: + return abs(float(str(value).strip())) + except (TypeError, ValueError): + return 0.0 + + +_CELL_TYPE_CLS = { + "score": "col-score", + "status": "col-badge", + 
"chips": "col-chips", + "source_kind": "col-badge", + "code": "col-code", +} + + def render_rows_table( *, headers: Sequence[str], @@ -74,14 +113,30 @@ def render_rows_table( empty_message: str, empty_description: str | None = "Nothing to report - keep up the good work.", raw_html_headers: Collection[str] = (), + column_types: Mapping[str, str] | None = None, ctx: ReportContext | None = None, ) -> str: - """Render a data table with badges, tooltips, and col sizing.""" + """Render a data table with badges, tooltips, and col sizing. + + *column_types* maps a header to a typed cell renderer: ``"score"`` (indigo + progress bar + value), ``"status"`` (candidate-status pill), or ``"chips"`` + (comma-separated values as compact chips). Typed columns own their own + badge markup, so the table stays the single rendering authority. + """ if not rows: return _tab_empty(empty_message, description=empty_description) lower_headers = [h.lower() for h in headers] raw_html_set = {h.lower() for h in raw_html_headers} + typed_cols = {h.lower(): t for h, t in (column_types or {}).items()} + + # Meter columns self-scale: each bar fills relative to that column's max. 
+ meter_max: dict[int, float] = {} + for col_idx, header in enumerate(lower_headers): + if typed_cols.get(header) != "meter": + continue + values = [_safe_abs_float(row[col_idx]) for row in rows if col_idx < len(row)] + meter_max[col_idx] = max([*values, 0.0]) # colgroup cg = [""] @@ -98,6 +153,15 @@ def render_rows_table( # tbody def _td(col_idx: int, cell: str) -> str: h = lower_headers[col_idx] if col_idx < len(lower_headers) else "" + cell_type = typed_cols.get(h) + if cell_type == "meter": + colmax = meter_max.get(col_idx, 0.0) + fraction = _safe_abs_float(cell) / colmax if colmax > 0 else 0.0 + meter = _metric_meter_html(cell, fraction=fraction) + return f'{meter}' + if cell_type in _CELL_RENDERERS: + cls = _CELL_TYPE_CLS[cell_type] + return f'{_CELL_RENDERERS[cell_type](cell)}' cls = _COL_CLS.get(h, "") cls_attr = f' class="{cls}"' if cls else "" if h in raw_html_set: diff --git a/codeclone/report/html/widgets/tabs.py b/codeclone/report/html/widgets/tabs.py index 5b708e3a..4710187b 100644 --- a/codeclone/report/html/widgets/tabs.py +++ b/codeclone/report/html/widgets/tabs.py @@ -18,20 +18,25 @@ def render_split_tabs( group_id: str, tabs: Sequence[tuple[str, str, int, str]], emit_clone_counters: bool = False, + active_id: str | None = None, ) -> str: """Render sub-tab navigation + panels. - Each tab tuple: ``(tab_id, label, count, panel_html)``. + Each tab tuple: ``(tab_id, label, count, panel_html)``. ``active_id`` selects + which tab starts active; when omitted the first tab is active. """ if not tabs: return "" + def _is_active(idx: int, tab_id: str) -> bool: + return tab_id == active_id if active_id is not None else idx == 0 + nav: list[str] = [ '