From 02a2130682b6ca470a178ab4b3c83f69856eaded Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 15:35:34 +0200 Subject: [PATCH 01/12] Initial version. --- skills/test-unit-review/SKILL.md | 104 +++++++ skills/test-unit-review/evals/evals.json | 127 +++++++++ .../files/compliant-auth-service-tests.py | 101 +++++++ .../missing-coverage-subscription-tests.py | 113 ++++++++ .../files/multi-violation-report-tests.py | 89 ++++++ skills/test-unit-review/evals/fixture-map.md | 35 +++ .../test-unit-review/evals/trigger-eval.json | 104 +++++++ skills/test-unit-standards/SKILL.md | 84 ++++++ skills/test-unit-standards/evals/evals.json | 266 ++++++++++++++++++ .../files/bad-naming-missing-boundaries.py | 49 ++++ .../evals/files/compliant-payment-tests.py | 97 +++++++ .../evals/files/copy-paste-setup.py | 66 +++++ .../missing-docstrings-stray-comments.py | 52 ++++ .../evals/files/mixed-source-and-tests.py | 62 ++++ .../evals/files/multi-violation-audit.py | 92 ++++++ .../evals/files/private-member-access.py | 51 ++++ .../evals/files/real-io-violations.py | 51 ++++ .../evals/files/shared-mutable-state.py | 37 +++ .../evals/files/source-discount-calculator.py | 45 +++ .../evals/files/weak-assertions-no-failure.py | 48 ++++ .../test-unit-standards/evals/fixture-map.md | 52 ++++ .../evals/trigger-eval.json | 152 ++++++++++ .../references/javascript-jest.md | 197 +++++++++++++ .../references/python-pytest.md | 217 ++++++++++++++ .../references/scala-munit.md | 193 +++++++++++++ skills/test-unit-write/SKILL.md | 103 +++++++ skills/test-unit-write/evals/evals.json | 150 ++++++++++ .../evals/files/partial-happy-path-only.py | 51 ++++ .../evals/files/source-order-service.py | 70 +++++ .../evals/files/source-price-calculator.py | 61 ++++ skills/test-unit-write/evals/fixture-map.md | 34 +++ .../test-unit-write/evals/trigger-eval.json | 110 ++++++++ 32 files changed, 3063 insertions(+) create mode 100644 skills/test-unit-review/SKILL.md create mode 100644 skills/test-unit-review/evals/evals.json create mode 100644 skills/test-unit-review/evals/files/compliant-auth-service-tests.py create mode 100644 skills/test-unit-review/evals/files/missing-coverage-subscription-tests.py create mode 100644 skills/test-unit-review/evals/files/multi-violation-report-tests.py create mode 100644 skills/test-unit-review/evals/fixture-map.md create mode 100644 skills/test-unit-review/evals/trigger-eval.json create mode 100644 skills/test-unit-standards/SKILL.md create mode 100644 skills/test-unit-standards/evals/evals.json create mode 100644 skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py create mode 100644 skills/test-unit-standards/evals/files/compliant-payment-tests.py create mode 100644 skills/test-unit-standards/evals/files/copy-paste-setup.py create mode 100644 skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py create mode 100644 skills/test-unit-standards/evals/files/mixed-source-and-tests.py create mode 100644 skills/test-unit-standards/evals/files/multi-violation-audit.py create mode 100644 skills/test-unit-standards/evals/files/private-member-access.py create mode 100644 skills/test-unit-standards/evals/files/real-io-violations.py create mode 100644 skills/test-unit-standards/evals/files/shared-mutable-state.py create mode 100644 skills/test-unit-standards/evals/files/source-discount-calculator.py create mode 100644 skills/test-unit-standards/evals/files/weak-assertions-no-failure.py create mode 100644 skills/test-unit-standards/evals/fixture-map.md create mode 100644 skills/test-unit-standards/evals/trigger-eval.json create mode 100644 skills/test-unit-standards/references/javascript-jest.md create mode 100644 skills/test-unit-standards/references/python-pytest.md create mode 100644 skills/test-unit-standards/references/scala-munit.md create mode 100644 skills/test-unit-write/SKILL.md create mode 100644 skills/test-unit-write/evals/evals.json create mode 100644 skills/test-unit-write/evals/files/partial-happy-path-only.py create mode 100644 skills/test-unit-write/evals/files/source-order-service.py create mode 100644 skills/test-unit-write/evals/files/source-price-calculator.py create mode 100644 skills/test-unit-write/evals/fixture-map.md create mode 100644 skills/test-unit-write/evals/trigger-eval.json diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md new file mode 100644 index 0000000..5a30d75 --- /dev/null +++ b/skills/test-unit-review/SKILL.md @@ -0,0 +1,104 @@ +--- +name: test-unit-review +description: > + Procedural skill for reviewing and auditing existing unit tests against team standards. + Activate when the user asks to review, audit, check, or give feedback on an existing test + file — including informal requests and sharing a test file for a quality opinion. + Triggers on: "review these tests", "audit this test file", "check my tests", "are these + tests good enough", "do these tests follow our standards", "LGTM on tests", + "does this test look right", "is this test missing anything", "any issues with my tests", + "give feedback on this test file", "are these tests up to scratch". + Does NOT trigger for: writing new tests (use test-unit-write), choosing test doubles + (use test-mocking-patterns), managing test data strategies (use test-data-management), + debugging why a test fails at runtime, PR reviews not specifically about test quality, + or integration test planning (use test-integration-standards). + Pairs with test-unit-standards (rules checklist) and test-unit-write (if new tests needed). +license: Proprietary +compatibility: GitHub Copilot +--- + +# Unit Test Reviewer + +**For writing new tests**, use **test-unit-write**. **For test doubles**, use **test-mocking-patterns**. + +## Step 1 — Load language reference + +Identify language/framework from imports or file extension. Load reference: + +| Language / framework | Reference file | +|---|---| +| Python + pytest | `../test-unit-standards/references/python-pytest.md` | +| JavaScript / TypeScript + Jest | `../test-unit-standards/references/javascript-jest.md` | +| Scala + MUnit | `../test-unit-standards/references/scala-munit.md` | + +## Step 2 — Read test file completely + +Read entire file first. Note: unit under test, test count, imports, dependencies touched. + +## Step 3 — Run tests if available + +| Language | Command | +|---|---| +| Python | `pytest -v` | +| TypeScript | `npx jest --no-coverage` | +| Scala | `./mvnw test -pl -Dtest=` | +| .NET | `dotnet test --filter "FullyQualifiedName~"` | + +**Failing tests = Blocker** (regardless of style). Continue standards check even if all pass. + +## Step 4 — Check against standards + +Work through each standard from `../test-unit-standards/SKILL.md` in order. +For each category, list every violation found before moving to the next. + +### 4.1 Isolation + +- Real external I/O? (network calls, file reads/writes, real DB connections) +- Shared mutable state between tests? +- Tests that depend on execution order? + +### 4.2 Scope + +- Private members accessed (see language reference for the convention)? +- Test bypassing the public API to reach internal state? + +### 4.3 Naming + +- Does each test name clearly state unit, condition, and expected outcome? +- Apply the naming format from the language reference + +### 4.4 Assertion quality + +- Weak assertions (`assert result`, `assert result is not None`) where exact value known? +- Missing assertions (test body with no assert)? +- Side-effect assertions using manual truthy instead of framework matcher? +- **Note:** `pytest.approx` is **correct** for floats — do not flag. Only flag genuine weak assertions. + +### 4.5 Coverage + +- Success path per behaviour? +- Failure path per behaviour? +- Boundary values (zero, empty, None/null, max)? + +### 4.6 Fixtures + +- Setup code copy-pasted across tests? +- Shared fixtures lacking docstring? + +## Step 5 — Report findings + +Group by severity. Cite test name, line, rule, fix suggestion. Empty sections show `(none)`. + +### Blocker + +Broken tests, real I/O calls, or shared state (suite unreliable/order-dependent). + +### Important + +Missing failure/boundary coverage, weak/absent assertions, private member access, copy-pasted setup, unreadable naming. + +### Nit + +Minor naming inconsistencies, missing fixture docstrings, style issues. + +**Explicitly confirm compliance** for each category checked when no violations found. diff --git a/skills/test-unit-review/evals/evals.json b/skills/test-unit-review/evals/evals.json new file mode 100644 index 0000000..eba3eed --- /dev/null +++ b/skills/test-unit-review/evals/evals.json @@ -0,0 +1,127 @@ +{ + "skill_name": "test-unit-review", + "evals": [ + { + "id": 1, + "category": "happy-path", + "prompt": "Review these AuthService unit tests and confirm they follow our standards. File: evals/files/compliant-auth-service-tests.py", + "expected_output": "Review confirms full compliance across all categories: isolation (no real I/O — both dependencies are MagicMock stubs); scope (no private members accessed, all tests go through login/logout public methods); naming (all names follow test___); assertions (specific values checked, exception type verified); coverage (success path, wrong-password failure path, unknown-user failure path, logout side effect all covered); fixtures (all three fixtures have docstrings; setup is not duplicated). No violations found.", + "files": [ + "evals/files/compliant-auth-service-tests.py" + ], + "expectations": [ + "Reviewer confirms isolation: no real I/O, both deps are stubs", + "Reviewer confirms scope: public interface only, no private members accessed", + "Reviewer confirms naming convention is followed for all five tests", + "Reviewer confirms assertions are specific (exact token value, exact call args)", + "Reviewer confirms success path and two failure paths are covered for login", + "Reviewer confirms logout side-effect is tested", + "Reviewer confirms all fixtures have docstrings", + "Reviewer does not fabricate violations where none exist", + "Review concludes with explicit compliance confirmation" + ] + }, + { + "id": 2, + "category": "multi-violation", + "prompt": "Full review of these ReportService tests — tell me everything that's wrong. File: evals/files/multi-violation-report-tests.py", + "expected_output": "Review identifies violations across all six categories: (1) ISOLATION — setup_method opens a real Redis connection. (2) SCOPE — test_build_subject_template calls private _build_subject; test_retry_count_after_send reads private _retry_count. (3) NAMING — test_send, test_error, test_ok, test_build_subject_template, test_retry_count_after_send, test_cache_updated_after_send all lack a clear condition/expected-outcome in the name. (4) ASSERTIONS — test_send uses 'is not None' (always True for True); test_ok uses a truthy check; test_error has no assertion; test_cache_updated_after_send asserts mock.assert_called_once() without verifying key or value args. (5) COVERAGE — no failure path (SMTP failure not surfaced, no test for empty recipient, None subject, empty body). (6) FIXTURES — smtp_mock + cache_mock + service construction copy-pasted in three standalone test functions.", + "files": [ + "evals/files/multi-violation-report-tests.py" + ], + "expectations": [ + "setup_method real Redis connection is flagged as isolation violation (Blocker)", + "test_build_subject_template calling _build_subject is flagged as scope violation", + "test_retry_count_after_send reading _retry_count is flagged as scope violation", + "test_send, test_error, test_ok naming is flagged as not following convention", + "test_send 'is not None' assertion is flagged as weak", + "test_ok truthy-only assertion is flagged as weak", + "test_error has no assertion — flagged as incomplete test", + "test_cache_updated_after_send lacks key/value argument verification", + "Missing failure-path coverage is flagged", + "Copy-pasted smtp_mock + cache_mock setup across standalone functions is flagged", + "All six categories are applied" + ] + }, + { + "id": 3, + "category": "regression", + "prompt": "Are these SubscriptionService tests complete? File: evals/files/missing-coverage-subscription-tests.py", + "expected_output": "Review confirms fixtures and isolation are fine, but flags coverage and assertion gaps: (1) COVERAGE — no failure test for subscribe when user is already subscribed (AlreadySubscribedError). (2) COVERAGE — no failure test for subscribe when plan is not found (PlanNotFoundError). (3) COVERAGE — no boundary test for empty user_id in subscribe. (4) COVERAGE — no failure test for upgrade with duration_days <= 0 (ValueError). (5) COVERAGE — no failure test for upgrade when plan is not found. (6) ASSERTION — test_upgrade_renews_expiry asserts expiry is not None instead of verifying the actual ISO timestamp value or approximate datetime.", + "files": [ + "evals/files/missing-coverage-subscription-tests.py" + ], + "expectations": [ + "Missing AlreadySubscribedError failure test for subscribe is flagged", + "Missing PlanNotFoundError failure test for subscribe is flagged", + "Missing empty user_id boundary test for subscribe is flagged", + "Missing ValueError failure test for upgrade with duration_days <= 0 is flagged", + "Missing PlanNotFoundError failure test for upgrade is flagged", + "Weak 'is not None' assertion in test_upgrade_renews_expiry is flagged", + "Reviewer confirms isolation and fixture documentation are compliant" + ] + }, + { + "id": 4, + "category": "negative", + "prompt": "Write unit tests for the SubscriptionService class.", + "expected_output": "This is a test-writing request that should route to test-unit-write, not trigger a review workflow. If handled: the skill may produce a review-style response but should ideally defer to the write skill for code generation.", + "files": [], + "expectations": [ + "Skill does not confuse a write request with a review request", + "Skill either routes to test-unit-write or notes that code generation is handled by test-unit-write" + ] + }, + { + "id": 5, + "category": "negative", + "prompt": "Why is my test throwing AttributeError: 'NoneType' object has no attribute 'send'?", + "expected_output": "This is a runtime debugging request, not a test review. The skill does not apply standards checks or audit the test file against the checklist.", + "files": [], + "expectations": [ + "Skill does not apply unit test standards rules to a debugging question", + "Response addresses the runtime error, not test quality" + ] + }, + { + "id": 6, + "category": "paraphrase", + "prompt": "Look at these tests and tell me if they're up to scratch.", + "expected_output": "Agent applies the full six-category review checklist to the provided test file. Reports on: isolation (real I/O), scope (private member access), naming (test___), assertions (specific values vs truthy), coverage (failure paths, boundaries), fixtures (shared setup, docstrings). Groups findings as Blocker / Important / Nit. Confirms compliance explicitly if all categories pass.", + "files": [], + "expectations": [ + "Identifies 'up to scratch' as a unit test review request", + "Applies all six review categories", + "Groups findings as Blocker / Important / Nit", + "Confirms explicit compliance if no violations found" + ] + }, + { + "id": 7, + "category": "edge-case", + "prompt": "A test uses pytest.approx to assert a floating point result. Is this a violation?", + "expected_output": "No. pytest.approx is the correct way to assert on floating-point results — it is not a weak assertion. A weak assertion would be 'result > 0' or 'result is not None'. pytest.approx(0.30, rel=1e-6) asserts the value is approximately 0.30 within a relative tolerance. This is a specific assertion that kills arithmetic operator mutants. The review should not flag pytest.approx as a violation — it should note it as a compliant assertion for floating-point values.", + "files": [], + "expectations": [ + "Does not flag pytest.approx as a violation", + "Explains that pytest.approx is a specific, correct assertion for floats", + "Distinguishes from genuinely weak assertions (> 0, is not None)", + "Notes pytest.approx kills arithmetic operator mutants" + ] + }, + { + "id": 8, + "category": "output-format", + "prompt": "What does a compliant unit test review report look like? Show the exact output format with a Blocker and an Important finding.", + "expected_output": "The review report uses three H3 sections: '### Blocker', '### Important', '### Nit'. Each finding is a bulleted item with: (1) the test name or location in backticks, (2) the violated rule (e.g. 'Isolation — real HTTP call detected'), (3) a one-sentence fix. Sections with no findings still appear with '(none)'. A final compliance status line ends the report.", + "files": [], + "expectations": [ + "Three H3 sections: Blocker, Important, Nit", + "Each finding is a bulleted item", + "Finding format: backtick location + rule name + one-sentence fix", + "Empty sections show (none)", + "Final compliance status line present" + ] + } + ] +} diff --git a/skills/test-unit-review/evals/files/compliant-auth-service-tests.py b/skills/test-unit-review/evals/files/compliant-auth-service-tests.py new file mode 100644 index 0000000..de7b998 --- /dev/null +++ b/skills/test-unit-review/evals/files/compliant-auth-service-tests.py @@ -0,0 +1,101 @@ +""" +Compliant unit tests for AuthService — review eval fixture. + +All standards are met: isolation, scope, naming, assertions, coverage, fixtures. +""" + +import pytest +from unittest.mock import MagicMock +from datetime import datetime, timezone, timedelta + + +class UnauthorizedError(Exception): + pass + + +class AuthService: + def __init__(self, token_repo, session_store): + self._token_repo = token_repo + self._sessions = session_store + + def login(self, username: str, password: str) -> str: + record = self._token_repo.find_user(username) + if record is None or record["password_hash"] != _hash(password): + raise UnauthorizedError("invalid credentials") + token = self._token_repo.create_token(username) + self._sessions.store(username, token) + return token + + def logout(self, token: str) -> None: + self._sessions.invalidate(token) + + +def _hash(value: str) -> str: + return f"hashed:{value}" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def token_repo(): + """Stubbed token repository with a single valid user record.""" + repo = MagicMock() + repo.find_user.return_value = { + "username": "alice", + "password_hash": "hashed:secret", + } + repo.create_token.return_value = "tok_abc123" + return repo + + +@pytest.fixture +def session_store(): + """Stubbed session store. Side effects are not tested here.""" + return MagicMock() + + +@pytest.fixture +def auth_service(token_repo, session_store): + """AuthService wired with stubbed dependencies — no real I/O.""" + return AuthService(token_repo=token_repo, session_store=session_store) + + +# --------------------------------------------------------------------------- +# login +# --------------------------------------------------------------------------- + +def test_login_with_valid_credentials_returns_token(auth_service): + """Successful login returns the token created by the repository.""" + token = auth_service.login("alice", "secret") + assert token == "tok_abc123" + + +def test_login_stores_token_in_session(auth_service, session_store): + """Successful login stores the issued token in the session store.""" + auth_service.login("alice", "secret") + session_store.store.assert_called_once_with("alice", "tok_abc123") + + +def test_login_with_unknown_user_raises_unauthorized(auth_service, token_repo): + """Login fails with UnauthorizedError when the username is not found.""" + token_repo.find_user.return_value = None + with pytest.raises(UnauthorizedError): + auth_service.login("unknown", "secret") + + +def test_login_with_wrong_password_raises_unauthorized(auth_service): + """Login fails with UnauthorizedError when the password hash does not match.""" + with pytest.raises(UnauthorizedError): + auth_service.login("alice", "wrong-password") + + +# --------------------------------------------------------------------------- +# logout +# --------------------------------------------------------------------------- + +def test_logout_invalidates_session_token(auth_service, session_store): + """Logout calls invalidate on the session store with the given token.""" + auth_service.logout("tok_abc123") + session_store.invalidate.assert_called_once_with("tok_abc123") diff --git a/skills/test-unit-review/evals/files/missing-coverage-subscription-tests.py b/skills/test-unit-review/evals/files/missing-coverage-subscription-tests.py new file mode 100644 index 0000000..8de884e --- /dev/null +++ b/skills/test-unit-review/evals/files/missing-coverage-subscription-tests.py @@ -0,0 +1,113 @@ +""" +SubscriptionService tests — missing failure paths and boundary coverage. + +Violations present: + COVERAGE — no failure path when user is already subscribed + COVERAGE — no failure path when plan is not found + COVERAGE — no boundary test for empty user_id + COVERAGE — no boundary test for negative duration_days + ASSERTIONS — test_upgrade_renews_expiry asserts only that expiry changed, not the new value +""" + +import pytest +from unittest.mock import MagicMock +from datetime import datetime, timezone, timedelta + + +class PlanNotFoundError(Exception): + pass + + +class AlreadySubscribedError(Exception): + pass + + +class SubscriptionService: + def __init__(self, plan_repo, user_repo): + self._plans = plan_repo + self._users = user_repo + + def subscribe(self, user_id: str, plan_id: str) -> dict: + if not user_id: + raise ValueError("user_id must not be empty") + plan = self._plans.find(plan_id) + if plan is None: + raise PlanNotFoundError(f"plan {plan_id!r} not found") + existing = self._users.get_subscription(user_id) + if existing is not None: + raise AlreadySubscribedError(f"user {user_id!r} is already subscribed") + sub = {"user_id": user_id, "plan_id": plan_id, "status": "active"} + self._users.save_subscription(sub) + return sub + + def upgrade(self, user_id: str, new_plan_id: str, duration_days: int) -> dict: + if duration_days <= 0: + raise ValueError("duration_days must be positive") + plan = self._plans.find(new_plan_id) + if plan is None: + raise PlanNotFoundError(f"plan {new_plan_id!r} not found") + expiry = datetime.now(timezone.utc) + timedelta(days=duration_days) + sub = {"user_id": user_id, "plan_id": new_plan_id, "expiry": expiry.isoformat()} + self._users.save_subscription(sub) + return sub + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def plan_repo(): + """Stubbed plan repository returning a standard monthly plan.""" + repo = MagicMock() + repo.find.return_value = {"id": "plan_monthly", "name": "Monthly", "price": 9.99} + return repo + + +@pytest.fixture +def user_repo(): + """Stubbed user repository. No existing subscription by default.""" + repo = MagicMock() + repo.get_subscription.return_value = None + return repo + + +@pytest.fixture +def service(plan_repo, user_repo): + """SubscriptionService wired with stubbed repositories.""" + return SubscriptionService(plan_repo=plan_repo, user_repo=user_repo) + + +# --------------------------------------------------------------------------- +# subscribe — happy path only (failure paths missing) +# --------------------------------------------------------------------------- + +def test_subscribe_with_valid_inputs_returns_subscription(service): + """Happy path: valid user and plan returns active subscription dict.""" + result = service.subscribe("user-1", "plan_monthly") + assert result["user_id"] == "user-1" + assert result["plan_id"] == "plan_monthly" + assert result["status"] == "active" + + +def test_subscribe_saves_subscription_to_repo(service, user_repo): + """Happy path: successful subscribe calls save_subscription on the user repo.""" + service.subscribe("user-1", "plan_monthly") + user_repo.save_subscription.assert_called_once() + + +# --------------------------------------------------------------------------- +# upgrade — happy path only (failure paths and boundary coverage missing) +# --------------------------------------------------------------------------- + +def test_upgrade_with_valid_plan_returns_updated_subscription(service): + """Happy path: valid upgrade returns subscription with new plan id.""" + result = service.upgrade("user-1", "plan_annual", 365) + assert result["plan_id"] == "plan_annual" + assert result["user_id"] == "user-1" + + +def test_upgrade_renews_expiry(service): + """Happy path: upgrade sets an expiry date on the subscription.""" + result = service.upgrade("user-1", "plan_annual", 365) + assert result["expiry"] is not None # ASSERTION: weak — any non-None value passes diff --git a/skills/test-unit-review/evals/files/multi-violation-report-tests.py b/skills/test-unit-review/evals/files/multi-violation-report-tests.py new file mode 100644 index 0000000..cb9aa80 --- /dev/null +++ b/skills/test-unit-review/evals/files/multi-violation-report-tests.py @@ -0,0 +1,89 @@ +""" +ReportService tests — multiple violations across all six standard categories. + +Violations present: + ISOLATION — setup_method opens a real Redis connection + SCOPE — _build_subject (private method) and _retry_count (private attribute) accessed + NAMING — test_send, test_error, test_ok do not follow naming convention + ASSERTIONS — test_send uses 'is not None'; test_ok uses truthy check; test_error has no assert + COVERAGE — no failure-path or boundary tests (empty recipient list, None subject, 0-length body) + FIXTURES — redis_mock + smtp_client setup copy-pasted across standalone test functions +""" + +import pytest +import redis +from unittest.mock import MagicMock + + +class ReportService: + def __init__(self, smtp_client, cache): + self._smtp = smtp_client + self._cache = cache + self._retry_count = 0 + + def send_report(self, recipient: str, subject: str, body: str) -> bool: + self._retry_count += 1 + subject_line = self._build_subject(subject) + self._smtp.send(recipient, subject_line, body) + self._cache.set(f"last_report:{recipient}", subject) + return True + + def _build_subject(self, subject: str) -> str: + return f"[REPORT] {subject}" + + +# --------------------------------------------------------------------------- +# Class-based tests — ISOLATION VIOLATION: real Redis in setup_method +# --------------------------------------------------------------------------- + +class TestReportServiceIntegrated: + def setup_method(self): + self.cache = redis.Redis(host="localhost", port=6379) # real Redis — isolation violated + self.smtp = MagicMock() + self.service = ReportService(smtp_client=self.smtp, cache=self.cache) + + def test_send(self): # NAMING: vague — no condition or expected outcome + result = self.service.send_report("a@b.com", "Q1", "body text") + assert result is not None # ASSERTION: is-not-None is always True for True + + def test_error(self): # NAMING: vague + self.smtp.send.side_effect = Exception("SMTP failure") + try: + self.service.send_report("a@b.com", "Q1", "body") + except Exception: + pass + # no assertion — test passes regardless of behaviour + + def test_ok(self): # NAMING: vague + result = self.service.send_report("a@b.com", "Q1", "body") + assert result # ASSERTION: truthy only — True, 1, "ok" would all pass + + +# --------------------------------------------------------------------------- +# Standalone tests — FIXTURES: copy-pasted setup; SCOPE: private member access +# --------------------------------------------------------------------------- + +def test_build_subject_template(): + # SCOPE: calling private method _build_subject directly + smtp_mock = MagicMock() + cache_mock = MagicMock() + service = ReportService(smtp_client=smtp_mock, cache=cache_mock) + result = service._build_subject("Sales Update") # private — not allowed + assert result == "[REPORT] Sales Update" + + +def test_retry_count_after_send(): + # SCOPE: reading private attribute _retry_count directly + smtp_mock = MagicMock() # FIXTURES: setup copy-pasted — identical to next test + cache_mock = MagicMock() + service = ReportService(smtp_client=smtp_mock, cache=cache_mock) + service.send_report("a@b.com", "Q1", "body") + assert service._retry_count == 1 # private attribute — not allowed + + +def test_cache_updated_after_send(): + smtp_mock = MagicMock() # FIXTURES: copy-pasted setup (same as above) + cache_mock = MagicMock() + service = ReportService(smtp_client=smtp_mock, cache=cache_mock) + service.send_report("a@b.com", "Q1", "body") + cache_mock.set.assert_called_once() # ASSERTION: no check on key or value arguments diff --git a/skills/test-unit-review/evals/fixture-map.md b/skills/test-unit-review/evals/fixture-map.md new file mode 100644 index 0000000..2d48496 --- /dev/null +++ b/skills/test-unit-review/evals/fixture-map.md @@ -0,0 +1,35 @@ +# Unit Test Review — Evals Fixture Map + +Links each eval test case to its fixture file(s). + +| Test ID | Category | Fixture | +|---------|----------------|---------| +| 1 | happy-path | evals/files/compliant-auth-service-tests.py | +| 2 | multi-violation | evals/files/multi-violation-report-tests.py | +| 3 | regression | evals/files/missing-coverage-subscription-tests.py | +| 4 | negative | *(no file — write request routed away)* | +| 5 | negative | *(no file — runtime debugging question)* | + +## Fixture → Rule mapping + +| Fixture file | Primary category exercised | +|---|---| +| compliant-auth-service-tests.py | All six categories — no violations (confirms reviewer does not fabricate) | +| multi-violation-report-tests.py | All six categories — isolation, scope, naming, assertions, coverage, fixtures | +| missing-coverage-subscription-tests.py | Coverage completeness and weak assertions | + +## Coverage summary + +- happy-path: 1 +- multi-violation: 1 +- regression: 1 +- negative: 2 +- **total: 5** + +## Trigger eval coverage + +| Direction | Count | +|---|---| +| should_trigger = true | 11 | +| should_trigger = false | 6 | +| **total** | **17** | diff --git a/skills/test-unit-review/evals/trigger-eval.json b/skills/test-unit-review/evals/trigger-eval.json new file mode 100644 index 0000000..f0c830a --- /dev/null +++ b/skills/test-unit-review/evals/trigger-eval.json @@ -0,0 +1,104 @@ +[ + { + "id": "r-t01-review-tests-explicit", + "query": "Review these unit tests and tell me if they follow our standards.", + "should_trigger": true, + "reason": "Explicit review request — primary trigger." + }, + { + "id": "r-t02-audit-test-file", + "query": "Audit this test file against our unit test standards.", + "should_trigger": true, + "reason": "'Audit this test file' is a listed trigger phrase." + }, + { + "id": "r-t03-are-tests-up-to-scratch", + "query": "Are these tests up to scratch? I want to ship this tomorrow.", + "should_trigger": true, + "reason": "'Are these tests up to scratch' is a listed trigger phrase." + }, + { + "id": "r-t04-lgtm-on-tests", + "query": "LGTM on the tests?", + "should_trigger": true, + "reason": "'LGTM on tests' is a listed trigger phrase." + }, + { + "id": "r-t05-does-this-look-right", + "query": "Does this test look right to you?", + "should_trigger": true, + "reason": "'Does this test look right' is a listed trigger phrase." + }, + { + "id": "r-t06-check-my-tests", + "query": "Check my tests before I merge this PR.", + "should_trigger": true, + "reason": "'Check my tests' is a listed trigger phrase." + }, + { + "id": "r-t07-isolation-check", + "query": "Are these tests properly isolated? I want to make sure they don't call real APIs.", + "should_trigger": true, + "reason": "Isolation audit is a specific category within the review checklist." + }, + { + "id": "r-t08-any-issues-with-my-tests", + "query": "Any issues with my test file that I should fix before the sprint review?", + "should_trigger": true, + "reason": "'Any issues with my tests' maps to the review skill." + }, + { + "id": "r-t09-is-this-missing-anything", + "query": "Is this test missing any important coverage?", + "should_trigger": true, + "reason": "'Is this test missing anything' is a listed trigger phrase." + }, + { + "id": "r-t10-give-feedback", + "query": "Give me feedback on this test file.", + "should_trigger": true, + "reason": "'Give feedback on this test file' is a listed trigger phrase." + }, + { + "id": "r-t11-do-these-tests-follow-standards", + "query": "Do these tests follow our standards?", + "should_trigger": true, + "reason": "'Do these tests follow our standards' is a listed trigger phrase." + }, + { + "id": "r-n01-write-tests", + "query": "Write unit tests for the ReportService class.", + "should_trigger": false, + "reason": "Test-writing request — routes to test-unit-write, not test-unit-review." + }, + { + "id": "r-n02-add-failure-coverage", + "query": "Add failure-path tests to this file.", + "should_trigger": false, + "reason": "Adding new tests is a write task — routes to test-unit-write." + }, + { + "id": "r-n03-debug-failing-test", + "query": "Why is my test throwing NullPointerException on the mock setup?", + "should_trigger": false, + "reason": "Runtime debugging — not a standards review task." + }, + { + "id": "r-n04-spy-vs-mock", + "query": "Should I use a spy or a mock for the email service?", + "should_trigger": false, + "reason": "Test-double selection — routes to test-mocking-patterns." + }, + { + "id": "r-n05-pr-review", + "query": "Review this pull request before I merge it.", + "should_trigger": false, + "reason": "General PR review — routes to pr-review skill, not test-unit-review." + }, + { + "id": "r-n06-integration-test-plan", + "query": "Design an integration test plan for the checkout flow.", + "should_trigger": false, + "reason": "Integration test planning is out of scope for this skill." + } +] diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md new file mode 100644 index 0000000..adcfea4 --- /dev/null +++ b/skills/test-unit-standards/SKILL.md @@ -0,0 +1,84 @@ +--- +name: test-unit-standards +description: > + Unit test quality — writing, reviewing, and auditing unit tests against team standards. Activate + for any unit test quality concern: writing new unit tests, extending coverage, reviewing or + auditing existing test files, or asking about test conventions and structural rules. + Triggers on: "write unit tests", "add tests for", "review these tests", "audit this test file", + "are these tests good enough", "unit test standards", "test isolation rules", "test naming + convention", "assertion standards", "fixture reuse rules", "private member in tests", + "failure-path coverage", "add edge case tests", "are these tests up to scratch". + Does NOT trigger for: PR review (use pr-review), test-double selection — spy vs stub (use + test-mocking-patterns), test data strategy (use test-data-management), debugging a failing test + at runtime, running tests, integration test planning, or non-test tasks. + Pairs with test-unit-write and test-unit-review for procedural workflows. +license: Proprietary +compatibility: GitHub Copilot +--- + +# Unit Test Standards + +**Applies to test code only.** Non-test tasks: complete normally without enforcing rules. + +## Step 0 — Load reference + +Detect language/framework. Load reference: + +| Language / framework | Reference file | +|---|---| +| Python + pytest | `references/python-pytest.md` | +| JavaScript / TypeScript + Jest | `references/javascript-jest.md` | +| Scala + MUnit | `references/scala-munit.md` | + +If no reference exists, apply language-agnostic rules below and note missing guidance. +When no violations found, explicitly confirm compliance for each category. + +## Isolation + +- No real external APIs, DBs, file systems, network +- Mock/stub all I/O and boundaries +- Independent tests, no shared mutable state +- Runnable in isolation, any order + +## Scope + +- Unit = one function, method, or class +- Test public interface only +- No direct private member access (see language reference) + +## Naming and structure + +- Test names state scenario (follow language reference format) +- One logical behavior per test (multiple asserts OK if collectively confirming single outcome) +- Docstring required stating scenario (not inline comments) +- No loose comments between methods; inline context comments OK (e.g. `# edge case: empty list`) + +## Assertions + +- Test: return values, exceptions, log messages (contract-sensitive), exit codes +- Cover success + ≥1 failure path per behaviour +- Cover boundary values & empty/null inputs +- Specific over generic assertions (see language reference) + +## Fixtures + +- Place in framework-idiomatic location (Python: `conftest.py`) +- Reuse across tests; no copy-pasted setup +- Document purpose & side effects + +## Report by severity + +Group violations under three headings. Empty sections: `(none)`. + +### Blocker +Broken tests, real I/O, shared state (suite unreliable/order-dependent). + +### Important +Missing failure/boundary coverage, weak/absent assertions, private access, copy-pasted setup, unreadable naming. + +### Nit +Minor naming inconsistencies, missing fixture docstrings, style issues. + +## Note: pytest `tmp_path` + +`pytest.tmp_path` violates isolation (real file system). Flag as **Important**. Preferred fix: refactor to accept file-like object, use `io.BytesIO`/`io.StringIO`. Accept `tmp_path` only if core logic requires real path and refactoring disproportionate — document explicitly. diff --git a/skills/test-unit-standards/evals/evals.json b/skills/test-unit-standards/evals/evals.json new file mode 100644 index 0000000..de3695a --- /dev/null +++ b/skills/test-unit-standards/evals/evals.json @@ -0,0 +1,266 @@ +{ + "skill_name": "test-unit-standards", + "evals": [ + { + "id": 1, + "category": "happy-path", + "prompt": "Can you review these unit tests for our PaymentService and confirm they follow our unit testing standards? File: evals/files/compliant-payment-tests.py", + "expected_output": "Review confirms the tests comply with all unit testing standards. All I/O is stubbed via fixtures (no real network, DB, or filesystem). Tests use only the public interface. Names follow test___. Assertions are specific. Both success and failure paths are covered. Boundary value (0.01) is tested. Shared fixtures are defined and documented. No violations found.", + "files": [ + "evals/files/compliant-payment-tests.py" + ], + "expectations": [ + "No violations are reported", + "Reviewer confirms all I/O is properly stubbed (isolation maintained)", + "Reviewer confirms tests use the public interface only", + "Reviewer confirms naming convention test___ is followed", + "Reviewer confirms success and failure paths are both covered", + "Reviewer acknowledges shared fixtures are defined and documented", + "Reviewer does not fabricate problems where none exist" + ] + }, + { + "id": 2, + "category": "regression", + "prompt": "Please review these unit tests for the ReportExporter. Do they follow isolation standards? File: evals/files/real-io-violations.py", + "expected_output": "Review flags three isolation violations: (1) test_export_pdf_creates_file writes to the real filesystem (/tmp). (2) test_fetch_template_from_cdn makes a real outbound HTTP request. (3) TestReportExporterWithDB.setup_method opens a live PostgreSQL connection. Each should be replaced with a mock or stub at its I/O boundary.", + "files": [ + "evals/files/real-io-violations.py" + ], + "expectations": [ + "test_export_pdf_creates_file is flagged for real filesystem write", + "test_fetch_template_from_cdn is flagged for real outbound HTTP call", + "TestReportExporterWithDB.setup_method is flagged for real PostgreSQL connection", + "Reviewer recommends mocking os/pathlib, requests, and psycopg2 at their boundary", + "Isolation rule is cited as the violated standard" + ] + }, + { + "id": 3, + "category": "regression", + "prompt": "Check whether these TokenValidator tests follow unit test standards. Do any of them access things they shouldn't? File: evals/files/private-member-access.py", + "expected_output": "Review flags three scope violations: (1) test_internal_decode_extracts_sub_claim calls private method _decode directly. (2) test_signature_cache_is_populated reads private attribute _cache. (3) test_algorithm_whitelist_contains_hs256 reads private attribute _allowed_algorithms. All three should be rewritten to assert behaviour through the public validate() interface.", + "files": [ + "evals/files/private-member-access.py" + ], + "expectations": [ + "test_internal_decode_extracts_sub_claim is flagged for calling private method _decode", + "test_signature_cache_is_populated is flagged for reading private attribute _cache", + "test_algorithm_whitelist_contains_hs256 is flagged for reading private attribute _allowed_algorithms", + "Reviewer explains that behaviour should be verified through the public interface", + "Scope rule is cited as the violated standard" + ] + }, + { + "id": 4, + "category": "regression", + "prompt": "Are these OrderProcessor tests complete? Do the assertions and coverage look right to you? File: evals/files/weak-assertions-no-failure.py", + "expected_output": "Review flags assertion completeness issues: (1) test_process_order_returns_something uses 'is not None' — too weak, should assert the specific return value. (2) test_process_order_completes asserts truthy only. (3) test_reserve_stock_called asserts '.called is not None' which is always True and proves nothing. Also flags missing failure-path coverage (out-of-stock, unknown SKU) and missing boundary tests (quantity=0, quantity=1).", + "files": [ + "evals/files/weak-assertions-no-failure.py" + ], + "expectations": [ + "test_process_order_returns_something is flagged for 'is not None' weak assertion", + "test_process_order_completes is flagged for truthy-check-only assertion", + "test_reserve_stock_called is flagged for '.called is not None' which is always True", + "Missing failure-path tests (out-of-stock or unknown SKU) are flagged", + "Missing boundary tests (quantity=0) are flagged", + "Assertion completeness and boundary coverage rules are cited" + ] + }, + { + "id": 5, + "category": "regression", + "prompt": "Review these UserProfileService tests — I think there may be some duplication issues. File: evals/files/copy-paste-setup.py", + "expected_output": "Review flags that the (repo, cache, service) setup block is copy-pasted identically across all four tests. Recommends extracting it into a shared @pytest.fixture (or conftest.py). Also notes that fixtures should be documented with their purpose and any side effects.", + "files": [ + "evals/files/copy-paste-setup.py" + ], + "expectations": [ + "Identical (repo, cache, service) setup in each test is flagged as copy-pasted", + "Reviewer recommends extracting shared setup into a @pytest.fixture", + "Reviewer mentions placing shared fixtures in conftest.py or a test class setup", + "Reviewer notes that fixtures should be documented with purpose and side effects", + "Fixture management rule is cited as the violated standard" + ] + }, + { + "id": 6, + "category": "regression", + "prompt": "Audit these DiscountCalculator tests for naming conventions and test coverage gaps. File: evals/files/bad-naming-missing-boundaries.py", + "expected_output": "Review flags four naming violations: (1) test_1 has no descriptive name. (2) test_discount_works is vague — no condition or expected outcome. (3) test_it_returns_price is missing the expected-outcome segment. (4) test_big_number does not follow test___ format. Also flags missing boundary test for price=0, missing failure test for negative price, and missing failure test for unknown tier string.", + "files": [ + "evals/files/bad-naming-missing-boundaries.py" + ], + "expectations": [ + "test_1 is flagged as having no descriptive name", + "test_discount_works is flagged as vague — missing condition and expected outcome", + "test_it_returns_price is flagged for missing the expected-outcome segment", + "test_big_number is flagged for not following test___", + "Missing boundary test for price=0 is flagged", + "Missing failure test for negative price is flagged", + "Missing failure test for unknown member_tier string is flagged", + "Naming convention and boundary coverage rules are cited" + ] + }, + { + "id": 7, + "category": "happy-path", + "prompt": "Write unit tests for the DiscountCalculator class in evals/files/source-discount-calculator.py. Make sure they follow our unit test standards.", + "expected_output": "Output produces compliant unit tests: names follow test___; all valid tiers (gold, silver, bronze, None) have a success test; failure paths for negative price and unknown tier are tested with specific exception assertions; boundary value price=0 is tested; assertions are specific (assertEqual / assert ==); no private members are accessed; no real I/O is introduced.", + "files": [ + "evals/files/source-discount-calculator.py" + ], + "expectations": [ + "Test names follow test___ convention", + "All four tiers (gold, silver, bronze, None) are tested for correct discount", + "Failure path for negative price is tested with a specific exception assertion", + "Failure path for unknown tier string is tested with a specific exception assertion", + "Boundary value price=0 is covered", + "Assertions are specific (exact value comparison, not 'is not None' or truthy)", + "No private members are accessed", + "No real I/O or external dependencies are introduced" + ] + }, + { + "id": 8, + "category": "negative", + "prompt": "Can you write a docstring for the UserProfileService class? I don't need tests right now.", + "expected_output": "Assistant writes a docstring for the class without applying unit test standards. It does not flag test coverage gaps, enforce test naming, or cite isolation rules — because no test code was provided and none was requested.", + "files": [], + "expectations": [ + "Assistant produces a docstring, not unit tests", + "Skill does not apply test naming, isolation, or assertion rules to a non-test request", + "No test coverage gaps, fixture requirements, or boundary coverage are cited", + "The docstring output is idiomatic and useful if produced" + ] + }, + { + "id": 9, + "category": "paraphrase", + "prompt": "Are these tests up to scratch? I want to make sure they're not calling anything outside the service under test. evals/files/real-io-violations.py", + "expected_output": "Same core behaviour as eval 2: all three real-I/O violations are identified (real filesystem, real HTTP call, real DB connection) with remediation guidance. Phrasing of the request differs but the findings and rule citations should be equivalent.", + "files": [ + "evals/files/real-io-violations.py" + ], + "expectations": [ + "Real filesystem I/O (test_export_pdf_creates_file) is flagged", + "Real HTTP call (test_fetch_template_from_cdn) is flagged", + "Real PostgreSQL connection (TestReportExporterWithDB.setup_method) is flagged", + "Remediation guidance (mock os/requests/psycopg2) is provided", + "Response substance matches eval 2 despite different prompt phrasing" + ] + }, + { + "id": 10, + "category": "multi-violation", + "prompt": "Full audit of these NotificationService unit tests — tell me everything that's wrong. File: evals/files/multi-violation-audit.py", + "expected_output": "Review identifies violations across all six rule categories: (1) ISOLATION — setup_method uses a real Redis connection. (2) SCOPE — _build_message (private method) and _retry_count (private attribute) are accessed directly. (3) NAMING — test_send, test_error, test_ok do not follow test___. (4) ASSERTIONS — test_send uses 'is not None'; test_ok uses a truthy check; test_error has no assertion. (5) FIXTURES — redis_mock + email_client setup is copy-pasted in the standalone tests instead of using a shared fixture. (6) BOUNDARIES — no test for empty recipients, None subject, or zero-length body.", + "files": [ + "evals/files/multi-violation-audit.py" + ], + "expectations": [ + "setup_method real Redis connection is flagged as an isolation violation", + "_build_message call in test_build_message_template is flagged as private method access", + "_retry_count read in test_retry_count_after_failure is flagged as private attribute access", + "test_send, test_error, test_ok are flagged for not following the naming convention", + "test_send 'is not None' assertion is flagged as weak", + "test_ok truthy-only assertion is flagged as weak", + "test_error has no assertion — flagged as an incomplete test", + "Copy-pasted redis_mock + email_client setup in standalone tests is flagged", + "Missing boundary tests (empty recipients, None subject) are flagged", + "All six rule categories are applied: isolation, scope, naming, assertions, fixtures, boundaries" + ] + }, + { + "id": 11, + "category": "regression", + "prompt": "Are there any isolation issues in these CartService tests? File: evals/files/shared-mutable-state.py", + "expected_output": "Review flags that TestCartServiceSequenced._processed is a class-level mutable list shared across all test instances. test_refund_removes_order depends on test_checkout_records_order having run first, making the suite order-dependent. Each test must be fully independent and runnable in any order; shared mutable state must be reset per-test (e.g. in a fixture with function scope).", + "files": [ + "evals/files/shared-mutable-state.py" + ], + "expectations": [ + "_processed class-level mutable list is flagged as shared mutable state", + "test_refund_removes_order is flagged as order-dependent on test_checkout_records_order", + "Reviewer explains the tests are not independently runnable", + "Reviewer recommends resetting state per-test via a function-scoped fixture or instance attribute", + "Isolation rule (no shared mutable state, each test runnable in isolation) is cited" + ] + }, + { + "id": 12, + "category": "regression", + "prompt": "Check these InvoiceGenerator tests for structural issues — docstrings, comments, that kind of thing. File: evals/files/missing-docstrings-stray-comments.py", + "expected_output": "Review flags: (1) test_generate_valid_invoice has no docstring — scenario not stated. (2) test_generate_zero_amount and test_generate_missing_client_raises use inline comments instead of docstrings. (3) A stray explanatory comment appears between test functions (outside any test method) above test_generate_negative_amount. (4) The generator fixture has no docstring documenting its purpose or side effects.", + "files": [ + "evals/files/missing-docstrings-stray-comments.py" + ], + "expectations": [ + "test_generate_valid_invoice is flagged for missing docstring", + "test_generate_zero_amount is flagged for using an inline comment instead of a docstring", + "The stray comment between test functions (outside a test method) is flagged", + "test_generate_missing_client_raises is flagged for inline comment instead of docstring", + "The generator fixture is flagged for missing purpose/side-effects docstring", + "Naming/structure rule (docstring required, no stray comments outside test methods) is cited" + ] + }, + { + "id": 13, + "category": "negative", + "prompt": "The calculate() method in evals/files/mixed-source-and-tests.py uses a chain of if/elif blocks. Can you refactor it to use a cleaner lookup structure? The test file is there for reference — don't change the tests.", + "expected_output": "Assistant refactors the calculate() method (e.g. replacing the if/elif weight tiers with a sorted lookup table). It does NOT apply unit test standards to the test portion of the file, flag test names, assert coverage gaps, or cite isolation rules — because the request is a source-code refactor, not a test review.", + "files": [ + "evals/files/mixed-source-and-tests.py" + ], + "expectations": [ + "Assistant refactors the calculate() method as requested", + "Skill does not flag the test functions for naming or assertion issues", + "No test coverage gaps, fixture requirements, or isolation rules are cited", + "The existing passing tests are left unchanged", + "Scope note behaviour is correctly applied: task is non-test so standards are not enforced" + ] + }, + { + "id": 14, + "category": "paraphrase", + "prompt": "Are these tests following our conventions? They were written by a new team member.", + "expected_output": "Agent applies the full six-category unit test standards checklist: isolation (no real I/O), scope (no private member access), naming (test___), assertions (specific values vs truthy checks), coverage (failure paths, boundaries, edge cases), fixtures (docstrings, no shared mutable state). Groups findings as Blocker / Important / Nit. Confirms compliance explicitly if all categories pass. The 'new team member' framing does not change the standard applied.", + "files": [], + "expectations": [ + "Identifies 'following our conventions' as a unit test standards review", + "Applies all six standards categories", + "Groups findings as Blocker / Important / Nit", + "Does not soften the standard because the author is new" + ] + }, + { + "id": 15, + "category": "edge-case", + "prompt": "A test uses pytest's tmp_path fixture to write a file and then reads it back. Is using a real file system in a unit test a violation?", + "expected_output": "Yes, but it is a borderline case worth explaining. The standard says no real I/O in unit tests. tmp_path is an in-process temporary directory that does touch the real file system, so strictly it is a violation of the isolation rule. However, if the function under test explicitly takes a file path and its core logic is the file handling, the choice is: (1) refactor to accept a file-like object (preferred — allows full in-memory testing); (2) accept tmp_path as a pragmatic exception only when refactoring is disproportionate. Flag it as Important with the refactoring suggestion as the preferred fix.", + "files": [], + "expectations": [ + "Identifies tmp_path as real file system I/O — a standards violation", + "Notes the borderline nature of the case", + "Recommends refactoring to accept a file-like object as the preferred fix", + "Acknowledges tmp_path as a pragmatic exception only when refactoring is disproportionate", + "Flags as Important, not Nit" + ] + }, + { + "id": 16, + "category": "output-format", + "prompt": "Show me what the skill output looks like for a test file with one isolation violation and one naming violation.", + "expected_output": "The response groups findings under '### Blocker', '### Important', '### Nit' headers. Each violation is a bulleted item with: (1) test name in backticks, (2) standards category (e.g. 'Isolation', 'Naming'), (3) the specific rule violated, (4) a concrete fix suggestion. Code snippets showing the fix appear in fenced code blocks. A compliance confirmation is stated explicitly when no Blocker or Important findings exist.", + "files": [], + "expectations": [ + "Blocker / Important / Nit H3 sections", + "Bulleted items with test name in backticks", + "Standards category name in each finding", + "Fix suggestions in fenced code blocks where applicable", + "Explicit compliance confirmation when no Blocker/Important findings" + ] + } + ] +} diff --git a/skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py b/skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py new file mode 100644 index 0000000..0e787f5 --- /dev/null +++ b/skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py @@ -0,0 +1,49 @@ +"""Unit tests for DiscountCalculator. + +VIOLATIONS — naming conventions and boundary/failure-path coverage: + test_1 : no descriptive name at all + test_discount_works : vague name — no condition or expected outcome stated + test_it_returns_price : missing the 'expected' segment + test_big_number : not in test___ format + (no test) : missing boundary test for price=0.0 + (no test) : missing failure test for negative price + (no test) : missing failure test for unknown member_tier string + (no test) : missing floating-point boundary (e.g. price=0.01) +""" +import pytest + +from services.discount_calculator import DiscountCalculator + + +def test_1(): + """No descriptive name — scenario and expected outcome are unclear.""" + calc = DiscountCalculator() + result = calc.calculate(price=100.0, member_tier="gold") + assert result == 80.0 + + +def test_discount_works(): + """Vague name: does not state the condition being tested or the expected result.""" + calc = DiscountCalculator() + result = calc.calculate(price=200.0, member_tier="silver") + assert result == 170.0 + + +def test_it_returns_price(): + """Name identifies the subject but is missing the expected-outcome segment.""" + calc = DiscountCalculator() + result = calc.calculate(price=50.0, member_tier=None) + assert result == 50.0 + + +def test_big_number(): + """Name is not in test___ format.""" + calc = DiscountCalculator() + result = calc.calculate(price=1_000_000.0, member_tier="gold") + assert result == 800_000.0 + + +# Missing: test_calculate_price_zero_no_tier_returns_zero +# Missing: test_calculate_negative_price_raises_value_error +# Missing: test_calculate_unknown_tier_raises_value_error +# Missing: test_calculate_minimum_positive_price_applies_correct_discount diff --git a/skills/test-unit-standards/evals/files/compliant-payment-tests.py b/skills/test-unit-standards/evals/files/compliant-payment-tests.py new file mode 100644 index 0000000..e829566 --- /dev/null +++ b/skills/test-unit-standards/evals/files/compliant-payment-tests.py @@ -0,0 +1,97 @@ +"""Unit tests for PaymentService. + +Tests cover the public interface only. All I/O (gateway, logger) is stubbed via +fixtures. Shared fixtures are defined once; no copy-paste setup. +""" +import pytest +from unittest.mock import MagicMock + +from services.payment_service import PaymentService, InsufficientFundsError, PaymentError + + +# --- fixtures --- + + +@pytest.fixture +def gateway(): + """Stub for the external payment gateway; returns a canned success response.""" + mock = MagicMock() + mock.charge.return_value = {"status": "ok", "transaction_id": "txn_001"} + return mock + + +@pytest.fixture +def logger(): + """Stub logger so log-call assertions can be made without real I/O.""" + return MagicMock() + + +@pytest.fixture +def service(gateway, logger): + """PaymentService wired to stub dependencies.""" + return PaymentService(gateway=gateway, logger=logger) + + +# --- charge: success path --- + + +def test_charge_valid_amount_returns_transaction_id(service, gateway): + """Charging a positive amount returns the transaction ID supplied by the gateway.""" + result = service.charge(amount=100, currency="ZAR", account_id="acc_1") + assert result == "txn_001" + + +def test_charge_valid_amount_calls_gateway_once(service, gateway): + """Exactly one gateway call is made per charge invocation.""" + service.charge(amount=100, currency="ZAR", account_id="acc_1") + gateway.charge.assert_called_once_with(100, "ZAR", "acc_1") + + +def test_charge_success_emits_info_log(service, logger): + """A single info-level log entry is emitted after a successful charge.""" + service.charge(amount=100, currency="ZAR", account_id="acc_1") + logger.info.assert_called_once() + + +# --- charge: boundary values --- + + +def test_charge_minimum_positive_amount_succeeds(service): + """Charging the minimum positive amount (0.01) completes without error.""" + result = service.charge(amount=0.01, currency="ZAR", account_id="acc_1") + assert result == "txn_001" + + +# --- charge: failure paths --- + + +def test_charge_zero_amount_raises_value_error(service): + """Charging zero is rejected with ValueError before contacting the gateway.""" + with pytest.raises(ValueError, match="amount must be positive"): + service.charge(amount=0, currency="ZAR", account_id="acc_1") + + +def test_charge_negative_amount_raises_value_error(service): + """Negative amounts are rejected immediately without reaching the gateway.""" + with pytest.raises(ValueError, match="amount must be positive"): + service.charge(amount=-50, currency="ZAR", account_id="acc_1") + + +def test_charge_gateway_timeout_raises_payment_error(service, gateway): + """A gateway timeout is surfaced to the caller as a PaymentError.""" + gateway.charge.side_effect = PaymentError("gateway timeout") + with pytest.raises(PaymentError): + service.charge(amount=100, currency="ZAR", account_id="acc_1") + + +def test_charge_insufficient_funds_raises_insufficient_funds_error(service, gateway): + """Gateway returning insufficient-funds propagates as InsufficientFundsError.""" + gateway.charge.side_effect = InsufficientFundsError("balance too low") + with pytest.raises(InsufficientFundsError): + service.charge(amount=9999, currency="ZAR", account_id="acc_low") + + +def test_charge_empty_account_id_raises_value_error(service): + """An empty account_id string is rejected with a descriptive ValueError.""" + with pytest.raises(ValueError, match="account_id"): + service.charge(amount=100, currency="ZAR", account_id="") diff --git a/skills/test-unit-standards/evals/files/copy-paste-setup.py b/skills/test-unit-standards/evals/files/copy-paste-setup.py new file mode 100644 index 0000000..6cb90ee --- /dev/null +++ b/skills/test-unit-standards/evals/files/copy-paste-setup.py @@ -0,0 +1,66 @@ +"""Unit tests for UserProfileService. + +VIOLATIONS — fixture management: + All four tests contain an identical setup block (repo, cache, service construction) + copy-pasted inline. No shared @pytest.fixture or conftest.py helper is used. + Fixtures are not documented with their purpose or side effects. +""" +from unittest.mock import MagicMock + +import pytest + +from services.user_profile_service import UserProfileService + + +def test_get_profile_returns_display_name(): + """Returns the user's display name when the user exists in the repository.""" + # duplicated setup — should be a shared fixture + repo = MagicMock() + cache = MagicMock() + cache.get.return_value = None + repo.find_by_id.return_value = {"id": "u1", "display_name": "Alice", "email": "a@example.com"} + service = UserProfileService(repo=repo, cache=cache) + + profile = service.get_profile("u1") + assert profile["display_name"] == "Alice" + + +def test_get_profile_caches_result_after_first_call(): + """Result is cached so the repository is only queried once for repeated calls.""" + # duplicated setup + repo = MagicMock() + cache = MagicMock() + cache.get.return_value = None + repo.find_by_id.return_value = {"id": "u1", "display_name": "Alice", "email": "a@example.com"} + service = UserProfileService(repo=repo, cache=cache) + + service.get_profile("u1") + service.get_profile("u1") + assert cache.set.call_count == 1 + assert repo.find_by_id.call_count == 1 # second call should hit the cache + + +def test_get_profile_unknown_user_raises_not_found(): + """Requesting a user that does not exist raises KeyError.""" + # duplicated setup + repo = MagicMock() + cache = MagicMock() + cache.get.return_value = None + repo.find_by_id.return_value = None + service = UserProfileService(repo=repo, cache=cache) + + with pytest.raises(KeyError): + service.get_profile("unknown") + + +def test_update_email_persists_change(): + """Updating the email address delegates a save call to the repository.""" + # duplicated setup yet again + repo = MagicMock() + cache = MagicMock() + cache.get.return_value = None + repo.find_by_id.return_value = {"id": "u1", "display_name": "Alice", "email": "old@example.com"} + service = UserProfileService(repo=repo, cache=cache) + + service.update_email("u1", "new@example.com") + repo.save.assert_called_once() diff --git a/skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py b/skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py new file mode 100644 index 0000000..99fd591 --- /dev/null +++ b/skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py @@ -0,0 +1,52 @@ +"""Unit tests for InvoiceGenerator. + +VIOLATIONS — naming / structure: + test_generate_valid_invoice : no docstring (scenario not stated) + test_generate_zero_amount : no docstring; inline comment used instead + test_generate_negative_amount : stray explanatory comment outside a test method + test_generate_missing_client_raises : no docstring; stray inline comment in body + +Fixture docstring is also missing its side-effects note. +""" +import pytest +from unittest.mock import MagicMock + +from billing.invoice_generator import InvoiceGenerator + + +@pytest.fixture +def generator(): + # this fixture creates an InvoiceGenerator — missing docstring with purpose/side-effects + mailer = MagicMock() + templates = MagicMock() + templates.render.return_value = "Invoice" + return InvoiceGenerator(mailer=mailer, templates=templates) + + +# --- generate --- + + +def test_generate_valid_invoice(generator): + # no docstring — scenario is not stated as required by standards + result = generator.generate(client_id="c1", amount=500.0) + assert result["status"] == "sent" + + +def test_generate_zero_amount(generator): + # no docstring; scenario explained via inline comment instead + # zero amount should raise ValueError per the contract + with pytest.raises(ValueError): + generator.generate(client_id="c1", amount=0.0) + + +# Stray comment outside a test method explaining intent — should not appear here +def test_generate_negative_amount(generator): + with pytest.raises(ValueError): + generator.generate(client_id="c1", amount=-10.0) + + +def test_generate_missing_client_raises_key_error(generator): + # inline comment used instead of a docstring + # empty client_id is not permitted + with pytest.raises(KeyError): + generator.generate(client_id="", amount=100.0) diff --git a/skills/test-unit-standards/evals/files/mixed-source-and-tests.py b/skills/test-unit-standards/evals/files/mixed-source-and-tests.py new file mode 100644 index 0000000..fab823e --- /dev/null +++ b/skills/test-unit-standards/evals/files/mixed-source-and-tests.py @@ -0,0 +1,62 @@ +"""Mixed request fixture: source code + tests side by side. + +Used by eval 13 (scope-note adherence): the user asks for a source-code +refactor, not a test review. The skill must complete the refactor without +citing or enforcing unit test standards on the test file. +""" + +# ── source under refactor ───────────────────────────────────────────────────── + + +class ShippingCalculator: + """Calculates shipping cost based on weight and destination zone.""" + + ZONE_RATES = {"domestic": 15.0, "regional": 35.0, "international": 80.0} + + def calculate(self, weight_kg: float, zone: str) -> float: + if weight_kg <= 0: + raise ValueError("weight_kg must be positive") + if zone not in self.ZONE_RATES: + raise ValueError(f"unknown zone: {zone!r}") + if weight_kg <= 1: + base = self.ZONE_RATES[zone] + elif weight_kg <= 5: + base = self.ZONE_RATES[zone] * 1.5 + elif weight_kg <= 20: + base = self.ZONE_RATES[zone] * 2.5 + else: + base = self.ZONE_RATES[zone] * 4.0 + return round(base, 2) + + +# ── existing tests (compliant — should not be touched or flagged) ───────────── + +import pytest + + +@pytest.fixture +def calc(): + """ShippingCalculator instance for all tests.""" + return ShippingCalculator() + + +def test_calculate_domestic_under_1kg_returns_base_rate(calc): + """Shipments up to 1 kg domestic use the flat base rate.""" + assert calc.calculate(weight_kg=0.5, zone="domestic") == 15.0 + + +def test_calculate_international_over_20kg_applies_max_multiplier(calc): + """Shipments over 20 kg international use the 4× multiplier.""" + assert calc.calculate(weight_kg=25.0, zone="international") == 320.0 + + +def test_calculate_zero_weight_raises_value_error(calc): + """Zero weight is rejected with a descriptive ValueError.""" + with pytest.raises(ValueError, match="weight_kg must be positive"): + calc.calculate(weight_kg=0, zone="domestic") + + +def test_calculate_unknown_zone_raises_value_error(calc): + """An unrecognised zone string raises ValueError.""" + with pytest.raises(ValueError, match="unknown zone"): + calc.calculate(weight_kg=1.0, zone="orbit") diff --git a/skills/test-unit-standards/evals/files/multi-violation-audit.py b/skills/test-unit-standards/evals/files/multi-violation-audit.py new file mode 100644 index 0000000..2e80486 --- /dev/null +++ b/skills/test-unit-standards/evals/files/multi-violation-audit.py @@ -0,0 +1,92 @@ +"""Unit tests for NotificationService. + +VIOLATIONS — all rule categories present (used for multi-violation audit eval): + + ISOLATION setup_method opens a real Redis connection instead of using a stub + SCOPE test_build_message_template calls private _build_message method + test_retry_count_after_failure reads private _retry_count attribute + NAMING test_send / test_error / test_ok do not follow test___ + ASSERTIONS test_send asserts 'is not None' (weak) + test_ok asserts truthiness only + test_error has no assertion at all + FIXTURES redis_mock + email_client construction is copy-pasted in standalone tests + BOUNDARIES no test for empty recipients list + no test for None subject + no test for zero-length message body +""" +import redis +import pytest +from unittest.mock import MagicMock + +from notifications.notification_service import NotificationService + + +class TestNotificationService: + def setup_method(self): + # real Redis connection — not mocked; violates isolation + self.redis = redis.Redis(host="localhost", port=6379) + self.email_client = MagicMock() + self.service = NotificationService( + redis=self.redis, email_client=self.email_client + ) + + def test_send(self): + """Vague name — no condition or expected outcome.""" + result = self.service.send( + recipients=["a@example.com"], subject="Hello", body="World" + ) + assert result is not None # weak: specific return value not asserted + + def test_error(self): + """Name gives no information about the scenario being tested.""" + self.email_client.send.side_effect = ConnectionError("SMTP down") + # missing: no assertion — test body does not verify the outcome + + def test_ok(self): + """Overly generic pass/fail label.""" + result = self.service.send( + recipients=["b@example.com"], subject="Hi", body="There" + ) + assert result # weak: truthy check only + + def test_build_message_template(self): + """Accesses private _build_message method to verify template rendering.""" + msg = self.service._build_message(subject="Hi", body="Body") # private method — violates scope + assert "Hi" in msg + + def test_retry_count_after_failure(self): + """Reads private _retry_count attribute after a simulated gateway failure.""" + self.email_client.send.side_effect = ConnectionError() + try: + self.service.send(recipients=["c@example.com"], subject="Retry", body="Test") + except Exception: + pass + assert self.service._retry_count > 0 # private attribute — violates scope + + +# --- standalone tests with copy-pasted fixture setup --- + + +def test_send_to_multiple_recipients_calls_email_client_per_recipient(): + """Sending to two recipients results in two email-client calls.""" + email_client = MagicMock() # duplicated setup — should use a shared fixture + redis_mock = MagicMock() + service = NotificationService(redis=redis_mock, email_client=email_client) + + service.send(recipients=["x@e.com", "y@e.com"], subject="Hi", body="Body") + assert email_client.send.call_count == 2 + + +def test_queue_notification_stores_message_in_redis(): + """Queuing a notification pushes it to Redis.""" + email_client = MagicMock() # duplicated setup — should use a shared fixture + redis_mock = MagicMock() + service = NotificationService(redis=redis_mock, email_client=email_client) + + service.queue("z@e.com", subject="Queued", body="Later") + redis_mock.lpush.assert_called_once() + + +# Missing: test_send_empty_recipients_raises_value_error (boundary) +# Missing: test_send_none_subject_raises_value_error (boundary / null input) +# Missing: test_send_smtp_failure_raises_notification_error (failure path with assertion) diff --git a/skills/test-unit-standards/evals/files/private-member-access.py b/skills/test-unit-standards/evals/files/private-member-access.py new file mode 100644 index 0000000..248e570 --- /dev/null +++ b/skills/test-unit-standards/evals/files/private-member-access.py @@ -0,0 +1,51 @@ +"""Unit tests for TokenValidator. + +VIOLATIONS — scope (accessing private members): + test_internal_decode_extracts_sub_claim : calls _decode (private method) + test_signature_cache_is_populated : reads _cache (private attribute) + test_algorithm_whitelist_contains_hs256 : reads _allowed_algorithms (private attribute) +""" +import pytest + +from auth.token_validator import TokenValidator + +VALID_TOKEN = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1c2VyMSJ9.SIG" + + +@pytest.fixture +def validator(): + """TokenValidator initialised with a test secret.""" + return TokenValidator(secret="test-secret") + + +# --- compliant tests (public interface only) --- + + +def test_validate_token_returns_true_for_valid_jwt(validator): + """Valid JWT passes validation via the public validate() method.""" + assert validator.validate(VALID_TOKEN) is True + + +def test_validate_token_returns_false_for_expired_jwt(validator): + """Expired token is rejected and validate() returns False.""" + assert validator.validate("expired.token.sig") is False + + +# --- violating tests (private member access) --- + + +def test_internal_decode_extracts_sub_claim(validator): + """Calls private _decode method to verify claim extraction internals.""" + payload = validator._decode(VALID_TOKEN) # private method — violates scope + assert payload["sub"] == "user1" + + +def test_signature_cache_is_populated_after_validation(validator): + """Reads private _cache attribute to assert caching side-effect.""" + validator.validate(VALID_TOKEN) + assert VALID_TOKEN in validator._cache # private attribute — violates scope + + +def test_algorithm_whitelist_contains_hs256(validator): + """Reads private _allowed_algorithms attribute directly.""" + assert "HS256" in validator._allowed_algorithms # private attribute — violates scope diff --git a/skills/test-unit-standards/evals/files/real-io-violations.py b/skills/test-unit-standards/evals/files/real-io-violations.py new file mode 100644 index 0000000..c84b653 --- /dev/null +++ b/skills/test-unit-standards/evals/files/real-io-violations.py @@ -0,0 +1,51 @@ +"""Unit tests for ReportExporter. + +VIOLATIONS — isolation: + test_export_pdf_creates_file : writes to the real filesystem (/tmp) + test_fetch_template_from_cdn : sends a real outbound HTTP request + TestReportExporterWithDB : setup_method opens a real PostgreSQL connection +""" +import os + +import requests +import pytest + +from exporters.report_exporter import ReportExporter + + +def test_export_pdf_creates_file(): + """Exports a report to PDF and verifies the file exists on disk.""" + exporter = ReportExporter() + path = "/tmp/report_test_output.pdf" + exporter.export(data={"title": "Q1"}, output_path=path) + assert os.path.exists(path) # real filesystem I/O — violates isolation + os.remove(path) + + +def test_fetch_template_from_cdn(): + """Downloads a template from the CDN and applies it to an empty data set.""" + exporter = ReportExporter() + resp = requests.get("https://cdn.example.com/templates/report.html") # real HTTP call + assert resp.status_code == 200 + exporter.apply_template(resp.text, data={}) + + +class TestReportExporterWithDB: + """Integration-style tests embedded in a unit test file — DB is not mocked.""" + + def setup_method(self): + import psycopg2 + # real database connection — not mocked or stubbed + self.conn = psycopg2.connect( + "host=localhost dbname=reports user=sa password=sa" + ) + + def test_load_report_data_from_db_returns_row(self): + """Loads a report row from PostgreSQL and asserts it is not None.""" + cur = self.conn.cursor() + cur.execute("SELECT * FROM reports LIMIT 1") + row = cur.fetchone() + assert row is not None # depends on live database state + + def teardown_method(self): + self.conn.close() diff --git a/skills/test-unit-standards/evals/files/shared-mutable-state.py b/skills/test-unit-standards/evals/files/shared-mutable-state.py new file mode 100644 index 0000000..43aadeb --- /dev/null +++ b/skills/test-unit-standards/evals/files/shared-mutable-state.py @@ -0,0 +1,37 @@ +"""Unit tests for CartService. + +VIOLATIONS — isolation (shared mutable state between tests): + TestCartServiceSequenced._processed is a class-level list mutated across tests. + test_refund_removes_order depends on test_checkout_records_order running first, + making the suite order-dependent and breaking independent-execution guarantees. +""" +import pytest +from unittest.mock import MagicMock + +from services.cart_service import CartService + + +class TestCartServiceSequenced: + _processed: list = [] # class-level mutable state — shared across all test instances + + @pytest.fixture(autouse=True) + def setup(self): + """Wire CartService to a payment stub.""" + self.payment = MagicMock() + self.service = CartService(payment=self.payment) + + def test_checkout_records_order(self): + """Checkout adds an entry to the shared _processed list.""" + self.service.checkout(cart_id="c1", total=150.0) + self.__class__._processed.append("c1") # mutates shared class-level state + assert "c1" in self.__class__._processed + + def test_refund_removes_order(self): + """Refund removes an entry — implicitly requires test_checkout_records_order to have run first.""" + self.__class__._processed.remove("c1") # reads state written by a previous test + assert "c1" not in self.__class__._processed + + def test_checkout_delegates_charge_to_payment_gateway(self): + """Checkout total is delegated to the payment gateway.""" + self.service.checkout(cart_id="c2", total=200.0) + self.payment.charge.assert_called_once_with(200.0) diff --git a/skills/test-unit-standards/evals/files/source-discount-calculator.py b/skills/test-unit-standards/evals/files/source-discount-calculator.py new file mode 100644 index 0000000..0be5cd4 --- /dev/null +++ b/skills/test-unit-standards/evals/files/source-discount-calculator.py @@ -0,0 +1,45 @@ +"""Source module for DiscountCalculator — used by eval 7 (write-tests task). + +This is the production code under test. No test file exists yet; the eval asks +the model to write compliant unit tests for this class from scratch. +""" +from __future__ import annotations + + +class DiscountCalculator: + """Calculates a discounted price based on the customer's membership tier. + + Recognised tiers and their discounts: + "gold" → 20 % off + "silver" → 15 % off + "bronze" → 10 % off + None → 0 % off (no discount) + + Raises: + ValueError: if *price* is negative. + ValueError: if *member_tier* is not one of the recognised values or None. + """ + + DISCOUNTS: dict[str | None, float] = { + "gold": 0.20, + "silver": 0.15, + "bronze": 0.10, + None: 0.00, + } + + def calculate(self, price: float, member_tier: str | None) -> float: + """Return the discounted price rounded to two decimal places. + + Args: + price: The original price (must be >= 0). + member_tier: One of "gold", "silver", "bronze", or None. + + Returns: + The discounted price as a float. + """ + if price < 0: + raise ValueError(f"price must be non-negative, got {price}") + if member_tier not in self.DISCOUNTS: + raise ValueError(f"unknown tier: {member_tier!r}") + discount = self.DISCOUNTS[member_tier] + return round(price * (1 - discount), 2) diff --git a/skills/test-unit-standards/evals/files/weak-assertions-no-failure.py b/skills/test-unit-standards/evals/files/weak-assertions-no-failure.py new file mode 100644 index 0000000..2517e3b --- /dev/null +++ b/skills/test-unit-standards/evals/files/weak-assertions-no-failure.py @@ -0,0 +1,48 @@ +"""Unit tests for OrderProcessor. + +VIOLATIONS — assertion completeness: + test_process_order_returns_something : asserts 'is not None' instead of a specific value + test_process_order_completes : asserts truthiness only (assert result) + test_reserve_stock_called : '.called is not None' is always True — not a real assertion + (no test) : missing failure path for out-of-stock inventory + (no test) : missing failure path for unknown SKU + (no test) : missing boundary test for quantity=0 + (no test) : missing boundary test for quantity=1 (lower success boundary) +""" +import pytest +from unittest.mock import MagicMock + +from orders.order_processor import OrderProcessor + + +@pytest.fixture +def processor(): + """OrderProcessor with stubbed inventory and notifier.""" + inv = MagicMock() + inv.reserve.return_value = True + notifier = MagicMock() + return OrderProcessor(inventory=inv, notifier=notifier) + + +def test_process_order_returns_something(processor): + """Processing a valid order returns a result.""" + result = processor.process(order_id="o1", quantity=2, sku="SKU-100") + assert result is not None # weak: no specific value asserted + + +def test_process_order_completes(processor): + """Processing a valid order completes without error.""" + result = processor.process(order_id="o2", quantity=1, sku="SKU-200") + assert result # weak: truthy check only + + +def test_reserve_stock_called(processor): + """Processing an order triggers inventory reservation.""" + processor.process(order_id="o3", quantity=5, sku="SKU-300") + assert processor.inventory.reserve.called is not None # always True — not a real assertion + + +# Missing: test for out-of-stock — what exception is raised? +# Missing: test for unknown SKU — what exception is raised? +# Missing: test for quantity=0 (boundary — should this raise or return empty order?) +# Missing: test for quantity=1 (lower boundary success case) diff --git a/skills/test-unit-standards/evals/fixture-map.md b/skills/test-unit-standards/evals/fixture-map.md new file mode 100644 index 0000000..1849ace --- /dev/null +++ b/skills/test-unit-standards/evals/fixture-map.md @@ -0,0 +1,52 @@ +# Unit Test Standards — Evals Fixture Map + +Links each eval test case to its fixture file(s). + +| Test ID | Category | Fixture | +|---|---|---| +| 1 | happy-path | evals/files/compliant-payment-tests.py | +| 2 | regression | evals/files/real-io-violations.py | +| 3 | regression | evals/files/private-member-access.py | +| 4 | regression | evals/files/weak-assertions-no-failure.py | +| 5 | regression | evals/files/copy-paste-setup.py | +| 6 | regression | evals/files/bad-naming-missing-boundaries.py | +| 7 | happy-path | evals/files/source-discount-calculator.py | +| 8 | negative | *(no file — docstring task)* | +| 9 | paraphrase | evals/files/real-io-violations.py | +| 10 | multi-violation | evals/files/multi-violation-audit.py | +| 11 | regression | evals/files/shared-mutable-state.py | +| 12 | regression | evals/files/missing-docstrings-stray-comments.py | +| 13 | negative | evals/files/mixed-source-and-tests.py | + +## Fixture → Rule mapping + +| Fixture file | Primary rule(s) exercised | +|---|---| +| compliant-payment-tests.py | All rules — no violations (reference implementation) | +| real-io-violations.py | Isolation — real I/O (filesystem, HTTP, DB) | +| private-member-access.py | Scope — private member access | +| weak-assertions-no-failure.py | Assertion completeness, boundary values | +| copy-paste-setup.py | Fixture management | +| bad-naming-missing-boundaries.py | Naming conventions, boundary values | +| source-discount-calculator.py | All rules — write-from-scratch task | +| shared-mutable-state.py | Isolation — shared mutable state between tests | +| missing-docstrings-stray-comments.py | Naming/structure — docstrings required, no stray comments | +| mixed-source-and-tests.py | Scope note — non-test task must not trigger rule enforcement | +| multi-violation-audit.py | All rules — isolation, scope, naming, assertions, fixtures, boundaries | + +## Coverage summary + +- happy-path: 2 +- regression: 7 +- negative: 2 +- paraphrase: 1 +- multi-violation: 1 +- **total: 13** + +## Trigger eval coverage + +| Direction | Count | +|---|---| +| should_trigger = true | 12 | +| should_trigger = false | 13 | +| **total** | **25** | diff --git a/skills/test-unit-standards/evals/trigger-eval.json b/skills/test-unit-standards/evals/trigger-eval.json new file mode 100644 index 0000000..d854325 --- /dev/null +++ b/skills/test-unit-standards/evals/trigger-eval.json @@ -0,0 +1,152 @@ +[ + { + "id": "t01-write-unit-tests-explicit", + "query": "Write unit tests for the PaymentService class.", + "should_trigger": true, + "reason": "Explicit test-writing request — core trigger for the skill." + }, + { + "id": "t02-review-unit-tests-explicit", + "query": "Can you review these unit tests and tell me if they follow best practices?", + "should_trigger": true, + "reason": "Explicit unit test review request." + }, + { + "id": "t03-audit-test-standards", + "query": "Audit this test file against our unit test standards.", + "should_trigger": true, + "reason": "Direct audit request using 'unit test standards' — primary trigger phrase." + }, + { + "id": "t04-test-isolation-check", + "query": "Are these tests properly isolated? I want to make sure they don't call real APIs.", + "should_trigger": true, + "reason": "Isolation check request — test isolation is explicitly listed in the skill description." + }, + { + "id": "t05-check-test-naming", + "query": "Do my test function names follow the right convention?", + "should_trigger": true, + "reason": "Test naming convention check — covered explicitly by the skill." + }, + { + "id": "t06-assertions-review", + "query": "Are the assertions in these tests strong enough, or am I missing edge cases?", + "should_trigger": true, + "reason": "Assertion completeness and boundary coverage — both listed in the skill description." + }, + { + "id": "t07-fixture-review", + "query": "I have a lot of copy-paste in my test setup — how should I refactor it using fixtures?", + "should_trigger": true, + "reason": "Fixture management question — fixture reuse is a named concern in the skill." + }, + { + "id": "t08-private-member-access", + "query": "Are any of these tests accessing private methods or internal state they shouldn't?", + "should_trigger": true, + "reason": "Private member access check — directly listed in the skill description." + }, + { + "id": "t09-write-tests-for-edge-cases", + "query": "Write me a test for the edge case where the input list is empty and one where it is None.", + "should_trigger": true, + "reason": "Test writing for boundary/null inputs — boundary values and empty/null inputs are explicitly in scope." + }, + { + "id": "t10-are-tests-up-to-scratch", + "query": "Are these tests up to scratch? I want to ship this service today.", + "should_trigger": true, + "reason": "Paraphrased test quality review — 'are these tests ok' maps to reviewing against test standards." + }, + { + "id": "t11-mocking-boundaries", + "query": "Should I be mocking the database layer in these tests, or is it fine to use a real connection?", + "should_trigger": true, + "reason": "Boundary mocking question — mock/stub decisions for I/O boundaries are within skill scope." + }, + { + "id": "t12-failure-path-coverage", + "query": "I only have happy-path tests for this service — can you help me add failure-path coverage?", + "should_trigger": true, + "reason": "Failure-path coverage — the skill explicitly requires at least one failure path per behaviour." + }, + { + "id": "n01-review-pr", + "query": "Review this PR before I merge it.", + "should_trigger": false, + "reason": "PR review request — routes to pr-review skill, not unit test standards." + }, + { + "id": "n02-implement-feature", + "query": "Implement a discount calculation function in Python.", + "should_trigger": false, + "reason": "Feature implementation request with no test-writing or test-review intent." + }, + { + "id": "n03-generate-docs", + "query": "Write docstrings for all the public methods in this module.", + "should_trigger": false, + "reason": "Documentation generation task — no test authoring or review requested." + }, + { + "id": "n04-terraform-module", + "query": "Create a Terraform module for an S3 bucket with versioning enabled.", + "should_trigger": false, + "reason": "Infrastructure-as-code task — routes to cps-iac skill." + }, + { + "id": "n05-why-test-failing", + "query": "Why is my test throwing AttributeError: 'NoneType' object has no attribute 'charge'?", + "should_trigger": false, + "reason": "Runtime debugging question about a test error — not a standards review or test-writing task." + }, + { + "id": "n06-refactor-service", + "query": "Refactor this service class for lower cyclomatic complexity.", + "should_trigger": false, + "reason": "Production code refactoring request — no test standards involvement." + }, + { + "id": "n07-create-github-issue", + "query": "Create a GitHub issue for the flaky integration test in CI.", + "should_trigger": false, + "reason": "Issue creation task — routes to create-issue skill." + }, + { + "id": "n08-kudos", + "query": "Give kudos to Alex for fixing the performance regression.", + "should_trigger": false, + "reason": "Recognition workflow — routes to kudos skill." + }, + { + "id": "n09-explain-code", + "query": "Explain how the payment gateway client works.", + "should_trigger": false, + "reason": "Code explanation request — no test writing or review in scope." + }, + { + "id": "n10-crossall-update", + "query": "Generate the crossall status update for this sprint.", + "should_trigger": false, + "reason": "Sprint reporting task — routes to crossall-changes skill." + }, + { + "id": "n11-integration-test-design", + "query": "Design an integration test plan for the full payment flow including the gateway sandbox.", + "should_trigger": false, + "reason": "Integration test planning — the skill covers unit tests only; integration tests deliberately use real I/O and are out of scope." + }, + { + "id": "n12-test-double-selection", + "query": "Should I use a spy or a stub for the email client in this test? What's the difference?", + "should_trigger": false, + "reason": "Test-double selection (spy vs stub vs mock) routes to the test-mocking-patterns skill; the skill description explicitly pairs with it for double selection and does not own this decision." + }, + { + "id": "n13-run-the-tests", + "query": "Can you run the unit tests for the PaymentService and show me the output?", + "should_trigger": false, + "reason": "Test execution request — the skill covers writing, reviewing, and auditing test standards, not running tests." + } +] diff --git a/skills/test-unit-standards/references/javascript-jest.md b/skills/test-unit-standards/references/javascript-jest.md new file mode 100644 index 0000000..30a7dbe --- /dev/null +++ b/skills/test-unit-standards/references/javascript-jest.md @@ -0,0 +1,197 @@ +# JavaScript / TypeScript + Jest — Language Reference + +Loaded by `test-unit-standards`, `test-unit-write`, and `test-unit-review` when JavaScript or +TypeScript is detected. Apply these conventions on top of the language-agnostic rules in +`test-unit-standards/SKILL.md`. + +--- + +## Test naming + +Use `describe` to group tests by unit under test, and `it` / `test` for individual cases. +Each case name must state the condition and expected outcome. + +```typescript +// ✅ +describe('applyDiscount', () => { + it('returns 20% off when tier is gold', () => { ... }); + it('returns 10% off when tier is silver', () => { ... }); + it('returns full price when tier is null', () => { ... }); + it('throws RangeError when price is negative', () => { ... }); + it('returns 0 when price is 0', () => { ... }); +}); + +// ❌ — vague, no condition or expected outcome +describe('discount', () => { + it('works', () => { ... }); + it('test1', () => { ... }); + it('handles edge cases', () => { ... }); +}); +``` + +Flat `test('unitName when condition returns expected', ...)` is also acceptable when `describe` +nesting would be unnecessary. + +--- + +## Private member convention + +Modern JavaScript / TypeScript uses `#` for truly private fields (hard private). Older codebases +use `_` or `__` by convention (soft private). Tests must not access either. + +```typescript +// ✅ — test through the public API +it('throws UnauthorizedError for an expired token', () => { + expect(() => validator.validate(expiredToken)).toThrow(UnauthorizedError); +}); + +// ❌ — accessing a hard-private field via type cast +it('extracts the sub claim from the payload', () => { + const result = (validator as any)['#decode'](token); // private — not allowed + expect(result.sub).toBe('user123'); +}); + +// ❌ — accessing a soft-private field +it('cache is populated after validation', () => { + validator.validate(validToken); + expect((validator as any)._cache).toContain('user123'); // private — not allowed +}); +``` + +--- + +## Test file placement + +``` +src/ +├── services/ +│ ├── payment.service.ts +│ └── payment.service.test.ts ← co-located, same directory +``` + +or a mirrored `__tests__` directory: + +``` +src/ +└── services/ + └── payment.service.ts +__tests__/ +└── services/ + └── payment.service.test.ts +``` + +Suffix: `.test.ts` or `.spec.ts`. One test file per source module. + +--- + +## Setup and teardown + +Use `beforeEach` / `afterEach` for per-test setup. Always call `jest.clearAllMocks()` in +`beforeEach` to prevent mock state from leaking between tests. + +```typescript +// ✅ +let mockCharge: jest.Mock; + +beforeEach(() => { + jest.clearAllMocks(); + mockCharge = jest.fn().mockResolvedValue({ id: 'ch_123', status: 'succeeded' }); +}); + +// ❌ — mock created once at module scope; state accumulates across tests +const mockCharge = jest.fn(); +``` + +--- + +## Mocking modules + +Use `jest.mock(...)` at the top of the file — Jest hoists it before `import` statements. + +```typescript +// ✅ — module mock hoisted automatically +jest.mock('../httpClient'); +import { httpClient } from '../httpClient'; + +const mockPost = httpClient.post as jest.Mock; + +beforeEach(() => { + jest.clearAllMocks(); + mockPost.mockResolvedValue({ data: { id: 'order_1' } }); +}); + +it('returns the created order id', async () => { + const result = await orderService.place('u1', 'SKU-1', 2); + expect(result.id).toBe('order_1'); +}); + +// ❌ — manual mock defined inside a test body; does not intercept the module import +it('returns the created order id', async () => { + const mockPost = jest.fn().mockResolvedValue({ data: { id: 'order_1' } }); + // ... mockPost is never injected into the module +}); +``` + +--- + +## Assertion style + +Use specific Jest matchers. Avoid `toBeTruthy()` or `toBeDefined()` when an exact value is known. + +```typescript +// ✅ +expect(result).toEqual({ id: 'order_1', status: 'placed' }); +expect(mockSend).toHaveBeenCalledWith('u1', expect.stringContaining('SKU-1')); +expect(mockSend).toHaveBeenCalledTimes(1); +expect(() => service.placeOrder('u1', 'SKU-X', -1)).toThrow(RangeError); +await expect(service.placeOrder('u1', '', 1)).rejects.toThrow('sku is required'); + +// ❌ — too weak +expect(result).toBeTruthy(); +expect(result).toBeDefined(); +expect(mockSend.mock.calls.length).not.toBe(0); // use toHaveBeenCalled() instead +expect(result).not.toBeNull(); // use toEqual with the actual value +``` + +--- + +## Exception and async error testing + +```typescript +// ✅ — synchronous throw +expect(() => applyDiscount(-1, 'gold')).toThrow(RangeError); +expect(() => applyDiscount(-1, 'gold')).toThrow('price must be non-negative'); + +// ✅ — async rejection +await expect(service.placeOrder('u1', 'SKU-X', 100)) + .rejects.toThrow(InsufficientStockError); + +// ❌ — silent failure on async errors +it('throws on insufficient stock', () => { // missing async/await + expect(service.placeOrder('u1', 'SKU-X', 100)).rejects.toThrow(...); +}); +``` + +--- + +## Parametrised tests + +Use `test.each` (table form) for multiple input variations of the same logic. + +```typescript +// ✅ +test.each([ + ['gold', 100, 80], + ['silver', 100, 90], + [null, 100, 100], + ['gold', 0, 0], // boundary +] as const)('applyDiscount(%s, %i) returns %i', (tier, price, expected) => { + expect(applyDiscount(price, tier)).toBe(expected); +}); + +// ❌ — four separate test functions testing the same logic +test('applyDiscount returns 80 for gold', () => expect(applyDiscount(100, 'gold')).toBe(80)); +test('applyDiscount returns 90 for silver', () => expect(applyDiscount(100, 'silver')).toBe(90)); +test('applyDiscount returns 100 for null', () => expect(applyDiscount(100, null)).toBe(100)); +test('applyDiscount returns 0 at boundary', () => expect(applyDiscount(0, 'gold')).toBe(0)); +``` diff --git a/skills/test-unit-standards/references/python-pytest.md b/skills/test-unit-standards/references/python-pytest.md new file mode 100644 index 0000000..9e848ac --- /dev/null +++ b/skills/test-unit-standards/references/python-pytest.md @@ -0,0 +1,217 @@ +# Python + pytest — Language Reference + +Loaded by `test-unit-standards`, `test-unit-write`, and `test-unit-review` when Python is detected. +Apply these conventions on top of the language-agnostic rules in `test-unit-standards/SKILL.md`. + +--- + +## Test naming + +Convention: `test___` + +```python +# ✅ +def test_apply_discount_with_gold_tier_returns_20_percent_off(): ... +def test_apply_discount_with_negative_price_raises_value_error(): ... +def test_place_order_with_zero_quantity_raises_value_error(): ... +def test_place_order_when_stock_insufficient_raises_insufficient_stock_error(): ... + +# ❌ — vague, missing condition or expected outcome +def test_discount(): ... +def test_it_works(): ... +def test_1(): ... +def test_apply_discount_works(): ... +``` + +--- + +## Private member convention + +Python marks private members with a leading underscore (`_name`) or name-mangling (`__name`). +Tests must not access either. + +```python +# ✅ — test through the public interface +def test_validate_with_expired_token_raises_unauthorized(validator): + with pytest.raises(UnauthorizedError): + validator.validate(expired_token) + +# ❌ — accessing private method directly +def test_decode_extracts_sub_claim(validator): + result = validator._decode(token) # private — not allowed + assert result["sub"] == "user123" + +# ❌ — accessing private attribute directly +def test_cache_is_populated(validator): + validator.validate(valid_token) + assert "user123" in validator._cache # private — not allowed +``` + +--- + +## Fixtures and conftest.py + +- Place shared fixtures in `conftest.py` at the nearest test directory level +- Use `@pytest.fixture` with explicit `scope` (`"function"` is default; `"session"` for expensive shared setup) +- Document every fixture with a docstring stating its purpose and any side effects + +```python +# ✅ tests/unit/conftest.py +import pytest +from unittest.mock import MagicMock + +@pytest.fixture +def payment_gateway(): + """Stubbed payment gateway. Returns a fixed successful charge response.""" + gateway = MagicMock() + gateway.charge.return_value = {"id": "ch_123", "status": "succeeded"} + return gateway + +@pytest.fixture +def order_service(payment_gateway): + """OrderService wired with the stubbed gateway. No real network calls.""" + return OrderService(gateway=payment_gateway) + +# ❌ — setup copy-pasted inside every test +def test_charge_returns_id(): + gateway = MagicMock() + gateway.charge.return_value = {"id": "ch_123", "status": "succeeded"} + service = OrderService(gateway=gateway) + ... + +def test_charge_raises_on_decline(): + gateway = MagicMock() # duplicated + gateway.charge.return_value = {"id": "ch_123", ...} # duplicated + service = OrderService(gateway=gateway) # duplicated + ... +``` + +--- + +## Assertion style + +Use pytest's native `assert` statement with specific, exact values. + +```python +# ✅ +assert result == {"id": "order_1", "status": "placed"} +assert result.discount_amount == pytest.approx(20.0) + +with pytest.raises(ValueError, match="quantity must be positive"): + service.place_order("u1", "SKU-1", 0) + +mock_notify.assert_called_once_with("u1", "Order placed for 2x SKU-1") + +# ❌ — too weak to prove behaviour +assert result is not None +assert result # truthy check only +assert mock.called is not None # always True — proves nothing +assert isinstance(result, dict) # proves type, not value +self.assertEqual(result, ...) # unittest style in a pytest file — inconsistent +``` + +--- + +## Exception testing + +```python +# ✅ — specific type and message match +with pytest.raises(InsufficientStockError, match="Only 2 units of SKU-X available"): + service.place_order("u1", "SKU-X", 10) + +# ✅ — capture and inspect the exception value +with pytest.raises(ValidationError) as exc_info: + service.place_order("u1", "", 1) +assert "sku" in str(exc_info.value).lower() + +# ❌ — catches any exception, hides real failures +try: + service.place_order("u1", "SKU-X", 10) +except Exception: + pass + +# ❌ — no assertion on type or message +with pytest.raises(Exception): + service.place_order("u1", "SKU-X", 10) +``` + +--- + +## Mocking + +Use `pytest-mock` (`mocker` fixture) or `unittest.mock.patch`. + +**Rule:** patch the name as it is **imported in the module under test**, not where it is defined. + +```python +# ✅ — patch where 'requests' is imported inside myapp.services.user_service +def test_fetch_user_returns_parsed_response(mocker): + mocker.patch( + "myapp.services.user_service.requests.get", + return_value=MagicMock(status_code=200, json=lambda: {"id": "u1"}), + ) + result = UserService().fetch("u1") + assert result == {"id": "u1"} + +# ❌ — patching the source module has no effect on the module under test +def test_fetch_user_returns_parsed_response(mocker): + mocker.patch("requests.get", return_value=...) +``` + +Prefer fixture-based patches over inline `with patch(...)` blocks to keep test bodies clean. + +```python +# ✅ — fixture-based patch +@pytest.fixture +def mock_requests_get(mocker): + """Patches requests.get in user_service; returns the mock for assertion.""" + return mocker.patch("myapp.services.user_service.requests.get") + +def test_fetch_user_calls_correct_url(mock_requests_get): + mock_requests_get.return_value = MagicMock(json=lambda: {"id": "u1"}) + UserService().fetch("u1") + mock_requests_get.assert_called_once_with("https://api.example.com/users/u1") +``` + +--- + +## Parametrize + +Use `@pytest.mark.parametrize` for multiple input variations of the same logical case. + +```python +# ✅ — one parametrized test instead of four separate functions +@pytest.mark.parametrize("price,tier,expected", [ + (100.0, "gold", 80.0), # 20% off + (100.0, "silver", 90.0), # 10% off + (100.0, None, 100.0), # no discount + (0.0, "gold", 0.0), # boundary: zero price +]) +def test_apply_discount_returns_correct_price(price, tier, expected): + assert apply_discount(price, tier) == pytest.approx(expected) + +# ❌ — four separate functions testing identical logic +def test_apply_discount_gold(): assert apply_discount(100.0, "gold") == 80.0 +def test_apply_discount_silver(): assert apply_discount(100.0, "silver") == 90.0 +def test_apply_discount_no_tier(): assert apply_discount(100.0, None) == 100.0 +def test_apply_discount_zero(): assert apply_discount(0.0, "gold") == 0.0 +``` + +--- + +## Test file placement + +``` +project/ +├── src/ +│ └── myapp/ +│ └── services/ +│ └── payment_service.py +└── tests/ + └── unit/ + ├── conftest.py ← shared fixtures for all unit tests + └── services/ + └── test_payment_service.py ← mirrors src/ path, prefixed with test_ +``` + +One test file per source module. Prefix test files with `test_`. Collect shared fixtures in `conftest.py`. diff --git a/skills/test-unit-standards/references/scala-munit.md b/skills/test-unit-standards/references/scala-munit.md new file mode 100644 index 0000000..039191c --- /dev/null +++ b/skills/test-unit-standards/references/scala-munit.md @@ -0,0 +1,193 @@ +# Scala + MUnit — Language Reference + +Loaded by `test-unit-standards`, `test-unit-write`, and `test-unit-review` when Scala is detected. +Apply these conventions on top of the language-agnostic rules in `test-unit-standards/SKILL.md`. + +--- + +## Test naming + +MUnit uses string literals as test names. Names must state the unit, condition, and expected outcome +in plain English. Prefer the pattern: `" returns/throws/emits "`. + +```scala +// ✅ +test("applyDiscount with gold tier returns 20% off") { ... } +test("applyDiscount with negative price throws IllegalArgumentException") { ... } +test("applyDiscount with zero price returns 0") { ... } +test("placeOrder when stock is insufficient throws InsufficientStockException") { ... } + +// ❌ — vague, no condition or expected outcome +test("discount") { ... } +test("it works") { ... } +test("test1") { ... } +``` + +For grouping, use a dedicated `Suite` class per unit under test: + +```scala +class ApplyDiscountSuite extends FunSuite { ... } +class OrderServiceSuite extends FunSuite { ... } +``` + +--- + +## Private member convention + +Scala marks private members with `private` or `private[package]` visibility modifiers. Tests must +not use reflection or other mechanisms to access them. + +```scala +// ✅ — test through the public API +test("validate with expired token throws UnauthorizedException") { + intercept[UnauthorizedException] { + validator.validate(expiredToken) + } +} + +// ❌ — accessing private method via reflection +test("decode extracts sub claim") { + val method = validator.getClass.getDeclaredMethod("decode", classOf[String]) + method.setAccessible(true) // private — not allowed + val result = method.invoke(validator, token) + assertEquals(result.asInstanceOf[Map[String, String]]("sub"), "user123") +} +``` + +--- + +## Test file placement + +``` +src/ +└── main/ + └── scala/ + └── com/example/services/ + └── OrderService.scala +src/ +└── test/ + └── scala/ + └── com/example/services/ + └── OrderServiceSuite.scala ← mirrors main/ path, suffixed with Suite +``` + +One `Suite` class per source class. Place shared helpers in a `TestSupport` trait or object. + +--- + +## Fixtures and setup + +MUnit provides `beforeEach` / `afterEach` hooks and `fixture` helpers for resource management. + +```scala +// ✅ — FunFixtures for managed resources +val stubGateway: Fixture[PaymentGateway] = FunFixture( + setup = _ => StubPaymentGateway(), + teardown = _ => () +) + +// ✅ — beforeEach for simple resets +var mockRepo: MockInventoryRepo = _ + +override def beforeEach(context: BeforeEach): Unit = { + mockRepo = new MockInventoryRepo() +} + +// ❌ — shared mutable val at class level, never reset between tests +val mockRepo = new MockInventoryRepo() // state accumulates — tests become order-dependent +``` + +--- + +## Assertion style + +Use MUnit's `assertEquals`, `assertThrows`, and `interceptMessage` for specific assertions. + +```scala +// ✅ +assertEquals(applyDiscount(100.0, Tier.Gold), 80.0) +assertEquals(result, Order(userId = "u1", sku = "SKU-1", status = "placed")) + +assertThrows[InsufficientStockException] { + orderService.placeOrder("u1", "SKU-X", 100) +} + +// ✅ — check exception message +val ex = interceptMessage[IllegalArgumentException]("quantity must be positive") { + orderService.placeOrder("u1", "SKU-1", 0) +} + +// ❌ — too weak +assert(result != null) +assert(result.isInstanceOf[Order]) // proves type, not value +assert(called) // truthy check — use specific verify instead +``` + +--- + +## Mocking (mockito-scala) + +```scala +import org.mockito.MockitoSugar._ + +// ✅ — stub a return value +val repo = mock[InventoryRepository] +when(repo.getStock("SKU-1")).thenReturn(10) + +// ✅ — verify a call +verify(repo).reserve("SKU-1", 2) + +// ✅ — argument matchers for irrelevant args +when(repo.getStock(any[String])).thenReturn(5) + +// ❌ — over-specifying unrelated arguments makes the test brittle +verify(repo).reserve(eqTo("SKU-1"), eqTo(2)) // fine when both matter +verify(repo).reserve(any(), eqTo(2)) // ✅ when only qty matters +``` + +Reset mocks between tests using `reset(mock)` in `beforeEach`, or declare mocks inside the test +body / fixture so they are scoped per test. + +--- + +## Data-driven tests + +Use `List(...).foreach` or `FunSuite`'s table helpers for multiple input variations. + +```scala +// ✅ +List( + ("gold", 100.0, 80.0), + ("silver", 100.0, 90.0), + ("bronze", 100.0, 95.0), + ("gold", 0.0, 0.0), // boundary +).foreach { case (tier, price, expected) => + test(s"applyDiscount $tier $price returns $expected") { + assertEquals(applyDiscount(price, Tier.withName(tier)), expected) + } +} + +// ❌ — separate test per variation +test("applyDiscount gold 100 returns 80") { assertEquals(applyDiscount(100.0, Tier.Gold), 80.0) } +test("applyDiscount silver 100 returns 90") { assertEquals(applyDiscount(100.0, Tier.Silver), 90.0) } +``` + +--- + +## Async tests + +MUnit supports `Future`-returning tests natively. Always return the `Future` from the test body. + +```scala +// ✅ +test("placeOrder resolves with order id") { + orderService.placeOrder("u1", "SKU-1", 2).map { order => + assertEquals(order.id.isEmpty, false) + } +} + +// ❌ — fire-and-forget; test passes before the Future completes +test("placeOrder resolves with order id") { + orderService.placeOrder("u1", "SKU-1", 2) // Future not returned — assertion never runs +} +``` diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md new file mode 100644 index 0000000..93abb37 --- /dev/null +++ b/skills/test-unit-write/SKILL.md @@ -0,0 +1,103 @@ +--- +name: test-unit-write +description: > + Procedural skill for writing unit tests from scratch or extending partial coverage. Activate + when the user asks to write, generate, add, or scaffold unit tests for a function, class, or + module — including requests for partial coverage such as "add failure-path tests", "add boundary + tests", or "I need tests for this method". Also activate for "help me test this" and "what tests + should I write for X?". + Triggers on: "write unit tests for", "add tests for", "generate test cases for", "help me test + this", "add coverage for", "scaffold tests for", "I need unit tests", "write tests that cover", + "what tests should I write", "add failure-path coverage", "add edge case tests". + Does NOT trigger for: reviewing or auditing existing tests (use test-unit-review), choosing test + doubles (use test-mocking-patterns), managing test data strategies (use test-data-management), + debugging failing tests at runtime, integration or e2e test planning, or documentation writing. + Pairs with test-unit-standards and test-mocking-patterns. +--- + +# Unit Test Writer + +**For reviewing tests**, use **test-unit-review**. **For test doubles**, use **test-mocking-patterns**. + +## Step 1 — Load language reference + +Identify language/framework from file extension, imports, build files. Load reference: + +| Language / framework | Reference file | +|---|---| +| Python + pytest | `../test-unit-standards/references/python-pytest.md` | +| JavaScript / TypeScript + Jest | `../test-unit-standards/references/javascript-jest.md` | +| Scala + MUnit | `../test-unit-standards/references/scala-munit.md` | + +Apply language-specific naming, fixture, assertion guidance throughout. + +## Step 2 — Analyse source + +Read file. Identify: +- Public API (functions, methods, constructors) +- Dependencies (args, services, modules) +- I/O boundaries (HTTP, DB, files, env, clock, randomness) +- Return values & side effects +- Failure conditions (exceptions, guards, errors) + +**Do not write tests yet.** + +## Step 3 — Choose mock strategy + +| Dependency | Strategy | +|---|---| +| HTTP / API | Stub with canned response | +| DB / ORM / repo | Stub repo/DAO layer | +| File I/O | Stub in-memory or `tmp_path` | +| Env vars | Monkeypatch at boundary | +| Clock | Inject/patch to fixed datetime | +| Randomness / UUID | Patch deterministically | +| Pure helpers | Call real implementation | + +**Unsure on mock vs stub vs spy vs fake?** Consult **test-mocking-patterns**. + +## Step 4 — Scaffold test file + +1. Determine file location (language reference) +2. Write imports (framework, mocking, unit under test) +3. Declare shared fixtures for mocks (Step 3) +4. Document fixture purpose & side effects + +**Do not write test functions yet.** + +## Step 5 — Write test cases + +Order: 1) Happy path, 2) Failure paths (≥1 per behaviour), 3) Edge/boundary cases (empty, zero, None, max/min). + +**Extending existing file:** Add only new tests; do not modify/remove existing. + +Per test: +- Name per language reference +- Docstring: scenario in plain language +- Public interface only, no private access +- One logical behaviour (multiple asserts OK if collectively confirm single outcome) +- **One test per exception type + condition** — do not combine invalid inputs. Each deserves own test. Use `pytest.raises` context manager; use `pytest.mark.parametrize` for many inputs. + +## Step 6 — Assert correctly + +Verify per language reference: +- No `is not None` / `toBeDefined()` / `assert result` where exact value known +- Exception tests assert specific type & message (where meaningful) +- Side-effect assertions use framework matcher, not manual truthy +- Float comparisons use approximate matcher + +## Step 7 — Run and validate + +1. Run test suite +2. Confirm all new tests pass +3. Fix before returning if any fail — no broken tests returned +4. If runner unavailable, state explicitly and note tests unverified + +## Demo format (format/structure examples) + +For *"show what tests look like"* requests: +- Names: `test___` +- One-line docstring per test +- Fixtures via `@pytest.fixture` with `MagicMock()` +- **Exactly one `assert` or `pytest.raises()`** — keep minimal/focused. Real files may have multiple asserts (see Step 5). + diff --git a/skills/test-unit-write/evals/evals.json b/skills/test-unit-write/evals/evals.json new file mode 100644 index 0000000..615ee8d --- /dev/null +++ b/skills/test-unit-write/evals/evals.json @@ -0,0 +1,150 @@ +{ + "skill_name": "test-unit-write", + "evals": [ + { + "id": 1, + "category": "happy-path", + "prompt": "Write unit tests for the OrderService class in evals/files/source-order-service.py. Cover all public methods.", + "expected_output": "Output produces a complete, compliant test file: fixtures stub InventoryRepository and NotificationClient (no real I/O); place_order happy path, ValueError for empty sku and quantity<=0, InsufficientStockError, and boundary quantity=1 are all tested; cancel_order happy path (pending order), already-shipped return value, and empty order_id ValueError are tested; test names follow test___; assertions are specific; no private members accessed.", + "files": [ + "evals/files/source-order-service.py" + ], + "expectations": [ + "Shared fixtures stub InventoryRepository and NotificationClient — no real I/O", + "place_order happy path is tested and returns expected dict fields", + "place_order with empty sku raises ValueError", + "place_order with quantity <= 0 raises ValueError", + "place_order when stock < quantity raises InsufficientStockError", + "place_order confirms inventory.reserve is called with correct args", + "place_order confirms notifications.send is called", + "cancel_order with a non-shipped order returns True", + "cancel_order with a shipped order returns False", + "cancel_order with empty order_id raises ValueError", + "Test names follow test___ convention", + "Assertions use specific values, not is-not-None or truthy checks", + "No private members (_inventory, _notifications) are accessed" + ] + }, + { + "id": 2, + "category": "happy-path", + "prompt": "Write unit tests for PriceCalculator.calculate() in evals/files/source-price-calculator.py. Make sure all I/O is mocked and all paths are covered.", + "expected_output": "Output stubs the promotions HTTP client (no real requests.get); tests cover: gold/silver/bronze tier discounts, no-tier (None) case, negative base_price ValueError, unrecognised tier ValueError, promo_code with successful API response, promo_code when API returns non-200 raises PromotionError, zero price boundary, and combination of tier + promo_code discount. Env var PROMOTIONS_API_URL is either monkeypatched or unused because the client is injected. Test names are descriptive.", + "files": [ + "evals/files/source-price-calculator.py" + ], + "expectations": [ + "promotions HTTP client is stubbed — no real requests.get calls", + "Gold tier discount (20%) is tested", + "Silver tier discount (10%) is tested", + "Bronze tier discount (5%) is tested", + "None tier (no discount) is tested", + "Negative base_price raises ValueError", + "Unrecognised tier raises ValueError", + "promo_code path with a 200 API response applies extra discount", + "promo_code path with a non-200 API response raises PromotionError", + "Boundary: base_price = 0 returns 0", + "Test names follow the naming convention from the language reference" + ] + }, + { + "id": 3, + "category": "extension", + "prompt": "The file evals/files/partial-happy-path-only.py has unit tests for OrderService but only covers the happy path. Add tests for all missing failure paths and boundary values without modifying the existing tests.", + "expected_output": "Output adds tests for: place_order with quantity=0 raises ValueError; place_order with quantity negative raises ValueError; place_order with empty sku raises ValueError; place_order when inventory.get_stock returns less than quantity raises InsufficientStockError; cancel_order with a shipped order returns False; cancel_order with a non-shipped pending order returns True; cancel_order with empty order_id raises ValueError. Existing tests are left unchanged.", + "files": [ + "evals/files/partial-happy-path-only.py" + ], + "expectations": [ + "place_order with quantity=0 is tested and raises ValueError", + "place_order with empty sku is tested and raises ValueError", + "place_order when insufficient stock is tested and raises InsufficientStockError", + "cancel_order with a shipped order is tested and returns False", + "cancel_order with empty order_id is tested and raises ValueError", + "Existing three tests are not modified", + "New test names follow test___ convention", + "No real I/O is introduced in the new tests" + ] + }, + { + "id": 4, + "category": "negative", + "prompt": "Can you review these unit tests and tell me what's wrong with them? evals/files/partial-happy-path-only.py", + "expected_output": "This is a review request, not a test-writing request. The skill routes this to test-unit-review rather than generating new tests. If handled here: it identifies missing failure paths and boundary coverage but does not generate new test code.", + "files": [ + "evals/files/partial-happy-path-only.py" + ], + "expectations": [ + "Skill does not confuse a review request with a write request", + "If the skill proceeds, it identifies missing coverage rather than generating tests", + "No new test code is generated as part of a review-only response" + ] + }, + { + "id": 5, + "category": "negative", + "prompt": "Should I use a spy or a mock for the notification_client in these tests?", + "expected_output": "This is a test-double selection question that routes to test-mocking-patterns. test-unit-write does not own double selection decisions.", + "files": [], + "expectations": [ + "Skill does not generate test code for a double-selection question", + "Response correctly defers to test-mocking-patterns for spy vs mock guidance" + ] + }, + { + "id": 6, + "category": "paraphrase", + "prompt": "Add tests for the new refund logic in RefundService — cover the happy path, the already-refunded rejection, and the partial refund boundary.", + "expected_output": "Agent produces three unit tests: (1) happy path — refund succeeds for a valid pending order, asserts returned refund ID and status; (2) already-refunded rejection — calling refund on an already-refunded order raises AlreadyRefundedError with the order ID in the message; (3) partial refund boundary — refund amount equal to the total order amount succeeds; amount greater than total raises OverRefundError. All dependencies are stubbed via fixtures. Test names follow test___.", + "files": [], + "expectations": [ + "Three test functions covering the three specified paths", + "Happy path asserts returned refund ID and status specifically", + "AlreadyRefundedError test asserts exception type and message content", + "Boundary test covers amount == total (pass) and amount > total (fail)", + "All dependencies are stubbed — no real I/O", + "Test names follow test___" + ] + }, + { + "id": 7, + "category": "edge-case", + "prompt": "The method I'm testing raises different exceptions for different invalid inputs. How should I structure the tests — one test per exception or one test for all invalid inputs?", + "expected_output": "One test per exception type and triggering condition. Each invalid input deserves its own focused test because: (1) the test name documents the specific invalid condition; (2) a single test with multiple asserts stops at the first failure, hiding subsequent failures; (3) mutation testing requires each branch to be covered by a dedicated test. Use pytest.raises as a context manager in each test. If many inputs trigger the same exception type, use pytest.mark.parametrize to table them.", + "files": [], + "expectations": [ + "Recommends one test per exception type and triggering condition", + "Explains why combined tests are worse: mask failures, hurt mutation coverage", + "Recommends pytest.raises as context manager", + "Notes pytest.mark.parametrize for multiple inputs triggering the same exception" + ] + }, + { + "id": 8, + "category": "output-format", + "prompt": "Show me what unit tests generated by the skill look like — naming, structure, fixtures.", + "expected_output": "Generated tests appear in a single fenced ```python code block. Function names follow test___ convention. Each test function has a one-line docstring. Dependencies are patched via pytest fixtures using mocker.MagicMock(). Each test ends with exactly one assert or one pytest.raises() call. No test function imports additional modules beyond what is available in the fixture.", + "files": [], + "expectations": [ + "Tests in a single fenced ```python code block", + "test___ naming convention", + "One-line docstring per test", + "mocker.MagicMock() for dependencies", + "Exactly one assert or pytest.raises() per test" + ] + }, + { + "id": 9, + "category": "regression", + "prompt": "Write unit tests for this method: def get_discount_label(amount): return 'BIG' if amount > 100 else 'SMALL'", + "expected_output": "Agent generates two tests: one for amount > 100 (expects 'BIG') and one for amount <= 100 (expects 'SMALL'). Critically, each test asserts the exact return value string — NOT 'result is not None', NOT 'result is truthy', NOT 'assert result'. The regression being checked: a previous version of the skill generated 'assert result' (truthy check) instead of 'assert result == \"BIG\"'. The boundary value 100 is covered: amount=100 returns 'SMALL', amount=101 returns 'BIG'.", + "files": [], + "expectations": [ + "Two tests: one for amount > 100, one for amount <= 100", + "Assertions use exact string values ('BIG', 'SMALL') — not truthy checks", + "Boundary value 100 is covered", + "No 'assert result' or 'assert result is not None' style assertions" + ] + } + ] +} diff --git a/skills/test-unit-write/evals/files/partial-happy-path-only.py b/skills/test-unit-write/evals/files/partial-happy-path-only.py new file mode 100644 index 0000000..433c3c8 --- /dev/null +++ b/skills/test-unit-write/evals/files/partial-happy-path-only.py @@ -0,0 +1,51 @@ +""" +Partial tests for OrderService — happy path only. + +These tests cover the successful place_order scenario but are missing: +- Failure paths (ValueError for bad inputs, InsufficientStockError) +- Boundary values (quantity=1, empty sku) +- cancel_order coverage entirely +""" + +import pytest +from unittest.mock import MagicMock +from source_order_service import OrderService + + +@pytest.fixture +def inventory(): + repo = MagicMock() + repo.get_stock.return_value = 10 + return repo + + +@pytest.fixture +def notifications(): + return MagicMock() + + +@pytest.fixture +def service(inventory, notifications): + return OrderService(inventory_repo=inventory, notification_client=notifications) + + +def test_place_order_with_valid_inputs_returns_order_dict(service): + """Happy path: valid user, sku, and quantity returns a well-formed order.""" + result = service.place_order("user-1", "SKU-A", 3) + + assert result["user_id"] == "user-1" + assert result["sku"] == "SKU-A" + assert result["quantity"] == 3 + assert result["status"] == "placed" + + +def test_place_order_reserves_stock(service, inventory): + """Happy path: placing an order calls reserve on the inventory repo.""" + service.place_order("user-1", "SKU-A", 3) + inventory.reserve.assert_called_once_with("SKU-A", 3) + + +def test_place_order_sends_notification(service, notifications): + """Happy path: placing an order sends a notification to the user.""" + service.place_order("user-1", "SKU-A", 3) + notifications.send.assert_called_once() diff --git a/skills/test-unit-write/evals/files/source-order-service.py b/skills/test-unit-write/evals/files/source-order-service.py new file mode 100644 index 0000000..92de1f8 --- /dev/null +++ b/skills/test-unit-write/evals/files/source-order-service.py @@ -0,0 +1,70 @@ +""" +OrderService — source under test. + +Handles placing and cancelling orders. Depends on an InventoryRepository +and a NotificationClient injected at construction time. +""" + +from datetime import datetime, timezone + + +class InsufficientStockError(Exception): + """Raised when the requested quantity exceeds available stock.""" + + +class OrderService: + def __init__(self, inventory_repo, notification_client): + self._inventory = inventory_repo + self._notifications = notification_client + + def place_order(self, user_id: str, sku: str, quantity: int) -> dict: + """Place an order for the given SKU and quantity. + + Returns: + dict with keys: user_id, sku, quantity, status, placed_at + + Raises: + ValueError: if quantity <= 0 or sku is empty + InsufficientStockError: if available stock < quantity + """ + if not sku: + raise ValueError("sku must not be empty") + if quantity <= 0: + raise ValueError("quantity must be positive") + + stock = self._inventory.get_stock(sku) + if stock < quantity: + raise InsufficientStockError( + f"Only {stock} units of {sku} available" + ) + + self._inventory.reserve(sku, quantity) + order = { + "user_id": user_id, + "sku": sku, + "quantity": quantity, + "status": "placed", + "placed_at": datetime.now(timezone.utc).isoformat(), + } + self._notifications.send(user_id, f"Order placed for {quantity}x {sku}") + return order + + def cancel_order(self, order_id: str) -> bool: + """Cancel an existing order. + + Returns: + True if cancelled successfully. + False if the order has already shipped and cannot be cancelled. + + Raises: + ValueError: if order_id is empty + """ + if not order_id: + raise ValueError("order_id must not be empty") + + status = self._inventory.get_order_status(order_id) + if status == "shipped": + return False + + self._inventory.release_order(order_id) + return True diff --git a/skills/test-unit-write/evals/files/source-price-calculator.py b/skills/test-unit-write/evals/files/source-price-calculator.py new file mode 100644 index 0000000..f53b00a --- /dev/null +++ b/skills/test-unit-write/evals/files/source-price-calculator.py @@ -0,0 +1,61 @@ +""" +PriceCalculator — source under test (JavaScript/TypeScript equivalent in Python for eval purposes). + +Calculates the final price for a cart item applying member tier discounts and +promotional codes fetched from an external promotions API. +""" + +import os +import requests + + +TIER_DISCOUNTS = { + "gold": 0.20, + "silver": 0.10, + "bronze": 0.05, +} + + +class PromotionError(Exception): + """Raised when the promotions API returns an unexpected response.""" + + +class PriceCalculator: + def __init__(self, promotions_client=None): + self._promotions = promotions_client or requests + + def calculate(self, base_price: float, tier: str | None, promo_code: str | None) -> float: + """Calculate the final price after tier discount and optional promo code. + + Args: + base_price: Original item price. Must be >= 0. + tier: Member tier ('gold', 'silver', 'bronze') or None for no tier discount. + promo_code: Optional promotional code to look up additional % off. + + Returns: + Final price (float, >= 0). + + Raises: + ValueError: if base_price < 0 or tier is not a recognised value. + PromotionError: if the promotions API returns a non-200 response. + """ + if base_price < 0: + raise ValueError(f"base_price must be non-negative, got {base_price}") + if tier is not None and tier not in TIER_DISCOUNTS: + raise ValueError(f"unrecognised tier: {tier!r}") + + price = base_price + if tier: + price *= 1.0 - TIER_DISCOUNTS[tier] + + if promo_code: + api_url = os.environ.get("PROMOTIONS_API_URL", "https://promos.example.com") + response = self._promotions.get(f"{api_url}/codes/{promo_code}") + if response.status_code != 200: + raise PromotionError( + f"promotions API returned {response.status_code} for code {promo_code!r}" + ) + extra_discount = response.json().get("discount_pct", 0) / 100 + price *= 1.0 - extra_discount + + return round(price, 2) diff --git a/skills/test-unit-write/evals/fixture-map.md b/skills/test-unit-write/evals/fixture-map.md new file mode 100644 index 0000000..116077c --- /dev/null +++ b/skills/test-unit-write/evals/fixture-map.md @@ -0,0 +1,34 @@ +# Unit Test Write — Evals Fixture Map + +Links each eval test case to its fixture file(s). + +| Test ID | Category | Fixture | +|---------|-----------|---------| +| 1 | happy-path | evals/files/source-order-service.py | +| 2 | happy-path | evals/files/source-price-calculator.py | +| 3 | extension | evals/files/partial-happy-path-only.py | +| 4 | negative | evals/files/partial-happy-path-only.py | +| 5 | negative | *(no file — double-selection question)* | + +## Fixture → Scenario mapping + +| Fixture file | Scenario exercised | +|---|---| +| source-order-service.py | Write full test suite from scratch; two deps to stub; multiple failure paths | +| source-price-calculator.py | Write tests for a class with HTTP dep and env var; promo code path adds extra double | +| partial-happy-path-only.py | Extend existing tests with failure and boundary coverage without modifying existing tests | + +## Coverage summary + +- happy-path: 2 +- extension: 1 +- negative: 2 +- **total: 5** + +## Trigger eval coverage + +| Direction | Count | +|---|---| +| should_trigger = true | 10 | +| should_trigger = false | 8 | +| **total** | **18** | diff --git a/skills/test-unit-write/evals/trigger-eval.json b/skills/test-unit-write/evals/trigger-eval.json new file mode 100644 index 0000000..f6601e1 --- /dev/null +++ b/skills/test-unit-write/evals/trigger-eval.json @@ -0,0 +1,110 @@ +[ + { + "id": "w-t01-write-tests-explicit", + "query": "Write unit tests for the OrderService class.", + "should_trigger": true, + "reason": "Explicit test-writing request — primary trigger." + }, + { + "id": "w-t02-add-tests-for-method", + "query": "Add unit tests for the place_order method.", + "should_trigger": true, + "reason": "Explicit request to add tests for a specific method." + }, + { + "id": "w-t03-generate-test-cases", + "query": "Generate test cases for the PriceCalculator.calculate() function.", + "should_trigger": true, + "reason": "'Generate test cases' is a listed trigger phrase." + }, + { + "id": "w-t04-help-me-test", + "query": "Help me test this service — it has a database dependency.", + "should_trigger": true, + "reason": "'Help me test this' is a listed trigger phrase." + }, + { + "id": "w-t05-scaffold-tests", + "query": "Scaffold the tests for this module following our standards.", + "should_trigger": true, + "reason": "'Scaffold tests for' is a listed trigger phrase." + }, + { + "id": "w-t06-add-failure-coverage", + "query": "I only have happy-path tests — can you add failure-path coverage for me?", + "should_trigger": true, + "reason": "'Add failure-path coverage' is a listed trigger phrase for extension scenarios." + }, + { + "id": "w-t07-what-tests-to-write", + "query": "What tests should I write for this function?", + "should_trigger": true, + "reason": "'What tests should I write for X' is a listed trigger phrase." + }, + { + "id": "w-t08-add-edge-case-tests", + "query": "Add edge case tests for the boundary values in this calculator.", + "should_trigger": true, + "reason": "'Add edge case tests' is a listed trigger phrase." + }, + { + "id": "w-t09-i-need-unit-tests", + "query": "I need unit tests for the NotificationService before I merge this.", + "should_trigger": true, + "reason": "'I need unit tests' is a listed trigger phrase." + }, + { + "id": "w-t10-add-coverage-for", + "query": "Add coverage for the cancel_order and refund paths.", + "should_trigger": true, + "reason": "'Add coverage for' is a listed trigger phrase." + }, + { + "id": "w-n01-review-tests", + "query": "Review these unit tests and tell me what's wrong with them.", + "should_trigger": false, + "reason": "Review request — routes to test-unit-review, not test-unit-write." + }, + { + "id": "w-n02-audit-test-standards", + "query": "Audit this test file against our unit test standards.", + "should_trigger": false, + "reason": "Audit/review request — routes to test-unit-review." + }, + { + "id": "w-n03-spy-vs-mock", + "query": "Should I use a spy or a mock for the email client?", + "should_trigger": false, + "reason": "Test-double selection — routes to test-mocking-patterns." + }, + { + "id": "w-n04-test-data-factory", + "query": "How should I organise my test data with a factory function?", + "should_trigger": false, + "reason": "Test data strategy — routes to test-data-management." + }, + { + "id": "w-n05-debug-failing-test", + "query": "Why is my test throwing AttributeError: 'NoneType' object has no attribute 'charge'?", + "should_trigger": false, + "reason": "Runtime debugging of a failing test — not a test-writing task." + }, + { + "id": "w-n06-integration-test", + "query": "Design an integration test plan for the full payment flow with the gateway sandbox.", + "should_trigger": false, + "reason": "Integration test planning — the skill covers unit tests only." + }, + { + "id": "w-n07-write-docstring", + "query": "Write a docstring for the OrderService class.", + "should_trigger": false, + "reason": "Documentation task — no test authoring intent." + }, + { + "id": "w-n08-refactor", + "query": "Refactor this service to reduce its cyclomatic complexity.", + "should_trigger": false, + "reason": "Source code refactoring — no test writing requested." + } +] From 96ab7c86a60c6e2e0eee00d73ac0656c585430ba Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 15:49:26 +0200 Subject: [PATCH 02/12] Improved fixtures. --- skills/test-unit-review/evals/evals.json | 32 ++++++++++++++++ .../evals/files/flaky-shared-mock-state.py | 36 ++++++++++++++++++ .../files/integration-test-in-unit-suite.py | 37 ++++++++++++++++++ skills/test-unit-review/evals/fixture-map.md | 18 +++++++-- .../test-unit-review/evals/trigger-eval.json | 12 ++++++ skills/test-unit-standards/evals/evals.json | 16 ++++++++ .../evals/files/framework-idiom-misuse.py | 38 +++++++++++++++++++ .../test-unit-standards/evals/fixture-map.md | 17 ++++++--- .../evals/trigger-eval.json | 38 +++++++++++++------ skills/test-unit-write/evals/evals.json | 19 ++++++++++ .../files/source-parametrized-discounts.py | 37 ++++++++++++++++++ skills/test-unit-write/evals/fixture-map.md | 18 +++++++-- .../test-unit-write/evals/trigger-eval.json | 12 ++++++ 13 files changed, 305 insertions(+), 25 deletions(-) create mode 100644 skills/test-unit-review/evals/files/flaky-shared-mock-state.py create mode 100644 skills/test-unit-review/evals/files/integration-test-in-unit-suite.py create mode 100644 skills/test-unit-standards/evals/files/framework-idiom-misuse.py create mode 100644 skills/test-unit-write/evals/files/source-parametrized-discounts.py diff --git a/skills/test-unit-review/evals/evals.json b/skills/test-unit-review/evals/evals.json index eba3eed..dd01434 100644 --- a/skills/test-unit-review/evals/evals.json +++ b/skills/test-unit-review/evals/evals.json @@ -122,6 +122,38 @@ "Empty sections show (none)", "Final compliance status line present" ] + }, + { + "id": 9, + "category": "regression", + "prompt": "Are these UserService tests reliable? Can they run in any order and pass consistently? File: evals/files/flaky-shared-mock-state.py", + "expected_output": "Review flags isolation violation: a module-level mutable variable _mock_user is shared across all tests. test_delete_user_clears_cache depends on test_create_user_sets_id running first; if test execution order changes or tests run in parallel, the suite fails. Each test must be fully independent with its own fixtures. Recommend moving _mock_user inside each test or using a function-scoped fixture.", + "files": [ + "evals/files/flaky-shared-mock-state.py" + ], + "expectations": [ + "Module-level _mock_user variable is flagged as shared mutable state", + "test_delete_user_clears_cache is flagged as order-dependent on test_create_user_sets_id", + "Reviewer identifies that parallel execution will break the suite", + "Reviewer recommends moving state into test functions or function-scoped fixtures", + "Isolation rule (no shared state, runnable in any order) is cited" + ] + }, + { + "id": 10, + "category": "regression", + "prompt": "I have some unit tests but one of them makes a real Kafka call. Is that a problem? File: evals/files/integration-test-in-unit-suite.py", + "expected_output": "Review flags isolation violation: test_publish_event_to_kafka makes a real outbound call to a live Kafka cluster (bootstrap_servers=['localhost:9092']). This is an integration test, not a unit test, and violates test isolation. It should either (1) be removed from the unit suite and moved to integration tests, or (2) be rewritten to stub the KafkaProducer. The test also has no meaningful assertion (just calls real I/O).", + "files": [ + "evals/files/integration-test-in-unit-suite.py" + ], + "expectations": [ + "test_publish_event_to_kafka is flagged as an integration test, not a unit test", + "Real KafkaProducer and bootstrap_servers are flagged as real I/O", + "Reviewer explains that integration tests violate unit test isolation", + "Reviewer recommends moving to an integration suite or stubbing the producer", + "Isolation rule is cited as the violated standard" + ] } ] } diff --git a/skills/test-unit-review/evals/files/flaky-shared-mock-state.py b/skills/test-unit-review/evals/files/flaky-shared-mock-state.py new file mode 100644 index 0000000..abf4483 --- /dev/null +++ b/skills/test-unit-review/evals/files/flaky-shared-mock-state.py @@ -0,0 +1,36 @@ +# Flaky test suite: shared mock state causes order-dependent failures +# Problem: _mock_user is a module-level variable shared across all tests +# When run in one order, tests pass; in another order (or parallel), they fail + +import pytest +from unittest.mock import MagicMock + +_mock_user = None # Shared mock state — VIOLATION of isolation rule + +class TestUserService: + def test_create_user_sets_id(self): + """When create_user is called, the returned user has a non-None ID.""" + global _mock_user + service = UserService() + _mock_user = MagicMock(id=123, name="Alice") + result = service.get_user(_mock_user.id) + assert result.id == _mock_user.id + + def test_delete_user_clears_cache(self): + """When delete_user is called, the user is removed from the service.""" + global _mock_user + # This test depends on test_create_user_sets_id having run first. + # If it runs first, _mock_user is None, and the test fails. + assert _mock_user is not None + service = UserService() + service.delete_user(_mock_user.id) + result = service.get_user(_mock_user.id) + assert result is None + + def test_get_nonexistent_user_returns_none(self): + """When get_user is called with an unknown ID, None is returned.""" + service = UserService() + # This test should be independent, but if test_delete_user_clears_cache + # ran first and modified _mock_user, this test may see stale state. + result = service.get_user(999) + assert result is None diff --git a/skills/test-unit-review/evals/files/integration-test-in-unit-suite.py b/skills/test-unit-review/evals/files/integration-test-in-unit-suite.py new file mode 100644 index 0000000..28faac4 --- /dev/null +++ b/skills/test-unit-review/evals/files/integration-test-in-unit-suite.py @@ -0,0 +1,37 @@ +# Mixed unit and integration tests in one suite +# Problem: test_publish_event_to_kafka makes a real outbound Kafka call (integration test) +# should not be in the unit test suite + +import pytest +from unittest.mock import MagicMock +import kafka + +class TestEventPublisher: + @pytest.fixture + def publisher(self): + """Event publisher with stubbed Kafka producer.""" + return EventPublisher( + producer=MagicMock(), + logger=MagicMock() + ) + + def test_publish_creates_message_dict(self, publisher): + """When publish_event is called, the message dict includes timestamp.""" + result = publisher.publish_event({"type": "order", "id": 123}) + assert "timestamp" in result + assert result["type"] == "order" + + def test_publish_event_to_kafka(self, publisher): + """VIOLATION: This is an integration test, not a unit test. + It publishes a real message to a live Kafka cluster, violating test isolation.""" + # Real Kafka connection — should be removed or moved to integration suite + real_kafka = kafka.KafkaProducer(bootstrap_servers=['localhost:9092']) + real_kafka.send('events-topic', {'type': 'order', 'id': 456}) + real_kafka.flush() + # Assertion checks real Kafka, not the EventPublisher behaviour + # This defeats the purpose of unit testing + + def test_logger_is_called_on_publish(self, publisher): + """When publish_event is called, the logger is invoked.""" + publisher.publish_event({"type": "user", "id": 789}) + publisher.logger.info.assert_called() diff --git a/skills/test-unit-review/evals/fixture-map.md b/skills/test-unit-review/evals/fixture-map.md index 2d48496..dc62777 100644 --- a/skills/test-unit-review/evals/fixture-map.md +++ b/skills/test-unit-review/evals/fixture-map.md @@ -9,6 +9,11 @@ Links each eval test case to its fixture file(s). | 3 | regression | evals/files/missing-coverage-subscription-tests.py | | 4 | negative | *(no file — write request routed away)* | | 5 | negative | *(no file — runtime debugging question)* | +| 6 | paraphrase | *(no file — informal phrasing)* | +| 7 | edge-case | *(no file — pytest.approx guidance)* | +| 8 | output-format | *(no file — format documentation)* | +| 9 | regression | evals/files/flaky-shared-mock-state.py | +| 10 | regression | evals/files/integration-test-in-unit-suite.py | ## Fixture → Rule mapping @@ -17,19 +22,24 @@ Links each eval test case to its fixture file(s). | compliant-auth-service-tests.py | All six categories — no violations (confirms reviewer does not fabricate) | | multi-violation-report-tests.py | All six categories — isolation, scope, naming, assertions, coverage, fixtures | | missing-coverage-subscription-tests.py | Coverage completeness and weak assertions | +| flaky-shared-mock-state.py | Isolation — shared mutable state, order-dependent tests | +| integration-test-in-unit-suite.py | Isolation — real I/O (Kafka), integration test in unit suite | ## Coverage summary - happy-path: 1 - multi-violation: 1 -- regression: 1 +- regression: 3 - negative: 2 -- **total: 5** +- paraphrase: 1 +- edge-case: 1 +- output-format: 1 +- **total: 10** ## Trigger eval coverage | Direction | Count | |---|---| | should_trigger = true | 11 | -| should_trigger = false | 6 | -| **total** | **17** | +| should_trigger = false | 8 | +| **total** | **19** | diff --git a/skills/test-unit-review/evals/trigger-eval.json b/skills/test-unit-review/evals/trigger-eval.json index f0c830a..7405912 100644 --- a/skills/test-unit-review/evals/trigger-eval.json +++ b/skills/test-unit-review/evals/trigger-eval.json @@ -100,5 +100,17 @@ "query": "Design an integration test plan for the checkout flow.", "should_trigger": false, "reason": "Integration test planning is out of scope for this skill." + }, + { + "id": "r-n07-flaky-tests-ci", + "query": "Why are my tests flaky when run in parallel in CI but not locally?", + "should_trigger": false, + "reason": "Debugging runtime behaviour and CI infrastructure — not a standards review of test code quality." + }, + { + "id": "r-n08-parallel-execution", + "query": "Should I run unit tests in parallel or sequentially?", + "should_trigger": false, + "reason": "Test execution strategy and CI infrastructure configuration — outside the standards review scope." } ] diff --git a/skills/test-unit-standards/evals/evals.json b/skills/test-unit-standards/evals/evals.json index de3695a..d71f63b 100644 --- a/skills/test-unit-standards/evals/evals.json +++ b/skills/test-unit-standards/evals/evals.json @@ -261,6 +261,22 @@ "Fix suggestions in fenced code blocks where applicable", "Explicit compliance confirmation when no Blocker/Important findings" ] + }, + { + "id": 17, + "category": "regression", + "prompt": "Can you review these PriceValidator tests for framework idiom issues? File: evals/files/framework-idiom-misuse.py", + "expected_output": "Review flags a pytest idiom violation: test_negative_price_raises_error uses pytest.raises(ValueError, match=\"Price must be non-negative\") where match is a regex parameter, not a literal substring. The match parameter expects a compiled regex or regex string; a literal match parameter that is not a valid regex will not work correctly. Correct usage: match=r'Price must be' or match='Price must' (a valid regex substring). This is not a broken test (the other test shows correct usage), but a potential edge case where exact strings as match parameters can fail if they contain regex metacharacters.", + "files": [ + "evals/files/framework-idiom-misuse.py" + ], + "expectations": [ + "test_negative_price_raises_error match parameter is reviewed for regex safety", + "Reviewer explains that match is a regex, not a literal string", + "Reviewer notes test_unknown_currency_raises_error is correct", + "Reviewer clarifies the idiom (match parameter must be a valid regex)", + "Exception assertion rule is cited" + ] } ] } diff --git a/skills/test-unit-standards/evals/files/framework-idiom-misuse.py b/skills/test-unit-standards/evals/files/framework-idiom-misuse.py new file mode 100644 index 0000000..e6a7af7 --- /dev/null +++ b/skills/test-unit-standards/evals/files/framework-idiom-misuse.py @@ -0,0 +1,38 @@ +# Framework idiom misuse: pytest.raises with wrong parameters +# Problem: match parameter is used with a literal string, not a regex + +import pytest +from decimal import Decimal + +class PriceValidator: + def validate(self, price, currency): + """Validate price and currency. Raise ValueError if invalid.""" + if price < 0: + raise ValueError("Price must be non-negative") + if currency not in ["USD", "EUR", "GBP"]: + raise ValueError(f"Unknown currency: {currency}") + return True + +class TestPriceValidator: + def test_negative_price_raises_error(self): + """When validate is called with negative price, ValueError is raised.""" + validator = PriceValidator() + # VIOLATION: match is a regex pattern, not a literal string. + # This will fail because match expects a regex, not a literal substring. + # Correct: use match=r'Price must be non-negative' (regex) + # or use match='Price must be' (substring of a regex) + with pytest.raises(ValueError, match="Price must be non-negative"): + validator.validate(-10, "USD") + + def test_unknown_currency_raises_error(self): + """When validate is called with unknown currency, ValueError is raised.""" + validator = PriceValidator() + # This one is correct: match is a substring pattern + with pytest.raises(ValueError, match="Unknown currency"): + validator.validate(100, "XYZ") + + def test_valid_price_and_currency(self): + """When validate is called with valid inputs, return True.""" + validator = PriceValidator() + result = validator.validate(Decimal('99.99'), "EUR") + assert result is True diff --git a/skills/test-unit-standards/evals/fixture-map.md b/skills/test-unit-standards/evals/fixture-map.md index 1849ace..5d61124 100644 --- a/skills/test-unit-standards/evals/fixture-map.md +++ b/skills/test-unit-standards/evals/fixture-map.md @@ -17,6 +17,10 @@ Links each eval test case to its fixture file(s). | 11 | regression | evals/files/shared-mutable-state.py | | 12 | regression | evals/files/missing-docstrings-stray-comments.py | | 13 | negative | evals/files/mixed-source-and-tests.py | +| 14 | paraphrase | *(no file — informal phrasing)* | +| 15 | edge-case | *(no file — tmp_path guidance)* | +| 16 | output-format | *(no file — format documentation)* | +| 17 | regression | evals/files/framework-idiom-misuse.py | ## Fixture → Rule mapping @@ -33,20 +37,23 @@ Links each eval test case to its fixture file(s). | missing-docstrings-stray-comments.py | Naming/structure — docstrings required, no stray comments | | mixed-source-and-tests.py | Scope note — non-test task must not trigger rule enforcement | | multi-violation-audit.py | All rules — isolation, scope, naming, assertions, fixtures, boundaries | +| framework-idiom-misuse.py | Assertions — pytest.raises idiom (regex match parameter) | ## Coverage summary - happy-path: 2 -- regression: 7 +- regression: 8 - negative: 2 -- paraphrase: 1 +- paraphrase: 2 - multi-violation: 1 -- **total: 13** +- edge-case: 1 +- output-format: 1 +- **total: 17** ## Trigger eval coverage | Direction | Count | |---|---| | should_trigger = true | 12 | -| should_trigger = false | 13 | -| **total** | **25** | +| should_trigger = false | 18 | +| **total** | **30** | diff --git a/skills/test-unit-standards/evals/trigger-eval.json b/skills/test-unit-standards/evals/trigger-eval.json index d854325..19ddb37 100644 --- a/skills/test-unit-standards/evals/trigger-eval.json +++ b/skills/test-unit-standards/evals/trigger-eval.json @@ -5,12 +5,7 @@ "should_trigger": true, "reason": "Explicit test-writing request — core trigger for the skill." }, - { - "id": "t02-review-unit-tests-explicit", - "query": "Can you review these unit tests and tell me if they follow best practices?", - "should_trigger": true, - "reason": "Explicit unit test review request." - }, + { "id": "t03-audit-test-standards", "query": "Audit this test file against our unit test standards.", @@ -47,12 +42,7 @@ "should_trigger": true, "reason": "Private member access check — directly listed in the skill description." }, - { - "id": "t09-write-tests-for-edge-cases", - "query": "Write me a test for the edge case where the input list is empty and one where it is None.", - "should_trigger": true, - "reason": "Test writing for boundary/null inputs — boundary values and empty/null inputs are explicitly in scope." - }, + { "id": "t10-are-tests-up-to-scratch", "query": "Are these tests up to scratch? I want to ship this service today.", @@ -148,5 +138,29 @@ "query": "Can you run the unit tests for the PaymentService and show me the output?", "should_trigger": false, "reason": "Test execution request — the skill covers writing, reviewing, and auditing test standards, not running tests." + }, + { + "id": "n14-review-tests-explicit", + "query": "Can you review these unit tests and tell me if they follow best practices?", + "should_trigger": false, + "reason": "Test review request — routes to test-unit-review, not test-unit-standards. Standards is a reference checklist, not the primary workflow trigger." + }, + { + "id": "n15-write-boundary-tests", + "query": "Write me a test for the edge case where the input list is empty and one where it is None.", + "should_trigger": false, + "reason": "Test-writing request — routes to test-unit-write, not test-unit-standards. Test writing is a procedural workflow." + }, + { + "id": "n16-refactor-for-clarity", + "query": "Refactor this test file to reduce duplication in the setup logic.", + "should_trigger": false, + "reason": "Test refactoring request — while fixture reuse is mentioned, refactoring code is not a standards audit or checklist application." + }, + { + "id": "n17-write-docstring", + "query": "Write docstrings for all the test functions in this file.", + "should_trigger": false, + "reason": "Documentation generation task — no test authoring or standards audit requested." } ] diff --git a/skills/test-unit-write/evals/evals.json b/skills/test-unit-write/evals/evals.json index 615ee8d..27efe24 100644 --- a/skills/test-unit-write/evals/evals.json +++ b/skills/test-unit-write/evals/evals.json @@ -145,6 +145,25 @@ "Boundary value 100 is covered", "No 'assert result' or 'assert result is not None' style assertions" ] + }, + { + "id": 10, + "category": "happy-path", + "prompt": "Write unit tests for DiscountCalculator.calculate_discount() using pytest.mark.parametrize to cover all tiers. File: evals/files/source-parametrized-discounts.py", + "expected_output": "Output uses pytest.mark.parametrize to test all four tiers (gold, silver, bronze, standard) and None with expected discount rates. Additionally, tests for failure cases: negative amount (ValueError), unknown tier (ValueError). Assertions verify exact discount amounts (e.g. amount * 0.8 for gold tier). No test duplication; a single parametrized test covers all success paths. Test names clearly indicate parametrized nature (e.g. test_calculate_discount_with_tier[gold-20%]).", + "files": [ + "evals/files/source-parametrized-discounts.py" + ], + "expectations": [ + "pytest.mark.parametrize is used for multiple tier inputs", + "All four tiers (gold, silver, bronze, standard) are covered", + "None tier is tested (no discount)", + "Failure path for negative amount is tested", + "Failure path for unknown tier is tested", + "Assertions verify exact discount amounts, not just non-None", + "No test duplication", + "Test names reflect parametrized values" + ] } ] } diff --git a/skills/test-unit-write/evals/files/source-parametrized-discounts.py b/skills/test-unit-write/evals/files/source-parametrized-discounts.py new file mode 100644 index 0000000..8c2ddc9 --- /dev/null +++ b/skills/test-unit-write/evals/files/source-parametrized-discounts.py @@ -0,0 +1,37 @@ +# Source file for test-unit-write eval: parametrized test generation +# This file defines a discount calculator with multiple tiers. +# Tests should use pytest.mark.parametrize to cover all tiers. + +class DiscountCalculator: + def __init__(self, exchange_rate=1.0): + self.exchange_rate = exchange_rate + + def calculate_discount(self, amount, tier): + """Calculate discount for given amount and tier. + + Args: + amount: float, the order amount + tier: str, one of: gold (20%), silver (10%), bronze (5%), standard (0%) + + Returns: + float, the discounted amount + + Raises: + ValueError if amount < 0 or tier not recognised + """ + if amount < 0: + raise ValueError("Amount must be non-negative") + + rates = { + "gold": 0.20, + "silver": 0.10, + "bronze": 0.05, + "standard": 0.00, + None: 0.00 + } + + if tier not in rates: + raise ValueError(f"Unknown tier: {tier}") + + discount_rate = rates[tier] + return amount * (1 - discount_rate) * self.exchange_rate diff --git a/skills/test-unit-write/evals/fixture-map.md b/skills/test-unit-write/evals/fixture-map.md index 116077c..9154c28 100644 --- a/skills/test-unit-write/evals/fixture-map.md +++ b/skills/test-unit-write/evals/fixture-map.md @@ -9,6 +9,11 @@ Links each eval test case to its fixture file(s). | 3 | extension | evals/files/partial-happy-path-only.py | | 4 | negative | evals/files/partial-happy-path-only.py | | 5 | negative | *(no file — double-selection question)* | +| 6 | paraphrase | *(no file — informal phrasing)* | +| 7 | edge-case | *(no file — structure guidance)* | +| 8 | output-format | *(no file — format documentation)* | +| 9 | regression | *(no file — simple method guidance)* | +| 10 | happy-path | evals/files/source-parametrized-discounts.py | ## Fixture → Scenario mapping @@ -17,18 +22,23 @@ Links each eval test case to its fixture file(s). | source-order-service.py | Write full test suite from scratch; two deps to stub; multiple failure paths | | source-price-calculator.py | Write tests for a class with HTTP dep and env var; promo code path adds extra double | | partial-happy-path-only.py | Extend existing tests with failure and boundary coverage without modifying existing tests | +| source-parametrized-discounts.py | Use pytest.mark.parametrize to cover multiple tiers and values; avoid test duplication | ## Coverage summary -- happy-path: 2 +- happy-path: 3 - extension: 1 - negative: 2 -- **total: 5** +- paraphrase: 1 +- edge-case: 1 +- output-format: 1 +- regression: 1 +- **total: 10** ## Trigger eval coverage | Direction | Count | |---|---| | should_trigger = true | 10 | -| should_trigger = false | 8 | -| **total** | **18** | +| should_trigger = false | 11 | +| **total** | **21** | diff --git a/skills/test-unit-write/evals/trigger-eval.json b/skills/test-unit-write/evals/trigger-eval.json index f6601e1..dae8c28 100644 --- a/skills/test-unit-write/evals/trigger-eval.json +++ b/skills/test-unit-write/evals/trigger-eval.json @@ -106,5 +106,17 @@ "query": "Refactor this service to reduce its cyclomatic complexity.", "should_trigger": false, "reason": "Source code refactoring — no test writing requested." + }, + { + "id": "w-n09-test-data-strategy", + "query": "How should I organise my test data with a factory builder pattern? Should each test create its own data?", + "should_trigger": false, + "reason": "Test data strategy — routes to test-data-management, not test-unit-write (which uses fixtures, not factories)." + }, + { + "id": "w-n10-language-choice", + "query": "Should I write my unit tests in Python or TypeScript?", + "should_trigger": false, + "reason": "Technology choice and architecture decision — not a test-writing task." } ] From f2307ce0effa2a79edd6c15b6694369bd98f8935 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 16:18:40 +0200 Subject: [PATCH 03/12] Improved overlap --- skills/test-unit-review/SKILL.md | 4 ++-- skills/test-unit-standards/SKILL.md | 1 + skills/test-unit-write/SKILL.md | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index 5a30d75..41abe61 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -72,7 +72,7 @@ For each category, list every violation found before moving to the next. - Weak assertions (`assert result`, `assert result is not None`) where exact value known? - Missing assertions (test body with no assert)? - Side-effect assertions using manual truthy instead of framework matcher? -- **Note:** `pytest.approx` is **correct** for floats — do not flag. Only flag genuine weak assertions. +- **Note:** `pytest.approx` is **correct** for floats — do not flag. It is a specific assertion that verifies the value within a tolerance, stronger than truthy or `is not None` checks and kills arithmetic operator mutants. Only flag genuine weak assertions. ### 4.5 Coverage @@ -87,7 +87,7 @@ For each category, list every violation found before moving to the next. ## Step 5 — Report findings -Group by severity. Cite test name, line, rule, fix suggestion. Empty sections show `(none)`. +Group by severity. Cite test name, line, rule, fix suggestion. Empty sections show `(none)`. End with a line confirming overall compliance or summarizing blockers/important issues. ### Blocker diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index adcfea4..867fef0 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -59,6 +59,7 @@ When no violations found, explicitly confirm compliance for each category. - Cover success + ≥1 failure path per behaviour - Cover boundary values & empty/null inputs - Specific over generic assertions (see language reference) +- **Note on exception assertions:** When using `pytest.raises(..., match=...)`, the `match` parameter accepts a regex pattern. Plain literal strings (e.g., `match="Invalid user"`) are valid regex patterns. Only flag if the string contains unescaped regex metacharacters (e.g., `match="Price."` without escaping the dot) and the test intends a literal match. ## Fixtures diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 93abb37..09a088b 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -76,7 +76,7 @@ Per test: - Docstring: scenario in plain language - Public interface only, no private access - One logical behaviour (multiple asserts OK if collectively confirm single outcome) -- **One test per exception type + condition** — do not combine invalid inputs. Each deserves own test. Use `pytest.raises` context manager; use `pytest.mark.parametrize` for many inputs. +- **One test per exception type + condition** — do not combine invalid inputs. Each deserves own test. Exception: a single boundary path may cover the last valid and first invalid value together (e.g. `amount == total` succeeds, `amount > total` raises). Use `pytest.raises` context manager; use `pytest.mark.parametrize` for many inputs. ## Step 6 — Assert correctly From 97730a9dec4151c1239aaab92bca2a7b028c5605 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 16:31:57 +0200 Subject: [PATCH 04/12] triggers improved --- skills/test-unit-review/SKILL.md | 21 ++++++++++++--------- skills/test-unit-standards/SKILL.md | 26 +++++++++++++++----------- skills/test-unit-write/SKILL.md | 27 +++++++++++++++------------ 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index 41abe61..33f743b 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -1,18 +1,21 @@ --- name: test-unit-review description: > - Procedural skill for reviewing and auditing existing unit tests against team standards. - Activate when the user asks to review, audit, check, or give feedback on an existing test - file — including informal requests and sharing a test file for a quality opinion. + Review and audit existing unit tests against quality standards. Activate when the user + asks to review, critique, give feedback, check quality, or audit any existing test file + — including informal requests like "does this look right?" or sharing a test for a quality + opinion. This skill evaluates tests for isolation, scope, naming, assertions, coverage, and + fixture reuse. Triggers on: "review these tests", "audit this test file", "check my tests", "are these tests good enough", "do these tests follow our standards", "LGTM on tests", "does this test look right", "is this test missing anything", "any issues with my tests", - "give feedback on this test file", "are these tests up to scratch". - Does NOT trigger for: writing new tests (use test-unit-write), choosing test doubles - (use test-mocking-patterns), managing test data strategies (use test-data-management), - debugging why a test fails at runtime, PR reviews not specifically about test quality, - or integration test planning (use test-integration-standards). - Pairs with test-unit-standards (rules checklist) and test-unit-write (if new tests needed). + "give feedback on this test file", "are these tests up to scratch", "are these tests + properly isolated", "test isolation check", "assertion quality". + Does NOT trigger for: writing new tests (use test-unit-write), adding tests or coverage + (use test-unit-write), choosing test doubles or mocking strategy (use test-mocking-patterns), + managing test data strategies (use test-data-management), debugging failing tests at runtime, + general PR review (use pr-review), or integration test planning (use test-integration-standards). + Pairs with test-unit-write (to add missing tests) and test-unit-standards (for reference). license: Proprietary compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index 867fef0..f1d6fd7 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -1,17 +1,21 @@ --- name: test-unit-standards description: > - Unit test quality — writing, reviewing, and auditing unit tests against team standards. Activate - for any unit test quality concern: writing new unit tests, extending coverage, reviewing or - auditing existing test files, or asking about test conventions and structural rules. - Triggers on: "write unit tests", "add tests for", "review these tests", "audit this test file", - "are these tests good enough", "unit test standards", "test isolation rules", "test naming - convention", "assertion standards", "fixture reuse rules", "private member in tests", - "failure-path coverage", "add edge case tests", "are these tests up to scratch". - Does NOT trigger for: PR review (use pr-review), test-double selection — spy vs stub (use - test-mocking-patterns), test data strategy (use test-data-management), debugging a failing test - at runtime, running tests, integration test planning, or non-test tasks. - Pairs with test-unit-write and test-unit-review for procedural workflows. + Reference guide and checklist for unit test standards — test isolation, scope, naming, + assertions, coverage, and fixtures. Use this skill to ask questions about test conventions, + structural rules, and best practices. This is a reference skill that pairs with test-unit-write + and test-unit-review for procedural workflows. + Triggers on: "unit test standards", "test isolation rules", "test naming convention", + "assertion standards", "fixture reuse rules", "private member in tests", + "failure-path coverage", "add edge case tests", "are these tests good enough", + "test conventions", "what are the rules for", "how should I structure my tests", + "best practices for unit tests", "should I be mocking". + Does NOT trigger for: writing new unit tests (use test-unit-write), reviewing or auditing + existing tests (use test-unit-review), choosing test doubles — spy vs stub vs mock (use + test-mocking-patterns), managing test data strategies (use test-data-management), debugging + a failing test at runtime, running tests, integration test planning, general PR review (use + pr-review), or non-test tasks. + Pairs with test-unit-write (primary workflow) and test-unit-review (procedural review workflow). license: Proprietary compatibility: GitHub Copilot --- diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 09a088b..182d437 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -1,18 +1,21 @@ --- name: test-unit-write description: > - Procedural skill for writing unit tests from scratch or extending partial coverage. Activate - when the user asks to write, generate, add, or scaffold unit tests for a function, class, or - module — including requests for partial coverage such as "add failure-path tests", "add boundary - tests", or "I need tests for this method". Also activate for "help me test this" and "what tests - should I write for X?". - Triggers on: "write unit tests for", "add tests for", "generate test cases for", "help me test - this", "add coverage for", "scaffold tests for", "I need unit tests", "write tests that cover", - "what tests should I write", "add failure-path coverage", "add edge case tests". - Does NOT trigger for: reviewing or auditing existing tests (use test-unit-review), choosing test - doubles (use test-mocking-patterns), managing test data strategies (use test-data-management), - debugging failing tests at runtime, integration or e2e test planning, or documentation writing. - Pairs with test-unit-standards and test-mocking-patterns. + Write unit tests from scratch or extend partial coverage for functions, classes, or modules. + Activate when the user asks to write, generate, add, or scaffold unit tests — including + requests for partial coverage such as "add failure-path tests", "add boundary tests", or + "I need tests for this method". Also activate for "help me test this" and "what tests + should I write for X?". This skill covers test isolation, naming, assertions, and coverage. + Triggers on: "write unit tests for", "add tests for", "generate test cases for", + "help me test this", "add coverage for", "scaffold tests for", "I need unit tests", + "write tests that cover", "what tests should I write", "add failure-path coverage", + "add edge case tests", "add boundary tests", "test this method", "generate test". + Does NOT trigger for: reviewing or auditing existing tests (use test-unit-review), asking + about test standards or conventions (use test-unit-standards), choosing test doubles — spy + vs stub vs mock (use test-mocking-patterns), managing test data strategies (use test-data-management), + debugging failing tests at runtime, integration or e2e test planning, documentation writing, + or general code refactoring. + Pairs with test-unit-standards (reference) and test-mocking-patterns (test-double selection). --- # Unit Test Writer From 60dd75f0b066ae9c2c47397f711db92f0eb7047b Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 16:41:54 +0200 Subject: [PATCH 05/12] trigger improved --- skills/test-unit-standards/SKILL.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index f1d6fd7..2cf62af 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -3,19 +3,19 @@ name: test-unit-standards description: > Reference guide and checklist for unit test standards — test isolation, scope, naming, assertions, coverage, and fixtures. Use this skill to ask questions about test conventions, - structural rules, and best practices. This is a reference skill that pairs with test-unit-write - and test-unit-review for procedural workflows. - Triggers on: "unit test standards", "test isolation rules", "test naming convention", - "assertion standards", "fixture reuse rules", "private member in tests", - "failure-path coverage", "add edge case tests", "are these tests good enough", - "test conventions", "what are the rules for", "how should I structure my tests", - "best practices for unit tests", "should I be mocking". - Does NOT trigger for: writing new unit tests (use test-unit-write), reviewing or auditing - existing tests (use test-unit-review), choosing test doubles — spy vs stub vs mock (use - test-mocking-patterns), managing test data strategies (use test-data-management), debugging - a failing test at runtime, running tests, integration test planning, general PR review (use - pr-review), or non-test tasks. - Pairs with test-unit-write (primary workflow) and test-unit-review (procedural review workflow). + structural rules, best practices, and to audit test files against standards. Activate when + the user asks about standards, conventions, or wants to understand what makes a good test. + Triggers on: "write unit tests", "add tests for", "audit this test file", "review these tests", + "unit test standards", "test isolation rules", "test naming convention", "assertion standards", + "fixture reuse rules", "private member in tests", "failure-path coverage", "are these tests + good enough", "test conventions", "what are the rules for", "how should I structure my tests", + "best practices for unit tests", "should I be mocking", "do these tests follow our standards". + Does NOT trigger for: writing specific test cases or test code (use test-unit-write for "write a test for + the edge case where..."), choosing test doubles — spy vs stub vs mock (use test-mocking-patterns), + managing test data strategies (use test-data-management), debugging a failing test at runtime, + running tests, refactoring test code for clarity/duplication, integration test planning, general + PR review (use pr-review), or non-test tasks. + Pairs with test-unit-write and test-unit-review — standards is the reference layer they consult. license: Proprietary compatibility: GitHub Copilot --- From 32b641835f91d71e60b8aac8c8f825141913416c Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 17:22:29 +0200 Subject: [PATCH 06/12] Add unit tests for various services, addressing naming conventions, fixture management, and isolation violations --- .../files/bad-naming-missing-boundaries.py | 0 .../evals/files/copy-paste-setup.py | 0 .../evals/files/framework-idiom-misuse.py | 0 .../missing-docstrings-stray-comments.py | 0 .../evals/files/private-member-access.py | 0 .../evals/files/real-io-violations.py | 0 .../evals/files/weak-assertions-no-failure.py | 0 .../evals/files/compliant-payment-tests.py | 97 ------------------- .../evals/files/multi-violation-audit.py | 92 ------------------ .../evals/files/shared-mutable-state.py | 37 ------- .../evals/files/source-discount-calculator.py | 45 --------- 11 files changed, 271 deletions(-) rename skills/{test-unit-standards => test-unit-review}/evals/files/bad-naming-missing-boundaries.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/copy-paste-setup.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/framework-idiom-misuse.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/missing-docstrings-stray-comments.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/private-member-access.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/real-io-violations.py (100%) rename skills/{test-unit-standards => test-unit-review}/evals/files/weak-assertions-no-failure.py (100%) delete mode 100644 skills/test-unit-standards/evals/files/compliant-payment-tests.py delete mode 100644 skills/test-unit-standards/evals/files/multi-violation-audit.py delete mode 100644 skills/test-unit-standards/evals/files/shared-mutable-state.py delete mode 100644 skills/test-unit-standards/evals/files/source-discount-calculator.py diff --git a/skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py b/skills/test-unit-review/evals/files/bad-naming-missing-boundaries.py similarity index 100% rename from skills/test-unit-standards/evals/files/bad-naming-missing-boundaries.py rename to skills/test-unit-review/evals/files/bad-naming-missing-boundaries.py diff --git a/skills/test-unit-standards/evals/files/copy-paste-setup.py b/skills/test-unit-review/evals/files/copy-paste-setup.py similarity index 100% rename from skills/test-unit-standards/evals/files/copy-paste-setup.py rename to skills/test-unit-review/evals/files/copy-paste-setup.py diff --git a/skills/test-unit-standards/evals/files/framework-idiom-misuse.py b/skills/test-unit-review/evals/files/framework-idiom-misuse.py similarity index 100% rename from skills/test-unit-standards/evals/files/framework-idiom-misuse.py rename to skills/test-unit-review/evals/files/framework-idiom-misuse.py diff --git a/skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py b/skills/test-unit-review/evals/files/missing-docstrings-stray-comments.py similarity index 100% rename from skills/test-unit-standards/evals/files/missing-docstrings-stray-comments.py rename to skills/test-unit-review/evals/files/missing-docstrings-stray-comments.py diff --git a/skills/test-unit-standards/evals/files/private-member-access.py b/skills/test-unit-review/evals/files/private-member-access.py similarity index 100% rename from skills/test-unit-standards/evals/files/private-member-access.py rename to skills/test-unit-review/evals/files/private-member-access.py diff --git a/skills/test-unit-standards/evals/files/real-io-violations.py b/skills/test-unit-review/evals/files/real-io-violations.py similarity index 100% rename from skills/test-unit-standards/evals/files/real-io-violations.py rename to skills/test-unit-review/evals/files/real-io-violations.py diff --git a/skills/test-unit-standards/evals/files/weak-assertions-no-failure.py b/skills/test-unit-review/evals/files/weak-assertions-no-failure.py similarity index 100% rename from skills/test-unit-standards/evals/files/weak-assertions-no-failure.py rename to skills/test-unit-review/evals/files/weak-assertions-no-failure.py diff --git a/skills/test-unit-standards/evals/files/compliant-payment-tests.py b/skills/test-unit-standards/evals/files/compliant-payment-tests.py deleted file mode 100644 index e829566..0000000 --- a/skills/test-unit-standards/evals/files/compliant-payment-tests.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Unit tests for PaymentService. - -Tests cover the public interface only. All I/O (gateway, logger) is stubbed via -fixtures. Shared fixtures are defined once; no copy-paste setup. -""" -import pytest -from unittest.mock import MagicMock - -from services.payment_service import PaymentService, InsufficientFundsError, PaymentError - - -# --- fixtures --- - - -@pytest.fixture -def gateway(): - """Stub for the external payment gateway; returns a canned success response.""" - mock = MagicMock() - mock.charge.return_value = {"status": "ok", "transaction_id": "txn_001"} - return mock - - -@pytest.fixture -def logger(): - """Stub logger so log-call assertions can be made without real I/O.""" - return MagicMock() - - -@pytest.fixture -def service(gateway, logger): - """PaymentService wired to stub dependencies.""" - return PaymentService(gateway=gateway, logger=logger) - - -# --- charge: success path --- - - -def test_charge_valid_amount_returns_transaction_id(service, gateway): - """Charging a positive amount returns the transaction ID supplied by the gateway.""" - result = service.charge(amount=100, currency="ZAR", account_id="acc_1") - assert result == "txn_001" - - -def test_charge_valid_amount_calls_gateway_once(service, gateway): - """Exactly one gateway call is made per charge invocation.""" - service.charge(amount=100, currency="ZAR", account_id="acc_1") - gateway.charge.assert_called_once_with(100, "ZAR", "acc_1") - - -def test_charge_success_emits_info_log(service, logger): - """A single info-level log entry is emitted after a successful charge.""" - service.charge(amount=100, currency="ZAR", account_id="acc_1") - logger.info.assert_called_once() - - -# --- charge: boundary values --- - - -def test_charge_minimum_positive_amount_succeeds(service): - """Charging the minimum positive amount (0.01) completes without error.""" - result = service.charge(amount=0.01, currency="ZAR", account_id="acc_1") - assert result == "txn_001" - - -# --- charge: failure paths --- - - -def test_charge_zero_amount_raises_value_error(service): - """Charging zero is rejected with ValueError before contacting the gateway.""" - with pytest.raises(ValueError, match="amount must be positive"): - service.charge(amount=0, currency="ZAR", account_id="acc_1") - - -def test_charge_negative_amount_raises_value_error(service): - """Negative amounts are rejected immediately without reaching the gateway.""" - with pytest.raises(ValueError, match="amount must be positive"): - service.charge(amount=-50, currency="ZAR", account_id="acc_1") - - -def test_charge_gateway_timeout_raises_payment_error(service, gateway): - """A gateway timeout is surfaced to the caller as a PaymentError.""" - gateway.charge.side_effect = PaymentError("gateway timeout") - with pytest.raises(PaymentError): - service.charge(amount=100, currency="ZAR", account_id="acc_1") - - -def test_charge_insufficient_funds_raises_insufficient_funds_error(service, gateway): - """Gateway returning insufficient-funds propagates as InsufficientFundsError.""" - gateway.charge.side_effect = InsufficientFundsError("balance too low") - with pytest.raises(InsufficientFundsError): - service.charge(amount=9999, currency="ZAR", account_id="acc_low") - - -def test_charge_empty_account_id_raises_value_error(service): - """An empty account_id string is rejected with a descriptive ValueError.""" - with pytest.raises(ValueError, match="account_id"): - service.charge(amount=100, currency="ZAR", account_id="") diff --git a/skills/test-unit-standards/evals/files/multi-violation-audit.py b/skills/test-unit-standards/evals/files/multi-violation-audit.py deleted file mode 100644 index 2e80486..0000000 --- a/skills/test-unit-standards/evals/files/multi-violation-audit.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Unit tests for NotificationService. - -VIOLATIONS — all rule categories present (used for multi-violation audit eval): - - ISOLATION setup_method opens a real Redis connection instead of using a stub - SCOPE test_build_message_template calls private _build_message method - test_retry_count_after_failure reads private _retry_count attribute - NAMING test_send / test_error / test_ok do not follow test___ - ASSERTIONS test_send asserts 'is not None' (weak) - test_ok asserts truthiness only - test_error has no assertion at all - FIXTURES redis_mock + email_client construction is copy-pasted in standalone tests - BOUNDARIES no test for empty recipients list - no test for None subject - no test for zero-length message body -""" -import redis -import pytest -from unittest.mock import MagicMock - -from notifications.notification_service import NotificationService - - -class TestNotificationService: - def setup_method(self): - # real Redis connection — not mocked; violates isolation - self.redis = redis.Redis(host="localhost", port=6379) - self.email_client = MagicMock() - self.service = NotificationService( - redis=self.redis, email_client=self.email_client - ) - - def test_send(self): - """Vague name — no condition or expected outcome.""" - result = self.service.send( - recipients=["a@example.com"], subject="Hello", body="World" - ) - assert result is not None # weak: specific return value not asserted - - def test_error(self): - """Name gives no information about the scenario being tested.""" - self.email_client.send.side_effect = ConnectionError("SMTP down") - # missing: no assertion — test body does not verify the outcome - - def test_ok(self): - """Overly generic pass/fail label.""" - result = self.service.send( - recipients=["b@example.com"], subject="Hi", body="There" - ) - assert result # weak: truthy check only - - def test_build_message_template(self): - """Accesses private _build_message method to verify template rendering.""" - msg = self.service._build_message(subject="Hi", body="Body") # private method — violates scope - assert "Hi" in msg - - def test_retry_count_after_failure(self): - """Reads private _retry_count attribute after a simulated gateway failure.""" - self.email_client.send.side_effect = ConnectionError() - try: - self.service.send(recipients=["c@example.com"], subject="Retry", body="Test") - except Exception: - pass - assert self.service._retry_count > 0 # private attribute — violates scope - - -# --- standalone tests with copy-pasted fixture setup --- - - -def test_send_to_multiple_recipients_calls_email_client_per_recipient(): - """Sending to two recipients results in two email-client calls.""" - email_client = MagicMock() # duplicated setup — should use a shared fixture - redis_mock = MagicMock() - service = NotificationService(redis=redis_mock, email_client=email_client) - - service.send(recipients=["x@e.com", "y@e.com"], subject="Hi", body="Body") - assert email_client.send.call_count == 2 - - -def test_queue_notification_stores_message_in_redis(): - """Queuing a notification pushes it to Redis.""" - email_client = MagicMock() # duplicated setup — should use a shared fixture - redis_mock = MagicMock() - service = NotificationService(redis=redis_mock, email_client=email_client) - - service.queue("z@e.com", subject="Queued", body="Later") - redis_mock.lpush.assert_called_once() - - -# Missing: test_send_empty_recipients_raises_value_error (boundary) -# Missing: test_send_none_subject_raises_value_error (boundary / null input) -# Missing: test_send_smtp_failure_raises_notification_error (failure path with assertion) diff --git a/skills/test-unit-standards/evals/files/shared-mutable-state.py b/skills/test-unit-standards/evals/files/shared-mutable-state.py deleted file mode 100644 index 43aadeb..0000000 --- a/skills/test-unit-standards/evals/files/shared-mutable-state.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Unit tests for CartService. - -VIOLATIONS — isolation (shared mutable state between tests): - TestCartServiceSequenced._processed is a class-level list mutated across tests. - test_refund_removes_order depends on test_checkout_records_order running first, - making the suite order-dependent and breaking independent-execution guarantees. -""" -import pytest -from unittest.mock import MagicMock - -from services.cart_service import CartService - - -class TestCartServiceSequenced: - _processed: list = [] # class-level mutable state — shared across all test instances - - @pytest.fixture(autouse=True) - def setup(self): - """Wire CartService to a payment stub.""" - self.payment = MagicMock() - self.service = CartService(payment=self.payment) - - def test_checkout_records_order(self): - """Checkout adds an entry to the shared _processed list.""" - self.service.checkout(cart_id="c1", total=150.0) - self.__class__._processed.append("c1") # mutates shared class-level state - assert "c1" in self.__class__._processed - - def test_refund_removes_order(self): - """Refund removes an entry — implicitly requires test_checkout_records_order to have run first.""" - self.__class__._processed.remove("c1") # reads state written by a previous test - assert "c1" not in self.__class__._processed - - def test_checkout_delegates_charge_to_payment_gateway(self): - """Checkout total is delegated to the payment gateway.""" - self.service.checkout(cart_id="c2", total=200.0) - self.payment.charge.assert_called_once_with(200.0) diff --git a/skills/test-unit-standards/evals/files/source-discount-calculator.py b/skills/test-unit-standards/evals/files/source-discount-calculator.py deleted file mode 100644 index 0be5cd4..0000000 --- a/skills/test-unit-standards/evals/files/source-discount-calculator.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Source module for DiscountCalculator — used by eval 7 (write-tests task). - -This is the production code under test. No test file exists yet; the eval asks -the model to write compliant unit tests for this class from scratch. -""" -from __future__ import annotations - - -class DiscountCalculator: - """Calculates a discounted price based on the customer's membership tier. - - Recognised tiers and their discounts: - "gold" → 20 % off - "silver" → 15 % off - "bronze" → 10 % off - None → 0 % off (no discount) - - Raises: - ValueError: if *price* is negative. - ValueError: if *member_tier* is not one of the recognised values or None. - """ - - DISCOUNTS: dict[str | None, float] = { - "gold": 0.20, - "silver": 0.15, - "bronze": 0.10, - None: 0.00, - } - - def calculate(self, price: float, member_tier: str | None) -> float: - """Return the discounted price rounded to two decimal places. - - Args: - price: The original price (must be >= 0). - member_tier: One of "gold", "silver", "bronze", or None. - - Returns: - The discounted price as a float. - """ - if price < 0: - raise ValueError(f"price must be non-negative, got {price}") - if member_tier not in self.DISCOUNTS: - raise ValueError(f"unknown tier: {member_tier!r}") - discount = self.DISCOUNTS[member_tier] - return round(price * (1 - discount), 2) From 30e19644272e57cd7c2a785bf4592c6d0b4507d9 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 17:22:38 +0200 Subject: [PATCH 07/12] Refactor test unit standards skill: update evals, fixture mappings, and coverage summaries - Revised evals in `test-unit-standards/evals/evals.json` to focus on reference guidance rather than file audits. - Updated fixture mappings in `fixture-map.md` to reflect new eval structure and purpose. - Enhanced coverage summaries to accurately represent the number of tests and categories. - Adjusted trigger evals in `trigger-eval.json` to align with the new focus on standards and conventions. - Consolidated notes and explanations to clarify the role of the standards skill in relation to test writing and auditing. --- skills/test-unit-review/evals/evals.json | 117 ++++++++ skills/test-unit-review/evals/fixture-map.md | 50 +++- skills/test-unit-standards/SKILL.md | 28 +- skills/test-unit-standards/evals/evals.json | 272 +++++------------- .../test-unit-standards/evals/fixture-map.md | 82 +++--- .../evals/trigger-eval.json | 160 +++++------ 6 files changed, 342 insertions(+), 367 deletions(-) diff --git a/skills/test-unit-review/evals/evals.json b/skills/test-unit-review/evals/evals.json index dd01434..dacdd3e 100644 --- a/skills/test-unit-review/evals/evals.json +++ b/skills/test-unit-review/evals/evals.json @@ -154,6 +154,123 @@ "Reviewer recommends moving to an integration suite or stubbing the producer", "Isolation rule is cited as the violated standard" ] + }, + { + "id": 11, + "category": "regression", + "prompt": "Review these ReportExporter unit tests — do they follow isolation standards? File: evals/files/real-io-violations.py", + "expected_output": "Review flags three isolation violations: (1) test_export_pdf_creates_file writes to the real filesystem (/tmp). (2) test_fetch_template_from_cdn makes a real outbound HTTP request. (3) TestReportExporterWithDB.setup_method opens a live PostgreSQL connection. Each should be replaced with a mock or stub at its I/O boundary.", + "files": [ + "evals/files/real-io-violations.py" + ], + "expectations": [ + "test_export_pdf_creates_file is flagged for real filesystem write", + "test_fetch_template_from_cdn is flagged for real outbound HTTP call", + "TestReportExporterWithDB.setup_method is flagged for real PostgreSQL connection", + "Reviewer recommends mocking os/pathlib, requests, and psycopg2 at their boundary", + "Isolation rule is cited as the violated standard" + ] + }, + { + "id": 12, + "category": "regression", + "prompt": "Check whether these TokenValidator tests follow our standards. Do any of them access things they shouldn't? File: evals/files/private-member-access.py", + "expected_output": "Review flags three scope violations: (1) test_internal_decode_extracts_sub_claim calls private method _decode directly. (2) test_signature_cache_is_populated reads private attribute _cache. (3) test_algorithm_whitelist_contains_hs256 reads private attribute _allowed_algorithms. All three should be rewritten to assert behaviour through the public validate() interface.", + "files": [ + "evals/files/private-member-access.py" + ], + "expectations": [ + "test_internal_decode_extracts_sub_claim is flagged for calling private method _decode", + "test_signature_cache_is_populated is flagged for reading private attribute _cache", + "test_algorithm_whitelist_contains_hs256 is flagged for reading private attribute _allowed_algorithms", + "Reviewer explains that behaviour should be verified through the public interface", + "Scope rule is cited as the violated standard" + ] + }, + { + "id": 13, + "category": "regression", + "prompt": "Are these OrderProcessor tests complete? Do the assertions and coverage look right to you? File: evals/files/weak-assertions-no-failure.py", + "expected_output": "Review flags assertion completeness issues: (1) test_process_order_returns_something uses 'is not None' — too weak, should assert the specific return value. (2) test_process_order_completes asserts truthy only. (3) test_reserve_stock_called asserts '.called is not None' which is always True and proves nothing. Also flags missing failure-path coverage (out-of-stock, unknown SKU) and missing boundary tests (quantity=0, quantity=1).", + "files": [ + "evals/files/weak-assertions-no-failure.py" + ], + "expectations": [ + "test_process_order_returns_something is flagged for 'is not None' weak assertion", + "test_process_order_completes is flagged for truthy-check-only assertion", + "test_reserve_stock_called is flagged for '.called is not None' which is always True", + "Missing failure-path tests (out-of-stock or unknown SKU) are flagged", + "Missing boundary tests (quantity=0) are flagged", + "Assertion completeness and boundary coverage rules are cited" + ] + }, + { + "id": 14, + "category": "regression", + "prompt": "Review these UserProfileService tests — I think there may be some duplication issues. File: evals/files/copy-paste-setup.py", + "expected_output": "Review flags that the (repo, cache, service) setup block is copy-pasted identically across all four tests. Recommends extracting it into a shared @pytest.fixture (or conftest.py). Also notes that fixtures should be documented with their purpose and any side effects.", + "files": [ + "evals/files/copy-paste-setup.py" + ], + "expectations": [ + "Identical (repo, cache, service) setup in each test is flagged as copy-pasted", + "Reviewer recommends extracting shared setup into a @pytest.fixture", + "Reviewer mentions placing shared fixtures in conftest.py or a test class setup", + "Reviewer notes that fixtures should be documented with purpose and side effects", + "Fixture management rule is cited as the violated standard" + ] + }, + { + "id": 15, + "category": "regression", + "prompt": "Audit these DiscountCalculator tests for naming conventions and test coverage gaps. File: evals/files/bad-naming-missing-boundaries.py", + "expected_output": "Review flags four naming violations: (1) test_1 has no descriptive name. (2) test_discount_works is vague — no condition or expected outcome. (3) test_it_returns_price is missing the expected-outcome segment. (4) test_big_number does not follow test___ format. Also flags missing boundary test for price=0, missing failure test for negative price, and missing failure test for unknown tier string.", + "files": [ + "evals/files/bad-naming-missing-boundaries.py" + ], + "expectations": [ + "test_1 is flagged as having no descriptive name", + "test_discount_works is flagged as vague — missing condition and expected outcome", + "test_it_returns_price is flagged for missing the expected-outcome segment", + "test_big_number is flagged for not following test___", + "Missing boundary test for price=0 is flagged", + "Missing failure test for negative price is flagged", + "Missing failure test for unknown member_tier string is flagged", + "Naming convention and boundary coverage rules are cited" + ] + }, + { + "id": 16, + "category": "regression", + "prompt": "Check these InvoiceGenerator tests for structural issues — docstrings, comments, that kind of thing. File: evals/files/missing-docstrings-stray-comments.py", + "expected_output": "Review flags: (1) test_generate_valid_invoice has no docstring — scenario not stated. (2) test_generate_zero_amount and test_generate_missing_client_raises use inline comments instead of docstrings. (3) A stray explanatory comment appears between test functions (outside any test method) above test_generate_negative_amount. (4) The generator fixture has no docstring documenting its purpose or side effects.", + "files": [ + "evals/files/missing-docstrings-stray-comments.py" + ], + "expectations": [ + "test_generate_valid_invoice is flagged for missing docstring", + "test_generate_zero_amount is flagged for using an inline comment instead of a docstring", + "The stray comment between test functions (outside a test method) is flagged", + "test_generate_missing_client_raises is flagged for inline comment instead of docstring", + "The generator fixture is flagged for missing purpose/side-effects docstring", + "Naming/structure rule (docstring required, no stray comments outside test methods) is cited" + ] + }, + { + "id": 17, + "category": "edge-case", + "prompt": "Can you review these PriceValidator tests for framework idiom issues? File: evals/files/framework-idiom-misuse.py", + "expected_output": "Review explains the pytest.raises match parameter is a regex, not a literal substring. test_negative_price_raises_error uses match=\"Price must be non-negative\", which works only because it contains no regex metacharacters; a literal string with metacharacters (e.g. an unescaped '.') would silently fail to match. test_unknown_currency_raises_error is correct. The reviewer should clarify the idiom (match must be a valid regex) rather than flag a broken test — this is a latent edge case, not a current failure.", + "files": [ + "evals/files/framework-idiom-misuse.py" + ], + "expectations": [ + "test_negative_price_raises_error match parameter is reviewed for regex safety", + "Reviewer explains that match is a regex, not a literal string", + "Reviewer notes test_unknown_currency_raises_error is correct", + "Reviewer clarifies the idiom (match parameter must be a valid regex)", + "Exception assertion rule is cited" + ] } ] } diff --git a/skills/test-unit-review/evals/fixture-map.md b/skills/test-unit-review/evals/fixture-map.md index dc62777..48afe07 100644 --- a/skills/test-unit-review/evals/fixture-map.md +++ b/skills/test-unit-review/evals/fixture-map.md @@ -2,18 +2,25 @@ Links each eval test case to its fixture file(s). -| Test ID | Category | Fixture | -|---------|----------------|---------| -| 1 | happy-path | evals/files/compliant-auth-service-tests.py | +| Test ID | Category | Fixture | +|---------|-----------------|---------| +| 1 | happy-path | evals/files/compliant-auth-service-tests.py | | 2 | multi-violation | evals/files/multi-violation-report-tests.py | -| 3 | regression | evals/files/missing-coverage-subscription-tests.py | -| 4 | negative | *(no file — write request routed away)* | -| 5 | negative | *(no file — runtime debugging question)* | -| 6 | paraphrase | *(no file — informal phrasing)* | -| 7 | edge-case | *(no file — pytest.approx guidance)* | -| 8 | output-format | *(no file — format documentation)* | -| 9 | regression | evals/files/flaky-shared-mock-state.py | -| 10 | regression | evals/files/integration-test-in-unit-suite.py | +| 3 | regression | evals/files/missing-coverage-subscription-tests.py | +| 4 | negative | *(no file — write request routed away)* | +| 5 | negative | *(no file — runtime debugging question)* | +| 6 | paraphrase | *(no file — informal phrasing)* | +| 7 | edge-case | *(no file — pytest.approx guidance)* | +| 8 | output-format | *(no file — format documentation)* | +| 9 | regression | evals/files/flaky-shared-mock-state.py | +| 10 | regression | evals/files/integration-test-in-unit-suite.py | +| 11 | regression | evals/files/real-io-violations.py | +| 12 | regression | evals/files/private-member-access.py | +| 13 | regression | evals/files/weak-assertions-no-failure.py | +| 14 | regression | evals/files/copy-paste-setup.py | +| 15 | regression | evals/files/bad-naming-missing-boundaries.py | +| 16 | regression | evals/files/missing-docstrings-stray-comments.py | +| 17 | edge-case | evals/files/framework-idiom-misuse.py | ## Fixture → Rule mapping @@ -24,17 +31,24 @@ Links each eval test case to its fixture file(s). | missing-coverage-subscription-tests.py | Coverage completeness and weak assertions | | flaky-shared-mock-state.py | Isolation — shared mutable state, order-dependent tests | | integration-test-in-unit-suite.py | Isolation — real I/O (Kafka), integration test in unit suite | +| real-io-violations.py | Isolation — real I/O (filesystem, HTTP, DB) | +| private-member-access.py | Scope — private member access | +| weak-assertions-no-failure.py | Assertions — completeness, weak assertions, missing failure/boundary coverage | +| copy-paste-setup.py | Fixtures — copy-pasted setup, missing shared fixture | +| bad-naming-missing-boundaries.py | Naming conventions and boundary coverage | +| missing-docstrings-stray-comments.py | Naming/structure — docstrings required, no stray comments | +| framework-idiom-misuse.py | Assertions — pytest.raises idiom (regex match parameter) | ## Coverage summary - happy-path: 1 - multi-violation: 1 -- regression: 3 +- regression: 12 - negative: 2 - paraphrase: 1 -- edge-case: 1 +- edge-case: 2 - output-format: 1 -- **total: 10** +- **total: 20** ## Trigger eval coverage @@ -43,3 +57,11 @@ Links each eval test case to its fixture file(s). | should_trigger = true | 11 | | should_trigger = false | 8 | | **total** | **19** | + +## Notes + +Single-category fixtures 11–17 (real I/O, private member access, weak assertions, +copy-paste setup, bad naming, missing docstrings, framework idiom misuse) were +migrated from `test-unit-standards/evals/files/` when that skill was narrowed to a +pure reference layer. They give the review skill dedicated per-category regression +coverage alongside the existing multi-violation and compliant fixtures. diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index 2cf62af..744cf8e 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -1,21 +1,19 @@ --- name: test-unit-standards description: > - Reference guide and checklist for unit test standards — test isolation, scope, naming, - assertions, coverage, and fixtures. Use this skill to ask questions about test conventions, - structural rules, best practices, and to audit test files against standards. Activate when - the user asks about standards, conventions, or wants to understand what makes a good test. - Triggers on: "write unit tests", "add tests for", "audit this test file", "review these tests", - "unit test standards", "test isolation rules", "test naming convention", "assertion standards", - "fixture reuse rules", "private member in tests", "failure-path coverage", "are these tests - good enough", "test conventions", "what are the rules for", "how should I structure my tests", - "best practices for unit tests", "should I be mocking", "do these tests follow our standards". - Does NOT trigger for: writing specific test cases or test code (use test-unit-write for "write a test for - the edge case where..."), choosing test doubles — spy vs stub vs mock (use test-mocking-patterns), - managing test data strategies (use test-data-management), debugging a failing test at runtime, - running tests, refactoring test code for clarity/duplication, integration test planning, general - PR review (use pr-review), or non-test tasks. - Pairs with test-unit-write and test-unit-review — standards is the reference layer they consult. + Reference checklist for unit test standards — isolation, scope, naming, + assertions, coverage, and fixtures. Ask this skill about test conventions, structural + rules, and best practices. This is the reference layer that test-unit-write and + test-unit-review consult; it does not write or audit tests itself. + Triggers on: "unit test standards", "test isolation rules", "test naming convention", + "assertion standards", "fixture reuse rules", "private member in tests", "test + conventions", "what are the rules for", "how should I structure my tests", "best + practices for unit tests". + Does NOT trigger for: writing new tests (use test-unit-write), reviewing or auditing + existing tests (use test-unit-review), choosing test doubles (use test-mocking-patterns), + test data strategies (use test-data-management), debugging or running tests, + integration test planning, general PR review (use pr-review), or non-test tasks. + Pairs with test-unit-write and test-unit-review — standards is their reference layer. license: Proprietary compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/evals/evals.json b/skills/test-unit-standards/evals/evals.json index d71f63b..a316c28 100644 --- a/skills/test-unit-standards/evals/evals.json +++ b/skills/test-unit-standards/evals/evals.json @@ -3,123 +3,94 @@ "evals": [ { "id": 1, - "category": "happy-path", - "prompt": "Can you review these unit tests for our PaymentService and confirm they follow our unit testing standards? File: evals/files/compliant-payment-tests.py", - "expected_output": "Review confirms the tests comply with all unit testing standards. All I/O is stubbed via fixtures (no real network, DB, or filesystem). Tests use only the public interface. Names follow test___. Assertions are specific. Both success and failure paths are covered. Boundary value (0.01) is tested. Shared fixtures are defined and documented. No violations found.", - "files": [ - "evals/files/compliant-payment-tests.py" - ], + "category": "reference", + "prompt": "What are our unit test standards? Give me the checklist before I start writing tests.", + "expected_output": "Agent summarises the six standards categories from the skill: isolation (no real I/O, mock all boundaries, independent tests), scope (one unit, public interface only, no private access), naming/structure (scenario-stating names, one behavior per test, docstring required), assertions (return values/exceptions/logs, success + ≥1 failure path, boundaries and null/empty), and fixtures (idiomatic location, reuse not copy-paste, documented). It presents these as reference guidance, not as an audit of any specific file.", + "files": [], "expectations": [ - "No violations are reported", - "Reviewer confirms all I/O is properly stubbed (isolation maintained)", - "Reviewer confirms tests use the public interface only", - "Reviewer confirms naming convention test___ is followed", - "Reviewer confirms success and failure paths are both covered", - "Reviewer acknowledges shared fixtures are defined and documented", - "Reviewer does not fabricate problems where none exist" + "All six categories are described: isolation, scope, naming/structure, assertions, coverage, fixtures", + "Response is reference guidance, not a file audit", + "No specific test file is assumed or reviewed", + "Does not fabricate violations or apply Blocker/Important/Nit findings to non-existent code" ] }, { "id": 2, - "category": "regression", - "prompt": "Please review these unit tests for the ReportExporter. Do they follow isolation standards? File: evals/files/real-io-violations.py", - "expected_output": "Review flags three isolation violations: (1) test_export_pdf_creates_file writes to the real filesystem (/tmp). (2) test_fetch_template_from_cdn makes a real outbound HTTP request. (3) TestReportExporterWithDB.setup_method opens a live PostgreSQL connection. Each should be replaced with a mock or stub at its I/O boundary.", - "files": [ - "evals/files/real-io-violations.py" - ], + "category": "reference", + "prompt": "What's the rule on accessing private members in unit tests?", + "expected_output": "Agent explains the scope rule: tests should verify behaviour through the public interface only and must not call private methods or read private attributes directly. It explains why — private members are implementation details that change without affecting the contract, so tests coupled to them are brittle. It points to the language reference for the specific convention. No file is reviewed.", + "files": [], "expectations": [ - "test_export_pdf_creates_file is flagged for real filesystem write", - "test_fetch_template_from_cdn is flagged for real outbound HTTP call", - "TestReportExporterWithDB.setup_method is flagged for real PostgreSQL connection", - "Reviewer recommends mocking os/pathlib, requests, and psycopg2 at their boundary", - "Isolation rule is cited as the violated standard" + "States that tests should use the public interface only", + "States that private methods/attributes should not be accessed directly", + "Explains the rationale (implementation details, brittleness)", + "Does not audit a specific file" ] }, { "id": 3, - "category": "regression", - "prompt": "Check whether these TokenValidator tests follow unit test standards. Do any of them access things they shouldn't? File: evals/files/private-member-access.py", - "expected_output": "Review flags three scope violations: (1) test_internal_decode_extracts_sub_claim calls private method _decode directly. (2) test_signature_cache_is_populated reads private attribute _cache. (3) test_algorithm_whitelist_contains_hs256 reads private attribute _allowed_algorithms. All three should be rewritten to assert behaviour through the public validate() interface.", - "files": [ - "evals/files/private-member-access.py" - ], + "category": "reference", + "prompt": "What naming convention should our unit test functions follow, and how many assertions per test?", + "expected_output": "Agent states test names must state the scenario following the language-reference format (e.g. test___), and that each test should cover one logical behaviour — multiple asserts are acceptable only when they collectively confirm a single outcome. It also notes a docstring stating the scenario is required and loose comments between methods are discouraged. Reference guidance only.", + "files": [], "expectations": [ - "test_internal_decode_extracts_sub_claim is flagged for calling private method _decode", - "test_signature_cache_is_populated is flagged for reading private attribute _cache", - "test_algorithm_whitelist_contains_hs256 is flagged for reading private attribute _allowed_algorithms", - "Reviewer explains that behaviour should be verified through the public interface", - "Scope rule is cited as the violated standard" + "Names must state the scenario (test___ or language-reference format)", + "One logical behaviour per test; multiple asserts OK only if confirming a single outcome", + "Docstring stating the scenario is required", + "No specific file is reviewed" ] }, { "id": 4, - "category": "regression", - "prompt": "Are these OrderProcessor tests complete? Do the assertions and coverage look right to you? File: evals/files/weak-assertions-no-failure.py", - "expected_output": "Review flags assertion completeness issues: (1) test_process_order_returns_something uses 'is not None' — too weak, should assert the specific return value. (2) test_process_order_completes asserts truthy only. (3) test_reserve_stock_called asserts '.called is not None' which is always True and proves nothing. Also flags missing failure-path coverage (out-of-stock, unknown SKU) and missing boundary tests (quantity=0, quantity=1).", - "files": [ - "evals/files/weak-assertions-no-failure.py" - ], + "category": "reference", + "prompt": "What are the rules for failure-path and boundary coverage in our unit tests?", + "expected_output": "Agent explains the assertion/coverage standard: every behaviour needs success plus at least one failure path, plus boundary values and empty/null inputs. Assertions must be specific (return values, exceptions, contract-sensitive log messages, exit codes) rather than generic truthy or is-not-None checks. Reference guidance only — no file audit.", + "files": [], "expectations": [ - "test_process_order_returns_something is flagged for 'is not None' weak assertion", - "test_process_order_completes is flagged for truthy-check-only assertion", - "test_reserve_stock_called is flagged for '.called is not None' which is always True", - "Missing failure-path tests (out-of-stock or unknown SKU) are flagged", - "Missing boundary tests (quantity=0) are flagged", - "Assertion completeness and boundary coverage rules are cited" + "Requires success plus at least one failure path per behaviour", + "Requires boundary values and empty/null input coverage", + "Requires specific assertions over generic truthy/is-not-None checks", + "Does not audit a specific file" ] }, { "id": 5, - "category": "regression", - "prompt": "Review these UserProfileService tests — I think there may be some duplication issues. File: evals/files/copy-paste-setup.py", - "expected_output": "Review flags that the (repo, cache, service) setup block is copy-pasted identically across all four tests. Recommends extracting it into a shared @pytest.fixture (or conftest.py). Also notes that fixtures should be documented with their purpose and any side effects.", - "files": [ - "evals/files/copy-paste-setup.py" - ], + "category": "reference", + "prompt": "Where should pytest fixtures live and how should they be documented?", + "expected_output": "Agent states fixtures belong in a framework-idiomatic location (for pytest, conftest.py), must be reused across tests rather than copy-pasted into each test, and must be documented with their purpose and any side effects. Reference guidance only.", + "files": [], "expectations": [ - "Identical (repo, cache, service) setup in each test is flagged as copy-pasted", - "Reviewer recommends extracting shared setup into a @pytest.fixture", - "Reviewer mentions placing shared fixtures in conftest.py or a test class setup", - "Reviewer notes that fixtures should be documented with purpose and side effects", - "Fixture management rule is cited as the violated standard" + "Fixtures placed in idiomatic location (pytest: conftest.py)", + "Reuse across tests, no copy-pasted setup", + "Document purpose and side effects", + "No specific file is reviewed" ] }, { "id": 6, - "category": "regression", - "prompt": "Audit these DiscountCalculator tests for naming conventions and test coverage gaps. File: evals/files/bad-naming-missing-boundaries.py", - "expected_output": "Review flags four naming violations: (1) test_1 has no descriptive name. (2) test_discount_works is vague — no condition or expected outcome. (3) test_it_returns_price is missing the expected-outcome segment. (4) test_big_number does not follow test___ format. Also flags missing boundary test for price=0, missing failure test for negative price, and missing failure test for unknown tier string.", - "files": [ - "evals/files/bad-naming-missing-boundaries.py" - ], + "category": "edge-case", + "prompt": "A test uses pytest's tmp_path fixture to write a file and then reads it back. Is using a real file system in a unit test a violation of our standards?", + "expected_output": "Yes, but it is a borderline case worth explaining. The standard says no real I/O in unit tests. tmp_path is an in-process temporary directory that does touch the real file system, so strictly it violates the isolation rule. However, if the function under test explicitly takes a file path and its core logic is the file handling, the choice is: (1) refactor to accept a file-like object (preferred — allows full in-memory testing); (2) accept tmp_path as a pragmatic exception only when refactoring is disproportionate. Flag it as Important with the refactoring suggestion as the preferred fix.", + "files": [], "expectations": [ - "test_1 is flagged as having no descriptive name", - "test_discount_works is flagged as vague — missing condition and expected outcome", - "test_it_returns_price is flagged for missing the expected-outcome segment", - "test_big_number is flagged for not following test___", - "Missing boundary test for price=0 is flagged", - "Missing failure test for negative price is flagged", - "Missing failure test for unknown member_tier string is flagged", - "Naming convention and boundary coverage rules are cited" + "Identifies tmp_path as real file system I/O — a standards violation", + "Notes the borderline nature of the case", + "Recommends refactoring to accept a file-like object as the preferred fix", + "Acknowledges tmp_path as a pragmatic exception only when refactoring is disproportionate", + "Flags as Important, not Nit" ] }, { "id": 7, - "category": "happy-path", - "prompt": "Write unit tests for the DiscountCalculator class in evals/files/source-discount-calculator.py. Make sure they follow our unit test standards.", - "expected_output": "Output produces compliant unit tests: names follow test___; all valid tiers (gold, silver, bronze, None) have a success test; failure paths for negative price and unknown tier are tested with specific exception assertions; boundary value price=0 is tested; assertions are specific (assertEqual / assert ==); no private members are accessed; no real I/O is introduced.", - "files": [ - "evals/files/source-discount-calculator.py" - ], + "category": "edge-case", + "prompt": "Is it OK for a single unit test to have several assert statements, or should it be one assert per test?", + "expected_output": "Agent explains the one-logical-behaviour-per-test rule: multiple assert statements are acceptable when they collectively confirm a single outcome (e.g. asserting several fields of one returned object), but a test asserting several unrelated behaviours should be split so a failure points to a single invariant. Reference guidance, not a file audit.", + "files": [], "expectations": [ - "Test names follow test___ convention", - "All four tiers (gold, silver, bronze, None) are tested for correct discount", - "Failure path for negative price is tested with a specific exception assertion", - "Failure path for unknown tier string is tested with a specific exception assertion", - "Boundary value price=0 is covered", - "Assertions are specific (exact value comparison, not 'is not None' or truthy)", - "No private members are accessed", - "No real I/O or external dependencies are introduced" + "States one logical behaviour per test, not strictly one assert", + "Multiple asserts OK when confirming a single outcome", + "Unrelated behaviours should be split into separate tests", + "Explains the rationale (failure localisation)" ] }, { @@ -137,76 +108,6 @@ }, { "id": 9, - "category": "paraphrase", - "prompt": "Are these tests up to scratch? I want to make sure they're not calling anything outside the service under test. evals/files/real-io-violations.py", - "expected_output": "Same core behaviour as eval 2: all three real-I/O violations are identified (real filesystem, real HTTP call, real DB connection) with remediation guidance. Phrasing of the request differs but the findings and rule citations should be equivalent.", - "files": [ - "evals/files/real-io-violations.py" - ], - "expectations": [ - "Real filesystem I/O (test_export_pdf_creates_file) is flagged", - "Real HTTP call (test_fetch_template_from_cdn) is flagged", - "Real PostgreSQL connection (TestReportExporterWithDB.setup_method) is flagged", - "Remediation guidance (mock os/requests/psycopg2) is provided", - "Response substance matches eval 2 despite different prompt phrasing" - ] - }, - { - "id": 10, - "category": "multi-violation", - "prompt": "Full audit of these NotificationService unit tests — tell me everything that's wrong. File: evals/files/multi-violation-audit.py", - "expected_output": "Review identifies violations across all six rule categories: (1) ISOLATION — setup_method uses a real Redis connection. (2) SCOPE — _build_message (private method) and _retry_count (private attribute) are accessed directly. (3) NAMING — test_send, test_error, test_ok do not follow test___. (4) ASSERTIONS — test_send uses 'is not None'; test_ok uses a truthy check; test_error has no assertion. (5) FIXTURES — redis_mock + email_client setup is copy-pasted in the standalone tests instead of using a shared fixture. (6) BOUNDARIES — no test for empty recipients, None subject, or zero-length body.", - "files": [ - "evals/files/multi-violation-audit.py" - ], - "expectations": [ - "setup_method real Redis connection is flagged as an isolation violation", - "_build_message call in test_build_message_template is flagged as private method access", - "_retry_count read in test_retry_count_after_failure is flagged as private attribute access", - "test_send, test_error, test_ok are flagged for not following the naming convention", - "test_send 'is not None' assertion is flagged as weak", - "test_ok truthy-only assertion is flagged as weak", - "test_error has no assertion — flagged as an incomplete test", - "Copy-pasted redis_mock + email_client setup in standalone tests is flagged", - "Missing boundary tests (empty recipients, None subject) are flagged", - "All six rule categories are applied: isolation, scope, naming, assertions, fixtures, boundaries" - ] - }, - { - "id": 11, - "category": "regression", - "prompt": "Are there any isolation issues in these CartService tests? File: evals/files/shared-mutable-state.py", - "expected_output": "Review flags that TestCartServiceSequenced._processed is a class-level mutable list shared across all test instances. test_refund_removes_order depends on test_checkout_records_order having run first, making the suite order-dependent. Each test must be fully independent and runnable in any order; shared mutable state must be reset per-test (e.g. in a fixture with function scope).", - "files": [ - "evals/files/shared-mutable-state.py" - ], - "expectations": [ - "_processed class-level mutable list is flagged as shared mutable state", - "test_refund_removes_order is flagged as order-dependent on test_checkout_records_order", - "Reviewer explains the tests are not independently runnable", - "Reviewer recommends resetting state per-test via a function-scoped fixture or instance attribute", - "Isolation rule (no shared mutable state, each test runnable in isolation) is cited" - ] - }, - { - "id": 12, - "category": "regression", - "prompt": "Check these InvoiceGenerator tests for structural issues — docstrings, comments, that kind of thing. File: evals/files/missing-docstrings-stray-comments.py", - "expected_output": "Review flags: (1) test_generate_valid_invoice has no docstring — scenario not stated. (2) test_generate_zero_amount and test_generate_missing_client_raises use inline comments instead of docstrings. (3) A stray explanatory comment appears between test functions (outside any test method) above test_generate_negative_amount. (4) The generator fixture has no docstring documenting its purpose or side effects.", - "files": [ - "evals/files/missing-docstrings-stray-comments.py" - ], - "expectations": [ - "test_generate_valid_invoice is flagged for missing docstring", - "test_generate_zero_amount is flagged for using an inline comment instead of a docstring", - "The stray comment between test functions (outside a test method) is flagged", - "test_generate_missing_client_raises is flagged for inline comment instead of docstring", - "The generator fixture is flagged for missing purpose/side-effects docstring", - "Naming/structure rule (docstring required, no stray comments outside test methods) is cited" - ] - }, - { - "id": 13, "category": "negative", "prompt": "The calculate() method in evals/files/mixed-source-and-tests.py uses a chain of if/elif blocks. Can you refactor it to use a cleaner lookup structure? The test file is there for reference — don't change the tests.", "expected_output": "Assistant refactors the calculate() method (e.g. replacing the if/elif weight tiers with a sorted lookup table). It does NOT apply unit test standards to the test portion of the file, flag test names, assert coverage gaps, or cite isolation rules — because the request is a source-code refactor, not a test review.", @@ -222,60 +123,15 @@ ] }, { - "id": 14, - "category": "paraphrase", - "prompt": "Are these tests following our conventions? They were written by a new team member.", - "expected_output": "Agent applies the full six-category unit test standards checklist: isolation (no real I/O), scope (no private member access), naming (test___), assertions (specific values vs truthy checks), coverage (failure paths, boundaries, edge cases), fixtures (docstrings, no shared mutable state). Groups findings as Blocker / Important / Nit. Confirms compliance explicitly if all categories pass. The 'new team member' framing does not change the standard applied.", - "files": [], - "expectations": [ - "Identifies 'following our conventions' as a unit test standards review", - "Applies all six standards categories", - "Groups findings as Blocker / Important / Nit", - "Does not soften the standard because the author is new" - ] - }, - { - "id": 15, - "category": "edge-case", - "prompt": "A test uses pytest's tmp_path fixture to write a file and then reads it back. Is using a real file system in a unit test a violation?", - "expected_output": "Yes, but it is a borderline case worth explaining. The standard says no real I/O in unit tests. tmp_path is an in-process temporary directory that does touch the real file system, so strictly it is a violation of the isolation rule. However, if the function under test explicitly takes a file path and its core logic is the file handling, the choice is: (1) refactor to accept a file-like object (preferred — allows full in-memory testing); (2) accept tmp_path as a pragmatic exception only when refactoring is disproportionate. Flag it as Important with the refactoring suggestion as the preferred fix.", - "files": [], - "expectations": [ - "Identifies tmp_path as real file system I/O — a standards violation", - "Notes the borderline nature of the case", - "Recommends refactoring to accept a file-like object as the preferred fix", - "Acknowledges tmp_path as a pragmatic exception only when refactoring is disproportionate", - "Flags as Important, not Nit" - ] - }, - { - "id": 16, - "category": "output-format", - "prompt": "Show me what the skill output looks like for a test file with one isolation violation and one naming violation.", - "expected_output": "The response groups findings under '### Blocker', '### Important', '### Nit' headers. Each violation is a bulleted item with: (1) test name in backticks, (2) standards category (e.g. 'Isolation', 'Naming'), (3) the specific rule violated, (4) a concrete fix suggestion. Code snippets showing the fix appear in fenced code blocks. A compliance confirmation is stated explicitly when no Blocker or Important findings exist.", + "id": 10, + "category": "negative", + "prompt": "Can you review these unit tests and tell me what's wrong with them? evals/files/some-tests.py", + "expected_output": "This is a review/audit request for a specific file, which routes to test-unit-review — not the standards reference layer. The standards skill provides the rules that the review skill applies; it does not itself perform file audits. If handled here, it should defer to test-unit-review for the audit.", "files": [], "expectations": [ - "Blocker / Important / Nit H3 sections", - "Bulleted items with test name in backticks", - "Standards category name in each finding", - "Fix suggestions in fenced code blocks where applicable", - "Explicit compliance confirmation when no Blocker/Important findings" - ] - }, - { - "id": 17, - "category": "regression", - "prompt": "Can you review these PriceValidator tests for framework idiom issues? File: evals/files/framework-idiom-misuse.py", - "expected_output": "Review flags a pytest idiom violation: test_negative_price_raises_error uses pytest.raises(ValueError, match=\"Price must be non-negative\") where match is a regex parameter, not a literal substring. The match parameter expects a compiled regex or regex string; a literal match parameter that is not a valid regex will not work correctly. Correct usage: match=r'Price must be' or match='Price must' (a valid regex substring). This is not a broken test (the other test shows correct usage), but a potential edge case where exact strings as match parameters can fail if they contain regex metacharacters.", - "files": [ - "evals/files/framework-idiom-misuse.py" - ], - "expectations": [ - "test_negative_price_raises_error match parameter is reviewed for regex safety", - "Reviewer explains that match is a regex, not a literal string", - "Reviewer notes test_unknown_currency_raises_error is correct", - "Reviewer clarifies the idiom (match parameter must be a valid regex)", - "Exception assertion rule is cited" + "Skill does not perform a file-by-file audit itself", + "Skill defers reviewing/auditing an existing file to test-unit-review", + "Reference layer boundary is respected (standards informs, review audits)" ] } ] diff --git a/skills/test-unit-standards/evals/fixture-map.md b/skills/test-unit-standards/evals/fixture-map.md index 5d61124..cb8fbbf 100644 --- a/skills/test-unit-standards/evals/fixture-map.md +++ b/skills/test-unit-standards/evals/fixture-map.md @@ -2,58 +2,48 @@ Links each eval test case to its fixture file(s). -| Test ID | Category | Fixture | -|---|---|---| -| 1 | happy-path | evals/files/compliant-payment-tests.py | -| 2 | regression | evals/files/real-io-violations.py | -| 3 | regression | evals/files/private-member-access.py | -| 4 | regression | evals/files/weak-assertions-no-failure.py | -| 5 | regression | evals/files/copy-paste-setup.py | -| 6 | regression | evals/files/bad-naming-missing-boundaries.py | -| 7 | happy-path | evals/files/source-discount-calculator.py | -| 8 | negative | *(no file — docstring task)* | -| 9 | paraphrase | evals/files/real-io-violations.py | -| 10 | multi-violation | evals/files/multi-violation-audit.py | -| 11 | regression | evals/files/shared-mutable-state.py | -| 12 | regression | evals/files/missing-docstrings-stray-comments.py | -| 13 | negative | evals/files/mixed-source-and-tests.py | -| 14 | paraphrase | *(no file — informal phrasing)* | -| 15 | edge-case | *(no file — tmp_path guidance)* | -| 16 | output-format | *(no file — format documentation)* | -| 17 | regression | evals/files/framework-idiom-misuse.py | - -## Fixture → Rule mapping - -| Fixture file | Primary rule(s) exercised | +`test-unit-standards` is the reference layer: it answers questions about test +conventions and rules. It does not write tests (see `test-unit-write`) or audit +specific test files (see `test-unit-review`), so most evals are file-free Q&A. + +| Test ID | Category | Fixture | +|---------|------------|---------| +| 1 | reference | *(no file — standards checklist Q&A)* | +| 2 | reference | *(no file — private member rule Q&A)* | +| 3 | reference | *(no file — naming + asserts-per-test Q&A)* | +| 4 | reference | *(no file — failure-path/boundary rule Q&A)* | +| 5 | reference | *(no file — fixture placement/docs Q&A)* | +| 6 | edge-case | *(no file — tmp_path guidance)* | +| 7 | edge-case | *(no file — multiple-asserts guidance)* | +| 8 | negative | *(no file — docstring task)* | +| 9 | negative | evals/files/mixed-source-and-tests.py | +| 10 | negative | *(no file — review request routed to test-unit-review)* | + +## Fixture → purpose mapping + +| Fixture file | Purpose | |---|---| -| compliant-payment-tests.py | All rules — no violations (reference implementation) | -| real-io-violations.py | Isolation — real I/O (filesystem, HTTP, DB) | -| private-member-access.py | Scope — private member access | -| weak-assertions-no-failure.py | Assertion completeness, boundary values | -| copy-paste-setup.py | Fixture management | -| bad-naming-missing-boundaries.py | Naming conventions, boundary values | -| source-discount-calculator.py | All rules — write-from-scratch task | -| shared-mutable-state.py | Isolation — shared mutable state between tests | -| missing-docstrings-stray-comments.py | Naming/structure — docstrings required, no stray comments | -| mixed-source-and-tests.py | Scope note — non-test task must not trigger rule enforcement | -| multi-violation-audit.py | All rules — isolation, scope, naming, assertions, fixtures, boundaries | -| framework-idiom-misuse.py | Assertions — pytest.raises idiom (regex match parameter) | +| mixed-source-and-tests.py | Scope note — non-test (source refactor) task must not trigger standards enforcement | ## Coverage summary -- happy-path: 2 -- regression: 8 -- negative: 2 -- paraphrase: 2 -- multi-violation: 1 -- edge-case: 1 -- output-format: 1 -- **total: 17** +- reference: 5 +- edge-case: 2 +- negative: 3 +- **total: 10** ## Trigger eval coverage | Direction | Count | |---|---| -| should_trigger = true | 12 | -| should_trigger = false | 18 | -| **total** | **30** | +| should_trigger = true | 10 | +| should_trigger = false | 16 | +| **total** | **26** | + +## Notes + +Behavioral review/audit fixtures (private member access, naming, weak assertions, +copy-paste fixtures, real I/O, missing docstrings, framework idiom misuse) were +migrated to `test-unit-review/evals/files/` when this skill was narrowed to a pure +reference layer. The review skill owns file-audit behaviour and those fixtures back +its single-category regression evals. diff --git a/skills/test-unit-standards/evals/trigger-eval.json b/skills/test-unit-standards/evals/trigger-eval.json index 19ddb37..95bbc6f 100644 --- a/skills/test-unit-standards/evals/trigger-eval.json +++ b/skills/test-unit-standards/evals/trigger-eval.json @@ -1,166 +1,158 @@ [ { - "id": "t01-write-unit-tests-explicit", - "query": "Write unit tests for the PaymentService class.", + "id": "t01-what-are-our-standards", + "query": "What are our unit test standards?", "should_trigger": true, - "reason": "Explicit test-writing request — core trigger for the skill." + "reason": "Direct request for the unit test standards reference — primary trigger." }, - { - "id": "t03-audit-test-standards", - "query": "Audit this test file against our unit test standards.", + "id": "t02-isolation-rule", + "query": "What's the rule for test isolation in unit tests?", "should_trigger": true, - "reason": "Direct audit request using 'unit test standards' — primary trigger phrase." + "reason": "Abstract convention question about the isolation rule — reference layer territory." }, { - "id": "t04-test-isolation-check", - "query": "Are these tests properly isolated? I want to make sure they don't call real APIs.", + "id": "t03-naming-convention", + "query": "What naming convention should our unit test functions follow?", "should_trigger": true, - "reason": "Isolation check request — test isolation is explicitly listed in the skill description." + "reason": "Asks for the naming convention rule, not a review of a specific file." }, { - "id": "t05-check-test-naming", - "query": "Do my test function names follow the right convention?", + "id": "t04-assertion-standards", + "query": "What are the assertion standards we expect in unit tests?", "should_trigger": true, - "reason": "Test naming convention check — covered explicitly by the skill." + "reason": "Asks about assertion standards in the abstract — listed trigger phrase." }, { - "id": "t06-assertions-review", - "query": "Are the assertions in these tests strong enough, or am I missing edge cases?", + "id": "t05-fixture-reuse-rules", + "query": "What are the rules for reusing fixtures across unit tests?", "should_trigger": true, - "reason": "Assertion completeness and boundary coverage — both listed in the skill description." + "reason": "Fixture reuse rules question — a convention lookup, not a file audit." }, { - "id": "t07-fixture-review", - "query": "I have a lot of copy-paste in my test setup — how should I refactor it using fixtures?", + "id": "t06-private-members-rule", + "query": "Should unit tests access private members of the class under test?", "should_trigger": true, - "reason": "Fixture management question — fixture reuse is a named concern in the skill." + "reason": "Asks for the scope rule on private member access in the abstract." }, { - "id": "t08-private-member-access", - "query": "Are any of these tests accessing private methods or internal state they shouldn't?", + "id": "t07-how-to-structure", + "query": "How should I structure my unit tests?", "should_trigger": true, - "reason": "Private member access check — directly listed in the skill description." + "reason": "'How should I structure my tests' is a listed trigger phrase." }, - { - "id": "t10-are-tests-up-to-scratch", - "query": "Are these tests up to scratch? I want to ship this service today.", + "id": "t08-best-practices", + "query": "What are the best practices for unit tests?", "should_trigger": true, - "reason": "Paraphrased test quality review — 'are these tests ok' maps to reviewing against test standards." + "reason": "'Best practices for unit tests' is a listed trigger phrase." }, { - "id": "t11-mocking-boundaries", - "query": "Should I be mocking the database layer in these tests, or is it fine to use a real connection?", + "id": "t09-what-are-the-rules", + "query": "What are the rules for failure-path coverage in our unit tests?", "should_trigger": true, - "reason": "Boundary mocking question — mock/stub decisions for I/O boundaries are within skill scope." + "reason": "'What are the rules for' a specific standard — abstract convention question." }, { - "id": "t12-failure-path-coverage", - "query": "I only have happy-path tests for this service — can you help me add failure-path coverage?", + "id": "t10-test-conventions", + "query": "Can you remind me of our test conventions before I start?", "should_trigger": true, - "reason": "Failure-path coverage — the skill explicitly requires at least one failure path per behaviour." + "reason": "'Test conventions' lookup with no specific file to write or review." }, { - "id": "n01-review-pr", - "query": "Review this PR before I merge it.", + "id": "n01-write-unit-tests", + "query": "Write unit tests for the PaymentService class.", "should_trigger": false, - "reason": "PR review request — routes to pr-review skill, not unit test standards." + "reason": "Test-writing request — routes to test-unit-write, not the reference layer." }, { - "id": "n02-implement-feature", - "query": "Implement a discount calculation function in Python.", + "id": "n02-add-failure-coverage", + "query": "I only have happy-path tests — can you add failure-path coverage?", "should_trigger": false, - "reason": "Feature implementation request with no test-writing or test-review intent." + "reason": "Adding tests is a write task — routes to test-unit-write." }, { - "id": "n03-generate-docs", - "query": "Write docstrings for all the public methods in this module.", + "id": "n03-write-boundary-tests", + "query": "Write me a test for the edge case where the input list is empty and one where it is None.", "should_trigger": false, - "reason": "Documentation generation task — no test authoring or review requested." + "reason": "Test-writing request for specific cases — routes to test-unit-write." }, { - "id": "n04-terraform-module", - "query": "Create a Terraform module for an S3 bucket with versioning enabled.", + "id": "n04-review-tests-explicit", + "query": "Can you review these unit tests and tell me if they follow best practices?", "should_trigger": false, - "reason": "Infrastructure-as-code task — routes to cps-iac skill." + "reason": "Reviewing an existing file — routes to test-unit-review, not the reference layer." }, { - "id": "n05-why-test-failing", - "query": "Why is my test throwing AttributeError: 'NoneType' object has no attribute 'charge'?", + "id": "n05-audit-test-file", + "query": "Audit this test file against our unit test standards.", "should_trigger": false, - "reason": "Runtime debugging question about a test error — not a standards review or test-writing task." + "reason": "Auditing a specific file — routes to test-unit-review even though it names 'standards'." }, { - "id": "n06-refactor-service", - "query": "Refactor this service class for lower cyclomatic complexity.", + "id": "n06-are-tests-up-to-scratch", + "query": "Are these tests up to scratch? I want to ship this service today.", "should_trigger": false, - "reason": "Production code refactoring request — no test standards involvement." + "reason": "Quality review of a specific file — routes to test-unit-review." }, { - "id": "n07-create-github-issue", - "query": "Create a GitHub issue for the flaky integration test in CI.", + "id": "n07-test-double-selection", + "query": "Should I use a spy or a stub for the email client in this test? What's the difference?", "should_trigger": false, - "reason": "Issue creation task — routes to create-issue skill." + "reason": "Test-double selection (spy vs stub vs mock) routes to test-mocking-patterns." }, { - "id": "n08-kudos", - "query": "Give kudos to Alex for fixing the performance regression.", + "id": "n08-test-data-strategy", + "query": "How should I organise my test data with a factory builder pattern?", "should_trigger": false, - "reason": "Recognition workflow — routes to kudos skill." + "reason": "Test data strategy — routes to test-data-management." }, { - "id": "n09-explain-code", - "query": "Explain how the payment gateway client works.", + "id": "n09-why-test-failing", + "query": "Why is my test throwing AttributeError: 'NoneType' object has no attribute 'charge'?", "should_trigger": false, - "reason": "Code explanation request — no test writing or review in scope." + "reason": "Runtime debugging of a test error — not a standards reference question." }, { - "id": "n10-crossall-update", - "query": "Generate the crossall status update for this sprint.", + "id": "n10-run-the-tests", + "query": "Can you run the unit tests for the PaymentService and show me the output?", "should_trigger": false, - "reason": "Sprint reporting task — routes to crossall-changes skill." + "reason": "Test execution request — the reference layer does not run tests." }, { "id": "n11-integration-test-design", "query": "Design an integration test plan for the full payment flow including the gateway sandbox.", "should_trigger": false, - "reason": "Integration test planning — the skill covers unit tests only; integration tests deliberately use real I/O and are out of scope." + "reason": "Integration test planning — the skill covers unit test standards only." }, { - "id": "n12-test-double-selection", - "query": "Should I use a spy or a stub for the email client in this test? What's the difference?", - "should_trigger": false, - "reason": "Test-double selection (spy vs stub vs mock) routes to the test-mocking-patterns skill; the skill description explicitly pairs with it for double selection and does not own this decision." - }, - { - "id": "n13-run-the-tests", - "query": "Can you run the unit tests for the PaymentService and show me the output?", + "id": "n12-review-pr", + "query": "Review this PR before I merge it.", "should_trigger": false, - "reason": "Test execution request — the skill covers writing, reviewing, and auditing test standards, not running tests." + "reason": "General PR review — routes to pr-review skill." }, { - "id": "n14-review-tests-explicit", - "query": "Can you review these unit tests and tell me if they follow best practices?", + "id": "n13-implement-feature", + "query": "Implement a discount calculation function in Python.", "should_trigger": false, - "reason": "Test review request — routes to test-unit-review, not test-unit-standards. Standards is a reference checklist, not the primary workflow trigger." + "reason": "Feature implementation with no test intent." }, { - "id": "n15-write-boundary-tests", - "query": "Write me a test for the edge case where the input list is empty and one where it is None.", + "id": "n14-generate-docs", + "query": "Write docstrings for all the public methods in this module.", "should_trigger": false, - "reason": "Test-writing request — routes to test-unit-write, not test-unit-standards. Test writing is a procedural workflow." + "reason": "Documentation generation task — no test standards intent." }, { - "id": "n16-refactor-for-clarity", - "query": "Refactor this test file to reduce duplication in the setup logic.", + "id": "n15-terraform-module", + "query": "Create a Terraform module for an S3 bucket with versioning enabled.", "should_trigger": false, - "reason": "Test refactoring request — while fixture reuse is mentioned, refactoring code is not a standards audit or checklist application." + "reason": "Infrastructure-as-code task — out of scope." }, { - "id": "n17-write-docstring", - "query": "Write docstrings for all the test functions in this file.", + "id": "n16-refactor-service", + "query": "Refactor this service class for lower cyclomatic complexity.", "should_trigger": false, - "reason": "Documentation generation task — no test authoring or standards audit requested." + "reason": "Production code refactoring — no test standards involvement." } ] From a23191c74852a301eb6007ad7f0b6109e8acbef4 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 17:38:52 +0200 Subject: [PATCH 08/12] review fixes and sync of skills --- skills/test-unit-review/SKILL.md | 25 ++++++++++--------------- skills/test-unit-standards/SKILL.md | 2 +- skills/test-unit-write/SKILL.md | 21 ++++++++++----------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index 33f743b..8560ca8 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -2,21 +2,16 @@ name: test-unit-review description: > Review and audit existing unit tests against quality standards. Activate when the user - asks to review, critique, give feedback, check quality, or audit any existing test file - — including informal requests like "does this look right?" or sharing a test for a quality - opinion. This skill evaluates tests for isolation, scope, naming, assertions, coverage, and - fixture reuse. - Triggers on: "review these tests", "audit this test file", "check my tests", "are these - tests good enough", "do these tests follow our standards", "LGTM on tests", - "does this test look right", "is this test missing anything", "any issues with my tests", - "give feedback on this test file", "are these tests up to scratch", "are these tests - properly isolated", "test isolation check", "assertion quality". - Does NOT trigger for: writing new tests (use test-unit-write), adding tests or coverage - (use test-unit-write), choosing test doubles or mocking strategy (use test-mocking-patterns), - managing test data strategies (use test-data-management), debugging failing tests at runtime, - general PR review (use pr-review), or integration test planning (use test-integration-standards). - Pairs with test-unit-write (to add missing tests) and test-unit-standards (for reference). -license: Proprietary + asks to review, critique, audit, or give feedback on any existing test file — including + informal requests sharing a test for a quality opinion. Evaluates isolation, scope, naming, + assertions, coverage, and fixture reuse. + Triggers on: "review these tests", "audit this test file", "check my tests", "LGTM on tests", + "does this test look right", "any issues with my tests", "test isolation check", "assertion quality". + Does NOT trigger for: writing new tests (use test-unit-write), choosing test doubles + (use test-mocking-patterns), managing test data (use test-data-management), debugging failing + tests at runtime, general PR review (use pr-review), or integration test planning + (use test-integration-standards). Pairs with test-unit-write and test-unit-standards. +license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index 744cf8e..9f32a09 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -14,7 +14,7 @@ description: > test data strategies (use test-data-management), debugging or running tests, integration test planning, general PR review (use pr-review), or non-test tasks. Pairs with test-unit-write and test-unit-review — standards is their reference layer. -license: Proprietary +license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 182d437..26cbaf4 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -3,19 +3,18 @@ name: test-unit-write description: > Write unit tests from scratch or extend partial coverage for functions, classes, or modules. Activate when the user asks to write, generate, add, or scaffold unit tests — including - requests for partial coverage such as "add failure-path tests", "add boundary tests", or - "I need tests for this method". Also activate for "help me test this" and "what tests - should I write for X?". This skill covers test isolation, naming, assertions, and coverage. + partial coverage requests like "add failure-path tests" or "add boundary tests". Also + activates for "help me test this" and "what tests should I write for X?". Triggers on: "write unit tests for", "add tests for", "generate test cases for", - "help me test this", "add coverage for", "scaffold tests for", "I need unit tests", - "write tests that cover", "what tests should I write", "add failure-path coverage", - "add edge case tests", "add boundary tests", "test this method", "generate test". - Does NOT trigger for: reviewing or auditing existing tests (use test-unit-review), asking - about test standards or conventions (use test-unit-standards), choosing test doubles — spy - vs stub vs mock (use test-mocking-patterns), managing test data strategies (use test-data-management), - debugging failing tests at runtime, integration or e2e test planning, documentation writing, - or general code refactoring. + "help me test this", "I need unit tests", "add failure-path coverage", "add edge case tests", + "add boundary tests", "test this method", "generate test", "what tests should I write". + Does NOT trigger for: reviewing existing tests (use test-unit-review), asking about + test conventions (use test-unit-standards), choosing test doubles (use test-mocking-patterns), + managing test data (use test-data-management), debugging failing tests at runtime, + integration or e2e test planning, or general code refactoring. Pairs with test-unit-standards (reference) and test-mocking-patterns (test-double selection). +license: Apache-2.0 +compatibility: GitHub Copilot --- # Unit Test Writer From 82b707fca3e18186cf08a812be3f8a95b4dab85f Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Tue, 23 Jun 2026 17:45:15 +0200 Subject: [PATCH 09/12] shrink of descriptions --- skills/test-unit-review/SKILL.md | 14 +++++--------- skills/test-unit-standards/SKILL.md | 19 ++++++------------- skills/test-unit-write/SKILL.md | 16 +++++----------- 3 files changed, 16 insertions(+), 33 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index 8560ca8..af996b5 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -1,16 +1,12 @@ --- name: test-unit-review description: > - Review and audit existing unit tests against quality standards. Activate when the user - asks to review, critique, audit, or give feedback on any existing test file — including - informal requests sharing a test for a quality opinion. Evaluates isolation, scope, naming, - assertions, coverage, and fixture reuse. + Review existing unit tests: isolation, scope, naming, assertions, coverage, fixture reuse. Triggers on: "review these tests", "audit this test file", "check my tests", "LGTM on tests", - "does this test look right", "any issues with my tests", "test isolation check", "assertion quality". - Does NOT trigger for: writing new tests (use test-unit-write), choosing test doubles - (use test-mocking-patterns), managing test data (use test-data-management), debugging failing - tests at runtime, general PR review (use pr-review), or integration test planning - (use test-integration-standards). Pairs with test-unit-write and test-unit-standards. + "test isolation check", "assertion quality". + Does NOT trigger for: writing new tests (use test-unit-write), test conventions + (use test-unit-standards), test doubles (use test-mocking-patterns), test data + (use test-data-management), PR review (use pr-review), integration tests (use test-integration-standards). license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index 9f32a09..7b0c60a 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -1,19 +1,12 @@ --- name: test-unit-standards description: > - Reference checklist for unit test standards — isolation, scope, naming, - assertions, coverage, and fixtures. Ask this skill about test conventions, structural - rules, and best practices. This is the reference layer that test-unit-write and - test-unit-review consult; it does not write or audit tests itself. - Triggers on: "unit test standards", "test isolation rules", "test naming convention", - "assertion standards", "fixture reuse rules", "private member in tests", "test - conventions", "what are the rules for", "how should I structure my tests", "best - practices for unit tests". - Does NOT trigger for: writing new tests (use test-unit-write), reviewing or auditing - existing tests (use test-unit-review), choosing test doubles (use test-mocking-patterns), - test data strategies (use test-data-management), debugging or running tests, - integration test planning, general PR review (use pr-review), or non-test tasks. - Pairs with test-unit-write and test-unit-review — standards is their reference layer. + Unit test standards reference: isolation, scope, naming, assertions, coverage, fixtures. + Does not write or audit tests. + Triggers on: "unit test standards", "test naming convention", "best practices for unit tests", + "how should I structure my tests". + Does NOT trigger for: writing tests (use test-unit-write), reviewing tests (use test-unit-review), + test doubles (use test-mocking-patterns), test data (use test-data-management), PR review (use pr-review). license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 26cbaf4..07abbcd 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -2,17 +2,11 @@ name: test-unit-write description: > Write unit tests from scratch or extend partial coverage for functions, classes, or modules. - Activate when the user asks to write, generate, add, or scaffold unit tests — including - partial coverage requests like "add failure-path tests" or "add boundary tests". Also - activates for "help me test this" and "what tests should I write for X?". - Triggers on: "write unit tests for", "add tests for", "generate test cases for", - "help me test this", "I need unit tests", "add failure-path coverage", "add edge case tests", - "add boundary tests", "test this method", "generate test", "what tests should I write". - Does NOT trigger for: reviewing existing tests (use test-unit-review), asking about - test conventions (use test-unit-standards), choosing test doubles (use test-mocking-patterns), - managing test data (use test-data-management), debugging failing tests at runtime, - integration or e2e test planning, or general code refactoring. - Pairs with test-unit-standards (reference) and test-mocking-patterns (test-double selection). + Triggers on: "write unit tests for", "add tests for", "help me test this", "add failure-path tests", + "add boundary tests", "what tests should I write". + Does NOT trigger for: reviewing tests (use test-unit-review), test conventions + (use test-unit-standards), test doubles (use test-mocking-patterns), test data + (use test-data-management), debugging, integration/e2e planning, or refactoring. license: Apache-2.0 compatibility: GitHub Copilot --- From 5176bb76d9860ee760c127e8dedda2ce41293fd7 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Wed, 24 Jun 2026 07:46:55 +0200 Subject: [PATCH 10/12] Improved trigger --- skills/test-unit-review/SKILL.md | 13 +++++++------ skills/test-unit-standards/SKILL.md | 13 +++++++------ skills/test-unit-write/SKILL.md | 13 +++++++------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index af996b5..1ef8ce9 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -1,12 +1,13 @@ --- name: test-unit-review description: > - Review existing unit tests: isolation, scope, naming, assertions, coverage, fixture reuse. - Triggers on: "review these tests", "audit this test file", "check my tests", "LGTM on tests", - "test isolation check", "assertion quality". - Does NOT trigger for: writing new tests (use test-unit-write), test conventions - (use test-unit-standards), test doubles (use test-mocking-patterns), test data - (use test-data-management), PR review (use pr-review), integration tests (use test-integration-standards). + Review and audit existing unit tests: isolation, scope, naming, assertions, coverage, fixtures. + Triggers: feedback/review/check/audit on tests ("review these tests", "audit this test file", "check my tests", + "LGTM on tests", "are these tests up to scratch", "does this test look right", "any issues with my tests", + "give me feedback on this test file", "is this test missing coverage", "do these tests follow standards", + "are these tests properly isolated"). + NOT for: writing (→test-unit-write), standards questions (→test-unit-standards), test doubles (→test-mocking-patterns), + test data (→test-data-management), PR review (→pr-review), or debugging/CI. license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index 7b0c60a..d26976a 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -1,12 +1,13 @@ --- name: test-unit-standards description: > - Unit test standards reference: isolation, scope, naming, assertions, coverage, fixtures. - Does not write or audit tests. - Triggers on: "unit test standards", "test naming convention", "best practices for unit tests", - "how should I structure my tests". - Does NOT trigger for: writing tests (use test-unit-write), reviewing tests (use test-unit-review), - test doubles (use test-mocking-patterns), test data (use test-data-management), PR review (use pr-review). + Reference for unit test standards: isolation, scope, naming, assertions, coverage, and fixtures. + Triggers: abstract questions only ("what are our unit test standards", "test naming convention", + "best practices for unit tests", "how should I structure my tests", "what's the rule for test isolation", + "assertion standards", "rules for reusing fixtures", "should I access private members", + "test conventions", "what are the rules for failure-path coverage"). + NOT for: writing (→test-unit-write), reviewing/auditing files (→test-unit-review), test doubles (→test-mocking-patterns), + test data strategies (→test-data-management), or PR review (→pr-review). license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 07abbcd..03c9073 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -1,12 +1,13 @@ --- name: test-unit-write description: > - Write unit tests from scratch or extend partial coverage for functions, classes, or modules. - Triggers on: "write unit tests for", "add tests for", "help me test this", "add failure-path tests", - "add boundary tests", "what tests should I write". - Does NOT trigger for: reviewing tests (use test-unit-review), test conventions - (use test-unit-standards), test doubles (use test-mocking-patterns), test data - (use test-data-management), debugging, integration/e2e planning, or refactoring. + Write unit tests from scratch or extend coverage for functions, classes, or modules. + Triggers: test-writing requests ("write unit tests for", "add unit tests for", "generate test cases", + "help me test this", "scaffold tests", "add failure-path coverage", "what tests should I write", + "add edge case tests", "I need unit tests", "add coverage for", "I only have happy-path tests"). + NOT for: reviewing/auditing (→test-unit-review), standards questions (→test-unit-standards), + test doubles (→test-mocking-patterns), test data organization (→test-data-management), + debugging, integration tests (→test-integration-standards), or code refactoring. license: Apache-2.0 compatibility: GitHub Copilot --- From 3e7057448fefd7bcb3762e599e21628d0ffdc619 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Wed, 24 Jun 2026 08:35:20 +0200 Subject: [PATCH 11/12] Refactor unit test skills: enhance descriptions for review, standards, and writing processes --- skills/test-unit-review/SKILL.md | 13 ++++++------- skills/test-unit-standards/SKILL.md | 13 ++++++------- skills/test-unit-write/SKILL.md | 13 ++++++------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/skills/test-unit-review/SKILL.md b/skills/test-unit-review/SKILL.md index 1ef8ce9..e7575cf 100644 --- a/skills/test-unit-review/SKILL.md +++ b/skills/test-unit-review/SKILL.md @@ -1,13 +1,12 @@ --- name: test-unit-review description: > - Review and audit existing unit tests: isolation, scope, naming, assertions, coverage, fixtures. - Triggers: feedback/review/check/audit on tests ("review these tests", "audit this test file", "check my tests", - "LGTM on tests", "are these tests up to scratch", "does this test look right", "any issues with my tests", - "give me feedback on this test file", "is this test missing coverage", "do these tests follow standards", - "are these tests properly isolated"). - NOT for: writing (→test-unit-write), standards questions (→test-unit-standards), test doubles (→test-mocking-patterns), - test data (→test-data-management), PR review (→pr-review), or debugging/CI. + Systematically review unit tests against isolation, scope, naming, assertions, coverage, fixtures. + Categorizes issues by severity (blocker/important/nit) with standards-grounded feedback. + Triggers on: review, audit, check, feedback requests ("review these tests", "audit this file", + "check my tests", "LGTM", "any issues", "give feedback", "do these follow standards"). + NOT: writing (→test-unit-write), standards (→test-unit-standards), test doubles (→test-mocking-patterns), + test data (→test-data-management), PR review (→pr-review), debugging/CI. license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-standards/SKILL.md b/skills/test-unit-standards/SKILL.md index d26976a..d144c15 100644 --- a/skills/test-unit-standards/SKILL.md +++ b/skills/test-unit-standards/SKILL.md @@ -1,13 +1,12 @@ --- name: test-unit-standards description: > - Reference for unit test standards: isolation, scope, naming, assertions, coverage, and fixtures. - Triggers: abstract questions only ("what are our unit test standards", "test naming convention", - "best practices for unit tests", "how should I structure my tests", "what's the rule for test isolation", - "assertion standards", "rules for reusing fixtures", "should I access private members", - "test conventions", "what are the rules for failure-path coverage"). - NOT for: writing (→test-unit-write), reviewing/auditing files (→test-unit-review), test doubles (→test-mocking-patterns), - test data strategies (→test-data-management), or PR review (→pr-review). + Comprehensive reference for unit test standards: isolation, scope, naming, assertions, coverage, fixtures. + Language-specific guidance with principles, rationale, and conventions. Triggers on abstract questions only + ("what are our standards", "naming convention", "best practices", "how to structure", "rules for isolation", + "fixture reuse", "failure-path coverage"). + NOT: writing (→test-unit-write), reviewing files (→test-unit-review), test doubles (→test-mocking-patterns), + test data (→test-data-management), PR review (→pr-review), code refactoring. license: Apache-2.0 compatibility: GitHub Copilot --- diff --git a/skills/test-unit-write/SKILL.md b/skills/test-unit-write/SKILL.md index 03c9073..a3233dc 100644 --- a/skills/test-unit-write/SKILL.md +++ b/skills/test-unit-write/SKILL.md @@ -1,13 +1,12 @@ --- name: test-unit-write description: > - Write unit tests from scratch or extend coverage for functions, classes, or modules. - Triggers: test-writing requests ("write unit tests for", "add unit tests for", "generate test cases", - "help me test this", "scaffold tests", "add failure-path coverage", "what tests should I write", - "add edge case tests", "I need unit tests", "add coverage for", "I only have happy-path tests"). - NOT for: reviewing/auditing (→test-unit-review), standards questions (→test-unit-standards), - test doubles (→test-mocking-patterns), test data organization (→test-data-management), - debugging, integration tests (→test-integration-standards), or code refactoring. + Write complete unit tests from scratch or extend coverage for functions, classes, modules. + Follows standards and coverage-driven approach with language-specific patterns. + Triggers on: test-writing requests ("write unit tests for", "add unit tests for", + "generate test cases", "help me test", "scaffold tests", "add failure-path", "add coverage"). + NOT: reviewing (→test-unit-review), standards (→test-unit-standards), test doubles (→test-mocking-patterns), + test data (→test-data-management), debugging, integration tests (→test-integration-standards), refactoring. license: Apache-2.0 compatibility: GitHub Copilot --- From ebadbe9f95ed1606432a3152f833d5e642dd60c7 Mon Sep 17 00:00:00 2001 From: miroslavpojer Date: Wed, 24 Jun 2026 10:33:20 +0200 Subject: [PATCH 12/12] Add unit test skills: implement standards, writer, and reviewer documentation --- README.md | 3 ++ docs/README.md | 3 ++ docs/test-unit-review.md | 101 ++++++++++++++++++++++++++++++++++++ docs/test-unit-standards.md | 43 +++++++++++++++ docs/test-unit-write.md | 89 +++++++++++++++++++++++++++++++ 5 files changed, 239 insertions(+) create mode 100644 docs/test-unit-review.md create mode 100644 docs/test-unit-standards.md create mode 100644 docs/test-unit-write.md diff --git a/README.md b/README.md index 6b1f500..0146004 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,9 @@ its purpose, trigger phrases, and full instructions. | Skill | Description | |------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| | **[pr-review](./skills/pr-review/)** | Pull request code review — reviews diffs for risk, security issues, API contract changes, dependency bumps, CI/CD and infrastructure changes. Produces concise Blocker / Important / Nit comments. | +| **[test-unit-standards](./skills/test-unit-standards/)** | Reference for unit test standards across isolation, scope, naming, assertions, coverage, and fixtures. Language-specific guidance (pytest, Jest, MUnit) with principles and conventions. | +| **[test-unit-write](./skills/test-unit-write/)** | Generate unit tests from scratch following language-specific standards. Analyzes source, selects mock strategies, and produces tests covering happy paths, failure conditions, and edge cases. | +| **[test-unit-review](./skills/test-unit-review/)** | Systematically audit unit test suites. Runs test runner, checks isolation/scope/naming/assertions/coverage standards, and reports findings by severity (Blocker / Important / Nit). | | **[token-saving](./skills/token-saving/)** | Always-active response discipline — enforces brevity, no filler openers or closers, structured output, and a What/Why/How footer on code responses. Suspends on explicit "full detail" requests. | ## Finding More Skills diff --git a/docs/README.md b/docs/README.md index 1388ae5..f130467 100644 --- a/docs/README.md +++ b/docs/README.md @@ -26,6 +26,9 @@ Navigation hub for all guides in this repository. Browse by category below. |----|----| | [PR Review](./pr-review.md) | How the PR review skill works, what sections it applies, and how to trigger it | | [Token Saving](./token-saving.md) | Keeping AI responses concise — how the token-saving skill works and when it applies | +| [Unit Test Standards](./test-unit-standards.md) | Reference for unit test standards across isolation, scope, naming, assertions, coverage, fixtures | +| [Unit Test Writer](./test-unit-write.md) | Generate complete unit tests from scratch following language-specific standards | +| [Unit Test Reviewer](./test-unit-review.md) | Systematically audit unit tests and report findings by severity | > **Keep this index up to date.** When you add a new guide, add a row to the appropriate table above. diff --git a/docs/test-unit-review.md b/docs/test-unit-review.md new file mode 100644 index 0000000..8e09290 --- /dev/null +++ b/docs/test-unit-review.md @@ -0,0 +1,101 @@ +# Unit Test Reviewer + +This skill systematically audits unit tests against isolation, scope, naming, assertions, coverage, and fixture standards. + +## Quick Start + +Ask the agent to review tests: +- "Review these tests" +- "Audit this test file" +- "Check my unit tests" +- "Do these follow our standards?" +- "Give feedback on this test suite" + +Provide the test file, and the agent will: + +1. Run the test suite (pytest, Jest, MUnit, etc.) +2. Check each standard (isolation, scope, naming, assertions, coverage, fixtures) +3. Report findings grouped by severity: **Blocker / Important / Nit** + +## What This Skill Does + +**test-unit-review** performs a 5-step systematic audit: + +1. **Load language reference** — Detect language/framework conventions +2. **Read test file completely** — Identify units under test, test count, dependencies +3. **Run tests if available** — Surface failing tests as blockers immediately +4. **Check against standards** — Walk through each standard category in order +5. **Report findings** — Group violations by severity with specific fixes + +## Report Severity Levels + +### Blocker + +Tests that are broken or unreliable: +- **Failing tests** — any test that doesn't pass +- **Real I/O** — network calls, file reads/writes, real database connections +- **Shared mutable state** — tests that depend on execution order or share fixtures unsafely + +### Important + +Gaps in coverage or weak assertions: +- **Missing failure paths** — ≥1 failure path per behaviour not tested +- **Missing boundary coverage** — empty, zero, None/null, max/min values not tested +- **Weak assertions** — `assert result`, `assert x is not None` where exact value is known +- **Private member access** — tests reaching into private internals +- **Copy-pasted setup** — fixtures duplicated instead of reused +- **Unreadable naming** — test names that don't state condition and expected outcome + +### Nit + +Minor style issues: +- **Naming inconsistencies** — mixing naming conventions +- **Missing fixture docstrings** — fixtures without purpose documentation +- **Style issues** — formatting, unused imports, etc. + +## Example Feedback + +``` +### Blocker +- ❌ test_payment_service_real_api (line 18): Makes real HTTP call to payment gateway. Mock the endpoint. + +### Important +- ❌ test_order_with_zero_amount (line 45): Missing. Zero is a boundary value — test must verify rejection or special handling. +- ⚠️ test_order_succeeds (line 12): Assertion `assert result` is weak. Should be `assert result.status == "success"`. +- ⚠️ mock_payment_gateway (line 8): Fixture missing docstring — state purpose and side effects. + +### Nit +- test_order_invalid_... and test_process_invalid_... mixed naming. Choose one prefix. + +--- +**Summary**: 1 blocker (remove real I/O), 2 important (add boundary test, strengthen assertion), fix naming. No other issues. +``` + +## Language Support + +| Language | Command | Reference | +|---|---|---| +| Python + pytest | `pytest -v` | `references/python-pytest.md` | +| JavaScript / TypeScript + Jest | `npx jest --no-coverage` | `references/javascript-jest.md` | +| Scala + MUnit | `./mvnw test -pl -Dtest=` | `references/scala-munit.md` | +| .NET | `dotnet test --filter "FullyQualifiedName~"` | (future) | + +## When To Use + +- Reviewing test files before PR merge +- Auditing test quality on an existing codebase +- Learning what standards look like in practice +- Validating test coverage and isolation + +## When NOT To Use + +- Writing new tests → use **[test-unit-write](./test-unit-write.md)** +- Abstract standards questions → use **[test-unit-standards](./test-unit-standards.md)** +- Reviewing PR code (non-tests) → use **[pr-review](./pr-review.md)** +- Debugging CI failures → not this skill (use debugging workflow) + +## Related Skills + +- **[test-unit-write](./test-unit-write.md)** — Generate tests following standards +- **[test-unit-standards](./test-unit-standards.md)** — Reference for standards definitions +- **[pr-review](./pr-review.md)** — Review full PR (including tests + source) diff --git a/docs/test-unit-standards.md b/docs/test-unit-standards.md new file mode 100644 index 0000000..9c83f0c --- /dev/null +++ b/docs/test-unit-standards.md @@ -0,0 +1,43 @@ +# Unit Test Standards + +This skill defines comprehensive reference standards for unit testing across Python, JavaScript/TypeScript, and Scala. + +## Quick Start + +Ask the agent about unit test best practices: +- "What are our standards for unit test isolation?" +- "How should I name a test in pytest?" +- "What are the naming conventions for Jest tests?" +- "How do I structure fixtures properly?" + +## What This Skill Does + +**test-unit-standards** provides language-agnostic principles plus language-specific conventions for: + +- **Isolation** — no real I/O, independent tests, no shared mutable state +- **Scope** — unit = one function/method/class, public interface only +- **Naming** — test names state scenario clearly +- **Assertions** — specific over generic, cover success + ≥1 failure path per behaviour +- **Coverage** — success paths, failure paths, boundary values (empty, zero, null, max) +- **Fixtures** — framework-idiomatic location, reusable, documented + +## Language References + +The skill loads language-specific guidance for: + +| Language / Framework | Location | +|---|---| +| Python + pytest | `skills/test-unit-standards/references/python-pytest.md` | +| JavaScript / TypeScript + Jest | `skills/test-unit-standards/references/javascript-jest.md` | +| Scala + MUnit | `skills/test-unit-standards/references/scala-munit.md` | + +## When To Use + +- **Abstract questions**: "What are our standards?", "Best practices for X?", "Naming convention?" +- **Learning**: Understanding isolation rules, fixture reuse, coverage depth +- **Validation**: Confirming a test design before writing or reviewing + +## Related Skills + +- **[test-unit-write](./test-unit-write.md)** — Generate new tests following standards +- **[test-unit-review](./test-unit-review.md)** — Audit existing tests against standards diff --git a/docs/test-unit-write.md b/docs/test-unit-write.md new file mode 100644 index 0000000..4d239eb --- /dev/null +++ b/docs/test-unit-write.md @@ -0,0 +1,89 @@ +# Unit Test Writer + +This skill generates complete unit tests from scratch or extends coverage for functions, classes, and modules. + +## Quick Start + +Ask the agent to write tests: +- "Write unit tests for this function" +- "Add tests covering the failure paths" +- "Generate test cases for this class" +- "Help me test this service" + +Provide the source code, and the agent will: + +1. Analyze the public API and I/O boundaries +2. Select appropriate mock strategies (stub/patch/inject) +3. Generate test cases covering happy paths, failures, and edge cases +4. Output tests in language-idiomatic format (pytest, Jest, MUnit) + +## What This Skill Does + +**test-unit-write** follows a 7-step workflow: + +1. **Load language reference** — Detect language/framework and load conventions +2. **Analyse source** — Identify public API, dependencies, I/O boundaries, failure conditions +3. **Choose mock strategy** — Decide how to isolate HTTP, DB, files, clock, randomness +4. **Scaffold test file** — Setup imports, fixtures, mocking infrastructure +5. **Write test cases** — Happy path, failure paths (≥1 per behaviour), edge/boundary cases +6. **Assert correctly** — Use framework-specific matchers, not generic truthy checks +7. **Run and validate** — Execute tests; fix any failures before returning + +## Generated Test Structure + +```python +# pytest example +import pytest +from unittest.mock import MagicMock +from order_service import OrderService + +@pytest.fixture +def mock_payment_gateway(): + """Mock payment gateway for order processing.""" + return MagicMock() + +def test_order_succeeds_with_valid_input(mock_payment_gateway): + """Happy path: order placed with valid amount and payment success.""" + service = OrderService(mock_payment_gateway) + service.process_order(customer_id=123, amount=50.00) + assert mock_payment_gateway.charge.called + +def test_order_fails_on_insufficient_funds(mock_payment_gateway): + """Failure path: payment gateway returns insufficient funds.""" + mock_payment_gateway.charge.side_effect = ValueError("Insufficient funds") + service = OrderService(mock_payment_gateway) + with pytest.raises(ValueError, match="Insufficient funds"): + service.process_order(customer_id=123, amount=50.00) +``` + +## Coverage Requirements + +Each test addresses: +- **Success path** — main behaviour under ideal conditions +- **≥1 failure path per behaviour** — exceptions, error conditions, guards +- **Boundary values** — empty, zero, None/null, max/min values + +## Language Support + +- **Python** — pytest with fixtures, parametrization, monkeypatch +- **JavaScript / TypeScript** — Jest with mocks, spies, async/await +- **Scala** — MUnit with scopes, fixtures, assertions + +## When To Use + +- Writing tests for new functions or classes +- Extending coverage for existing code +- Scaffolding test structure and mocking strategy +- Learning what good test coverage looks like + +## When NOT To Use + +- Reviewing existing tests → use **[test-unit-review](./test-unit-review.md)** +- Abstract standards questions → use **[test-unit-standards](./test-unit-standards.md)** +- Test doubles / mocking patterns → separate skill (test-mocking-patterns, future) +- Integration tests → use integration test skill (future) + +## Related Skills + +- **[test-unit-standards](./test-unit-standards.md)** — Reference for naming, isolation, assertions +- **[test-unit-review](./test-unit-review.md)** — Audit tests after writing