From 135395bbe5bdb08a1ad28e41b7bb983ee1d95aa2 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:32:04 -0700 Subject: [PATCH 01/11] config: add SUFFIX_ACRONYMS_AMBIGUOUS, move (ret)/(vet) to SUFFIX_NOT_ACRONYMS --- nameparser/config/__init__.py | 8 +++++++- nameparser/config/suffixes.py | 26 ++++++++++++++++++++++++-- tests/test_suffixes.py | 8 ++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index c143299..79d4f8c 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -42,6 +42,7 @@ from nameparser.config.conjunctions import CONJUNCTIONS from nameparser.config.suffixes import SUFFIX_ACRONYMS from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS +from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import EMPTY_REGEX, REGEXES @@ -236,8 +237,10 @@ class Constants: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. + :param set suffix_acronyms_ambiguous: + :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. 
:param set first_name_prefixes: @@ -257,6 +260,7 @@ class Constants: first_name_titles: SetManager conjunctions: SetManager first_name_prefixes: SetManager + suffix_acronyms_ambiguous: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager _pst: Set[str] | None @@ -388,6 +392,7 @@ def __init__(self, prefixes: Iterable[str] = PREFIXES, suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, + suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, titles: Iterable[str] = TITLES, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, @@ -406,6 +411,7 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.first_name_prefixes = SetManager(first_name_prefixes) + self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) self.patronymic_name_order = patronymic_name_order diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index f014c81..dd17fe8 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -13,6 +13,8 @@ 'iii', 'iv', 'v', + 'ret', + 'vet', ]) """ @@ -21,8 +23,6 @@ """ SUFFIX_ACRONYMS = set([ - '(ret)', - '(vet)', '8-vsb', 'aas', 'aba', @@ -653,3 +653,25 @@ when matching against these pieces. """ +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + # Suffix acronyms that also commonly work as given-name nicknames on + # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() + # when deciding whether parenthesized/quoted content is a nickname or a + # suffix -- content matching one of these stays a nickname rather than + # being reclassified as a suffix, since that's the more common reading + # in ambiguous, delimiter-only context. 
+ # + # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if + # the exact letter sequence could plausibly be someone's given name or + # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous + # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. + 'ed', + 'jd', +]) +""" + +Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a +common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, +standalone exception list consulted only by parse_nicknames(). + +""" diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 438b508..175b2e0 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -274,3 +274,11 @@ def test_suffix_delimiter_inverted_format_known_limitation(self) -> None: # This is a documented limitation — do not "fix" it without a broader solution. hn = HumanName("Doe, Mary - Kate, RN", suffix_delimiter=" - ") self.assertNotEqual(hn.first, "Mary - Kate") + + def test_suffix_acronyms_ambiguous_is_customizable(self) -> None: + from nameparser.config import Constants + custom = Constants(suffix_acronyms_ambiguous=['xyz']) + self.assertEqual(set(custom.suffix_acronyms_ambiguous), {'xyz'}) + # Constructing without the kwarg still works and uses the module default. 
+ default = Constants() + self.assertIn('jd', default.suffix_acronyms_ambiguous) From 8995b2bf0be4abe3825d2327694f0c062e661f5a Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:44:40 -0700 Subject: [PATCH 02/11] fix: don't extract suffix-shaped parenthesized/quoted content as nicknames (#111) --- nameparser/parser.py | 33 +++++++++++++++--- tests/test_nicknames.py | 8 +++++ tests/test_suffixes.py | 75 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/nameparser/parser.py b/nameparser/parser.py index 4cda41a..694d646 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -774,7 +774,11 @@ def fix_phd(self) -> None: def parse_nicknames(self) -> None: """ The content of parenthesis or quotes in the name will be added to the - nicknames list. This happens before any other processing of the name. + nicknames list, unless that content is suffix-shaped -- an unambiguous + suffix_not_acronyms/suffix_acronyms member, or content ending in a + period -- in which case it's left in place (undelimited) for normal + downstream suffix/title/word parsing instead. This happens before any + other processing of the name. Single quotes cannot span white space characters and must border white space to allow for quotes in names like O'Connor and Kawai'ae'a. @@ -788,10 +792,31 @@ def parse_nicknames(self) -> None: re_double_quotes = self.C.regexes.double_quotes re_parenthesis = self.C.regexes.parenthesis + def handle_match(m: 're.Match[str]') -> str: + # Fall back to the whole match when the regex has no capturing + # group (e.g. a custom override regex without one, like + # EMPTY_REGEX) -- mirrors the old code's use of findall(), which + # returns the whole match for group-less patterns. 
+ content = m.group(1) if m.lastindex else m.group(0) + stripped = lc(content) + is_unambiguous_suffix = ( + stripped in self.C.suffix_not_acronyms + or (stripped in self.C.suffix_acronyms + and stripped not in self.C.suffix_acronyms_ambiguous) + ) + if is_unambiguous_suffix or content.endswith('.'): + # Leave the bare content -- no delimiters -- so downstream + # word-splitting/suffix-matching sees it exactly as if it had + # never been wrapped in parens/quotes. is_suffix()/lc() only + # strip periods, never parens/quotes, so returning m.group(0) + # here (e.g. literal "(Ret)") would never match + # suffix_not_acronyms ("ret"). + return content + self.nickname_list.append(content) + return '' + for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub('', self._full_name) + self._full_name = _re.sub(handle_match, self._full_name) def squash_emoji(self) -> None: """ diff --git a/tests/test_nicknames.py b/tests/test_nicknames.py index c5aea85..fa79e45 100644 --- a/tests/test_nicknames.py +++ b/tests/test_nicknames.py @@ -134,6 +134,14 @@ def test_nickname_and_last_name_with_title(self) -> None: self.m(hn.last, "Edmonds", hn) self.m(hn.nickname, "Rick", hn) + def test_ambiguous_suffix_acronym_in_parenthesis_stays_nickname(self) -> None: + # JD is in SUFFIX_ACRONYMS_AMBIGUOUS: both a law-degree acronym and a + # common given-name nickname. Existing behavior (nickname) must be + # preserved -- see issue #111. 
+ hn = HumanName("JEFFREY (JD) BRICKEN") + self.m(hn.nickname, "JD", hn) + self.m(hn.suffix, "", hn) + # class MaidenNameTestCase(HumanNameTestBase): # diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 175b2e0..33a693a 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -282,3 +282,78 @@ def test_suffix_acronyms_ambiguous_is_customizable(self) -> None: # Constructing without the kwarg still works and uses the module default. default = Constants() self.assertIn('jd', default.suffix_acronyms_ambiguous) + + def test_suffix_in_parenthesis_with_other_suffixes(self) -> None: + hn = HumanName("Andrew Perkins, Jr., Col. (Ret)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.assertIn("Ret", hn.suffix) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_mid_name(self) -> None: + # "Jr." is suffix-shaped, so parse_nicknames() no longer treats it as + # a nickname. But it isn't in trailing position, and parse_full_name's + # suffix detection only recognizes a trailing run of suffix-shaped + # pieces -- so it lands wherever normal parsing would put a bare + # mid-name "Jr." token, exactly as if the parens were never there + # (verified: HumanName("Lon Jr. Williams") parses identically). + # Known limitation: making this land in `suffix` would require + # changing parse_full_name's suffix detection, out of scope here -- + # issue #111 is specifically about the nickname misclassification. + hn = HumanName("Lon (Jr.) Williams") + self.m(hn.first, "Lon", hn) + self.m(hn.middle, "Jr.", hn) + self.m(hn.last, "Williams", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_with_period(self) -> None: + # Same known limitation as above: "Ret." is mid-name (no comma), so + # it's outside the trailing run parse_full_name's suffix detection + # requires. It parses exactly as bare "Col. Ret. Smith" would. + hn = HumanName("Col. (Ret.) 
Smith") + self.m(hn.title, "Col.", hn) + self.m(hn.first, "Ret.", hn) + self.m(hn.last, "Smith", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_acronym_suffix_in_parenthesis(self) -> None: + hn = HumanName("Andrew Perkins (MBA)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None: + # "Mgr." isn't in any suffix list, but it ends in a period, so the + # period heuristic (rule 2) excludes it from nickname_list. It flows + # into normal parsing instead of being force-classified as a suffix. + hn = HumanName("Andrew Perkins (Mgr.)") + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None: + # A custom suffix_acronyms_ambiguous entry keeps a suffix_acronyms + # member classified as a nickname instead of a suffix, confirming + # the exception list -- not a hardcoded check -- drives the behavior. + from nameparser.config import Constants + C = Constants( + suffix_acronyms=['xyz'], + suffix_acronyms_ambiguous=['xyz'], + ) + hn = HumanName("Andrew Perkins (XYZ)", constants=C) + self.m(hn.nickname, "XYZ", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_acronyms_ambiguous_removal_routes_to_suffix(self) -> None: + # Removing 'jd' from a custom suffix_acronyms_ambiguous flips JD + # from nickname to suffix. Uses a trailing-position name (unlike the + # JEFFREY (JD) BRICKEN regression guard in test_nicknames.py) so + # parse_full_name's trailing-run suffix detection actually picks it + # up -- see the known mid-name limitation noted on the tests above. 
+ from nameparser.config import Constants + C = Constants(suffix_acronyms_ambiguous=[]) + hn = HumanName("Andrew Perkins (JD)", constants=C) + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "JD", hn) From a7e04f51ac834914e7735f69208281245447cd7c Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:50:14 -0700 Subject: [PATCH 03/11] docs: add suffix-in-parens design spec and implementation plan (#111) Includes the corrected scope-boundary note discovered during Task 2 implementation: parse_full_name's suffix detection only recognizes a trailing run of suffix-shaped pieces, so mid-name content freed from parens/quotes doesn't always land in suffix -- documented as a known, pre-existing limitation rather than worked around. --- ...6-06-30-suffix-in-parens-implementation.md | 396 ++++++++++++++++++ .../2026-06-30-suffix-in-parens-design.md | 288 +++++++++++++ 2 files changed, 684 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md create mode 100644 docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md diff --git a/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md b/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md new file mode 100644 index 0000000..3d79127 --- /dev/null +++ b/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md @@ -0,0 +1,396 @@ +# Suffix-in-Parentheses Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Stop `parse_nicknames()` from misclassifying suffix-like content inside parentheses/quotes (e.g. `(Ret)`, `(Jr.)`, `(MBA)`) as nicknames, while preserving the existing `JEFFREY (JD) BRICKEN` → nickname `JD` behavior, per [issue #111](https://github.com/derek73/python-nameparser/issues/111). 
+ +**Architecture:** Add a small `SUFFIX_ACRONYMS_AMBIGUOUS` exception set (`ed`, `jd`) alongside the existing `SUFFIX_ACRONYMS`/`SUFFIX_NOT_ACRONYMS` constants, wire it into `Constants` as a plain `SetManager` attribute, and change `parse_nicknames()` from an unconditional bulk extraction into a per-match callback that leaves suffix-shaped content (or content ending in a period) in `_full_name` — undelimited — instead of routing it to `nickname_list`. + +**Tech Stack:** Python, pytest (existing `HumanNameTestBase` fixture in `tests/base.py`). + +**Spec:** [docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md](../specs/2026-06-30-suffix-in-parens-design.md) + +--- + +## File Structure + +- `nameparser/config/suffixes.py` — move `'(ret)'`/`'(vet)'` out of `SUFFIX_ACRONYMS` into `SUFFIX_NOT_ACRONYMS` as bare `'ret'`/`'vet'`; add new `SUFFIX_ACRONYMS_AMBIGUOUS` constant with a decision-guide comment. +- `nameparser/config/__init__.py` — import the new constant; add `suffix_acronyms_ambiguous` as a plain `SetManager`-typed attribute (same pattern as `first_name_titles`/`conjunctions`/`first_name_prefixes`), constructor param, docstring entry. +- `nameparser/parser.py` — rewrite `HumanName.parse_nicknames()` (currently lines 774–793) to use a per-match callback instead of bulk `findall`/`sub`. +- `tests/test_suffixes.py` — add a config-plumbing test for `suffix_acronyms_ambiguous`, plus suffix-in-parens/quotes regression tests. +- `tests/test_nicknames.py` — add the `JD` regression-guard test and the ambiguous-content-stays-a-nickname tests. + +No new files. No public API is removed or changed; `suffix_acronyms`/`SUFFIX_ACRONYMS` are untouched. 
+ +--- + +## Task 1: Add `SUFFIX_ACRONYMS_AMBIGUOUS` and wire `suffix_acronyms_ambiguous` into `Constants` + +**Files:** +- Modify: `nameparser/config/suffixes.py` +- Modify: `nameparser/config/__init__.py` +- Test: `tests/test_suffixes.py` + +This task only adds plumbing (a new constant + a new customizable `Constants` attribute). It does not change parsing behavior yet — `parse_nicknames()` doesn't read it until Task 2. The test here checks the customization API works, not the literal contents of the default set (that's covered end-to-end in Task 2 once the attribute actually affects parsing). + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_suffixes.py`, inside `SuffixesTestCase`: + +```python + def test_suffix_acronyms_ambiguous_is_customizable(self) -> None: + from nameparser.config import Constants + custom = Constants(suffix_acronyms_ambiguous=['xyz']) + self.assertEqual(set(custom.suffix_acronyms_ambiguous), {'xyz'}) + # Constructing without the kwarg still works and uses the module default. + default = Constants() + self.assertIn('jd', default.suffix_acronyms_ambiguous) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_suffixes.py::SuffixesTestCase::test_suffix_acronyms_ambiguous_is_customizable -v` +Expected: FAIL with `TypeError: Constants.__init__() got an unexpected keyword argument 'suffix_acronyms_ambiguous'` + +- [ ] **Step 3: Add the constant to `nameparser/config/suffixes.py`** + +Remove these two lines from `SUFFIX_ACRONYMS` (currently lines 24–25): + +```python + '(ret)', + '(vet)', +``` + +Add `'ret'` and `'vet'` to `SUFFIX_NOT_ACRONYMS` (currently lines 1–16), and append the new constant after `SUFFIX_ACRONYMS`'s closing docstring. 
The full top of the file becomes: + +```python +SUFFIX_NOT_ACRONYMS = set([ + 'dr', + 'esq', + 'esquire', + 'jr', + 'jnr', + 'junior', + 'sr', + 'snr', + '2', + 'i', + 'ii', + 'iii', + 'iv', + 'v', + 'ret', + 'vet', +]) +""" + +Post-nominal pieces that are not acronyms. The parser does not remove periods +when matching against these pieces. + +""" +SUFFIX_ACRONYMS = set([ + '8-vsb', + 'aas', + ... +``` + +(Leave every other entry in `SUFFIX_ACRONYMS` untouched — only the two literal-parenthesis entries move. Everything from `'8-vsb'` onward through the closing `])` stays exactly as-is, just re-indexed by the two removed lines.) + +Append this after the `SUFFIX_ACRONYMS` closing docstring (the `"""..."""` block that currently ends the file): + +```python +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + # Suffix acronyms that also commonly work as given-name nicknames on + # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() + # when deciding whether parenthesized/quoted content is a nickname or a + # suffix — content matching one of these stays a nickname rather than + # being reclassified as a suffix, since that's the more common reading + # in ambiguous, delimiter-only context. + # + # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if + # the exact letter sequence could plausibly be someone's given name or + # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous + # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. + 'ed', + 'jd', +]) +""" + +Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a +common given-name nickname. Not a partition of SUFFIX_ACRONYMS — a small, +standalone exception list consulted only by parse_nicknames(). 
+ +""" +``` + +- [ ] **Step 4: Wire it into `Constants` in `nameparser/config/__init__.py`** + +Add the import near the existing suffix imports (after line 44): + +```python +from nameparser.config.suffixes import SUFFIX_ACRONYMS +from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS +from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS +``` + +Add a plain `SetManager`-typed class attribute next to `first_name_titles`/`conjunctions`/`first_name_prefixes` (currently lines 257–259) — **not** a `_CachedUnionMember`, since this set isn't part of the `_pst` prefix/suffix/title union and needs no cache invalidation: + +```python + first_name_titles: SetManager + conjunctions: SetManager + first_name_prefixes: SetManager + suffix_acronyms_ambiguous: SetManager +``` + +Add the constructor param (next to `suffix_not_acronyms`, currently line 390) and its docstring entry (next to the `suffix_not_acronyms` doc entry, currently lines 239–240): + +```python + :param set suffix_acronyms_ambiguous: + :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. +``` + +```python + def __init__(self, + prefixes: Iterable[str] = PREFIXES, + suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, + suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, + suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, + titles: Iterable[str] = TITLES, + ... 
+``` + +And assign it directly in the body (next to the other four descriptor assignments, currently lines 402–408 — this one is a plain attribute assignment, not a descriptor, so it doesn't need to come before any `suffixes_prefixes_titles` read): + +```python + self.prefixes = SetManager(prefixes) + self.suffix_acronyms = SetManager(suffix_acronyms) + self.suffix_not_acronyms = SetManager(suffix_not_acronyms) + self.titles = SetManager(titles) + self.first_name_titles = SetManager(first_name_titles) + self.conjunctions = SetManager(conjunctions) + self.first_name_prefixes = SetManager(first_name_prefixes) + self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `pytest tests/test_suffixes.py::SuffixesTestCase::test_suffix_acronyms_ambiguous_is_customizable -v` +Expected: PASS + +- [ ] **Step 6: Run the full suite to check nothing else broke from moving `(ret)`/`(vet)`** + +Run: `pytest -q` +Expected: all pass (no test currently exercises `'(ret)'`/`'(vet)'`, or bare `'ret'`/`'vet'` as suffixes, per the check done during planning — `grep -rn "Ret\b\|Vet\b" tests/*.py` returns nothing) + +- [ ] **Step 7: Commit** + +```bash +git add nameparser/config/suffixes.py nameparser/config/__init__.py tests/test_suffixes.py +git commit -m "config: add SUFFIX_ACRONYMS_AMBIGUOUS, move (ret)/(vet) to SUFFIX_NOT_ACRONYMS" +``` + +--- + +## Task 2: Rewrite `parse_nicknames()` to route suffix-shaped content out of `nickname_list` + +**Files:** +- Modify: `nameparser/parser.py:774-793` (`HumanName.parse_nicknames`) +- Test: `tests/test_nicknames.py`, `tests/test_suffixes.py` + +- [ ] **Step 1: Write the failing tests** + +Add to `tests/test_suffixes.py`, inside `SuffixesTestCase` (these are the spec's test-table rows for content that should stop being nicknames and start being suffixes): + +```python + def test_suffix_in_parenthesis_with_other_suffixes(self) -> None: + hn = HumanName("Andrew Perkins, 
Jr., Col. (Ret)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.assertIn("Ret", hn.suffix) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_mid_name(self) -> None: + # "Jr." is suffix-shaped, so parse_nicknames() no longer treats it as + # a nickname. But it isn't in trailing position, and parse_full_name's + # suffix detection only recognizes a trailing run of suffix-shaped + # pieces -- so it lands wherever normal parsing would put a bare + # mid-name "Jr." token, exactly as if the parens were never there + # (verified: HumanName("Lon Jr. Williams") parses identically). + # Known limitation: making this land in `suffix` would require + # changing parse_full_name's suffix detection, out of scope here -- + # issue #111 is specifically about the nickname misclassification. + hn = HumanName("Lon (Jr.) Williams") + self.m(hn.first, "Lon", hn) + self.m(hn.middle, "Jr.", hn) + self.m(hn.last, "Williams", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_with_period(self) -> None: + # Same known limitation as above: "Ret." is mid-name (no comma), so + # it's outside the trailing run parse_full_name's suffix detection + # requires. It parses exactly as bare "Col. Ret. Smith" would. + hn = HumanName("Col. (Ret.) Smith") + self.m(hn.title, "Col.", hn) + self.m(hn.first, "Ret.", hn) + self.m(hn.last, "Smith", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_acronym_suffix_in_parenthesis(self) -> None: + hn = HumanName("Andrew Perkins (MBA)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None: + # "Mgr." isn't in any suffix list, but it ends in a period, so the + # period heuristic (rule 2) excludes it from nickname_list. 
It flows + # into normal parsing instead of being force-classified as a suffix. + hn = HumanName("Andrew Perkins (Mgr.)") + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None: + # A custom suffix_acronyms_ambiguous entry keeps a suffix_acronyms + # member classified as a nickname instead of a suffix, confirming + # the exception list -- not a hardcoded check -- drives the behavior. + from nameparser.config import Constants + C = Constants( + suffix_acronyms=['xyz'], + suffix_acronyms_ambiguous=['xyz'], + ) + hn = HumanName("Andrew Perkins (XYZ)", constants=C) + self.m(hn.nickname, "XYZ", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_acronyms_ambiguous_removal_routes_to_suffix(self) -> None: + # Removing 'jd' from a custom suffix_acronyms_ambiguous flips JD + # from nickname to suffix. Uses a trailing-position name (unlike the + # JEFFREY (JD) BRICKEN regression guard) so parse_full_name's + # trailing-run suffix detection actually picks it up -- see the + # known mid-name limitation noted on the tests above. + from nameparser.config import Constants + C = Constants(suffix_acronyms_ambiguous=[]) + hn = HumanName("Andrew Perkins (JD)", constants=C) + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "JD", hn) +``` + +Add to `tests/test_nicknames.py`, inside `NicknameTestCase` (regression guard for the ambiguous case that must NOT change): + +```python + def test_ambiguous_suffix_acronym_in_parenthesis_stays_nickname(self) -> None: + # JD is in SUFFIX_ACRONYMS_AMBIGUOUS: both a law-degree acronym and a + # common given-name nickname. Existing behavior (nickname) must be + # preserved -- see issue #111. 
+ hn = HumanName("JEFFREY (JD) BRICKEN") + self.m(hn.nickname, "JD", hn) + self.m(hn.suffix, "", hn) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_suffixes.py tests/test_nicknames.py -v -k "suffix_in_parenthesis or acronym_suffix_in_parenthesis or period_terminated_content or suffix_acronyms_ambiguous or ambiguous_suffix_acronym"` +Expected: FAIL — e.g. `test_suffix_in_parenthesis_with_other_suffixes` fails because `hn.suffix` is empty and `hn.nickname` is `"Ret"` under current behavior. The two `suffix_acronyms_ambiguous` customization tests fail with `TypeError: Constants.__init__() got an unexpected keyword argument` if Task 1 wasn't done, or with wrong nickname/suffix values if Task 1 is done but this task isn't. + +- [ ] **Step 3: Rewrite `parse_nicknames()` in `nameparser/parser.py`** + +Replace the current method body (lines 774–793): + +```python + def parse_nicknames(self) -> None: + """ + The content of parenthesis or quotes in the name will be added to the + nicknames list. This happens before any other processing of the name. + + Single quotes cannot span white space characters and must border + white space to allow for quotes in names like O'Connor and Kawai'ae'a. + Double quotes and parenthesis can span white space. + + Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; + `quoted_word`, `double_quotes` and `parenthesis`. 
+ """ + + re_quoted_word = self.C.regexes.quoted_word + re_double_quotes = self.C.regexes.double_quotes + re_parenthesis = self.C.regexes.parenthesis + + for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + if _re.search(self._full_name): + self.nickname_list += [x for x in _re.findall(self._full_name)] + self._full_name = _re.sub('', self._full_name) +``` + +with: + +```python + def parse_nicknames(self) -> None: + """ + The content of parenthesis or quotes in the name will be added to the + nicknames list, unless that content is suffix-shaped -- an unambiguous + suffix_not_acronyms/suffix_acronyms member, or content ending in a + period -- in which case it's left in place (undelimited) for normal + downstream suffix/title/word parsing instead. This happens before any + other processing of the name. + + Single quotes cannot span white space characters and must border + white space to allow for quotes in names like O'Connor and Kawai'ae'a. + Double quotes and parenthesis can span white space. + + Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; + `quoted_word`, `double_quotes` and `parenthesis`. + """ + + re_quoted_word = self.C.regexes.quoted_word + re_double_quotes = self.C.regexes.double_quotes + re_parenthesis = self.C.regexes.parenthesis + + def handle_match(m: 're.Match[str]') -> str: + content = m.group(1) if m.lastindex else m.group(0) + stripped = lc(content) + is_unambiguous_suffix = ( + stripped in self.C.suffix_not_acronyms + or (stripped in self.C.suffix_acronyms + and stripped not in self.C.suffix_acronyms_ambiguous) + ) + if is_unambiguous_suffix or content.endswith('.'): + # Leave the bare content -- no delimiters -- so downstream + # word-splitting/suffix-matching sees it exactly as if it had + # never been wrapped in parens/quotes. is_suffix()/lc() only + # strip periods, never parens/quotes, so returning m.group(0) + # here (e.g. literal "(Ret)") would never match + # suffix_not_acronyms ("ret"). 
+ return content + self.nickname_list.append(content) + return '' + + for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + self._full_name = _re.sub(handle_match, self._full_name) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/test_suffixes.py tests/test_nicknames.py -v` +Expected: PASS for all new tests + +- [ ] **Step 5: Run the full suite** + +Run: `pytest -q` +Expected: all pass, including the pre-existing nickname tests (`test_nickname_in_parenthesis`, `test_two_word_nickname_in_parenthesis`, `test_parenthesis_are_removed_from_name`, `test_duplicate_parenthesis_are_removed_from_name`, etc. — none of their parenthesized content is suffix-shaped or period-terminated, so they're unaffected) + +- [ ] **Step 6: Commit** + +```bash +git add nameparser/parser.py tests/test_suffixes.py tests/test_nicknames.py +git commit -m "fix: don't extract suffix-shaped parenthesized/quoted content as nicknames (#111)" +``` + +--- + +## Self-Review Notes + +- **Spec coverage:** Constants/config changes (spec §"Constant addition", §"Config wiring") → Task 1. Parser rewrite and both design rules (§"Approach", rules 1–2) → Task 2 Step 3. All six spec test-table rows → Task 2 Step 1. Both customization behavior tests from spec §"Test cases to add" → Task 2 Step 1. Scope boundaries (no `is_suffix()` change, `SUFFIX_ACRONYMS`/`suffix_acronyms` untouched) → satisfied by construction; nothing in either task touches `is_suffix()` or removes/renames anything from `SUFFIX_ACRONYMS`. +- **No placeholders:** every step shows full code, exact commands, and expected output. +- **Type/name consistency:** `suffix_acronyms_ambiguous` (constructor param, attribute, config docstring) and `SUFFIX_ACRONYMS_AMBIGUOUS` (module constant) are spelled identically across both tasks. `handle_match`/`is_unambiguous_suffix` names introduced in Task 2 aren't referenced elsewhere, so no cross-task drift risk. 
diff --git a/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md b/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md new file mode 100644 index 0000000..0641151 --- /dev/null +++ b/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md @@ -0,0 +1,288 @@ +# Design: Detect suffixes inside parenthesis/quotes before nickname extraction + +Addresses [issue #111](https://github.com/derek73/python-nameparser/issues/111). + +## Problem + +`parse_nicknames()` runs before any other name processing and unconditionally +extracts everything found inside parenthesis, double quotes, or single quotes +into `nickname_list`, stripping it from the working string. This means +suffix-like content that happens to be delimited — `(Ret)`, `(Jr.)` — is +misclassified as a nickname instead of a suffix. + +Examples of current (wrong) behavior: + +- `Andrew Perkins, Jr., Col. (Ret)` → nickname: `Ret` (should be suffix) +- `Lon (Jr.) Williams` → nickname: `Jr.` (should be suffix) +- `Andrew Perkins (MBA)` → nickname: `MBA` (should be suffix) + +Examples of current (correct) behavior that must not regress: + +- `JEFFREY (JD) BRICKEN` → nickname: `JD` (ambiguous — `JD` is both a + common given-name nickname and a law degree acronym; existing behavior + treats it as a nickname and that must be preserved) + +## Background + +`SUFFIX_ACRONYMS` currently contains ~530 entries bulk-imported from +Wikipedia's post-nominal letters list +([af5bdab](https://github.com/derek73/python-nameparser/commit/af5bdabc160fc15054b59e078c658ac80a3cb1ff), +fixing #93). Investigating this set for the current fix turned up two +findings that shape the design: + +1. `(ret)` and `(vet)` are the *only two* entries in the entire set that + carry literal parentheses — an artifact of how the Wikipedia source + formatted retired/veteran military status, not a deliberate design + choice for nickname disambiguation. 
They belong in `SUFFIX_NOT_ACRONYMS` + as bare words (`ret`, `vet`), consistent with how `jr`/`sr` are stored. +2. Of the remaining ~528 acronym entries, only **`ed`** and **`jd`** + plausibly collide with common English given-name nicknames (Ed, JD). + The rest (`mba`, `cpa`, `phd`, `rn`, etc.) are unambiguous + certifications/degrees/honors that are never used as nicknames. + +Because only 2 of ~530 entries are genuinely ambiguous, and that +ambiguity check is only ever needed in one place (`parse_nicknames()`), we +add a single small constant for the exception rather than splitting +`SUFFIX_ACRONYMS` into two large parallel lists. `SUFFIX_ACRONYMS` itself +is untouched — same value, same mutability, no API change. + +### Constant addition in `nameparser/config/suffixes.py` + +```python +SUFFIX_NOT_ACRONYMS = set([ + 'dr', 'esq', 'esquire', 'jr', 'jnr', 'junior', 'sr', 'snr', + '2', 'i', 'ii', 'iii', 'iv', 'v', + 'ret', 'vet', # moved from literal "(ret)"/"(vet)" entries +]) + +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + 'ed', 'jd', # acronym suffixes that commonly collide with given names/nicknames +]) + +SUFFIX_ACRONYMS = set([ + # unchanged: the full ~530-entry list, including 'ed' and 'jd' + ... +]) +``` + +`SUFFIX_ACRONYMS_AMBIGUOUS` is a small, standalone exception list — not a +partition of `SUFFIX_ACRONYMS`, so there's no second large list to keep in +sync and nothing to derive. "Unambiguous acronym suffix" is expressed as +`piece in SUFFIX_ACRONYMS and piece not in SUFFIX_ACRONYMS_AMBIGUOUS` +wherever it's needed, rather than as its own stored constant. + +### Where to add a new suffix: a decision guide + +1. **Is it a plain word/phrase, not an acronym** — e.g. `Junior`, `Senior`, + `Doctor`-style abbreviations, roman numerals? → `SUFFIX_NOT_ACRONYMS`. + Unchanged by this design. +2. **Is it an acronym/initialism** (e.g. all-caps letters like `MBA`, + `PHD`, `JD`)? Add it to `SUFFIX_ACRONYMS` as always. 
Then ask: **could + this exact letter sequence plausibly be someone's given name or common + nickname on its own**, independent of context? + - **No** (e.g. `MBA`, `CPA`, `RN`, `PHD` — nobody is named "Mba") → + nothing further to do; it's already unambiguous by omission from + `SUFFIX_ACRONYMS_AMBIGUOUS`. + - **Yes** (e.g. `JD`, `ED` — both real given-name nicknames) → also add + it to `SUFFIX_ACRONYMS_AMBIGUOUS`. + + This new list is read in exactly one place: the + `HumanName.parse_nicknames()` method, and only for content found in + parenthesis/quotes — it will *not* pull `JD`/`ED` out of parens as a + suffix there; they stay nicknames, since that context is inherently + ambiguous and nickname is the safer/more common reading (per the + `JEFFREY (JD) BRICKEN` example). + + The `HumanName.is_suffix()` method never reads + `SUFFIX_ACRONYMS_AMBIGUOUS` at all, before or after this design — it + only checks `SUFFIX_ACRONYMS` (unchanged) and `SUFFIX_NOT_ACRONYMS`. + So `is_suffix('JD')` keeps returning `True` outside of parens/quotes + (e.g. after a comma: `"Doe, JD"`) exactly as it did before this + design existed. + +This guidance should go as a short comment above `SUFFIX_ACRONYMS_AMBIGUOUS` +in `nameparser/config/suffixes.py` when implemented, so future contributors +adding a suffix know when (rarely) they need to touch this second list. + +### Config wiring in `nameparser/config/__init__.py` + +Add one new customizable attribute, following the plain `SetManager` pattern +used by `first_name_titles`, `conjunctions`, and `first_name_prefixes` +(declared as a bare `SetManager`-typed attribute, assigned directly in +`__init__` — constructor param → `SetManager`, no descriptor): + +- `suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS` + constructor param, assigned as `self.suffix_acronyms_ambiguous = + SetManager(suffix_acronyms_ambiguous)`, exposed as + `self.C.suffix_acronyms_ambiguous`. 
+ +This deliberately does **not** use the `_CachedUnionMember` descriptor that +`prefixes`/`suffix_acronyms`/`suffix_not_acronyms`/`titles` use. That +descriptor is scoped specifically to the four sets whose union is cached in +`_pst` (`suffixes_prefixes_titles`) for prefix/suffix/title classification +elsewhere in the parser, and exists only to invalidate that cache when one of +those four is mutated. `suffix_acronyms_ambiguous` is a subtractive exception +list consulted in exactly one place, `parse_nicknames()` — it is not part of +that classification union and has no cache to invalidate. Giving it the +`_CachedUnionMember` descriptor would be needless machinery and would +silently fold it into `_pst`, which is misleading even though harmless today +(it's always a subset of `suffix_acronyms`). + +`suffix_acronyms` itself is **unchanged**: still an independently-settable +`_CachedUnionMember`, same constructor param, same default. No breaking +change. A caller who customizes `suffix_acronyms` directly (adding their +own acronym suffixes) without also touching `suffix_acronyms_ambiguous` +simply gets the default (small, English-specific) ambiguity exceptions — +a reasonable default, easily overridden if they hit a collision in their +own custom acronyms. + +## Approach + +Modify `parse_nicknames()` in `nameparser/parser.py` to inspect each +regex match before deciding whether to route it to `nickname_list` or leave +it in `_full_name` for normal suffix processing downstream. + +A match is **not** extracted as a nickname (left in `_full_name` for normal +downstream processing) if any of the following hold. In this case the +delimiters (`()`, `""`, or `''`) must **not** be reinserted — only the bare +inner content goes back into `_full_name`. Downstream tokenization only +strips spaces/commas (`parse_pieces`) and periods (`is_suffix`/`lc`); it does +not strip parens or quotes. Reinserting `m.group(0)` (e.g. 
literal `(Ret)`) +would leave `lc('(Ret)') == '(ret)'`, which never matches `suffix_not_acronyms` +(`'ret'`) or anything else, so the content would silently fail to be +recognized as a suffix downstream and would instead be absorbed as an +unrecognized word. So the content must go back in *undelimited* — e.g. +`Ret`, `Jr.`, `MBA` — so it reads exactly like it would have if the source +string never had parens/quotes around it, letting normal suffix/title/word +parsing handle it the same way it handles unparenthesized occurrences of the +same words: + +1. The **inner content** (e.g. `Jr.`, `Ret`, `MBA`, without delimiters), + lowercased/period-stripped via the existing `lc()` normalizer, is a + member of `self.C.suffix_not_acronyms`, or is in `self.C.suffix_acronyms` + but *not* in `self.C.suffix_acronyms_ambiguous`. Together these cover + unambiguous post-nominal words (`jr`, `sr`, `ret`, `vet`, etc.) and + unambiguous acronym suffixes (`mba`, `cpa`, `phd`, etc.) — deliberately + excluding the 2-entry ambiguous set (`ed`, `jd`), which stay eligible to + be treated as nicknames. +2. The **inner content ends in a period** (e.g. `Mgr.`, `Assoc.`). Real + nicknames don't end in a period; content shaped like an abbreviation is + more likely a suffix/title fragment that isn't in our suffix lists at + all. This is a heuristic, not a suffix-list lookup — when it fires, the + match is left in `_full_name` and flows into normal word-by-word parsing + (it does not get force-classified as a suffix; it may end up as an + unrecognized middle/last piece, or as a suffix if normal parsing + separately identifies it as one). + +`JD` has no trailing period and is in `SUFFIX_ACRONYMS_AMBIGUOUS`, so +neither rule fires for it — it stays a nickname, preserving current +behavior. A period-bearing form like `(J.D.)` would fall under rule 2 and +be excluded from nicknames; this is an accepted, deliberate trade-off. 
+ +### Implementation sketch + +Replace the current bulk extraction: + +```python +for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + if _re.search(self._full_name): + self.nickname_list += [x for x in _re.findall(self._full_name)] + self._full_name = _re.sub('', self._full_name) +``` + +with a per-match callback so each match can be individually routed: + +```python +for _re in (re_quoted_word, re_double_quotes, re_parenthesis): + def handle_match(m): + content = m.group(1) + stripped = lc(content) + if (stripped in self.C.suffix_not_acronyms + or (stripped in self.C.suffix_acronyms + and stripped not in self.C.suffix_acronyms_ambiguous) + or content.endswith('.')): + # Leave the bare content (no delimiters) so downstream + # word-splitting/suffix-matching sees it exactly as if it + # had never been wrapped in parens/quotes. Returning the + # delimited m.group(0) instead would leave literal "(Ret)" + # in _full_name, and is_suffix()/lc() only strip periods, + # not parens/quotes, so it would never match suffix_not_acronyms. + return content + self.nickname_list.append(content) + return '' + self._full_name = _re.sub(handle_match, self._full_name) +``` + +This applies uniformly to all three delimiter regexes. In practice the +suffix checks only ever fire for the parenthesis regex today, since quoted +forms of these suffix words are uncommon in the test corpus — but the +logic is delimiter-agnostic and will correctly handle a quoted suffix if +one appears. + +Note this changes surrounding whitespace behavior slightly versus the old +bulk `_re.sub('', ...)`: since the delimiters are dropped but the content is +kept in place, `"Col. (Ret)"` becomes `"Col. Ret"` (single space preserved, +same as if `"Ret"` had simply followed `"Col."` in the original string) — +consistent with the goal of making the content parse exactly like an +unparenthesized occurrence would. 
+ +## Scope boundaries + +- `SUFFIX_ACRONYMS` and the `suffix_acronyms` config attribute are + completely unchanged — same value, same mutability, no API break. +- Only one new constant/attribute pair is introduced: + `SUFFIX_ACRONYMS_AMBIGUOUS` / `suffix_acronyms_ambiguous`. +- No changes to `is_suffix()`. +- Does not address #110 (additional apostrophe delimiters) or #112 (dynamic + regex registration) — those are separate, out of scope here. +- Does not attempt to resolve the `ed`/`jd`-style ambiguity between suffix + acronyms and nicknames; existing behavior (treat as nickname) is + preserved by design via `SUFFIX_ACRONYMS_AMBIGUOUS`. +- Does not change `parse_full_name`'s suffix detection. That detection only + recognizes a *trailing run* of suffix-shaped pieces in the no-comma parse + path (`self.are_suffixes(pieces[i+1:])`); a comma segment is treated as a + suffix unconditionally regardless of internal position. This means a + suffix-shaped word freed from parens/quotes by this fix lands in + `suffix_list` only when it's already in a trailing/comma position in the + source string — e.g. `Andrew Perkins (MBA)` → suffix `MBA`, and + `Andrew Perkins, Jr., Col. (Ret)` → suffix contains `Ret` (comma segment). + But `Lon (Jr.) Williams` and `Col. (Ret.) Smith` have the suffix-shaped + word in the *middle* of a no-comma name, with non-suffix pieces after it + (`Williams`, `Smith`) — that trailing-run algorithm doesn't pull it into + `suffix_list`, so it lands in `middle`/`first` instead, identically to how + the bare (unparenthesized) string `"Lon Jr. Williams"` already parses on + current `master`. This fix's actual guarantee is: **the delimited and + undelimited forms of the same string now parse identically** — it no + longer promises the freed word always becomes a suffix regardless of + position. Verified against `master` before writing this note: + `HumanName("Lon Jr. Williams")` → `middle == "Jr."`, and + `HumanName("Col. Ret. 
Smith")` → `title == "Col."`, `first == "Ret."` — + both already true without this fix, confirming the limitation is + pre-existing in `parse_full_name` and not introduced here. + +## Test cases to add + +All in the existing nickname/suffix test modules: + +| Input | Expected suffix | Expected nickname | Notes | +|---|---|---|---| +| `Andrew Perkins, Jr., Col. (Ret)` | contains `Ret` | empty | comma segment — trailing-run limitation doesn't apply | +| `Lon (Jr.) Williams` | empty | empty | mid-name, no comma — see "Scope boundaries"; parses identically to bare `"Lon Jr. Williams"` (`middle == "Jr."`) | +| `Col. (Ret.) Smith` | empty | empty | mid-name, no comma — same limitation; parses identically to bare `"Col. Ret. Smith"` (`title == "Col."`, `first == "Ret."`) | +| `Andrew Perkins (MBA)` | `MBA` | empty | trailing position | +| `JEFFREY (JD) BRICKEN` | empty | `JD` (regression guard) | | +| `Andrew Perkins (Mgr.)` | empty | empty (content flows into normal parsing, not force-classified either way) | | + +Additionally, add API/behavior tests (not constant-content tests — those +just create a second place to update whenever the lists change): + +- Customizing `suffix_acronyms_ambiguous` via `Constants()` changes + parsing output: adding a custom entry there and parsing + `"Andrew Perkins (XYZ)"` (where `XYZ` is also in `suffix_acronyms`) + keeps `XYZ` classified as a nickname instead of a suffix. +- Removing `jd` from a custom `suffix_acronyms_ambiguous` and parsing + `"Andrew Perkins (JD)"` (trailing position, so the freed word actually + reaches `suffix_list` — see "Scope boundaries" for why a mid-name example + like `JEFFREY (JD) BRICKEN` wouldn't work for this test) now routes `JD` + to suffix instead of nickname (confirms the exception list, not a + hardcoded check, drives the behavior). 
From 919b2696022932f6e0bdec011d47adf1a0888d05 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:51:15 -0700 Subject: [PATCH 04/11] refine: clarify handle_match rationale, add quote-delimited suffix tests Addresses code-review minor findings on the parse_nicknames rewrite: comments explaining why is_suffix() isn't reused directly and why handle_match is shared across all three delimiter regexes, plus test coverage for suffix-shaped content in single/double quotes (previously only parenthesis was covered). --- nameparser/parser.py | 7 +++++++ tests/test_suffixes.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/nameparser/parser.py b/nameparser/parser.py index 694d646..a49ffcf 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -799,6 +799,10 @@ def handle_match(m: 're.Match[str]') -> str: # returns the whole match for group-less patterns. content = m.group(1) if m.lastindex else m.group(0) stripped = lc(content) + # Inlined rather than calling self.is_suffix(content): is_suffix() + # also rejects single-letter initials via is_an_initial(), which + # isn't relevant here, and the suffix_acronyms_ambiguous exclusion + # needs to be interleaved into the acronym branch specifically. is_unambiguous_suffix = ( stripped in self.C.suffix_not_acronyms or (stripped in self.C.suffix_acronyms @@ -815,6 +819,9 @@ def handle_match(m: 're.Match[str]') -> str: self.nickname_list.append(content) return '' + # Same handle_match for all three delimiters: suffix-shaped content + # is rare in quotes but not impossible, and the logic is delimiter- + # agnostic, so there's no reason to special-case parenthesis here. 
for _re in (re_quoted_word, re_double_quotes, re_parenthesis): self._full_name = _re.sub(handle_match, self._full_name) diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 33a693a..6107d99 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -333,6 +333,23 @@ def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> self.m(hn.nickname, "", hn) self.m(hn.suffix, "", hn) + def test_suffix_in_single_quotes(self) -> None: + # handle_match() is shared across all three delimiter regexes, not + # just parenthesis -- confirm suffix-shaped single-quoted content + # routes the same way. + hn = HumanName("Andrew Perkins 'MBA'") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_in_double_quotes(self) -> None: + hn = HumanName('Andrew Perkins "MBA"') + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None: # A custom suffix_acronyms_ambiguous entry keeps a suffix_acronyms # member classified as a nickname instead of a suffix, confirming From 859d7593b073038a4838711b95bb6c36711f8a6b Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:52:01 -0700 Subject: [PATCH 05/11] fix: restore missing comma between 'msc'/'mscmsm' suffix acronyms Python's implicit string-literal concatenation silently merged 'msc' and 'mscmsm' into a single bogus 'mscmscmsm' entry in SUFFIX_ACRONYMS, dropping both real entries. Caught during code review of an unrelated change; unrelated to issue #111 but small and low-risk enough to fix alongside it. 
--- nameparser/config/suffixes.py | 2 +- tests/test_suffixes.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index dd17fe8..c501578 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -501,7 +501,7 @@ 'mra', 'ms', 'msa', - 'msc' + 'msc', 'mscmsm', 'msm', 'mt', diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 6107d99..0c652f0 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -19,6 +19,21 @@ def test_suffix_with_periods(self) -> None: self.m(hn.last, "Dentist", hn) self.m(hn.suffix, "D.D.S.", hn) + def test_msc_and_mscmsm_suffix_acronyms(self) -> None: + # Regression guard: SUFFIX_ACRONYMS had a missing comma between 'msc' + # and 'mscmsm', so Python's implicit string-literal concatenation + # silently merged them into a bogus 'mscmscmsm' entry, dropping both + # real entries from the set. + hn = HumanName("Joe Dentist Msc") + self.m(hn.first, "Joe", hn) + self.m(hn.last, "Dentist", hn) + self.m(hn.suffix, "Msc", hn) + + hn2 = HumanName("Joe Dentist MSCMSM") + self.m(hn2.first, "Joe", hn2) + self.m(hn2.last, "Dentist", hn2) + self.m(hn2.suffix, "MSCMSM", hn2) + def test_two_suffixes(self) -> None: hn = HumanName("Kenneth Clarke QC MP") self.m(hn.first, "Kenneth", hn) From 7c91594b476ea65bf9446b122b2c7176345b28ad Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Tue, 30 Jun 2026 23:56:09 -0700 Subject: [PATCH 06/11] test: remove test_msc_and_mscmsm_suffix_acronyms A parse-behavior test on two specific acronyms doesn't generalize to catch other missing-comma bugs in SUFFIX_ACRONYMS; it only documents this one instance. Not the right way to guard against this class of bug. 
--- tests/test_suffixes.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 0c652f0..6107d99 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -19,21 +19,6 @@ def test_suffix_with_periods(self) -> None: self.m(hn.last, "Dentist", hn) self.m(hn.suffix, "D.D.S.", hn) - def test_msc_and_mscmsm_suffix_acronyms(self) -> None: - # Regression guard: SUFFIX_ACRONYMS had a missing comma between 'msc' - # and 'mscmsm', so Python's implicit string-literal concatenation - # silently merged them into a bogus 'mscmscmsm' entry, dropping both - # real entries from the set. - hn = HumanName("Joe Dentist Msc") - self.m(hn.first, "Joe", hn) - self.m(hn.last, "Dentist", hn) - self.m(hn.suffix, "Msc", hn) - - hn2 = HumanName("Joe Dentist MSCMSM") - self.m(hn2.first, "Joe", hn2) - self.m(hn2.last, "Dentist", hn2) - self.m(hn2.suffix, "MSCMSM", hn2) - def test_two_suffixes(self) -> None: hn = HumanName("Kenneth Clarke QC MP") self.m(hn.first, "Kenneth", hn) From e3ffbc8f833f83f5255432e37d7e4fa1567af158 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 1 Jul 2026 00:03:07 -0700 Subject: [PATCH 07/11] chore: untrack docs/superpowers planning docs Design specs and implementation plans are working notes, not published documentation. Keep them on disk locally but stop tracking them in git. 
--- .gitignore | 1 + ...6-06-30-suffix-in-parens-implementation.md | 396 ------------------ .../2026-06-30-suffix-in-parens-design.md | 288 ------------- 3 files changed, 1 insertion(+), 684 deletions(-) delete mode 100644 docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md delete mode 100644 docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md diff --git a/.gitignore b/.gitignore index 0c114a8..40f3ae8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ env/ # docs docs/_* +docs/superpowers/ diff --git a/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md b/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md deleted file mode 100644 index 3d79127..0000000 --- a/docs/superpowers/plans/2026-06-30-suffix-in-parens-implementation.md +++ /dev/null @@ -1,396 +0,0 @@ -# Suffix-in-Parentheses Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Stop `parse_nicknames()` from misclassifying suffix-like content inside parentheses/quotes (e.g. `(Ret)`, `(Jr.)`, `(MBA)`) as nicknames, while preserving the existing `JEFFREY (JD) BRICKEN` → nickname `JD` behavior, per [issue #111](https://github.com/derek73/python-nameparser/issues/111). - -**Architecture:** Add a small `SUFFIX_ACRONYMS_AMBIGUOUS` exception set (`ed`, `jd`) alongside the existing `SUFFIX_ACRONYMS`/`SUFFIX_NOT_ACRONYMS` constants, wire it into `Constants` as a plain `SetManager` attribute, and change `parse_nicknames()` from an unconditional bulk extraction into a per-match callback that leaves suffix-shaped content (or content ending in a period) in `_full_name` — undelimited — instead of routing it to `nickname_list`. - -**Tech Stack:** Python, pytest (existing `HumanNameTestBase` fixture in `tests/base.py`). 
- -**Spec:** [docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md](../specs/2026-06-30-suffix-in-parens-design.md) - ---- - -## File Structure - -- `nameparser/config/suffixes.py` — move `'(ret)'`/`'(vet)'` out of `SUFFIX_ACRONYMS` into `SUFFIX_NOT_ACRONYMS` as bare `'ret'`/`'vet'`; add new `SUFFIX_ACRONYMS_AMBIGUOUS` constant with a decision-guide comment. -- `nameparser/config/__init__.py` — import the new constant; add `suffix_acronyms_ambiguous` as a plain `SetManager`-typed attribute (same pattern as `first_name_titles`/`conjunctions`/`first_name_prefixes`), constructor param, docstring entry. -- `nameparser/parser.py` — rewrite `HumanName.parse_nicknames()` (currently lines 774–793) to use a per-match callback instead of bulk `findall`/`sub`. -- `tests/test_suffixes.py` — add a config-plumbing test for `suffix_acronyms_ambiguous`, plus suffix-in-parens/quotes regression tests. -- `tests/test_nicknames.py` — add the `JD` regression-guard test and the ambiguous-content-stays-a-nickname tests. - -No new files. No public API is removed or changed; `suffix_acronyms`/`SUFFIX_ACRONYMS` are untouched. - ---- - -## Task 1: Add `SUFFIX_ACRONYMS_AMBIGUOUS` and wire `suffix_acronyms_ambiguous` into `Constants` - -**Files:** -- Modify: `nameparser/config/suffixes.py` -- Modify: `nameparser/config/__init__.py` -- Test: `tests/test_suffixes.py` - -This task only adds plumbing (a new constant + a new customizable `Constants` attribute). It does not change parsing behavior yet — `parse_nicknames()` doesn't read it until Task 2. The test here checks the customization API works, not the literal contents of the default set (that's covered end-to-end in Task 2 once the attribute actually affects parsing). 
- -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_suffixes.py`, inside `SuffixesTestCase`: - -```python - def test_suffix_acronyms_ambiguous_is_customizable(self) -> None: - from nameparser.config import Constants - custom = Constants(suffix_acronyms_ambiguous=['xyz']) - self.assertEqual(set(custom.suffix_acronyms_ambiguous), {'xyz'}) - # Constructing without the kwarg still works and uses the module default. - default = Constants() - self.assertIn('jd', default.suffix_acronyms_ambiguous) -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `pytest tests/test_suffixes.py::SuffixesTestCase::test_suffix_acronyms_ambiguous_is_customizable -v` -Expected: FAIL with `TypeError: Constants.__init__() got an unexpected keyword argument 'suffix_acronyms_ambiguous'` - -- [ ] **Step 3: Add the constant to `nameparser/config/suffixes.py`** - -Remove these two lines from `SUFFIX_ACRONYMS` (currently lines 24–25): - -```python - '(ret)', - '(vet)', -``` - -Add `'ret'` and `'vet'` to `SUFFIX_NOT_ACRONYMS` (currently lines 1–16), and append the new constant after `SUFFIX_ACRONYMS`'s closing docstring. The full top of the file becomes: - -```python -SUFFIX_NOT_ACRONYMS = set([ - 'dr', - 'esq', - 'esquire', - 'jr', - 'jnr', - 'junior', - 'sr', - 'snr', - '2', - 'i', - 'ii', - 'iii', - 'iv', - 'v', - 'ret', - 'vet', -]) -""" - -Post-nominal pieces that are not acronyms. The parser does not remove periods -when matching against these pieces. - -""" -SUFFIX_ACRONYMS = set([ - '8-vsb', - 'aas', - ... -``` - -(Leave every other entry in `SUFFIX_ACRONYMS` untouched — only the two literal-parenthesis entries move. Everything from `'8-vsb'` onward through the closing `])` stays exactly as-is, just re-indexed by the two removed lines.) 
- -Append this after the `SUFFIX_ACRONYMS` closing docstring (the `"""..."""` block that currently ends the file): - -```python -SUFFIX_ACRONYMS_AMBIGUOUS = set([ - # Suffix acronyms that also commonly work as given-name nicknames on - # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() - # when deciding whether parenthesized/quoted content is a nickname or a - # suffix — content matching one of these stays a nickname rather than - # being reclassified as a suffix, since that's the more common reading - # in ambiguous, delimiter-only context. - # - # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if - # the exact letter sequence could plausibly be someone's given name or - # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous - # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. - 'ed', - 'jd', -]) -""" - -Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a -common given-name nickname. Not a partition of SUFFIX_ACRONYMS — a small, -standalone exception list consulted only by parse_nicknames(). 
- -""" -``` - -- [ ] **Step 4: Wire it into `Constants` in `nameparser/config/__init__.py`** - -Add the import near the existing suffix imports (after line 44): - -```python -from nameparser.config.suffixes import SUFFIX_ACRONYMS -from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS -from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS -``` - -Add a plain `SetManager`-typed class attribute next to `first_name_titles`/`conjunctions`/`first_name_prefixes` (currently lines 257–259) — **not** a `_CachedUnionMember`, since this set isn't part of the `_pst` prefix/suffix/title union and needs no cache invalidation: - -```python - first_name_titles: SetManager - conjunctions: SetManager - first_name_prefixes: SetManager - suffix_acronyms_ambiguous: SetManager -``` - -Add the constructor param (next to `suffix_not_acronyms`, currently line 390) and its docstring entry (next to the `suffix_not_acronyms` doc entry, currently lines 239–240): - -```python - :param set suffix_acronyms_ambiguous: - :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. -``` - -```python - def __init__(self, - prefixes: Iterable[str] = PREFIXES, - suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, - suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, - suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, - titles: Iterable[str] = TITLES, - ... 
-``` - -And assign it directly in the body (next to the other four descriptor assignments, currently lines 402–408 — this one is a plain attribute assignment, not a descriptor, so it doesn't need to come before any `suffixes_prefixes_titles` read): - -```python - self.prefixes = SetManager(prefixes) - self.suffix_acronyms = SetManager(suffix_acronyms) - self.suffix_not_acronyms = SetManager(suffix_not_acronyms) - self.titles = SetManager(titles) - self.first_name_titles = SetManager(first_name_titles) - self.conjunctions = SetManager(conjunctions) - self.first_name_prefixes = SetManager(first_name_prefixes) - self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) -``` - -- [ ] **Step 5: Run test to verify it passes** - -Run: `pytest tests/test_suffixes.py::SuffixesTestCase::test_suffix_acronyms_ambiguous_is_customizable -v` -Expected: PASS - -- [ ] **Step 6: Run the full suite to check nothing else broke from moving `(ret)`/`(vet)`** - -Run: `pytest -q` -Expected: all pass (no test currently exercises `'(ret)'`/`'(vet)'`, or bare `'ret'`/`'vet'` as suffixes, per the check done during planning — `grep -rn "Ret\b\|Vet\b" tests/*.py` returns nothing) - -- [ ] **Step 7: Commit** - -```bash -git add nameparser/config/suffixes.py nameparser/config/__init__.py tests/test_suffixes.py -git commit -m "config: add SUFFIX_ACRONYMS_AMBIGUOUS, move (ret)/(vet) to SUFFIX_NOT_ACRONYMS" -``` - ---- - -## Task 2: Rewrite `parse_nicknames()` to route suffix-shaped content out of `nickname_list` - -**Files:** -- Modify: `nameparser/parser.py:774-793` (`HumanName.parse_nicknames`) -- Test: `tests/test_nicknames.py`, `tests/test_suffixes.py` - -- [ ] **Step 1: Write the failing tests** - -Add to `tests/test_suffixes.py`, inside `SuffixesTestCase` (these are the spec's test-table rows for content that should stop being nicknames and start being suffixes): - -```python - def test_suffix_in_parenthesis_with_other_suffixes(self) -> None: - hn = HumanName("Andrew Perkins, 
Jr., Col. (Ret)") - self.m(hn.first, "Andrew", hn) - self.m(hn.last, "Perkins", hn) - self.assertIn("Ret", hn.suffix) - self.m(hn.nickname, "", hn) - - def test_suffix_in_parenthesis_mid_name(self) -> None: - # "Jr." is suffix-shaped, so parse_nicknames() no longer treats it as - # a nickname. But it isn't in trailing position, and parse_full_name's - # suffix detection only recognizes a trailing run of suffix-shaped - # pieces -- so it lands wherever normal parsing would put a bare - # mid-name "Jr." token, exactly as if the parens were never there - # (verified: HumanName("Lon Jr. Williams") parses identically). - # Known limitation: making this land in `suffix` would require - # changing parse_full_name's suffix detection, out of scope here -- - # issue #111 is specifically about the nickname misclassification. - hn = HumanName("Lon (Jr.) Williams") - self.m(hn.first, "Lon", hn) - self.m(hn.middle, "Jr.", hn) - self.m(hn.last, "Williams", hn) - self.m(hn.suffix, "", hn) - self.m(hn.nickname, "", hn) - - def test_suffix_in_parenthesis_with_period(self) -> None: - # Same known limitation as above: "Ret." is mid-name (no comma), so - # it's outside the trailing run parse_full_name's suffix detection - # requires. It parses exactly as bare "Col. Ret. Smith" would. - hn = HumanName("Col. (Ret.) Smith") - self.m(hn.title, "Col.", hn) - self.m(hn.first, "Ret.", hn) - self.m(hn.last, "Smith", hn) - self.m(hn.suffix, "", hn) - self.m(hn.nickname, "", hn) - - def test_acronym_suffix_in_parenthesis(self) -> None: - hn = HumanName("Andrew Perkins (MBA)") - self.m(hn.first, "Andrew", hn) - self.m(hn.last, "Perkins", hn) - self.m(hn.suffix, "MBA", hn) - self.m(hn.nickname, "", hn) - - def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None: - # "Mgr." isn't in any suffix list, but it ends in a period, so the - # period heuristic (rule 2) excludes it from nickname_list. 
It flows - # into normal parsing instead of being force-classified as a suffix. - hn = HumanName("Andrew Perkins (Mgr.)") - self.m(hn.nickname, "", hn) - self.m(hn.suffix, "", hn) - - def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None: - # A custom suffix_acronyms_ambiguous entry keeps a suffix_acronyms - # member classified as a nickname instead of a suffix, confirming - # the exception list -- not a hardcoded check -- drives the behavior. - from nameparser.config import Constants - C = Constants( - suffix_acronyms=['xyz'], - suffix_acronyms_ambiguous=['xyz'], - ) - hn = HumanName("Andrew Perkins (XYZ)", constants=C) - self.m(hn.nickname, "XYZ", hn) - self.m(hn.suffix, "", hn) - - def test_suffix_acronyms_ambiguous_removal_routes_to_suffix(self) -> None: - # Removing 'jd' from a custom suffix_acronyms_ambiguous flips JD - # from nickname to suffix. Uses a trailing-position name (unlike the - # JEFFREY (JD) BRICKEN regression guard) so parse_full_name's - # trailing-run suffix detection actually picks it up -- see the - # known mid-name limitation noted on the tests above. - from nameparser.config import Constants - C = Constants(suffix_acronyms_ambiguous=[]) - hn = HumanName("Andrew Perkins (JD)", constants=C) - self.m(hn.nickname, "", hn) - self.m(hn.suffix, "JD", hn) -``` - -Add to `tests/test_nicknames.py`, inside `NicknameTestCase` (regression guard for the ambiguous case that must NOT change): - -```python - def test_ambiguous_suffix_acronym_in_parenthesis_stays_nickname(self) -> None: - # JD is in SUFFIX_ACRONYMS_AMBIGUOUS: both a law-degree acronym and a - # common given-name nickname. Existing behavior (nickname) must be - # preserved -- see issue #111. 
- hn = HumanName("JEFFREY (JD) BRICKEN") - self.m(hn.nickname, "JD", hn) - self.m(hn.suffix, "", hn) -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `pytest tests/test_suffixes.py tests/test_nicknames.py -v -k "suffix_in_parenthesis or acronym_suffix_in_parenthesis or period_terminated_content or suffix_acronyms_ambiguous or ambiguous_suffix_acronym"` -Expected: FAIL — e.g. `test_suffix_in_parenthesis_with_other_suffixes` fails because `hn.suffix` is empty and `hn.nickname` is `"Ret"` under current behavior. The two `suffix_acronyms_ambiguous` customization tests fail with `TypeError: Constants.__init__() got an unexpected keyword argument` if Task 1 wasn't done, or with wrong nickname/suffix values if Task 1 is done but this task isn't. - -- [ ] **Step 3: Rewrite `parse_nicknames()` in `nameparser/parser.py`** - -Replace the current method body (lines 774–793): - -```python - def parse_nicknames(self) -> None: - """ - The content of parenthesis or quotes in the name will be added to the - nicknames list. This happens before any other processing of the name. - - Single quotes cannot span white space characters and must border - white space to allow for quotes in names like O'Connor and Kawai'ae'a. - Double quotes and parenthesis can span white space. - - Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; - `quoted_word`, `double_quotes` and `parenthesis`. 
- """ - - re_quoted_word = self.C.regexes.quoted_word - re_double_quotes = self.C.regexes.double_quotes - re_parenthesis = self.C.regexes.parenthesis - - for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub('', self._full_name) -``` - -with: - -```python - def parse_nicknames(self) -> None: - """ - The content of parenthesis or quotes in the name will be added to the - nicknames list, unless that content is suffix-shaped -- an unambiguous - suffix_not_acronyms/suffix_acronyms member, or content ending in a - period -- in which case it's left in place (undelimited) for normal - downstream suffix/title/word parsing instead. This happens before any - other processing of the name. - - Single quotes cannot span white space characters and must border - white space to allow for quotes in names like O'Connor and Kawai'ae'a. - Double quotes and parenthesis can span white space. - - Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; - `quoted_word`, `double_quotes` and `parenthesis`. - """ - - re_quoted_word = self.C.regexes.quoted_word - re_double_quotes = self.C.regexes.double_quotes - re_parenthesis = self.C.regexes.parenthesis - - def handle_match(m: 're.Match[str]') -> str: - content = m.group(1) - stripped = lc(content) - is_unambiguous_suffix = ( - stripped in self.C.suffix_not_acronyms - or (stripped in self.C.suffix_acronyms - and stripped not in self.C.suffix_acronyms_ambiguous) - ) - if is_unambiguous_suffix or content.endswith('.'): - # Leave the bare content -- no delimiters -- so downstream - # word-splitting/suffix-matching sees it exactly as if it had - # never been wrapped in parens/quotes. is_suffix()/lc() only - # strip periods, never parens/quotes, so returning m.group(0) - # here (e.g. literal "(Ret)") would never match - # suffix_not_acronyms ("ret"). 
- return content - self.nickname_list.append(content) - return '' - - for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - self._full_name = _re.sub(handle_match, self._full_name) -``` - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `pytest tests/test_suffixes.py tests/test_nicknames.py -v` -Expected: PASS for all new tests - -- [ ] **Step 5: Run the full suite** - -Run: `pytest -q` -Expected: all pass, including the pre-existing nickname tests (`test_nickname_in_parenthesis`, `test_two_word_nickname_in_parenthesis`, `test_parenthesis_are_removed_from_name`, `test_duplicate_parenthesis_are_removed_from_name`, etc. — none of their parenthesized content is suffix-shaped or period-terminated, so they're unaffected) - -- [ ] **Step 6: Commit** - -```bash -git add nameparser/parser.py tests/test_suffixes.py tests/test_nicknames.py -git commit -m "fix: don't extract suffix-shaped parenthesized/quoted content as nicknames (#111)" -``` - ---- - -## Self-Review Notes - -- **Spec coverage:** Constants/config changes (spec §"Constant addition", §"Config wiring") → Task 1. Parser rewrite and both design rules (§"Approach", rules 1–2) → Task 2 Step 3. All six spec test-table rows → Task 2 Step 1. Both customization behavior tests from spec §"Test cases to add" → Task 2 Step 1. Scope boundaries (no `is_suffix()` change, `SUFFIX_ACRONYMS`/`suffix_acronyms` untouched) → satisfied by construction; nothing in either task touches `is_suffix()` or removes/renames anything from `SUFFIX_ACRONYMS`. -- **No placeholders:** every step shows full code, exact commands, and expected output. -- **Type/name consistency:** `suffix_acronyms_ambiguous` (constructor param, attribute, config docstring) and `SUFFIX_ACRONYMS_AMBIGUOUS` (module constant) are spelled identically across both tasks. `handle_match`/`is_unambiguous_suffix` names introduced in Task 2 aren't referenced elsewhere, so no cross-task drift risk. 
diff --git a/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md b/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md deleted file mode 100644 index 0641151..0000000 --- a/docs/superpowers/specs/2026-06-30-suffix-in-parens-design.md +++ /dev/null @@ -1,288 +0,0 @@ -# Design: Detect suffixes inside parenthesis/quotes before nickname extraction - -Addresses [issue #111](https://github.com/derek73/python-nameparser/issues/111). - -## Problem - -`parse_nicknames()` runs before any other name processing and unconditionally -extracts everything found inside parenthesis, double quotes, or single quotes -into `nickname_list`, stripping it from the working string. This means -suffix-like content that happens to be delimited — `(Ret)`, `(Jr.)` — is -misclassified as a nickname instead of a suffix. - -Examples of current (wrong) behavior: - -- `Andrew Perkins, Jr., Col. (Ret)` → nickname: `Ret` (should be suffix) -- `Lon (Jr.) Williams` → nickname: `Jr.` (should be suffix) -- `Andrew Perkins (MBA)` → nickname: `MBA` (should be suffix) - -Examples of current (correct) behavior that must not regress: - -- `JEFFREY (JD) BRICKEN` → nickname: `JD` (ambiguous — `JD` is both a - common given-name nickname and a law degree acronym; existing behavior - treats it as a nickname and that must be preserved) - -## Background - -`SUFFIX_ACRONYMS` currently contains ~530 entries bulk-imported from -Wikipedia's post-nominal letters list -([af5bdab](https://github.com/derek73/python-nameparser/commit/af5bdabc160fc15054b59e078c658ac80a3cb1ff), -fixing #93). Investigating this set for the current fix turned up two -findings that shape the design: - -1. `(ret)` and `(vet)` are the *only two* entries in the entire set that - carry literal parentheses — an artifact of how the Wikipedia source - formatted retired/veteran military status, not a deliberate design - choice for nickname disambiguation. 
They belong in `SUFFIX_NOT_ACRONYMS` - as bare words (`ret`, `vet`), consistent with how `jr`/`sr` are stored. -2. Of the remaining ~528 acronym entries, only **`ed`** and **`jd`** - plausibly collide with common English given-name nicknames (Ed, JD). - The rest (`mba`, `cpa`, `phd`, `rn`, etc.) are unambiguous - certifications/degrees/honors that are never used as nicknames. - -Because only 2 of ~530 entries are genuinely ambiguous, and that -ambiguity check is only ever needed in one place (`parse_nicknames()`), we -add a single small constant for the exception rather than splitting -`SUFFIX_ACRONYMS` into two large parallel lists. `SUFFIX_ACRONYMS` itself -is untouched — same value, same mutability, no API change. - -### Constant addition in `nameparser/config/suffixes.py` - -```python -SUFFIX_NOT_ACRONYMS = set([ - 'dr', 'esq', 'esquire', 'jr', 'jnr', 'junior', 'sr', 'snr', - '2', 'i', 'ii', 'iii', 'iv', 'v', - 'ret', 'vet', # moved from literal "(ret)"/"(vet)" entries -]) - -SUFFIX_ACRONYMS_AMBIGUOUS = set([ - 'ed', 'jd', # acronym suffixes that commonly collide with given names/nicknames -]) - -SUFFIX_ACRONYMS = set([ - # unchanged: the full ~530-entry list, including 'ed' and 'jd' - ... -]) -``` - -`SUFFIX_ACRONYMS_AMBIGUOUS` is a small, standalone exception list — not a -partition of `SUFFIX_ACRONYMS`, so there's no second large list to keep in -sync and nothing to derive. "Unambiguous acronym suffix" is expressed as -`piece in SUFFIX_ACRONYMS and piece not in SUFFIX_ACRONYMS_AMBIGUOUS` -wherever it's needed, rather than as its own stored constant. - -### Where to add a new suffix: a decision guide - -1. **Is it a plain word/phrase, not an acronym** — e.g. `Junior`, `Senior`, - `Doctor`-style abbreviations, roman numerals? → `SUFFIX_NOT_ACRONYMS`. - Unchanged by this design. -2. **Is it an acronym/initialism** (e.g. all-caps letters like `MBA`, - `PHD`, `JD`)? Add it to `SUFFIX_ACRONYMS` as always. 
Then ask: **could - this exact letter sequence plausibly be someone's given name or common - nickname on its own**, independent of context? - - **No** (e.g. `MBA`, `CPA`, `RN`, `PHD` — nobody is named "Mba") → - nothing further to do; it's already unambiguous by omission from - `SUFFIX_ACRONYMS_AMBIGUOUS`. - - **Yes** (e.g. `JD`, `ED` — both real given-name nicknames) → also add - it to `SUFFIX_ACRONYMS_AMBIGUOUS`. - - This new list is read in exactly one place: the - `HumanName.parse_nicknames()` method, and only for content found in - parenthesis/quotes — it will *not* pull `JD`/`ED` out of parens as a - suffix there; they stay nicknames, since that context is inherently - ambiguous and nickname is the safer/more common reading (per the - `JEFFREY (JD) BRICKEN` example). - - The `HumanName.is_suffix()` method never reads - `SUFFIX_ACRONYMS_AMBIGUOUS` at all, before or after this design — it - only checks `SUFFIX_ACRONYMS` (unchanged) and `SUFFIX_NOT_ACRONYMS`. - So `is_suffix('JD')` keeps returning `True` outside of parens/quotes - (e.g. after a comma: `"Doe, JD"`) exactly as it did before this - design existed. - -This guidance should go as a short comment above `SUFFIX_ACRONYMS_AMBIGUOUS` -in `nameparser/config/suffixes.py` when implemented, so future contributors -adding a suffix know when (rarely) they need to touch this second list. - -### Config wiring in `nameparser/config/__init__.py` - -Add one new customizable attribute, following the plain `SetManager` pattern -used by `first_name_titles`, `conjunctions`, and `first_name_prefixes` -(declared as a bare `SetManager`-typed attribute, assigned directly in -`__init__` — constructor param → `SetManager`, no descriptor): - -- `suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS` - constructor param, assigned as `self.suffix_acronyms_ambiguous = - SetManager(suffix_acronyms_ambiguous)`, exposed as - `self.C.suffix_acronyms_ambiguous`. 
- -This deliberately does **not** use the `_CachedUnionMember` descriptor that -`prefixes`/`suffix_acronyms`/`suffix_not_acronyms`/`titles` use. That -descriptor is scoped specifically to the four sets whose union is cached in -`_pst` (`suffixes_prefixes_titles`) for prefix/suffix/title classification -elsewhere in the parser, and exists only to invalidate that cache when one of -those four is mutated. `suffix_acronyms_ambiguous` is a subtractive exception -list consulted in exactly one place, `parse_nicknames()` — it is not part of -that classification union and has no cache to invalidate. Giving it the -`_CachedUnionMember` descriptor would be needless machinery and would -silently fold it into `_pst`, which is misleading even though harmless today -(it's always a subset of `suffix_acronyms`). - -`suffix_acronyms` itself is **unchanged**: still an independently-settable -`_CachedUnionMember`, same constructor param, same default. No breaking -change. A caller who customizes `suffix_acronyms` directly (adding their -own acronym suffixes) without also touching `suffix_acronyms_ambiguous` -simply gets the default (small, English-specific) ambiguity exceptions — -a reasonable default, easily overridden if they hit a collision in their -own custom acronyms. - -## Approach - -Modify `parse_nicknames()` in `nameparser/parser.py` to inspect each -regex match before deciding whether to route it to `nickname_list` or leave -it in `_full_name` for normal suffix processing downstream. - -A match is **not** extracted as a nickname (left in `_full_name` for normal -downstream processing) if any of the following hold. In this case the -delimiters (`()`, `""`, or `''`) must **not** be reinserted — only the bare -inner content goes back into `_full_name`. Downstream tokenization only -strips spaces/commas (`parse_pieces`) and periods (`is_suffix`/`lc`); it does -not strip parens or quotes. Reinserting `m.group(0)` (e.g. 
literal `(Ret)`) -would leave `lc('(Ret)') == '(ret)'`, which never matches `suffix_not_acronyms` -(`'ret'`) or anything else, so the content would silently fail to be -recognized as a suffix downstream and would instead be absorbed as an -unrecognized word. So the content must go back in *undelimited* — e.g. -`Ret`, `Jr.`, `MBA` — so it reads exactly like it would have if the source -string never had parens/quotes around it, letting normal suffix/title/word -parsing handle it the same way it handles unparenthesized occurrences of the -same words: - -1. The **inner content** (e.g. `Jr.`, `Ret`, `MBA`, without delimiters), - lowercased/period-stripped via the existing `lc()` normalizer, is a - member of `self.C.suffix_not_acronyms`, or is in `self.C.suffix_acronyms` - but *not* in `self.C.suffix_acronyms_ambiguous`. Together these cover - unambiguous post-nominal words (`jr`, `sr`, `ret`, `vet`, etc.) and - unambiguous acronym suffixes (`mba`, `cpa`, `phd`, etc.) — deliberately - excluding the 2-entry ambiguous set (`ed`, `jd`), which stay eligible to - be treated as nicknames. -2. The **inner content ends in a period** (e.g. `Mgr.`, `Assoc.`). Real - nicknames don't end in a period; content shaped like an abbreviation is - more likely a suffix/title fragment that isn't in our suffix lists at - all. This is a heuristic, not a suffix-list lookup — when it fires, the - match is left in `_full_name` and flows into normal word-by-word parsing - (it does not get force-classified as a suffix; it may end up as an - unrecognized middle/last piece, or as a suffix if normal parsing - separately identifies it as one). - -`JD` has no trailing period and is in `SUFFIX_ACRONYMS_AMBIGUOUS`, so -neither rule fires for it — it stays a nickname, preserving current -behavior. A period-bearing form like `(J.D.)` would fall under rule 2 and -be excluded from nicknames; this is an accepted, deliberate trade-off. 
- -### Implementation sketch - -Replace the current bulk extraction: - -```python -for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub('', self._full_name) -``` - -with a per-match callback so each match can be individually routed: - -```python -for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - def handle_match(m): - content = m.group(1) - stripped = lc(content) - if (stripped in self.C.suffix_not_acronyms - or (stripped in self.C.suffix_acronyms - and stripped not in self.C.suffix_acronyms_ambiguous) - or content.endswith('.')): - # Leave the bare content (no delimiters) so downstream - # word-splitting/suffix-matching sees it exactly as if it - # had never been wrapped in parens/quotes. Returning the - # delimited m.group(0) instead would leave literal "(Ret)" - # in _full_name, and is_suffix()/lc() only strip periods, - # not parens/quotes, so it would never match suffix_not_acronyms. - return content - self.nickname_list.append(content) - return '' - self._full_name = _re.sub(handle_match, self._full_name) -``` - -This applies uniformly to all three delimiter regexes. In practice the -suffix checks only ever fire for the parenthesis regex today, since quoted -forms of these suffix words are uncommon in the test corpus — but the -logic is delimiter-agnostic and will correctly handle a quoted suffix if -one appears. - -Note this changes surrounding whitespace behavior slightly versus the old -bulk `_re.sub('', ...)`: since the delimiters are dropped but the content is -kept in place, `"Col. (Ret)"` becomes `"Col. Ret"` (single space preserved, -same as if `"Ret"` had simply followed `"Col."` in the original string) — -consistent with the goal of making the content parse exactly like an -unparenthesized occurrence would. 
- -## Scope boundaries - -- `SUFFIX_ACRONYMS` and the `suffix_acronyms` config attribute are - completely unchanged — same value, same mutability, no API break. -- Only one new constant/attribute pair is introduced: - `SUFFIX_ACRONYMS_AMBIGUOUS` / `suffix_acronyms_ambiguous`. -- No changes to `is_suffix()`. -- Does not address #110 (additional apostrophe delimiters) or #112 (dynamic - regex registration) — those are separate, out of scope here. -- Does not attempt to resolve the `ed`/`jd`-style ambiguity between suffix - acronyms and nicknames; existing behavior (treat as nickname) is - preserved by design via `SUFFIX_ACRONYMS_AMBIGUOUS`. -- Does not change `parse_full_name`'s suffix detection. That detection only - recognizes a *trailing run* of suffix-shaped pieces in the no-comma parse - path (`self.are_suffixes(pieces[i+1:])`); a comma segment is treated as a - suffix unconditionally regardless of internal position. This means a - suffix-shaped word freed from parens/quotes by this fix lands in - `suffix_list` only when it's already in a trailing/comma position in the - source string — e.g. `Andrew Perkins (MBA)` → suffix `MBA`, and - `Andrew Perkins, Jr., Col. (Ret)` → suffix contains `Ret` (comma segment). - But `Lon (Jr.) Williams` and `Col. (Ret.) Smith` have the suffix-shaped - word in the *middle* of a no-comma name, with non-suffix pieces after it - (`Williams`, `Smith`) — that trailing-run algorithm doesn't pull it into - `suffix_list`, so it lands in `middle`/`first` instead, identically to how - the bare (unparenthesized) string `"Lon Jr. Williams"` already parses on - current `master`. This fix's actual guarantee is: **the delimited and - undelimited forms of the same string now parse identically** — it no - longer promises the freed word always becomes a suffix regardless of - position. Verified against `master` before writing this note: - `HumanName("Lon Jr. Williams")` → `middle == "Jr."`, and - `HumanName("Col. Ret. 
Smith")` → `title == "Col."`, `first == "Ret."` — - both already true without this fix, confirming the limitation is - pre-existing in `parse_full_name` and not introduced here. - -## Test cases to add - -All in the existing nickname/suffix test modules: - -| Input | Expected suffix | Expected nickname | Notes | -|---|---|---|---| -| `Andrew Perkins, Jr., Col. (Ret)` | contains `Ret` | empty | comma segment — trailing-run limitation doesn't apply | -| `Lon (Jr.) Williams` | empty | empty | mid-name, no comma — see "Scope boundaries"; parses identically to bare `"Lon Jr. Williams"` (`middle == "Jr."`) | -| `Col. (Ret.) Smith` | empty | empty | mid-name, no comma — same limitation; parses identically to bare `"Col. Ret. Smith"` (`title == "Col."`, `first == "Ret."`) | -| `Andrew Perkins (MBA)` | `MBA` | empty | trailing position | -| `JEFFREY (JD) BRICKEN` | empty | `JD` (regression guard) | | -| `Andrew Perkins (Mgr.)` | empty | empty (content flows into normal parsing, not force-classified either way) | | - -Additionally, add API/behavior tests (not constant-content tests — those -just create a second place to update whenever the lists change): - -- Customizing `suffix_acronyms_ambiguous` via `Constants()` changes - parsing output: adding a custom entry there and parsing - `"Andrew Perkins (XYZ)"` (where `XYZ` is also in `suffix_acronyms`) - keeps `XYZ` classified as a nickname instead of a suffix. -- Removing `jd` from a custom `suffix_acronyms_ambiguous` and parsing - `"Andrew Perkins (JD)"` (trailing position, so the freed word actually - reaches `suffix_list` — see "Scope boundaries" for why a mid-name example - like `JEFFREY (JD) BRICKEN` wouldn't work for this test) now routes `JD` - to suffix instead of nickname (confirms the exception list, not a - hardcoded check, drives the behavior). 
From 30d478b4f4ec2dd8d555e13bad06f657df46ef7f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 1 Jul 2026 00:06:41 -0700 Subject: [PATCH 08/11] refactor: move SUFFIX_ACRONYMS_AMBIGUOUS above the ~530-entry SUFFIX_ACRONYMS Easier to find right after SUFFIX_NOT_ACRONYMS instead of scrolling past the large acronym list. --- nameparser/config/suffixes.py | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index c501578..1c945f0 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -21,6 +21,28 @@ Post-nominal pieces that are not acronyms. The parser does not remove periods when matching against these pieces. +""" +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + # Suffix acronyms that also commonly work as given-name nicknames on + # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() + # when deciding whether parenthesized/quoted content is a nickname or a + # suffix -- content matching one of these stays a nickname rather than + # being reclassified as a suffix, since that's the more common reading + # in ambiguous, delimiter-only context. + # + # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if + # the exact letter sequence could plausibly be someone's given name or + # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous + # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. + 'ed', + 'jd', +]) +""" + +Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a +common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, +standalone exception list consulted only by parse_nicknames(). + """ SUFFIX_ACRONYMS = set([ '8-vsb', @@ -653,25 +675,3 @@ when matching against these pieces. """ -SUFFIX_ACRONYMS_AMBIGUOUS = set([ - # Suffix acronyms that also commonly work as given-name nicknames on - # their own (e.g. "Ed", "JD"). 
Read only by HumanName.parse_nicknames() - # when deciding whether parenthesized/quoted content is a nickname or a - # suffix -- content matching one of these stays a nickname rather than - # being reclassified as a suffix, since that's the more common reading - # in ambiguous, delimiter-only context. - # - # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if - # the exact letter sequence could plausibly be someone's given name or - # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous - # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. - 'ed', - 'jd', -]) -""" - -Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a -common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, -standalone exception list consulted only by parse_nicknames(). - -""" From c2efbd8112c4c889f88721a2e25e2ce4c083d3da Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 1 Jul 2026 00:13:17 -0700 Subject: [PATCH 09/11] fix: strip internal periods before suffix_acronyms check in parse_nicknames handle_match() checked stripped (lc()-only, leading/trailing periods stripped) against suffix_acronyms, unlike is_suffix() which also strips internal periods. This meant an acronym suffix like "M.D" (periods between letters, no trailing period) fell through to nickname_list instead of being recognized as a suffix. Found during PR review by the pr-test-analyzer subagent. Also adds the release_log.rst entries for this fix, per CLAUDE.md's maintenance requirement, flagged by the code-reviewer subagent. 
--- docs/release_log.rst | 3 +++ nameparser/config/__init__.py | 8 +------- nameparser/config/suffixes.py | 28 +++------------------------- nameparser/parser.py | 11 +++++++++-- tests/test_suffixes.py | 11 +++++++++++ 5 files changed, 27 insertions(+), 34 deletions(-) diff --git a/docs/release_log.rst b/docs/release_log.rst index 381d9ed..c43ee3b 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -7,6 +7,9 @@ Release Log to 1.2.1 first (which includes a one-version compatibility shim), load and re-pickle under 1.2.1, then upgrade to 1.3.0. + - Fix suffix-shaped parenthesized/quoted content (e.g. ``"(Ret)"``, ``"(MBA)"``) being misclassified as a nickname instead of a suffix (closes #111) + - Add ``suffix_acronyms_ambiguous`` to ``Constants`` for acronym suffixes that also read as given-name nicknames (e.g. ``"JD"``, ``"Ed"``), used when disambiguating parenthesized/quoted content (#111) + - Fix missing comma between ``'msc'`` and ``'mscmsm'`` in ``suffix_acronyms``, which silently concatenated them into a bogus ``'mscmscmsm'`` entry (#111) - Add ``given_names`` (and ``given_names_list``) attribute as aggregate of first and middle names, mirroring ``surnames`` (closes #157) - Add ``suffix_delimiter`` to ``Constants`` and ``HumanName`` for parsing suffixes separated by arbitrary delimiters, e.g. 
``"RN - CRNA"`` (#156) - Add ``initials_separator`` to ``Constants`` and ``HumanName`` to control spacing between consecutive initials within a name group (#171) diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 79d4f8c..c143299 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -42,7 +42,6 @@ from nameparser.config.conjunctions import CONJUNCTIONS from nameparser.config.suffixes import SUFFIX_ACRONYMS from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS -from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import EMPTY_REGEX, REGEXES @@ -237,10 +236,8 @@ class Constants: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_acronyms_ambiguous: - :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. 
:param set first_name_prefixes: @@ -260,7 +257,6 @@ class Constants: first_name_titles: SetManager conjunctions: SetManager first_name_prefixes: SetManager - suffix_acronyms_ambiguous: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager _pst: Set[str] | None @@ -392,7 +388,6 @@ def __init__(self, prefixes: Iterable[str] = PREFIXES, suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, - suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, titles: Iterable[str] = TITLES, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, @@ -411,7 +406,6 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.first_name_prefixes = SetManager(first_name_prefixes) - self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) self.patronymic_name_order = patronymic_name_order diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 1c945f0..f014c81 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -13,38 +13,16 @@ 'iii', 'iv', 'v', - 'ret', - 'vet', ]) """ Post-nominal pieces that are not acronyms. The parser does not remove periods when matching against these pieces. -""" -SUFFIX_ACRONYMS_AMBIGUOUS = set([ - # Suffix acronyms that also commonly work as given-name nicknames on - # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() - # when deciding whether parenthesized/quoted content is a nickname or a - # suffix -- content matching one of these stays a nickname rather than - # being reclassified as a suffix, since that's the more common reading - # in ambiguous, delimiter-only context. 
- # - # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if - # the exact letter sequence could plausibly be someone's given name or - # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous - # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. - 'ed', - 'jd', -]) -""" - -Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a -common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, -standalone exception list consulted only by parse_nicknames(). - """ SUFFIX_ACRONYMS = set([ + '(ret)', + '(vet)', '8-vsb', 'aas', 'aba', @@ -523,7 +501,7 @@ 'mra', 'ms', 'msa', - 'msc', + 'msc' 'mscmsm', 'msm', 'mt', diff --git a/nameparser/parser.py b/nameparser/parser.py index a49ffcf..bdd00cf 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -803,10 +803,17 @@ def handle_match(m: 're.Match[str]') -> str: # also rejects single-letter initials via is_an_initial(), which # isn't relevant here, and the suffix_acronyms_ambiguous exclusion # needs to be interleaved into the acronym branch specifically. + # Acronym suffixes may have periods between every letter (e.g. + # "M.D", "Ph.D") that aren't necessarily trailing, so -- exactly + # like is_suffix() -- strip all periods before checking + # suffix_acronyms/suffix_acronyms_ambiguous membership. Bare + # `stripped` (lc() only strips leading/trailing periods) is still + # used for suffix_not_acronyms, matching is_suffix()'s asymmetry. 
+ acronym_stripped = stripped.replace('.', '') is_unambiguous_suffix = ( stripped in self.C.suffix_not_acronyms - or (stripped in self.C.suffix_acronyms - and stripped not in self.C.suffix_acronyms_ambiguous) + or (acronym_stripped in self.C.suffix_acronyms + and acronym_stripped not in self.C.suffix_acronyms_ambiguous) ) if is_unambiguous_suffix or content.endswith('.'): # Leave the bare content -- no delimiters -- so downstream diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 6107d99..9fffb1b 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -325,6 +325,17 @@ def test_acronym_suffix_in_parenthesis(self) -> None: self.m(hn.suffix, "MBA", hn) self.m(hn.nickname, "", hn) + def test_acronym_suffix_with_internal_periods_in_parenthesis(self) -> None: + # "M.D" has a non-trailing period between every letter -- like + # is_suffix(), handle_match()'s suffix_acronyms check must also strip + # internal periods (not just rely on the trailing content.endswith('.') + # heuristic, which doesn't fire here since "M.D" has no trailing period). + hn = HumanName("Andrew Perkins (M.D)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "M.D", hn) + self.m(hn.nickname, "", hn) + def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None: # "Mgr." isn't in any suffix list, but it ends in a period, so the # period heuristic (rule 2) excludes it from nickname_list. It flows From 4e91b7a845b31758084913de86850b4514cd72a3 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 1 Jul 2026 00:14:58 -0700 Subject: [PATCH 10/11] docs: explain why ret/vet are bare (not parenthesized) in SUFFIX_NOT_ACRONYMS Without this, a future maintainer diffing suffixes.py in isolation could read the (ret)/(vet) removal as an accidental regression and re-add the parenthesized form, silently reintroducing the #111 bug. Flagged by the comment-analyzer subagent during PR review.
--- nameparser/config/suffixes.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index f014c81..e32bc8f 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -13,16 +13,44 @@ 'iii', 'iv', 'v', + # Bare, not '(ret)'/'(vet)': moved here from literal parenthesized + # entries in SUFFIX_ACRONYMS. parse_nicknames()'s handle_match() now + # strips parens/quotes before this set is consulted, so the bare form + # is correct -- do not re-add the parenthesized form, that would + # silently reintroduce the #111 bug (parenthesized "(Ret)" matching + # literally instead of going through nickname/suffix disambiguation). + 'ret', + 'vet', ]) """ Post-nominal pieces that are not acronyms. The parser does not remove periods when matching against these pieces. +""" +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + # Suffix acronyms that also commonly work as given-name nicknames on + # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() + # when deciding whether parenthesized/quoted content is a nickname or a + # suffix -- content matching one of these stays a nickname rather than + # being reclassified as a suffix, since that's the more common reading + # in ambiguous, delimiter-only context. + # + # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if + # the exact letter sequence could plausibly be someone's given name or + # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous + # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. + 'ed', + 'jd', +]) +""" + +Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a +common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, +standalone exception list consulted only by parse_nicknames(). 
+ """ SUFFIX_ACRONYMS = set([ - '(ret)', - '(vet)', '8-vsb', 'aas', 'aba', @@ -501,7 +529,7 @@ 'mra', 'ms', 'msa', - 'msc' + 'msc', 'mscmsm', 'msm', 'mt', From bf32594ed853884ebfe921c6dae8cfc513027137 Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Wed, 1 Jul 2026 00:23:21 -0700 Subject: [PATCH 11/11] fix: restore suffix_acronyms_ambiguous wiring lost in a git race nameparser/config/__init__.py's suffix_acronyms_ambiguous attribute (import, docstring, type hint, constructor param, and __init__ assignment -- all from Task 1, commit 135395b) was accidentally reverted by commit c2efbd8. Root cause: a review subagent was running concurrently in this same shared worktree directory and appears to have checked out an older commit in place (rather than in an isolated copy) to compare pre/post-PR behavior for its "verified against unmodified master" claims; that checkout's staged state got swept into c2efbd8's commit alongside my own staged changes, since `git commit` includes everything staged, not just newly `git add`ed paths. The working tree itself was never wrong -- parser.py's handle_match() has referenced self.C.suffix_acronyms_ambiguous correctly this whole time, and pytest passed after every commit since the corruption, because pytest reads the filesystem, not git history. Only the *committed* nameparser/config/__init__.py (and, transiently, the already-pushed PR branch) lacked the attribute -- a fresh clone of c2efbd8..4e91b7a would have raised AttributeError on any name containing parens/quotes. Verified via `git show HEAD:...` and `git diff 135395b HEAD -- nameparser/config/__init__.py` that this commit is the only remaining gap; suffixes.py, parser.py, and the tests were independently re-added correctly in later commits. Also includes docs/usage.rst and docs/customize.rst updates documenting the new suffix_acronyms_ambiguous behavior and SUFFIX_ACRONYMS_AMBIGUOUS constant. 
--- docs/customize.rst | 1 + docs/usage.rst | 41 +++++++++++++++++++++++++++++++++++ nameparser/config/__init__.py | 8 ++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/docs/customize.rst b/docs/customize.rst index fb763ce..1b472b9 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -43,6 +43,7 @@ Editable attributes of nameparser.config.CONSTANTS * :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". * :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". * :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS` - Acronym suffixes from ``SUFFIX_ACRONYMS`` that also plausibly work as a given-name nickname on their own, e.g. "JD", "Ed". When one of these appears alone in parenthesis or quotes (e.g. ``'JEFFREY (JD) BRICKEN'``), it's kept as a nickname rather than reclassified as a suffix, since that's the more common reading in ambiguous, delimiter-only context (see the "Nickname Handling" section in the usage guide). * :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece. * :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. * :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". diff --git a/docs/usage.rst b/docs/usage.rst index 281a803..4334364 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -147,6 +147,47 @@ available from the nickname attribute. 
nickname: 'John' ]> +Exception: content that looks like a suffix (a member of +:py:data:`~nameparser.config.SUFFIX_ACRONYMS` or +:py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS`, or anything ending in a +period) is treated as a suffix instead of a nickname, since that's usually +what's meant, e.g. a retired military title or a professional designation +written in parenthesis. + +.. doctest:: nicknames + :options: +NORMALIZE_WHITESPACE + + >>> name = HumanName('Andrew Perkins (MBA)') + >>> name + <HumanName : [ + title: '' + first: 'Andrew' + middle: '' + last: 'Perkins' + suffix: 'MBA' + nickname: '' + ]> + +A few suffix acronyms, listed in +:py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS`, also work as common +given-name nicknames on their own (e.g. "JD", "Ed"). These stay nicknames +when found alone in parenthesis or quotes, since that's the more common +reading in that ambiguous context: + +.. doctest:: nicknames + :options: +NORMALIZE_WHITESPACE + + >>> name = HumanName('JEFFREY (JD) BRICKEN') + >>> name + <HumanName : [ + title: '' + first: 'JEFFREY' + middle: '' + last: 'BRICKEN' + suffix: '' + nickname: 'JD' + ]> + Change the output string with string formatting ----------------------------------------------- diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index c143299..79d4f8c 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -42,6 +42,7 @@ from nameparser.config.conjunctions import CONJUNCTIONS from nameparser.config.suffixes import SUFFIX_ACRONYMS from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS +from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import EMPTY_REGEX, REGEXES @@ -236,8 +237,10 @@ class Constants: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`.
+ :param set suffix_acronyms_ambiguous: + :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. :param set first_name_prefixes: @@ -257,6 +260,7 @@ class Constants: first_name_titles: SetManager conjunctions: SetManager first_name_prefixes: SetManager + suffix_acronyms_ambiguous: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager _pst: Set[str] | None @@ -388,6 +392,7 @@ def __init__(self, prefixes: Iterable[str] = PREFIXES, suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, + suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, titles: Iterable[str] = TITLES, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, @@ -406,6 +411,7 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.first_name_prefixes = SetManager(first_name_prefixes) + self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) self.patronymic_name_order = patronymic_name_order