diff --git a/.gitignore b/.gitignore index 0c114a8..40f3ae8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ env/ # docs docs/_* +docs/superpowers/ diff --git a/docs/customize.rst b/docs/customize.rst index fb763ce..1b472b9 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -43,6 +43,7 @@ Editable attributes of nameparser.config.CONSTANTS * :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". * :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". * :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS` - Acronym suffixes from ``SUFFIX_ACRONYMS`` that also plausibly work as a given-name nickname on their own, e.g. "JD", "Ed". When one of these appears alone in parenthesis or quotes (e.g. ``'JEFFREY (JD) BRICKEN'``), it's kept as a nickname rather than reclassified as a suffix, since that's the more common reading in ambiguous, delimiter-only context (see the "Nickname Handling" section in the usage guide). * :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece. * :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. * :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". 
diff --git a/docs/release_log.rst b/docs/release_log.rst index 381d9ed..c43ee3b 100644 --- a/docs/release_log.rst +++ b/docs/release_log.rst @@ -7,6 +7,9 @@ Release Log to 1.2.1 first (which includes a one-version compatibility shim), load and re-pickle under 1.2.1, then upgrade to 1.3.0. + - Fix suffix-shaped parenthesized/quoted content (e.g. ``"(Ret)"``, ``"(MBA)"``) being misclassified as a nickname instead of a suffix (closes #111) + - Add ``suffix_acronyms_ambiguous`` to ``Constants`` for acronym suffixes that also read as given-name nicknames (e.g. ``"JD"``, ``"Ed"``), used when disambiguating parenthesized/quoted content (#111) + - Fix missing comma between ``'msc'`` and ``'mscmsm'`` in ``suffix_acronyms``, which silently concatenated them into a bogus ``'mscmscmsm'`` entry (#111) - Add ``given_names`` (and ``given_names_list``) attribute as aggregate of first and middle names, mirroring ``surnames`` (closes #157) - Add ``suffix_delimiter`` to ``Constants`` and ``HumanName`` for parsing suffixes separated by arbitrary delimiters, e.g. ``"RN - CRNA"`` (#156) - Add ``initials_separator`` to ``Constants`` and ``HumanName`` to control spacing between consecutive initials within a name group (#171) diff --git a/docs/usage.rst b/docs/usage.rst index 281a803..4334364 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -147,6 +147,47 @@ available from the nickname attribute. nickname: 'John' ]> +Exception: content that looks like a suffix (a member of +:py:data:`~nameparser.config.SUFFIX_ACRONYMS` or +:py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS`, or anything ending in a +period) is treated as a suffix instead of a nickname, since that's usually +what's meant, e.g. a retired military title or a professional designation +written in parenthesis. + +.. 
doctest:: nicknames + :options: +NORMALIZE_WHITESPACE + + >>> name = HumanName('Andrew Perkins (MBA)') + >>> name + + +A few suffix acronyms, listed in +:py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS`, also work as common +given-name nicknames on their own (e.g. "JD", "Ed"). These stay nicknames +when found alone in parenthesis or quotes, since that's the more common +reading in that ambiguous context: + +.. doctest:: nicknames + :options: +NORMALIZE_WHITESPACE + + >>> name = HumanName('JEFFREY (JD) BRICKEN') + >>> name + + Change the output string with string formatting ----------------------------------------------- diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index c143299..79d4f8c 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -42,6 +42,7 @@ from nameparser.config.conjunctions import CONJUNCTIONS from nameparser.config.suffixes import SUFFIX_ACRONYMS from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS +from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import EMPTY_REGEX, REGEXES @@ -236,8 +237,10 @@ class Constants: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. + :param set suffix_acronyms_ambiguous: + :py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`. :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. 
:param set first_name_prefixes: @@ -257,6 +260,7 @@ class Constants: first_name_titles: SetManager conjunctions: SetManager first_name_prefixes: SetManager + suffix_acronyms_ambiguous: SetManager capitalization_exceptions: TupleManager[str] regexes: RegexTupleManager _pst: Set[str] | None @@ -388,6 +392,7 @@ def __init__(self, prefixes: Iterable[str] = PREFIXES, suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS, suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS, + suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS, titles: Iterable[str] = TITLES, first_name_titles: Iterable[str] = FIRST_NAME_TITLES, conjunctions: Iterable[str] = CONJUNCTIONS, @@ -406,6 +411,7 @@ def __init__(self, self.first_name_titles = SetManager(first_name_titles) self.conjunctions = SetManager(conjunctions) self.first_name_prefixes = SetManager(first_name_prefixes) + self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = RegexTupleManager(regexes) self.patronymic_name_order = patronymic_name_order diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index f014c81..e32bc8f 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -13,16 +13,44 @@ 'iii', 'iv', 'v', + # Bare, not '(ret)'/'(vet)': moved here from literal parenthesized + # entries in SUFFIX_ACRONYMS. parse_nicknames()'s handle_match() now + # strips parens/quotes before this set is consulted, so the bare form + # is correct -- do not re-add the parenthesized form, that would + # silently reintroduce the #111 bug (parenthesized "(Ret)" matching + # literally instead of going through nickname/suffix disambiguation). + 'ret', + 'vet', ]) """ Post-nominal pieces that are not acronyms. The parser does not remove periods when matching against these pieces. 
+""" +SUFFIX_ACRONYMS_AMBIGUOUS = set([ + # Suffix acronyms that also commonly work as given-name nicknames on + # their own (e.g. "Ed", "JD"). Read only by HumanName.parse_nicknames() + # when deciding whether parenthesized/quoted content is a nickname or a + # suffix -- content matching one of these stays a nickname rather than + # being reclassified as a suffix, since that's the more common reading + # in ambiguous, delimiter-only context. + # + # When adding a new entry to SUFFIX_ACRONYMS, also add it here only if + # the exact letter sequence could plausibly be someone's given name or + # common nickname on its own (e.g. 'jd', 'ed'). Unambiguous + # certifications/degrees (e.g. 'mba', 'cpa', 'phd') don't need an entry. + 'ed', + 'jd', +]) +""" + +Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a +common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small, +standalone exception list consulted only by parse_nicknames(). + """ SUFFIX_ACRONYMS = set([ - '(ret)', - '(vet)', '8-vsb', 'aas', 'aba', @@ -501,7 +529,7 @@ 'mra', 'ms', 'msa', - 'msc' + 'msc', 'mscmsm', 'msm', 'mt', diff --git a/nameparser/parser.py b/nameparser/parser.py index 4cda41a..bdd00cf 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -774,7 +774,11 @@ def fix_phd(self) -> None: def parse_nicknames(self) -> None: """ The content of parenthesis or quotes in the name will be added to the - nicknames list. This happens before any other processing of the name. + nicknames list, unless that content is suffix-shaped -- an unambiguous + suffix_not_acronyms/suffix_acronyms member, or content ending in a + period -- in which case it's left in place (undelimited) for normal + downstream suffix/title/word parsing instead. This happens before any + other processing of the name. Single quotes cannot span white space characters and must border white space to allow for quotes in names like O'Connor and Kawai'ae'a. 
@@ -788,10 +792,45 @@ def parse_nicknames(self) -> None: re_double_quotes = self.C.regexes.double_quotes re_parenthesis = self.C.regexes.parenthesis + def handle_match(m: 're.Match[str]') -> str: + # Fall back to the whole match when the regex has no capturing + # group (e.g. a custom override regex without one, like + # EMPTY_REGEX) -- mirrors the old code's use of findall(), which + # returns the whole match for group-less patterns. + content = m.group(1) if m.lastindex else m.group(0) + stripped = lc(content) + # Inlined rather than calling self.is_suffix(content): is_suffix() + # also rejects single-letter initials via is_an_initial(), which + # isn't relevant here, and the suffix_acronyms_ambiguous exclusion + # needs to be interleaved into the acronym branch specifically. + # Acronym suffixes may have periods between every letter (e.g. + # "M.D", "Ph.D") that aren't necessarily trailing, so -- exactly + # like is_suffix() -- strip all periods before checking + # suffix_acronyms/suffix_acronyms_ambiguous membership. Bare + # `stripped` (lc() only strips leading/trailing periods) is still + # used for suffix_not_acronyms, matching is_suffix()'s asymmetry. + acronym_stripped = stripped.replace('.', '') + is_unambiguous_suffix = ( + stripped in self.C.suffix_not_acronyms + or (acronym_stripped in self.C.suffix_acronyms + and acronym_stripped not in self.C.suffix_acronyms_ambiguous) + ) + if is_unambiguous_suffix or content.endswith('.'): + # Leave the bare content -- no delimiters -- so downstream + # word-splitting/suffix-matching sees it exactly as if it had + # never been wrapped in parens/quotes. is_suffix()/lc() only + # strip periods, never parens/quotes, so returning m.group(0) + # here (e.g. literal "(Ret)") would never match + # suffix_not_acronyms ("ret"). 
+ return content + self.nickname_list.append(content) + return '' + + # Same handle_match for all three delimiters: suffix-shaped content + # is rare in quotes but not impossible, and the logic is delimiter- + # agnostic, so there's no reason to special-case parenthesis here. for _re in (re_quoted_word, re_double_quotes, re_parenthesis): - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub('', self._full_name) + self._full_name = _re.sub(handle_match, self._full_name) def squash_emoji(self) -> None: """ diff --git a/tests/test_nicknames.py b/tests/test_nicknames.py index c5aea85..fa79e45 100644 --- a/tests/test_nicknames.py +++ b/tests/test_nicknames.py @@ -134,6 +134,14 @@ def test_nickname_and_last_name_with_title(self) -> None: self.m(hn.last, "Edmonds", hn) self.m(hn.nickname, "Rick", hn) + def test_ambiguous_suffix_acronym_in_parenthesis_stays_nickname(self) -> None: + # JD is in SUFFIX_ACRONYMS_AMBIGUOUS: both a law-degree acronym and a + # common given-name nickname. Existing behavior (nickname) must be + # preserved -- see issue #111. + hn = HumanName("JEFFREY (JD) BRICKEN") + self.m(hn.nickname, "JD", hn) + self.m(hn.suffix, "", hn) + # class MaidenNameTestCase(HumanNameTestBase): # diff --git a/tests/test_suffixes.py b/tests/test_suffixes.py index 438b508..9fffb1b 100644 --- a/tests/test_suffixes.py +++ b/tests/test_suffixes.py @@ -274,3 +274,114 @@ def test_suffix_delimiter_inverted_format_known_limitation(self) -> None: # This is a documented limitation — do not "fix" it without a broader solution. 
hn = HumanName("Doe, Mary - Kate, RN", suffix_delimiter=" - ") self.assertNotEqual(hn.first, "Mary - Kate") + + def test_suffix_acronyms_ambiguous_is_customizable(self) -> None: + from nameparser.config import Constants + custom = Constants(suffix_acronyms_ambiguous=['xyz']) + self.assertEqual(set(custom.suffix_acronyms_ambiguous), {'xyz'}) + # Constructing without the kwarg still works and uses the module default. + default = Constants() + self.assertIn('jd', default.suffix_acronyms_ambiguous) + + def test_suffix_in_parenthesis_with_other_suffixes(self) -> None: + hn = HumanName("Andrew Perkins, Jr., Col. (Ret)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.assertIn("Ret", hn.suffix) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_mid_name(self) -> None: + # "Jr." is suffix-shaped, so parse_nicknames() no longer treats it as + # a nickname. But it isn't in trailing position, and parse_full_name's + # suffix detection only recognizes a trailing run of suffix-shaped + # pieces -- so it lands wherever normal parsing would put a bare + # mid-name "Jr." token, exactly as if the parens were never there + # (verified: HumanName("Lon Jr. Williams") parses identically). + # Known limitation: making this land in `suffix` would require + # changing parse_full_name's suffix detection, out of scope here -- + # issue #111 is specifically about the nickname misclassification. + hn = HumanName("Lon (Jr.) Williams") + self.m(hn.first, "Lon", hn) + self.m(hn.middle, "Jr.", hn) + self.m(hn.last, "Williams", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_in_parenthesis_with_period(self) -> None: + # Same known limitation as above: "Ret." is mid-name (no comma), so + # it's outside the trailing run parse_full_name's suffix detection + # requires. It parses exactly as bare "Col. Ret. Smith" would. + hn = HumanName("Col. (Ret.) 
Smith") + self.m(hn.title, "Col.", hn) + self.m(hn.first, "Ret.", hn) + self.m(hn.last, "Smith", hn) + self.m(hn.suffix, "", hn) + self.m(hn.nickname, "", hn) + + def test_acronym_suffix_in_parenthesis(self) -> None: + hn = HumanName("Andrew Perkins (MBA)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_acronym_suffix_with_internal_periods_in_parenthesis(self) -> None: + # "M.D" has a non-trailing period between every letter -- like + # is_suffix(), handle_match()'s suffix_acronyms check must also strip + # internal periods (not just rely on the trailing content.endswith('.') + # heuristic, which doesn't fire here since "M.D" has no trailing period). + hn = HumanName("Andrew Perkins (M.D)") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "M.D", hn) + self.m(hn.nickname, "", hn) + + def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None: + # "Mgr." isn't in any suffix list, but it ends in a period, so the + # trailing-period heuristic excludes it from nickname_list. It flows + # into normal parsing instead of being force-classified as a suffix. + hn = HumanName("Andrew Perkins (Mgr.)") + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_in_single_quotes(self) -> None: + # handle_match() is shared across all three delimiter regexes, not + # just parenthesis -- confirm suffix-shaped single-quoted content + # routes the same way. 
+ hn = HumanName("Andrew Perkins 'MBA'") + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_in_double_quotes(self) -> None: + hn = HumanName('Andrew Perkins "MBA"') + self.m(hn.first, "Andrew", hn) + self.m(hn.last, "Perkins", hn) + self.m(hn.suffix, "MBA", hn) + self.m(hn.nickname, "", hn) + + def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None: + # A custom suffix_acronyms_ambiguous entry keeps a suffix_acronyms + # member classified as a nickname instead of a suffix, confirming + # the exception list -- not a hardcoded check -- drives the behavior. + from nameparser.config import Constants + C = Constants( + suffix_acronyms=['xyz'], + suffix_acronyms_ambiguous=['xyz'], + ) + hn = HumanName("Andrew Perkins (XYZ)", constants=C) + self.m(hn.nickname, "XYZ", hn) + self.m(hn.suffix, "", hn) + + def test_suffix_acronyms_ambiguous_removal_routes_to_suffix(self) -> None: + # Removing 'jd' from a custom suffix_acronyms_ambiguous flips JD + # from nickname to suffix. Uses a trailing-position name (unlike the + # JEFFREY (JD) BRICKEN regression guard in test_nicknames.py) so + # parse_full_name's trailing-run suffix detection actually picks it + # up -- see the known mid-name limitation noted on the tests above. + from nameparser.config import Constants + C = Constants(suffix_acronyms_ambiguous=[]) + hn = HumanName("Andrew Perkins (JD)", constants=C) + self.m(hn.nickname, "", hn) + self.m(hn.suffix, "JD", hn)