Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ env/

# docs
docs/_*
docs/superpowers/
1 change: 1 addition & 0 deletions docs/customize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Editable attributes of nameparser.config.CONSTANTS
* :py:data:`~nameparser.config.FIRST_NAME_TITLES` - Titles that, when followed by a single name, indicate that the name is a first name, e.g. "King David".
* :py:data:`~nameparser.config.SUFFIX_ACRONYMS` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
* :py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
* :py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS` - Acronym suffixes from ``SUFFIX_ACRONYMS`` that also plausibly work as a given-name nickname on their own, e.g. "JD", "Ed". When one of these appears alone in parentheses or quotes (e.g. ``'JEFFREY (JD) BRICKEN'``), it's kept as a nickname rather than reclassified as a suffix, since that's the more common reading in ambiguous, delimiter-only context (see the "Nickname Handling" section in the usage guide).
* :py:data:`~nameparser.config.conjunctions.CONJUNCTIONS` - Connectors like "and" that join the preceding piece to the following piece.
* :py:data:`~nameparser.config.prefixes.PREFIXES` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
* :py:data:`~nameparser.config.CAPITALIZATION_EXCEPTIONS` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
Expand Down
3 changes: 3 additions & 0 deletions docs/release_log.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ Release Log
to 1.2.1 first (which includes a one-version compatibility shim), load and
re-pickle under 1.2.1, then upgrade to 1.3.0.

- Fix suffix-shaped parenthesized/quoted content (e.g. ``"(Ret)"``, ``"(MBA)"``) being misclassified as a nickname instead of a suffix (closes #111)
- Add ``suffix_acronyms_ambiguous`` to ``Constants`` for acronym suffixes that also read as given-name nicknames (e.g. ``"JD"``, ``"Ed"``), used when disambiguating parenthesized/quoted content (#111)
- Fix missing comma between ``'msc'`` and ``'mscmsm'`` in ``suffix_acronyms``, which silently concatenated them into a bogus ``'mscmscmsm'`` entry (#111)
- Add ``given_names`` (and ``given_names_list``) attribute as aggregate of first and middle names, mirroring ``surnames`` (closes #157)
- Add ``suffix_delimiter`` to ``Constants`` and ``HumanName`` for parsing suffixes separated by arbitrary delimiters, e.g. ``"RN - CRNA"`` (#156)
- Add ``initials_separator`` to ``Constants`` and ``HumanName`` to control spacing between consecutive initials within a name group (#171)
Expand Down
41 changes: 41 additions & 0 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,47 @@ available from the nickname attribute.
nickname: 'John'
]>

Exception: content that looks like a suffix (a member of
:py:data:`~nameparser.config.SUFFIX_ACRONYMS` or
:py:data:`~nameparser.config.SUFFIX_NOT_ACRONYMS`, or anything ending in a
period) is not treated as a nickname. Its delimiters are removed and it is
parsed like any other piece of the name, so a recognized suffix at the end of
the name ends up in the suffix attribute, e.g. a retired military title or a
professional designation written in parentheses.

.. doctest:: nicknames
:options: +NORMALIZE_WHITESPACE

>>> name = HumanName('Andrew Perkins (MBA)')
>>> name
<HumanName : [
title: ''
first: 'Andrew'
middle: ''
last: 'Perkins'
suffix: 'MBA'
nickname: ''
]>

A few suffix acronyms, listed in
:py:data:`~nameparser.config.SUFFIX_ACRONYMS_AMBIGUOUS`, also work as common
given-name nicknames on their own (e.g. "JD", "Ed"). These stay nicknames
when found alone in parentheses or quotes, since that's the more common
reading in that ambiguous context:

.. doctest:: nicknames
:options: +NORMALIZE_WHITESPACE

>>> name = HumanName('JEFFREY (JD) BRICKEN')
>>> name
<HumanName : [
title: ''
first: 'JEFFREY'
middle: ''
last: 'BRICKEN'
suffix: ''
nickname: 'JD'
]>

Change the output string with string formatting
-----------------------------------------------

Expand Down
8 changes: 7 additions & 1 deletion nameparser/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from nameparser.config.conjunctions import CONJUNCTIONS
from nameparser.config.suffixes import SUFFIX_ACRONYMS
from nameparser.config.suffixes import SUFFIX_NOT_ACRONYMS
from nameparser.config.suffixes import SUFFIX_ACRONYMS_AMBIGUOUS
from nameparser.config.titles import TITLES
from nameparser.config.titles import FIRST_NAME_TITLES
from nameparser.config.regexes import EMPTY_REGEX, REGEXES
Expand Down Expand Up @@ -236,8 +237,10 @@ class Constants:
:py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
:param set suffix_acronyms:
:py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set suffix_not_acronyms:
:param set suffix_not_acronyms:
:py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set suffix_acronyms_ambiguous:
:py:attr:`~suffixes.SUFFIX_ACRONYMS_AMBIGUOUS` wrapped with :py:class:`SetManager`.
:param set conjunctions:
:py:attr:`conjunctions` wrapped with :py:class:`SetManager`.
:param set first_name_prefixes:
Expand All @@ -257,6 +260,7 @@ class Constants:
first_name_titles: SetManager
conjunctions: SetManager
first_name_prefixes: SetManager
suffix_acronyms_ambiguous: SetManager
capitalization_exceptions: TupleManager[str]
regexes: RegexTupleManager
_pst: Set[str] | None
Expand Down Expand Up @@ -388,6 +392,7 @@ def __init__(self,
prefixes: Iterable[str] = PREFIXES,
suffix_acronyms: Iterable[str] = SUFFIX_ACRONYMS,
suffix_not_acronyms: Iterable[str] = SUFFIX_NOT_ACRONYMS,
suffix_acronyms_ambiguous: Iterable[str] = SUFFIX_ACRONYMS_AMBIGUOUS,
titles: Iterable[str] = TITLES,
first_name_titles: Iterable[str] = FIRST_NAME_TITLES,
conjunctions: Iterable[str] = CONJUNCTIONS,
Expand All @@ -406,6 +411,7 @@ def __init__(self,
self.first_name_titles = SetManager(first_name_titles)
self.conjunctions = SetManager(conjunctions)
self.first_name_prefixes = SetManager(first_name_prefixes)
self.suffix_acronyms_ambiguous = SetManager(suffix_acronyms_ambiguous)
self.capitalization_exceptions = TupleManager(capitalization_exceptions)
self.regexes = RegexTupleManager(regexes)
self.patronymic_name_order = patronymic_name_order
Expand Down
34 changes: 31 additions & 3 deletions nameparser/config/suffixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,44 @@
'iii',
'iv',
'v',
# Bare, not '(ret)'/'(vet)': moved here from literal parenthesized
# entries in SUFFIX_ACRONYMS. parse_nicknames()'s handle_match() now
# strips parens/quotes before this set is consulted, so the bare form
# is correct -- do not re-add the parenthesized form, that would
# silently reintroduce the #111 bug (parenthesized "(Ret)" matching
# literally instead of going through nickname/suffix disambiguation).
'ret',
'vet',
])
"""

Post-nominal pieces that are not acronyms. The parser does not remove periods
when matching against these pieces.

"""
# Acronym suffixes whose exact letters also commonly stand alone as a
# given-name nickname (e.g. "Ed", "JD"). HumanName.parse_nicknames() is the
# only reader: parenthesized/quoted content matching an entry here stays a
# nickname instead of being reclassified as a suffix, because that is the
# more common reading when the delimiters are the only context available.
#
# Maintenance rule: a new SUFFIX_ACRONYMS entry belongs here as well only
# if its letter sequence could plausibly be someone's given name or common
# nickname by itself (e.g. 'jd', 'ed'). Unambiguous certifications/degrees
# (e.g. 'mba', 'cpa', 'phd') must not be listed.
SUFFIX_ACRONYMS_AMBIGUOUS = {
    'ed',
    'jd',
}
"""

Acronym suffixes from SUFFIX_ACRONYMS that also plausibly collide with a
common given-name nickname. Not a partition of SUFFIX_ACRONYMS -- a small,
standalone exception list consulted only by parse_nicknames().

"""
SUFFIX_ACRONYMS = set([
'(ret)',
'(vet)',
'8-vsb',
'aas',
'aba',
Expand Down Expand Up @@ -501,7 +529,7 @@
'mra',
'ms',
'msa',
'msc'
'msc',
'mscmsm',
'msm',
'mt',
Expand Down
47 changes: 43 additions & 4 deletions nameparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,11 @@ def fix_phd(self) -> None:
def parse_nicknames(self) -> None:
"""
The content of parenthesis or quotes in the name will be added to the
nicknames list. This happens before any other processing of the name.
nicknames list, unless that content is suffix-shaped -- an unambiguous
suffix_not_acronyms/suffix_acronyms member, or content ending in a
period -- in which case it's left in place (undelimited) for normal
downstream suffix/title/word parsing instead. This happens before any
other processing of the name.

Single quotes cannot span white space characters and must border
white space to allow for quotes in names like O'Connor and Kawai'ae'a.
Expand All @@ -788,10 +792,45 @@ def parse_nicknames(self) -> None:
re_double_quotes = self.C.regexes.double_quotes
re_parenthesis = self.C.regexes.parenthesis

def handle_match(m: 're.Match[str]') -> str:
    """
    Substitution callback for one quoted/parenthesized match.

    Returns the text that replaces the match in the full name: the bare
    content (delimiters dropped) when the content is suffix-shaped, or an
    empty string once the content has been recorded as a nickname.
    """
    # A pattern with no capturing group (e.g. a custom override such as
    # EMPTY_REGEX) leaves lastindex unset; use the whole match then, which
    # is what findall() yields for group-less patterns.
    inner = m.group(1) if m.lastindex else m.group(0)
    lowered = lc(inner)
    # Acronym suffixes may carry a period between every letter ("M.D",
    # "Ph.D"), so acronym lookups use a form with all periods removed,
    # as is_suffix() does. suffix_not_acronyms is checked against the
    # plain lc() form instead (lc() only trims leading/trailing periods).
    # self.is_suffix() is deliberately not called: it also rejects
    # single-letter initials, which is irrelevant here, and the
    # suffix_acronyms_ambiguous exclusion applies to the acronym branch
    # only.
    without_periods = lowered.replace('.', '')

    # Every "suffix-shaped" case returns the bare content, without its
    # delimiters, so downstream word-splitting/suffix-matching sees it as
    # if it had never been wrapped. Returning the whole match (e.g. the
    # literal "(Ret)") would never match suffix_not_acronyms ("ret"),
    # because is_suffix()/lc() strip periods only, never parens/quotes.
    if lowered in self.C.suffix_not_acronyms:
        return inner
    if (without_periods in self.C.suffix_acronyms
            and without_periods not in self.C.suffix_acronyms_ambiguous):
        return inner
    if inner.endswith('.'):
        return inner

    # Anything else is a nickname: record it and remove it from the name.
    self.nickname_list.append(inner)
    return ''

# Same handle_match for all three delimiters: suffix-shaped content
# is rare in quotes but not impossible, and the logic is delimiter-
# agnostic, so there's no reason to special-case parenthesis here.
for _re in (re_quoted_word, re_double_quotes, re_parenthesis):
if _re.search(self._full_name):
self.nickname_list += [x for x in _re.findall(self._full_name)]
self._full_name = _re.sub('', self._full_name)
self._full_name = _re.sub(handle_match, self._full_name)

def squash_emoji(self) -> None:
"""
Expand Down
8 changes: 8 additions & 0 deletions tests/test_nicknames.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ def test_nickname_and_last_name_with_title(self) -> None:
self.m(hn.last, "Edmonds", hn)
self.m(hn.nickname, "Rick", hn)

def test_ambiguous_suffix_acronym_in_parenthesis_stays_nickname(self) -> None:
    # Regression guard for issue #111: "JD" is listed in
    # SUFFIX_ACRONYMS_AMBIGUOUS because it is both a law-degree acronym and
    # a common given-name nickname, so the pre-existing nickname reading
    # has to survive the suffix disambiguation.
    parsed = HumanName("JEFFREY (JD) BRICKEN")
    self.m(parsed.nickname, "JD", parsed)
    self.m(parsed.suffix, "", parsed)


# class MaidenNameTestCase(HumanNameTestBase):
#
Expand Down
111 changes: 111 additions & 0 deletions tests/test_suffixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,3 +274,114 @@ def test_suffix_delimiter_inverted_format_known_limitation(self) -> None:
# This is a documented limitation — do not "fix" it without a broader solution.
hn = HumanName("Doe, Mary - Kate, RN", suffix_delimiter=" - ")
self.assertNotEqual(hn.first, "Mary - Kate")

def test_suffix_acronyms_ambiguous_is_customizable(self) -> None:
    from nameparser.config import Constants
    # An explicit kwarg replaces the module-level default set.
    overridden = Constants(suffix_acronyms_ambiguous=['xyz'])
    self.assertEqual(set(overridden.suffix_acronyms_ambiguous), {'xyz'})
    # Omitting the kwarg must still work and fall back to the default.
    stock = Constants()
    self.assertIn('jd', stock.suffix_acronyms_ambiguous)

def test_suffix_in_parenthesis_with_other_suffixes(self) -> None:
    # "(Ret)" trails other comma-separated suffixes; it must join the
    # suffix string and leave the nickname empty.
    parsed = HumanName("Andrew Perkins, Jr., Col. (Ret)")
    self.m(parsed.first, "Andrew", parsed)
    self.m(parsed.last, "Perkins", parsed)
    self.assertIn("Ret", parsed.suffix)
    self.m(parsed.nickname, "", parsed)

def test_suffix_in_parenthesis_mid_name(self) -> None:
    # Known limitation, pinned on purpose. "Jr." is suffix-shaped, so
    # parse_nicknames() keeps it out of the nickname list. It is not in
    # trailing position, though, and parse_full_name's suffix detection
    # only recognizes a trailing run of suffix-shaped pieces. The token
    # therefore lands wherever a bare mid-name "Jr." would (the original
    # author verified that HumanName("Lon Jr. Williams") parses the same).
    # Routing it to `suffix` would mean changing parse_full_name's suffix
    # detection, which is outside the scope of issue #111 (that issue is
    # only about the nickname misclassification).
    parsed = HumanName("Lon (Jr.) Williams")
    self.m(parsed.first, "Lon", parsed)
    self.m(parsed.middle, "Jr.", parsed)
    self.m(parsed.last, "Williams", parsed)
    self.m(parsed.suffix, "", parsed)
    self.m(parsed.nickname, "", parsed)

def test_suffix_in_parenthesis_with_period(self) -> None:
    # Known limitation: "Ret." sits mid-name (no comma), i.e. outside the
    # trailing run that parse_full_name's suffix detection requires, so it
    # is parsed exactly as the bare string "Col. Ret. Smith" would be.
    parsed = HumanName("Col. (Ret.) Smith")
    self.m(parsed.title, "Col.", parsed)
    self.m(parsed.first, "Ret.", parsed)
    self.m(parsed.last, "Smith", parsed)
    self.m(parsed.suffix, "", parsed)
    self.m(parsed.nickname, "", parsed)

def test_acronym_suffix_in_parenthesis(self) -> None:
    # A trailing, unambiguous acronym in parentheses is a suffix, not a
    # nickname.
    parsed = HumanName("Andrew Perkins (MBA)")
    self.m(parsed.first, "Andrew", parsed)
    self.m(parsed.last, "Perkins", parsed)
    self.m(parsed.suffix, "MBA", parsed)
    self.m(parsed.nickname, "", parsed)

def test_acronym_suffix_with_internal_periods_in_parenthesis(self) -> None:
    # "M.D" has a period between its letters but none at the end, so the
    # trailing-period heuristic (content.endswith('.')) does not fire.
    # handle_match() must therefore strip *all* periods before checking
    # suffix_acronyms membership -- the same way is_suffix() does.
    parsed = HumanName("Andrew Perkins (M.D)")
    self.m(parsed.first, "Andrew", parsed)
    self.m(parsed.last, "Perkins", parsed)
    self.m(parsed.suffix, "M.D", parsed)
    self.m(parsed.nickname, "", parsed)

def test_period_terminated_content_in_parenthesis_not_forced_either_way(self) -> None:
    # "Mgr." is in no suffix list, but it ends in a period, so the
    # trailing-period heuristic keeps it out of nickname_list. It then
    # goes through normal parsing rather than being forced into `suffix`.
    parsed = HumanName("Andrew Perkins (Mgr.)")
    self.m(parsed.nickname, "", parsed)
    self.m(parsed.suffix, "", parsed)

def test_suffix_in_single_quotes(self) -> None:
    # All three delimiter regexes share handle_match(), so suffix-shaped
    # content in single quotes must be routed the same way as content in
    # parentheses.
    parsed = HumanName("Andrew Perkins 'MBA'")
    self.m(parsed.first, "Andrew", parsed)
    self.m(parsed.last, "Perkins", parsed)
    self.m(parsed.suffix, "MBA", parsed)
    self.m(parsed.nickname, "", parsed)

def test_suffix_in_double_quotes(self) -> None:
    # Double-quoted counterpart: suffix-shaped content becomes a suffix.
    parsed = HumanName('Andrew Perkins "MBA"')
    self.m(parsed.first, "Andrew", parsed)
    self.m(parsed.last, "Perkins", parsed)
    self.m(parsed.suffix, "MBA", parsed)
    self.m(parsed.nickname, "", parsed)

def test_suffix_acronyms_ambiguous_custom_entry_stays_nickname(self) -> None:
    # Listing a suffix_acronyms member in suffix_acronyms_ambiguous as well
    # keeps it a nickname. This shows the configurable exception list, not
    # a hardcoded check, drives the behavior.
    from nameparser.config import Constants
    custom_constants = Constants(
        suffix_acronyms=['xyz'],
        suffix_acronyms_ambiguous=['xyz'],
    )
    parsed = HumanName("Andrew Perkins (XYZ)", constants=custom_constants)
    self.m(parsed.nickname, "XYZ", parsed)
    self.m(parsed.suffix, "", parsed)

def test_suffix_acronyms_ambiguous_removal_routes_to_suffix(self) -> None:
    # With an empty suffix_acronyms_ambiguous, "JD" is no longer exempt and
    # flips from nickname to suffix. The name places "(JD)" in trailing
    # position (unlike "JEFFREY (JD) BRICKEN") so that parse_full_name's
    # trailing-run suffix detection can pick it up; a mid-name token would
    # not reach `suffix`.
    from nameparser.config import Constants
    custom_constants = Constants(suffix_acronyms_ambiguous=[])
    parsed = HumanName("Andrew Perkins (JD)", constants=custom_constants)
    self.m(parsed.nickname, "", parsed)
    self.m(parsed.suffix, "JD", parsed)