Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def _code(p, flags):
flags = p.state.flags | flags

# run the optimizer passes over the parsed pattern
optimize(p)
optimize(p, flags)

code = []

Expand Down Expand Up @@ -267,7 +267,7 @@ def print_2(*args):
i += 1
op = OPCODES[op]
if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
MAX_UNTIL, MIN_UNTIL, NEGATE):
MAX_UNTIL, MIN_UNTIL, NEGATE, INVERT):
print_(op)
elif op in (LITERAL, NOT_LITERAL,
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
Expand Down
4 changes: 3 additions & 1 deletion Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# update when constants are added or removed

MAGIC = 20260622
MAGIC = 20260625

from _sre import MAXREPEAT, MAXGROUPS # noqa: F401

Expand Down Expand Up @@ -120,6 +120,8 @@ def _makecodes(*names):
'NOT_LITERAL_UNI_IGNORE',
'RANGE_UNI_IGNORE',

'INVERT', # charset-body item: dual of NEGATE, inverts member tests

# The following opcodes occur only in the parser output,
# not in the compiled code.
'MIN_REPEAT', 'MAX_REPEAT',
Expand Down
115 changes: 78 additions & 37 deletions Lib/re/_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _compile_charset(charset, flags, code):
emit = code.append
for op, av in charset:
emit(op)
if op is NEGATE:
if op in (NEGATE, INVERT):
pass
elif op is LITERAL:
emit(av)
Expand All @@ -58,31 +58,40 @@ def _compile_charset(charset, flags, code):
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set.
#
# The engine's charset() walk toggles polarity on every NEGATE (see
# Modules/_sre/sre_lib.h), so NEGATE markers split the set into
# alternating-polarity segments: a leading NEGATE is a complemented class
# [^...], an interior one is set difference (RL1.3). Each segment is a
# plain union, optimized on its own with the NEGATE boundaries kept in place.
negates = [i for i, (op, _av) in enumerate(charset) if op is NEGATE]
if not negates or negates == [0]:
# The engine's charset() walk toggles its return polarity on every NEGATE
# and its membership-test direction on every INVERT (see sre_lib.h) --
# together they express complement, difference and intersection (RL1.3).
# Each toggle-delimited segment is optimized on its own.
bounds = [i for i, (op, _av) in enumerate(charset)
if op in (NEGATE, INVERT)]
if not bounds or (bounds == [0] and charset[0][0] is NEGATE):
# Fast path: a plain union, optionally complemented as a whole -- every
# charset the parser produces today, optimized as before.
return _optimize_charset_segment(charset, iscased, fixup, fixes)

# Optimize each NEGATE-delimited run on its own. _allow_anyall is off: the
# Optimize each toggle-delimited run on its own. _allow_anyall is off: the
# [\s\S] -> ANY_ALL / [^\s\S] -> empty shortcuts rewrite a whole set and
# would inject or drop a NEGATE mid-segment.
# would inject or drop a toggle mid-segment.
out = []
hascased = False
start = 0
for i in negates + [len(charset)]:
inv = False
for i in bounds + [len(charset)]:
if i > start: # skip an empty run (e.g. a leading NEGATE)
opt, cased = _optimize_charset_segment(
charset[start:i], iscased, fixup, fixes, _allow_anyall=False)
out.extend(opt)
hascased |= cased
if inv:
# An INVERT run intersects its members: re-emit them as
# they are (already optimized, see _single_member).
out.extend(charset[start:i])
else:
opt, cased = _optimize_charset_segment(
charset[start:i], iscased, fixup, fixes, _allow_anyall=False)
out.extend(opt)
hascased |= cased
if i < len(charset):
out.append((NEGATE, None))
tok = charset[i]
out.append(tok) # re-emit the toggle in place
if tok[0] is INVERT:
inv = not inv
start = i + 1
return out, hascased

Expand Down Expand Up @@ -467,43 +476,75 @@ def _fuse_branch(av):
items += cs
return items if tail is None else items + tail

def _fuse_difference(data):
# Replace <flat charset A> (?<![B1]) (?<![B2]) ... with the single charset
# [NEGATE] B1 B2 ... [NEGATE] A. Each negative lookbehind over a flat
# charset subtracts its set from the character A matches.
def _single_member(operand):
    # internal: reduce a set-intersection operand to a single charset item.
    #
    # Returns the optimized item list when it holds exactly one item, and
    # None when _flat_items() rejects the operand or the optimized form has
    # any other length.  Exactly one item is required because the engine
    # applies INVERT to each member test on its own, so a multi-member
    # operand would no longer behave as the union of its members.
    flat = _parser._flat_items(operand)
    if flat is None:
        return None
    optimized, _hascased = _optimize_charset(flat)
    if len(optimized) != 1:
        return None
    return optimized

def _fuse_setops(data, flags):
# Fuse <flat charset A> (?<![B]) (?<=[C]) ... -- a left-associative
# chain of set difference ([A--B]) and intersection ([A&&C]) -- into the
# single charset [NEGATE] <fail items> [NEGATE] [INVERT] A. The chain
# is a pure conjunction, so each lookbehind appends one fail item:
# "ch in B" for a difference, "ch not in C" (under INVERT) for an
# intersection. Not fused under IGNORECASE, where case folding could
# split a single-member intersection operand (see _single_member).
out = []
head = None # _flat_items(A) for the fused difference now at out[-1]
subtrahend = None # its accumulated B items, or None when not fusing
head = None # _flat_items(A) for the fused set operation now at out[-1]
fails = None # its accumulated fail items, or None when not fusing
inv = 0 # the INVERT state at the end of fails
for op, av in data:
if op is ASSERT_NOT and av[0] < 0: # a negative lookbehind
b = _parser._flat_items(av[1].data)
if op in (ASSERT, ASSERT_NOT) and av[0] < 0: # a lookbehind
if op is ASSERT_NOT: # -- difference
b = _parser._flat_items(av[1].data)
tinv = 0
elif not flags & SRE_FLAG_IGNORECASE: # && intersection
b = _single_member(av[1].data)
tinv = 1
else:
b = None
if b is not None:
if subtrahend is None and out:
if fails is None and out:
# the first lookbehind of a run: only now is it worth
# checking whether the preceding item A is a flat charset.
head = _parser._flat_items([out[-1]])
if head is not None:
subtrahend = []
if subtrahend is not None:
subtrahend += b
out[-1] = (IN, [(NEGATE, None)] + subtrahend
+ [(NEGATE, None)] + head)
fails = []
inv = 0
if fails is not None:
if inv != tinv:
fails.append((INVERT, None))
inv ^= 1
fails += b
tail = [(NEGATE, None)]
if inv:
tail.append((INVERT, None))
out[-1] = (IN, [(NEGATE, None)] + fails + tail + head)
continue
head = subtrahend = None
head = fails = None
out.append((op, av))
data[:] = out

def _walk(seq):
def _walk(seq, flags):
for i, (op, av) in enumerate(seq):
for sub in _subpatterns(op, av):
_walk(sub.data)
if op is SUBPATTERN:
# A group can change the flag context, e.g. (?i:...).
_walk(av[3].data, _combine_flags(flags, av[1], av[2]))
else:
for sub in _subpatterns(op, av):
_walk(sub.data, flags)
if op is BRANCH:
items = _fuse_branch(av)
if items is not None:
seq[i] = (IN, items)
_fuse_difference(seq)
_fuse_setops(seq, flags)

def optimize(pattern):
def optimize(pattern, flags):
"""Rewrite a parsed pattern in place and return it."""
_walk(pattern.data)
_walk(pattern.data, flags)
return pattern
7 changes: 7 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1496,6 +1496,13 @@ def test_set_operations(self):
self.assertEqual(re.findall(r'[\d&&1]', s), list('1'))
# A leading '&' is a literal.
self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
# Intersection is not fused under IGNORECASE (case folding can
# split an operand), including when the flag comes from a scoped group.
for pat in (r'(?i)[\w&&[s]]', r'(?i:[\w&&[s]])'):
for c in 'sSſ':
self.assertTrue(re.fullmatch(pat, c), (pat, c))
self.assertTrue(re.fullmatch(r'(?-i:[\w&&[s]])', 's', re.I))
self.assertIsNone(re.fullmatch(r'(?-i:[\w&&[s]])', 'S', re.I))

# Nested sets and lookbehind-mapped operands.
self.assertEqual(re.findall(r'[a-z--[aeiou]]', s),
Expand Down
1 change: 1 addition & 0 deletions Modules/_sre/sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -2078,6 +2078,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
switch (op) {

case SRE_OP_NEGATE:
case SRE_OP_INVERT:
break;

case SRE_OP_LITERAL:
Expand Down
3 changes: 2 additions & 1 deletion Modules/_sre/sre_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the sre.c file for information on usage and redistribution.
*/

#define SRE_MAGIC 20260622
#define SRE_MAGIC 20260625
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
Expand Down Expand Up @@ -55,6 +55,7 @@
#define SRE_OP_LITERAL_UNI_IGNORE 40
#define SRE_OP_NOT_LITERAL_UNI_IGNORE 41
#define SRE_OP_RANGE_UNI_IGNORE 42
#define SRE_OP_INVERT 43
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2
Expand Down
39 changes: 28 additions & 11 deletions Modules/_sre/sre_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
/* check if character is a member of the given set */

int ok = 1;
/* INVERT toggles inv, which inverts each following membership test
(set difference/intersection within a single charset). Member tests
are normalized to 0/1 before the ^ inv. */
int inv = 0;

for (;;) {
switch (*set++) {
Expand All @@ -103,29 +107,32 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)

case SRE_OP_LITERAL:
/* <LITERAL> <code> */
if (ch == set[0])
if ((ch == set[0]) ^ inv)
return ok;
set++;
break;

case SRE_OP_CATEGORY:
/* <CATEGORY> <code> */
if (sre_category(set[0], (int) ch))
if (!!sre_category(set[0], (int) ch) ^ inv)
return ok;
set++;
break;

case SRE_OP_CHARSET:
/* <CHARSET> <bitmap> */
if (ch < 256 &&
(set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1)))))
{
int m = ch < 256 &&
(set[ch/SRE_CODE_BITS] & (1u << (ch & (SRE_CODE_BITS-1))));
if (m ^ inv)
return ok;
set += 256/SRE_CODE_BITS;
break;
}

case SRE_OP_RANGE:
/* <RANGE> <lower> <upper> */
if (set[0] <= ch && ch <= set[1])
if ((set[0] <= ch && ch <= set[1]) ^ inv)
return ok;
set += 2;
break;
Expand All @@ -134,11 +141,14 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
/* <RANGE_UNI_IGNORE> <lower> <upper> */
{
SRE_CODE uch;
int m;
/* ch is already lower cased */
if (set[0] <= ch && ch <= set[1])
return ok;
uch = sre_upper_unicode(ch);
if (set[0] <= uch && uch <= set[1])
m = set[0] <= ch && ch <= set[1];
if (!m) {
uch = sre_upper_unicode(ch);
m = set[0] <= uch && uch <= set[1];
}
if (m ^ inv)
return ok;
set += 2;
break;
Expand All @@ -148,20 +158,26 @@ SRE(charset)(SRE_STATE* state, const SRE_CODE* set, SRE_CODE ch)
ok = !ok;
break;

case SRE_OP_INVERT:
inv = !inv;
break;

case SRE_OP_BIGCHARSET:
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
{
Py_ssize_t count, block;
int m;
count = *(set++);

if (ch < 0x10000u)
block = ((unsigned char*)set)[ch >> 8];
else
block = -1;
set += 256/sizeof(SRE_CODE);
if (block >=0 &&
m = block >= 0 &&
(set[(block * 256 + (ch & 255))/SRE_CODE_BITS] &
(1u << (ch & (SRE_CODE_BITS-1)))))
(1u << (ch & (SRE_CODE_BITS-1))));
if (m ^ inv)
return ok;
set += count * (256/SRE_CODE_BITS);
break;
Expand Down Expand Up @@ -1616,6 +1632,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
TARGET(SRE_OP_SUBPATTERN):
TARGET(SRE_OP_RANGE):
TARGET(SRE_OP_NEGATE):
TARGET(SRE_OP_INVERT):
TARGET(SRE_OP_BIGCHARSET):
TARGET(SRE_OP_CHARSET):
TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr,
Expand Down
3 changes: 2 additions & 1 deletion Modules/_sre/sre_targets.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* See the sre.c file for information on usage and redistribution.
*/

static void *sre_targets[43] = {
static void *sre_targets[44] = {
&&TARGET_SRE_OP_FAILURE,
&&TARGET_SRE_OP_SUCCESS,
&&TARGET_SRE_OP_ANY,
Expand Down Expand Up @@ -55,4 +55,5 @@ static void *sre_targets[43] = {
&&TARGET_SRE_OP_LITERAL_UNI_IGNORE,
&&TARGET_SRE_OP_NOT_LITERAL_UNI_IGNORE,
&&TARGET_SRE_OP_RANGE_UNI_IGNORE,
&&TARGET_SRE_OP_INVERT,
};
Loading