From ab99c2ab4833ff64cff83039feec2e9b72cce1a0 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 18:09:55 +0800 Subject: [PATCH 1/6] Fix `_Extra.split()` in `zipfile` --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 418933a2e8d9e8..5434397c16f383 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -218,7 +218,7 @@ def split(cls, data): # use memoryview for zero-copy slices rest = memoryview(data) while rest: - extra, rest = _Extra.read_one(rest) + extra, rest = cls.read_one(rest) yield extra @classmethod From 0e5796f0b238ca88f88ebf1f8f35f2e88be29d3a Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 30 Jun 2026 18:55:58 +0800 Subject: [PATCH 2/6] Fix incorrect comment in `_decodeExtra` --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 5434397c16f383..882eaed2b48334 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -2665,7 +2665,7 @@ def _write_end_record(self): extra_data = zinfo.extra min_version = 0 if extra: - # Append a ZIP64 field to the extra's + # Prepend a ZIP64 field to the extra's extra_data = _Extra.strip(extra_data, (1,)) extra_data = struct.pack( ' Date: Tue, 30 Jun 2026 18:20:13 +0800 Subject: [PATCH 3/6] Optimize `_Extra` and add tests --- Lib/test/test_zipfile/test_core.py | 78 +++++++++++++++++++----------- Lib/zipfile/__init__.py | 5 +- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 4f20209927e7b3..c4e0ddba525acd 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -4114,6 +4114,16 @@ def test_read_zipfile_error(self): with self.assertRaises(zipfile.BadZipfile): zipfile.ZipFile(TESTFN, "r").close() + def test_read_zipfile_with_corrupted_extra_field(self): + with zipfile.ZipFile(TESTFN, mode='w') as zf: + zinfo = zipfile.ZipInfo("file.txt") + zinfo.extra = struct.pack(' Date: Tue, 30 Jun 2026 18:23:47 +0800 Subject: [PATCH 4/6] Introduce `validate` parameter for `_Extra.split()` --- Lib/zipfile/__init__.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 937d5682b838c9..e98e3593b2d411 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -205,19 +205,22 @@ def __init__(self, val, id=None): self.id = id @classmethod - def read_one(cls, raw): + def read_one(cls, raw, validate=False): try: xid, xlen = cls.FIELD_STRUCT.unpack_from(raw) except struct.error: xid, xlen = None, 0 + else: + if validate and xlen + 4 > len(raw): + raise BadZipFile("Corrupt extra field %04x (size=%d)" % (xid, xlen)) return cls(raw[:4+xlen], xid), raw[4+xlen:] @classmethod - def split(cls, data): + def split(cls, data, validate=False): # use memoryview for zero-copy slices rest = memoryview(data) while rest: - extra, rest = cls.read_one(rest) + extra, rest = cls.read_one(rest, validate=validate) yield extra @classmethod @@ -584,14 +587,11 @@ def _encodeFilenameFlags(self): def _decodeExtra(self, filename_crc): # Try to decode the extra field. - extra = self.extra unpack = struct.unpack - while len(extra) >= 4: - tp, ln = unpack(' len(extra): - raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) + for extra in _Extra.split(self.extra, True): + tp = extra.id if tp == 0x0001: - data = extra[4:ln+4] + data = extra[4:] # ZIP64 extension (large files and/or large archives) try: if self.file_size in (0xFFFF_FFFF_FFFF_FFFF, 0xFFFF_FFFF): @@ -609,7 +609,7 @@ def _decodeExtra(self, filename_crc): raise BadZipFile(f"Corrupt zip64 extra field. " f"{field} not found.") from None elif tp == 0x7075: - data = extra[4:ln+4] + data = extra[4:] # Unicode Path Extra Field try: up_version, up_name_crc = unpack(' Date: Tue, 30 Jun 2026 18:55:03 +0800 Subject: [PATCH 5/6] Introduce `_Extra.update()` Introduce a helper `.update()` that simplifies the common operation of fields updating of extra data bytes, by automatically stripping existing subfields with identical IDs and inserts multiple fields in one step. --- Lib/zipfile/__init__.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e98e3593b2d411..9f259310ab4969 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -232,6 +232,18 @@ def strip(cls, data, xids): if ex.id not in xids ) + @classmethod + def update(cls, data, extra): + """Insert fields from extra and strip duplicates.""" + extras = { + ex.id: ex + for ex in cls.split(extra, True) + if ex.id is not None + } + # New fields first since data may have a corrupted tail that renders + # following fields inaccessible. + return b''.join(extras.values()) + cls.strip(data, extras) + def _check_zipfile(fp): try: @@ -2663,10 +2675,10 @@ def _write_end_record(self): min_version = 0 if extra: # Prepend a ZIP64 field to the extra's - extra_data = _Extra.strip(extra_data, (1,)) - extra_data = struct.pack( + extra_data = _Extra.update(extra_data, struct.pack( ' Date: Tue, 30 Jun 2026 19:12:35 +0800 Subject: [PATCH 6/6] Rework and optimize `_Extra` Make `_Extra` a normal class rather than a subclass of `bytes`, avoiding the performance penalty of copying the passed bytes upon each instantiation. Introduce `.iter()` in place of `.strip()` to yield each subfield info as a tuple. This provides better semantics and is more performant than subclassing `bytes`. The method name change also enables feature detection for these behavior changes. Stop using memoryview internally because each slice creates an overhead of around 184 bytes, making it inefficient for small bytes operations (each extra field is usually only several to tens of bytes). Remove `.read_one()` since it is now unused and obsolete. --- Lib/zipfile/__init__.py | 61 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 9f259310ab4969..fc8b7d6db86d5d 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -195,53 +195,53 @@ class LargeZipFile(Exception): _DD_SIGNATURE = 0x08074b50 -class _Extra(bytes): +class _Extra: FIELD_STRUCT = struct.Struct(' len(raw): - raise BadZipFile("Corrupt extra field %04x (size=%d)" % (xid, xlen)) - return cls(raw[:4+xlen], xid), raw[4+xlen:] + def iter(cls, data, validate=False): + """Iter through and yield each (field, id).""" + # early return for empty extra data + if not data: + return - @classmethod - def split(cls, data, validate=False): - # use memoryview for zero-copy slices - rest = memoryview(data) - while rest: - extra, rest = cls.read_one(rest, validate=validate) - yield extra + pos, data_len = 0, len(data) + while pos < data_len: + try: + xid, xlen = cls.FIELD_STRUCT.unpack_from(data, pos) + except struct.error: + xid, xlen = None, 0 + else: + if validate and pos + 4 + xlen > data_len: + raise BadZipFile( + "Corrupt extra field %04x (size=%d)" % (xid, xlen)) + yield data[pos:pos + 4 + xlen], xid + pos += 4 + xlen @classmethod def strip(cls, data, xids): """Remove Extra fields with specified IDs.""" return b''.join( ex - for ex in cls.split(data) - if ex.id not in xids + for ex, xid in cls.iter(data) + if xid not in xids ) @classmethod def update(cls, data, extra): """Insert fields from extra and strip duplicates.""" + # early return for empty data + if not data: + return extra + extras = { - ex.id: ex - for ex in cls.split(extra, True) - if ex.id is not None + xid: ex + for ex, xid in cls.iter(extra) + if xid is not None } # New fields first since data may have a corrupted tail that renders - # following fields inaccessible. + # following fields inaccessible. (The caller is responsible for making + # sure that extra is valid.) return b''.join(extras.values()) + cls.strip(data, extras) @@ -600,8 +600,7 @@ def _encodeFilenameFlags(self): def _decodeExtra(self, filename_crc): # Try to decode the extra field. unpack = struct.unpack - for extra in _Extra.split(self.extra, True): - tp = extra.id + for extra, tp in _Extra.iter(self.extra, True): if tp == 0x0001: data = extra[4:] # ZIP64 extension (large files and/or large archives)