Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Doc/library/codecs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,18 @@ On Windows, ``cpXXX`` codecs are available for all code pages.
But only codecs listed in the following table are guaranteed to exist on
other platforms.

On platforms that provide the C library's :manpage:`iconv(3)` function
(such as those using the GNU C Library),
every encoding known to ``iconv`` for which Python has no built-in codec
is available as well.
Such an encoding is looked up by its ``iconv`` name (for example ``cp1133``).
Prefixing the name with ``iconv:`` forces the use of the ``iconv``-based codec
even when a built-in codec of the same name exists (for example ``iconv:latin1``),
which is mostly useful for testing.

.. versionchanged:: next
   Added support for encodings provided by the C library's ``iconv``.

.. impl-detail::

Some common encodings can bypass the codecs lookup machinery to
Expand Down
10 changes: 10 additions & 0 deletions Doc/whatsnew/3.16.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ New modules
Improved modules
================

codecs
------

* On platforms that provide the C library's :manpage:`iconv(3)` function,
  every encoding known to ``iconv`` for which Python has no built-in codec
  is now available (for example ``cp1133``).
  Prefixing an encoding name with ``iconv:`` forces the ``iconv``-based codec
  even when a built-in codec of the same name exists.
  (Contributed by Serhiy Storchaka in :gh:`152997`.)

curses
------

Expand Down
16 changes: 16 additions & 0 deletions Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,22 @@ extern int _PyUnicodeWriter_FormatV(
const char *format,
va_list vargs);

/* --- iconv Codec -------------------------------------------------------- */

#ifdef HAVE_ICONV
/* Both functions below are declared only when HAVE_ICONV is defined. */

/* Decode `length` bytes of `string` using the iconv encoding `encoding`.
   Passing NULL for `consumed` selects non-stateful decoding; otherwise the
   number of bytes consumed is stored through it.
   NOTE(review): the return convention (presumably a new reference, or NULL
   with an exception set) is not visible here -- confirm in the definition. */
extern PyObject* _PyUnicode_DecodeIconv(
const char *encoding, /* iconv encoding name */
const char *string, /* encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed); /* bytes consumed, or NULL for non-stateful */

/* Encode the Unicode object `unicode` using the iconv encoding `encoding`.
   NOTE(review): return convention not visible here -- confirm in the
   definition. */
extern PyObject* _PyUnicode_EncodeIconv(
const char *encoding, /* iconv encoding name */
PyObject *unicode, /* Unicode object */
const char *errors); /* error handling */
#endif

/* --- UTF-7 Codecs ------------------------------------------------------- */

extern PyObject* _PyUnicode_EncodeUTF7(
Expand Down
27 changes: 27 additions & 0 deletions Lib/encodings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,30 @@ def win32_code_page_search_function(encoding):
return create_win32_code_page_codec(cp)

codecs.register(win32_code_page_search_function)

try:
    from _codecs import iconv_encode as _iconv_encode
except ImportError:
    # _codecs has no iconv_encode here, so there is nothing to register.
    pass
else:
    from ._iconv_codecs import create_iconv_codec

    # Last-resort search function backed by the C library's iconv(): provides
    # any encoding iconv knows that Python has no built-in codec for. Registered
    # last, so it never shadows a built-in; an "iconv:" prefix forces it.
    def iconv_search_function(encoding):
        """Return an iconv-backed CodecInfo for *encoding*, or None.

        An optional leading "iconv:" is stripped to obtain the name handed
        to iconv; the CodecInfo is still created under the full *encoding*
        string that was requested.
        """
        iconv_name = encoding.removeprefix('iconv:')
        if not iconv_name:
            return None

        # Encoding an empty string is a cheap probe for whether iconv
        # accepts this encoding name.
        try:
            _iconv_encode(iconv_name, '')
        except (LookupError, OSError):
            return None
        return create_iconv_codec(encoding, iconv_name)

    codecs.register(iconv_search_function)
36 changes: 36 additions & 0 deletions Lib/encodings/_iconv_codecs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import codecs

def create_iconv_codec(name, encoding):
    """Build a CodecInfo whose operations all delegate to the iconv codec.

    *name* becomes the ``name`` of the returned CodecInfo.  *encoding* is
    the encoding name passed to ``iconv_encode``/``iconv_decode`` on every
    call made through the returned codec.
    """
    # Imported inside the function so that importing this module does not
    # itself require these functions to exist in _codecs.
    from _codecs import iconv_encode, iconv_decode

    def encode(input, errors='strict'):
        return iconv_encode(encoding, input, errors)

    def decode(input, errors='strict'):
        # One-shot decoding sees the complete input, so it is always final.
        return iconv_decode(encoding, input, errors, True)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        def encode(self, input, final=False):
            # iconv_encode returns a tuple; only the encoded bytes (item 0)
            # are returned to the caller.
            return iconv_encode(encoding, input, self.errors)[0]

    class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
        def _buffer_decode(self, input, errors, final):
            return iconv_decode(encoding, input, errors, final)

    class StreamWriter(codecs.StreamWriter):
        def encode(self, input, errors='strict'):
            return iconv_encode(encoding, input, errors)

    class StreamReader(codecs.StreamReader):
        # *errors* defaults to 'strict' like every other encode/decode entry
        # point in this factory, so reader.decode(data) works as well.
        def decode(self, input, errors='strict', final=False):
            return iconv_decode(encoding, input, errors, final)

    return codecs.CodecInfo(
        name=name,
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
172 changes: 172 additions & 0 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3586,6 +3586,178 @@ def test_large_utf8_input(self, size):
self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')


def iconv_encoding_available(name):
    """Return True if iconv can encode with the encoding called *name*."""
    # The encodings iconv provides are platform-dependent, so tests must probe
    # availability rather than assume it.
    try:
        codecs.iconv_encode(name, '')
    except (LookupError, OSError):
        available = False
    else:
        available = True
    return available


# Candidate encodings with sample text; tests probe several and skip only when a
# whole category is unavailable.
# (encoding name, sample text) pairs; every sample contains non-ASCII
# characters.
_ICONV_SINGLE_BYTE = [
('KOI8-U', 'Привіт, світ!'),
('ISO-8859-7', 'Καλημέρα'),
('ISO-8859-2', 'Zażółć gęślą jaźń'),
('ISO-8859-1', 'Grüße'),
]
# Multibyte encoding names only; no sample text is attached to them here.
_ICONV_MULTIBYTE = ['EUC-JP', 'SHIFT_JIS', 'GBK', 'GB18030', 'BIG5']
# Encodings iconv may provide but for which CPython has no built-in codec
# (cp1047 is EBCDIC, i.e. not ASCII-compatible).
_ICONV_ONLY = ['cp1047', 'cp1133', 'GEORGIAN-PS', 'ARMSCII-8']


@unittest.skipUnless(hasattr(codecs, 'iconv_encode'),
                     'the iconv codec is not available')
class IconvTest(unittest.TestCase):
    """Tests for the iconv-backed codec functions and their codec lookup.

    The encodings iconv offers differ between platforms, so every test
    probes for a usable encoding first and skips itself if none is found.
    """

    def require(self, *names):
        """Return the first of *names* that iconv supports, else skip."""
        for name in names:
            if iconv_encoding_available(name):
                return name
        self.skipTest('no suitable iconv encoding is available')

    def require_single_byte(self):
        """Return the first available (encoding, sample text) pair, else skip."""
        for enc, text in _ICONV_SINGLE_BYTE:
            if iconv_encoding_available(enc):
                return enc, text
        self.skipTest('no single-byte iconv encoding is available')

    def test_unknown_encoding(self):
        self.assertRaises(LookupError, codecs.iconv_encode, 'no-such-enc-42', 'a')
        self.assertRaises(LookupError, codecs.iconv_decode, 'no-such-enc-42', b'a')
        self.assertRaises(LookupError, codecs.lookup, 'iconv:no-such-enc-42')

    def test_roundtrip(self):
        cases = _ICONV_SINGLE_BYTE + [(enc, '日本語') for enc in _ICONV_MULTIBYTE]
        tested = False
        for enc, text in cases:
            if not iconv_encoding_available(enc):
                continue
            tested = True
            with self.subTest(encoding=enc):
                data = codecs.iconv_encode(enc, text)[0]
                decoded, consumed = codecs.iconv_decode(enc, data, 'strict', True)
                self.assertEqual(decoded, text)
                self.assertEqual(consumed, len(data))
        if not tested:
            self.skipTest('none of the test encodings are available')

    def test_encode_errors(self):
        # A non-ASCII character is not representable in ASCII.
        enc = self.require('ASCII')
        with self.assertRaises(UnicodeEncodeError) as cm:
            codecs.iconv_encode(enc, 'a€b')
        self.assertEqual((cm.exception.encoding, cm.exception.start,
                          cm.exception.end), (enc, 1, 2))
        self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'replace')[0], b'a?b')
        self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'ignore')[0], b'ab')
        self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'backslashreplace')[0],
                         b'a\\u20acb')
        # U+20AC is 8364 in decimal, so the character reference is &#8364;.
        # The expected value must be spelled in ASCII: a bytes literal cannot
        # contain the euro sign itself.
        self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'xmlcharrefreplace')[0],
                         b'a&#8364;b')

    def test_decode_errors(self):
        enc = self.require('ASCII')
        bad = b'a\xffb'
        with self.assertRaises(UnicodeDecodeError) as cm:
            codecs.iconv_decode(enc, bad, 'strict', True)
        self.assertEqual((cm.exception.encoding, cm.exception.start), (enc, 1))
        self.assertEqual(codecs.iconv_decode(enc, bad, 'replace', True)[0],
                         'a\ufffdb')
        self.assertEqual(codecs.iconv_decode(enc, bad, 'ignore', True)[0], 'ab')
        self.assertEqual(codecs.iconv_decode(enc, bad, 'backslashreplace', True)[0],
                         'a\\xffb')

    def test_stateful_decode(self):
        enc = self.require(*_ICONV_MULTIBYTE)
        full = codecs.iconv_encode(enc, '日本')[0]
        # A trailing incomplete multibyte sequence is deferred when final is
        # false, and reported through *consumed*.
        text, consumed = codecs.iconv_decode(enc, full[:-1], 'strict', False)
        self.assertEqual(text, '日')
        self.assertEqual(consumed, len(full) - 2)
        # With final=True the same input is an error.
        self.assertRaises(UnicodeDecodeError,
                          codecs.iconv_decode, enc, full[:-1], 'strict', True)

    def test_empty(self):
        enc = self.require('ASCII')
        self.assertEqual(codecs.iconv_encode(enc, ''), (b'', 0))
        self.assertEqual(codecs.iconv_decode(enc, b'', 'strict', True), ('', 0))

    def test_lookup_bare_name(self):
        # An encoding that iconv knows but Python has no built-in codec for.
        for name in _ICONV_ONLY:
            if (iconv_encoding_available(name)
                    and encodings.search_function(name) is None):
                break
        else:
            self.skipTest('no iconv-only encoding is available')
        info = codecs.lookup(name)
        self.assertEqual(info.name, name.lower())
        # The encoding need not be ASCII-compatible (e.g. EBCDIC), so just
        # check that it round-trips.
        self.assertEqual('abc'.encode(name).decode(name), 'abc')

    def test_lookup_does_not_shadow_builtin(self):
        # Built-in codecs must win over the iconv fallback.
        self.assertEqual(codecs.lookup('utf-8').name, 'utf-8')
        self.assertEqual(codecs.lookup('ascii').name, 'ascii')

    def test_iconv_prefix_forces_engine(self):
        # These candidates all have a built-in codec to compare against.
        enc, text = self.require_single_byte()
        info = codecs.lookup('iconv:' + enc)
        # The registry lower-cases the requested name.
        self.assertEqual(info.name, ('iconv:' + enc).lower())
        self.assertEqual(text.encode('iconv:' + enc), text.encode(enc))
        self.assertEqual(text.encode('iconv:' + enc).decode('iconv:' + enc), text)

    def test_incremental_decode(self):
        enc = self.require(*_ICONV_MULTIBYTE)
        text = '日本語'
        data = codecs.encode(text, 'iconv:' + enc)
        dec = codecs.getincrementaldecoder('iconv:' + enc)()
        # Feed one byte at a time so multibyte sequences are split across
        # calls.
        out = ''.join(dec.decode(data[i:i+1]) for i in range(len(data)))
        out += dec.decode(b'', True)
        self.assertEqual(out, text)

    def test_stream(self):
        enc = self.require(*_ICONV_MULTIBYTE)
        text = '日本語'
        raw = codecs.encode(text, 'iconv:' + enc)
        reader = codecs.getreader('iconv:' + enc)(io.BytesIO(raw))
        self.assertEqual(reader.read(), text)

    def test_encode_kinds(self):
        # The string's own buffer is fed to iconv per storage kind; check each
        # of the 1-, 2- and 4-byte kinds against the built-in codec.
        enc = self.require('UTF-8')
        for text in ('Gr\xfc\xdfe', 'ĀāĂ', 'A\U0001f389B'):
            with self.subTest(text=text):
                self.assertEqual(text.encode('iconv:' + enc), text.encode(enc))

    def test_encode_surrogateescape(self):
        # A lone surrogate lives in the 2-byte kind and round-trips.
        enc = self.require('ASCII')
        data = b'ab\xff'
        s = data.decode(enc, 'surrogateescape')
        self.assertEqual(s.encode('iconv:' + enc, 'surrogateescape'), data)

    def test_encode_surrogate_pair(self):
        # A surrogate pair must stay two code points, never combined into an
        # astral character (as UTF-16 would): backslashreplace escapes each
        # surrogate separately, not the escape of a single combined character.
        pair = '\ud83c\udf89'
        latin1 = self.require('ISO-8859-1', 'ASCII')
        self.assertEqual(pair.encode('iconv:' + latin1, 'backslashreplace'),
                         rb'\ud83c\udf89')


class ASCIITest(unittest.TestCase):
def test_encode(self):
self.assertEqual('abc123'.encode('ascii'), b'abc123')
Expand Down
5 changes: 5 additions & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1656,6 +1656,7 @@ FROZEN_FILES_IN = \
Lib/encodings/aliases.py \
Lib/encodings/utf_8.py \
Lib/encodings/_win_cp_codecs.py \
Lib/encodings/_iconv_codecs.py \
Lib/io.py \
Lib/_collections_abc.py \
Lib/_sitebuiltins.py \
Expand Down Expand Up @@ -1686,6 +1687,7 @@ FROZEN_FILES_OUT = \
Python/frozen_modules/encodings.aliases.h \
Python/frozen_modules/encodings.utf_8.h \
Python/frozen_modules/encodings._win_cp_codecs.h \
Python/frozen_modules/encodings._iconv_codecs.h \
Python/frozen_modules/io.h \
Python/frozen_modules/_collections_abc.h \
Python/frozen_modules/_sitebuiltins.h \
Expand Down Expand Up @@ -1747,6 +1749,9 @@ Python/frozen_modules/encodings.utf_8.h: Lib/encodings/utf_8.py $(FREEZE_MODULE_
Python/frozen_modules/encodings._win_cp_codecs.h: Lib/encodings/_win_cp_codecs.py $(FREEZE_MODULE_DEPS)
$(FREEZE_MODULE) encodings._win_cp_codecs $(srcdir)/Lib/encodings/_win_cp_codecs.py Python/frozen_modules/encodings._win_cp_codecs.h

Python/frozen_modules/encodings._iconv_codecs.h: Lib/encodings/_iconv_codecs.py $(FREEZE_MODULE_DEPS)
$(FREEZE_MODULE) encodings._iconv_codecs $(srcdir)/Lib/encodings/_iconv_codecs.py Python/frozen_modules/encodings._iconv_codecs.h

Python/frozen_modules/io.h: Lib/io.py $(FREEZE_MODULE_DEPS)
$(FREEZE_MODULE) io $(srcdir)/Lib/io.py Python/frozen_modules/io.h

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
On platforms providing the C library's :manpage:`iconv(3)` function,
the :mod:`codecs` module now exposes every encoding known to ``iconv``
for which Python has no built-in codec.
Such an encoding can be used by its name (for example ``"cp1133"``)
or, to force the ``iconv``-based engine even when a built-in codec exists,
with an ``"iconv:"`` prefix (for example ``"iconv:latin1"``).
Loading
Loading