diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 99fcf35aa893e4..db326ffb6836bf 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1087,6 +1087,18 @@ On Windows, ``cpXXX`` codecs are available for all code pages. But only codecs listed in the following table are guaranteed to exist on other platforms. +On platforms that provide the C library's :manpage:`iconv(3)` function +(such as those using the GNU C Library), +every encoding known to ``iconv`` for which Python has no built-in codec +is available as well. +Such an encoding is looked up by its ``iconv`` name (for example ``cp1133``). +Prefixing the name with ``iconv:`` forces the use of the ``iconv``-based codec +even when a built-in codec of the same name exists (for example ``iconv:latin1``), +which is mostly useful for testing. + +.. versionchanged:: next + Added support for encodings provided by the C library's ``iconv``. + .. impl-detail:: Some common encodings can bypass the codecs lookup machinery to diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 1a73a79a58b78b..a1366ae8433488 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -86,6 +86,16 @@ New modules Improved modules ================ +codecs +------ + +* On platforms that provide the C library's :manpage:`iconv(3)` function, + every encoding known to ``iconv`` for which Python has no built-in codec + is now available (for example ``cp1133``). + Prefixing an encoding name with ``iconv:`` forces the ``iconv``-based codec + even when a built-in codec of the same name exists. + (Contributed by Serhiy Storchaka in :gh:`152997`.) 
+ curses ------ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 75d5068f815b91..012f5da2869cd5 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -182,6 +182,22 @@ extern int _PyUnicodeWriter_FormatV( const char *format, va_list vargs); +/* --- iconv Codec -------------------------------------------------------- */ + +#ifdef HAVE_ICONV +extern PyObject* _PyUnicode_DecodeIconv( + const char *encoding, /* iconv encoding name */ + const char *string, /* encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed); /* bytes consumed, or NULL for non-stateful */ + +extern PyObject* _PyUnicode_EncodeIconv( + const char *encoding, /* iconv encoding name */ + PyObject *unicode, /* Unicode object */ + const char *errors); /* error handling */ +#endif + /* --- UTF-7 Codecs ------------------------------------------------------- */ extern PyObject* _PyUnicode_EncodeUTF7( diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 169c48324f227b..04594eab8224b3 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -176,3 +176,30 @@ def win32_code_page_search_function(encoding): return create_win32_code_page_codec(cp) codecs.register(win32_code_page_search_function) + +try: + from _codecs import iconv_encode as _iconv_encode +except ImportError: + pass +else: + from ._iconv_codecs import create_iconv_codec + + # Last-resort search function backed by the C library's iconv(): provides + # any encoding iconv knows that Python has no built-in codec for. Registered + # last, so it never shadows a built-in; an "iconv:" prefix forces it. + def iconv_search_function(encoding): + if encoding.startswith('iconv:'): + name = encoding[len('iconv:'):] + else: + name = encoding + if not name: + return None + # Test if the encoding is supported by iconv. 
+ try: + _iconv_encode(name, '') + except (LookupError, OSError): + return None + + return create_iconv_codec(encoding, name) + + codecs.register(iconv_search_function) diff --git a/Lib/encodings/_iconv_codecs.py b/Lib/encodings/_iconv_codecs.py new file mode 100644 index 00000000000000..1c701e0af423af --- /dev/null +++ b/Lib/encodings/_iconv_codecs.py @@ -0,0 +1,37 @@ +import codecs + +def create_iconv_codec(name, encoding): + """Build the CodecInfo *name* on top of iconv's *encoding*.""" + from _codecs import iconv_encode, iconv_decode + + def encode(input, errors='strict'): + return iconv_encode(encoding, input, errors) + + def decode(input, errors='strict'): + return iconv_decode(encoding, input, errors, True) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return iconv_encode(encoding, input, self.errors)[0] + + class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return iconv_decode(encoding, input, errors, final) + + class StreamWriter(codecs.StreamWriter): + def encode(self, input, errors='strict'): + return iconv_encode(encoding, input, errors) + + class StreamReader(codecs.StreamReader): + def decode(self, input, errors='strict', final=False): + return iconv_decode(encoding, input, errors, final) + + return codecs.CodecInfo( + name=name, + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 8fdd08df9e4f46..4e5fc498563ffa 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3586,6 +3586,178 @@ def test_large_utf8_input(self, size): self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100') +def iconv_encoding_available(name): + # The encodings iconv provides are platform-dependent, so tests must probe + # availability rather than assume it. 
+ try: + codecs.iconv_encode(name, '') + except (LookupError, OSError): + return False + return True + + +# Candidate encodings with sample text; tests probe several and skip only when a +# whole category is unavailable. +_ICONV_SINGLE_BYTE = [ + ('KOI8-U', 'Привіт, світ!'), + ('ISO-8859-7', 'Καλημέρα'), + ('ISO-8859-2', 'Zażółć gęślą jaźń'), + ('ISO-8859-1', 'Grüße'), +] +_ICONV_MULTIBYTE = ['EUC-JP', 'SHIFT_JIS', 'GBK', 'GB18030', 'BIG5'] +# Encodings iconv may provide but for which CPython has no built-in codec +# (cp1047 is EBCDIC, i.e. not ASCII-compatible). +_ICONV_ONLY = ['cp1047', 'cp1133', 'GEORGIAN-PS', 'ARMSCII-8'] + + +@unittest.skipUnless(hasattr(codecs, 'iconv_encode'), + 'the iconv codec is not available') +class IconvTest(unittest.TestCase): + + def require(self, *names): + for name in names: + if iconv_encoding_available(name): + return name + self.skipTest('no suitable iconv encoding is available') + + def require_single_byte(self): + for enc, text in _ICONV_SINGLE_BYTE: + if iconv_encoding_available(enc): + return enc, text + self.skipTest('no single-byte iconv encoding is available') + + def test_unknown_encoding(self): + self.assertRaises(LookupError, codecs.iconv_encode, 'no-such-enc-42', 'a') + self.assertRaises(LookupError, codecs.iconv_decode, 'no-such-enc-42', b'a') + self.assertRaises(LookupError, codecs.lookup, 'iconv:no-such-enc-42') + + def test_roundtrip(self): + cases = _ICONV_SINGLE_BYTE + [(enc, '日本語') for enc in _ICONV_MULTIBYTE] + tested = False + for enc, text in cases: + if not iconv_encoding_available(enc): + continue + tested = True + with self.subTest(encoding=enc): + data = codecs.iconv_encode(enc, text)[0] + decoded, consumed = codecs.iconv_decode(enc, data, 'strict', True) + self.assertEqual(decoded, text) + self.assertEqual(consumed, len(data)) + if not tested: + self.skipTest('none of the test encodings are available') + + def test_encode_errors(self): + # A non-ASCII character is not representable in ASCII. 
+ enc = self.require('ASCII') + with self.assertRaises(UnicodeEncodeError) as cm: + codecs.iconv_encode(enc, 'a€b') + self.assertEqual((cm.exception.encoding, cm.exception.start, + cm.exception.end), (enc, 1, 2)) + self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'replace')[0], b'a?b') + self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'ignore')[0], b'ab') + self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'backslashreplace')[0], + b'a\\u20acb') + self.assertEqual(codecs.iconv_encode(enc, 'a€b', 'xmlcharrefreplace')[0], + b'a&#8364;b') + + def test_decode_errors(self): + enc = self.require('ASCII') + bad = b'a\xffb' + with self.assertRaises(UnicodeDecodeError) as cm: + codecs.iconv_decode(enc, bad, 'strict', True) + self.assertEqual((cm.exception.encoding, cm.exception.start), (enc, 1)) + self.assertEqual(codecs.iconv_decode(enc, bad, 'replace', True)[0], + 'a\ufffdb') + self.assertEqual(codecs.iconv_decode(enc, bad, 'ignore', True)[0], 'ab') + self.assertEqual(codecs.iconv_decode(enc, bad, 'backslashreplace', True)[0], + 'a\\xffb') + + def test_stateful_decode(self): + enc = self.require(*_ICONV_MULTIBYTE) + full = codecs.iconv_encode(enc, '日本')[0] + # A trailing incomplete multibyte sequence is deferred when final is + # false, and reported through *consumed*. + text, consumed = codecs.iconv_decode(enc, full[:-1], 'strict', False) + self.assertEqual(text, '日') + self.assertEqual(consumed, len(full) - 2) + # With final=True the same input is an error. + self.assertRaises(UnicodeDecodeError, + codecs.iconv_decode, enc, full[:-1], 'strict', True) + + def test_empty(self): + enc = self.require('ASCII') + self.assertEqual(codecs.iconv_encode(enc, ''), (b'', 0)) + self.assertEqual(codecs.iconv_decode(enc, b'', 'strict', True), ('', 0)) + + def test_lookup_bare_name(self): + # An encoding that iconv knows but Python has no built-in codec for. 
+ for name in _ICONV_ONLY: + if (iconv_encoding_available(name) + and encodings.search_function(name) is None): + break + else: + self.skipTest('no iconv-only encoding is available') + info = codecs.lookup(name) + self.assertEqual(info.name, name.lower()) + # The encoding need not be ASCII-compatible (e.g. EBCDIC), so just + # check that it round-trips. + self.assertEqual('abc'.encode(name).decode(name), 'abc') + + def test_lookup_does_not_shadow_builtin(self): + # Built-in codecs must win over the iconv fallback. + self.assertEqual(codecs.lookup('utf-8').name, 'utf-8') + self.assertEqual(codecs.lookup('ascii').name, 'ascii') + + def test_iconv_prefix_forces_engine(self): + # These candidates all have a built-in codec to compare against. + enc, text = self.require_single_byte() + info = codecs.lookup('iconv:' + enc) + # The registry lower-cases the requested name. + self.assertEqual(info.name, ('iconv:' + enc).lower()) + self.assertEqual(text.encode('iconv:' + enc), text.encode(enc)) + self.assertEqual(text.encode('iconv:' + enc).decode('iconv:' + enc), text) + + def test_incremental_decode(self): + enc = self.require(*_ICONV_MULTIBYTE) + text = '日本語' + data = codecs.encode(text, 'iconv:' + enc) + dec = codecs.getincrementaldecoder('iconv:' + enc)() + out = ''.join(dec.decode(data[i:i+1]) for i in range(len(data))) + out += dec.decode(b'', True) + self.assertEqual(out, text) + + def test_stream(self): + enc = self.require(*_ICONV_MULTIBYTE) + text = '日本語' + raw = codecs.encode(text, 'iconv:' + enc) + reader = codecs.getreader('iconv:' + enc)(io.BytesIO(raw)) + self.assertEqual(reader.read(), text) + + def test_encode_kinds(self): + # The string's own buffer is fed to iconv per storage kind; check each + # of the 1-, 2- and 4-byte kinds against the built-in codec. 
+ enc = self.require('UTF-8') + for text in ('Gr\xfc\xdfe', 'ĀāĂ', 'A\U0001f389B'): + with self.subTest(text=text): + self.assertEqual(text.encode('iconv:' + enc), text.encode(enc)) + + def test_encode_surrogateescape(self): + # A lone surrogate lives in the 2-byte kind and round-trips. + enc = self.require('ASCII') + data = b'ab\xff' + s = data.decode(enc, 'surrogateescape') + self.assertEqual(s.encode('iconv:' + enc, 'surrogateescape'), data) + + def test_encode_surrogate_pair(self): + # A surrogate pair must stay two code points, never combined into an + # astral character (as UTF-16 would): backslashreplace escapes each + # surrogate separately, not the escape of a single combined character. + pair = '\ud83c\udf89' + latin1 = self.require('ISO-8859-1', 'ASCII') + self.assertEqual(pair.encode('iconv:' + latin1, 'backslashreplace'), + rb'\ud83c\udf89') + + class ASCIITest(unittest.TestCase): def test_encode(self): self.assertEqual('abc123'.encode('ascii'), b'abc123') diff --git a/Makefile.pre.in b/Makefile.pre.in index b9914369ad1bed..786f4fd5c00247 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1656,6 +1656,7 @@ FROZEN_FILES_IN = \ Lib/encodings/aliases.py \ Lib/encodings/utf_8.py \ Lib/encodings/_win_cp_codecs.py \ + Lib/encodings/_iconv_codecs.py \ Lib/io.py \ Lib/_collections_abc.py \ Lib/_sitebuiltins.py \ @@ -1686,6 +1687,7 @@ FROZEN_FILES_OUT = \ Python/frozen_modules/encodings.aliases.h \ Python/frozen_modules/encodings.utf_8.h \ Python/frozen_modules/encodings._win_cp_codecs.h \ + Python/frozen_modules/encodings._iconv_codecs.h \ Python/frozen_modules/io.h \ Python/frozen_modules/_collections_abc.h \ Python/frozen_modules/_sitebuiltins.h \ @@ -1747,6 +1749,9 @@ Python/frozen_modules/encodings.utf_8.h: Lib/encodings/utf_8.py $(FREEZE_MODULE_ Python/frozen_modules/encodings._win_cp_codecs.h: Lib/encodings/_win_cp_codecs.py $(FREEZE_MODULE_DEPS) $(FREEZE_MODULE) encodings._win_cp_codecs $(srcdir)/Lib/encodings/_win_cp_codecs.py 
Python/frozen_modules/encodings._win_cp_codecs.h +Python/frozen_modules/encodings._iconv_codecs.h: Lib/encodings/_iconv_codecs.py $(FREEZE_MODULE_DEPS) + $(FREEZE_MODULE) encodings._iconv_codecs $(srcdir)/Lib/encodings/_iconv_codecs.py Python/frozen_modules/encodings._iconv_codecs.h + Python/frozen_modules/io.h: Lib/io.py $(FREEZE_MODULE_DEPS) $(FREEZE_MODULE) io $(srcdir)/Lib/io.py Python/frozen_modules/io.h diff --git a/Misc/NEWS.d/next/Library/2026-07-03-19-51-47.gh-issue-152997.i1ErzB.rst b/Misc/NEWS.d/next/Library/2026-07-03-19-51-47.gh-issue-152997.i1ErzB.rst new file mode 100644 index 00000000000000..9983f1b8313f73 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-03-19-51-47.gh-issue-152997.i1ErzB.rst @@ -0,0 +1,6 @@ +On platforms providing the C library's :manpage:`iconv(3)` function, +the :mod:`codecs` module now exposes every encoding known to ``iconv`` +for which Python has no built-in codec. +Such an encoding can be used by its name (for example ``"cp1133"``) +or, to force the ``iconv``-based engine even when a built-in codec exists, +with an ``"iconv:"`` prefix (for example ``"iconv:latin1"``). diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 272182f7bf49ac..7cba234fc80b59 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -642,6 +642,31 @@ _codecs_code_page_decode_impl(PyObject *module, int codepage, #endif /* MS_WINDOWS */ +#ifdef HAVE_ICONV + +/*[clinic input] +_codecs.iconv_decode + encoding: str + data: Py_buffer + errors: str(accept={str, NoneType}) = None + final: bool = False + / +[clinic start generated code]*/ + +static PyObject * +_codecs_iconv_decode_impl(PyObject *module, const char *encoding, + Py_buffer *data, const char *errors, int final) +/*[clinic end generated code: output=6c6145a9decc2ba8 input=d15a04d7d3a3e0cd]*/ +{ + Py_ssize_t consumed = data->len; + PyObject *decoded = _PyUnicode_DecodeIconv(encoding, data->buf, data->len, + errors, + final ? 
NULL : &consumed); + return codec_tuple(decoded, consumed); +} + +#endif /* HAVE_ICONV */ + /* --- Encoder ------------------------------------------------------------ */ /*[clinic input] @@ -952,6 +977,27 @@ _codecs_code_page_encode_impl(PyObject *module, int code_page, PyObject *str, #endif /* MS_WINDOWS */ +#ifdef HAVE_ICONV + +/*[clinic input] +_codecs.iconv_encode + encoding: str + str: unicode + errors: str(accept={str, NoneType}) = None + / +[clinic start generated code]*/ + +static PyObject * +_codecs_iconv_encode_impl(PyObject *module, const char *encoding, + PyObject *str, const char *errors) +/*[clinic end generated code: output=bda0c5acd9be0f17 input=53a1e2028c9e2b43]*/ +{ + return codec_tuple(_PyUnicode_EncodeIconv(encoding, str, errors), + PyUnicode_GET_LENGTH(str)); +} + +#endif /* HAVE_ICONV */ + /* --- Error handler registry --------------------------------------------- */ /*[clinic input] @@ -1107,6 +1153,8 @@ static PyMethodDef _codecs_functions[] = { _CODECS_OEM_DECODE_METHODDEF _CODECS_CODE_PAGE_ENCODE_METHODDEF _CODECS_CODE_PAGE_DECODE_METHODDEF + _CODECS_ICONV_ENCODE_METHODDEF + _CODECS_ICONV_DECODE_METHODDEF _CODECS_REGISTER_ERROR_METHODDEF _CODECS__UNREGISTER_ERROR_METHODDEF _CODECS_LOOKUP_ERROR_METHODDEF diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index 4a40dc660b621c..788b9b706c8fbb 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -1629,6 +1629,90 @@ _codecs_code_page_decode(PyObject *module, PyObject *const *args, Py_ssize_t nar #endif /* defined(MS_WINDOWS) */ +#if defined(HAVE_ICONV) + +PyDoc_STRVAR(_codecs_iconv_decode__doc__, +"iconv_decode($module, encoding, data, errors=None, final=False, /)\n" +"--\n" +"\n"); + +#define _CODECS_ICONV_DECODE_METHODDEF \ + {"iconv_decode", _PyCFunction_CAST(_codecs_iconv_decode), METH_FASTCALL, _codecs_iconv_decode__doc__}, + +static PyObject * +_codecs_iconv_decode_impl(PyObject *module, const char *encoding, + Py_buffer 
*data, const char *errors, int final); + +static PyObject * +_codecs_iconv_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + const char *encoding; + Py_buffer data = {NULL, NULL}; + const char *errors = NULL; + int final = 0; + + if (!_PyArg_CheckPositional("iconv_decode", nargs, 2, 4)) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("iconv_decode", "argument 1", "str", args[0]); + goto exit; + } + Py_ssize_t encoding_length; + encoding = PyUnicode_AsUTF8AndSize(args[0], &encoding_length); + if (encoding == NULL) { + goto exit; + } + if (strlen(encoding) != (size_t)encoding_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + if (PyObject_GetBuffer(args[1], &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + if (nargs < 3) { + goto skip_optional; + } + if (args[2] == Py_None) { + errors = NULL; + } + else if (PyUnicode_Check(args[2])) { + Py_ssize_t errors_length; + errors = PyUnicode_AsUTF8AndSize(args[2], &errors_length); + if (errors == NULL) { + goto exit; + } + if (strlen(errors) != (size_t)errors_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + } + else { + _PyArg_BadArgument("iconv_decode", "argument 3", "str or None", args[2]); + goto exit; + } + if (nargs < 4) { + goto skip_optional; + } + final = PyObject_IsTrue(args[3]); + if (final < 0) { + goto exit; + } +skip_optional: + return_value = _codecs_iconv_decode_impl(module, encoding, &data, errors, final); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + +#endif /* defined(HAVE_ICONV) */ + PyDoc_STRVAR(_codecs_readbuffer_encode__doc__, "readbuffer_encode($module, data, errors=None, /)\n" "--\n" @@ -2643,6 +2727,79 @@ _codecs_code_page_encode(PyObject *module, PyObject *const *args, Py_ssize_t nar #endif /* defined(MS_WINDOWS) */ +#if defined(HAVE_ICONV) + 
+PyDoc_STRVAR(_codecs_iconv_encode__doc__, +"iconv_encode($module, encoding, str, errors=None, /)\n" +"--\n" +"\n"); + +#define _CODECS_ICONV_ENCODE_METHODDEF \ + {"iconv_encode", _PyCFunction_CAST(_codecs_iconv_encode), METH_FASTCALL, _codecs_iconv_encode__doc__}, + +static PyObject * +_codecs_iconv_encode_impl(PyObject *module, const char *encoding, + PyObject *str, const char *errors); + +static PyObject * +_codecs_iconv_encode(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + const char *encoding; + PyObject *str; + const char *errors = NULL; + + if (!_PyArg_CheckPositional("iconv_encode", nargs, 2, 3)) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("iconv_encode", "argument 1", "str", args[0]); + goto exit; + } + Py_ssize_t encoding_length; + encoding = PyUnicode_AsUTF8AndSize(args[0], &encoding_length); + if (encoding == NULL) { + goto exit; + } + if (strlen(encoding) != (size_t)encoding_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + if (!PyUnicode_Check(args[1])) { + _PyArg_BadArgument("iconv_encode", "argument 2", "str", args[1]); + goto exit; + } + str = args[1]; + if (nargs < 3) { + goto skip_optional; + } + if (args[2] == Py_None) { + errors = NULL; + } + else if (PyUnicode_Check(args[2])) { + Py_ssize_t errors_length; + errors = PyUnicode_AsUTF8AndSize(args[2], &errors_length); + if (errors == NULL) { + goto exit; + } + if (strlen(errors) != (size_t)errors_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + } + else { + _PyArg_BadArgument("iconv_encode", "argument 3", "str or None", args[2]); + goto exit; + } +skip_optional: + return_value = _codecs_iconv_encode_impl(module, encoding, str, errors); + +exit: + return return_value; +} + +#endif /* defined(HAVE_ICONV) */ + PyDoc_STRVAR(_codecs_register_error__doc__, "register_error($module, errors, handler, /)\n" "--\n" @@ -2857,6 +3014,10 @@ 
_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t #define _CODECS_CODE_PAGE_DECODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */ +#ifndef _CODECS_ICONV_DECODE_METHODDEF + #define _CODECS_ICONV_DECODE_METHODDEF +#endif /* !defined(_CODECS_ICONV_DECODE_METHODDEF) */ + #ifndef _CODECS_MBCS_ENCODE_METHODDEF #define _CODECS_MBCS_ENCODE_METHODDEF #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */ @@ -2868,4 +3029,8 @@ _codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=505edef891a06329 input=a9049054013a1b77]*/ + +#ifndef _CODECS_ICONV_ENCODE_METHODDEF + #define _CODECS_ICONV_ENCODE_METHODDEF +#endif /* !defined(_CODECS_ICONV_ENCODE_METHODDEF) */ +/*[clinic end generated code: output=912e04020d6a6144 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 785620a186c9cd..a711a60b325b45 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -67,6 +67,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include <windows.h> #endif +#ifdef HAVE_ICONV +#include <iconv.h> // iconv_open() +#endif + #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar() #endif @@ -8192,6 +8196,353 @@ PyUnicode_AsMBCSString(PyObject *unicode) #endif /* MS_WINDOWS */ +/* --- iconv Codec -------------------------------------------------------- */ + +#ifdef HAVE_ICONV + +/* iconv pivot: native-endian UTF-32, a raw array of Py_UCS4. One input unit is + one code point, so error handlers get the exact position. A platform whose + iconv lacks a UTF-32 endpoint (e.g. UTF-8-only OpenBSD) reports every encoding + as unavailable. 
*/ +#if PY_BIG_ENDIAN +# define ICONV_PIVOT "UTF-32BE" +#else +# define ICONV_PIVOT "UTF-32LE" +#endif + +/* A 2-byte string can be fed to iconv as "UCS-2" only where that is a strict + array of independent code points. Some implementations alias "UCS-2" to + "UTF-16" and would combine an adjacent surrogate pair (a 2-byte string may + hold one as two code points); there the 2-byte kind is widened to UTF-32. + glibc and GNU libiconv keep UCS-2 and UTF-16 separate. */ +#if defined(__GLIBC__) || defined(_LIBICONV_VERSION) +# if PY_BIG_ENDIAN +# define ICONV_UCS2_PIVOT "UCS-2BE" +# else +# define ICONV_UCS2_PIVOT "UCS-2LE" +# endif +#endif + +static iconv_t +iconv_open_or_set_error(const char *tocode, const char *fromcode, + const char *encoding) +{ + iconv_t cd = iconv_open(tocode, fromcode); + if (cd == (iconv_t)-1) { + if (errno == EINVAL) { + PyErr_Format(PyExc_LookupError, "unknown encoding: %s", encoding); + } + else { + PyErr_SetFromErrno(PyExc_OSError); + } + } + return cd; +} + +/* + * Decode bytes with iconv() into a str. + * + * The input is converted to native-endian UTF-32 one chunk at a time and + * appended to a _PyUnicodeWriter. If *consumed* is non-NULL the decode is + * stateful: a trailing incomplete sequence stops and sets *consumed*. + */ +PyObject * +_PyUnicode_DecodeIconv(const char *encoding, + const char *s, Py_ssize_t size, + const char *errors, Py_ssize_t *consumed) +{ + if (size < 0) { + PyErr_BadInternalCall(); + return NULL; + } + + iconv_t cd = iconv_open_or_set_error(ICONV_PIVOT, encoding, encoding); + if (cd == (iconv_t)-1) { + return NULL; + } + + /* Scratch buffer for one iconv() output chunk, as UTF-32 code points. 
*/ + Py_UCS4 chunk[1024]; + const char *starts = s; + const char *in = s; + const char *inend = s + size; + _PyUnicodeWriter writer; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; + + while (in < inend) { + char *inptr = (char *)in; + size_t inleft = (size_t)(inend - in); + char *outptr = (char *)chunk; + size_t outleft = sizeof(chunk); + + size_t ret = iconv(cd, &inptr, &inleft, &outptr, &outleft); + int err = errno; + in = inptr; + + /* Append whatever code points this call produced. */ + Py_ssize_t nch = (Py_UCS4 *)outptr - chunk; + if (nch > 0 && PyUnicodeWriter_WriteUCS4((PyUnicodeWriter *)&writer, + chunk, nch) < 0) { + goto error; + } + + if (ret != (size_t)-1) { + assert(in == inend); + break; + } + + if (err == E2BIG) { + /* The scratch buffer filled up; drain it and continue. */ + continue; + } + + const char *reason; + if (err == EINVAL) { + /* Incomplete multibyte sequence at the end of the input. */ + if (consumed != NULL) { + /* Stateful decoding: stop and report the consumed bytes. */ + break; + } + reason = "incomplete multibyte sequence"; + } + else if (err == EILSEQ) { + reason = "invalid multibyte sequence"; + } + else { + errno = err; + PyErr_SetFromErrno(PyExc_OSError); + goto error; + } + + Py_ssize_t startinpos = in - starts; + Py_ssize_t endinpos = startinpos + 1; + if (unicode_decode_call_errorhandler_writer( + errors, &errorHandler, encoding, reason, + &starts, &inend, &startinpos, &endinpos, &exc, &in, + &writer)) { + goto error; + } + /* The error handler may have skipped bytes; reset the conversion + descriptor to the initial shift state before continuing. 
*/ + iconv(cd, NULL, NULL, NULL, NULL); + } + + if (consumed != NULL) { + *consumed = in - starts; + } + iconv_close(cd); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return _PyUnicodeWriter_Finish(&writer); + +error: + iconv_close(cd); + _PyUnicodeWriter_Dealloc(&writer); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return NULL; +} + +/* Grow the output buffer of a PyBytesWriter, keeping the raw cursor *pout and + the end pointer *poutend valid. Returns 0 on success, -1 on error. */ +static int +iconv_grow_writer(PyBytesWriter *writer, char **pout, char **poutend) +{ + char *base = PyBytesWriter_GetData(writer); + Py_ssize_t used = *pout - base; + Py_ssize_t cursize = PyBytesWriter_GetSize(writer); + Py_ssize_t growby = cursize > 0 ? cursize : 16; + if (PyBytesWriter_Grow(writer, growby) < 0) { + return -1; + } + base = PyBytesWriter_GetData(writer); + *pout = base + used; + *poutend = base + PyBytesWriter_GetSize(writer); + return 0; +} + +/* + * Encode a str to bytes with iconv(). + * + * The string's own buffer is fed to iconv() using the source encoding for its + * kind, avoiding a widening copy: Latin-1 for 1-byte (not ASCII: it may hold + * U+0080..U+00FF), UTF-32 for 4-byte, and UCS-2 -- or a UTF-32 copy where that + * is unsafe (see ICONV_UCS2_PIVOT) -- for 2-byte. One input unit is one code + * point, so the unit index is the string position. 
+ */ +PyObject * +_PyUnicode_EncodeIconv(const char *encoding, PyObject *unicode, + const char *errors) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + + Py_ssize_t ulen = PyUnicode_GET_LENGTH(unicode); + const char *source; /* iconv source encoding for this kind */ + const char *data; /* the units to encode */ + Py_ssize_t unit; /* bytes per code point in *data */ + Py_UCS4 *widened = NULL; /* owned UTF-32 copy of a 2-byte string */ + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + source = "ISO-8859-1"; + data = (const char *)PyUnicode_1BYTE_DATA(unicode); + unit = 1; + } + else if (kind == PyUnicode_4BYTE_KIND) { + source = ICONV_PIVOT; + data = (const char *)PyUnicode_4BYTE_DATA(unicode); + unit = 4; + } + else { +#ifdef ICONV_UCS2_PIVOT + /* Known-strict UCS-2: feed the 2-byte buffer directly. */ + source = ICONV_UCS2_PIVOT; + data = (const char *)PyUnicode_2BYTE_DATA(unicode); + unit = 2; +#else + /* UCS-2 may be aliased to UTF-16 here; widen to UTF-32 to be safe. */ + widened = PyUnicode_AsUCS4Copy(unicode); + if (widened == NULL) { + return NULL; + } + source = ICONV_PIVOT; + data = (const char *)widened; + unit = 4; +#endif + } + + iconv_t cd = iconv_open_or_set_error(encoding, source, encoding); + if (cd == (iconv_t)-1) { + PyMem_Free(widened); + return NULL; + } + + PyBytesWriter *writer = NULL; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + PyObject *result = NULL; + const char *ustart = data; + const char *up = data; + const char *uend = data + (size_t)ulen * unit; + int flushing = 0; + + /* A generous initial estimate for the output size. 
*/ + writer = PyBytesWriter_Create(ulen + (ulen >> 1) + 16); + if (writer == NULL) { + goto done; + } + char *out = PyBytesWriter_GetData(writer); + char *outend = out + PyBytesWriter_GetSize(writer); + + for (;;) { + char *inptr = (char *)up; + size_t inleft = (size_t)(uend - up); + size_t outleft = (size_t)(outend - out); + /* When the whole string is converted, a final iconv() call with a + NULL input flushes any pending shift sequence (e.g. ISO-2022). */ + size_t ret = iconv(cd, flushing ? NULL : &inptr, &inleft, &out, &outleft); + if (!flushing) { + up = inptr; + } + + if (ret != (size_t)-1) { + if (flushing) { + break; + } + /* All input consumed; switch to flushing the shift state. */ + flushing = 1; + continue; + } + + if (errno == E2BIG) { + if (iconv_grow_writer(writer, &out, &outend) < 0) { + goto done; + } + continue; + } + if (errno != EILSEQ && errno != EINVAL) { + PyErr_SetFromErrno(PyExc_OSError); + goto done; + } + + /* An unencodable code point at *up; one input unit is one code point. */ + Py_ssize_t pos = (up - ustart) / unit; + Py_ssize_t newpos; + PyObject *rep = unicode_encode_call_errorhandler( + errors, &errorHandler, encoding, "invalid character", + unicode, &exc, pos, pos + 1, &newpos); + if (rep == NULL) { + goto done; + } + + const char *repdata; + Py_ssize_t replen; + PyObject *repbytes = NULL; + if (PyBytes_Check(rep)) { + repdata = PyBytes_AS_STRING(rep); + replen = PyBytes_GET_SIZE(rep); + } + else { + /* A str replacement is encoded through the same codec. 
+ assert(PyUnicode_Check(rep)); + repbytes = _PyUnicode_EncodeIconv(encoding, rep, "strict"); /* strict: never re-enter the handler */ + Py_DECREF(rep); + if (repbytes == NULL) { + goto done; + } + repdata = PyBytes_AS_STRING(repbytes); + replen = PyBytes_GET_SIZE(repbytes); + } + + while (outend - out < replen) { + if (iconv_grow_writer(writer, &out, &outend) < 0) { + if (repbytes != NULL) { + Py_DECREF(repbytes); + } + else { + Py_DECREF(rep); + } + goto done; + } + } + memcpy(out, repdata, replen); + out += replen; + if (repbytes != NULL) { + Py_DECREF(repbytes); + } + else { + Py_DECREF(rep); + } + up = ustart + (size_t)newpos * unit; + /* Reset the shift state after the injected replacement bytes. */ + iconv(cd, NULL, NULL, NULL, NULL); + } + + if (PyBytesWriter_Resize(writer, out - (char *)PyBytesWriter_GetData(writer)) < 0) { + goto done; + } + result = PyBytesWriter_Finish(writer); + writer = NULL; + +done: + if (writer != NULL) { + PyBytesWriter_Discard(writer); + } + iconv_close(cd); + PyMem_Free(widened); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return result; +} + +#endif /* HAVE_ICONV */ + /* --- Character Mapping Codec -------------------------------------------- */ static int diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 17b98c9d9ec345..5a47a263516ae0 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -334,6 +334,11 @@ $(IntDir)encodings._win_cp_codecs.g.h $(GeneratedFrozenModulesDir)Python\frozen_modules\encodings._win_cp_codecs.h + + encodings._iconv_codecs + $(IntDir)encodings._iconv_codecs.g.h + $(GeneratedFrozenModulesDir)Python\frozen_modules\encodings._iconv_codecs.h + + io $(IntDir)io.g.h diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index af3fded0dabf2d..83a2d60ec847b3 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -561,6 +561,9 @@ Python Files + + Python Files + Python Files diff --git a/Python/frozen.c 
b/Python/frozen.c index 1fae26f8dbccb0..03ee612252343d 100644 --- a/Python/frozen.c +++ b/Python/frozen.c @@ -50,6 +50,7 @@ #include "frozen_modules/encodings.aliases.h" #include "frozen_modules/encodings.utf_8.h" #include "frozen_modules/encodings._win_cp_codecs.h" +#include "frozen_modules/encodings._iconv_codecs.h" #include "frozen_modules/io.h" #include "frozen_modules/_collections_abc.h" #include "frozen_modules/_sitebuiltins.h" @@ -85,6 +86,7 @@ static const struct _frozen stdlib_modules[] = { {"encodings.aliases", _Py_M__encodings_aliases, (int)sizeof(_Py_M__encodings_aliases), false}, {"encodings.utf_8", _Py_M__encodings_utf_8, (int)sizeof(_Py_M__encodings_utf_8), false}, {"encodings._win_cp_codecs", _Py_M__encodings__win_cp_codecs, (int)sizeof(_Py_M__encodings__win_cp_codecs), false}, + {"encodings._iconv_codecs", _Py_M__encodings__iconv_codecs, (int)sizeof(_Py_M__encodings__iconv_codecs), false}, {"io", _Py_M__io, (int)sizeof(_Py_M__io), false}, /* stdlib - startup, with site */ diff --git a/Tools/build/freeze_modules.py b/Tools/build/freeze_modules.py index b8b17ceb4f4291..b29cc730ca8b24 100644 --- a/Tools/build/freeze_modules.py +++ b/Tools/build/freeze_modules.py @@ -54,6 +54,7 @@ 'encodings.aliases', 'encodings.utf_8', 'encodings._win_cp_codecs', + 'encodings._iconv_codecs', 'io', ]), ('stdlib - startup, with site', [ diff --git a/configure b/configure index 01faef615a3d5e..89207d335746ae 100755 --- a/configure +++ b/configure @@ -14640,6 +14640,98 @@ then : fi # Dynamic linking for HP-UX +ac_fn_c_check_header_compile "$LINENO" "iconv.h" "ac_cv_header_iconv_h" "$ac_includes_default" +if test "x$ac_cv_header_iconv_h" = xyes +then : + printf "%s\n" "#define HAVE_ICONV_H 1" >>confdefs.h + +fi + +case $ac_sys_system in #( + Emscripten|WASI) : + py_have_iconv=no ;; #( + *) : + py_have_iconv=$ac_cv_header_iconv_h ;; +esac +if test "$py_have_iconv" = yes; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for iconv" >&5 +printf %s "checking for 
iconv... " >&6; } +if test ${ac_cv_have_iconv+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ac_cv_have_iconv=no + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include +#include + +int +main (void) +{ + + iconv_t cd = iconv_open("", ""); + iconv(cd, NULL, NULL, NULL, NULL); + iconv_close(cd); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + ac_cv_have_iconv=yes +else case e in #( + e) + py_save_LIBS="$LIBS" + LIBS="-liconv $LIBS" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#include +#include + +int +main (void) +{ + + iconv_t cd = iconv_open("", ""); + iconv(cd, NULL, NULL, NULL, NULL); + iconv_close(cd); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + ac_cv_have_iconv=-liconv +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + LIBS="$py_save_LIBS" + ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_have_iconv" >&5 +printf "%s\n" "$ac_cv_have_iconv" >&6; } + if test "$ac_cv_have_iconv" != no; then + +printf "%s\n" "#define HAVE_ICONV 1" >>confdefs.h + + if test "$ac_cv_have_iconv" = -liconv; then + LIBS="-liconv $LIBS" + fi + fi +fi + for ac_header in execinfo.h link.h dlfcn.h do : diff --git a/configure.ac b/configure.ac index a9fe5c269618fc..0ae58cff4f61ad 100644 --- a/configure.ac +++ b/configure.ac @@ -3913,6 +3913,48 @@ AC_CHECK_LIB([sendfile], [sendfile]) AC_CHECK_LIB([dl], [dlopen]) # Dynamic linking for SunOS/Solaris and SYSV AC_CHECK_LIB([dld], [shl_load]) # Dynamic linking for HP-UX +dnl Check for a working iconv() for the iconv codec. On glibc it is part of +dnl the C library; on some systems it lives in a separate libiconv. 
+dnl The wasm platforms (Emscripten, WASI) ship a non-conforming iconv() that +dnl does not report unencodable characters, which would make the iconv codec +dnl silently lossy, so treat iconv as unavailable there. +AC_CHECK_HEADERS([iconv.h]) +AS_CASE([$ac_sys_system], + [Emscripten|WASI], [py_have_iconv=no], + [py_have_iconv=$ac_cv_header_iconv_h]) +if test "$py_have_iconv" = yes; then + AC_CACHE_CHECK([for iconv], [ac_cv_have_iconv], [ + ac_cv_have_iconv=no + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +#include +#include + ]], [[ + iconv_t cd = iconv_open("", ""); + iconv(cd, NULL, NULL, NULL, NULL); + iconv_close(cd); + ]])], [ac_cv_have_iconv=yes], [ + py_save_LIBS="$LIBS" + LIBS="-liconv $LIBS" + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +#include +#include + ]], [[ + iconv_t cd = iconv_open("", ""); + iconv(cd, NULL, NULL, NULL, NULL); + iconv_close(cd); + ]])], [ac_cv_have_iconv=-liconv]) + LIBS="$py_save_LIBS" + ]) + ]) + if test "$ac_cv_have_iconv" != no; then + AC_DEFINE([HAVE_ICONV], [1], + [Define if you have a working iconv() function.]) + if test "$ac_cv_have_iconv" = -liconv; then + LIBS="-liconv $LIBS" + fi + fi +fi + dnl for faulthandler AC_CHECK_HEADERS([execinfo.h link.h dlfcn.h], [ diff --git a/pyconfig.h.in b/pyconfig.h.in index ce97099315bfe4..2d8ea64798a4d3 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -703,6 +703,12 @@ /* Define this if you have le64toh() */ #undef HAVE_HTOLE64 +/* Define if you have a working iconv() function. */ +#undef HAVE_ICONV + +/* Define to 1 if you have the <iconv.h> header file. */ +#undef HAVE_ICONV_H + /* Define to 1 if you have the 'if_nameindex' function. */ #undef HAVE_IF_NAMEINDEX