diff --git a/CHANGES.current b/CHANGES.current index 2a0017822..f99e70398 100644 --- a/CHANGES.current +++ b/CHANGES.current @@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release. Version 3.0.9 (in progress) =========================== +2016-01-27: ahnolds + [Python] Added support for differentiating between Python Bytes + and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR + and SWIG_PYTHON_STRICT_UNICODE_WCHAR. + 2016-01-27: steeve [Go] Ensure structs are properly packed between gc and GCC/clang. diff --git a/Doc/Manual/Python.html b/Doc/Manual/Python.html index c691d89cf..0cd656ff2 100644 --- a/Doc/Manual/Python.html +++ b/Doc/Manual/Python.html @@ -6165,6 +6165,84 @@ For more details about the surrogateescape error handler, please see PEP 383.

+

+In some cases, users may wish to instead handle all byte strings as bytes +objects in Python 3. This can be accomplished by adding +SWIG_PYTHON_STRICT_BYTE_CHAR to the generated code: +

+ +
+%module char_to_bytes
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+%}
+
+char *charstring(char *s) {
+  return s;
+}
+
+ +

+This will modify the behavior so that only Python 3 bytes objects will be +accepted and converted to a C/C++ string, and any string returned from C/C++ +will be converted to a bytes object in Python 3: +

+ +
+>>> from char_to_bytes import *
+>>> charstring(b"hi") # Byte string
+b'hi'
+>>> charstring("hi")  # Unicode string
+Traceback (most recent call last):
+  File "<stdin>", line 1, in ?
+TypeError: in method 'charstring', argument 1 of type 'char *'
+
+ +

+Note that in Python 2, defining SWIG_PYTHON_STRICT_BYTE_CHAR has no +effect, since strings in Python 2 are equivalent to Python 3 bytes objects. +However, there is a similar capability to force unicode-only handling for +wide character C/C++ strings (wchar_t * or std::wstring +types) in Python 2. By default, in Python 2 both strings and unicode strings +are converted to C/C++ wide strings, and returned wide strings are converted +to a Python unicode string. To instead only convert unicode strings to wide +strings, users can add SWIG_PYTHON_STRICT_UNICODE_WCHAR to the +generated code: +

+ +
+%module wchar_to_unicode
+%begin %{
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+wchar_t *wcharstring(wchar_t *s) {
+  return s;
+}
+
+ +

+This ensures that only unicode strings are accepted by wcharstring in both +Python 2 and Python 3: +

+ +
+>>> from wchar_to_unicode import *
+>>> wcharstring(u"hi") # Unicode string
+u'hi'
+>>> wcharstring(b"hi") # Byte string
+Traceback (most recent call last):
+  File "<stdin>", line 1, in ?
+TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
+
+ +

+By defining both SWIG_PYTHON_STRICT_BYTE_CHAR and +SWIG_PYTHON_STRICT_UNICODE_WCHAR, Python wrapper code can support +overloads taking both std::string (as Python bytes) and std::wstring +(as Python unicode). +

+

36.12.5 Python 2 Unicode

@@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string: >>> +

+Note that defining both SWIG_PYTHON_2_UNICODE and +SWIG_PYTHON_STRICT_BYTE_CHAR at the same time is not allowed, since +the first is allowing unicode conversion and the second is explicitly +prohibiting it. +

+ diff --git a/Examples/test-suite/python/Makefile.in b/Examples/test-suite/python/Makefile.in index 096e624ac..0c47d19ce 100644 --- a/Examples/test-suite/python/Makefile.in +++ b/Examples/test-suite/python/Makefile.in @@ -65,6 +65,7 @@ CPP_TEST_CASES += \ python_overload_simple_cast \ python_pythoncode \ python_richcompare \ + python_strict_unicode \ simutry \ std_containers \ swigobject \ diff --git a/Examples/test-suite/python/python_strict_unicode_runme.py b/Examples/test-suite/python/python_strict_unicode_runme.py new file mode 100644 index 000000000..642e127fa --- /dev/null +++ b/Examples/test-suite/python/python_strict_unicode_runme.py @@ -0,0 +1,79 @@ +import python_strict_unicode +from sys import version_info + +test_bytes = 'hello \x01world\x99' +BYTES = 'BYTES' +test_unicode = u'h\udce9llo w\u00f6rld' + +# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError, +# so instead create Python3 bytes objects by encoding unicode strings as +# latin-1, which maps code points 0-255 directly to the corresponding bytes. 
+if version_info[0] >= 3: + test_bytes = test_bytes.encode('latin-1') + BYTES = BYTES.encode('latin-1') + +# Test that byte string inputs and outputs work as expected +bdbl = python_strict_unicode.double_str(test_bytes) +if bdbl != test_bytes + test_bytes: + raise RuntimeError("Failed to double string") +if type(bdbl) != type(BYTES): + raise RuntimeError("Wrong type output for string") +bout = python_strict_unicode.same_str(test_bytes) +if bout != test_bytes: + raise RuntimeError("Failed to copy char*") +if type(bout) != type(BYTES): + raise RuntimeError("Wrong type output for char*") + +# Test that unicode string inputs and outputs work as expected +udbl = python_strict_unicode.double_wstr(test_unicode) +if udbl != test_unicode + test_unicode: + raise RuntimeError("Failed to double wide string") +if type(udbl) != type(u''): + raise RuntimeError("Wrong type output for wide string") +uout = python_strict_unicode.same_wstr(test_unicode) +if uout != test_unicode: + raise RuntimeError("Failed to copy wchar_t*") +if type(uout) != type(u''): + raise RuntimeError("Wrong type output for wchar_t*") + +# Test that overloading is handled properly +bovr = python_strict_unicode.overload(test_bytes) +if bovr != BYTES: + raise RuntimeError("Failed to return bytes from overload") +if type(bovr) != type(BYTES): + raise RuntimeError("Wrong type output from overload") +uovr = python_strict_unicode.overload(test_unicode) +if uovr != u'UNICODE': + raise RuntimeError("Failed to return unicode from overload") +if type(uovr) != type(u''): + raise RuntimeError("Wrong type output from overload") + +# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings +try: + python_strict_unicode.double_str(test_unicode) + error = 1 +except TypeError: + error = 0 +if error: + raise RuntimeError("Unicode 
accepted for char*") +try: + python_strict_unicode.double_wstr(test_bytes) + error = 1 +except TypeError: + error = 0 +if error: + raise RuntimeError("Bytes accepted for wstring") +try: + python_strict_unicode.same_wstr(test_bytes) + error = 1 +except TypeError: + error = 0 +if error: + raise RuntimeError("Bytes accepted for wchar_t*") diff --git a/Examples/test-suite/python_strict_unicode.i b/Examples/test-suite/python_strict_unicode.i new file mode 100644 index 000000000..93240a9b7 --- /dev/null +++ b/Examples/test-suite/python_strict_unicode.i @@ -0,0 +1,41 @@ +%module python_strict_unicode + +%include +%include + +%begin %{ +#define SWIG_PYTHON_STRICT_BYTE_CHAR +#define SWIG_PYTHON_STRICT_UNICODE_WCHAR +%} + +%inline %{ +std::string double_str(const std::string& in) +{ + return in + in; +} + +char *same_str(char* in) +{ + return in; +} + +std::wstring double_wstr(const std::wstring& in) +{ + return in + in; +} + +wchar_t *same_wstr(wchar_t* in) +{ + return in; +} + +std::wstring overload(const std::wstring& in) +{ + return L"UNICODE"; +} + +std::string overload(const std::string& in) +{ + return "BYTES"; +} +%} diff --git a/Lib/python/pystrings.swg b/Lib/python/pystrings.swg index a088c4cea..fd37855eb 100644 --- a/Lib/python/pystrings.swg +++ b/Lib/python/pystrings.swg @@ -6,13 +6,18 @@ SWIGINTERN int SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) { %#if PY_VERSION_HEX>=0x03000000 +%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + if (PyBytes_Check(obj)) +%#else if (PyUnicode_Check(obj)) +%#endif %#else if (PyString_Check(obj)) %#endif { char *cstr; Py_ssize_t len; %#if PY_VERSION_HEX>=0x03000000 +%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) if (!alloc && cptr) { /* We can't allow converting without allocation, since the internal representation of string in Python 3 is UCS-2/UCS-4 but we require @@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) return SWIG_RuntimeError; } obj = 
PyUnicode_AsUTF8String(obj); - PyBytes_AsStringAndSize(obj, &cstr, &len); if(alloc) *alloc = SWIG_NEWOBJ; +%#endif + PyBytes_AsStringAndSize(obj, &cstr, &len); %#else PyString_AsStringAndSize(obj, &cstr, &len); %#endif @@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) *alloc = SWIG_OLDOBJ; } } else { - %#if PY_VERSION_HEX>=0x03000000 - assert(0); /* Should never reach here in Python 3 */ - %#endif +%#if PY_VERSION_HEX>=0x03000000 +%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + *cptr = PyBytes_AsString(obj); +%#else + assert(0); /* Should never reach here with Unicode strings in Python 3 */ +%#endif +%#else *cptr = SWIG_Python_str_AsChar(obj); +%#endif } } if (psize) *psize = len + 1; -%#if PY_VERSION_HEX>=0x03000000 +%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR) Py_XDECREF(obj); %#endif return SWIG_OK; } else { %#if defined(SWIG_PYTHON_2_UNICODE) +%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) +%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once" +%#endif %#if PY_VERSION_HEX<0x03000000 if (PyUnicode_Check(obj)) { char *cstr; Py_ssize_t len; @@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size) SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void(); } else { %#if PY_VERSION_HEX >= 0x03000000 +%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR) + return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t)); +%#else %#if PY_VERSION_HEX >= 0x03010000 return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape"); %#else return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t)); %#endif +%#endif %#else return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t)); %#endif diff --git a/Lib/python/pywstrings.swg b/Lib/python/pywstrings.swg index 79f193b61..e64618762 100644 --- a/Lib/python/pywstrings.swg +++ b/Lib/python/pywstrings.swg @@ -16,7 
+16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc) { PyObject *tmp = 0; int isunicode = PyUnicode_Check(obj); -%#if PY_VERSION_HEX < 0x03000000 +%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR) if (!isunicode && PyString_Check(obj)) { obj = tmp = PyUnicode_FromObject(obj); isunicode = 1;