From 33edfa4a6243563a46508343c3836943073cc020 Mon Sep 17 00:00:00 2001
From: Alec Cooper
Date: Wed, 27 Jan 2016 19:45:37 -0500
Subject: [PATCH 1/3] Add support for Python Bytes/Unicode distinction
---
Lib/python/pystrings.swg | 28 +++++++++++++++++++++++-----
Lib/python/pywstrings.swg | 2 +-
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/Lib/python/pystrings.swg b/Lib/python/pystrings.swg
index a088c4cea..fd37855eb 100644
--- a/Lib/python/pystrings.swg
+++ b/Lib/python/pystrings.swg
@@ -6,13 +6,18 @@ SWIGINTERN int
SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
{
%#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+ if (PyBytes_Check(obj))
+%#else
if (PyUnicode_Check(obj))
+%#endif
%#else
if (PyString_Check(obj))
%#endif
{
char *cstr; Py_ssize_t len;
%#if PY_VERSION_HEX>=0x03000000
+%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
if (!alloc && cptr) {
/* We can't allow converting without allocation, since the internal
representation of string in Python 3 is UCS-2/UCS-4 but we require
@@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
return SWIG_RuntimeError;
}
obj = PyUnicode_AsUTF8String(obj);
- PyBytes_AsStringAndSize(obj, &cstr, &len);
if(alloc) *alloc = SWIG_NEWOBJ;
+%#endif
+ PyBytes_AsStringAndSize(obj, &cstr, &len);
%#else
PyString_AsStringAndSize(obj, &cstr, &len);
%#endif
@@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
*alloc = SWIG_OLDOBJ;
}
} else {
- %#if PY_VERSION_HEX>=0x03000000
- assert(0); /* Should never reach here in Python 3 */
- %#endif
+%#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+ *cptr = PyBytes_AsString(obj);
+%#else
+ assert(0); /* Should never reach here with Unicode strings in Python 3 */
+%#endif
+%#else
*cptr = SWIG_Python_str_AsChar(obj);
+%#endif
}
}
if (psize) *psize = len + 1;
-%#if PY_VERSION_HEX>=0x03000000
+%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
Py_XDECREF(obj);
%#endif
return SWIG_OK;
} else {
%#if defined(SWIG_PYTHON_2_UNICODE)
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once"
+%#endif
%#if PY_VERSION_HEX<0x03000000
if (PyUnicode_Check(obj)) {
char *cstr; Py_ssize_t len;
@@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size)
SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void();
} else {
%#if PY_VERSION_HEX >= 0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+ return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
+%#else
%#if PY_VERSION_HEX >= 0x03010000
return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape");
%#else
return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
%#endif
+%#endif
%#else
return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
%#endif
diff --git a/Lib/python/pywstrings.swg b/Lib/python/pywstrings.swg
index 79f193b61..e64618762 100644
--- a/Lib/python/pywstrings.swg
+++ b/Lib/python/pywstrings.swg
@@ -16,7 +16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc)
{
PyObject *tmp = 0;
int isunicode = PyUnicode_Check(obj);
-%#if PY_VERSION_HEX < 0x03000000
+%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR)
if (!isunicode && PyString_Check(obj)) {
obj = tmp = PyUnicode_FromObject(obj);
isunicode = 1;
From 17a4143dd053eff08df8913993b15738f690c949 Mon Sep 17 00:00:00 2001
From: Alec Cooper
Date: Wed, 27 Jan 2016 20:28:32 -0500
Subject: [PATCH 2/3] Tests for Python Bytes/Unicode distinction
---
Examples/test-suite/python/Makefile.in | 1 +
.../python/python_strict_unicode_runme.py | 79 +++++++++++++++++++
Examples/test-suite/python_strict_unicode.i | 41 ++++++++++
3 files changed, 121 insertions(+)
create mode 100644 Examples/test-suite/python/python_strict_unicode_runme.py
create mode 100644 Examples/test-suite/python_strict_unicode.i
diff --git a/Examples/test-suite/python/Makefile.in b/Examples/test-suite/python/Makefile.in
index 096e624ac..0c47d19ce 100644
--- a/Examples/test-suite/python/Makefile.in
+++ b/Examples/test-suite/python/Makefile.in
@@ -65,6 +65,7 @@ CPP_TEST_CASES += \
python_overload_simple_cast \
python_pythoncode \
python_richcompare \
+ python_strict_unicode \
simutry \
std_containers \
swigobject \
diff --git a/Examples/test-suite/python/python_strict_unicode_runme.py b/Examples/test-suite/python/python_strict_unicode_runme.py
new file mode 100644
index 000000000..642e127fa
--- /dev/null
+++ b/Examples/test-suite/python/python_strict_unicode_runme.py
@@ -0,0 +1,79 @@
+import python_strict_unicode
+from sys import version_info
+
+test_bytes = 'hello \x01world\x99'
+BYTES = 'BYTES'
+test_unicode = u'h\udce9llo w\u00f6rld'  # contains U+DCE9 (a lone surrogate) and U+00F6
+
+# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError,
+# so instead create Python3 bytes objects by encoding unicode strings as
+# latin-1, which maps code points 0-255 directly to the corresponding bytes.
+if version_info[0] >= 3:
+ test_bytes = test_bytes.encode('latin-1')
+ BYTES = BYTES.encode('latin-1')
+
+# Test that byte string inputs and outputs work as expected
+bdbl = python_strict_unicode.double_str(test_bytes)
+if bdbl != test_bytes + test_bytes:
+ raise RuntimeError("Failed to double string")
+if type(bdbl) != type(BYTES):
+ raise RuntimeError("Wrong type output for string")
+bout = python_strict_unicode.same_str(test_bytes)
+if bout != test_bytes:
+ raise RuntimeError("Failed to copy char*")
+if type(bout) != type(BYTES):
+ raise RuntimeError("Wrong type output for char*")
+
+# Test that unicode string inputs and outputs work as expected
+udbl = python_strict_unicode.double_wstr(test_unicode)
+if udbl != test_unicode + test_unicode:
+ raise RuntimeError("Failed to double wide string")
+if type(udbl) != type(u''):
+ raise RuntimeError("Wrong type output for wide string")
+uout = python_strict_unicode.same_wstr(test_unicode)
+if uout != test_unicode:
+ raise RuntimeError("Failed to copy wchar_t*")
+if type(uout) != type(u''):
+ raise RuntimeError("Wrong type output for wchar_t*")
+
+# Test that overloading is handled properly
+bovr = python_strict_unicode.overload(test_bytes)
+if bovr != BYTES:
+ raise RuntimeError("Failed to return bytes from overload")
+if type(bovr) != type(BYTES):
+ raise RuntimeError("Wrong type output from overload")
+uovr = python_strict_unicode.overload(test_unicode)
+if uovr != u'UNICODE':
+ raise RuntimeError("Failed to return unicode from overload")
+if type(uovr) != type(u''):
+ raise RuntimeError("Wrong type output from overload")
+
+# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings
+try:
+ python_strict_unicode.double_str(test_unicode)
+ error = 1  # only reached when the call did not raise
+except TypeError:
+ error = 0  # rejection with TypeError is the expected outcome
+if error:
+ raise RuntimeError("Unicode accepted for string")
+try:
+ python_strict_unicode.same_str(test_unicode)
+ error = 1
+except TypeError:
+ error = 0
+if error:
+ raise RuntimeError("Unicode accepted for char*")
+try:
+ python_strict_unicode.double_wstr(test_bytes)
+ error = 1
+except TypeError:
+ error = 0
+if error:
+ raise RuntimeError("Bytes accepted for wstring")
+try:
+ python_strict_unicode.same_wstr(test_bytes)
+ error = 1
+except TypeError:
+ error = 0
+if error:
+ raise RuntimeError("Bytes accepted for wchar_t*")
diff --git a/Examples/test-suite/python_strict_unicode.i b/Examples/test-suite/python_strict_unicode.i
new file mode 100644
index 000000000..93240a9b7
--- /dev/null
+++ b/Examples/test-suite/python_strict_unicode.i
@@ -0,0 +1,41 @@
+%module python_strict_unicode
+
+%include <std_string.i>
+%include <std_wstring.i>
+
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+%inline %{
+std::string double_str(const std::string& in)
+{
+ return in + in;
+}
+
+char *same_str(char* in)
+{
+ return in;
+}
+
+std::wstring double_wstr(const std::wstring& in)
+{
+ return in + in;
+}
+
+wchar_t *same_wstr(wchar_t* in)
+{
+ return in;
+}
+
+std::wstring overload(const std::wstring& in)
+{
+ return L"UNICODE"; // fixed marker: lets the test tell which overload ran
+}
+
+std::string overload(const std::string& in)
+{
+ return "BYTES"; // fixed marker: lets the test tell which overload ran
+}
+%}
From ba40c4a2562c935dd2846dd15cdfc55952c7b8f6 Mon Sep 17 00:00:00 2001
From: Alec Cooper
Date: Thu, 28 Jan 2016 08:29:01 -0500
Subject: [PATCH 3/3] Documentation on Python Bytes/Unicode distinction
---
CHANGES.current | 5 +++
Doc/Manual/Python.html | 85 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+)
diff --git a/CHANGES.current b/CHANGES.current
index 2a0017822..f99e70398 100644
--- a/CHANGES.current
+++ b/CHANGES.current
@@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release.
Version 3.0.9 (in progress)
===========================
+2016-01-27: ahnolds
+ [Python] Added support for differentiating between Python Bytes
+ and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR
+ and SWIG_PYTHON_STRICT_UNICODE_WCHAR.
+
2016-01-27: steeve
[Go] Ensure structs are properly packed between gc and GCC/clang.
diff --git a/Doc/Manual/Python.html b/Doc/Manual/Python.html
index c691d89cf..0cd656ff2 100644
--- a/Doc/Manual/Python.html
+++ b/Doc/Manual/Python.html
@@ -6165,6 +6165,84 @@ For more details about the surrogateescape error handler, please see
PEP 383.
+
+In some cases, users may wish to instead handle all byte strings as bytes
+objects in Python 3. This can be accomplished by adding
+SWIG_PYTHON_STRICT_BYTE_CHAR to the generated code:
+
+
+
+%module char_to_bytes
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+%}
+
+char *charstring(char *s) {
+ return s;
+}
+
+
+
+This will modify the behavior so that only Python 3 bytes objects will be
+accepted and converted to a C/C++ string, and any string returned from C/C++
+will be converted to a bytes object in Python 3:
+
+
+
+>>> from char_to_bytes import *
+>>> charstring(b"hi") # Byte string
+b'hi'
+>>> charstring("hi") # Unicode string
+Traceback (most recent call last):
+ File "<stdin>", line 1, in ?
+TypeError: in method 'charstring', argument 1 of type 'char *'
+
+
+
+Note that in Python 2, defining SWIG_PYTHON_STRICT_BYTE_CHAR has no
+effect, since strings in Python 2 are equivalent to Python 3 bytes objects.
+However, there is a similar capability to force unicode-only handling for
+wide character C/C++ strings (wchar_t * or std::wstring
+types) in Python 2. By default, in Python 2 both strings and unicode strings
+are converted to C/C++ wide strings, and returned wide strings are converted
+to a Python unicode string. To instead only convert unicode strings to wide
+strings, users can add SWIG_PYTHON_STRICT_UNICODE_WCHAR to the
+generated code:
+
+
+
+%module wchar_to_unicode
+%begin %{
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+wchar_t *wcharstring(wchar_t *s) {
+ return s;
+}
+
+
+
+This ensures that only unicode strings are accepted by wcharstring in both
+Python 2 and Python 3:
+
+
+
+>>> from wchar_to_unicode import *
+>>> wcharstring(u"hi") # Unicode string
+u'hi'
+>>> wcharstring(b"hi") # Byte string
+Traceback (most recent call last):
+ File "<stdin>", line 1, in ?
+TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
+
+
+
+By defining both SWIG_PYTHON_STRICT_BYTE_CHAR and
+SWIG_PYTHON_STRICT_UNICODE_WCHAR, Python wrapper code can support
+overloads taking both std::string (as Python bytes) and std::wstring
+(as Python unicode).
+
+
@@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string:
>>>
+
+Note that defining both SWIG_PYTHON_2_UNICODE and
+SWIG_PYTHON_STRICT_BYTE_CHAR at the same time is not allowed, since
+the first allows unicode conversion and the second explicitly
+prohibits it.
+
+