Merge branch 'ahnolds-pyunicode_wstrings'

* ahnolds-pyunicode_wstrings: Documentation on Python Bytes/Unicode distinction Tests for Python Bytes/Unicode distinction Add support for Python Bytes/Unicode distinction
2016-02-06 07:57:30 +00:00 · 2016-02-06 07:57:30 +00:00 · 5c758f6972
commit 5c758f6972
parent 439f049115 ba40c4a256
7 changed files with 235 additions and 6 deletions
--- a/CHANGES.current
+++ b/CHANGES.current
@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release.
 Version 3.0.9 (in progress)
 ===========================

+2016-01-27: ahnolds
+            [Python] Added support for differentiating between Python Bytes
+            and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR
+            and SWIG_PYTHON_STRICT_UNICODE_WCHAR.
+
 2016-01-27: steeve
            [Go] Ensure structs are properly packed between gc and GCC/clang.

--- a/Doc/Manual/Python.html
+++ b/Doc/Manual/Python.html
@ -6165,6 +6165,84 @@ For more details about the <tt>surrogateescape</tt> error handler, please see
 <a href="https://www.python.org/dev/peps/pep-0383/">PEP 383</a>.
 </p>

+<p>
+In some cases, users may wish to instead handle all byte strings as bytes
+objects in Python 3. This can be accomplished by adding
+<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> to the generated code:
+</p>
+
+<div class="code"><pre>
+%module char_to_bytes
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+%}
+
+char *charstring(char *s) {
+  return s;
+}
+</pre></div>
+
+<p>
+This will modify the behavior so that only Python 3 bytes objects will be
+accepted and converted to a C/C++ string, and any string returned from C/C++
+will be converted to a bytes object in Python 3:
+</p>
+
+<div class="targetlang"><pre>
+&gt;&gt;&gt; from char_to_bytes import *
+&gt;&gt;&gt; charstring(b"hi") # Byte string
+b'hi'
+&gt;&gt;&gt; charstring("hi")  # Unicode string
+Traceback (most recent call last):
+  File "&lt;stdin&gt;", line 1, in ?
+TypeError: in method 'charstring', argument 1 of type 'char *'
+</pre></div>
+
+<p>
+Note that in Python 2, defining <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> has no
+effect, since strings in Python 2 are equivalent to Python 3 bytes objects.
+However, there is a similar capability to force unicode-only handling for
+wide character C/C++ strings (<tt>wchar_t *</tt> or <tt>std::wstring</tt>
+types) in Python 2. By default, in Python 2 both strings and unicode strings
+are converted to C/C++ wide strings, and returned wide strings are converted
+to a Python unicode string. To instead only convert unicode strings to wide
+strings, users can add <tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt> to the
+generated code:
+</p>
+
+<div class="code"><pre>
+%module wchar_to_unicode
+%begin %{
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+wchar_t *wcharstring(wchar_t *s) {
+  return s;
+}
+</pre></div>
+
+<p>
+This ensures that only unicode strings are accepted by wcharstring in both
+Python 2 and Python 3:
+</p>
+
+<div class="targetlang"><pre>
+&gt;&gt;&gt; from wchar_to_unicode import *
+&gt;&gt;&gt; wcharstring(u"hi") # Unicode string
+u'hi'
+&gt;&gt;&gt; wcharstring(b"hi") # Byte string
+Traceback (most recent call last):
+  File "&lt;stdin&gt;", line 1, in ?
+TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
+</pre></div>
+
+<p>
+By defining both <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> and
+<tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt>, Python wrapper code can support
+overloads taking both std::string (as Python bytes) and std::wstring
+(as Python unicode).
+</p>
+
 <H3><a name="Python_2_unicode">36.12.5 Python 2 Unicode</a></H3>


@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string:
 &gt;&gt;&gt;
 </pre></div>

+<p>
+Note that defining both <tt>SWIG_PYTHON_2_UNICODE</tt> and
+<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> at the same time is not allowed, since
+the first allows unicode conversion and the second explicitly
+prohibits it.
+</p>
+
 </body>
 </html>

--- a/Examples/test-suite/python/Makefile.in
+++ b/Examples/test-suite/python/Makefile.in
@ -65,6 +65,7 @@ CPP_TEST_CASES += \
 	python_overload_simple_cast \
 	python_pythoncode \
 	python_richcompare \
+	python_strict_unicode \
 	simutry \
 	std_containers \
 	swigobject \
--- a/Examples/test-suite/python/python_strict_unicode_runme.py
+++ b/Examples/test-suite/python/python_strict_unicode_runme.py
@ -0,0 +1,79 @@
+import python_strict_unicode
+from sys import version_info
+
+test_bytes   =  'hello \x01world\x99'
+BYTES        =  'BYTES'
+test_unicode = u'h\udce9llo w\u00f6rld'
+
+# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError,
+# so instead create Python3 bytes objects by encoding unicode strings as
+# latin-1, which maps code points 0-255 directly to the corresponding bytes.
+if version_info[0] >= 3:
+    test_bytes = test_bytes.encode('latin-1')
+    BYTES      = BYTES.encode('latin-1')
+
+# Test that byte string inputs and outputs work as expected
+bdbl = python_strict_unicode.double_str(test_bytes)
+if bdbl != test_bytes + test_bytes:
+    raise RuntimeError("Failed to double string")
+if type(bdbl) != type(BYTES):
+    raise RuntimeError("Wrong type output for string")
+bout = python_strict_unicode.same_str(test_bytes)
+if bout != test_bytes:
+    raise RuntimeError("Failed to copy char*")
+if type(bout) != type(BYTES):
+    raise RuntimeError("Wrong type output for char*")
+
+# Test that unicode string inputs and outputs work as expected
+udbl = python_strict_unicode.double_wstr(test_unicode)
+if udbl != test_unicode + test_unicode:
+    raise RuntimeError("Failed to double wide string")
+if type(udbl) != type(u''):
+    raise RuntimeError("Wrong type output for wide string")
+uout = python_strict_unicode.same_wstr(test_unicode)
+if uout != test_unicode:
+    raise RuntimeError("Failed to copy wchar_t*")
+if type(uout) != type(u''):
+    raise RuntimeError("Wrong type output for wchar_t*")
+
+# Test that overloading is handled properly
+bovr = python_strict_unicode.overload(test_bytes)
+if bovr != BYTES:
+    raise RuntimeError("Failed to return bytes from overload")
+if type(bovr) != type(BYTES):
+    raise RuntimeError("Wrong type output from overload")
+uovr = python_strict_unicode.overload(test_unicode)
+if uovr != u'UNICODE':
+    raise RuntimeError("Failed to return unicode from overload")
+if type(uovr) != type(u''):
+    raise RuntimeERror("Wrong type output from overload")
+
+# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings
+try:
+    python_strict_unicode.double_str(test_unicode)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Unicode accepted for string")
+try:
+    python_strict_unicode.same_str(test_unicode)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Unicode accepted for char*")
+try:
+    python_strict_unicode.double_wstr(test_bytes)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Bytes accepted for wstring")
+try:
+    python_strict_unicode.same_wstr(test_bytes)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Bytes accepted for wchar_t*")
--- a/Examples/test-suite/python_strict_unicode.i
+++ b/Examples/test-suite/python_strict_unicode.i
@ -0,0 +1,41 @@
+%module python_strict_unicode
+
+%include <std_string.i>
+%include <std_wstring.i>
+
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+%inline %{
+std::string double_str(const std::string& in)
+{
+  return in + in;
+}
+
+char *same_str(char* in)
+{
+  return in;
+}
+
+std::wstring double_wstr(const std::wstring& in)
+{
+  return in + in;
+}
+
+wchar_t *same_wstr(wchar_t* in)
+{
+  return in;
+}
+
+std::wstring overload(const std::wstring& in)
+{
+  return L"UNICODE";
+}
+
+std::string overload(const std::string& in)
+{
+  return "BYTES";
+}
+%}
--- a/Lib/python/pystrings.swg
+++ b/Lib/python/pystrings.swg
@ -6,13 +6,18 @@ SWIGINTERN int
 SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
 {
 %#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+  if (PyBytes_Check(obj))
+%#else
  if (PyUnicode_Check(obj))
+%#endif
 %#else  
  if (PyString_Check(obj))
 %#endif
  {
    char *cstr; Py_ssize_t len;
 %#if PY_VERSION_HEX>=0x03000000
+%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
    if (!alloc && cptr) {
        /* We can't allow converting without allocation, since the internal
           representation of string in Python 3 is UCS-2/UCS-4 but we require
@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
        return SWIG_RuntimeError;
    }
    obj = PyUnicode_AsUTF8String(obj);
-    PyBytes_AsStringAndSize(obj, &cstr, &len);
    if(alloc) *alloc = SWIG_NEWOBJ;
+%#endif
+    PyBytes_AsStringAndSize(obj, &cstr, &len);
 %#else
    PyString_AsStringAndSize(obj, &cstr, &len);
 %#endif
@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
 	  *alloc = SWIG_OLDOBJ;
 	}
      } else {
-	%#if PY_VERSION_HEX>=0x03000000
-	assert(0); /* Should never reach here in Python 3 */
-	%#endif
+%#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+	*cptr = PyBytes_AsString(obj);
+%#else
+	assert(0); /* Should never reach here with Unicode strings in Python 3 */
+%#endif
+%#else
 	*cptr = SWIG_Python_str_AsChar(obj);
+%#endif
      }
    }
    if (psize) *psize = len + 1;
-%#if PY_VERSION_HEX>=0x03000000
+%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
    Py_XDECREF(obj);
 %#endif
    return SWIG_OK;
  } else {
 %#if defined(SWIG_PYTHON_2_UNICODE)
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once"
+%#endif
 %#if PY_VERSION_HEX<0x03000000
    if (PyUnicode_Check(obj)) {
      char *cstr; Py_ssize_t len;
@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size)
 	SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void();
    } else {
 %#if PY_VERSION_HEX >= 0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+      return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
+%#else
 %#if PY_VERSION_HEX >= 0x03010000
      return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape");
 %#else
      return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
 %#endif
+%#endif
 %#else
      return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
 %#endif
--- a/Lib/python/pywstrings.swg
+++ b/Lib/python/pywstrings.swg
@ -16,7 +16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc)
 {
  PyObject *tmp = 0;
  int isunicode = PyUnicode_Check(obj);
-%#if PY_VERSION_HEX < 0x03000000
+%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR)
  if (!isunicode && PyString_Check(obj)) {
    obj = tmp = PyUnicode_FromObject(obj);
    isunicode = 1;