From 33edfa4a6243563a46508343c3836943073cc020 Mon Sep 17 00:00:00 2001
From: Alec Cooper <ahnolds@gmail.com>
Date: Wed, 27 Jan 2016 19:45:37 -0500
Subject: [PATCH 1/3] Add support for Python Bytes/Unicode distinction

---
 Lib/python/pystrings.swg  | 28 +++++++++++++++++++++++-----
 Lib/python/pywstrings.swg |  2 +-
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/Lib/python/pystrings.swg b/Lib/python/pystrings.swg
index a088c4cea..fd37855eb 100644
--- a/Lib/python/pystrings.swg
+++ b/Lib/python/pystrings.swg
@@ -6,13 +6,18 @@ SWIGINTERN int
 SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
 {
 %#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+  if (PyBytes_Check(obj))
+%#else
   if (PyUnicode_Check(obj))
+%#endif
 %#else  
   if (PyString_Check(obj))
 %#endif
   {
     char *cstr; Py_ssize_t len;
 %#if PY_VERSION_HEX>=0x03000000
+%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
     if (!alloc && cptr) {
         /* We can't allow converting without allocation, since the internal
            representation of string in Python 3 is UCS-2/UCS-4 but we require
@@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
         return SWIG_RuntimeError;
     }
     obj = PyUnicode_AsUTF8String(obj);
-    PyBytes_AsStringAndSize(obj, &cstr, &len);
     if(alloc) *alloc = SWIG_NEWOBJ;
+%#endif
+    PyBytes_AsStringAndSize(obj, &cstr, &len);
 %#else
     PyString_AsStringAndSize(obj, &cstr, &len);
 %#endif
@@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
 	  *alloc = SWIG_OLDOBJ;
 	}
       } else {
-	%#if PY_VERSION_HEX>=0x03000000
-	assert(0); /* Should never reach here in Python 3 */
-	%#endif
+%#if PY_VERSION_HEX>=0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+	*cptr = PyBytes_AsString(obj);
+%#else
+	assert(0); /* Should never reach here with Unicode strings in Python 3 */
+%#endif
+%#else
 	*cptr = SWIG_Python_str_AsChar(obj);
+%#endif
       }
     }
     if (psize) *psize = len + 1;
-%#if PY_VERSION_HEX>=0x03000000
+%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
     Py_XDECREF(obj);
 %#endif
     return SWIG_OK;
   } else {
 %#if defined(SWIG_PYTHON_2_UNICODE)
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once"
+%#endif
 %#if PY_VERSION_HEX<0x03000000
     if (PyUnicode_Check(obj)) {
       char *cstr; Py_ssize_t len;
@@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size)
 	SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void();
     } else {
 %#if PY_VERSION_HEX >= 0x03000000
+%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
+      return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
+%#else
 %#if PY_VERSION_HEX >= 0x03010000
       return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape");
 %#else
       return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
 %#endif
+%#endif
 %#else
       return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
 %#endif
diff --git a/Lib/python/pywstrings.swg b/Lib/python/pywstrings.swg
index 79f193b61..e64618762 100644
--- a/Lib/python/pywstrings.swg
+++ b/Lib/python/pywstrings.swg
@@ -16,7 +16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc)
 {
   PyObject *tmp = 0;
   int isunicode = PyUnicode_Check(obj);
-%#if PY_VERSION_HEX < 0x03000000
+%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR)
   if (!isunicode && PyString_Check(obj)) {
     obj = tmp = PyUnicode_FromObject(obj);
     isunicode = 1;

From 17a4143dd053eff08df8913993b15738f690c949 Mon Sep 17 00:00:00 2001
From: Alec Cooper <ahnolds@gmail.com>
Date: Wed, 27 Jan 2016 20:28:32 -0500
Subject: [PATCH 2/3] Tests for Python Bytes/Unicode distinction

---
 Examples/test-suite/python/Makefile.in        |  1 +
 .../python/python_strict_unicode_runme.py     | 79 +++++++++++++++++++
 Examples/test-suite/python_strict_unicode.i   | 41 ++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 Examples/test-suite/python/python_strict_unicode_runme.py
 create mode 100644 Examples/test-suite/python_strict_unicode.i

diff --git a/Examples/test-suite/python/Makefile.in b/Examples/test-suite/python/Makefile.in
index 096e624ac..0c47d19ce 100644
--- a/Examples/test-suite/python/Makefile.in
+++ b/Examples/test-suite/python/Makefile.in
@@ -65,6 +65,7 @@ CPP_TEST_CASES += \
 	python_overload_simple_cast \
 	python_pythoncode \
 	python_richcompare \
+	python_strict_unicode \
 	simutry \
 	std_containers \
 	swigobject \
diff --git a/Examples/test-suite/python/python_strict_unicode_runme.py b/Examples/test-suite/python/python_strict_unicode_runme.py
new file mode 100644
index 000000000..642e127fa
--- /dev/null
+++ b/Examples/test-suite/python/python_strict_unicode_runme.py
@@ -0,0 +1,79 @@
+import python_strict_unicode
+from sys import version_info
+
+test_bytes   =  'hello \x01world\x99'
+BYTES        =  'BYTES'
+test_unicode = u'h\udce9llo w\u00f6rld'
+
+# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError,
+# so instead create Python3 bytes objects by encoding unicode strings as
+# latin-1, which maps code points 0-255 directly to the corresponding bytes.
+if version_info[0] >= 3:
+    test_bytes = test_bytes.encode('latin-1')
+    BYTES      = BYTES.encode('latin-1')
+
+# Test that byte string inputs and outputs work as expected
+bdbl = python_strict_unicode.double_str(test_bytes)
+if bdbl != test_bytes + test_bytes:
+    raise RuntimeError("Failed to double string")
+if type(bdbl) != type(BYTES):
+    raise RuntimeError("Wrong type output for string")
+bout = python_strict_unicode.same_str(test_bytes)
+if bout != test_bytes:
+    raise RuntimeError("Failed to copy char*")
+if type(bout) != type(BYTES):
+    raise RuntimeError("Wrong type output for char*")
+
+# Test that unicode string inputs and outputs work as expected
+udbl = python_strict_unicode.double_wstr(test_unicode)
+if udbl != test_unicode + test_unicode:
+    raise RuntimeError("Failed to double wide string")
+if type(udbl) != type(u''):
+    raise RuntimeError("Wrong type output for wide string")
+uout = python_strict_unicode.same_wstr(test_unicode)
+if uout != test_unicode:
+    raise RuntimeError("Failed to copy wchar_t*")
+if type(uout) != type(u''):
+    raise RuntimeError("Wrong type output for wchar_t*")
+
+# Test that overload() dispatches on argument type: bytes -> BYTES, unicode -> u'UNICODE'
+bovr = python_strict_unicode.overload(test_bytes)
+if bovr != BYTES:
+    raise RuntimeError("Failed to return bytes from overload")
+if type(bovr) != type(BYTES):
+    raise RuntimeError("Wrong type output from overload")
+uovr = python_strict_unicode.overload(test_unicode)
+if uovr != u'UNICODE':
+    raise RuntimeError("Failed to return unicode from overload")
+if type(uovr) != type(u''):
+    raise RuntimeError("Wrong type output from overload")
+
+# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings
+try:
+    python_strict_unicode.double_str(test_unicode)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Unicode accepted for string")
+try:
+    python_strict_unicode.same_str(test_unicode)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Unicode accepted for char*")
+try:
+    python_strict_unicode.double_wstr(test_bytes)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Bytes accepted for wstring")
+try:
+    python_strict_unicode.same_wstr(test_bytes)
+    error = 1
+except TypeError:
+    error = 0
+if error:
+    raise RuntimeError("Bytes accepted for wchar_t*")
diff --git a/Examples/test-suite/python_strict_unicode.i b/Examples/test-suite/python_strict_unicode.i
new file mode 100644
index 000000000..93240a9b7
--- /dev/null
+++ b/Examples/test-suite/python_strict_unicode.i
@@ -0,0 +1,41 @@
+%module python_strict_unicode
+
+%include <std_string.i>
+%include <std_wstring.i>
+
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+%inline %{
+std::string double_str(const std::string& in)
+{
+  return in + in;
+}
+
+char *same_str(char* in)
+{
+  return in;
+}
+
+std::wstring double_wstr(const std::wstring& in)
+{
+  return in + in;
+}
+
+wchar_t *same_wstr(wchar_t* in)
+{
+  return in;
+}
+
+std::wstring overload(const std::wstring& in)
+{
+  return L"UNICODE";
+}
+
+std::string overload(const std::string& in)
+{
+  return "BYTES";
+}
+%}

From ba40c4a2562c935dd2846dd15cdfc55952c7b8f6 Mon Sep 17 00:00:00 2001
From: Alec Cooper <ahnolds@gmail.com>
Date: Thu, 28 Jan 2016 08:29:01 -0500
Subject: [PATCH 3/3] Documentation on Python Bytes/Unicode distinction

---
 CHANGES.current        |  5 +++
 Doc/Manual/Python.html | 85 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/CHANGES.current b/CHANGES.current
index 2a0017822..f99e70398 100644
--- a/CHANGES.current
+++ b/CHANGES.current
@@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release.
 Version 3.0.9 (in progress)
 ===========================
 
+2016-01-27: ahnolds
+            [Python] Added support for differentiating between Python Bytes
+            and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR
+            and SWIG_PYTHON_STRICT_UNICODE_WCHAR.
+
 2016-01-27: steeve
             [Go] Ensure structs are properly packed between gc and GCC/clang.
 
diff --git a/Doc/Manual/Python.html b/Doc/Manual/Python.html
index c691d89cf..0cd656ff2 100644
--- a/Doc/Manual/Python.html
+++ b/Doc/Manual/Python.html
@@ -6165,6 +6165,84 @@ For more details about the <tt>surrogateescape</tt> error handler, please see
 <a href="https://www.python.org/dev/peps/pep-0383/">PEP 383</a>.
 </p>
 
+<p>
+In some cases, users may wish to instead handle all byte strings as bytes
+objects in Python 3. This can be accomplished by adding
+<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> to the generated code:
+</p>
+
+<div class="code"><pre>
+%module char_to_bytes
+%begin %{
+#define SWIG_PYTHON_STRICT_BYTE_CHAR
+%}
+
+char *charstring(char *s) {
+  return s;
+}
+</pre></div>
+
+<p>
+This will modify the behavior so that only Python 3 bytes objects will be
+accepted and converted to a C/C++ string, and any string returned from C/C++
+will be converted to a bytes object in Python 3:
+</p>
+
+<div class="targetlang"><pre>
+&gt;&gt;&gt; from char_to_bytes import *
+&gt;&gt;&gt; charstring(b"hi") # Byte string
+b'hi'
+&gt;&gt;&gt; charstring("hi")  # Unicode string
+Traceback (most recent call last):
+  File "&lt;stdin&gt;", line 1, in ?
+TypeError: in method 'charstring', argument 1 of type 'char *'
+</pre></div>
+
+<p>
+Note that in Python 2, defining <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> has no
+effect, since strings in Python 2 are equivalent to Python 3 bytes objects.
+However, there is a similar capability to force unicode-only handling for
+wide character C/C++ strings (<tt>wchar_t *</tt> or <tt>std::wstring</tt>
+types) in Python 2. By default, in Python 2 both strings and unicode strings
+are converted to C/C++ wide strings, and returned wide strings are converted
+to a Python unicode string. To instead only convert unicode strings to wide
+strings, users can add <tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt> to the
+generated code:
+</p>
+
+<div class="code"><pre>
+%module wchar_to_unicode
+%begin %{
+#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
+%}
+
+wchar_t *wcharstring(wchar_t *s) {
+  return s;
+}
+</pre></div>
+
+<p>
+This ensures that only unicode strings are accepted by wcharstring in both
+Python 2 and Python 3:
+</p>
+
+<div class="targetlang"><pre>
+&gt;&gt;&gt; from wchar_to_unicode import *
+&gt;&gt;&gt; wcharstring(u"hi") # Unicode string
+u'hi'
+&gt;&gt;&gt; wcharstring(b"hi") # Byte string
+Traceback (most recent call last):
+  File "&lt;stdin&gt;", line 1, in ?
+TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
+</pre></div>
+
+<p>
+By defining both <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> and
+<tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt>, Python wrapper code can support
+overloads taking both std::string (as Python bytes) and std::wstring
+(as Python unicode).
+</p>
+
 <H3><a name="Python_2_unicode">36.12.5 Python 2 Unicode</a></H3>
 
 
@@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string:
 &gt;&gt;&gt;
 </pre></div>
 
+<p>
+Note that defining both <tt>SWIG_PYTHON_2_UNICODE</tt> and
+<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> at the same time is not allowed, since
+the first allows unicode conversion and the second explicitly
+prohibits it.
+</p>
+
 </body>
 </html>