Merge branch 'ahnolds-pyunicode_wstrings'

* ahnolds-pyunicode_wstrings:
  Documentation on Python Bytes/Unicode distinction
  Tests for Python Bytes/Unicode distinction
  Add support for Python Bytes/Unicode distinction
This commit is contained in:
William S Fulton 2016-02-06 07:57:30 +00:00
commit 5c758f6972
7 changed files with 235 additions and 6 deletions

View file

@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release.
Version 3.0.9 (in progress)
===========================
2016-01-27: ahnolds
[Python] Added support for differentiating between Python Bytes
and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR
and SWIG_PYTHON_STRICT_UNICODE_WCHAR.
2016-01-27: steeve
[Go] Ensure structs are properly packed between gc and GCC/clang.

View file

@ -6165,6 +6165,84 @@ For more details about the <tt>surrogateescape</tt> error handler, please see
<a href="https://www.python.org/dev/peps/pep-0383/">PEP 383</a>.
</p>
<p>
In some cases, users may wish to instead handle all byte strings as bytes
objects in Python 3. This can be accomplished by adding
<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> to the generated code:
</p>
<div class="code"><pre>
%module char_to_bytes
%begin %{
#define SWIG_PYTHON_STRICT_BYTE_CHAR
%}
char *charstring(char *s) {
return s;
}
</pre></div>
<p>
This will modify the behavior so that only Python 3 bytes objects will be
accepted and converted to a C/C++ string, and any string returned from C/C++
will be converted to a bytes object in Python 3:
</p>
<div class="targetlang"><pre>
&gt;&gt;&gt; from char_to_bytes import *
&gt;&gt;&gt; charstring(b"hi") # Byte string
b'hi'
&gt;&gt;&gt; charstring("hi") # Unicode string
Traceback (most recent call last):
File "&lt;stdin&gt;", line 1, in ?
TypeError: in method 'charstring', argument 1 of type 'char *'
</pre></div>
<p>
Note that in Python 2, defining <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> has no
effect, since strings in Python 2 are equivalent to Python 3 bytes objects.
However, there is a similar capability to force unicode-only handling for
wide character C/C++ strings (<tt>wchar_t *</tt> or <tt>std::wstring</tt>
types) in Python 2. By default, in Python 2 both strings and unicode strings
are converted to C/C++ wide strings, and returned wide strings are converted
to a Python unicode string. To instead only convert unicode strings to wide
strings, users can add <tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt> to the
generated code:
</p>
<div class="code"><pre>
%module wchar_to_unicode
%begin %{
#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
%}
wchar_t *wcharstring(wchar_t *s) {
return s;
}
</pre></div>
<p>
This ensures that only unicode strings are accepted by wcharstring in both
Python 2 and Python 3:
</p>
<div class="targetlang"><pre>
&gt;&gt;&gt; from wchar_to_unicode import *
&gt;&gt;&gt; wcharstring(u"hi") # Unicode string
u'hi'
&gt;&gt;&gt; wcharstring(b"hi") # Byte string
Traceback (most recent call last):
File "&lt;stdin&gt;", line 1, in ?
TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
</pre></div>
<p>
By defining both <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> and
<tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt>, Python wrapper code can support
overloads taking both std::string (as Python bytes) and std::wstring
(as Python unicode).
</p>
<H3><a name="Python_2_unicode">36.12.5 Python 2 Unicode</a></H3>
@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string:
&gt;&gt;&gt;
</pre></div>
<p>
Note that defining both <tt>SWIG_PYTHON_2_UNICODE</tt> and
<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> at the same time is not allowed, since
the first enables unicode conversion and the second explicitly prohibits it.
Defining both results in a compile-time error in the generated wrapper code.
</p>
</body>
</html>

View file

@ -65,6 +65,7 @@ CPP_TEST_CASES += \
python_overload_simple_cast \
python_pythoncode \
python_richcompare \
python_strict_unicode \
simutry \
std_containers \
swigobject \

View file

@ -0,0 +1,79 @@
# Runtime test for the python_strict_unicode module, which is built with both
# SWIG_PYTHON_STRICT_BYTE_CHAR and SWIG_PYTHON_STRICT_UNICODE_WCHAR defined.
# It checks that narrow strings map only to byte strings, that wide strings
# map only to unicode strings, and that overloads are resolved accordingly.
import python_strict_unicode
from sys import version_info

test_bytes = 'hello \x01world\x99'
BYTES = 'BYTES'
test_unicode = u'h\udce9llo w\u00f6rld'

# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError,
# so instead create Python3 bytes objects by encoding unicode strings as
# latin-1, which maps code points 0-255 directly to the corresponding bytes.
if version_info[0] >= 3:
    test_bytes = test_bytes.encode('latin-1')
    BYTES = BYTES.encode('latin-1')

# Test that byte string inputs and outputs work as expected
bdbl = python_strict_unicode.double_str(test_bytes)
if bdbl != test_bytes + test_bytes:
    raise RuntimeError("Failed to double string")
if type(bdbl) != type(BYTES):
    raise RuntimeError("Wrong type output for string")
bout = python_strict_unicode.same_str(test_bytes)
if bout != test_bytes:
    raise RuntimeError("Failed to copy char*")
if type(bout) != type(BYTES):
    raise RuntimeError("Wrong type output for char*")

# Test that unicode string inputs and outputs work as expected
udbl = python_strict_unicode.double_wstr(test_unicode)
if udbl != test_unicode + test_unicode:
    raise RuntimeError("Failed to double wide string")
if type(udbl) != type(u''):
    raise RuntimeError("Wrong type output for wide string")
uout = python_strict_unicode.same_wstr(test_unicode)
if uout != test_unicode:
    raise RuntimeError("Failed to copy wchar_t*")
if type(uout) != type(u''):
    raise RuntimeError("Wrong type output for wchar_t*")

# Test that overloading is handled properly
bovr = python_strict_unicode.overload(test_bytes)
if bovr != BYTES:
    raise RuntimeError("Failed to return bytes from overload")
if type(bovr) != type(BYTES):
    raise RuntimeError("Wrong type output from overload")
uovr = python_strict_unicode.overload(test_unicode)
if uovr != u'UNICODE':
    raise RuntimeError("Failed to return unicode from overload")
if type(uovr) != type(u''):
    raise RuntimeError("Wrong type output from overload")

# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings.
# Each call below must raise TypeError; the error flag is only left set when
# the call unexpectedly succeeds.
try:
    python_strict_unicode.double_str(test_unicode)
    error = 1
except TypeError:
    error = 0
if error:
    raise RuntimeError("Unicode accepted for string")

try:
    python_strict_unicode.same_str(test_unicode)
    error = 1
except TypeError:
    error = 0
if error:
    raise RuntimeError("Unicode accepted for char*")

try:
    python_strict_unicode.double_wstr(test_bytes)
    error = 1
except TypeError:
    error = 0
if error:
    raise RuntimeError("Bytes accepted for wstring")

try:
    python_strict_unicode.same_wstr(test_bytes)
    error = 1
except TypeError:
    error = 0
if error:
    raise RuntimeError("Bytes accepted for wchar_t*")

View file

@ -0,0 +1,41 @@
/* Test module for strict bytes/unicode handling in the Python wrappers.
   Narrow strings (char *, std::string) are exercised with byte strings and
   wide strings (wchar_t *, std::wstring) with unicode strings. */
%module python_strict_unicode
%include <std_string.i>
%include <std_wstring.i>
/* Add both strictness macros to the generated code so that char-based
   strings only convert to/from bytes and wchar_t-based strings only
   convert to/from unicode. */
%begin %{
#define SWIG_PYTHON_STRICT_BYTE_CHAR
#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
%}
%inline %{
/* Returns the narrow input string concatenated with itself. */
std::string double_str(const std::string& in)
{
return in + in;
}
/* Returns the same char pointer that was passed in. */
char *same_str(char* in)
{
return in;
}
/* Returns the wide input string concatenated with itself. */
std::wstring double_wstr(const std::wstring& in)
{
return in + in;
}
/* Returns the same wchar_t pointer that was passed in. */
wchar_t *same_wstr(wchar_t* in)
{
return in;
}
/* Wide-string overload: ignores its argument and returns a fixed marker so
   the caller can tell which overload was selected. */
std::wstring overload(const std::wstring& in)
{
return L"UNICODE";
}
/* Narrow-string overload: ignores its argument and returns a fixed marker so
   the caller can tell which overload was selected. */
std::string overload(const std::string& in)
{
return "BYTES";
}
%}

View file

@ -6,13 +6,18 @@ SWIGINTERN int
SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
{
%#if PY_VERSION_HEX>=0x03000000
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
if (PyBytes_Check(obj))
%#else
if (PyUnicode_Check(obj))
%#endif
%#else
if (PyString_Check(obj))
%#endif
{
char *cstr; Py_ssize_t len;
%#if PY_VERSION_HEX>=0x03000000
%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
if (!alloc && cptr) {
/* We can't allow converting without allocation, since the internal
representation of string in Python 3 is UCS-2/UCS-4 but we require
@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
return SWIG_RuntimeError;
}
obj = PyUnicode_AsUTF8String(obj);
PyBytes_AsStringAndSize(obj, &cstr, &len);
if(alloc) *alloc = SWIG_NEWOBJ;
%#endif
PyBytes_AsStringAndSize(obj, &cstr, &len);
%#else
PyString_AsStringAndSize(obj, &cstr, &len);
%#endif
@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
*alloc = SWIG_OLDOBJ;
}
} else {
%#if PY_VERSION_HEX>=0x03000000
assert(0); /* Should never reach here in Python 3 */
%#endif
%#if PY_VERSION_HEX>=0x03000000
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
*cptr = PyBytes_AsString(obj);
%#else
assert(0); /* Should never reach here with Unicode strings in Python 3 */
%#endif
%#else
*cptr = SWIG_Python_str_AsChar(obj);
%#endif
}
}
if (psize) *psize = len + 1;
%#if PY_VERSION_HEX>=0x03000000
%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
Py_XDECREF(obj);
%#endif
return SWIG_OK;
} else {
%#if defined(SWIG_PYTHON_2_UNICODE)
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once"
%#endif
%#if PY_VERSION_HEX<0x03000000
if (PyUnicode_Check(obj)) {
char *cstr; Py_ssize_t len;
@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size)
SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void();
} else {
%#if PY_VERSION_HEX >= 0x03000000
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
%#else
%#if PY_VERSION_HEX >= 0x03010000
return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape");
%#else
return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
%#endif
%#endif
%#else
return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
%#endif

View file

@ -16,7 +16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc)
{
PyObject *tmp = 0;
int isunicode = PyUnicode_Check(obj);
%#if PY_VERSION_HEX < 0x03000000
%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR)
if (!isunicode && PyString_Check(obj)) {
obj = tmp = PyUnicode_FromObject(obj);
isunicode = 1;