Merge branch 'ahnolds-pyunicode_wstrings'
* ahnolds-pyunicode_wstrings: Documentation on Python Bytes/Unicode distinction; Tests for Python Bytes/Unicode distinction; Add support for Python Bytes/Unicode distinction
This commit is contained in:
commit
5c758f6972
7 changed files with 235 additions and 6 deletions
|
|
@ -5,6 +5,11 @@ See the RELEASENOTES file for a summary of changes in each release.
|
|||
Version 3.0.9 (in progress)
|
||||
===========================
|
||||
|
||||
2016-01-27: ahnolds
|
||||
[Python] Added support for differentiating between Python Bytes
|
||||
and Unicode objects by defining SWIG_PYTHON_STRICT_BYTE_CHAR
|
||||
and SWIG_PYTHON_STRICT_UNICODE_WCHAR.
|
||||
|
||||
2016-01-27: steeve
|
||||
[Go] Ensure structs are properly packed between gc and GCC/clang.
|
||||
|
||||
|
|
|
|||
|
|
@ -6165,6 +6165,84 @@ For more details about the <tt>surrogateescape</tt> error handler, please see
|
|||
<a href="https://www.python.org/dev/peps/pep-0383/">PEP 383</a>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
In some cases, users may wish to instead handle all byte strings as bytes
|
||||
objects in Python 3. This can be accomplished by adding
|
||||
<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> to the generated code:
|
||||
</p>
|
||||
|
||||
<div class="code"><pre>
|
||||
%module char_to_bytes
|
||||
%begin %{
|
||||
#define SWIG_PYTHON_STRICT_BYTE_CHAR
|
||||
%}
|
||||
|
||||
char *charstring(char *s) {
|
||||
return s;
|
||||
}
|
||||
</pre></div>
|
||||
|
||||
<p>
|
||||
This will modify the behavior so that only Python 3 bytes objects will be
|
||||
accepted and converted to a C/C++ string, and any string returned from C/C++
|
||||
will be converted to a bytes object in Python 3:
|
||||
</p>
|
||||
|
||||
<div class="targetlang"><pre>
|
||||
>>> from char_to_bytes import *
|
||||
>>> charstring(b"hi") # Byte string
|
||||
b'hi'
|
||||
>>> charstring("hi") # Unicode string
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in ?
|
||||
TypeError: in method 'charstring', argument 1 of type 'char *'
|
||||
</pre></div>
|
||||
|
||||
<p>
|
||||
Note that in Python 2, defining <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> has no
|
||||
effect, since strings in Python 2 are equivalent to Python 3 bytes objects.
|
||||
However, there is a similar capability to force unicode-only handling for
|
||||
wide character C/C++ strings (<tt>wchar_t *</tt> or <tt>std::wstring</tt>
|
||||
types) in Python 2. By default, in Python 2 both strings and unicode strings
|
||||
are converted to C/C++ wide strings, and returned wide strings are converted
|
||||
to a Python unicode string. To instead only convert unicode strings to wide
|
||||
strings, users can add <tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt> to the
|
||||
generated code:
|
||||
</p>
|
||||
|
||||
<div class="code"><pre>
|
||||
%module wchar_to_unicode
|
||||
%begin %{
|
||||
#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
|
||||
%}
|
||||
|
||||
wchar_t *wcharstring(wchar_t *s) {
|
||||
return s;
|
||||
}
|
||||
</pre></div>
|
||||
|
||||
<p>
|
||||
This ensures that only unicode strings are accepted by wcharstring in both
|
||||
Python 2 and Python 3:
|
||||
</p>
|
||||
|
||||
<div class="targetlang"><pre>
|
||||
>>> from wchar_to_unicode import *
|
||||
>>> wcharstring(u"hi") # Unicode string
|
||||
u'hi'
|
||||
>>> wcharstring(b"hi") # Byte string
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in ?
|
||||
TypeError: in method 'wcharstring', argument 1 of type 'wchar_t *'
|
||||
</pre></div>
|
||||
|
||||
<p>
|
||||
By defining both <tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> and
|
||||
<tt>SWIG_PYTHON_STRICT_UNICODE_WCHAR</tt>, Python wrapper code can support
|
||||
overloads taking both std::string (as Python bytes) and std::wstring
|
||||
(as Python unicode).
|
||||
</p>
|
||||
|
||||
<H3><a name="Python_2_unicode">36.12.5 Python 2 Unicode</a></H3>
|
||||
|
||||
|
||||
|
|
@ -6230,6 +6308,13 @@ but note that they are returned as a normal Python 2 string:
|
|||
>>>
|
||||
</pre></div>
|
||||
|
||||
<p>
|
||||
Note that defining both <tt>SWIG_PYTHON_2_UNICODE</tt> and
|
||||
<tt>SWIG_PYTHON_STRICT_BYTE_CHAR</tt> at the same time is not allowed, since
|
||||
the first is allowing unicode conversion and the second is explicitly
|
||||
prohibiting it.
|
||||
</p>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
|
|
|||
|
|
@ -65,6 +65,7 @@ CPP_TEST_CASES += \
|
|||
python_overload_simple_cast \
|
||||
python_pythoncode \
|
||||
python_richcompare \
|
||||
python_strict_unicode \
|
||||
simutry \
|
||||
std_containers \
|
||||
swigobject \
|
||||
|
|
|
|||
79
Examples/test-suite/python/python_strict_unicode_runme.py
Normal file
79
Examples/test-suite/python/python_strict_unicode_runme.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# Runtime checks for the python_strict_unicode test module.
#
# The narrow-string wrappers (double_str, same_str) must accept and return
# only byte strings, the wide-string wrappers (double_wstr, same_wstr) must
# accept and return only unicode strings, and overload() must dispatch on the
# argument's string type.  Every failed check raises RuntimeError.
import python_strict_unicode
from sys import version_info

test_bytes = 'hello \x01world\x99'
BYTES = 'BYTES'
test_unicode = u'h\udce9llo w\u00f6rld'

# Python < 2.6 rejects the b prefix for byte string literals as a SyntaxError,
# so instead create Python3 bytes objects by encoding unicode strings as
# latin-1, which maps code points 0-255 directly to the corresponding bytes.
if version_info[0] >= 3:
    test_bytes = test_bytes.encode('latin-1')
    BYTES = BYTES.encode('latin-1')


def check_rejected(func, arg, message):
    # Call func(arg) and require it to raise TypeError.  If the call succeeds,
    # raise RuntimeError(message); any other exception propagates unchanged.
    try:
        func(arg)
    except TypeError:
        return
    raise RuntimeError(message)


# Test that byte string inputs and outputs work as expected
bdbl = python_strict_unicode.double_str(test_bytes)
if bdbl != test_bytes + test_bytes:
    raise RuntimeError("Failed to double string")
if type(bdbl) != type(BYTES):
    raise RuntimeError("Wrong type output for string")
bout = python_strict_unicode.same_str(test_bytes)
if bout != test_bytes:
    raise RuntimeError("Failed to copy char*")
if type(bout) != type(BYTES):
    raise RuntimeError("Wrong type output for char*")

# Test that unicode string inputs and outputs work as expected
udbl = python_strict_unicode.double_wstr(test_unicode)
if udbl != test_unicode + test_unicode:
    raise RuntimeError("Failed to double wide string")
if type(udbl) != type(u''):
    raise RuntimeError("Wrong type output for wide string")
uout = python_strict_unicode.same_wstr(test_unicode)
if uout != test_unicode:
    raise RuntimeError("Failed to copy wchar_t*")
if type(uout) != type(u''):
    raise RuntimeError("Wrong type output for wchar_t*")

# Test that overloading is handled properly
bovr = python_strict_unicode.overload(test_bytes)
if bovr != BYTES:
    raise RuntimeError("Failed to return bytes from overload")
if type(bovr) != type(BYTES):
    raise RuntimeError("Wrong type output from overload")
uovr = python_strict_unicode.overload(test_unicode)
if uovr != u'UNICODE':
    raise RuntimeError("Failed to return unicode from overload")
if type(uovr) != type(u''):
    # Was misspelled "RuntimeERror", which raised NameError instead.
    raise RuntimeError("Wrong type output from overload")

# Test that bytes aren't accepted as wide strings and unicode isn't accepted as narrow strings
check_rejected(python_strict_unicode.double_str, test_unicode,
               "Unicode accepted for string")
check_rejected(python_strict_unicode.same_str, test_unicode,
               "Unicode accepted for char*")
check_rejected(python_strict_unicode.double_wstr, test_bytes,
               "Bytes accepted for wstring")
check_rejected(python_strict_unicode.same_wstr, test_bytes,
               "Bytes accepted for wchar_t*")
|
||||
41
Examples/test-suite/python_strict_unicode.i
Normal file
41
Examples/test-suite/python_strict_unicode.i
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
/* Test interface for strict Python bytes/unicode handling of narrow and wide strings. */
%module python_strict_unicode
|
||||
|
||||
%include <std_string.i>
|
||||
%include <std_wstring.i>
|
||||
|
||||
/* Define both strict-mode macros for this module's generated code. */
%begin %{
|
||||
#define SWIG_PYTHON_STRICT_BYTE_CHAR
|
||||
#define SWIG_PYTHON_STRICT_UNICODE_WCHAR
|
||||
%}
|
||||
|
||||
%inline %{
|
||||
/* Returns the narrow string concatenated with itself. */
std::string double_str(const std::string& in)
|
||||
{
|
||||
return in + in;
|
||||
}
|
||||
|
||||
/* Returns the same narrow C string pointer it was given. */
char *same_str(char* in)
|
||||
{
|
||||
return in;
|
||||
}
|
||||
|
||||
/* Returns the wide string concatenated with itself. */
std::wstring double_wstr(const std::wstring& in)
|
||||
{
|
||||
return in + in;
|
||||
}
|
||||
|
||||
/* Returns the same wide C string pointer it was given. */
wchar_t *same_wstr(wchar_t* in)
|
||||
{
|
||||
return in;
|
||||
}
|
||||
|
||||
/* Wide-string overload: ignores its argument and returns L"UNICODE". */
std::wstring overload(const std::wstring& in)
|
||||
{
|
||||
return L"UNICODE";
|
||||
}
|
||||
|
||||
/* Narrow-string overload: ignores its argument and returns "BYTES". */
std::string overload(const std::string& in)
|
||||
{
|
||||
return "BYTES";
|
||||
}
|
||||
%}
|
||||
|
|
@ -6,13 +6,18 @@ SWIGINTERN int
|
|||
SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
|
||||
{
|
||||
%#if PY_VERSION_HEX>=0x03000000
|
||||
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
if (PyBytes_Check(obj))
|
||||
%#else
|
||||
if (PyUnicode_Check(obj))
|
||||
%#endif
|
||||
%#else
|
||||
if (PyString_Check(obj))
|
||||
%#endif
|
||||
{
|
||||
char *cstr; Py_ssize_t len;
|
||||
%#if PY_VERSION_HEX>=0x03000000
|
||||
%#if !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
if (!alloc && cptr) {
|
||||
/* We can't allow converting without allocation, since the internal
|
||||
representation of string in Python 3 is UCS-2/UCS-4 but we require
|
||||
|
|
@ -21,8 +26,9 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
|
|||
return SWIG_RuntimeError;
|
||||
}
|
||||
obj = PyUnicode_AsUTF8String(obj);
|
||||
PyBytes_AsStringAndSize(obj, &cstr, &len);
|
||||
if(alloc) *alloc = SWIG_NEWOBJ;
|
||||
%#endif
|
||||
PyBytes_AsStringAndSize(obj, &cstr, &len);
|
||||
%#else
|
||||
PyString_AsStringAndSize(obj, &cstr, &len);
|
||||
%#endif
|
||||
|
|
@ -50,19 +56,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
|
|||
*alloc = SWIG_OLDOBJ;
|
||||
}
|
||||
} else {
|
||||
%#if PY_VERSION_HEX>=0x03000000
|
||||
assert(0); /* Should never reach here in Python 3 */
|
||||
%#endif
|
||||
%#if PY_VERSION_HEX>=0x03000000
|
||||
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
*cptr = PyBytes_AsString(obj);
|
||||
%#else
|
||||
assert(0); /* Should never reach here with Unicode strings in Python 3 */
|
||||
%#endif
|
||||
%#else
|
||||
*cptr = SWIG_Python_str_AsChar(obj);
|
||||
%#endif
|
||||
}
|
||||
}
|
||||
if (psize) *psize = len + 1;
|
||||
%#if PY_VERSION_HEX>=0x03000000
|
||||
%#if PY_VERSION_HEX>=0x03000000 && !defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
Py_XDECREF(obj);
|
||||
%#endif
|
||||
return SWIG_OK;
|
||||
} else {
|
||||
%#if defined(SWIG_PYTHON_2_UNICODE)
|
||||
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
%#error "Cannot use both SWIG_PYTHON_2_UNICODE and SWIG_PYTHON_STRICT_BYTE_CHAR at once"
|
||||
%#endif
|
||||
%#if PY_VERSION_HEX<0x03000000
|
||||
if (PyUnicode_Check(obj)) {
|
||||
char *cstr; Py_ssize_t len;
|
||||
|
|
@ -112,11 +126,15 @@ SWIG_FromCharPtrAndSize(const char* carray, size_t size)
|
|||
SWIG_InternalNewPointerObj(%const_cast(carray,char *), pchar_descriptor, 0) : SWIG_Py_Void();
|
||||
} else {
|
||||
%#if PY_VERSION_HEX >= 0x03000000
|
||||
%#if defined(SWIG_PYTHON_STRICT_BYTE_CHAR)
|
||||
return PyBytes_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
|
||||
%#else
|
||||
%#if PY_VERSION_HEX >= 0x03010000
|
||||
return PyUnicode_DecodeUTF8(carray, %numeric_cast(size, Py_ssize_t), "surrogateescape");
|
||||
%#else
|
||||
return PyUnicode_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
|
||||
%#endif
|
||||
%#endif
|
||||
%#else
|
||||
return PyString_FromStringAndSize(carray, %numeric_cast(size, Py_ssize_t));
|
||||
%#endif
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ SWIG_AsWCharPtrAndSize(PyObject *obj, wchar_t **cptr, size_t *psize, int *alloc)
|
|||
{
|
||||
PyObject *tmp = 0;
|
||||
int isunicode = PyUnicode_Check(obj);
|
||||
%#if PY_VERSION_HEX < 0x03000000
|
||||
%#if PY_VERSION_HEX < 0x03000000 && !defined(SWIG_PYTHON_STRICT_UNICODE_WCHAR)
|
||||
if (!isunicode && PyString_Check(obj)) {
|
||||
obj = tmp = PyUnicode_FromObject(obj);
|
||||
isunicode = 1;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue